Question about how to build a web crawler?

+1 Archit Mathur · December 2, 2015
Code:
import requests
from bs4 import BeautifulSoup


def forum_spider(max_pages):
    """Print the href of every thread-title link on the forum listing pages.

    Fetches listing pages 1 through ``max_pages`` (inclusive) and, for each
    page, prints the ``href`` attribute of every ``<a>`` element whose class
    is ``title text-semibold``.

    Args:
        max_pages: Number of the last listing page to fetch. Values below 1
            result in no requests being made.

    Raises:
        requests.exceptions.SSLError: If the server's TLS certificate cannot
            be verified (requests verifies certificates by default).
        requests.exceptions.Timeout: If the server does not respond within
            the timeout.
    """
    page = 1
    while page <= max_pages:
        url = "https://www.thenewboston.com/forum/home.php?page=" + str(page)
        # A timeout keeps an unresponsive server from blocking the crawl forever.
        response = requests.get(url, timeout=10)
        # Name the parser explicitly: without it bs4 emits a warning and picks
        # whichever parser happens to be installed, so results can vary by machine.
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all('a', {'class': 'title text-semibold'}):
            print(link.get('href'))
        page += 1

# Crawl only the first listing page, and only when run as a script, so that
# importing this module does not trigger network requests.
if __name__ == "__main__":
    forum_spider(1)




Output (Error):
C:\Python34\python.exe C:/Users/Archit/PycharmProjects/untitled/first.py
Traceback (most recent call last):
  File "C:\Python34\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 559, in urlopen
    body=body, headers=headers)
  File "C:\Python34\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 345, in _make_request
    self._validate_conn(conn)
  File "C:\Python34\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 782, in _validate_conn
    conn.connect()
  File "C:\Python34\lib\site-packages\requests\packages\urllib3\connection.py", line 250, in connect
    ssl_version=resolved_ssl_version)
  File "C:\Python34\lib\site-packages\requests\packages\urllib3\util\ssl_.py", line 285, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Python34\lib\ssl.py", line 365, in wrap_socket
    _context=self)
  File "C:\Python34\lib\ssl.py", line 583, in __init__
    self.do_handshake()
  File "C:\Python34\lib\ssl.py", line 810, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:600)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Python34\lib\site-packages\requests\adapters.py", line 370, in send
    timeout=timeout
  File "C:\Python34\lib\site-packages\requests\packages\urllib3\connectionpool.py", line 588, in urlopen
    raise SSLError(e)
requests.packages.urllib3.exceptions.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:600)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:/Users/Archit/PycharmProjects/untitled/first.py", line 17, in <module>
    forum_spider(1)
  File "C:/Users/Archit/PycharmProjects/untitled/first.py", line 9, in forum_spider
    source_code = requests.get(url)
  File "C:\Python34\lib\site-packages\requests\api.py", line 69, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Python34\lib\site-packages\requests\api.py", line 50, in request
    response = session.request(method=method, url=url, **kwargs)
  File "C:\Python34\lib\site-packages\requests\sessions.py", line 468, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Python34\lib\site-packages\requests\sessions.py", line 576, in send
    r = adapter.send(request, **kwargs)
  File "C:\Python34\lib\site-packages\requests\adapters.py", line 433, in send
    raise SSLError(e, request=request)
requests.exceptions.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:600)

Process finished with exit code 1

Please Help.

Post a Reply

Replies

Oldest  Newest  Rating
0 Hasib Ullah · December 19, 2015
I guess Bucky's new website can't be crawled. That's why you're getting this error.
0 mohak gemini · December 5, 2015
/images/forum/upload/2015-12-04/a11cac5911e08813d53b4f6b359b1fdd.jpg
  • 1

Python

107,046 followers
About

This section is all about snakes! Just kidding.

Links
Moderators
Bucky Roberts Administrator