[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+26 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # just get the source code, no headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects make the HTML easy to search
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        # skip anchor tags that have no href attribute
        if link.get('href'):
            href = "https://buckysroom.org" + link.get('href')
            print(href)


trade_spider(1)

Replies
0 Kaim Syed · April 11, 2016
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
 File "C:/Users/NAEEM/PycharmProjects/main.py/main.py", line 36, in <module>
   trade_spider(1)
 File "C:/Users/NAEEM/PycharmProjects/main.py/main.py", line 9, in trade_spider
   source_code = requests.get(url)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\api.py", line 67, in get
   return request('get', url, params=params, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\api.py", line 53, in request
   return session.request(method=method, url=url, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\sessions.py", line 468, in request
   resp = self.send(prep, **send_kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\sessions.py", line 576, in send
   r = adapter.send(request, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\adapters.py", line 437, in send
   raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='buckysroom.org', port=443): Max retries exceeded with url: /trade/search.php?page=1 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x0317BC10>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
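The traceback means requests never got a response from buckysroom.org at all, which is consistent with that domain having gone offline (later replies in this thread already use thenewboston.com instead). As a defensive measure, a sketch of the same fetch with a timeout and error handling, using a hypothetical fetch_page helper:

import requests

def fetch_page(url):
    # Hypothetical helper: a timeout plus try/except keeps one dead
    # host from hanging or crashing the whole crawl.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # surface 4xx/5xx responses as exceptions
        return response.text
    except requests.exceptions.RequestException as e:
        print("Skipping", url, "->", e)
        return None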
0 Chandan Chaudhary · May 9, 2015
import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/forum/search_forums.php?s=&orderby=popular&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'category-title'}):
            href = 'https://www.thenewboston.com/' + link.get('href')
            title = link.string
            print(href)
            print(title)
        page =+ 1

trade_spider(1)

# Why is this program not stopping? Help me.
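The likely culprit is the line "page =+ 1": Python reads =+ as an assignment of positive 1, so page is reset to 1 on every pass and the while condition never becomes false. The += operator is what increments:

page =+ 1  # parsed as page = (+1), so page stays 1 forever
page += 1  # increments page, letting the while loop terminate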
0 Arthur lee · December 11, 2014
Hi bro, I have a question: how do I crawl the pictures down? Which function should I use?
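There is no single built-in function for that; one common approach (a sketch, not from the tutorial) is to collect the src attribute of every img tag and write the response bytes to disk:

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def download_images(page_url, folder="images"):
    # Sketch: save every image on a page; urljoin resolves relative
    # src attributes, and basename is a simplification for filenames.
    os.makedirs(folder, exist_ok=True)
    soup = BeautifulSoup(requests.get(page_url).text, "html.parser")
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            continue
        img_url = urljoin(page_url, src)
        data = requests.get(img_url).content  # raw image bytes
        with open(os.path.join(folder, os.path.basename(img_url)), "wb") as f:
            f.write(data)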
0 Balaskandan Giri · November 4, 2015
HELP!

The crawler is deadlocked if I try to call it recursively!
Any solution?
0 Halcyon Abraham Ramirez · November 5, 2015
@Balaskandan Giri

Without a code snippet or stack trace, no one can help.
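Without the poster's code this is only a guess, but the usual cause of a recursive crawler never returning is pages that link to each other, so the crawl revisits them forever. Tracking visited URLs (and capping depth) bounds the recursion, as in this sketch:

import requests
from bs4 import BeautifulSoup

visited = set()  # URLs already crawled; stops mutual links from recursing forever

def crawl(url, depth=0, max_depth=3):
    if url in visited or depth > max_depth:
        return
    visited.add(url)
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    for link in soup.findAll('a'):
        href = link.get('href')
        if href and href.startswith('http'):
            crawl(href, depth + 1, max_depth)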
0 nmelssx nmelssx · September 25, 2015
I have web crawler code in PHP, but it just gets the words, without the CSS styling of the website. Is that normal or not?
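That is normal: a crawler downloads only the raw HTML document, while CSS usually lives in separate stylesheet files referenced by link tags, so it never arrives unless requested separately. A Python sketch (to match the rest of the thread) that also fetches the referenced stylesheets:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def fetch_page_with_css(page_url):
    # Sketch: fetch a page plus the external stylesheets it links to.
    html = requests.get(page_url).text
    soup = BeautifulSoup(html, "html.parser")
    stylesheets = {}
    for link in soup.findAll('link', rel='stylesheet'):
        href = link.get('href')
        if href:
            css_url = urljoin(page_url, href)  # resolve relative href
            stylesheets[css_url] = requests.get(css_url).text
    return html, stylesheets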
0 Aditya Joshi · April 28, 2016
I was getting the html parser error, so I added a parser argument after plain_text, but I am still getting the following error:

Traceback (most recent call last):
 File "file path", line 22, in <module>
   trade_spider(1)
 File "file path", line 13, in trade_spider
   soup = BeautifulSoup(plain_text, "html parser")

bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: html parser. Do you need to install a parser library?
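The error message points at the fix: BeautifulSoup looks up its tree builder by feature name, and "html parser" (with a space) is not a registered feature. The built-in parser is spelled "html.parser":

soup = BeautifulSoup(plain_text, "html parser")   # FeatureNotFound: no such feature
soup = BeautifulSoup(plain_text, "html.parser")   # built-in parser, spelled with a dot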
0 Randy Strong · November 7, 2015
What did I do wrong? I've tried running it with the Python 2.7 libraries as well as 3.5, and it's still giving me these errors:

/Library/Frameworks/Python.framework/Versions/3.5/bin/python3.5 /Users/CUTTERINTERACTIVE/PycharmProjects/FINAL/crawler.py
Traceback (most recent call last):
  File "/Users/CUTTERINTERACTIVE/PycharmProjects/FINAL/crawler.py", line 36, in <module>
    trade_spider(1)
  File "/Users/CUTTERINTERACTIVE/PycharmProjects/FINAL/crawler.py", line 9, in trade_spider
    source_code = requests.get(url)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/api.py", line 69, in get
    return request('get', url, params=params, **kwargs)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/api.py", line 50, in request
    response = session.request(method=method, url=url, **kwargs)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/sessions.py", line 454, in request
    prep = self.prepare_request(req)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/sessions.py", line 388, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/models.py", line 293, in prepare
    self.prepare_url(url, params)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/models.py", line 353, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '1.php?page=1': No schema supplied. Perhaps you meant http://1.php?page=1?
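The last line of the traceback gives it away: requests was handed '1.php?page=1' as the entire URL, so the https:// base was never prepended. The poster's code isn't shown, but the fix is to keep the scheme and host in the string the page number is appended to (example.com below is a placeholder):

url = "1.php?page=" + str(page)                      # MissingSchema: no http:// or https://
url = "https://example.com/1.php?page=" + str(page)  # full URL includes the scheme and host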
