[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+39 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # just get the code, no headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects can be searched through easily
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = "https://buckysroom.org" + link.get('href')
        print(href)


trade_spider(1)
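
One note if you run this with a current copy of Beautiful Soup: BeautifulSoup(plain_text) works, but newer versions warn when no parser is named. Passing one explicitly (a small tweak, not part of the original video) silences the warning and pins down which parser gets used:

soup = BeautifulSoup(plain_text, 'html.parser')  # explicitly use Python's built-in HTML parser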

Replies

+1 Chris Nelson · January 4, 2015
<br> is a line break in old HTML. I believe HTML5 replaces that with <br />.

I could be wrong; it has been so long since I've written any HTML.

I haven't gone through the web crawler tutorial yet, but if those tags show up in Bucky's Python videos, I believe it's because you can use certain HTML code in Python.

But please, someone with more experience, feel free to correct me!
0 Ola Berglund · January 4, 2015
Yeah, I had totally forgotten about this. Thanks for reminding me that they are in his videos. Great help!!
0 Tanner Hoke · January 29, 2015
import requests
from bs4 import BeautifulSoup

source_code = requests.get('https://www.thenewboston.com/forum/topic.php?id=1610')
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
print(str(soup.find('code'))
      .replace('<code>', '')
      .replace('<br>', '\n')
      .replace('?', '\t')
      .replace('</code>', '')
      .replace('</br>', ''))
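
For reference, bs4 can do the same extraction without the chain of string replacements: Tag.get_text() with a separator joins the text nodes directly (a sketch, assuming the topic page still has a <code> block):

import requests
from bs4 import BeautifulSoup

source_code = requests.get('https://www.thenewboston.com/forum/topic.php?id=1610')
soup = BeautifulSoup(source_code.text)
code_block = soup.find('code')
# get_text('\n') joins the tag's text pieces with newlines, much like replacing <br>
print(code_block.get_text('\n'))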
0 Daniel Stoica · February 1, 2015
It's kind of hard to use this tutorial, considering that there's nothing on your trade page. :(
0 Probuddha N · April 17, 2015
I used the same code (changing the page attributes) to get data from Truelocal.com.au


import requests
from bs4 import BeautifulSoup

def tru_crawler(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.truelocal.com.au/find/car-rental/' + str(page)
        code = requests.get(url)
        text = code.text
        soup = BeautifulSoup(text)
        for link in soup.findAll('a', {'class': 'name'}):
            href = 'http://www.truelocal.com.au' + link.get('href')
            title = link.string
            # print (href)
            # print (title)
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    code = requests.get(item_url)
    text = code.text
    soup = BeautifulSoup(text)
    for item_name in soup.findAll('h1', {'itemprop': 'name'}):
        print item_name.string
    for link in soup.findAll('a'):
        href = 'http://www.truelocal.com.au' + link.get('href')
        print (href)


tru_crawler(2)


After getting the first page's titles and all the links from that page, I got the following error:


Traceback (most recent call last):
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 30, in <module>
    tru_crawler(2)
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 16, in tru_crawler
    get_single_item_data(href)
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 26, in get_single_item_data
    href = 'http://www.truelocal.com.au' + link.get('href')
TypeError: cannot concatenate 'str' and 'NoneType' objects


Please advise
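
That TypeError means some <a> tags on the page have no href attribute, so link.get('href') returns None. Skipping those anchors (and only prefixing site-relative links) avoids the crash; a minimal sketch of the inner loop:

for link in soup.findAll('a'):
    href = link.get('href')
    if href is None:
        # anchors without an href attribute return None; skip them
        continue
    if href.startswith('/'):
        # only prefix site-relative links; absolute URLs already include the domain
        href = 'http://www.truelocal.com.au' + href
    print(href)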
0 Chandan Chaudhary · May 9, 2015
import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/forum/search_forums.php?s=&orderby=popular&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'category-title'}):
            href = 'https://www.thenewboston.com/' + link.get('href')
            title = link.string
            print(href)
            print(title)
        page =+ 1

trade_spider(1)

# Why is this program not stopping? Help me!
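
The loop never stops because of the line page =+ 1: that parses as page = (+1), so page is reset to 1 on every pass and page <= max_pages stays true forever. The increment should be:

page += 1  # "+=" increments; "=+" just assigns positive 1 every time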
0 Mehmet Kıdıman · May 14, 2015
Import the package, then click the light bulb and select the "install package" option.
0 Mehmet Kıdıman · May 14, 2015

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page
It's half working; there's no error. I tried it on a shopping website and I couldn't get "href" anyhow. You may check it; if you have a solution, please share it.
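
When a selector from the tutorial comes up empty on a different site, the usual reason is that the site's anchors don't use the class the tutorial targets. Printing a few anchors with their classes shows what to select instead (a sketch; the URL is a placeholder for the shopping site's listing page):

import requests
from bs4 import BeautifulSoup

url = 'http://example.com/'  # placeholder: put the shopping site's listing page here
soup = BeautifulSoup(requests.get(url).text)
# show the first few anchors and their classes to find a usable selector
for link in soup.findAll('a')[:10]:
    print(link.get('class'), link.get('href'))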
0 Saili Ghavat · May 26, 2015
Is the trade page removed? I am trying to write the web crawler code, but it's giving me a list of errors like:
  
File "C:\Users\Saili\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 573, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\Saili\AppData\Roaming\Python\Python34\site-packages\requests\adapters.py", line 415, in send
    raise ConnectionError(err, request=request)
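
That ConnectionError is consistent with Daniel's comment above: the trade page seems to be gone, so requests can't reach it. Catching the exception at least makes the failure readable (a sketch, not part of the original tutorial):

import requests

url = 'https://buckysroom.org/trade/search.php?page=1'
try:
    response = requests.get(url)
    response.raise_for_status()  # turn 4xx/5xx responses into an HTTPError
except requests.exceptions.ConnectionError:
    print('Could not reach the site; the trade page may no longer exist')
except requests.exceptions.HTTPError as err:
    print('The server answered with an error:', err)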
0 yiting chua · June 1, 2015
Hey NewBoston,  I tried to follow your tutorial, but I'm having some issues on Tut 26.


import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page
I got the following error when I ran it:

Traceback (most recent call last):
  File "C:/Users/User/PycharmProjects/Basic/WebCrawlerTest.py", line 19, in <module>
    trade_spider(2)
  File "C:/Users/User/PycharmProjects/Basic/WebCrawlerTest.py", line 9, in trade_spider
    source_code = requests.get(url)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\api.py", line 69, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\api.py", line 50, in request
    response = session.request(method=method, url=url, **kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 465, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 594, in send
    history = [resp for resp in gen] if allow_redirects else []
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 594, in <listcomp>
    history = [resp for resp in gen] if allow_redirects else []
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 114, in resolve_redirects
    raise TooManyRedirects('Exceeded %s redirects.' % self.max_redirects)
requests.exceptions.TooManyRedirects: Exceeded 30 redirects.


Are you able to help?
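
TooManyRedirects means the server kept answering with redirects until requests gave up after 30 of them. The old buckysroom.org URLs very likely bounce around now that the site has moved to thenewboston.com. Fetching without following redirects shows where the server is pointing (a sketch; substitute whatever URL your script requests):

import requests

url = 'https://buckysroom.org/trade/search.php?page=1'
response = requests.get(url, allow_redirects=False)  # don't follow the redirect chain
print(response.status_code)               # e.g. 301 or 302 for a redirect
print(response.headers.get('Location'))  # where the server is sending you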
