[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+38 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # .text gives just the page source, no headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects can be searched through easily
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = "https://buckysroom.org" + link.get('href')
        print(href)


trade_spider(1)

Replies
0 sfolje 0 · January 30, 2016
[ Edited 15.2.2016; still working at that time ]

Tip: when you copy and paste, the tab characters may show up as question marks. Use your editor's Find & Replace on those question marks. Warning: don't replace them with spaces, because that causes indentation errors. Replace them with nothing (an empty string, "") and then re-indent everything from the beginning.


#Description:
#this program goes to https://thenewboston.com/search.php?type=0&sort=reputation , visits every user's profile, and prints the first few (approx. 20) links to that user's latest photos. To view a photo, click the URL or copy it into a web browser.


import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):         #this function gets links to user's profiles
    page = 1
    while page <= max_pages:
        url = 'https://thenewboston.com/search.php?type=0&sort=reputation&page==' + str(page)
        source_code = requests.get(url, allow_redirects=False)
        plain_text = source_code.text.encode('ascii', 'replace')
        # soup = BeautifulSoup(plain_text) #this line causes error! (uncomment, to learn new things)
        soup = BeautifulSoup(plain_text,'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}): #all profiles
            print(' <<---BEGINNING OF LINK--->>')
            print('link: ',link)
            href = link.get('href')
            title = link.string
            print('href: ' ,href)
            print('title: ',title)
            get_single_item_data(href)        #comment this for better understanding
            print(' <<---END OF link--->>')
        print('page: ',page)
        page += 1

def get_single_item_data(item_url):    #I am now on the user's profile and I print links to the user's photos

    print(' <<--- BEGINNING OF get_single_item_data() --->>')
    source_code = requests.get(item_url)
    plain_text = source_code.text
    # soup = BeautifulSoup(plain_text) #this causes error!! (uncomment, to learn new things)
    soup = BeautifulSoup(plain_text,"lxml") #use this line, to avoid error!!
    for item_name in soup.findAll('img', {'class': 'img-responsive'}): # all photos of the user
        print('item_name :',item_name)
        photo='https://thenewboston.com'+item_name.get('src')
        print('Click the link to open the photo: ', photo)    
    print(' <<--- END OF get_single_item_data() --->>')

trade_spider(1)


I hope this code helps. It prints a lot of output so you can see what is going on with the code and what our variables hold. Please give me feedback: whether it is too messy, helpful, or whatever.
0 Carlos G. · July 13, 2016
Hi Bucky. Thanks for these useful tutorials. I have two questions:

1) How would you set a different "user-agent" in this script so you won't be banned when scraping another website?

2) How can you test that your "fake" user-agent is really working (how can you retrieve its value) before running the script on a website?

Thanks!
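For anyone looking at this later: requests accepts a headers dict, and https://httpbin.org/user-agent echoes back the User-Agent it receives, which answers both questions. A minimal sketch (the browser string below is just an example, not something from the tutorial):

import requests

# Example browser-like User-Agent; any string can go here.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# Verify what actually gets sent: httpbin echoes the received header back.
check = requests.get('https://httpbin.org/user-agent', headers=headers)
print(check.json())  # -> {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# Then pass the same headers when scraping:
# source_code = requests.get(url, headers=headers)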
0 Dimitar Vasilev · January 30, 2016
import requests
from bs4 import BeautifulSoup

the_url = 'https://www.thenewboston.com/forum/topic.php?id=1610'

def getting_the_info(url):
    the_real_url = url
    response = requests.get(the_real_url)
    plain_text = response.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for line in soup.find_all('code'):
        line = str(line).replace('<br>', '\n')
        if soup.find_all('td', {'class': 'third-column'}):
            break
        print(line)

# print(plain_text)
getting_the_info(the_url)
0 sfolje 0 · October 7, 2015
[ Edit: this post was edited 30.1.2016 because thenewboston.com changed its site (https://www.thenewboston.com/tops.php? does not exist any more) ]

[ It is just a matter of time before this code won't work anymore. ]

For all you happy people out there going "the trade page is not available anymore", "buckysroom is error 404", "https://www.thenewboston.com/trade/search.php is error 404", "what webpage can I use, doge?" ;) I'm posting my own post, hoping it will help someone.
I believe someone before me posted similar code solving the same problem, but I am too lazy to check for sure.

Simple example:
import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):   
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/search.php?type=0&sort=reputation&page==' + str(page)
        source_code = requests.get(url, allow_redirects=False)
        plain_text = source_code.text.encode('ascii', 'replace')
        soup = BeautifulSoup(plain_text,'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}): #all profiles
            href = link.get('href')
            title = link.string
            print('href: ' ,href)
            print('title: ',title)
            #get_single_item_data(href)
        page += 1


trade_spider(1)


Full web crawler:
#Description:
#this program goes to https://www.thenewboston.com/search.php?type=0&sort=reputation , visits every user's profile, and prints the first few (approx. 20) links to that user's latest photos. To view a photo, click the URL or copy it into a web browser.


import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):    #this function gets links to users' profiles
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/search.php?type=0&sort=reputation&page==' + str(page)
        source_code = requests.get(url, allow_redirects=False)
        plain_text = source_code.text.encode('ascii', 'replace')
        # soup = BeautifulSoup(plain_text) #this line causes an error! (uncomment, to learn new things)
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}): #all profiles
            print(' <<---BEGINNING OF LINK--->>')
            print('link: ', link)
            href = link.get('href')
            title = link.string
            print('href: ', href)
            print('title: ', title)
            get_single_item_data(href)    #comment this out for better understanding
            print(' <<---END OF link--->>')
        print('page: ', page)
        page += 1

def get_single_item_data(item_url):    #I am now on the user's profile and I print links to the user's photos

    print(' <<--- BEGINNING OF get_single_item_data() --->>')
    source_code = requests.get(item_url)
    plain_text = source_code.text
    # soup = BeautifulSoup(plain_text) #this causes an error!! (uncomment, to learn new things)
    soup = BeautifulSoup(plain_text, "lxml") #use this line to avoid the error!!
    for item_name in soup.findAll('img', {'class': 'img-responsive'}): # all photos of the user
        print('item_name :', item_name)
        photo = 'https://www.thenewboston.com' + item_name.get('src')
        print('Click the link to open the photo: ', photo)
    print(' <<--- END OF get_single_item_data() --->>')

trade_spider(1)

0 yiting chua · June 1, 2015
Hey NewBoston,  I tried to follow your tutorial, but I'm having some issues on Tut 26.


import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):

page = 1
while page
I got the following error when I run it:

Traceback (most recent call last):
  File "C:/Users/User/PycharmProjects/Basic/WebCrawlerTest.py", line 19, in <module>
    trade_spider(2)
  File "C:/Users/User/PycharmProjects/Basic/WebCrawlerTest.py", line 9, in trade_spider
    source_code = requests.get(url)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\api.py", line 69, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\api.py", line 50, in request
    response = session.request(method=method, url=url, **kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 465, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 594, in send
    history = [resp for resp in gen] if allow_redirects else []
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 594, in <listcomp>
    history = [resp for resp in gen] if allow_redirects else []
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 114, in resolve_redirects
    raise TooManyRedirects('Exceeded %s redirects.' % self.max_redirects)
requests.exceptions.TooManyRedirects: Exceeded 30 redirects.


Is anyone able to help?
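The TooManyRedirects error usually means the old trade URL now redirects endlessly (the page was removed). One way to confirm, as sfolje's code above does with allow_redirects=False, is to disable redirects and inspect the response; a minimal sketch:

import requests

url = "https://buckysroom.org/trade/search.php?page=1"  # the old trade URL
response = requests.get(url, allow_redirects=False)
print(response.status_code)               # a 301/302 here means the page moved
print(response.headers.get('Location'))   # where the server is redirecting to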
0 Mehmet Kıdıman · May 14, 2015
Import the package, then click the light bulb (quick-fix) icon and select the install-package option.
0 Shashikanth Reddy Palvatla · April 23, 2017
@Muhammad Talha Zaroon

Just like the error message says, replace the following line in your code:

soup = BeautifulSoup(plain_text)

with:

soup = BeautifulSoup(plain_text, "html.parser")
0 Mehmet Kıdıman · May 14, 2015

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
page = 1
while page
**** Half working, no error ****
I tried it on some shopping website and I couldn't get the "href" anyhow. You may check it; if you have a solution, please share it.
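Most likely the shopping site uses different class names than the tutorial's 'item-name', so the selector matches nothing; also, link.get('href') can return None or a relative path. A minimal sketch, with a made-up URL and class name that you would swap for whatever the real page's HTML uses:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://example-shop.com/catalog'  # hypothetical shop page
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# 'product-link' is a placeholder class; inspect the real page to find yours.
for link in soup.findAll('a', {'class': 'product-link'}):
    href = link.get('href')
    if href:                       # some <a> tags carry no href at all
        print(urljoin(url, href))  # resolve relative links against the page URL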
0 john lalani · January 20, 2016
How would I go about making it so that, if a post has no text and just a picture, it is replaced with a string like: "No text in post"?


import requests
from bs4 import BeautifulSoup

def get_words(item_url):
    token = 0
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for item_descript in soup.findAll('div', {'class': 'post-content'}):
        for item_like in soup.findAll('a', {'class': 'post-likes-count'})[token]:
            print(item_like.string)
        token += 1
        for string in item_descript.stripped_strings:
            print(repr(string))


def search_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/search.php?type=1&sort=pop&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'user-name'}):
            href = 'https://www.thenewboston.com/' + link.get('href')
            title = link.string
            get_words(href)
        page += 1


search_spider(2)
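One way to get the "No text in post" behaviour: stripped_strings is a generator, so collect it into a list first and test whether it is empty. A minimal sketch, with print_post_text as a hypothetical helper that could replace the last loop in get_words:

def print_post_text(item_descript):
    # Materialise the generator so we can check for emptiness first.
    strings = list(item_descript.stripped_strings)
    if strings:
        for string in strings:
            print(repr(string))
    else:
        print("No text in post")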
0 Saili Ghavat · May 26, 2015
Is the trade page removed? I am trying to write the web crawler code, but it gives me a list of errors like:
  
File "C:\Users\Saili\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 573, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\Saili\AppData\Roaming\Python\Python34\site-packages\requests\adapters.py", line 415, in send
    raise ConnectionError(err, request=request)
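That ConnectionError is raised by requests when the server cannot be reached, and yes, the trade page is gone (see sfolje's reply above for a working target URL). Wrapping the request in a try/except keeps the crawler alive; a minimal sketch:

import requests

url = "https://buckysroom.org/trade/search.php?page=1"  # the removed trade page
try:
    source_code = requests.get(url)
    source_code.raise_for_status()  # turn HTTP errors into exceptions too
except requests.exceptions.RequestException as err:
    print("Request failed:", err)   # covers ConnectionError, redirect loops, etc.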
