[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+37 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # just get the code, no headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects can be searched through easily
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = "https://buckysroom.org" + link.get('href')
        print(href)


trade_spider(1)
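
Several replies below point out that the trade page from the video has since been taken down, and newer BeautifulSoup releases warn when no parser is named. A minimal, hedged sketch of two small robustness tweaks (the URL is the one from the video and may well be dead by now):

import requests
from bs4 import BeautifulSoup

# minimal robustness tweaks (assumes the tutorial URL, which may no longer exist):
# raise_for_status() fails loudly if the page is gone, and naming the parser
# explicitly avoids the "no parser was explicitly specified" warning in newer bs4
response = requests.get("https://buckysroom.org/trade/search.php?page=1")
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
print(len(soup.findAll('a', {'class': 'item-name'})), "item links found")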


Replies

0 sfolje 0 · January 30, 2016
[ Edited 15.2.2016; still working as of that date ]

Tip: when you copy and paste code from this forum, use Find & Replace to deal with the question marks (they stand in for tab characters). Don't replace them with spaces, because that causes indentation errors; either replace them with tabs or replace them with nothing (an empty string) and then re-indent everything from the beginning.
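
If you would rather do that cleanup in code than by hand, here is a small sketch (the file names are hypothetical); it strips only the leading run of question marks and whitespace so that question marks inside URLs survive, after which the indentation still has to be redone by hand, as the tip says:

with open("pasted_code.py", encoding="utf-8") as f:        # hypothetical file holding the pasted code
    lines = f.read().splitlines()

# strip only the leading '?' and whitespace run on each line
cleaned = [line.lstrip("? \t") for line in lines]

with open("pasted_code_clean.py", "w", encoding="utf-8") as f:
    f.write("\n".join(cleaned) + "\n")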


#Description:
#This program goes to https://thenewboston.com/search.php?type=0&sort=reputation, visits every user's profile, and prints the first few (approx. 20) links to that user's latest photos. To view a photo, click the URL or copy it into a web browser.


import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):         #this function gets links to user's profiles
    page = 1
    while page <= max_pages:
        url = 'https://thenewboston.com/search.php?type=0&sort=reputation&page==' + str(page)
        source_code = requests.get(url, allow_redirects=False)
        plain_text = source_code.text.encode('ascii', 'replace')
        # soup = BeautifulSoup(plain_text) #this line causes error! (uncomment, to learn new things)
        soup = BeautifulSoup(plain_text,'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}): #all profiles
            print(' <<---BEGINNING OF LINK--->>')
            print('link: ',link)
            href = link.get('href')
            title = link.string
            print('href: ' ,href)
            print('title: ',title)
            get_single_item_data(href)        #comment this for better understanding
            print(' <<---END OF link--->>')
        print('page: ',page)
        page += 1

def get_single_item_data(item_url):                    #I am now on a user's profile and print links to the user's photos

    print(' <<--- BEGINNING OF get_single_item_data() --->>')
    source_code = requests.get(item_url)
    plain_text = source_code.text
    # soup = BeautifulSoup(plain_text) #this causes error!! (uncomment, to learn new things)
    soup = BeautifulSoup(plain_text,"lxml") #use this line, to avoid error!!
    for item_name in soup.findAll('img', {'class': 'img-responsive'}): # all photos of the user
        print('item_name :',item_name)
        photo='https://thenewboston.com'+item_name.get('src')
        print('Click the link to open the photo: ', photo)    
    print(' <<--- END OF get_single_item_data() --->>')

trade_spider(1)

I hope this code helps. It prints a lot of output so you can see what is going on in the code and what values the variables hold. Please give me feedback if it is too messy, helpful, or whatever.
0 Gunseli Ozkan · January 28, 2016
Hello Bucky, I did everything you said, but my program prints nothing. By the way, I guess the trade webpage has been taken down, and I couldn't find anything similar to it. Can you guys help me out?
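
One quick way to check whether the page is simply gone is to look at the HTTP status code; a short sketch using the URL from the video (which may well be dead by now):

import requests

# if the old trade page redirects or returns 404, trade_spider() finds no
# 'item-name' links and therefore prints nothing; a connection error here
# would mean the whole site is unreachable
response = requests.get("https://buckysroom.org/trade/search.php?page=1",
                        allow_redirects=False)
print(response.status_code)   # anything other than 200 explains the empty output
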
0 john lalani · January 20, 2016
How would I go about making it so that if a post has no text and just a picture, it gets replaced with a string like "No text in post"?


import requests
from bs4 import BeautifulSoup

def get_words(item_url):
    token = 0
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for item_descript in soup.findAll('div', {'class': 'post-content'}):
        # print the like count belonging to this post
        for item_like in soup.findAll('a', {'class': 'post-likes-count'})[token]:
            print(item_like.string)
        token += 1
        # print every piece of text in the post body
        for string in item_descript.stripped_strings:
            print(repr(string))


def search_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/search.php?type=1&sort=pop&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'user-name'}):
            href = 'https://www.thenewboston.com/' + link.get('href')
            title = link.string
            get_words(href)
        page += 1


search_spider(2)
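
One possible way to get the "No text in post" behaviour, sketched against the same post-content markup (the sample HTML below is made up): collect the stripped strings first and fall back to a placeholder when there are none.

from bs4 import BeautifulSoup

# made-up markup standing in for one text post and one picture-only post
html = """
<div class="post-content">Some post text</div>
<div class="post-content"><img src="photo.jpg"></div>
"""
soup = BeautifulSoup(html, "html.parser")

for item_descript in soup.findAll('div', {'class': 'post-content'}):
    strings = list(item_descript.stripped_strings)
    if strings:                          # the post contains text
        for string in strings:
            print(repr(string))
    else:                                # picture-only post, use the placeholder
        print("No text in post")
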
0 Steve Gregg · January 5, 2016
I am doing this Python tutorial and decided to write my own web crawler. It gets all the stories from slashdot.org and saves them to an HTML file so you can read them offline. Any feedback or comments are appreciated.


import requests
from bs4 import BeautifulSoup

def story_extractor(href):
    source = requests.get(href)  # get the page containing the story
    plain_text = source.text  # save the plain text version of the source code in a variable
    soup = BeautifulSoup(plain_text, 'lxml')  # convert the source to a Beautiful Soup object
    paragraph = soup.findAll('div', {'class': 'p'})  # find the <div> tags where the class is 'p'
    story_text = str(paragraph)  # convert the story text found to a string
    length = len(story_text)  # find the length of the story in characters to use in the next line
    return story_text[1:length-1]  # return the story minus the first and last characters, which were square brackets "[ ]"


def slashdot_crawler(pagesToCrawl):  # begin definition of slashdot_crawler function
    sd = open('slashdot.html', 'w')  # create html file to write stories to
    sd.write('<html><head><title>Slashdot stories</title></head><body><h1>The latest Slashdot stories</h1><br>')  # write the initial HTML tags to the file to start the page
    page = 0  # initialize page to 0 since slashdot starts counting their pages at 0
    while page < pagesToCrawl:  # keep looping until we get to the defined number of pages to crawl
        print("Processing page " + str(page+1) + " of " + str(pagesToCrawl) + "\n")  # prompt user on progress
        url = "http://slashdot.org/?page=" + str(page)  # assign the slashdot url to a variable
        source = requests.get(url)  # get the source code of the page
        plain_text = source.text  # save the plain text version of the source code in a variable
        soup = BeautifulSoup(plain_text, 'lxml')  # convert the source to a Beautiful Soup object
        for link in soup.findAll('span', {'class': 'story-title'}):  # loop through all the <span> tags with their class = story-title
            href = link.a.get("href")  # get the link to the story
            title = link.a.string  # store the story title
            #print(title)
            #print('http:' + href)
            #print('\n')
            sd.write('<p><a href="' + href + '">' + title + '</a><br>')  # for each story start a paragraph and create a link to the original story using the story title
            story_link = "http:" + href  # add "http:" to the beginning of the url since slashdot links start with //
            story_contents = story_extractor(story_link)  # pass the link to the story extractor function which returns the actual text content of the story
            sd.write(story_contents)  # write the story content to the html file
            sd.write("</p><br>")  # close the html paragraph and insert a return
        page += 1  # increment page
    sd.write('</body></html>')  # once the loop exits, no more stories so we can close the html page
    sd.close()  # close the html file we have been writing to

slashdot_crawler(3)  # call the function to crawl slashdot.org
print("Processing complete")  # prompt user to show that the program is done

0 sfolje 0 · November 7, 2015
Maybe your URL doesn't exist; check it.

Post your code.
0 Randy Strong · November 7, 2015
What did I do wrong? I've tried running it with the Python 2.7 libraries as well as 3.5, and it's still giving me these errors:

/Library/Frameworks/Python.framework/Versions/3.5/bin/python3.5 /Users/CUTTERINTERACTIVE/PycharmProjects/FINAL/crawler.py
Traceback (most recent call last):
  File "/Users/CUTTERINTERACTIVE/PycharmProjects/FINAL/crawler.py", line 36, in <module>
    trade_spider(1)
  File "/Users/CUTTERINTERACTIVE/PycharmProjects/FINAL/crawler.py", line 9, in trade_spider
    source_code = requests.get(url)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/api.py", line 69, in get
    return request('get', url, params=params, **kwargs)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/api.py", line 50, in request
    response = session.request(method=method, url=url, **kwargs)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/sessions.py", line 454, in request
    prep = self.prepare_request(req)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/sessions.py", line 388, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/models.py", line 293, in prepare
    self.prepare_url(url, params)
  File "/Users/CUTTERINTERACTIVE/Library/Python/3.5/lib/python/site-packages/requests/models.py", line 353, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '1.php?page=1': No schema supplied. Perhaps you meant http://1.php?page=1?
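
For what it's worth, the MissingSchema error means the string handed to requests.get() is not a full URL: '1.php?page=1' has no http:// or https:// prefix and no host, so the base address was probably lost when the url variable was built. A minimal illustration (the fixed URL below is just an example):

import requests

url = "1.php?page=" + str(1)                  # what the traceback shows: no scheme, no host
# requests needs the complete address, including scheme and host, e.g.:
url = "https://www.thenewboston.com/search.php?type=0&sort=reputation&page=" + str(1)
response = requests.get(url)
print(response.status_code)
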
0 Halcyon Abraham Ramirez · November 5, 2015
@Balaskandan Giri

Without a code snippet or stack trace, no one can help.
0 Balaskandan Giri · November 4, 2015
HELP!!!

The crawler gets stuck if I try to call it recursively!
Any solution??
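
Without the code it is hard to say (as noted above), but recursive crawlers usually hang because they keep revisiting the same pages or recurse without bound. A generic sketch, not the original poster's code, that keeps a visited set and a depth limit:

import requests
from bs4 import BeautifulSoup

visited = set()                                 # every URL already crawled

def crawl(url, depth=0, max_depth=2):
    # stop on repeats and after a fixed depth so the recursion always terminates
    if url in visited or depth > max_depth:
        return
    visited.add(url)
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    for link in soup.findAll('a'):
        href = link.get('href')
        if href and href.startswith('http'):
            crawl(href, depth + 1, max_depth)

# crawl("https://www.thenewboston.com/")        # example starting point
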
0 anthom antho · October 8, 2015
I tried to save it to a file and PyCharm threw an encoding exception [if they are not called Exceptions in Python, I am sorry; I am originally a Java guy].

So I managed to replace the problem characters. What I did was use your code, Anton, and shorten it a bit, since the last soup-object call was redundant. Now it prints out nicely to the console.

I added a few more comments to explain what I changed, so it looks a bit longer now.
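
On the encoding exception: the usual cause is writing scraped text that contains non-ASCII characters to a file opened with the platform's default encoding; opening the file explicitly as UTF-8 avoids the UnicodeEncodeError. A minimal sketch:

# example string with non-ASCII characters, like the ones scraped pages often contain
text = "Café, naïve, München, β"
with open("output.txt", "w", encoding="utf-8") as f:   # explicit encoding prevents UnicodeEncodeError
    f.write(text)
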
0 sfolje 0 · October 7, 2015
[ Edit: this post was edited 30.1.2016 because of changes to the website at thenewboston.com (https://www.thenewboston.com/tops.php does not exist any more) ]

[ It is only a matter of time before this code stops working as well. ]

For all of you out there saying "the trade page is not available anymore", "buckysroom is error 404", "https://www.thenewboston.com/trade/search.php is error 404", or "what webpage can I use, doge?" ;) I am posting my own version, hoping it will help someone.
I believe someone before me posted similar code solving the same problem, but I am too lazy to check for sure.

Simple example:
import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):   
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/search.php?type=0&sort=reputation&page==' + str(page)
        source_code = requests.get(url, allow_redirects=False)
        plain_text = source_code.text.encode('ascii', 'replace')
        soup = BeautifulSoup(plain_text,'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}): #all profiles
            href = link.get('href')
            title = link.string
            print('href: ' ,href)
            print('title: ',title)
            #get_single_item_data(href)
        page += 1


trade_spider(1)


Full web crawler (be careful when copy-pasting it from the forum; the forum's formatting can introduce frustrating, unpredictable indentation errors):
#Description:
#This program goes to https://www.thenewboston.com/search.php?type=0&sort=reputation, visits every user's profile, and prints the first few (approx. 20) links to that user's latest photos. To view a photo, click the URL or copy it into a web browser.


import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):  # this function gets links to users' profiles
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/search.php?type=0&sort=reputation&page==' + str(page)
        source_code = requests.get(url, allow_redirects=False)
        plain_text = source_code.text.encode('ascii', 'replace')
        # soup = BeautifulSoup(plain_text) #this line causes error! (uncomment, to learn new things)
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}):  # all profiles
            print(' <<---BEGINNING OF LINK--->>')
            print('link: ', link)
            href = link.get('href')
            title = link.string
            print('href: ', href)
            print('title: ', title)
            get_single_item_data(href)  # comment this out for better understanding
            print(' <<---END OF link--->>')
        print('page: ', page)
        page += 1

def get_single_item_data(item_url):  # I am now on a user's profile and print links to the user's photos

    print(' <<--- BEGINNING OF get_single_item_data() --->>')
    source_code = requests.get(item_url)
    plain_text = source_code.text
    # soup = BeautifulSoup(plain_text) #this causes error!! (uncomment, to learn new things)
    soup = BeautifulSoup(plain_text, "lxml")  # use this line to avoid the error!!
    for item_name in soup.findAll('img', {'class': 'img-responsive'}):  # all photos of the user
        print('item_name :', item_name)
        photo = 'https://www.thenewboston.com' + item_name.get('src')
        print('Click the link to open the photo: ', photo)
    print(' <<--- END OF get_single_item_data() --->>')

trade_spider(1)

