[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+33 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # just get the code, no headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects can be searched through easily
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = "https://buckysroom.org" + link.get('href')
        print(href)


trade_spider(1)
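
(Note: newer versions of BeautifulSoup warn if no parser is named when the soup object is built. A minimal tweak, assuming the built-in html.parser is acceptable for this page:)

soup = BeautifulSoup(plain_text, "html.parser")  # name a parser explicitly so bs4 does not have to guess one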

Replies

0 sfolje 0 · November 7, 2015
Maybe your URL doesn't exist. Check it out.

Post your code.
0 Poz Pozeidon · April 23, 2017
I want to add some functionality to this web crawler: when crawling a website like nytimes.com, I want to obtain all the text on the site and collect it together with the URLs it came from. What do you recommend?
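
A minimal sketch of one way to do that with requests and BeautifulSoup (the nytimes.com start page, the page limit, and the absolute-URL filter are all assumptions; a real crawl would also need to respect robots.txt and rate limits):

import requests
from bs4 import BeautifulSoup

def crawl_text(start_url, max_pages=5):
    # tiny breadth-first crawl that collects (url, page text) pairs
    to_visit = [start_url]
    seen = set()
    results = []
    while to_visit and len(results) < max_pages:
        url = to_visit.pop(0)
        if url in seen:
            continue
        seen.add(url)
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        results.append((url, soup.get_text(' ', strip=True)))  # all visible text on the page
        for link in soup.findAll('a', href=True):
            href = link['href']
            if href.startswith('http'):  # keep it simple: follow absolute links only
                to_visit.append(href)
    return results

# e.g. crawl_text('https://www.nytimes.com/', max_pages=3)
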
0 Gunseli Ozkan · January 28, 2016
Hello Bucky, I do whatever you say, but my program prints out nothing. By the way, I guess you closed the trading webpage, and I couldn't find anything similar to it. Can you guys help me out?
0 Tanner Hoke · January 29, 2015
import requests
from bs4 import BeautifulSoup

source_code = requests.get('https://www.thenewboston.com/forum/topic.php?id=1610')
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
print(str(soup.find('code')).replace('<code>', '').replace('<br>', '\n').replace('?', '\t').replace('</code>', '').replace('</br>', ''))
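
A possibly simpler alternative to the chain of replace() calls, assuming the goal is just the text inside the <code> tag: get_text() with a separator roughly keeps one line per text node, so the <br> breaks survive.

code_tag = soup.find('code')
if code_tag is not None:  # the topic page may not contain a <code> block at all
    print(code_tag.get_text('\n'))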
0 Steve Gregg · January 5, 2016
I am doing this Python tutorial and decided to write my own web crawler. It gets all the stories from slashdot.org and saves them to an HTML file so you can read them offline. Any feedback or comments are appreciated.


import requests
from bs4 import BeautifulSoup

def story_extractor(href):
    source = requests.get(href)  # get the page containing the story
    plain_text = source.text  # save the plain text version of the source code in a variable
    soup = BeautifulSoup(plain_text, 'lxml')  # convert the source to a Beautiful Soup object
    paragraph = soup.findAll('div', {'class': 'p'})  # find the <div> tag where the class is 'p'
    story_text = str(paragraph)  # convert the story text found to a string
    length = len(story_text)  # find the length of the story in characters to use in the next line
    return story_text[1:length-1]  # return the story minus the first and last characters, which were square brackets "[ ]"


def slashdot_crawler(pagesToCrawl):  # begin definition of slashdot_crawler function
    sd = open('slashdot.html', 'w')  # create html file to write stories to
    sd.write('<html><head><title>Slashdot stories</title></head><body><h1>The latest Slashdot stories</h1><br>')  # write the initial HTML tags to the file to start the page
    page = 0  # initialize page to 0 since slashdot starts counting their pages at 0
    while page < pagesToCrawl:  # keep looping until we get to the defined number of pages to crawl
        print("Processing page " + str(page+1) + " of " + str(pagesToCrawl) + "\n")  # prompt user on progress
        url = "http://slashdot.org/?page=" + str(page)  # assign the slashdot url to a variable
        source = requests.get(url)  # get the source code of the page
        plain_text = source.text  # save the plain text version of the source code in a variable
        soup = BeautifulSoup(plain_text, 'lxml')  # convert the source to a Beautiful Soup object
        for link in soup.findAll('span', {'class': 'story-title'}):  # loop through all the <span> tags with their class = story-title
            href = link.a.get("href")  # get the link to the story
            title = link.a.string  # store the story title
            # print(title)
            # print('http:' + href)
            # print('\n')
            sd.write('<p><a href="' + href + '">' + title + '</a><br>')  # for each story start a paragraph and create a link to the original story using the story title
            story_link = "http:" + href  # add "http:" to the beginning of the url since slashdot links start with //
            story_contents = story_extractor(story_link)  # pass the link to the story extractor function which returns the actual text content of the story
            sd.write(story_contents)  # write the story content to the html file
            sd.write("</p><br>")  # close the html paragraph and insert a return
        page += 1  # increment page
    sd.write('</body></html>')  # once the loop exits, no more stories so we can close the html page
    sd.close()  # close the html file we have been writing to


slashdot_crawler(3)  # call the function to crawl slashdot.org
print("Processing complete")  # prompt user to show that the program is done

0 Mark Drew · April 23, 2017
Hello all

I have tried to amend Bucky's code to work with a different website, given the one in the tutorial is no longer available. Since I am actually trying to follow the principle rather than just typing the code verbatim, it is probably a good thing.

Anyway, here is the code I have come up with to search a local buy and sell site.


import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
   page = 1
   while page <= max_pages:
       url = 'https://www.gumtree.com.au/s-appliances/page-' + str(page) + '/c20088'
       source_code = requests.get(url)
       plain_text = source_code.text
       soup = BeautifulSoup(plain_text)
       for link in soup.findAll('a', {'class': 'ad-listing__title-link'}):
           href = 'https://www.gumtree.com.au' + link.get('href')
           title = link.string
           print(href)
           print(title)
           page += 1

trade_spider(1)




I know it is because I am not understanding the HTML properly, but I am not seeing what I expect to see.


  • Rather than seeing all the ads, I only recognise the last 8 (out of 30) hrefs.

  • And for the title, I get 'none' for each item.



I was hoping someone could have a look and help me understand why it is behaving like this.

Thanks
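
Two hedged guesses about what is going on. First, link.string is None whenever the <a> tag contains more than one child (for example a nested <span> around the text), which would explain the 'None' titles; link.get_text() flattens everything inside the tag instead:

title = link.get_text(strip=True)  # combined text of the tag, even when link.string is None

Second, page += 1 sits inside the for loop in the code as posted, so it advances once per link rather than once per page; it probably belongs at the while level, as in Bucky's original.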
0 Daniel Stoica · February 1, 2015
It's kind of hard to use this tutorial, considering that there's nothing on your trade page. :(
0 Ola Berglund · January 4, 2015
I tried to solve the problem Bucky gave us, but I couldn't figure it out, so I went to watch these spoilers. Now I'm completely lost. What is this "('<br>',)"? I really need to understand, since a web crawler is basic stuff!
0 Probuddha N · April 17, 2015
I used the same code (changing the page attributes) to get data from Truelocal.com.au.


import requests
from bs4 import BeautifulSoup

def tru_crawler(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.truelocal.com.au/find/car-rental/' + str(page)
        code = requests.get(url)
        text = code.text
        soup = BeautifulSoup(text)
        for link in soup.findAll('a', {'class': 'name'}):
            href = 'http://www.truelocal.com.au' + link.get('href')
            title = link.string
            # print (href)
            # print (title)
            get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    code = requests.get(item_url)
    text = code.text
    soup = BeautifulSoup(text)
    for item_name in soup.findAll('h1', {'itemprop': 'name'}):
        print item_name.string
    for link in soup.findAll('a'):
        href = 'http://www.truelocal.com.au' + link.get('href')
        print (href)


tru_crawler(2)


After getting the first page's title and all the links from that page, I got the following error:


Traceback (most recent call last):
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 30, in <module>
    tru_crawler(2)
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 16, in tru_crawler
    get_single_item_data(href)
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 26, in get_single_item_data
    href = 'http://www.truelocal.com.au' + link.get('href')
TypeError: cannot concatenate 'str' and 'NoneType' objects


Please advise
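
For what it is worth, that TypeError usually means one of the <a> tags on the page has no href attribute, so link.get('href') returns None and cannot be concatenated to a string. One hedged fix is to ask BeautifulSoup only for anchors that actually carry an href:

for link in soup.findAll('a', href=True):  # skips <a> tags without an href attribute
    href = 'http://www.truelocal.com.au' + link.get('href')
    print (href)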
0 Ola Berglund · January 4, 2015
Yeah, I had totally forgotten about this. Thanks for reminding me that they are in his videos, great help!!
