[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+39 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # just get the code, no headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects make the HTML easy to search
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = "https://buckysroom.org" + link.get('href')
        print(href)


trade_spider(1)


Replies

0 john lalani · January 20, 2016
How would I go about making it so that, if a post has no text and is just a picture, the output is replaced with a string like "No text in post"?


import requests
from bs4 import BeautifulSoup

def get_words(item_url):
    token = 0
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for item_descript in soup.findAll('div', {'class': 'post-content'}):
        for item_like in soup.findAll('a', {'class': 'post-likes-count'})[token]:
            print(item_like.string)
        token += 1
        for string in item_descript.stripped_strings:
            print(repr(string))


def search_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/search.php?type=1&sort=pop&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'user-name'}):
            href = 'https://www.thenewboston.com/' + link.get('href')
            title = link.string
            get_words(href)
        page += 1


search_spider(2)
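
One possible approach, sketched under the assumption that the post body is the same 'post-content' div used above: collect the stripped strings first, and fall back to a placeholder when there are none.


import requests
from bs4 import BeautifulSoup

def get_words_or_placeholder(item_url):
    # assumes posts live in <div class="post-content">, as in the code above
    soup = BeautifulSoup(requests.get(item_url).text, "html.parser")
    for post in soup.findAll('div', {'class': 'post-content'}):
        strings = list(post.stripped_strings)  # every text fragment, whitespace stripped
        if strings:
            for s in strings:
                print(repr(s))
        else:
            print("No text in post")  # picture-only post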
0 Gunseli Ozkan · January 28, 2016
Hello Bucky, I did everything you said, but my program prints out nothing. BTW, I guess you closed the trading web page and I couldn't find anything similar to it. Can you guys help me out?
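
A quick sanity check, assuming nothing beyond the original script: print what the server actually returns before parsing, since a closed or moved page often comes back as a redirect or an error page rather than a crash.


import requests
from bs4 import BeautifulSoup

url = "https://buckysroom.org/trade/search.php?page=1"  # original tutorial URL
response = requests.get(url)  # raises ConnectionError if the host is gone entirely
print(response.status_code)   # 200 means OK; 4xx/5xx explains the silent output
print(response.history)       # non-empty if the request was redirected
soup = BeautifulSoup(response.text, "html.parser")
print(len(soup.findAll('a', {'class': 'item-name'})))  # 0 if the markup changed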
0 sfolje 0 · January 30, 2016
[ Edited 15.2.2016, working as of that date ]

Tip: when you copy and paste this, use a Find & Replace tool on the stray question marks. Warning: don't replace them with spaces, because that causes indentation errors; replace them with nothing (an empty string, ""), then re-indent everything from the beginning.


#Description:
#This program goes to https://thenewboston.com/search.php?type=0&sort=reputation , visits every user's profile, and prints the first few (approx. 20) links to that user's latest photos. To view a photo, click the URL or copy it into a web browser.


import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):         #this function gets links to user's profiles
    page = 1
    while page <= max_pages:
        url = 'https://thenewboston.com/search.php?type=0&sort=reputation&page=' + str(page)
        source_code = requests.get(url, allow_redirects=False)
        plain_text = source_code.text.encode('ascii', 'replace')
        # soup = BeautifulSoup(plain_text) #this line causes error! (uncomment, to learn new things)
        soup = BeautifulSoup(plain_text,'html.parser')
        for link in soup.findAll('a', {'class': 'user-name'}): #all profiles
            print(' <<---BEGINNING OF LINK--->>')
            print('link: ',link)
            href = link.get('href')
            title = link.string
            print('href: ' ,href)
            print('title: ',title)
            get_single_item_data(href)        # comment this out for easier understanding
            print(' <<---END OF link--->>')
        print('page: ',page)
        page += 1

def get_single_item_data(item_url):    # now on the user's profile, print links to the user's photos

    print(' <<--- BEGINNING OF get_single_item_data() --->>')
    source_code = requests.get(item_url)
    plain_text = source_code.text
    # soup = BeautifulSoup(plain_text) #this causes error!! (uncomment, to learn new things)
    soup = BeautifulSoup(plain_text,"lxml") #use this line, to avoid error!!
    for item_name in soup.findAll('img', {'class': 'img-responsive'}): # all photos of the user
        print('item_name :',item_name)
        photo='https://thenewboston.com'+item_name.get('src')
        print('Click the link to open the photo: ', photo)    
    print(' <<--- END OF get_single_item_data() --->>')

trade_spider(1)

I hope this code helps. It prints a lot of stuff so you can see what is going on in the code and what values the variables hold. Please give me feedback on whether it is too messy, helpful, or whatever.
0 Dimitar Vasilev · January 30, 2016
import requests
from bs4 import BeautifulSoup

the_url = 'https://www.thenewboston.com/forum/topic.php?id=1610'

def getting_the_info(url):
    the_real_url = url
    response = requests.get(the_real_url)
    plain_text = response.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for line in soup.find_all('code'):
        line = str(line).replace('<br>', '\n')
        if soup.find_all('td', {'class': 'third-column'}):
            break
        print(line)

# print(plain_text)
getting_the_info(the_url)
0 Kaim Syed · April 11, 2016
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
 File "C:/Users/NAEEM/PycharmProjects/main.py/main.py", line 36, in <module>
   trade_spider(1)
 File "C:/Users/NAEEM/PycharmProjects/main.py/main.py", line 9, in trade_spider
   source_code = requests.get(url)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\api.py", line 67, in get
   return request('get', url, params=params, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\api.py", line 53, in request
   return session.request(method=method, url=url, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\sessions.py", line 468, in request
   resp = self.send(prep, **send_kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\sessions.py", line 576, in send
   r = adapter.send(request, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\adapters.py", line 437, in send
   raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='buckysroom.org', port=443): Max retries exceeded with url: /trade/search.php?page=1 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x0317BC10>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
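
The traceback bottoms out in a failed TCP connection, meaning buckysroom.org is no longer reachable. A minimal sketch of how to fail gracefully while pointing the crawler elsewhere (the message text is illustrative):


import requests

url = "https://buckysroom.org/trade/search.php?page=1"  # dead host from the traceback
try:
    source_code = requests.get(url, timeout=10)
except requests.exceptions.ConnectionError as err:
    print("Host unreachable; point the crawler at a live site instead:", err)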
0 Aditya Joshi · April 28, 2016
I was getting the html parser warning, so I added a parser argument after plain_text, but I am still getting the following error:

Traceback (most recent call last):
 File "file path", line 22, in <module>
   trade_spider(1)
 File "file path", line 13, in trade_spider
   soup = BeautifulSoup(plain_text, "html parser")

bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: html parser. Do you need to install a parser library??"
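
For what it's worth, the traceback points at the parser name itself: BeautifulSoup expects "html.parser" with a dot, and raises bs4.FeatureNotFound for any name it does not recognize. A minimal illustration:


from bs4 import BeautifulSoup

plain_text = "<p>hello</p>"  # stand-in document
soup = BeautifulSoup(plain_text, "html.parser")  # built-in parser, always available
# BeautifulSoup(plain_text, "html parser")  # missing dot -> bs4.FeatureNotFound
# BeautifulSoup(plain_text, "lxml")         # third-party parser; needs: pip install lxml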
+1 David Refaeli · May 3, 2016
So this is if you want all the source-code :-) 


import requests
from bs4 import BeautifulSoup

def search_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://thenewboston.com/forum/category.php?id=15&orderby=recent&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'title text-semibold'}):
            str1 = link.string
            str2 = '[source code]'
            if str2 in str1:
                href = link.get('href')
                print(href)
                get_source(href)
        page += 1

def get_source(url):
    source = requests.get(url)
    text = source.text
    soup = BeautifulSoup(text, "html.parser")
    for code in soup.find('code'):
        line = str(code).replace('<br>', '\n')
        soup2 = BeautifulSoup(line, "html.parser")
        script = soup2.get_text()
        script = script.replace('?', ' ')
        script = script.replace('Â', ' ')
        print(script)


search_spider(22)
# so far there are 22 pages in the forum
0 Carlos G. · July 13, 2016
Hi Bucky. Thanks for these useful tutorials. I have two questions. Thanks for your reply:

1) How would you set a different "user-agent" in this script so you won't be banned when scraping another website?

2) How can you test that your "fake" user-agent is really working (how can you retrieve its value) before running the script on a website?

Thanks!
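
A sketch for both questions, assuming nothing about the target site: requests accepts a headers dict on every call, and an echo service such as httpbin.org returns whatever User-Agent was actually sent.


import requests

# 1) Send a custom User-Agent with every request (the string here is just an example).
headers = {'User-Agent': 'Mozilla/5.0 (compatible; MyCrawler/1.0)'}
source_code = requests.get('https://www.thenewboston.com/', headers=headers)

# Inspect what was attached to the outgoing request, without any extra network call:
print(source_code.request.headers['User-Agent'])

# 2) Verify end-to-end: httpbin.org echoes the request's User-Agent back as JSON.
echo = requests.get('https://httpbin.org/user-agent', headers=headers)
print(echo.json())  # {'user-agent': 'Mozilla/5.0 (compatible; MyCrawler/1.0)'}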
0 Muhammad Talha Zaroon · January 19, 2017
Here's my code and I am getting the following error, please help:

import requests
from bs4 import BeautifulSoup

def web_crawler(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://thenewboston.com/search.php?page=" + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)

        for link in soup.findAll('a', {'class': 'user-name'}):
            title = link.string
            href = link.get('href')
            #print(href)
            #print(title)
            get_single_item(href)
        page += 1


def get_single_item(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.find_all('span', {'class': 'text-semibold'}):
        print(item_name.string)


web_crawler(1)

[screenshot of the error: /images/forum/upload/2017-01-07/dd7a9f8283911a0808c415fb20d7cf52.png]
0 Shashikanth Reddy Palvatla · April 23, 2017
@Muhammad Talha Zaroon

Just like the error message says, replace the following line in your code:

soup = BeautifulSoup(plain_text)

with:

soup = BeautifulSoup(plain_text, "html.parser")
