[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+39 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # .text gives just the page source as a string, no headers or anything
        plain_text = source_code.text
        # a BeautifulSoup object makes the HTML easy to search through
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = "https://buckysroom.org" + link.get('href')
        print(href)


trade_spider(1)


Replies

0 Poz Pozeidon · April 23, 2017
I want to add some functionality to this web crawler: when crawling a website like nytimes.com, I want to collect all the text on the site and store it together with the corresponding URLs. What do you recommend?
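Roughly, this is what I have so far, using requests + BeautifulSoup like in the tutorial. It is just a sketch: the nytimes.com start URL and the very naive "keep any absolute link" filter are only placeholders.

import requests
from bs4 import BeautifulSoup


def crawl_text(start_url, max_pages=10):
    # rough sketch: collect (url, visible page text) pairs starting from one page
    to_visit = [start_url]
    seen = set()
    results = {}
    while to_visit and len(results) < max_pages:
        url = to_visit.pop()
        if url in seen:
            continue
        seen.add(url)
        plain_text = requests.get(url).text
        soup = BeautifulSoup(plain_text, 'html.parser')
        # get_text() drops the tags and keeps only the visible text
        results[url] = soup.get_text(separator=' ', strip=True)
        # queue any absolute links found on this page (very naive filter)
        for link in soup.findAll('a'):
            href = link.get('href')
            if href and href.startswith('http'):
                to_visit.append(href)
    return results


pages = crawl_text('https://www.nytimes.com')
for url, text in pages.items():
    print(url)
    print(text[:200])  # first 200 characters of each page's text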
0 Mark Drew · April 23, 2017
Hello all

I have tried to amend Bucky's code to work with a different website, given the one in the tutorial is no longer available. Since I am actually trying to follow the principle rather than just typing the code verbatim, it is probably a good thing.

Anyway, here is the code I have come up with to search a local buy and sell site.


import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.gumtree.com.au/s-appliances/page-' + str(page) + '/c20088'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'ad-listing__title-link'}):
            href = 'https://www.gumtree.com.au' + link.get('href')
            title = link.string
            print(href)
            print(title)
        page += 1

trade_spider(1)




I know it is because I am not understanding the HTML properly, but I am not seeing what I expect to see:


  • Rather than seeing all the ads, I only recognise the last 8 (out of 30) hrefs.

  • And for the title, I get None for each item.



I was hoping someone could have a look and help me understand why it is behaving like this.

Thanks
+1 Mark Drew · April 23, 2017
Hi All

Earlier today I posted a question (it hasn't yet been approved by the moderators). The question I posed was why the results did not match what I saw at the URL. The answer is that the website I used was changing too frequently; I used a more static listing and got the result I was after.
I also wondered why my code was giving None for all the titles. I have since found that was because the a tag did not hold the title as a single plain string, so .string came back as None (there is a tiny illustration of this below the script). I have been able to achieve the same result as Bucky with the following script:


import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.gumtree.com.au/s-property-for-rent/hobart-cbd-hobart/page-' + str(page) + '/c18364l3000302'
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        # pair each name span with the matching ad link instead of re-looping
        # over every link for every title (which left href stuck on the last ad)
        names = soup.findAll('span', {'itemprop': 'name'})
        links = soup.findAll('a', {'class': 'ad-listing__title-link'})
        for name, link in zip(names, links):
            title = name.string
            href = 'https://www.gumtree.com.au' + link.get('href')
            print(title)
            print(href)

        page += 1

trade_spider(2)
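For anyone who hits the same None problem: as far as I can tell, .string only gives you something back when the tag boils down to a single plain string. As soon as the tag has several children (other tags mixed with text), .string comes back as None and you need get_text() instead. A tiny made-up illustration:

from bs4 import BeautifulSoup

# made-up snippet, roughly the shape of a listing link
snippet = '<a href="/ad/1"><span itemprop="name">Nice flat</span> <span class="price">$300</span></a>'
soup = BeautifulSoup(snippet, 'html.parser')

link = soup.find('a')
print(link.string)      # None - the <a> holds two spans, not one plain string
print(link.get_text())  # 'Nice flat $300' - get_text() gathers the nested text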
0 Shashikanth Reddy Palvatla · April 23, 2017
@Muhammad Talha Zaroon

just like the error message says, change the following line in your code

soup = BeautifulSoup(plain_text)

to

soup = BeautifulSoup(plain_text, "html.parser")
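In context, the top of the function would then look something like this ("html.parser" ships with Python's standard library; "lxml" also works if you have it installed):

import requests
from bs4 import BeautifulSoup

source_code = requests.get("https://thenewboston.com/search.php?page=1")
plain_text = source_code.text
# naming the parser explicitly avoids the "couldn't find a tree builder" error
soup = BeautifulSoup(plain_text, "html.parser")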
0 Muhammad Talha Zaroon · January 19, 2017
here's my code and I am getting the error shown in the attached screenshot, please help:

import requests
from bs4 import BeautifulSoup

def web_crawler(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://thenewboston.com/search.php?page=" + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)

        for link in soup.findAll('a', {'class': 'user-name'}):
            title = link.string
            href = link.get('href')
            #print(href)
            #print(title)
            get_single_item(href)
        page += 1


def get_single_item(item_url):
    source_code = request.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.find_all('span', {'class': 'text-semibold'}):
        print(item_name.string)


web_crawler(1)

[error screenshot: /images/forum/upload/2017-01-07/dd7a9f8283911a0808c415fb20d7cf52.png]
0 Carlos G. · July 13, 2016
Hi Bucky. Thanks for these useful tutorials. I have two questions:

1) How would you set a different "user-agent" in this script so you won't be banned when scraping another website?

2) How can you test that your "fake" user-agent is really working (how can you retrieve its value) before running the script on a website?

Thanks!
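Edit: from what I have read, something like this might be the way to do it, but I am not sure it is right. The user-agent string below is made up, and httpbin.org/user-agent is just one endpoint I found that echoes back the header it receives:

import requests
from bs4 import BeautifulSoup

# made-up user-agent string, just to see whether the header actually changes
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) my-little-crawler/0.1'}

# httpbin.org/user-agent echoes back the User-Agent it received,
# so this shows what a server actually sees before crawling anything
check = requests.get('https://httpbin.org/user-agent', headers=headers)
print(check.json())

# then the same headers dict would get passed to every request in the spider
source_code = requests.get('https://buckysroom.org/trade/search.php?page=1', headers=headers)
soup = BeautifulSoup(source_code.text, 'html.parser')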
+1 David Refaeli · May 3, 2016
So this is if you want all the source-code :-) 


import requests
import re
from bs4 import BeautifulSoup

def search_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://thenewboston.com/forum/category.php?id=15&orderby=recent&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'title text-semibold'}):
            str1 = link.string
            str2 = '[source code]'
            if str2 in str1:
                href = link.get('href')
                print(href)
                get_source(href)
        page += 1

def get_source(url):
    source = requests.get(url)
    text = source.text
    soup = BeautifulSoup(text, "html.parser")
    for code in soup.find('code'):
        line = str(code).replace('<br>', '\n')
        soup2 = BeautifulSoup(line, "html.parser")
        script = soup2.get_text()
        script = script.replace('?', ' ')
        script = script.replace('Â', ' ')
        print(script)


search_spider(22)
#so far there's 22 pages for the forum
0 Aditya Joshi · April 28, 2016
I was getting the html.parser error so I added "htl.parser" to the plain_text. Now getting this error

"I am getting the html parser error so i added "html.parser"after plain_text, but then I am still getting the following error

Traceback (most recent call last):
 File "file path", line 22, in <module>
   trade_spider(1)
 File "file path", line 13, in trade_spider
   soup = BeautifulSoup(plain_text, "html parser")

bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: html parser. Do you need to install a parser library??"
0 Kaim Syed · April 11, 2016
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
 File "C:/Users/NAEEM/PycharmProjects/main.py/main.py", line 36, in <module>
   trade_spider(1)
 File "C:/Users/NAEEM/PycharmProjects/main.py/main.py", line 9, in trade_spider
   source_code = requests.get(url)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\api.py", line 67, in get
   return request('get', url, params=params, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\api.py", line 53, in request
   return session.request(method=method, url=url, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\sessions.py", line 468, in request
   resp = self.send(prep, **send_kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\sessions.py", line 576, in send
   r = adapter.send(request, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\adapters.py", line 437, in send
   raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='buckysroom.org', port=443): Max retries exceeded with url: /trade/search.php?page=1 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x0317BC10>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
0 Dimitar Vasilev · January 30, 2016
import requests
from bs4 import BeautifulSoup

the_url = 'https://www.thenewboston.com/forum/topic.php?id=1610'
def getting_the_info(url):
    the_real_url = url
    response = requests.get(the_real_url)
    plain_text = response.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for line in soup.find_all('code'):
        line = str(line).replace('<br>', '\n')
        if soup.find_all('td', {'class': 'third-column'}):
            break
        print(line)

# print(plain_text)
getting_the_info(the_url)
