[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # just get the page body; no need to inspect headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects are easy to search through
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = "https://buckysroom.org" + link.get('href')
        print(href)


trade_spider(1)
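
A small variation on the same crawler, sketched under the assumption that a site might refuse requests' default User-Agent; the header value here is illustrative, not something from the tutorial:

import requests
from bs4 import BeautifulSoup


def trade_spider_with_headers(max_pages):
    # hypothetical browser-like header; some sites block the default one
    headers = {'User-Agent': 'Mozilla/5.0 (tutorial crawler)'}
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        plain_text = requests.get(url, headers=headers).text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'item-name'}):
            print("https://buckysroom.org" + link.get('href'))
            print(link.string)
        page += 1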

Replies

nmelssx · September 25, 2015
I have web crawler code in PHP, but it only gets the text, without the CSS styling of the website. Is that normal or not?
Simon Ward · September 12, 2015
Here is my solution to the challenge, although I took some of it from Patrick.
It can now browse all of this forum to search for the source code of these Python tutorials and save it directly to a .py file.


import requests
from bs4 import BeautifulSoup

def code_spider(max_pages, ttnr, specifier):  # max forum pages, tutorial number, filename specifier
    dest_url = 'source' + str(ttnr) + '_' + specifier + '.py'
    page = 1
    done = False
    while page <= max_pages:
        url = 'https://www.thenewboston.com/forum/category.php?id=15&orderby=recent&page=' + str(page)
        source_code = requests.get(url)                 # fetch the source code
        plain_text = source_code.text                   # turn the response into plain text
        soup = BeautifulSoup(plain_text, "html.parser") # load the plain text into a BeautifulSoup object
        for link in soup.findAll('a', {'class': 'post-title'}):  # loop through the source and find links (a) with the class 'post-title'
            href = 'https://www.thenewboston.com' + link.get('href')
            title = link.string                         # takes the string value
            if title.startswith('[source code] Python Programming Tutorial - ' + str(ttnr)):
                print(title)
                print(href)
                done = True
                break
        if done:
            code_getter(href, dest_url)
            break
        page += 1

def code_getter(code_url, rl):
    codeExtract = requests.get(code_url)
    plainText = codeExtract.text
    soup = BeautifulSoup(plainText, "html.parser")
    with open(rl, "w") as fx:
        # walk the children of the first <code> tag and undo the forum's HTML formatting
        for results in soup.find('code'):
            resultLine = str(results).replace('<br>', '\n').replace('\ufffd', '  ').replace('&lt;', '<').replace('</br>', '')
            fx.write(resultLine + "\n")

code_spider(20, 25, 'web_crawler')
Halcyon Abraham Ramirez · June 3, 2015

from selenium import webdriver


class Bucky:
    def __init__(self):
        self.driver = webdriver.Firefox()
        self.driver.get("https://www.thenewboston.com/forum/topic.php?id=1610")
        self.parse()

    def parse(self):
        code = self.driver.find_element_by_xpath("//code[@class=' hljs bash']").text
        print(code)

a = Bucky()

This script could be made even shorter, but I used a class just for practice.

This outputs everything in the source code Bucky gave. No more picking out the <br> tags and so on; it outputs the code exactly as you see it on the page.




from selenium import webdriver

def parse():
    driver = webdriver.Firefox()
    driver.get("https://www.thenewboston.com/forum/topic.php?id=1610")
    code = driver.find_element_by_xpath("//code[@class=' hljs bash']").text
    print(code)

parse()

This is the short version without the class, using only a function.



from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.thenewboston.com/forum/topic.php?id=1610")
code = driver.find_element_by_xpath("//code[@class=' hljs bash']").text
print(code)



An even shorter version without using functions... OK, now I'm just showing off. XD
yiting chua · June 1, 2015
Hey NewBoston, I tried to follow your tutorial, but I'm having some issues on Tut 26.


import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        # (the rest of the paste was cut off by the forum)

I got the following error when I ran it:

Traceback (most recent call last):
  File "C:/Users/User/PycharmProjects/Basic/WebCrawlerTest.py", line 19, in <module>
    trade_spider(2)
  File "C:/Users/User/PycharmProjects/Basic/WebCrawlerTest.py", line 9, in trade_spider
    source_code = requests.get(url)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\api.py", line 69, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\api.py", line 50, in request
    response = session.request(method=method, url=url, **kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 465, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 594, in send
    history = [resp for resp in gen] if allow_redirects else []
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 594, in <listcomp>
    history = [resp for resp in gen] if allow_redirects else []
  File "C:\Users\User\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 114, in resolve_redirects
    raise TooManyRedirects('Exceeded %s redirects.' % self.max_redirects)
requests.exceptions.TooManyRedirects: Exceeded 30 redirects.


Anyone able to help?
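
One way to see what is going on, assuming the tutorial's buckysroom.org URL now redirects in a loop (TooManyRedirects usually means the server keeps bouncing the request around): fetch a page without following redirects and look at where it points.

import requests

# fetch one page but stop at the first redirect instead of following the chain
url = "https://buckysroom.org/trade/search.php?page=1"
response = requests.get(url, allow_redirects=False)
print(response.status_code)               # e.g. 301 or 302 if it redirects
print(response.headers.get('Location'))  # where the server is sending you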
Saili Ghavat · May 26, 2015
Is the trade page removed? I am trying to write the web crawler code, but it gives me a list of errors like:
File "C:\Users\Saili\AppData\Roaming\Python\Python34\site-packages\requests\sessions.py", line 573, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\Saili\AppData\Roaming\Python\Python34\site-packages\requests\adapters.py", line 415, in send
    raise ConnectionError(err, request=request)
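
If the trade page really is gone, a minimal guard, assuming you just want the crawler to report the failure instead of crashing:

import requests

url = "https://buckysroom.org/trade/search.php?page=1"  # the tutorial's URL; the page may no longer exist
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # also treat 4xx/5xx status codes as errors
except requests.exceptions.RequestException as err:
    print("Request failed:", err)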
Mehmet Kıdıman · May 14, 2015
Import the package, then click the light bulb and select the "Install package" option.
Mehmet Kıdıman · May 14, 2015

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        # (the rest of the paste was cut off by the forum)

Half working, no error. I tried it on a shopping website and I couldn't get the "href" values anyhow. You may check it; if you have a solution, please share it.
Chandan Chaudhary · May 9, 2015
import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/forum/search_forums.php?s=&orderby=popular&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'category-title'}):
            href = 'https://www.thenewboston.com/' + link.get('href')
            title = link.string
            print(href)
            print(title)
        page =+ 1
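        # note: '=+ 1' parses as 'page = (+1)', so page stays 1 and the
        # while loop never ends; it should be 'page += 1'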

trade_spider(1)

# Why is this program not stopping? Help me!
Probuddha N · April 17, 2015
I used the same code (changing the page attributes) to get data from Truelocal.com.au.


import requests
from bs4 import BeautifulSoup

def tru_crawler(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.truelocal.com.au/find/car-rental/' + str(page)
        code = requests.get(url)
        text = code.text
        soup = BeautifulSoup(text, "html.parser")
        for link in soup.findAll('a', {'class': 'name'}):
            href = 'http://www.truelocal.com.au' + link.get('href')
            title = link.string
            # print(href)
            # print(title)
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    code = requests.get(item_url)
    text = code.text
    soup = BeautifulSoup(text, "html.parser")
    for item_name in soup.findAll('h1', {'itemprop': 'name'}):
        print(item_name.string)
    for link in soup.findAll('a'):
        href = 'http://www.truelocal.com.au' + link.get('href')
        print(href)


tru_crawler(2)


and after getting the first page title and all the links from that page, I got the following error:


Traceback (most recent call last):
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 30, in <module>
    tru_crawler(2)
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 16, in tru_crawler
    get_single_item_data(href)
  File "C:/Users/PB/PycharmProjects/crawler/new-crawler.py", line 26, in get_single_item_data
    href = 'http://www.truelocal.com.au' + link.get('href')
TypeError: cannot concatenate 'str' and 'NoneType' objects


Please advise.
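
A likely cause: some <a> tags have no href attribute, so link.get('href') returns None, and concatenating a string with None raises exactly that TypeError. A minimal guard, sketched against the same site:

import requests
from bs4 import BeautifulSoup

def get_single_item_data(item_url):
    soup = BeautifulSoup(requests.get(item_url).text, "html.parser")
    for link in soup.findAll('a'):
        href = link.get('href')
        if href is None:  # skip anchors that have no href attribute
            continue
        print('http://www.truelocal.com.au' + href)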
Daniel Stoica · February 1, 2015
It's kind of hard to use this tutorial, considering that there's nothing on your trade page. :(
