[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+36 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # just get the code, no headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects can be searched through easily
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = "https://buckysroom.org" + link.get('href')
        print(href)


trade_spider(1)
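
One practical aside, not from the video itself: requests lets you send request headers and check the HTTP status of the response, which real crawlers usually want to do before parsing. A minimal sketch, with a hypothetical User-Agent string:

import requests

response = requests.get(
    "https://buckysroom.org/trade/search.php?page=1",
    headers={"User-Agent": "trade-spider-tutorial"},  # hypothetical UA; some sites reject requests without one
)
response.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx responses
plain_text = response.text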

Replies

0 anthom antho · October 8, 2015
I tried to save it to a file and PyCharm threw an encoding exception (if they are not called exceptions in Python, I am sorry; I am originally a Java guy).

So I managed to replace the offending characters. What I did was take your code, Anton, and shorten it a bit, since the last soup-object call was redundant. Now it prints nicely to the console.

I added a few more comments to explain what I changed, so it looks a bit longer now.
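
For reference, the usual fix for that error on Windows is to pass an explicit encoding to open(), since it otherwise defaults to a codec (often cp1252) that cannot encode every character found in scraped pages. A minimal sketch, assuming the scraped text sits in a hypothetical variable scraped_text:

with open("source25_web_crawler.py", "w", encoding="utf-8") as fx:  # filename is illustrative
    fx.write(scraped_text)  # scraped_text is a hypothetical variable holding the page text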
0 Simon Ward · September 12, 2015
Here is my solution to the challenge, although I took some of it from Patrick.
It can now browse this whole forum, search for the source code of these Python tutorials, and save it directly to a .py file.


import requests
from bs4 import BeautifulSoup

def code_spider(max_pages, ttnr, specifier):  # max forum pages to scan, tutorial number, filename suffix
    dest_url = 'source' + str(ttnr) + '_' + specifier + '.py'
    page = 1
    done = False
    while page <= max_pages:
        url = 'https://www.thenewboston.com/forum/category.php?id=15&orderby=recent&page=' + str(page)
        source_code = requests.get(url)                      # fetch the source code
        plain_text = source_code.text                        # convert it to plain text
        soup = BeautifulSoup(plain_text, "html.parser")      # load the plain text into a BeautifulSoup object
        for link in soup.findAll('a', {'class': 'post-title'}):  # find links (a) with the class 'post-title'
            href = 'https://www.thenewboston.com' + link.get('href')
            title = link.string                              #takes the string value
            if title.startswith('[source code] Python Programming Tutorial - ' + str(ttnr)):
                print(title)
                print(href)
                done = True
                break
        if done:
            code_getter(href,dest_url)
            break
        page +=1

def code_getter(code_url, rl):
    codeExtract = requests.get(code_url)
    plainText = codeExtract.text
    soup = BeautifulSoup(plainText, "html.parser")
    with open(rl, "w", encoding="utf-8") as fx:  # explicit encoding avoids Windows codec errors
        # iterate over the children of the <code> tag and undo the forum's HTML formatting
        for results in soup.find('code'):
            resultLine = str(results).replace('<br>', '\n').replace('\ufffd', '  ').replace('&lt;', '<').replace('</br>', '')
            fx.write(resultLine + "\n")

code_spider(20,25,'web_crawler')
0 Alexander Mentyu · November 4, 2014
import requests
from bs4 import BeautifulSoup

def code_crawl(topic_id):  # renamed from 'id' to avoid shadowing the built-in
    url = 'https://buckysroom.org/forum/topic.php?id=' + str(topic_id)
    raw_html = requests.get(url)
    plain_text = raw_html.text
    soup = BeautifulSoup(plain_text, "html.parser")

    # iterate over the children of the first <code> tag on the page
    for code_line in soup.find('code'):
        result_line = str(code_line).replace('<br>', '\n').replace('</br>', '').replace('\ufffd', ' ')
        print(result_line)

code_crawl(1610)
0 Halcyon Abraham Ramirez · June 3, 2015

from selenium import webdriver


class Bucky:
    def __init__(self):
        self.driver = webdriver.Firefox()
        self.driver.get("https://www.thenewboston.com/forum/topic.php?id=1610")
        self.parse()

    def parse(self):
        code = self.driver.find_element_by_xpath("//code[@class=' hljs bash']").text
        print(code)

a = Bucky()

This script could be made even shorter, but I used a class just for practice.

This outputs everything in the source code Bucky gave. No more picking out the <br> tags, etc.; it outputs it exactly as you see it.




from selenium import webdriver

def parse():
    driver = webdriver.Firefox()
    driver.get("https://www.thenewboston.com/forum/topic.php?id=1610")
    code = driver.find_element_by_xpath("//code[@class=' hljs bash']").text
    print(code)

parse()

This is the short version without the class, using only a function.



from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.thenewboston.com/forum/topic.php?id=1610")
code = driver.find_element_by_xpath("//code[@class=' hljs bash']").text
print(code)



An even shorter version without using functions... OK, now I'm just showing off XD.
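
An aside for readers on newer Selenium releases (4 and up): the find_element_by_* helpers were removed, so the lookup above would be written with a By locator instead. A minimal sketch, reusing the XPath from the posts above:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
driver.get("https://www.thenewboston.com/forum/topic.php?id=1610")
code = driver.find_element(By.XPATH, "//code[@class=' hljs bash']").text
print(code)
driver.quit()  # close the browser when done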
0 Kaim Syed · April 11, 2016
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
 File "C:/Users/NAEEM/PycharmProjects/main.py/main.py", line 36, in <module>
   trade_spider(1)
 File "C:/Users/NAEEM/PycharmProjects/main.py/main.py", line 9, in trade_spider
   source_code = requests.get(url)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\api.py", line 67, in get
   return request('get', url, params=params, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\api.py", line 53, in request
   return session.request(method=method, url=url, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\sessions.py", line 468, in request
   resp = self.send(prep, **send_kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\sessions.py", line 576, in send
   r = adapter.send(request, **kwargs)
 File "C:\Users\NAEEM\AppData\Roaming\Python\Python35\site-packages\requests\adapters.py", line 437, in send
   raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='buckysroom.org', port=443): Max retries exceeded with url: /trade/search.php?page=1 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x0317BC10>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
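
The error itself just means the host could not be reached: buckysroom.org no longer responds (the site moved to thenewboston.com, which the newer replies here use). If you want the crawler to fail gracefully instead of dying with a traceback, one sketch:

import requests

url = "https://buckysroom.org/trade/search.php?page=1"  # the URL from the traceback
try:
    source_code = requests.get(url, timeout=10)  # a timeout keeps a dead host from hanging the crawler forever
except requests.exceptions.ConnectionError as err:
    print("could not reach the host:", err)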
0 Chandan Chaudhary · May 9, 2015
import requests
from bs4 import BeautifulSoup

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://www.thenewboston.com/forum/search_forums.php?s=&orderby=popular&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'category-title'}):
            href = 'https://www.thenewboston.com/' + link.get('href')
            title = link.string
            print(href)
            print(title)
        page =+ 1

trade_spider(1)

# Why is this program not stopping? Help me.
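
The likely culprit is the last line of the loop: page =+ 1 parses as page = +1, so page is reset to 1 on every pass and the while condition never becomes false. A minimal sketch of the intended loop:

page = 1
max_pages = 1
while page <= max_pages:
    # ... fetch and parse the page here ...
    page += 1  # '+=' increments; '=+' just assigns positive one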
0 Arthur lee · December 11, 2014
Hi bro, I have a question: how do I crawl the pictures down? Which function should I use?
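
A minimal sketch of one way to do it (not from the tutorial): find the <img> tags with BeautifulSoup, resolve each src against the page URL, and write the raw bytes to disk. The page URL and file names below are illustrative:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def save_images(page_url):
    soup = BeautifulSoup(requests.get(page_url).text, "html.parser")
    for n, img in enumerate(soup.findAll('img')):
        src = img.get('src')
        if not src:
            continue
        img_url = urljoin(page_url, src)          # resolve relative paths like /images/pic.jpg
        data = requests.get(img_url).content      # .content gives raw bytes; .text would corrupt an image
        with open('image%d.jpg' % n, 'wb') as f:  # naive naming; real images are not all .jpg
            f.write(data)

save_images('https://www.thenewboston.com/')  # hypothetical page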
0 Balaskandan Giri · November 4, 2015
HELP!!!

The crawler deadlocks when I try to call it recursively!
Any solution?
0 Halcyon Abraham Ramirez · November 5, 2015
@Balaskandan Giri

Without a code snippet or a stack trace, no one can help.
0 nmelssx nmelssx · September 25, 2015
I have web crawler code in PHP, but it just gets the words, without the CSS style of the website. Is that normal or not?
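
That is normal: a crawler fetches only the HTML document itself, and the styling lives in separate stylesheet files referenced by <link rel="stylesheet"> tags, each of which is its own download. A sketch in Python (the thread's language, though the question was about PHP) that lists those stylesheet URLs for a hypothetical page:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

page_url = 'https://www.thenewboston.com/'  # hypothetical page
soup = BeautifulSoup(requests.get(page_url).text, "html.parser")
for link in soup.findAll('link', rel='stylesheet'):
    print(urljoin(page_url, link.get('href')))  # each stylesheet needs its own request to fetch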
