[source code] Python Programming Tutorial - 25 - How to Make a Web Crawler

+26 Bucky Roberts · September 3, 2014

import requests
from bs4 import BeautifulSoup


def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckysroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # just get the page source, no headers or anything
        plain_text = source_code.text
        # BeautifulSoup objects can be searched through easily
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "https://buckysroom.org" + link.get('href')
            title = link.string  # just the text, not the HTML
            print(href)
            print(title)
            # get_single_item_data(href)
        page += 1


def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # if you want to gather information from that page
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)
    # if you want to gather links for a web crawler
    for link in soup.findAll('a'):
        href = link.get('href')
        if href:  # some anchors have no href attribute
            print("https://buckysroom.org" + href)


trade_spider(1)

Replies

+3 Russell Allen · October 11, 2014
I'm getting an error when I try to import requests.  Am I missing a package?  
+2 DG Wright · December 8, 2014

"I'm getting an error when I try to import requests. Am I missing a package?"

Apparently so; from your Project Interpreter in Settings, add the package 'requests' (it's described as "Python HTTP for Humans"). Then the code should work 'as advertised' ;)
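Outside PyCharm, the same packages can be installed from a terminal with pip (assuming Python and pip are already on your PATH):

pip install requests beautifulsoup4

After that, both import lines at the top of Bucky's script should resolve.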
+1 Amine ElQaraoui · September 19, 2014
import requests
from bs4 import BeautifulSoup

def code_spider(id):
    url = 'https://buckysroom.org/forum/topic.php?id=' + str(id)
    source = requests.get(url)
    text = source.text
    soup = BeautifulSoup(text, "html.parser")
    for code in soup.findAll('code'):
        line = str(code).replace('<br>', '\n')
        soup2 = BeautifulSoup(line, "html.parser")
        script = soup2.get_text()
        script = script.replace('?', ' ')
        print(script)

code_spider(1610)
+1 David Refaeli · May 3, 2016
So this is if you want all the source-code :-) 


import requests
from bs4 import BeautifulSoup

def search_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'https://thenewboston.com/forum/category.php?id=15&orderby=recent&page=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")
        for link in soup.findAll('a', {'class': 'title text-semibold'}):
            str1 = link.string
            str2 = '[source code]'
            if str1 and str2 in str1:  # link.string can be None for nested tags
                href = link.get('href')
                print(href)
                get_source(href)
        page += 1

def get_source(url):
    source = requests.get(url)
    text = source.text
    soup = BeautifulSoup(text, "html.parser")
    code = soup.find('code')  # just the first <code> block on the page
    if code is not None:
        line = str(code).replace('<br>', '\n')
        soup2 = BeautifulSoup(line, "html.parser")
        script = soup2.get_text()
        script = script.replace('?', ' ')
        script = script.replace('Â', ' ')
        print(script)


search_spider(22)
# so far there are 22 pages on the forum
+1 Anton Vasilev · September 4, 2014
How to make it crawl itself and get stuck in an infinite loop


import requests
from bs4 import BeautifulSoup


def buckysChallangeGetCode(ID):
    url = 'https://buckysroom.org/forum/topic.php?id=' + str(ID)
    codeExtract = requests.get(url)
    plainText = codeExtract.text
    soup = BeautifulSoup(plainText, "html.parser")
    # changed findAll to find - if another code block is submitted it won't be printed
    # (assuming Bucky's code will come first)
    results = soup.find('code')
    if results is not None:
        resultLine = str(results).replace('<br>', '\n')  # replacing the html breaks with new lines
        soupClear = BeautifulSoup(resultLine, "html.parser")
        resultsClear = soupClear.get_text()  # couldn't get this working with the .string soup method
        print(resultsClear)


buckysChallangeGetCode(1610)  # the parameter is the ID of the forum topic so it can be used for other threads as well


Any ideas on how to replace the "?" characters that stand in for the indentation?
+1 Chris Nelson · January 4, 2015
<br> is a line break in HTML.
I believe XHTML writes it as the self-closing <br />, and HTML5 accepts both forms.

I could be wrong, it has been so long since writing any HTML.

I haven't gone into the web crawler tutorial yet, but if those tags show up in Bucky's Python videos, it's because the pages being crawled contain raw HTML, and you have to deal with those tags from Python.

But please, someone with more experience, feel free to correct me!
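Since crawled pages may contain either form, a more robust version of the <br>-to-newline replacement used in the posts above handles both with a regular expression (a minimal sketch, not from the original posts):

import re

def br_to_newline(html):
    # matches <br>, <br/>, and <br /> case-insensitively
    return re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)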
+1 Patrick Lehmann · September 4, 2014
I had to experiment a little bit to figure out the issue with the ? symbol.

I tried to save the output to a file and PyCharm threw an encoding exception [if they're not called Exceptions in Python, I'm sorry - I'm originally a Java guy :P]

So I managed to replace them. I used your code, Anton, and shortened it a bit, as the last soup-object call was redundant. Now it prints out nicely to the console.

I added a few more comments to explain what I changed, so it kinda looks longer now :P

import requests
from bs4 import BeautifulSoup


def buckysChallangeGetCode(ID):
    url = 'https://buckysroom.org/forum/topic.php?id=' + str(ID)
    codeExtract = requests.get(url)
    plainText = codeExtract.text
    soup = BeautifulSoup(plainText, "html.parser")
    # changed findAll to find - if another code block is submitted it won't be printed
    # [Patrick's Code Changes]
    # I removed the converting of the text into a Soup-Object as it was redundant
    # I also added a few .replace calls myself. \ufffd are the ?-symbols, identified
    # by the browser as tabs AFAIK; replaced them with 2 spaces so it looks nicer
    # replace &l_t_; [the _ is just so the forum displays it] with <
    # replaced the </br> tag that was at the end of the text with '' just so it
    # looks nicer and has no unnecessary line breaks at the end of the code
    results = soup.find('code')
    if results is not None:
        resultLine = str(results).replace('<br>', '\n').replace('\ufffd', '  ').replace('&l_t_;', '<').replace('</br>', '')
        print(resultLine)


buckysChallangeGetCode(1610)  # the parameter is the ID of the forum topic so it can be used for other threads as well
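The \ufffd characters usually mean requests guessed the wrong text encoding for the page. Rather than patching them out afterwards, you can often avoid them entirely by setting the encoding before reading .text (a minimal sketch, assuming the forum pages are really UTF-8; get_code_utf8 is a hypothetical helper name):

import requests
from bs4 import BeautifulSoup


def get_code_utf8(topic_id):  # hypothetical helper, not from the original posts
    response = requests.get('https://buckysroom.org/forum/topic.php?id=' + str(topic_id))
    response.encoding = 'utf-8'  # override requests' guessed encoding before decoding
    soup = BeautifulSoup(response.text, "html.parser")
    code = soup.find('code')
    if code is not None:
        print(str(code).replace('<br>', '\n'))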
+1 jadava Umesh · December 10, 2014
I'm sorry to say, Bucky, but I'm getting an error like this; please explain what to do.

Traceback (most recent call last):
  File "try.py", line 36, in <module>
    trade_spider(1)
  File "try.py", line 9, in trade_spider
    source_code = requests.get(url)
  File "/usr/lib/python2.7/dist-packages/requests/api.py", line 55, in get
    return request('get', url, **kwargs)
  File "/usr/lib/python2.7/dist-packages/requests/api.py", line 44, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 455, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 578, in send
    history = [resp for resp in gen] if allow_redirects else []
  File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 178, in resolve_redirects
    allow_redirects=False,
  File "/usr/lib/python2.7/dist-packages/requests/sessions.py", line 558, in send
    r = adapter.send(request, **kwargs)
  File "/usr/lib/python2.7/dist-packages/requests/adapters.py", line 385, in send
    raise SSLError(e)
requests.exceptions.SSLError: hostname 'www.thenewboston.com' doesn't match either of 'www.buckysroom.org', 'buckysroom.org'
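The last line is the real problem: buckysroom.org was redirecting to www.thenewboston.com, but the server still presented an SSL certificate issued for the old domain, so requests refused the connection. At the time, a temporary (and insecure) workaround was to disable certificate verification for that request; the proper fix was for the site to update its certificate:

source_code = requests.get(url, verify=False)  # insecure: skips SSL certificate checking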
+1 Nate Penner · October 4, 2014
Bucky! For some reason I cannot get anything from the trades page! Every page in the trade section says 'No data available', so I have to skip the web crawler tutorial :(
0 sfolje 0 · November 7, 2015
Maybe your URL doesn't exist - check it out.

Post your code.
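One quick way to check is to print the HTTP status code (and the final URL after any redirects) before parsing anything:

import requests

r = requests.get('https://buckysroom.org/trade/search.php?page=1')
print(r.status_code)  # 200 means the page exists, 404 means it doesn't
print(r.url)          # the final URL after any redirects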
