Q:

python web crawler

# basic web scraping with python
# Import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

# Set the URL you want to webscrape from
url = 'http://web.mta.info/developers/turnstile.html'

# Connect to the URL
response = requests.get(url)

# Parse HTML and save to BeautifulSoup object¶
soup = BeautifulSoup(response.text, "html.parser")

# To download the whole data set, let's do a for loop through all a tags
line_count = 1 #variable to track what line you are on
for one_a_tag in soup.findAll('a'):  #'a' tags are for links
    if line_count >= 36: #code for text files starts at line 36
        link = one_a_tag['href']
        download_url = 'http://web.mta.info/developers/'+ link
        urllib.request.urlretrieve(download_url,'./'+link[link.find('/turnstile_')+1:]) 
        time.sleep(1) #pause the code for a sec
    #add 1 for next line
    line_count +=1
1
import scrapy
from ..items import SampletestItem #items class

class QuoteTestSpider(scrapy.Spider):
    name = 'quote_test'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        items = SampletestItem() #items class
        quotes = response.css("div.quote")
        for quote in quotes:
            items['title'] = quote.css("span.text::text").get()
            items['author'] = quote.css(".author::text").get()
            items['tags'] = quote.css(".tags .tag::text").getall()
            
            yield items
            next_page = response.css(".next a::attr(href)").get()
            if next_page is not None:
                next_url = response.urljoin(next_page)
                yield scrapy.Request(next_url, callback=self.parse)
1
import requests
from bs4 import BeautifulSoup

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)

soup = BeautifulSoup(page.content, 'html.parser')
9
import scrapy

class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = ['https://blog.scrapinghub.com']

    def parse(self, response):
        for title in response.css('.post-header>h2'):
            yield {'title': title.css('a ::text').get()}

        for next_page in response.css('a.next-posts-link'):
            yield response.follow(next_page, self.parse)
1
>>> raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for i, li in enumerate(html.select('li')):
        print(i, li.text)

0  Isaac Newton
 Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

1  Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

2  Carl F. Gauss
 Leonhard Euler 
 Bernhard Riemann

 3  Leonhard Euler
 Bernhard Riemann

4  Bernhard Riemann

# 5 ... and many more...
0
def get_names():
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for li in html.select('li'):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))
0
>>> from bs4 import BeautifulSoup
>>> raw_html = open('contrived.html').read()
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for p in html.select('p'):
...     if p['id'] == 'walrus':
...         print(p.text)

'I am the walrus'
0
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
-1
def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number
    of hits that mathematician's Wikipedia page received in the 
    last 60 days, as an `int`
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        hit_link = [a for a in html.select('a')
                    if a['href'].find('latest-60') > -1]

        if len(hit_link) > 0:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except:
                log_error("couldn't parse {} as an `int`".format(link_text))

    log_error('No pageviews found for {}'.format(name))
    return None
0

New to Communities?

Join the community