web scraper python
# basic web scraping with python
# Import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
# Set the URL you want to webscrape from
url = 'http://web.mta.info/developers/turnstile.html'
# Connect to the URL
response = requests.get(url)
# Parse HTML and save to BeautifulSoup object¶
soup = BeautifulSoup(response.text, "html.parser")
# To download the whole data set, let's do a for loop through all a tags
line_count = 1 #variable to track what line you are on
for one_a_tag in soup.findAll('a'): #'a' tags are for links
if line_count >= 36: #code for text files starts at line 36
link = one_a_tag['href']
download_url = 'http://web.mta.info/developers/'+ link
urllib.request.urlretrieve(download_url,'./'+link[link.find('/turnstile_')+1:])
time.sleep(1) #pause the code for a sec
#add 1 for next line
line_count +=1
import scrapy
from ..items import SampletestItem #items class
class QuoteTestSpider(scrapy.Spider):
name = 'quote_test'
start_urls = ['https://quotes.toscrape.com/']
def parse(self, response):
items = SampletestItem() #items class
quotes = response.css("div.quote")
for quote in quotes:
items['title'] = quote.css("span.text::text").get()
items['author'] = quote.css(".author::text").get()
items['tags'] = quote.css(".tags .tag::text").getall()
yield items
next_page = response.css(".next a::attr(href)").get()
if next_page is not None:
next_url = response.urljoin(next_page)
yield scrapy.Request(next_url, callback=self.parse)
# example of web scraping links using asyncio and using all cores
import asyncio, requests, aiohttp, os
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from bs4 import BeautifulSoup as BS
executor = ThreadPoolExecutor(max_workers=8)
loop = asyncio.get_event_loop()
async def make_requests():
urls = ['http://www.filedropper.com/lister.php?id=0', 'http://www.filedropper.com/lister.php?id=1', 'http://www.filedropper.com/lister.php?id=2', 'http://www.filedropper.com/lister.php?id=3', 'http://www.filedropper.com/lister.php?id=4', 'http://www.filedropper.com/lister.php?id=5', 'http://www.filedropper.com/lister.php?id=6', 'http://www.filedropper.com/lister.php?id=7', 'http://www.filedropper.com/lister.php?id=8', 'http://www.filedropper.com/lister.php?id=9', 'http://www.filedropper.com/lister.php?id=a', 'http://www.filedropper.com/lister.php?id=b', 'http://www.filedropper.com/lister.php?id=c', 'http://www.filedropper.com/lister.php?id=d', 'http://www.filedropper.com/lister.php?id=e', 'http://www.filedropper.com/lister.php?id=f', 'http://www.filedropper.com/lister.php?id=g', 'http://www.filedropper.com/lister.php?id=h', 'http://www.filedropper.com/lister.php?id=i', 'http://www.filedropper.com/lister.php?id=j', 'http://www.filedropper.com/lister.php?id=k', 'http://www.filedropper.com/lister.php?id=l', 'http://www.filedropper.com/lister.php?id=m', 'http://www.filedropper.com/lister.php?id=n', 'http://www.filedropper.com/lister.php?id=o', 'http://www.filedropper.com/lister.php?id=p', 'http://www.filedropper.com/lister.php?id=q', 'http://www.filedropper.com/lister.php?id=r', 'http://www.filedropper.com/lister.php?id=s', 'http://www.filedropper.com/lister.php?id=t', 'http://www.filedropper.com/lister.php?id=u', 'http://www.filedropper.com/lister.php?id=v', 'http://www.filedropper.com/lister.php?id=w', 'http://www.filedropper.com/lister.php?id=x', 'http://www.filedropper.com/lister.php?id=y', 'http://www.filedropper.com/lister.php?id=z']
futures = [loop.run_in_executor(executor, requests.get, url) for url in urls]
await asyncio.wait(futures)
for future in futures:
soup = BS(future.result().content)
for all_links in soup.find_all('a', href=True):
print("URL:", all_links['href'])
with open('filedropper_com.txt', 'a') as f:
f.write(all_links['href'] + '\n')
loop.run_until_complete(make_requests())
import scrapy
class BlogSpider(scrapy.Spider):
name = 'blogspider'
start_urls = ['https://blog.scrapinghub.com']
def parse(self, response):
for title in response.css('.post-header>h2'):
yield {'title': title.css('a ::text').get()}
for next_page in response.css('a.next-posts-link'):
yield response.follow(next_page, self.parse)
>>> raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for i, li in enumerate(html.select('li')):
print(i, li.text)
0 Isaac Newton
Archimedes
Carl F. Gauss
Leonhard Euler
Bernhard Riemann
1 Archimedes
Carl F. Gauss
Leonhard Euler
Bernhard Riemann
2 Carl F. Gauss
Leonhard Euler
Bernhard Riemann
3 Leonhard Euler
Bernhard Riemann
4 Bernhard Riemann
# 5 ... and many more...
def get_names():
"""
Downloads the page where the list of mathematicians is found
and returns a list of strings, one per mathematician
"""
url = 'http://www.fabpedigree.com/james/mathmen.htm'
response = simple_get(url)
if response is not None:
html = BeautifulSoup(response, 'html.parser')
names = set()
for li in html.select('li'):
for name in li.text.split('\n'):
if len(name) > 0:
names.add(name.strip())
return list(names)
# Raise an exception if we failed to get any data from the url
raise Exception('Error retrieving contents at {}'.format(url))