from __future__ import division
from collections import Counter
import math, random, re, csv, json
from bs4 import BeautifulSoup
import requests
######
#
# BOOKS ABOUT DATA
#
######
def is_video(td):
"""it's a video if it has exactly one pricelabel, and if
the stripped text inside that pricelabel starts with 'Video'"""
pricelabels = td('span', 'pricelabel')
return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith("Video"))
def book_info(td):
"""given a BeautifulSoup <td> Tag representing a book,
extract the book's details and return a dict"""
    title = td.find("div", "thumbheader").a.text
    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
    isbn_link = td.find("div", "thumbheader").a.get("href")
    isbn = re.match("/product/(.*)\.do", isbn_link).groups()[0]
    date = td.find("span", "directorydate").text.strip()
return {
"title" : title,
"authors" : authors,
"isbn" : isbn,
"date" : date
}
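# Not in the original listing: a quick way to sanity-check book_info without
# hitting the network. The made-up <td> below just mirrors the class names the
# selectors above look for; the real O'Reilly markup is messier.
def demo_book_info():
    sample_html = """<td class="thumbtext">
        <div class="thumbheader"><a href="/product/123.do">Data Book</a></div>
        <div class="AuthorName">By Jane Doe, John Roe</div>
        <span class="directorydate">November 2014</span>
    </td>"""
    # html.parser keeps the bare <td> as-is (html5lib would insist on building
    # a full document around it)
    sample_td = BeautifulSoup(sample_html, 'html.parser')('td', 'thumbtext')[0]
    # should print title 'Data Book', authors ['Jane Doe', 'John Roe'],
    # isbn '123', date 'November 2014'
    print book_info(sample_td)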
from time import sleep
def scrape(num_pages=31):
base_url = "[Link] + \
"[Link]?sortby=publicationDate&page="
books = []
for page_num in range(1, num_pages + 1):
print "souping page", page_num
url = base_url + str(page_num)
        soup = BeautifulSoup(requests.get(url).text, 'html5lib')
for td in soup('td', 'thumbtext'):
if not is_video(td):
                books.append(book_info(td))
        # now be a good citizen and respect the robots.txt!
sleep(30)
return books
def get_year(book):
"""book["date"] looks like 'November 2014' so we need to
split on the space and then take the second piece"""
return int(book["date"].split()[1])
def plot_years(plt, books):
# 2014 is the last complete year of data (when I ran this)
year_counts = Counter(get_year(book) for book in books
if get_year(book) <= 2014)
years = sorted(year_counts)
    book_counts = [year_counts[year] for year in years]
    plt.bar([x - 0.5 for x in years], book_counts)
    plt.xlabel("year")
    plt.ylabel("# of data books")
    plt.title("Data is Big!")
    plt.show()
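# Not in the original listing: a small driver sketch showing how scrape and
# plot_years fit together. With the 30-second sleep per page, a full run of
# scrape() takes roughly 15 minutes, so it is defined but never called here.
def plot_data_book_years():
    import matplotlib.pyplot as plt   # imported lazily so matplotlib isn't a hard requirement
    books = scrape()
    plot_years(plt, books)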
##
#
# APIs
#
##
endpoint = "[Link]
repos = [Link]([Link](endpoint).text)
from [Link] import parse
dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)
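# for reference, created_at comes back from GitHub as an ISO 8601 string such
# as "2013-07-05T02:02:28Z"; dateutil's parse turns it into a timezone-aware
# datetime, which is why .month and .weekday() work above, e.g.
#   parse("2013-07-05T02:02:28Z").month      # 7
#   parse("2013-07-05T02:02:28Z").weekday()  # 4, i.e. Friday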
####
#
# Twitter
#
####
from twython import Twython
# fill these in if you want to use the code
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""
def call_twitter_search_api():
twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)
# search for tweets containing the phrase "data science"
    for status in twitter.search(q='"data science"')["statuses"]:
user = status["user"]["screen_name"].encode('utf-8')
text = status["text"].encode('utf-8')
print user, ":", text
print
from twython import TwythonStreamer
# appending data to a global variable is pretty poor form
# but it makes the example much simpler
tweets = []
class MyStreamer(TwythonStreamer):
"""our own subclass of TwythonStreamer that specifies
how to interact with the stream"""
def on_success(self, data):
"""what do we do when twitter sends us data?
here data will be a Python object representing a tweet"""
# only want to collect English-language tweets
if data['lang'] == 'en':
            tweets.append(data)
# stop when we've collected enough
if len(tweets) >= 1000:
                self.disconnect()
def on_error(self, status_code, data):
print status_code, data
        self.disconnect()
def call_twitter_streaming_api():
stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET,
ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# starts consuming public statuses that contain the keyword 'data'
    stream.statuses.filter(track='data')
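# Not in the original listing: once the streamer has filled `tweets`, a natural
# follow-up (run by hand after call_twitter_streaming_api) is counting the most
# common hashtags in the collected tweets.
def top_hashtags(n=5):
    return Counter(hashtag['text'].lower()
                   for tweet in tweets
                   for hashtag in tweet["entities"]["hashtags"]).most_common(n)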
if __name__ == "__main__":
def process(date, symbol, price):
print date, symbol, price
print "tab delimited stock prices:"
with open('tab_delimited_stock_prices.txt', 'rb') as f:
        reader = csv.reader(f, delimiter='\t')
for row in reader:
date = row[0]
symbol = row[1]
closing_price = float(row[2])
process(date, symbol, closing_price)
print
print "colon delimited stock prices:"
with open('colon_delimited_stock_prices.txt', 'rb') as f:
        reader = csv.DictReader(f, delimiter=':')
for row in reader:
date = row["date"]
symbol = row["symbol"]
closing_price = float(row["closing_price"])
process(date, symbol, closing_price)
print
print "writing out comma_delimited_stock_prices.txt"
today_prices = { 'AAPL' : 90.91, 'MSFT' : 41.68, 'FB' : 64.5 }
with open('comma_delimited_stock_prices.txt','wb') as f:
        writer = csv.writer(f, delimiter=',')
for stock, price in today_prices.items():
            writer.writerow([stock, price])
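    # csv.writer also takes care of quoting: a field that contains the delimiter,
    # e.g. writer.writerow(["test", "1,5"]), comes out as  test,"1,5"  rather
    # than silently producing a three-column row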
print "BeautifulSoup"
html = [Link]("[Link]
soup = BeautifulSoup(html)
print soup
print
print "parsing json"
serialized = """{ "title" : "Data Science Book",
"author" : "Joel Grus",
"publicationYear" : 2014,
"topics" : [ "data", "science", "data science"] }"""
# parse the JSON to create a Python object
    deserialized = json.loads(serialized)
if "data science" in deserialized["topics"]:
print deserialized
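    # not in the original: going the other direction, json.dumps serializes a
    # Python object back into a JSON string
    print json.dumps(deserialized)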
print
print "GitHub API"
print "dates", dates
print "month_counts", month_counts
print "weekday_count", weekday_counts
last_5_repositories = sorted(repos,
key=lambda r: r["created_at"],
reverse=True)[:5]
print "last five languages", [repo["language"]
for repo in last_5_repositories]