from __future__ import division
from collections import Counter
import math, random, re, csv, json
from bs4 import BeautifulSoup
import requests
######
#
# BOOKS ABOUT DATA
#
######
def is_video(td):
"""it's a video if it has exactly one pricelabel, and if
the stripped text inside that pricelabel starts with 'Video'"""
pricelabels = td('span', 'pricelabel')
return (len(pricelabels) == 1 and
            pricelabels[0].text.strip().startswith("Video"))
def book_info(td):
"""given a BeautifulSoup <td> Tag representing a book,
extract the book's details and return a dict"""
    title = td.find("div", "thumbheader").a.text
    by_author = td.find('div', 'AuthorName').text
    authors = [x.strip() for x in re.sub("^By ", "", by_author).split(",")]
    isbn_link = td.find("div", "thumbheader").a.get("href")
    isbn = re.match("/product/(.*)\.do", isbn_link).groups()[0]
    date = td.find("span", "directorydate").text.strip()
return {
"title" : title,
"authors" : authors,
"isbn" : isbn,
"date" : date
}
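# Not in the original listing: a quick way to sanity-check book_info without
# hitting the network. The made-up <td> below just mirrors the class names the
# selectors above look for; the real O'Reilly markup is messier.
def demo_book_info():
    sample_html = """<td class="thumbtext">
        <div class="thumbheader"><a href="/product/123.do">Data Book</a></div>
        <div class="AuthorName">By Jane Doe, John Roe</div>
        <span class="directorydate">November 2014</span>
    </td>"""
    # html.parser keeps the bare <td> as-is (html5lib would insist on building
    # a full document around it)
    sample_td = BeautifulSoup(sample_html, 'html.parser')('td', 'thumbtext')[0]
    # should print title 'Data Book', authors ['Jane Doe', 'John Roe'],
    # isbn '123', date 'November 2014'
    print book_info(sample_td)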
from time import sleep
def scrape(num_pages=31):
base_url = "[Link] + \
"[Link]?sortby=publicationDate&page="
books = []
for page_num in range(1, num_pages + 1):
print "souping page", page_num
url = base_url + str(page_num)
        soup = BeautifulSoup(requests.get(url).text, 'html5lib')
for td in soup('td', 'thumbtext'):
if not is_video(td):
                books.append(book_info(td))
        # now be a good citizen and respect the robots.txt!
sleep(30)
return books
def get_year(book):
"""book["date"] looks like 'November 2014' so we need to
split on the space and then take the second piece"""
return int(book["date"].split()[1])
def plot_years(plt, books):
# 2014 is the last complete year of data (when I ran this)
year_counts = Counter(get_year(book) for book in books
if get_year(book) <= 2014)
years = sorted(year_counts)
    book_counts = [year_counts[year] for year in years]
    plt.bar([x - 0.5 for x in years], book_counts)
    plt.xlabel("year")
    plt.ylabel("# of data books")
    plt.title("Data is Big!")
    plt.show()
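# Not in the original listing: a small driver sketch showing how scrape and
# plot_years fit together. With the 30-second sleep per page, a full run of
# scrape() takes roughly 15 minutes, so it is defined but never called here.
def plot_data_book_years():
    import matplotlib.pyplot as plt   # imported lazily so matplotlib isn't a hard requirement
    books = scrape()
    plot_years(plt, books)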
##
#
# APIs
#
##
endpoint = "[Link]
repos = [Link]([Link](endpoint).text)
from [Link] import parse
dates = [parse(repo["created_at"]) for repo in repos]
month_counts = Counter(date.month for date in dates)
weekday_counts = Counter(date.weekday() for date in dates)
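# for reference, created_at comes back from GitHub as an ISO 8601 string such
# as "2013-07-05T02:02:28Z"; dateutil's parse turns it into a timezone-aware
# datetime, which is why .month and .weekday() work above, e.g.
#   parse("2013-07-05T02:02:28Z").month      # 7
#   parse("2013-07-05T02:02:28Z").weekday()  # 4, i.e. Friday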
####
#
# Twitter
#
####
from twython import Twython
# fill these in if you want to use the code
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_TOKEN = ""
ACCESS_TOKEN_SECRET = ""
def call_twitter_search_api():
twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET)
# search for tweets containing the phrase "data science"
    for status in twitter.search(q='"data science"')["statuses"]:
user = status["user"]["screen_name"].encode('utf-8')
text = status["text"].encode('utf-8')
print user, ":", text
print
from twython import TwythonStreamer
# appending data to a global variable is pretty poor form
# but it makes the example much simpler
tweets = []
class MyStreamer(TwythonStreamer):
"""our own subclass of TwythonStreamer that specifies
how to interact with the stream"""
def on_success(self, data):
"""what do we do when twitter sends us data?
here data will be a Python object representing a tweet"""
# only want to collect English-language tweets
if data['lang'] == 'en':
            tweets.append(data)
# stop when we've collected enough
if len(tweets) >= 1000:
                self.disconnect()
def on_error(self, status_code, data):
print status_code, data
        self.disconnect()
def call_twitter_streaming_api():
stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET,
ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# starts consuming public statuses that contain the keyword 'data'
    stream.statuses.filter(track='data')
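# Not in the original listing: once the streamer has filled `tweets`, a natural
# follow-up (run by hand after call_twitter_streaming_api) is counting the most
# common hashtags in the collected tweets.
def top_hashtags(n=5):
    return Counter(hashtag['text'].lower()
                   for tweet in tweets
                   for hashtag in tweet["entities"]["hashtags"]).most_common(n)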
if __name__ == "__main__":
def process(date, symbol, price):
print date, symbol, price
print "tab delimited stock prices:"
with open('tab_delimited_stock_prices.txt', 'rb') as f:
        reader = csv.reader(f, delimiter='\t')
for row in reader:
date = row[0]
symbol = row[1]
closing_price = float(row[2])
process(date, symbol, closing_price)
print
print "colon delimited stock prices:"
with open('colon_delimited_stock_prices.txt', 'rb') as f:
        reader = csv.DictReader(f, delimiter=':')
for row in reader:
date = row["date"]
symbol = row["symbol"]
closing_price = float(row["closing_price"])
process(date, symbol, closing_price)
print
print "writing out comma_delimited_stock_prices.txt"
today_prices = { 'AAPL' : 90.91, 'MSFT' : 41.68, 'FB' : 64.5 }
with open('comma_delimited_stock_prices.txt','wb') as f:
        writer = csv.writer(f, delimiter=',')
for stock, price in today_prices.items():
            writer.writerow([stock, price])
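    # csv.writer also takes care of quoting: a field that contains the delimiter,
    # e.g. writer.writerow(["test", "1,5"]), comes out as  test,"1,5"  rather
    # than silently producing a three-column row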
print "BeautifulSoup"
html = [Link]("[Link]
soup = BeautifulSoup(html)
print soup
print
print "parsing json"
serialized = """{ "title" : "Data Science Book",
"author" : "Joel Grus",
"publicationYear" : 2014,
"topics" : [ "data", "science", "data science"] }"""
# parse the JSON to create a Python object
    deserialized = json.loads(serialized)
if "data science" in deserialized["topics"]:
print deserialized
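    # not in the original: going the other direction, json.dumps serializes a
    # Python object back into a JSON string
    print json.dumps(deserialized)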
print
print "GitHub API"
print "dates", dates
print "month_counts", month_counts
print "weekday_count", weekday_counts
last_5_repositories = sorted(repos,
key=lambda r: r["created_at"],
reverse=True)[:5]
print "last five languages", [repo["language"]
for repo in last_5_repositories]