跟着B站的教程学习爬虫,自己动手写了一个简单的豆瓣电影排行榜爬虫,结果还可以存到MongoDB里面;不过格式化输出的时候,各列还是有点对不齐。
网址是这么来的
#-*-coding:utf-8-*-
import re
# import pymongo
import requests
# from config import *
# client = pymongo.MongoClient(MONGO_URL) #声明mongodb对象
# db = client[MONGO_DB] #数据库对象
# Browser-like User-Agent string; it is the only header this script sets.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/79.0.3945.130 Safari/537.36'
)

# Headers passed to every requests.get call in this script.
head = {'User-Agent': _USER_AGENT}
def getHtmlpage(base_url, num, timeout=10):
    """Yield the text of each successfully fetched page.

    Args:
        base_url: URL template containing one ``{}`` placeholder; it is
            filled with the start offset of each page (``page_index * 20``).
        num: Number of pages to request.
        timeout: Seconds to wait for the server before a page is given up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Yields:
        The response body, decoded as UTF-8, for every page answered with
        HTTP 200.

    A page that fails (connection error, timeout, HTTP error status) is
    reported on stdout and skipped; the remaining pages are still requested.
    """
    for i in range(num):
        url = base_url.format(i * 20)
        try:
            r = requests.get(url, headers=head, timeout=timeout)
            r.raise_for_status()
        except requests.RequestException as e:
            print('失败', e)
            continue

        r.encoding = 'utf-8'
        # raise_for_status only rejects 4xx/5xx; other non-200 answers are
        # skipped silently, as before.
        if r.status_code == 200:
            print(i, '成功')
            yield r.text
def parserPage(html, infoDict):
    """Extract movie records from one page and append them to ``infoDict['movie']``.

    Args:
        html: Text of one fetched page.
        infoDict: Dict whose ``'movie'`` list receives one dict per movie
            with the keys ``'排名'``, ``'电影'``, ``'上映日期'`` and ``'得分'``.
            All four values are stored as strings.

    When ``html`` is a JSON array, each object carrying ``rank``, ``title``,
    ``release_date`` and ``score`` becomes one record. Decoding the JSON
    (instead of pattern matching) keeps titles containing escaped quotes
    intact and stops a record with a missing field from swallowing the next
    one. Text that is not a JSON array is still scanned with the original
    regular expression.

    Any failure is reported on stdout and leaves already appended records
    in place; the function never raises.
    """
    import json  # function-scope import keeps this block self-contained

    fields = ('rank', 'title', 'release_date', 'score')
    try:
        try:
            payload = json.loads(html)
        except ValueError:
            payload = None  # not JSON: fall back to pattern matching below

        if isinstance(payload, list):
            rows = [
                tuple(item[key] for key in fields)
                for item in payload
                if isinstance(item, dict) and all(key in item for key in fields)
            ]
        else:
            pattern = re.compile(
                r'"rank":(\d+),.*?title":"(.*?)".*?_date":"(.*?)".*?score":"(.*?)"',
                re.S,
            )
            rows = pattern.findall(html)

        for rank, title, release_date, score in rows:
            infoDict['movie'].append({
                '排名': str(rank),
                '电影': str(title),
                '上映日期': str(release_date),
                '得分': str(score),
            })
    except Exception as e:  # best-effort: report and carry on with other pages
        print('处理失败', e)
def printList(infoDict):
    """Write the collected movies to ``豆瓣.txt`` as a tab-separated table.

    Args:
        infoDict: Dict whose ``'movie'`` list holds one dict per movie with
            the keys ``'排名'``, ``'电影'``, ``'上映日期'`` and ``'得分'``.

    The file is created (or overwritten) in the current working directory.
    A header row is written first, then one row per movie; every cell is
    centred and padded with the full-width space U+3000.
    """
    filler = chr(12288)  # full-width space used as the fill character
    row_format = '{0:{4}^5}\t{1:{4}^20}\t{2:{4}^20}\t{3:{4}^5}\n'
    columns = ('排名', '电影', '上映日期', '得分')

    def render(cells):
        # The fill character is passed as the fifth format argument.
        return row_format.format(*cells, filler)

    with open('豆瓣.txt', 'w', encoding='utf-8') as out:
        out.write(render(columns))
        out.writelines(
            render((movie['排名'], movie['电影'], movie['上映日期'], movie['得分']))
            for movie in infoDict['movie']
        )
    print('打印成功')
# def saveToMongo(data):
# if db[MONGO_TABLE].insert(data):
# print('存储到MongoDB成功')
# else:
# print('失败')
def main():
    """Fetch the chart pages, parse them and write the text report."""
    # `start={}` is the record offset that getHtmlpage fills in per page.
    # NOTE(review): the host prefix looks like a proxy/mirror wrapper around
    # movie.douban.com - confirm this is the intended endpoint.
    url_template = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start={}&limit=20'
    page_count = 1  # number of pages to crawl
    collected = {'movie': []}

    for page_text in getHtmlpage(url_template, page_count):
        parserPage(page_text, collected)

    # MongoDB persistence (saveToMongo) is currently disabled; only the text
    # report is produced.
    printList(collected)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
存储到数据库时使用的配置文件 config.py:
# MongoDB settings read by the scraper's (currently disabled) storage step.
MONGO_URL = 'localhost'  # connection address
MONGO_DB = 'douban'  # database name
MONGO_TABLE = 'douban'  # table (collection) name