import requests as req
import bs4
from wordcloud import *
from jieba import *
from matplotlib.pyplot import imread
import os
import time
import re
# Image that is later handed to WordCloud as its ``mask`` argument.
mask = imread('chinamap.jpg')

# Headers sent with the comment-page requests.  The User-Agent value is the
# three fragments below joined with no separator between them.
h = {
    'User-Agent': ''.join((
        'Mozilla/5.0(Windows NT 10.0;Win64;x64)',
        'AppleWebKit/537.36(KHTML,like Gecko)',
        'Chrome/79.0.3945.130 Safari/537.36 OPR/66.0.3515.115',
    )),
}

# Values substituted for the ``start`` query parameter of the comments URL;
# kept as strings because they are concatenated straight into the URL.
page = [str(offset) for offset in range(0, 60, 20)]
# Fetch the short comments, add up the star ratings and download the images
# of every comments page listed in ``page``.
# NOTE(review): the address below goes through the 2.zoppoz.workers.dev host rather
# than book.douban.com directly -- confirm that this is intended.
url_prefix = 'https://book.douban.com/subject/1449351/comments/?start='
url_suffix = '&limit=20&status=P&sort=score'
rating_re = re.compile(r'<span class="user-stars allstar(.*?) rating"')

# Directory that receives the downloaded images.  ``exist_ok`` replaces the
# separate os.path.exists() check and its check-then-create race.
path = 'picture1'
os.makedirs(path, exist_ok=True)

for a in page:
    # A timeout keeps a stalled connection from hanging the script forever
    # (the image requests below already used one).
    r = req.get(url_prefix + a + url_suffix, headers=h, timeout=10)
    print(r.status_code)
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    # Echo every short comment and collect it, one comment per line.
    comments = []
    for tag in soup.find_all('span', 'short'):
        print(tag.text)
        comments.append(tag.text + '\n')
    # Append mode: the comments of all pages accumulate in the same file.
    with open('comment1.text', 'a') as fp:
        fp.writelines(comments)

    # The captured class suffix is text scraped from the network, so it is
    # converted with int() instead of being evaluated as Python code.
    page_score = sum(int(stars) for stars in rating_re.findall(r.text))
    print('该页评分总和为:')
    print(page_score)

    # Collect the image addresses; <img> tags without a src attribute have
    # nothing to download and are skipped.
    sources = (img.get('src') for img in soup.find_all('img'))
    pic = [src for src in sources if src]

    # File numbers start right after this page's start offset.
    for n, src in enumerate(pic, start=int(a) + 1):
        photo = req.get(src, timeout=10)
        # os.path.join replaces the hard-coded '\' separator, which was also
        # an invalid string escape ('\{').
        with open(os.path.join(path, '{0:03}.jpg'.format(n)), 'wb') as fp:
            fp.write(photo.content)

    # NOTE(review): the original indentation was lost; the pause and the
    # completion message are assumed to run once per page -- confirm.
    time.sleep(5)
    print('图片下载完成')
# Word-cloud rendering.
# The comments file is written with open()'s default encoding, so it is read
# back the same way.  The former 'ANSI' codec name only exists on Windows and
# could also disagree with the encoding actually used for writing.
with open('comment1.text', 'r') as f:
    t = f.read()

# lcut (star-imported above, presumably from jieba) splits the text into a
# list of words; they are re-joined with single spaces before generation.
txt = ' '.join(lcut(t))

w = WordCloud(
    font_path='msyh.ttc',
    mask=mask,
    width=1000,
    height=700,
    background_color='white',
)
w.generate(txt)
w.to_file('luxuncomment1.png')