……
文章目录
……
一,创建 douBanFilmSpider.py(文件名需与后文 `import douBanFilmSpider as db` 的大小写保持一致)
1. 提供 URL,发送 HTTP 请求(request)
可选工具:urllib / requests / selenium
2. 获取响应(response),解析并提取目标数据
可选工具:re / BeautifulSoup / pyquery / XPath
3. 保存数据
可选形式:JSON / txt / csv / 数据库 / 图片 / 视频
# 导入requests模块 和 BeautifulSoup模块
import requests
from bs4 import BeautifulSoup
# URL of the first page of the Douban Top 250 movie chart.
# (The previous literal had a proxy host spliced into it, so requests
# never reached movie.douban.com.)
url = 'https://movie.douban.com/top250'
# URLs of pages 1-10: the `start` offset advances by 25 per page (0, 25, ..., 225).
url_list = ['https://movie.douban.com/top250?start=%d&filter=' % i for i in range(0, 226, 25)]
# Special characters that the original author flagged as unparseable.
unrecognized_charList = [u'\xa0', u'\xf4', u'\xee', u'\xf6', u'\u0161', u'\xfb', u'\xe5']
class Spider:
    """Collect titles, one-line quotes and detail text from movie list pages.

    Pass either a single ``url`` or a ``url_list``; when ``url_list`` is
    non-empty it takes precedence and every page in it is fetched.  The three
    result lists are kept index-aligned: entry ``i`` of each list describes
    the same movie, with an empty string standing in for a missing field.
    """

    # Characters that are replaced by a space before the text is stored.
    __unrecognized_charList = (u'\xa0', u'\xf4', u'\xee', u'\xf6', u'\u0161', u'\xfb', u'\xe5', u'\u22ef')

    # Seconds to wait for the server; requests has no timeout by default.
    _REQUEST_TIMEOUT = 10

    # NOTE(review): a browser-like User-Agent is sent because some sites
    # reject the default python-requests one -- confirm this is needed here.
    _HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def __init__(self, url=None, url_list=None):
        """Fetch ``url_list`` if given, otherwise ``url``, otherwise nothing."""
        self.__titles = []    # movie titles
        self.__comments = []  # one-line quotes (span.inq)
        self.__details = []   # text of the first <p> of each entry
        if url_list:
            for page_url in url_list:
                self.get_contents(page_url)
        elif url:
            self.get_contents(url)

    @classmethod
    def _clean(cls, text):
        """Return ``text`` with every unrecognized character replaced by a space."""
        for char in cls.__unrecognized_charList:
            text = text.replace(char, u' ')
        return text

    def get_contents(self, url):
        """Download ``url`` and append one title/comment/details entry per movie.

        Raises:
            requests.HTTPError: the server answered with an error status.
            ValueError: the page contains no ``<ol>`` movie list.
        """
        response = requests.get(url, headers=self._HEADERS, timeout=self._REQUEST_TIMEOUT)
        response.raise_for_status()
        listing = BeautifulSoup(response.text, 'html.parser').find('ol')
        if listing is None:
            raise ValueError('no <ol> movie list found at %s' % url)
        # The movies are the <li> elements of the page's first <ol>.
        self.__movie_list = listing.find_all('li')
        for movie in self.__movie_list:
            title = movie.find('span', attrs={'class': 'title'})
            comment = movie.find('span', attrs={'class': 'inq'})
            details = movie.find('p')
            # Always append to all three lists so they stay index-aligned,
            # even when an entry has no quote or no detail paragraph.
            self.__titles.append(title.getText().rstrip() if title else u'')
            self.__comments.append(self._clean(comment.getText()) if comment else u'')
            self.__details.append(self._clean(details.getText().strip()) if details else u'')

    def get_titles(self):
        """Return the list of movie titles."""
        return self.__titles

    def get_comments(self):
        """Return the list of one-line quotes ('' where a movie has none)."""
        return self.__comments

    def get_details(self):
        """Return the list of detail texts ('' where a movie has none)."""
        return self.__details
二,创建myApplication.py
1.添加代码
from tkinter import *
import douBanFilmSpider as db
class App(Frame):
    """Fixed-size window that pages through ``db.url_list`` and lists the results.

    The entry box holds the URL to fetch; the search button (or Return in the
    entry) fetches it, and the previous/next buttons step through
    ``db.url_list``, wrapping around at both ends.
    """

    def __init__(self, master=None):
        """Build the widgets; nothing is fetched until the user asks for it."""
        Frame.__init__(self, master)
        self.master.title("Application")
        self.master.geometry("1000x600")
        self.master.resizable(False, False)
        self.pack()

        # Labels created by the last show() call, kept so they can be destroyed.
        self._result_labels = []

        self.index = IntVar()      # zero-based index into db.url_list
        self.index.set(0)
        self.contents = StringVar()  # URL shown in (and read from) the entry box
        self.contents.set(db.url_list[self.index.get()])

        # Parent the entry on self.master like every other widget here.
        self.entrythingy = Entry(self.master)
        self.entrythingy.place(x=200, y=0, width=400, height=20)
        self.entrythingy["textvariable"] = self.contents
        # bind() needs a callable; the lambda drops the event argument.
        self.entrythingy.bind('<Key-Return>', lambda event: self.show())

        bn = Button(self.master, text='搜索', command=self.show)
        bn.place(x=600, y=0, width=40, height=20)
        btl = Button(self.master, text='上一页', command=self.pre_page)
        btl.place(x=140, y=560, width=40, height=20)
        btr = Button(self.master, text='下一页', command=self.next_page)
        btr.place(x=200, y=560, width=40, height=20)

        self.label = Label(self.master, width=40, bg='white',
                           text='第%d页' % (self.index.get() + 1), font=('Courier', 10))
        self.label.place(x=260, y=560, width=40, height=20)

    def _go_to_page(self, page):
        """Select page ``page`` (wrapped into range), update the widgets, and fetch it."""
        page %= len(db.url_list)
        self.index.set(page)
        self.contents.set(db.url_list[page])
        self.label['text'] = '第%d页' % (page + 1)
        self.show()

    def next_page(self):
        """Advance one page; the last page wraps around to the first."""
        self._go_to_page(self.index.get() + 1)

    def pre_page(self):
        """Go back one page; the first page wraps around to the last."""
        self._go_to_page(self.index.get() - 1)

    def show(self):
        """Fetch the URL in the entry box and display titles and comments."""
        # Fetch first: if this raises, the previous results stay on screen.
        spider = db.Spider(url=self.contents.get())

        # Remove the previous page's labels so they neither leak nor show through.
        for stale in self._result_labels:
            stale.destroy()
        self._result_labels = []

        # (x offset, pixel width, texts) for the title and comment columns.
        columns = (
            (0, 400, spider.get_titles()),
            (400, 600, spider.get_comments()),
        )
        for x, width, texts in columns:
            # enumerate gives each row its own line even when texts repeat.
            for row, text in enumerate(texts):
                label = Label(self.master, width=400, bg='white', text=text, font=('Courier', 10))
                label.place(x=x, y=40 + row * 20, width=width, height=20)
                self._result_labels.append(label)
# Create the application window (App is called with no master argument)
# and enter the Tk event loop.
app = App()
app.mainloop()