从某电影网站提取影片基本信息并存入数据库;修改数据库连接配置后即可使用。
网站:aHR0cHM6Ly93d3cuZHl0dDhhLmNvbQ==(Base64 编码)
代码如下:
import requests
from lxml import etree
import pymysql
from pymysql.converters import escape_string
class Mysqlmovie(object):
    """Small MySQL helper that stores scraped movie records in the ``movie`` table.

    A connection and a cursor are opened in ``__init__`` and reused by every
    method. All values are sent to the server as bound parameters instead of
    being formatted into the SQL text, so quotes or other special characters
    in scraped strings cannot alter the statement.
    """

    def __init__(self):
        """Open the database connection and create the shared cursor."""
        self.table = 'movie'
        # NOTE: the argument below is the placeholder left in the original
        # script (its string value is kept unchanged). Replace it with the
        # real connection settings, passed as keyword arguments such as
        # host=..., user=..., password=..., database=..., before running.
        self.db = pymysql.connect(
            """
"""
        )
        self.cursor = self.db.cursor()

    def close(self):
        """Release the cursor and the underlying database connection."""
        self.cursor.close()
        self.db.close()

    def process(self, item):
        """Insert ``item`` unless a row with the same movie name already exists.

        Args:
            item: Mapping with the keys ``name``, ``place``, ``score``,
                ``year`` and ``type``.
        """
        if not self.select_one(item['name']):
            self.insert_one(item)

    def select_one(self, moviename):
        """Return the stored ``moviename`` values equal to ``moviename``.

        Args:
            moviename: Movie title to look up.

        Returns:
            A list holding the first column of every matching row; empty when
            the movie is not stored yet.
        """
        sql = "SELECT moviename from movie where moviename = %s;"
        # The driver quotes and escapes the bound value itself.
        self.cursor.execute(sql, (moviename,))
        return [row[0] for row in self.cursor.fetchall()]

    def insert_one(self, item):
        """Insert one movie row and commit.

        ``insert ignore`` is kept from the original statement, so rows the
        server rejects as duplicates do not raise.

        Args:
            item: Mapping with the keys ``name``, ``place``, ``score``,
                ``year`` and ``type``.

        Raises:
            KeyError: If any of the required keys is missing from ``item``.
        """
        sql = (
            "insert ignore into movie "
            "(`moviename`, `place`, `score`, `year`, `type`) "
            "values (%s, %s, %s, %s, %s)"
        )
        params = (
            item['name'],
            item['place'],
            item['score'],
            item['year'],
            item['type'],
        )
        self.cursor.execute(sql, params)
        self.db.commit()
def _page_from_url(url):
    """Return the number in a trailing ``-<n>.html`` of ``url``, or None."""
    filename = url.rsplit('/', 1)[-1]
    if not filename.endswith('.html') or '-' not in filename:
        return None
    number = filename[:-len('.html')].rsplit('-', 1)[-1]
    try:
        return int(number)
    except ValueError:
        return None


def _parse_movie(li, page):
    """Build the item dict for one list entry, or None if the entry is incomplete."""
    names = li.xpath("./div/div/h4/a/text()")
    infos = li.xpath("./div/div/p/text()")
    scores = li.xpath('./div/a/span[2]/span[1]/text()')
    if not (names and infos and scores):
        return None
    # The info line is expected to look like "<year>/<place>/<type>".
    parts = infos[-1].split('/')
    if len(parts) != 3:
        return None
    year, place, kind = parts
    return {
        'name': names[-1],
        'score': scores[-1].replace('分', ''),
        'year': year,
        'place': place,
        'type': kind,
        'page': page,
    }


def get_one_page(url, page=None):
    """Download one listing page and store every complete movie entry on it.

    Each entry yields a dict with the keys ``name``, ``score``, ``year``,
    ``place``, ``type`` and ``page``; the dict is printed and handed to
    ``Mysqlmovie.process``. Entries that lack one of the expected nodes, or
    whose info line does not split into exactly three ``/``-separated parts,
    are skipped instead of raising part-way through the page.

    Args:
        url: Address of the listing page.
        page: Page number recorded in each item. When omitted it is taken
            from the trailing ``-<n>.html`` of ``url`` (None if the URL does
            not end that way), so the function no longer depends on a global
            loop variable being defined by the caller.
    """
    if page is None:
        page = _page_from_url(url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',
    }
    # Without a timeout a stalled server would block the crawl indefinitely.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = "utf-8"
    html = etree.HTML(resp.text)
    lis = html.xpath("/html/body/div[1]/div/div[3]/div/div/ul/li")
    if not lis:
        return

    # One connection per page instead of one (never closed) per item.
    store = Mysqlmovie()
    try:
        for li in lis:
            item = _parse_movie(li, page)
            if item is None:
                print('skip incomplete entry on page {}'.format(page))
                continue
            print(item)
            store.process(item)
    finally:
        store.cursor.close()
        store.db.close()
if __name__ == '__main__':
    # Crawl listing pages 1 through 9; adjust the range to fetch other pages.
    # NOTE: keep the loop variable a module-level name called `i`;
    # get_one_page may look it up as a global for the item's page number.
    page_url = "https://www.dytt8a.com/dytt/1-{}.html"
    for i in range(1, 10):
        url = page_url.format(i)
        print(url)
        get_one_page(url)
# 获取片名,地区,评分,年份,类型