Python 爬虫:爬取古诗词网站
导入库
import requests
from lxml import etree
函数1:获取网页
def get_html(k, type_v1, timeout=10):
    """Fetch one listing page from so.gushiwen.cn and return the response.

    The request URL and the HTTP status code are printed to stdout as
    progress output.

    Args:
        k: Integer index; ``k + 1`` is sent as the ``p`` query parameter
            (presumably the 1-based page number -- confirm against the site).
        type_v1: Value sent as the ``c`` query parameter (presumably a
            category filter -- confirm against the site).
        timeout: Seconds passed to ``requests.get`` as its timeout so that
            a stalled server cannot block the caller indefinitely.

    Returns:
        The ``requests.Response`` object. The status code is only printed,
        not validated, so callers must check it themselves.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection failure or timeout.
    """
    url = (
        "https://so.gushiwen.cn/mingju/default.aspx"
        "?p={}&c={}&t=".format(k + 1, type_v1)
    )
    print(url)
    # The header value must be the bare UA string: no "User-Agent:" prefix
    # inside the value, and the parenthesised comment must be closed.
    ua = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        ),
    }
    resp = requests.get(url, headers=ua, timeout=timeout)
    print(resp.status_code)
    return resp
函数2:转换格式
def change_html(resp):
    """Parse a response body into an lxml HTML element tree.

    Args:
        resp: Object exposing the decoded page body as ``.text``, such as
            a ``requests.Response``.

    Returns:
        The value returned by ``etree.HTML(resp.text)``, i.e. the parsed
        document root on which ``.xpath(...)`` queries can be run.
    """
    return etree.HTML(resp.text)
函数3:解析元素
def elem_def(html):
title_v1 = html.xpath