Press F12 and inspect the page elements.
You will find that every chapter title and its URL are right there in the HTML. To download the novel's text, we need to collect those chapter links, send a request to each one, fetch the page content, and then parse the HTML and filter out the part we want.
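In code, that boils down to three steps: request the page, parse the HTML, and select the elements you found in the inspector. Here is a minimal sketch of that loop, using the sample catalog URL and the chapter-list CSS selector from the complete script below (it only reads the first catalog page; the full script also walks the pagination):

import requests
from bs4 import BeautifulSoup

url = "https://www.bbiquge.net/book/126240/"  # sample catalog URL from the script below
headers = {'User-Agent': 'Mozilla/5.0'}  # a browser-like UA so the site answers normally

response = requests.get(url, headers=headers)
response.encoding = 'gbk'  # the site serves GBK-encoded pages
soup = BeautifulSoup(response.text, 'lxml')

# Each <a> in the chapter list carries a title and a relative href.
for a in soup.select('body > div.zjbox > dl > dd > a'):
    print(a.get_text(), url + a.get('href'))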
Final result
Libraries used
pip install requests
pip install bs4
pip install lxml  # needed because the script parses pages with BeautifulSoup's 'lxml' parser
Complete code
import os
import time

import requests
from bs4 import BeautifulSoup


def books(path, url1):
    # Create the output folder; path can be a relative or an absolute path.
    if not os.path.exists(path):
        os.mkdir(path)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/96.0.4664.45 Safari/537.36'
    }
    response = requests.get(url=url1, headers=headers)
    response.encoding = 'gbk'  # the site serves GBK pages; set this on the response before parsing
    soup = BeautifulSoup(response.text, 'lxml')
    # Pagination: read the value of every <option> in the page selector into a list.
    options = soup.select("body > div.zjbox > div > select > option")
    page_urls = [option.get('value') for option in options]
    for page in page_urls:
        # Each option value is a path relative to the site root; build the
        # full URL of that catalog page.
        url2 = "https://www.bbiquge.net" + page
        response = requests.get(url=url2, headers=headers)
        response.encoding = 'gbk'
        html_two = BeautifulSoup(response.text, 'lxml')
        chapters = html_two.select('body > div.zjbox > dl > dd > a')
        for chapter in chapters:
            title = chapter.get_text()  # chapter title
            file_path = os.path.join(path, f"{title}.txt")
            if os.path.exists(file_path):
                # Already on disk: skip it so reruns don't download everything again.
                print(title, "already downloaded")
                continue
            # Not downloaded yet: build the chapter URL and request it.
            url3 = url1 + chapter.get('href')
            response = requests.get(url=url3, headers=headers)
            response.encoding = 'gbk'
            html_three = BeautifulSoup(response.text, 'lxml')  # parse the chapter page
            # Extract the chapter text and normalize the whitespace.
            context = html_three.find('div', id='content').get_text()
            line_list = context.split()  # tidy up the formatting
            te = '\n'.join(line_list)    # rejoin with newlines
            # The with statement closes the file itself; no f.close() needed.
            with open(file_path, "w", encoding='utf-8') as f:
                f.write(te)
            print(f"Downloaded chapter {title} -----------")


if __name__ == '__main__':
    time_start = time.time()  # start timing
    path = input("Enter the save folder: ")
    url1 = input("Enter the book's catalog URL, e.g. https://www.bbiquge.net/book/126240/: ")
    # url1 = "https://www.bbiquge.net/book/126240/"
    books(path, url1)
    time_end = time.time()  # stop timing
    time_c = time_end - time_start  # elapsed seconds
    print('Took', time_c, 's')
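To use it, run the script and enter a save folder (created automatically if it does not exist) and the book's catalog URL, e.g. https://www.bbiquge.net/book/126240/. Each chapter is written to <folder>/<title>.txt, chapters already on disk are skipped, and the total run time is printed at the end.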