Python 爬取某网站小说(若 Cookie 失效,请自行按 F12 打开浏览器开发者工具,复制最新的 Cookie 并替换脚本中的 cookie 变量)
【仅供学习研究使用,请勿用于任何非法用途】
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2024/8/23 12:41
# @Author : 何胜金-heshengjin
# @Site :
# @File : http_test.py
# @Software: PyCharm
"""
Novel scraper for www.gushufang.com.

Set up a virtualenv, then install the dependencies:
pip install requests
pip install beautifulsoup4
"""
import requests
from bs4 import BeautifulSoup
import time
# Request headers -- fill in your own browser's values (especially Cookie)
# before running, otherwise the site may reject the requests.
host = 'www.gushufang.com'
host_http = 'http://www.gushufang.com'
# Referer: the novel's table-of-contents page.
referer = 'http://www.gushufang.com/ertong/yiqianlingyiyetonghuagushi/'
# Session cookie copied from the browser (F12); refresh it when it expires.
cookie = 'Hm_lvt_1c9140008e587d9bd24d8fae079380ff=1735087028; HMACCOUNT=FF82E8281B1A4114; Hm_lpvt_1c9140008e587d9bd24d8fae079380ff=1735087058'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'max-age=0',
'Referer': referer,
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
'Cookie': cookie,
'Host': host,
'Connection': 'keep-alive'
}
# Output text file the scraped chapters are appended to.
content_txt = "一千零一夜.txt"
# Scratch file holding the most recently fetched page (debug aid).
tmp_html = "temp.html"
# Link text of the "next chapter" anchor; crawling stops once the followed
# link's text no longer matches this constant.
next_text = '【下一篇】'
next_text_constant = '【下一篇】'
# First chapter page of the novel (crawl starting point).
main_url = "http://www.gushufang.com/ertong/yiqianlingyiyetonghuagushi/67860.html"
# Crawl loop: fetch each chapter page, extract title and body text,
# normalize punctuation for TTS, append the chapter to the output file,
# then follow the "next chapter" link until its text changes.
while True:
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    # Fetch the chapter page; a timeout keeps the loop from hanging forever
    # on a stalled connection.
    source_html = requests.get(main_url, headers=headers, timeout=30)
    # The site serves gbk-encoded pages.
    source_html.encoding = 'gbk'
    html_handle = source_html.text
    # Keep a copy of the raw page for debugging (overwritten each pass).
    # errors="replace" avoids UnicodeEncodeError if the decoded text
    # contains characters that cannot round-trip back to gbk.
    with open(tmp_html, "w", encoding="gbk", errors="replace") as f:
        f.write(html_handle)
    soup = BeautifulSoup(html_handle, "html.parser")
    maincontent = soup.find('div', id='maincontent')
    content_div = soup.find('div', id='content')
    # Guard against an error page or a changed layout: stop cleanly
    # instead of crashing with AttributeError mid-run.
    if maincontent is None or maincontent.find('h1') is None or content_div is None:
        print("page structure not recognized, stopping: " + main_url)
        break
    title_text = '\nchattts正文'
    title_text += maincontent.find('h1').text
    # Print the chapter title as progress output.
    print(title_text)
    title_text += '\n'
    text = content_div.text
    # Move sentence-ending punctuation outside closing quotes so the TTS
    # engine pauses correctly (replacement chain kept verbatim).
    text = text.replace('。”', '”。').replace('。"', '"。').replace("。’", "’。").replace('!”', '"。').replace('!"', '"。').replace("!’", "’。").replace('!”', '"。').replace('!"', '"。').replace('?”', '"。').replace('?"', '"。').replace("?’", "’。").replace('?”', '"。').replace('?"', '"。')
    text = text.replace('\u3000', '').replace("。", "。\n").replace(";", "。\n")
    title_text += text
    # Append this chapter BEFORE checking the next link: the original code
    # broke out of the loop first, which silently dropped the final chapter.
    with open(content_txt, "a", encoding="utf-8") as fc:
        # Strip NBSP characters before writing.
        fc.write(title_text.replace(u'\xa0', ''))
    # Follow the "next chapter" anchor inside the todown1 span.
    todown = soup.find('span', class_='todown1')
    children_a = todown.find("a") if todown else None
    if children_a is None:
        print("no next link found, stopping: " + main_url)
        break
    main_url = host_http + children_a['href']
    next_text = children_a.get_text()
    print(next_text + main_url + "\n")
    # A different link text marks the end of the book.
    if next_text != next_text_constant:
        break
    # Be polite: pause 30s between requests.
    time.sleep(30)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time : 2024/8/23 12:41
# @Author : 何胜金-heshengjin
# @Site :
# @File : http_test.py
# @Software: PyCharm
"""
Novel scraper for www.shu3.net.

Set up a virtualenv, then install the dependencies:
pip install requests
pip install beautifulsoup4
"""
import requests
from bs4 import BeautifulSoup
import time
# Request headers -- fill in your own browser's values (especially Cookie)
# before running, otherwise the site may reject the requests.
host = 'www.shu3.net'
host_http = 'http://www.shu3.net'
# Referer: the novel's info page.
referer = 'http://www.shu3.net/bookinfo/73422.html'
# Session cookie copied from the browser (F12); refresh it when it expires.
cookie = "PHPSESSID=1qvprqgtf4avd3k1t3080e4cf2; pp_vod_v=%u519B%u4E34%u5929%u4E0B%7C/bookinfo/73422.html^%7Chttp%3A//www.shu3.net/bookinfo/73422.html_$_|; Hm_tf_udbh2lt7blv=1736492402; Hm_lvt_udbh2lt7blv=1736492402; _ga=GA1.1.459446933.1736492402; Hm_lpvt_udbh2lt7blv=1736492414; _ga_J2MC6ZQ4CR=GS1.1.1736492401.1.1.1736492418.0.0.0"
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'max-age=0',
'Referer': referer,
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
'Cookie': cookie,
'Host': host,
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
# Output text file the scraped chapters are appended to.
content_txt = "军临天下324续.txt"
# Scratch file holding the most recently fetched page (debug aid).
tmp_html = "temp.html"
# Link text of the "next page" anchor; crawling stops once the followed
# link's text no longer matches this constant.
next_text = '下一页'
next_text_constant = '下一页'
# First chapter page of the novel (crawl starting point).
main_url = "http://www.shu3.net/bookread/73422-0-317.html"
# Crawl loop: fetch each chapter page, extract title and body, append the
# chapter to the output file, then follow the "next page" link until its
# text changes.
while True:
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    # Fetch the chapter page; a timeout keeps the loop from hanging forever
    # on a stalled connection.
    source_html = requests.get(main_url, headers=headers, timeout=30)
    # This site serves utf-8 pages.
    source_html.encoding = 'utf-8'
    html_handle = source_html.text
    # Keep a copy of the raw page for debugging (overwritten each pass).
    with open(tmp_html, "w", encoding="utf-8") as f:
        f.write(html_handle)
    soup = BeautifulSoup(html_handle, "html.parser")
    title_div = soup.find('div', id='title')
    content_div = soup.find('div', id='content')
    # Guard against an error page or a changed layout: stop cleanly
    # instead of crashing with AttributeError mid-run.
    if title_div is None or content_div is None:
        print("page structure not recognized, stopping: " + main_url)
        break
    title_text = '\nchattts正文'
    title_text += title_div.text
    # Print the chapter title as progress output.
    print(title_text)
    title_text += '\n'
    title_text += content_div.text
    # Append this chapter to the output file.
    with open(content_txt, "a", encoding="utf-8") as fc:
        # Strip NBSP characters before writing.
        fc.write(title_text.replace(u'\xa0', ''))
    # Next chapter: the last <a> inside the footer link bar.
    footlink = soup.find('div', id='footlink')
    links = footlink.findAll("a") if footlink else []
    if not links:
        print("no next link found, stopping: " + main_url)
        break
    main_url = host_http + links[-1]['href']
    next_text = links[-1].get_text()
    print(next_text + main_url + "\n")
    # A different link text (e.g. a back-to-index anchor) marks the end.
    if next_text != next_text_constant:
        break
    # Be polite: pause 30s between requests.
    time.sleep(30)