python爬取[百度热搜]

Bruk.Liu

于 2025-01-06 14:33:06 发布

阅读量415

点赞数 3

CC 4.0 BY-SA版权

文章标签： python 开发语言

本文链接：https://blog.csdn.net/weixin_46054799/article/details/144963324

import re
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Baidu real-time hot-search board. The request must go to top.baidu.com
# directly; the previous literal had been rewritten by a web mirror and sent
# the request to an unrelated third-party proxy host instead.
base_url = "https://top.baidu.com/board?tab=realtime"
# Request headers: present a desktop Chrome User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# Path of the Excel file the results are written to.
save_file = "热搜.xlsx"

# Prepare the workbook that collects the scraped rows.
wb = Workbook()
ws = wb.active
ws.title = '热搜榜'
# Header row: title, hot-search index, detail link, thumbnail.
ws.append(["标题", "热搜指数", "详情链接", "缩略图"])

# Fetch the board page. requests applies no timeout by default, so set one
# explicitly to keep the script from hanging on a stalled connection.
response = requests.get(base_url, headers=headers, timeout=10)
# Fail fast on an HTTP error status instead of parsing an error page and
# silently producing an empty sheet.
response.raise_for_status()
# Use the encoding detected from the response body when decoding the text.
response.encoding = response.apparent_encoding
# Parse the HTML with BeautifulSoup using the lxml parser.
soup = BeautifulSoup(response.text, 'lxml')
# Each hot-search entry is selected by a CSS class starting with
# 'category-wrap_'; only that prefix is matched.
pattern = re.compile(r'^category-wrap_')
# Collect every element carrying such a class.
matching_elements = soup.find_all(class_=pattern)

# Class prefix of the hot-index element. Compiled once, outside the loop,
# under its own name so it does not overwrite the entry-matching pattern.
hot_index_pattern = re.compile(r'^hot-index_')

# Print every entry and append it to the worksheet.
for element in matching_elements:
    print("=====================================================")
    title_node = element.select_one(".c-single-text-ellipsis")
    hot_div = element.find(class_=hot_index_pattern)
    # The first anchor of the entry carries the detail link and wraps the
    # thumbnail image.
    first_link = element.find('a')
    if title_node is None or hot_div is None or first_link is None:
        # A single malformed entry must not abort the run: the workbook is
        # only saved after the loop, so a crash here would lose every row.
        print("跳过:条目结构不完整")
        continue

    # Title
    title = title_node.text
    print(f"标题:{title}")
    # Hot-search index
    hot_index = hot_div.text
    print(f"热搜指数:{hot_index}")
    # Detail link
    hot_url = first_link.get('href', '')
    print(f"详情链接:{hot_url}")
    # Thumbnail; fall back to an empty cell when the entry has no image.
    thumb = first_link.find('img')
    hot_img_url = thumb.get('src', '') if thumb is not None else ''
    print(f"缩略图:{hot_img_url}")
    # Column order must match the header row: title, hot index, link,
    # thumbnail. (The hot-index column previously received the link.)
    ws.append([title, hot_index, hot_url, hot_img_url])

# Write the workbook to disk.
wb.save(save_file)
print(f"热搜数据已保存到 {save_file}")