'''
Crawler workflow:
1. Define the requirement:
    What is your goal?
    Account name, post time, post content.
2. Find the data source:
    Which URL serves the data you need?
    https://s.weibo.com/weibo?q=python
3. Build request headers, send the request, and get the page source:
    Adding request headers deals with basic anti-scraping checks.
4. Parse the data:
    Pull the fields you need out of the page source.
    Options: the re module, bs4, or lxml.
5. Store the data:
    Write the rows to a CSV file that Excel can open.
6. Extend the code to crawl multiple pages:
    Find the URL pattern.
'''
import re

import requests
from bs4 import BeautifulSoup
f = open('微博.csv', 'w', encoding='utf-8')
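# Note: plain utf-8 CSVs often show garbled Chinese when opened directly in
# Excel; 'utf-8-sig' (utf-8 with a BOM) avoids that -- see the csv-module
# sketch at the end of the script.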
for q in ['java', 'python']:
    for page in range(1, 4):
        url = f'https://s.weibo.com/weibo?q={q}&page={page}'
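        # URL pattern for step 6 of the workflow: the search keyword is the
        # q query parameter and pagination is the page parameter, so crawling
        # several keywords across several pages is just this nested loop.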
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "priority": "u=0, i",
            "referer": "https://weibo.com/",
            "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-site",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
            "Cookie": "SINAGLOBAL=7144301666601.54.1712030005467; SCF=ApI57uOujeYAkyo_9zp_2OF8qyOFGDSaSb-c-CEPQmDS3hRcfu9qVZplCGINVt0rcBYwmYe1WyIrjJsAPg0l5kA.; SUB=_2A25LmNNwDeRhGeNG6VQS8yrNyjuIHXVo1Gq4rDV8PUNbmtAbLWv3kW9NS3qSABaEnorcIjFpU0fEwdnAbhIij-LV; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF2EgOj_qlqsxnLSjV6mMYf5JpX5KzhUgL.Fo-Reoq0e0BpeKM2dJLoIEUq-XQLxK-LB-qL1KzLxK-L1hqLBo5LxKBLBo.L12zLxK.L1-zLB-2t; ALF=02_1724133408; _s_tentry=weibo.com; Apache=617500258953.485.1721729368416; ULV=1721729368419:3:1:1:617500258953.485.1721729368416:1713403940613",
        }
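        # Note (not in the original post): the Cookie above is tied to one
        # logged-in session and expires; replace it with a fresh value copied
        # from your own browser's DevTools, or Weibo will serve a login page.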
        resp = requests.get(url, headers=headers)  # send the request
        html = resp.text  # grab the page source
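        # Sanity check (an addition, not in the original script): an HTTP
        # error here means the request itself failed; an expired Cookie
        # instead yields a login page that the selectors below silently
        # fail to match.
        resp.raise_for_status()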
        # ************************ parse the page source ************************
        soup = BeautifulSoup(html, 'lxml')  # build the parse tree
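        # lxml is a third-party parser (pip install lxml); the standard
        # library's 'html.parser' also works here, just more slowly.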
        # Account names. class is a Python keyword, so bs4 spells the filter
        # class_ (attrs={'class': 'name'} also works); find_all returns a list.
        title = []
        for i in soup.find_all(class_="name"):
            title.append(i.text)  # extract the tag's text
        # Post times (renamed from `time` so the stdlib module isn't shadowed).
        times = []
        for i in soup.find_all(class_="from"):
            # .strip() trims the given characters from both ends;
            # by default it removes whitespace and '\n'
            times.append(i.find('a').text.strip())
        # Post content
        content = []
        for i in soup.find_all(class_="txt"):  # prefer class or id selectors
            # drop the trailing "展开" (expand) marker on truncated posts
            content.append(re.sub('展开.', '', i.text.strip()))
        # Write one CSV row per post. zip() keeps the three lists aligned even
        # if their lengths differ; ASCII commas inside a field are replaced
        # with full-width commas so they don't break the columns.
        for name, ts, text in zip(title, times, content):
            row = ','.join(s.replace(',', ',') for s in (name, ts, text)) + '\n'
            f.write(row)
f.close()
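
# A safer storage variant (a sketch, not from the original post): Python's
# csv module quotes fields automatically, so commas inside posts need no
# replacement, and the utf-8-sig BOM lets Excel detect the encoding.
import csv

def write_rows(path, rows):
    """Write (name, time, content) tuples to a CSV that Excel opens cleanly."""
    # newline='' is required by the csv module to avoid blank lines on Windows
    with open(path, 'w', encoding='utf-8-sig', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['名称', '时间', '内容'])  # header row
        writer.writerows(rows)

# Usage: collect the rows as a list of (name, ts, text) tuples inside the
# loop above, then call write_rows('微博.csv', rows) once at the end.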