Scraping Weibo Data

'''
Scraping workflow:
    1. Define the requirement:
        What is the goal?
        Poster name, post time, post content
    2. Find the data source:
        Which URL holds the data you need?
        https://s.weibo.com/weibo?q=python
    3. Build request headers, send the request, and fetch the page source:
        Add request headers to get past anti-scraping checks
    4. Parse the data:
        Extract the fields you need from the page source
        re module, bs4 module, lxml module
    5. Store the data:
        The data can be saved to a CSV file and opened in Excel
    6. Extend the code to scrape multiple pages:
        Look for a pattern in the URLs
'''
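# The code below follows the steps above: one request per (keyword, page)
# pair, BeautifulSoup + re for parsing, and a CSV file for storage.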
import re
import requests
from bs4 import BeautifulSoup

# 'utf-8-sig' writes a BOM so Excel opens the CSV with the correct encoding
f = open('微博.csv', 'w', encoding='utf-8-sig')
for q in ['java', 'python']:
    for page in range(1, 4):
        url = f'https://s.weibo.com/weibo?q={q}&page={page}'
        
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "priority": "u=0, i",
            "referer": "https://weibo.com/",
            "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-site",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
        ,'Cookie':'SINAGLOBAL=7144301666601.54.1712030005467; SCF=ApI57uOujeYAkyo_9zp_2OF8qyOFGDSaSb-c-CEPQmDS3hRcfu9qVZplCGINVt0rcBYwmYe1WyIrjJsAPg0l5kA.; SUB=_2A25LmNNwDeRhGeNG6VQS8yrNyjuIHXVo1Gq4rDV8PUNbmtAbLWv3kW9NS3qSABaEnorcIjFpU0fEwdnAbhIij-LV; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF2EgOj_qlqsxnLSjV6mMYf5JpX5KzhUgL.Fo-Reoq0e0BpeKM2dJLoIEUq-XQLxK-LB-qL1KzLxK-L1hqLBo5LxKBLBo.L12zLxK.L1-zLB-2t; ALF=02_1724133408; _s_tentry=weibo.com; Apache=617500258953.485.1721729368416; ULV=1721729368419:3:1:1:617500258953.485.1721729368416:1713403940613'
        }
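        # Note: Weibo search results require a logged-in session, so the Cookie
        # above must be replaced with one copied from your own browser; the
        # values shown here are session-specific and will expire.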
        res = requests.get(url, headers=headers).text  # send the request and get the page source
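        # A short pause between requests (e.g. time.sleep(1)) and a check of the
        # response status code would make this loop less likely to trip Weibo's
        # rate limiting; both are omitted here to keep the example minimal.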
        
        #**************************************************************************************
        soup = BeautifulSoup(res, 'lxml')  # parse the page source
        
        # Poster name
        title = []
        for i in soup.find_all(class_="name"):  # class is a Python keyword, so use class_ or attrs={'class': 'name'}; returns a list
            title.append(i.text)  # extract the text
        
        # Post time
        times = []
        for i in soup.find_all(class_="from"):
            times.append(i.find('a').text.strip())  # .strip() trims the given characters from both ends; by default spaces and '\n'
        
        # Post content
        content = []
        for i in soup.find_all(class_="txt"):  # prefer selecting by class or id
            content.append(re.sub('展开.', '', i.text.strip()))  # remove the "展开" (expand) marker and the character after it
        
        # Fullwidth commas replace ASCII commas inside the fields so they do not
        # collide with the CSV delimiter; zip() also guards against the three
        # lists having different lengths
        for t, tm, c in zip(title, times, content):
            f.write(re.sub(',', ',', t) + ',' + re.sub(',', ',', tm) + ',' + re.sub(',', ',', c) + '\n')
f.close()
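The script writes the CSV by hand, swapping ASCII commas for fullwidth ones so field text cannot collide with the delimiter. A minimal alternative sketch using the standard-library csv module, which quotes such fields automatically (it reuses the title, times, and content lists built inside the loop above):

import csv

with open('微博.csv', 'w', encoding='utf-8-sig', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['name', 'time', 'content'])  # header row
    # fields containing commas or quotes are escaped automatically
    writer.writerows(zip(title, times, content))

newline='' stops the csv module from emitting blank lines on Windows, and utf-8-sig adds a BOM so Excel detects the encoding; with proper quoting, the fullwidth-comma substitution is no longer needed.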
