二手房信息爬虫带翻页程序代码QZQ

import requests
import pandas as pd
import time
import parsel
import re
import csv
import os

# Number of listing pages to scrape; adjust as needed.
total_pages = 5

# Output path. Kept in one constant so the final status message always names
# the file that was actually written.
OUTPUT_CSV = 'data翻页.csv'

# CSV column order; build_row() produces exactly these keys.
FIELDNAMES = [
    '标题',
    '售价',
    '单价',
    '小区',
    '商圈',
    '户型',
    '面积',
    '朝向',
    '装修',
    '楼层',
    '年份',
    '建筑结构',
    '详情页',
]

# Listing index of the site the cookies below belong to. The previous URL had
# a mirror host prefixed to it, which sent these cookies to a third party.
BASE_URL = 'https://cs.lianjia.com/ershoufang/'

# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0",
    'cookie': 'select_city=430100; lianjia_ssid=9455e186-cb3c-48ea-a643-8c955eddbfb7; lianjia_uuid=6d4ba934-6fe8-4849-8c07-3a062deb862b; Hm_lvt_46bf127ac9b856df503ec2dbf942b67e=1758002123; Hm_lpvt_46bf127ac9b856df503ec2dbf942b67e=1758002123; HMACCOUNT=4D76AF8D7A0527E2; _jzqa=1.4606591800669647400.1758002123.1758002123.1758002123.1; _jzqc=1; _jzqckmp=1; _qzja=1.1281956671.1758002123256.1758002123256.1758002123257.1758002123256.1758002123257.0.0.0.1.1; _qzjc=1; _qzjto=1.1.0; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221995117926dc8-03edbba1febe0f-4c657b58-1327104-1995117926ef3%22%2C%22%24device_id%22%3A%221995117926dc8-03edbba1febe0f-4c657b58-1327104-1995117926ef3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiOWJlNjA4MThlODllOGQzNjZmMzEwYTY3ZWVlMDM0YTQwNmM3NjExZWEyMDhiNDZjYzk5MGYxNDJmMzU5MWE5ZTUwMjJiNGI1MTUwYzU2YTRhMjdhZWUwODVmNGRjZmU4YzQ0M2Q3ZThhY2E5NDBhZjI5Yzk3ZDY5MzYyY2E2NDVkMjg0ZTYyYzMxMDcyYWFmMjZjMGExODIyODE1NjMxMGRmNGNlYTA0NDFhNzhjY2YxNWZhMTQ5NDViNzYzZGY3NDg1Njg1NWNhYTY5ZDg4ZjliZjE2ZjAyNDU1NzE3NDFmZTNhOTA0M2NkYjQ1OWJkZjVkNGQ0N2VkMGFhNTc3MlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCI5OTM3ZTYwYVwifSIsInIiOiJodHRwczovL2NzLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=; _jzqb=1.1.10.1758002123.1; _qzjb=1.1758002123256.1.0.0.0; _ga=GA1.2.741644465.1758002134; _gid=GA1.2.613203556.1758002134; _gat=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _ga_4JBJY7Y7MX=GS2.2.s1758002135$o1$g0$t1758002135$j60$l0$h0'
}


def build_page_url(page):
    """Return the listing URL for a 1-based page number.

    Page 1 is the bare index URL; later pages append a ``pg<N>/`` segment.
    """
    if page == 1:
        return BASE_URL
    return f'{BASE_URL}pg{page}/'


def _item(seq, index):
    """Return ``seq[index]``, or an empty string when the index is out of range."""
    return seq[index] if len(seq) > index else ''


def build_row(title, href, total_price, unit_price, position_info, house_info_text):
    """Assemble one CSV row dict from the raw values scraped for a listing.

    Args:
        title: Listing title text, or None if not found.
        href: Detail-page link, or None if not found.
        total_price: Total price text, or None if not found.
        unit_price: Unit price attribute value, or None if not found.
        position_info: List of link texts from the position block; the first
            entry is used as the community and the second as the district.
        house_info_text: The ' | '-separated house description, or None/empty.

    Returns:
        A dict keyed by FIELDNAMES. Missing positional fields become '' and
        the year becomes '未知' unless the description has exactly 7 parts.
    """
    # An empty or missing description yields no parts at all (not ['']).
    house_info = house_info_text.split(' | ') if house_info_text else []

    # The year is only read when all 7 parts are present, in which case it is
    # the second-to-last one.
    year = house_info[-2] if len(house_info) == 7 else '未知'

    return {
        '标题': title,
        '售价': total_price,
        '单价': unit_price,
        '小区': _item(position_info, 0),
        '商圈': _item(position_info, 1),
        '户型': _item(house_info, 0),
        '面积': _item(house_info, 1),
        '朝向': _item(house_info, 2),
        '装修': _item(house_info, 3),
        '楼层': _item(house_info, 4),
        '年份': year,
        '建筑结构': house_info[-1] if house_info else '',
        '详情页': href,
    }


def main():
    """Scrape up to ``total_pages`` listing pages and write them to OUTPUT_CSV.

    Stops early when a page yields no listings. A page that raises is reported
    and skipped so the remaining pages are still attempted.
    """
    # Count only pages whose rows were actually written, so the summary is
    # accurate after an early stop or an error, and defined when
    # total_pages is 0.
    pages_scraped = 0

    with open(OUTPUT_CSV, mode='w', encoding='utf-8', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        csv_writer.writeheader()

        for page in range(1, total_pages + 1):
            print(f"正在爬取第 {page} 页...")

            try:
                res = requests.get(
                    url=build_page_url(page),
                    headers=HEADERS,
                    timeout=REQUEST_TIMEOUT,
                ).text
                selector = parsel.Selector(res)
                listings = selector.css('.sellListContent li .info')

                # No listings: either past the last page or blocked.
                if not listings:
                    print(f"第 {page} 页未获取到数据,可能已达最大页数或被反爬")
                    break

                for info in listings:
                    csv_writer.writerow(build_row(
                        title=info.css('.title a::text').get(),
                        href=info.css('.title a::attr(href)').get(),
                        total_price=info.css('.totalPrice span::text').get(),
                        unit_price=info.css(' .unitPrice::attr(data-price)').get(),
                        position_info=info.css('.positionInfo a::text').getall(),
                        house_info_text=info.css('.houseInfo::text').get(),
                    ))

                pages_scraped += 1

                # Pause between pages to reduce the chance of being blocked.
                time.sleep(2)

            except Exception as e:
                # Deliberate best-effort boundary: report the failed page and
                # move on to the next one after a slightly longer pause.
                print(f"爬取第 {page} 页时出错: {str(e)}")
                time.sleep(3)

    # Printed after the file is closed, so the data is on disk by now.
    print(f"\n爬取完成!共爬取 {pages_scraped} 页数据,CSV文件已保存至:{OUTPUT_CSV}")


if __name__ == '__main__':
    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

EYYLTV

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值