import requests
import time
import parsel
import csv
# Number of pages to scrape
total_pages = 5  # adjust as needed
# Keep all write operations inside the with block
with open('data翻页.csv', mode='w', encoding='utf-8', newline='') as f:
    # Create the CSV dict writer with the output columns
    csv_writer = csv.DictWriter(f, fieldnames=[
        '标题',
        '售价',
        '单价',
        '小区',
        '商圈',
        '户型',
        '面积',
        '朝向',
        '装修',
        '楼层',
        '年份',
        '建筑结构',
        '详情页',
    ])
    # Write the header row
    csv_writer.writeheader()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0",
'cookie': 'select_city=430100; lianjia_ssid=9455e186-cb3c-48ea-a643-8c955eddbfb7; lianjia_uuid=6d4ba934-6fe8-4849-8c07-3a062deb862b; Hm_lvt_46bf127ac9b856df503ec2dbf942b67e=1758002123; Hm_lpvt_46bf127ac9b856df503ec2dbf942b67e=1758002123; HMACCOUNT=4D76AF8D7A0527E2; _jzqa=1.4606591800669647400.1758002123.1758002123.1758002123.1; _jzqc=1; _jzqckmp=1; _qzja=1.1281956671.1758002123256.1758002123256.1758002123257.1758002123256.1758002123257.0.0.0.1.1; _qzjc=1; _qzjto=1.1.0; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221995117926dc8-03edbba1febe0f-4c657b58-1327104-1995117926ef3%22%2C%22%24device_id%22%3A%221995117926dc8-03edbba1febe0f-4c657b58-1327104-1995117926ef3%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiOWJlNjA4MThlODllOGQzNjZmMzEwYTY3ZWVlMDM0YTQwNmM3NjExZWEyMDhiNDZjYzk5MGYxNDJmMzU5MWE5ZTUwMjJiNGI1MTUwYzU2YTRhMjdhZWUwODVmNGRjZmU4YzQ0M2Q3ZThhY2E5NDBhZjI5Yzk3ZDY5MzYyY2E2NDVkMjg0ZTYyYzMxMDcyYWFmMjZjMGExODIyODE1NjMxMGRmNGNlYTA0NDFhNzhjY2YxNWZhMTQ5NDViNzYzZGY3NDg1Njg1NWNhYTY5ZDg4ZjliZjE2ZjAyNDU1NzE3NDFmZTNhOTA0M2NkYjQ1OWJkZjVkNGQ0N2VkMGFhNTc3MlwiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCI5OTM3ZTYwYVwifSIsInIiOiJodHRwczovL2NzLmxpYW5qaWEuY29tL2Vyc2hvdWZhbmcvIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=; _jzqb=1.1.10.1758002123.1; _qzjb=1.1758002123256.1.0.0.0; _ga=GA1.2.741644465.1758002134; _gid=GA1.2.613203556.1758002134; _gat=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _ga_4JBJY7Y7MX=GS2.2.s1758002135$o1$g0$t1758002135$j60$l0$h0'
    }
    # Scrape each page in turn
    for page in range(1, total_pages + 1):
        print(f"Scraping page {page} ...")
        # Build the paginated URL (page 1 has no pg suffix)
        if page == 1:
            url = 'https://cs.lianjia.com/ershoufang/'
        else:
            url = f'https://cs.lianjia.com/ershoufang/pg{page}/'
        try:
            # Send the request and parse the listing cards
            res = requests.get(url=url, headers=headers, timeout=10).text
            selector = parsel.Selector(res)
            li = selector.css('.sellListContent li .info')
            # No listings found: the last page may have been reached, or the request was blocked
            if not li:
                print(f"No data on page {page}; the last page may have been reached or the request was blocked")
                break
            for l in li:
                # Extract the listing fields
                title = l.css('.title a::text').get()
                href = l.css('.title a::attr(href)').get()
                totalPrice = l.css('.totalPrice span::text').get()
                unitPrice = l.css('.unitPrice::attr(data-price)').get()
                positionInfo = l.css('.positionInfo a::text').getall()
                houseInfo_text = l.css('.houseInfo::text').get()
                # Guard against a missing houseInfo block
                if houseInfo_text:
                    houseInfo = houseInfo_text.split(' | ')
                else:
                    houseInfo = []
                # The build year is only present when houseInfo has 7 fields
                if len(houseInfo) == 7:
                    data = houseInfo[-2]
                else:
                    data = '未知'
                # Assemble the row
                dit = {
                    '标题': title,
                    '售价': totalPrice,
                    '单价': unitPrice,
                    '小区': positionInfo[0] if len(positionInfo) > 0 else '',
                    '商圈': positionInfo[1] if len(positionInfo) > 1 else '',
                    '户型': houseInfo[0] if len(houseInfo) > 0 else '',
                    '面积': houseInfo[1] if len(houseInfo) > 1 else '',
                    '朝向': houseInfo[2] if len(houseInfo) > 2 else '',
                    '装修': houseInfo[3] if len(houseInfo) > 3 else '',
                    '楼层': houseInfo[4] if len(houseInfo) > 4 else '',
                    '年份': data,
                    '建筑结构': houseInfo[-1] if len(houseInfo) > 0 else '',
                    '详情页': href,
                }
                # Write the row to the CSV
                csv_writer.writerow(dit)
            # Sleep between pages to reduce the chance of being blocked
            time.sleep(2)
        except Exception as e:
            print(f"Error while scraping page {page}: {str(e)}")
            # Sleep after an error as well before moving on
            time.sleep(3)
print(f"\n爬取完成!共爬取 {min(total_pages, page)} 页数据,CSV文件已保存至:data.csv")