步骤一:

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Desktop-Chrome User-Agent so the site serves the normal HTML pages
# instead of blocking the default requests UA.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/89.0.4389.82 Safari/537.36'
}
def getBasicInfomation():
    """
    Fetch the names and URLs of the eight main districts from the rental
    index page and return them as two parallel lists.

    :return: (nameList, urlList) — district display names and absolute URLs
    """
    url = 'https://cd.lianjia.com/zufang'
    response = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(response, 'lxml')
    filter_div = soup.find('div', id='filter')
    # Items [1:9] are the eight districts; item 0 is the "all" entry.
    # (The original reused the name `temp` for both this list and the URL
    # string built inside the loop; distinct names avoid the shadowing.)
    districts = filter_div.select('div > ul > .filter__item--level2')[1:9]
    nameList = []
    urlList = []
    total = len(districts)
    print('开始获取八大城区基础信息!')
    for cnt, item in enumerate(districts, start=1):
        nameList.append(item.a.text)
        urlList.append('https://cd.lianjia.com' + item.a['href'])
        print('已获取数量:{} 完成度:{:.1f}%'.format(cnt, cnt * 100 / total))
    print('获取八大城区基础信息完成!')
    print('结果如下:')
    for pair in zip(nameList, urlList):
        print(pair)
    return nameList, urlList
def getHouseAreaAndNum(nameList, urlList):
    '''
    For each of the eight districts, scrape the sub-area names and the number
    of listings per sub-area, print progress, and write the summary to
    'test.csv'.

    :param urlList: district page URLs (parallel to nameList)
    :param nameList: district names
    :return: None — results are printed and saved to 'test.csv'
    '''
    # district name -> (sub-area names, listing counts per sub-area)
    areaDict = {}
    cnt = 0
    for i in urlList:
        areaUrl = []
        areaName = []
        response = requests.get(url=i, headers=headers).text
        soup = BeautifulSoup(response, 'lxml')
        print('开始获取当前城区:{} 区域划分情况!'.format(nameList[cnt]))
        # [1:] drops the leading "all" filter entry.
        temp = soup.find_all('li', class_=['filter__item--title', 'filter__item--level3'])[1:]
        rt = ''
        flag = 1
        cntt = 0
        for j in temp:
            strr = j.text
            # A lowercase item is a pinyin-initial section header with no link
            # of its own; remember it (uppercased) and prefix the next area.
            # NOTE(review): this lexicographic test also matches multi-char
            # lowercase strings — presumably only 1-char headers occur here;
            # confirm against the page markup.
            if strr >= 'a' and strr <= 'z':
                rt = strr.upper()
                flag = 0
            else:
                areaUrl.append('https://cd.lianjia.com' + j.a['href'])
                strr = strr.strip('\n')
                if flag == 0:
                    # First sub-area after a header: store it with the
                    # header letter prepended.
                    rt = rt + strr
                    flag = 1
                    areaName.append(rt)
                else:
                    areaName.append(strr)
            cntt += 1
            print("已获取数量:{} 完成度:{:.1f}%".format(cntt, cntt * 100 / len(temp)))
        print('获取当前城区:{} 区域划分情况完成!'.format(nameList[cnt]))
        print('开始获取当前城区:{} 每个划分区域的挂网租房数量!'.format(nameList[cnt]))
        areaNum = []
        cntt = 0
        for j in areaUrl:
            # Each sub-area page shows its own total listing count.
            newResponse = requests.get(url=j, headers=headers).text
            soup = BeautifulSoup(newResponse, 'lxml')
            temp = soup.find('span', class_='content__title--hl')
            areaNum.append(int(temp.text))
            cntt += 1
            print("已获取数量:{} 完成度:{:.1f}%".format(cntt, cntt * 100 / len(areaUrl)))
        print('获取当前城区:{} 每个划分区域的挂网租房数量完成!'.format(nameList[cnt]))
        areaDict[nameList[cnt]] = (areaName, areaNum)
        cnt += 1
        print('总共:已获取数量:{} 完成度:{:.1f}%'.format(cnt, cnt * 100 / len(urlList)))
    print('所有城区的租房区域划分情况和挂网租房数量,获取完成!')
    print('结果如下:')
    # Per-district totals become a third row of the DataFrame.
    total = []
    for key, val in areaDict.items():
        total.append(sum(val[1]))
    myDF = pd.DataFrame(areaDict)
    myDF.loc['2'] = total
    myDF.index = ['当前城区区域划分情况', '当前城区区域划分挂网租房数量(单位:套)', '当前城区总挂网租房数量(单位:套)']
    print(myDF)
    myDF.to_csv('test.csv')
    print('结果已存入csv文件!')
# Script entry for step 1: fetch the district list, then scrape each one.
# NOTE(review): runs at import time — there is no `if __name__` guard.
nameList, urlList = getBasicInfomation()
getHouseAreaAndNum(nameList, urlList)
步骤二、三:

'''
导入各种包,其中:
re.findall用来进行正则匹配
csv用来写csv文件
asyncio和aiohttp用来配合进行异步协程爬虫开发
time用来记录运行时间
logging用来显示错误日志
'''
from re import findall
import csv
import asyncio
import aiohttp
import time
import logging
# City-wide rental listing root; district paths are appended to it.
baseurl = "https://cd.lianjia.com/zufang"
# The eight district names (as shown on the site) whose links we extract.
block_list = ["锦江", "青羊", "武侯", "高新", "成华", "金牛", "天府新区", "高新西"]
# Shared aiohttp ClientSession; created in main(), closed there as well.
session = None
# Cap concurrent HTTP requests at 8.
semaphore = asyncio.Semaphore(8)
'''
这个函数定义了一个基本的用来实现一个使用get方法获取目标网页html文本的接口,相当于requests.get
input: A URL
output: This URL's HTML
'''
async def get(url):
    """GET *url* through the shared session and return its HTML text.

    Concurrency is bounded by the module-level semaphore.  On any
    aiohttp client error the exception is logged and None is returned.
    """
    async with semaphore:
        try:
            logging.info('Getting %s', url)
            async with session.get(url) as resp:
                body = await resp.text()
            return body
        except aiohttp.ClientError:
            logging.error('Error occurred while getting %s', url, exc_info=True)
def get_blockurls(html):
result = []
for block in block_list:
block_url = findall(r'href="/zufang(.*?)" >'+block, html)[0]
result.append(block_url)
return result
def get_subblock(html):
    """Return the sub-area names parsed from a district page's HTML."""
    # Collapse all whitespace so the regex can match regardless of markup
    # formatting.
    compact = html.replace("\n", "").replace("\r", "").replace("\t", "").replace(" ", "")
    matches = findall(r'--level3"><ahref="/zufang(.*?)</a>', compact)
    # Each match looks like '<path>">NAME'; keep only the NAME part.
    return [m.split('">')[1] for m in matches]
def get_roomnum(html):
    """
    Return the listing count shown on a district page.

    :param html: district page HTML
    :return: the count as a string (callers convert with int())
    :raises IndexError: if the count span is missing

    The original initialized ``result = 0`` and immediately overwrote it;
    the dead assignment is removed.
    """
    return findall(r'content__title--hl">(.*?)</span>', html)[0]
async def get_roomurls(html, num, base=None):
    """
    Collect detail-page URLs for up to *num* listings (30 per page).

    Page 1 is parsed from *html*; pages 2..N are fetched from
    ``base + "/pgN/#contentList"``.

    :param html: the first listing page's HTML
    :param num: total number of listings
    :param base: base URL for the follow-up pages; defaults to the
        city-wide baseurl, matching the original behavior.
        NOTE(review): the original always paged through the city-wide
        listing even when *html* came from a single district — pass the
        district URL here to page within that district instead.
    :return: list of absolute detail-page URLs
    """
    if base is None:
        base = baseurl
    result = []
    # Ceiling division: 30 listings per page.  (The original computed
    # int((num - num%30)/30) + 1, which requested one page too many
    # whenever num was an exact multiple of 30.)
    pagenum = -(-num // 30)
    html = html.replace("\n", "").replace("\r", "").replace("\t", "").replace(" ", "")
    urls = findall(r'class="content__list--item--aside"target="_blank"href="/zufang(.*?)"title="', html)
    for u in urls:
        result.append(baseurl + u)
    for p in range(2, pagenum + 1):
        html = await get(base + "/pg" + str(p) + "/#contentList")
        if not html:
            # get() returned None (client error); skip this page.
            continue
        html = html.replace("\n", "").replace("\r", "").replace("\t", "").replace(" ", "")
        urls = findall(r'class="content__list--item--aside"target="_blank"href="/zufang(.*?)"title="', html)
        for u in urls:
            result.append(baseurl + u)
    return result
async def get_roommessage(html, bname, w2):
    """
    Parse one listing detail page and append a row to the CSV via *w2*.

    :param html: detail page HTML
    :param bname: district name (becomes the first CSV column)
    :param w2: csv.writer for the detail output file
    :raises IndexError, KeyError: on pages whose layout differs from the
        expected template (the caller deliberately swallows these)
    """
    result = {'village':'','style':'','time':'','base':[],'pay':[],'install':[]}
    html = html.replace("\n","").replace("\r","").replace("\t","")
    # Sub-area: take the breadcrumb text and strip the district name from it.
    subname = findall(r'此房源位于成都(.*?)的',html)[0].replace(bname,"")
    # Title is '<type>·<village> <layout>'; keep the part after '·'.
    basemessage = findall(r'<p class="content__title">(.*?)</p>', html)[0].split('·')[1]
    result['village'] = basemessage.split(' ')[0]
    result['style'] = basemessage.split(' ')[1]
    result['time'] = findall(r'房源维护时间:(.*?) <!--', html)[0]
    roommessage = findall(r'<li class="fl oneline">(.*?)</li>', html)
    for m in roommessage:
        try:
            # Keep only 'label:value' items (full-width colon); bare labels
            # raise IndexError and are skipped.
            result['base'].append(m.split(':')[1])
        except:
            pass
    # Payment table: gray cell, orange cell, and the sixth plain cell.
    # NOTE(review): index [5] is position-dependent on the page layout —
    # presumably the deposit/fee column; verify against a live page.
    result['pay'].append(findall(r'<li class="table_col font_gray">(.*?)</li>',html)[0])
    result['pay'].append(findall(r'<li class="table_col font_orange">(.*?)</li>',html)[0])
    result['pay'].append(findall(r'<li class="table_col">(.*?)</li>',html)[5])
    html = html.replace(" ","").replace("(","")
    install = findall(r'</li><liclass="(.*?)"><istyle="background-image:urlhttps://image1',html)
    for i in install:
        # 'facility_no' marks a missing appliance; encode present/absent
        # as '1'/'0' in page order.
        if 'flonelinefacility_no' in i:
            result['install'].append('0')
        else:
            result['install'].append('1')
    # One CSV row: district, sub-area, village, layout, maintenance date,
    # 13 base attributes, 3 payment fields, 10 appliance flags.
    w2.writerow([bname, subname, result['village'], result['style'],
                 result['time'],result['base'][0],result['base'][1],result['base'][2],result['base'][3],
                 result['base'][4],result['base'][5],result['base'][6],result['base'][7],result['base'][8],
                 result['base'][9],result['base'][10],result['base'][11],result['base'][12],result['pay'][0],
                 result['pay'][1],result['pay'][2],result['install'][0],result['install'][1],result['install'][2],
                 result['install'][3],result['install'][4],result['install'][5],result['install'][6],result['install'][7],
                 result['install'][8],result['install'][9]])
async def get_rooms(html, num, bname, w2):
    """
    Fetch every listing detail page for a district and write CSV rows.

    :param html: district listing first-page HTML
    :param num: number of listings; capped at 1000 per district
    :param bname: district name
    :param w2: csv.writer for the detail output file
    """
    # Cap at 1000 listings (min() replaces the original if/else split).
    room_urls = await get_roomurls(html, min(num, 1000))
    if not room_urls:
        return
    for u in room_urls:
        room_r = await get(u)
        if not room_r:
            # Fetch failed; get() already logged the error.
            continue
        try:
            await get_roommessage(room_r, bname, w2)
        except Exception:
            # Best-effort: pages with an unexpected layout are skipped, but
            # log them instead of the original bare `except: pass`, which
            # also swallowed CancelledError/KeyboardInterrupt.
            logging.warning('Failed to parse %s', u, exc_info=True)
async def geturls(block, bname):
    """
    Return the sub-area names of one district.

    :param block: district URL path suffix (appended to baseurl)
    :param bname: district name (unused; kept for interface compatibility)
    :return: list of sub-area names, or [] if the page fetch failed
    """
    blockurl = baseurl + block
    block_r = await get(blockurl)
    if not block_r:
        # get() returns None on client errors; the original would have
        # crashed calling .replace() on None inside get_subblock.
        return []
    return get_subblock(block_r)
async def get_message_main(block, bname, w1, w2):
    """
    Scrape one district: read its listing count, then fetch and store
    every listing's details.

    :param block: district URL path suffix (appended to baseurl)
    :param bname: district name
    :param w1: unused; kept for interface compatibility with callers
    :param w2: csv.writer for the detail rows
    """
    print("运行了main一次")
    blockurl = baseurl + block
    block_r = await get(blockurl)
    if not block_r:
        # Fetch failed (get() returned None); the original would have
        # crashed in get_roomnum on a None argument.
        return
    room_num = get_roomnum(block_r)
    # get_rooms returns None; the original bound it to an unused variable.
    await get_rooms(block_r, int(room_num), bname, w2)
async def main():
    """
    Entry coroutine: open the shared HTTP session and the output CSV,
    then scrape all eight districts concurrently.
    """
    global session
    session = aiohttp.ClientSession()
    try:
        # newline='' is the csv-module-recommended open mode (prevents
        # blank rows on Windows); the with-block guarantees the file is
        # closed even if a task raises (the original leaked it on error).
        with open('file2.csv', 'w', encoding='utf-8', newline='') as f2:
            w2 = csv.writer(f2)
            w2.writerow(['行政区域','区域','小区','房型','房源维护时间','面积','朝向','维护','入住','楼层','电梯','车位','用水','用电','燃气','采暖','租期','看房','付款方式','租金','押金','洗衣机','空调','衣柜','电视','冰箱','热水器','床','暖气','宽带','天然气'])
            base_r = await get(baseurl)
            block_urls = get_blockurls(base_r)
            # BUG FIX: the original passed an undefined name `w1` here,
            # which raised NameError before any task ran.  The parameter
            # is unused by get_message_main, so pass None.
            indextasks = [asyncio.ensure_future(get_message_main(block, bname, None, w2))
                          for block, bname in zip(block_urls, block_list)]
            await asyncio.gather(*indextasks)
    finally:
        # Always release the session, even if scraping failed.
        await session.close()
if __name__ == '__main__':
    start = time.time()
    print("Start at: " , start)
    # asyncio.run() replaces the deprecated
    # get_event_loop().run_until_complete() pattern and guarantees the
    # event loop is created and closed cleanly.
    asyncio.run(main())
    end = time.time()
    print("End at: " , end)
    print("Time cost:" , end-start)