import json
import requests
import urllib.request
from urllib import parse
import re
def getTotalPage(job,city):
    """Return the total page count Lagou reports for a job search.

    Downloads the search results page for ``job`` filtered by ``city`` and
    reads the number inside the ``totalNum`` span of the returned HTML.
    The value is also printed to stdout.

    Args:
        job: Search keyword; percent-encoded into the URL path.
        city: City name; sent as the ``city`` query parameter.

    Returns:
        The number found in the ``totalNum`` span, as an int.

    Raises:
        IndexError: If the response contains no ``totalNum`` span.
        urllib.error.URLError: If the HTTP request fails or times out.
    """
    # Percent-encode the keyword for the path and the city for the query.
    city_query = urllib.parse.urlencode({'city': city})
    job_path = urllib.parse.quote(job)
    # Request www.lagou.com directly (the host named in the URL path).
    url = 'https://www.lagou.com/jobs/list_' + job_path + '?' + city_query
    # Browser-like User-Agent for the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    # Close the response deterministically and never block forever.
    with urllib.request.urlopen(req, timeout=10) as resp:
        html = resp.read().decode('utf-8')
    # Raw string so that \d is a regex digit class, not a string escape.
    match = re.search(r'totalNum">(\d+)</span>', html)
    if match is None:
        # Same exception type as indexing an empty findall() result, but
        # with a message that says what went wrong.
        raise IndexError('no totalNum span found in the response for ' + url)
    totalPage = int(match.group(1))
    print(totalPage)
    return totalPage
def getJobList(page,job,city=None):
    """Fetch one page of job postings from Lagou's position search endpoint.

    Posts the search form to ``positionAjax.json`` and returns the value at
    ``content -> positionResult -> result`` of the JSON response.

    Args:
        page: Page number to request; sent as the ``pn`` form field.
        job: Search keyword; sent as the ``kd`` form field and used to build
            the Referer header.
        city: Optional city filter. When ``None`` the ``city`` query
            parameter is omitted instead of being sent as the text "None".

    Returns:
        ``data['content']['positionResult']['result']`` from the response
        (presumably a list of posting dicts -- confirm against the API).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        KeyError: If the JSON body lacks the expected nesting.
    """
    # Only include the city filter when one was actually given.
    params = {}
    if city is not None:
        params['city'] = city
    params['needAddtionalResult'] = 'false'
    # Request www.lagou.com directly, matching the Host header sent below.
    url = 'https://www.lagou.com/jobs/positionAjax.json?' + urllib.parse.urlencode(params)

    # Referer mirrors the search page for this same job/city rather than a
    # fixed python/Guangzhou page.
    referer = 'https://www.lagou.com/jobs/list_' + urllib.parse.quote(job)
    if city is not None:
        referer += '?' + urllib.parse.urlencode({'city': city})

    # NOTE(review): requests keys proxies by URL scheme, so this "http"
    # entry should not apply to the https:// URL above -- confirm whether a
    # proxy is still wanted.
    proxies = {'http': '111.231.115.150:8888'}
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "zh,en;q=0.9,zh-CN;q=0.8",
        "Connection": "keep-alive",
        # No Content-Length here: requests derives it from the form body.
        "Referer": referer,
        # NOTE(review): hard-coded browser session cookie; it is probably
        # stale and should be refreshed or supplied by the caller.
        "Cookie": "user_trace_token=20180328163118-62e60eb0-3262-11e8-a293-525400f775ce; LGUID=20180328163118-62e6134d-3262-11e8-a293-525400f775ce; _ga=GA1.2.1052090742.1522225873; _gid=GA1.2.1773365958.1530503916; index_location_city=%E5%B9%BF%E5%B7%9E; JSESSIONID=ABAAABAAAIAACBI000BF87EA63784CD45C92FCAC898A635; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1530503916,1530517219; LGSID=20180702154019-2b58d020-7dcb-11e8-98e2-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2F2.zoppoz.workers.dev%3A443%2Fhttps%2Fwww.baidu.com%2Flink%3Furl%3Dx-DqSEgfwYSPksRlStkAdjxOGGMcsWtcJBYHyvYhFy3%26wd%3D%26eqid%3Db0d02f82000311aa000000065b39d6dd; PRE_LAND=https%3A%2F%2F2.zoppoz.workers.dev%3A443%2Fhttps%2Fwww.lagou.com%2F; TG-TRACK-CODE=search_code; SEARCH_ID=0705a0e9f9744ba680317f94da58d2fd; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1530517370; LGRID=20180702154250-8549fa30-7dcb-11e8-bcc8-525400f775ce",
        "Host": "www.lagou.com",
        "Origin": "https://www.lagou.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest",
    }
    formData = {
        'first': 'false',
        'pn': page,
        'kd': job,
    }
    # A timeout keeps a stalled connection from hanging the whole crawl.
    resp = requests.post(url, data=formData, headers=headers, proxies=proxies, timeout=10)
    # Surface HTTP errors directly instead of failing later on the body.
    resp.raise_for_status()
    data = resp.json()
    return data['content']['positionResult']['result']
if __name__ == '__main__':
    # Ask for the search keyword and the city, then print every posting
    # from every result page.
    job = input('输入职位:' + '\n')
    city = input('输入城市:' + '\n')
    totalPage = getTotalPage(job, city)
    for page in range(1, totalPage + 1):
        print(page)
        # Search with what the user typed, not a fixed keyword/city, and
        # use a loop name that does not overwrite ``job``.
        for position in getJobList(str(page), job, city):
            print(position)
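# 爬取拉勾网 (Scraping job listings from Lagou)
# 最新推荐文章于 2021-11-22 22:01:03 发布 (web-page footer: "latest recommended article published 2021-11-22 22:01:03")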