爬取拉勾网

本文介绍了一种使用 Python 爬取拉勾网招聘信息的方法:先请求职位列表页,用正则表达式提取总页数;再逐页调用职位列表的 Ajax 接口,抓取指定职位和城市的全部招聘信息。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import json
import requests
import urllib.request
from  urllib import  parse
import re

def getTotalPage(job,city,timeout=10):
    """Return the page count Lagou shows for a job/city search.

    Downloads the HTML listing page for the search and reads the number
    found in the markup fragment ``totalNum">N</span>``.

    Args:
        job: Search keyword; percent-encoded into the URL path.
        city: City name; sent as the ``city`` query parameter.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The extracted number as an ``int``. It is also printed to stdout.

    Raises:
        IndexError: If the page contains no ``totalNum`` counter. The type
            is kept from the earlier ``findall(...)[0]`` implementation so
            existing callers see the same exception.
        urllib.error.URLError: On network or HTTP failure.
    """
    # Encode the two user-supplied parts separately: the city becomes a
    # query string, the keyword becomes a path segment.
    query = urllib.parse.urlencode({'city': city})
    keyword = urllib.parse.quote(job)
    url = 'https://www.lagou.com/jobs/list_' + keyword + '?' + query
    # A browser User-Agent is sent instead of urllib's default one.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if decoding fails.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        respData = resp.read().decode('utf-8')
    # Raw string so that \d is a regex digit class, not a string escape.
    match = re.search(r'totalNum">(\d+)</span>', respData)
    if match is None:
        raise IndexError('no totalNum counter found in the page at ' + url)
    totalPage = int(match.group(1))
    print(totalPage)
    return totalPage

def getJobList(page,job,city=None,timeout=10):
    """Fetch one page of search results from Lagou's positionAjax endpoint.

    Args:
        page: Page number, sent as the ``pn`` form field.
        job: Search keyword, sent as the ``kd`` form field.
        city: Optional city filter. When ``None`` the ``city`` query
            parameter is omitted instead of being sent as ``"None"``.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The value stored at ``content.positionResult.result`` in the JSON
        reply (presumably a list of position records; the script's main
        loop iterates over it).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON reply lacks the expected keys; the message
            includes the payload received.
    """
    # Build the query from a dict so the city is only sent when supplied.
    query = {}
    if city is not None:
        query['city'] = city
    query['needAddtionalResult'] = 'false'
    url = 'https://www.lagou.com/jobs/positionAjax.json?' + urllib.parse.urlencode(query)

    # Point the Referer at the listing page for this search rather than at
    # a fixed python/Guangzhou one.
    referer = 'https://www.lagou.com/jobs/list_' + urllib.parse.quote(job)
    if city is not None:
        referer += '?' + urllib.parse.urlencode({'city': city})

    # NOTE(review): this mapping only has an 'http' entry while the request
    # URL is https, so the proxy is probably not used -- confirm intent.
    proxies={'http':'111.231.115.150:8888'}
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "zh,en;q=0.9,zh-CN;q=0.8",
        "Connection": "keep-alive",
        # Content-Length is deliberately not set: it depends on the form
        # body and is filled in by requests.
        "Referer": referer,
        # NOTE(review): captured browser session from 2018; it has likely
        # expired and should be refreshed or obtained via a session.
        "Cookie": "user_trace_token=20180328163118-62e60eb0-3262-11e8-a293-525400f775ce; LGUID=20180328163118-62e6134d-3262-11e8-a293-525400f775ce; _ga=GA1.2.1052090742.1522225873; _gid=GA1.2.1773365958.1530503916; index_location_city=%E5%B9%BF%E5%B7%9E; JSESSIONID=ABAAABAAAIAACBI000BF87EA63784CD45C92FCAC898A635; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1530503916,1530517219; LGSID=20180702154019-2b58d020-7dcb-11e8-98e2-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dx-DqSEgfwYSPksRlStkAdjxOGGMcsWtcJBYHyvYhFy3%26wd%3D%26eqid%3Db0d02f82000311aa000000065b39d6dd; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=search_code; SEARCH_ID=0705a0e9f9744ba680317f94da58d2fd; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1530517370; LGRID=20180702154250-8549fa30-7dcb-11e8-bcc8-525400f775ce",
        "Host": "www.lagou.com",
        "Origin": "https://www.lagou.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "X-Anit-Forge-Code": "0",
        "X-Anit-Forge-Token": "None",
        "X-Requested-With": "XMLHttpRequest",
    }
    formData = {
        'first': 'false',
        'pn': page,
        'kd': job
    }
    resp=requests.post(url,data=formData,headers=headers,proxies=proxies,timeout=timeout)
    # Fail on HTTP errors here instead of on a confusing JSON/key error below.
    resp.raise_for_status()
    data=resp.json()
    try:
        return data['content']['positionResult']['result']
    except KeyError as err:
        # Show what the server sent so the cause is visible to the caller.
        raise KeyError('unexpected positionAjax response: %r' % (data,)) from err


if __name__ == '__main__':
    # Ask for the search keyword and city, then print every result page.
    job=input('输入职位:'+'\n')
    city=input('输入城市:'+'\n')
    totalPage=getTotalPage(job,city)
    for page in range(1,totalPage+1):
        print(page)
        # Search with what the user typed, not a fixed keyword and city.
        jobList=getJobList(str(page),job,city)
        # A separate loop name keeps `job` intact for the next page's request.
        for position in jobList:
            print(position)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值