Part 1: The Spider
Notes on the spider:
1. The spider is organized in an object-oriented style.
2. It collects job postings from 51job (前程无忧) by requesting the site's mobile pages.
3. The scraped records are stored in a MongoDB database.
4. The code below is commented in detail.
Code:
import time

import requests
from lxml import etree
from pymongo import MongoClient


class JobSpider:
    def __init__(self):
        # Request headers: a mobile User-Agent so 51job serves its mobile pages
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
        }
        # Start URL, requested once to read the total number of results
        self.start_url = 'https://msearch.51job.com/job_list.php?keyword=python&jobarea=000000&pageno=1'
        # URL template for the job-list pages
        self.url_list = 'https://msearch.51job.com/job_list.php?keyword=python&jobarea=000000&pageno={}'
        # Initialize the MongoDB client and target collection
        self.client = MongoClient()
        self.collection = self.client['test']['wuyou_job']

    # Build the URLs of all list pages (50 results per page, rounded up)
    def get_url_list(self, total_num):
        total_num = int(total_num)
        page = total_num // 50 + 1 if total_num % 50 != 0 else total_num // 50
        return [self.url_list.format(i) for i in range(1, page + 1)]

    # Request a page and return the decoded HTML
    def parse_url(self, url):
        # Sleep briefly so the crawl is not blocked for going too fast
        time.sleep(0.5)
        rest = requests.get(url, headers=self.headers)
        return rest.content.decode()

    # Extract data from a list page
    def get_content_list(self, str_html):
        str_html = etree.HTML(str_html)
        # Each <a> under the list container is one job posting
        job_list = str_html.xpath('//div[@id="pageContent"]/div[@class="list"]/a')
        # Follow every posting's detail URL and scrape the full record
        for i in job_list:
            detail_url = i.xpath('./@href')[0]
            self.parse_detail(detail_url)

    # Parse a detail page
    def parse_detail(self, detail_url):
        time.sleep(0.1)
        rest = requests.get(detail_url, headers=self.headers)
        str_html = etree.HTML(rest.content.decode())
        item = {}
        # Conditional expressions keep the spider robust: a field that is missing
        # from the page is stored as None instead of raising an IndexError
        item['职位'] = str_html.xpath('//p[@class="jname"]/text()')
        item['职位'] = item['职位'][0] if len(item['职位']) > 0 else None
        item['公司名称'] = str_html.xpath('//div[@class="info"]/h3/text()')
        item['公司名称'] = item['公司名称'][0] if len(item['公司名称']) > 0 else None
        item['公司地点'] = str_html.xpath('//div[@class="jbox"]/div[@class="m_bre"]/span[2]/text()')
        item['公司地点'] = item['公司地点'][0] if len(item['公司地点']) > 0 else None
        item['公司性质'] = str_html.xpath('//div[@class="info"]/div[@class="m_bre"]/span[1]/text()')
        item['公司性质'] = item['公司性质'][0] if len(item['公司性质']) > 0 else None
        item['薪资'] = str_html.xpath('//p[@class="sal"]/text()')
        item['薪资'] = item['薪资'][0] if len(item['薪资']) > 0 else None
        item['学历要求'] = str_html.xpath('//div[@class="jbox"]/div[@class="m_bre"]/span[3]/text()')
        item['学历要求'] = item['学历要求'][0] if len(item['学历要求']) > 0 else None
        item['工作经验'] = str_html.xpath('//div[@class="jbox"]/div[@class="m_bre"]/span[4]/text()')
        item['工作经验'] = item['工作经验'][0] if len(item['工作经验']) > 0 else None
        item['公司规模'] = str_html.xpath('//div[@class="info"]/div[@class="m_bre"]/span[2]/text()')
        item['公司规模'] = item['公司规模'][0] if len(item['公司规模']) > 0 else None
        item['公司类型'] = str_html.xpath('//div[@class="info"]/div[@class="m_bre"]/span[3]/text()')
        item['公司类型'] = item['公司类型'][0] if len(item['公司类型']) > 0 else None
        item['公司福利'] = str_html.xpath('//div[@class="tbox"]/span/text()')
        item['公司福利'] = '-'.join(item['公司福利'])
        item['发布时间'] = str_html.xpath('//span[@class="date"]/text()')
        item['发布时间'] = item['发布时间'][0] if len(item['发布时间']) > 0 else None
        print(item)
        self.save(item)

    # Save one scraped record to MongoDB
    def save(self, item):
        self.collection.insert_one(item)

    # Main entry point
    def run(self):
        # Request the start URL once to read the total number of search results
        rest = requests.get(self.start_url, headers=self.headers)
        str_html = etree.HTML(rest.content.decode())
        total_num = str_html.xpath('//p[@class="result"]/span/text()')[0]
        # Build the URLs of all list pages
        url_list = self.get_url_list(total_num)
        # Crawl every list page
        for i in url_list:
            str_html = self.parse_url(i)
            self.get_content_list(str_html)


if __name__ == '__main__':
    job = JobSpider()
    job.run()
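A quick way to confirm the spider actually wrote data is to query the collection from a Python shell. The snippet below is a minimal check, assuming only the same local MongoDB instance and the test.wuyou_job collection used above:

from pymongo import MongoClient

client = MongoClient()
collection = client['test']['wuyou_job']

# Number of records written by the spider
print('records:', collection.count_documents({}))
# Peek at one record to verify the field names match what analysis.py expects
print(collection.find_one())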
Part 2: Data Analysis and Visualization
Notes on the analysis and visualization:
1. The data analysis and visualization are served through the Flask framework (a minimal route sketch follows this list).
2. The project architecture diagram:
[architecture diagram]
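Since the charts are ultimately delivered by Flask, the following is only a rough sketch of how one analysis result could be exposed as a JSON endpoint for the front-end charts; the route path, function name and placeholder values are assumptions for illustration, not taken from the original project:

from flask import Flask, jsonify

app = Flask(__name__)

# Hypothetical endpoint serving the "experience vs. salary" result to the front end
@app.route('/api/experience_salary')
def experience_salary_api():
    # Placeholder rows; in the real project this would come from analysis.py / the database
    data = [['无需经验', 0.5], ['1年经验', 0.8], ['3-4年经验', 1.2]]
    return jsonify(data)

if __name__ == '__main__':
    app.run(debug=True)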
Code:
- Data analysis code (analysis.py)
import re

import numpy as np
import pandas as pd
import pymysql
from pymongo import MongoClient


# Salary normalization: convert every salary string into a number of
# thousands (千) per month, handling each format separately
def salary_process(df):
    # '...元/天' (yuan per day): convert to thousands per month (30 days)
    df['薪资'] = df['薪资'].apply(
        lambda x: str(round(float(re.findall(r'(.*)元', x)[0]) / 1000 * 30, 1)) if x.endswith('元/天') else x)
    # 'a-b千/月' (thousands per month): take the midpoint of the range
    df['薪资'] = df['薪资'].apply(lambda x: str(
        round((float(re.findall(r'(.*)千', x)[0].split('-')[0]) + float(re.findall(r'(.*)千', x)[0].split('-')[1])) / 2,
              1)) if x.endswith('千/月') else x)
    # 'a-b万/月' (ten-thousands per month): midpoint of the range, converted to thousands
    df['薪资'] = df['薪资'].apply(lambda x: str(round(
        (float(re.findall(r'(.*)万', x)[0].split('-')[0]) + float(re.findall(r'(.*)万', x)[0].split('-')[1])) / 2 * 10,
        1)) if len(re.findall(r'万', x)) > 0 and len(re.findall(r'-', x)) > 0 else x)
    # 'n千以下/月' (below n thousand per month): keep the stated bound
    df['薪资'] = df['薪资'].apply(lambda x: re.findall(r'(.*)千以下', x)[0] if x.endswith('千以下/月') else x)
    # Drop every row whose salary still does not look like a numeric value
    df['薪资'] = df['薪资'].apply(lambda x: x if len(re.findall(r'\d\.\d', x)) > 0 else np.nan)
    df.dropna(subset=['薪资'], inplace=True)
    # Convert the salary column to float
    df['薪资'] = df['薪资'].astype(float)
    df.reset_index(drop=True, inplace=True)
    return df
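A small usage sketch of salary_process; the three salary strings below are made-up inputs in the formats handled above, used only to show the normalization to thousands per month:

import pandas as pd

# Made-up sample rows (hypothetical values, one per handled format)
sample = pd.DataFrame({'薪资': ['1-1.5万/月', '6-8千/月', '150元/天']})
print(salary_process(sample)['薪资'].tolist())
# Expected output: [12.5, 7.0, 4.5] (thousands per month)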
# Relationship between work experience and salary
def experience_salary(df):
    # Average salary per experience level
    grouped = df.groupby('工作经验')['薪资'].mean().reset_index()
    # Keep only the rows describing an experience level or current students
    grouped = grouped[(grouped['工作经验'].str.contains('经验')) | (grouped['工作经验'].str.contains('在校生'))]
    # Convert to 万 (ten thousands) and keep one decimal place
    grouped['薪资'] = grouped['薪资'].apply(lambda x: round(x / 10, 1))
    # Nested-list format so the rows can be bulk-inserted into the database
    data = [[i['工作经验'], i['薪资']] for i in grouped.to_dict(orient='records')]
    print(data)
    return data


# Relationship between education level and salary
def education_salary(df):
    # Average salary per education level
    grouped = df.groupby('学历要求')['薪资'].mean().reset_index()
    grouped['薪资'] = grouped['薪资'].apply(lambda x: round(x / 10, 1))
    # Drop rows whose "education" field actually holds an experience value
    grouped = grouped[grouped['学历要求'].str.contains('经验') == False]
    # Nested-list format so the rows can be bulk-inserted into the database
    data = [[i['学历要求'], i['薪资']] for i in grouped.to_dict(orient='records')]
    return data
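Both functions return nested lists precisely so the rows can be written to MySQL in a single call. Below is a minimal bulk-insert sketch with pymysql; the connection settings, database name, table and column names are assumptions for illustration, not taken from the original project:

import pymysql

def save_to_mysql(data, table):
    # `data` is the nested list returned by experience_salary() / education_salary()
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           database='job_analysis', charset='utf8mb4')  # hypothetical settings
    try:
        with conn.cursor() as cursor:
            # One executemany call inserts the whole result set
            sql = 'INSERT INTO {} (name, value) VALUES (%s, %s)'.format(table)  # hypothetical table layout
            cursor.executemany(sql, data)
        conn.commit()
    finally:
        conn.close()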
# Geographic distribution of Python job demand by city
def city_need(df):
    # Longitude and latitude for each city
    geoCoordMap = {
'海门': [121.15, 31.89],
'鄂尔多斯': [109.781327, 39.608266],
'招远': [120.38, 37.35],
'舟山': [122.207216, 29.985295],
'齐齐哈尔': [123.97, 47.33],
'盐城': [120.13, 33.38],
'赤峰': [118.87, 42.28],
'青岛': [120.33, 36.07],
'乳山': [121.52, 36.89],
'金昌': [102.188043, 38.520089],
'泉州': [118.58, 24.93],
'莱西': [120.53, 36.86],
'日照': [119.46, 35.42],
'胶南': [119.97, 35.88],
'南通': [121.05, 32.08],
'拉萨': [91.11, 29.97],
'云浮': [112.02, 22.93],
'梅州': [116.1, 24.55],
'文登': [122.05, 37.2],
'上海': [121.48, 31.22],
'攀枝花': [101.718637, 26.582347],
'威海': [122.1, 37.5],
'承德': [117.93, 40.97],
'厦门': [118.1, 24.46],
'汕尾': [115.375279, 22.786211],
'潮州': [116.63, 23.68],
'丹东': [124.37, 40.13],
'太仓': [121.1, 31.45],
'曲靖': [103.79, 25.51],
'烟台': [121.39, 37.52],
'福州': [119.3, 26.08],
'瓦房店': [121.979603, 39.627114],
'即墨': [120.45, 36.38],
'抚顺': [123.97, 41.97],
'玉溪': [102.52, 24.35],
'张家口': [114.87, 40.82],
'阳泉': [113.57, 37.85],
'莱州': [119.942327, 37.177017],
'湖州': [120.1, 30.86],
'汕头': [116.69, 23.39],
'昆山': [120.95, 31.39],
'宁波': [121.56, 29.86],
'湛江': [110.359377, 21.270708],
'揭阳': [116.35, 23.55],
'荣成': [122.41, 37.16],
'连云港': [119.16, 34.59],
'葫芦岛': [120.836932, 40.711052],
'常熟': [120.74, 31.64],
'东莞': [113.75, 23.04],
'河源': [114.68, 23.73],
'淮安': [1