R - 爬猎聘网职位酬薪

library(rvest)
library(magrittr)
library(dplyr)
library(tidyr)


#' Scrape the job postings listed on a single search-result page.
#'
#' Reads the page at `url`, takes the `title` attribute of every node matching
#' the CSS selector `.condition`, and splits it on "_" into four fields:
#' salary, city, education and experience.
#'
#' @param url URL of one search-result page, e.g. one built by
#'   `get_job_on_all_page()`.
#' @return A data frame with the columns `salary` (numeric: the mean of the
#'   numbers in the advertised range once "万" has been removed), `city`,
#'   `education` and `experience`. Postings whose salary is "面议", whose city
#'   is "不限", or whose city contains "省" are dropped. A posting that lists
#'   several cities yields one row per city. A page without usable postings
#'   yields a zero-row data frame with the same four columns.
get_job_on_page <- function(url) {
  col_names <- c("salary", "city", "education", "experience")

  ## 1. Fetch the page and pull the raw fields via the CSS selector
  jobs_webpage <- read_html(url)
  fields <- jobs_webpage %>%
    html_nodes(css = ".condition") %>%
    html_attr("title") %>%
    strsplit("_")

  # Keep only complete records. A missing `title` attribute or an unexpected
  # number of fields would otherwise make the conversion to a data frame fail.
  fields <- fields[lengths(fields) == length(col_names)]
  if (length(fields) == 0L) {
    return(data.frame(
      salary = numeric(0),
      city = character(0),
      education = character(0),
      experience = character(0),
      stringsAsFactors = FALSE
    ))
  }

  joblists <- as.data.frame(do.call(rbind, fields), stringsAsFactors = FALSE)
  names(joblists) <- col_names

  ## 2. Filter out postings without a concrete salary or city
  joblists <- joblists %>%
    filter(salary != "面议", city != "不限", !grepl("省", city))

  # "a-b万" -> mean(a, b). vapply() always returns a numeric vector, so the
  # column survives even when the filter above removed every row.
  joblists$salary <- sub("万", "", joblists$salary) %>%
    strsplit("-") %>%
    vapply(function(x) mean(as.numeric(x)), numeric(1))

  # Keep only the part of the city before the first "-". This is done on the
  # column itself rather than with apply() over rows, because apply() turns
  # the data frame into a character matrix and would convert the numeric
  # salary back into padded strings.
  joblists$city <- sub("-.*", "", joblists$city)

  ## 3. Split postings that list several cities into one row per city
  separate_rows(joblists, city, convert = TRUE)
}

#' Scrape several consecutive search-result pages for one keyword.
#'
#' @param keyword Search keyword. It is percent-encoded before being inserted
#'   into the `key=` query parameter (an already-encoded keyword is left
#'   untouched), so keywords containing spaces, "&" or non-ASCII characters
#'   produce a valid URL.
#' @param maxpage Number of pages to fetch. Pages are requested with the
#'   `curPage` values 0, 1, ..., `maxpage - 1`.
#' @return The results of `get_job_on_page()` for every page, row-bound into
#'   one data frame; an empty `data.frame()` when `maxpage` is 0.
get_job_on_all_page <- function(keyword, maxpage) {
  stopifnot(
    is.numeric(maxpage),
    length(maxpage) == 1L,
    !is.na(maxpage),
    maxpage >= 0
  )

  # The site is queried directly; apart from the keyword and the trailing
  # page number, the query string is fixed.
  base_url <- paste0(
    "https://www.liepin.com/zhaopin/?init=-1",
    "&headckid=c88035ff1557e3f8",
    "&fromSearchBtn=2",
    "&ckid=c88035ff1557e3f8",
    "&degradeFlag=0",
    "&sfrom=click-pc_homepage-centre_searchbox-search_new",
    "&key=", utils::URLencode(as.character(keyword), reserved = TRUE),
    "&siTag=xKLlO2y_xees_Q4GxkmxTA%7EfA9rXquZc5IkJpXC-Ycixw",
    "&d_sfrom=search_fp",
    "&d_ckId=de8e60921cc286bb43b240c184600370",
    "&d_curPage=1",
    "&d_pageSize=40",
    "&d_headId=de8e60921cc286bb43b240c184600370",
    "&curPage="
  )

  # seq_len() is empty for maxpage == 0, whereas 0:(maxpage - 1) would count
  # down through the page numbers 0 and -1.
  page_ids <- seq_len(maxpage) - 1L
  if (length(page_ids) == 0L) {
    return(data.frame())
  }

  # Collect the pages in a preallocated list and bind once at the end instead
  # of re-copying the growing data frame on every iteration.
  pages <- vector("list", length(page_ids))
  for (k in seq_along(page_ids)) {
    i <- page_ids[k]
    print(i) # progress indicator
    pages[[k]] <- get_job_on_page(paste0(base_url, i))
  }
  do.call(rbind, pages)
}

# Scrape 99 result pages for the keyword "CRA" and save the table as a
# tab-separated text file with a header row, without row names or quoting.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
CRA_jobs <- get_job_on_all_page("CRA", 99)
write.table(
  CRA_jobs,
  "CRA_jobs.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

模仿:https://ask.hellobi.com/blog/R_shequ/11523

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值