Computing the KS for Multiple Model Labels in One Pass with SQL
- The previous article, Computing the KS for multiple labels in one groupby with pandas, used pandas to compute the KS metric for several labels of a model at once. With large datasets, however, pandas can run for a very long time or simply OOM. In that case SQL or Spark can do the computation with cluster resources instead; this article walks through the KS calculation in Hive SQL.
- If you don't have a Hive environment, follow the WeChat public account (恩的漂流笔记) and reply hadoop to get a one-click Docker deployment of a Hadoop cluster, including Hive, HDFS, Spark, YARN, Presto, and more.
Computation Principle
- Same logic as the standard KS computation: at each score threshold, take the difference between TPR and FPR, then take the maximum of its absolute value.
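To make that concrete, here is a minimal Python sketch of the same cumulative-sum computation (the scores and labels below are made up for illustration, not the article's dataset). One subtlety: the SQL later in the article uses a RANGE window frame, so rows with identical pred values accumulate together, while this sketch steps one row at a time.

```python
# Minimal KS sketch: max over thresholds of |TPR - FPR|.
# Illustrative data only, not the article's dataset.

def ks_statistic(scores, labels):
    pos = sum(labels)            # total positives
    neg = len(labels) - pos      # total negatives
    acc_pos = acc_neg = 0
    best = 0.0
    # Sort by score descending, mirroring ORDER BY pred DESC in the SQL.
    for _, y in sorted(zip(scores, labels), key=lambda p: -p[0]):
        acc_pos += y             # cumulative positives -> TPR = acc_pos / pos
        acc_neg += 1 - y         # cumulative negatives -> FPR = acc_neg / neg
        best = max(best, abs(acc_pos / pos - acc_neg / neg))
    return best

print(ks_statistic([0.9, 0.8, 0.7, 0.6, 0.5, 0.4], [1, 1, 0, 1, 0, 0]))  # 2/3
```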
Loading the Data
- Load the dataset into Hive. Files can't be attached here, so reply ks数据 to the public account to get the data.
create table if not exists t1.demo_data (
    month int,
    pred  double,
    y1    double,
    y2    double,
    y3    double
)
row format delimited fields terminated by '\t'
stored as textfile;

load data inpath '/datas/data.csv' into table t1.demo_data;
Computing a Single KS
with
step1 as (
    select *,
        sum(y1)   over (partition by month order by pred desc range between unbounded preceding and current row) as acc_pos,
        sum(1-y1) over (partition by month order by pred desc range between unbounded preceding and current row) as acc_neg,
        sum(y1)   over (partition by month) as all_pos,
        sum(1-y1) over (partition by month) as all_neg
    from t1.demo_data
),
step2 as (
    select month,
        max(abs(1.000*acc_pos/all_pos - 1.000*acc_neg/all_neg)) as y1_ks
    from step1
    group by month
)
select * from step2;
-- Output:
-- month,y1_ks
-- 1,0.206674338319908
-- 2,0.13867488443759624
-- 3,0.1394064872325742
-- 4,0.10476190476190472
-- 5,0.12083333333333333
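On a small sample, the window-function logic above can be sanity-checked against the pandas approach from the previous article. The toy frame below is hypothetical, not the real data:

```python
import pandas as pd

# Hypothetical toy stand-in for t1.demo_data (two months, one label).
df = pd.DataFrame({
    "month": [1, 1, 1, 1, 2, 2, 2, 2],
    "pred":  [0.9, 0.7, 0.6, 0.3, 0.8, 0.6, 0.4, 0.2],
    "y1":    [1,   0,   1,   0,   1,   1,   0,   0],
})

def ks(g):
    g = g.sort_values("pred", ascending=False)
    acc_pos = g["y1"].cumsum()         # sum(y1) over (order by pred desc ...)
    acc_neg = (1 - g["y1"]).cumsum()   # sum(1-y1) over (order by pred desc ...)
    return (acc_pos / g["y1"].sum() - acc_neg / (1 - g["y1"]).sum()).abs().max()

ks_by_month = {m: ks(g) for m, g in df.groupby("month")}
print(ks_by_month)  # month 1 -> 0.5, month 2 -> 1.0
```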
Multiple KS Metrics at Once
with
data as (
    select a.*, t.*
    from t1.demo_data a
    lateral view explode(map('y1', y1, 'y2', y2, 'y3', y3)) t as k, y
),
step1 as (
    select *,
        sum(y)   over (partition by month, k order by pred desc range between unbounded preceding and current row) as acc_pos,
        sum(1-y) over (partition by month, k order by pred desc range between unbounded preceding and current row) as acc_neg,
        sum(y)   over (partition by month, k) as all_pos,
        sum(1-y) over (partition by month, k) as all_neg
    from data
),
step2 as (
    select month, k,
        max(abs(1.000*acc_pos/all_pos - 1.000*acc_neg/all_neg)) as ks
    from step1
    group by month, k
),
res as (
    select month,
        max(case when k = 'y1' then ks end) as y1_ks,
        max(case when k = 'y2' then ks end) as y2_ks,
        max(case when k = 'y3' then ks end) as y3_ks
    from step2
    group by month
)
select * from res;
-- Output:
-- month,y1_ks,y2_ks,y3_ks
-- 1,0.206674338319908,0.270108695652174,0.10144927536231885
-- 2,0.13867488443759624,0.08841807909604515,0.13571676501214852
-- 3,0.1394064872325742,0.12063492063492065,0.12549019607843137
-- 4,0.10476190476190472,0.19269269269269262,0.14832309872461025
-- 5,0.12083333333333333,0.09269128283212791,0.0734295415959253
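The lateral view explode(map(...)) step is essentially a wide-to-long reshape: one row per (row, label) pair, keyed by the label name. For intuition, here is the same reshape plus grouped KS in pandas, on a hypothetical toy frame (not the real dataset):

```python
import pandas as pd

# Hypothetical toy stand-in for t1.demo_data with two label columns.
df = pd.DataFrame({
    "month": [1, 1, 1, 1],
    "pred":  [0.9, 0.7, 0.6, 0.3],
    "y1":    [1, 0, 1, 0],
    "y2":    [1, 1, 0, 0],
})

# melt plays the role of lateral view explode(map('y1',y1,'y2',y2)) t as k,y
long = df.melt(id_vars=["month", "pred"], value_vars=["y1", "y2"],
               var_name="k", value_name="y")

def ks(g):
    g = g.sort_values("pred", ascending=False)
    tpr = g["y"].cumsum() / g["y"].sum()
    fpr = (1 - g["y"]).cumsum() / (1 - g["y"]).sum()
    return (tpr - fpr).abs().max()

ks_by_key = {key: ks(g) for key, g in long.groupby(["month", "k"])}
# Pivoting k back into columns mirrors the final CASE WHEN ... GROUP BY step.
print(ks_by_key)
```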
Summary
- That's how to compute the KS for multiple labels with SQL.
- A follow-up article will cover the PySpark version.