// Dynamically load a class by name, construct an instance of it, and convert the result
// to a DataFrame. When writing to the table, class attribute values must map one-to-one
// to table columns; only some fields carry computed values, the rest are filled with a
// default chosen by field type.
/**
 * Aggregates per-key metrics from `countRDDs`, instantiates the DTO class named by
 * `dtoName` via reflection (one instance per key), and converts the instances into a
 * DataFrame. DTO fields with no computed value receive a type-appropriate default
 * (0.00 for Double, 0 for Integer, "" otherwise).
 *
 * Composite keys are "\t"-separated entries; each entry is "\001"-separated as
 * (ordinal, value, typeTag) where typeTag "1" = Int, "2" = String, anything else = null.
 *
 * @param spark     active SparkSession used to build the DataFrame
 * @param countRDDs metric RDD keyed by composite string key, valued (ordinal, count)
 * @param dtoName   fully-qualified name of the Java bean DTO class to load
 * @return          DataFrame backed by the populated DTO instances
 */
def mergeRDD(spark: SparkSession, countRDDs: RDD[(String, (Int, Int))], dtoName: String): DataFrame = {
  import spark.implicits._

  // Aggregate all metrics and pack them into a HashMap:
  // key = DTO field ordinal (as a String), value = the metric value for that field.
  val rdd = countRDDs
    .groupByKey()
    .map(x => (x._1, x._2.toList))
    .map { x =>
      // Split the attributes encoded in the composite key, e.g. 1_1_1.
      val keyList = x._1.split("\t").map { entry =>
        val fields = entry.split("\001")
        fields(2) match {
          case "1"                              => (fields(1).toInt, fields(0))
          // fields(1) is already a String; the original .toString was redundant.
          case "2" if !"null".equals(fields(1)) => (fields(1), fields(0))
          case _                                => (null, fields(0))
        }
      }
      keyList ++ x._2
    }
    .map { pairs =>
      // val, not var: the HashMap itself is mutable, the reference never changes.
      val indexed = mutable.HashMap[String, Any]()
      // foreach, not map: we only want the side effect of put().
      pairs.foreach(p => indexed.put(p._2.toString, p._1))
      indexed
    }

  // NOTE: converting Scala case classes to a DataFrame was problematic here, so a
  // Java bean class (loaded by name) is used to carry the data instead.
  val classType = Class.forName(dtoName)
  val tx = rdd.map { y =>
    // Class.newInstance() is deprecated since Java 9; invoke the no-arg ctor explicitly.
    val obj = classType.getDeclaredConstructor().newInstance()
    val fields = classType.getDeclaredFields
    // Field 0 is intentionally skipped and ordinals start at "2" for field 1 —
    // presumably field 0 is populated elsewhere (e.g. the key column); verify against the DTO.
    for (i <- 1 until fields.length) {
      val field = fields(i)
      val indexKey = (i + 1).toString
      field.setAccessible(true)
      if (i == fields.length - 1) {
        // The last DTO field is the load date.
        field.set(obj, DateUtil.getSimpleDate(new Date()))
      } else if (field.getType.getName.endsWith("Double")) {
        field.set(obj, y.getOrElse(indexKey, 0.00))
      } else if (field.getType.getName.endsWith("Integer")) {
        field.set(obj, y.getOrElse(indexKey, 0))
      } else {
        field.set(obj, y.getOrElse(indexKey, ""))
      }
    }
    obj
  }

  // classType is a Java bean class, so the bean-based createDataFrame overload applies.
  spark.createDataFrame(tx, classType)
}