Preface
Because our internal platform's spark-sql is inconvenient for tuning and configuring resources, in some cases we still fall back to Scala DataFrame operations to work with data. This post records a few examples covering aggregation, count distinct over multiple fields, and renaming columns for display.
Data preparation
Jack2 22 20200405
Jack2 21 20200401
Jack2 21 20200401
Kate 22 20200406
Mi2ng 20 20200406
Code example
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

def main(args: Array[String]): Unit = {
  val sc = SparkSession.builder().master("local[4]").getOrCreate()
  val rdd = sc.sparkContext.textFile("src/main/resources/2020.txt")
  // Schema: name and age stay strings, p_day (partition day) is an integer
  val schemaFields = "name,age,p_day".split(",")
  val schema = StructType(List(
    StructField(schemaFields(0), StringType, nullable = true),
    StructField(schemaFields(1), StringType, nullable = true),
    StructField(schemaFields(2), IntegerType, nullable = true)
  ))
  val rowRDD1 = rdd.map(_.split(" ")).map(x => Row(x(0), x(1), x(2).toInt))
  val df = sc.createDataFrame(rowRDD1, schema)
  // Target column names, in display order
  val name_list = "ds,name_,age_uv,age_day_uv,cn,22_age_uv".split(",")
  // Aggregation: the same kind of metric expressed in several ways
  df.cube("name").agg(
    max("p_day").as(name_list(0))                                        // max over the group
    , countDistinct("age").as(name_list(2))                              // distinct count of one column
    , expr("count(distinct age, p_day)").as(name_list(3))                // distinct count over two columns
    , expr("'中文'").as(name_list(4))                                     // constant column
    , countDistinct(expr("if(age == '22', age, null)")).as(name_list(5)) // conditional distinct count
  )
    // Rename the grouping column produced by cube
    .withColumnRenamed("name", "name_")
    // The list controls both which columns appear and their order
    .selectExpr(name_list: _*)
    .show()
}
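For comparison, the same aggregation can be expressed as a single Spark SQL statement over a temp view, which is close to what the platform's spark-sql would run. This is a sketch continuing from the df above; the view name t2020 is made up for illustration:

df.createOrReplaceTempView("t2020")
sc.sql(
  """
    |SELECT max(p_day)                                 AS ds,
    |       name                                       AS name_,
    |       count(DISTINCT age)                        AS age_uv,
    |       count(DISTINCT age, p_day)                 AS age_day_uv,
    |       '中文'                                      AS cn,
    |       count(DISTINCT if(age == '22', age, null)) AS `22_age_uv`
    |FROM t2020
    |GROUP BY name WITH CUBE
    |""".stripMargin
).show()

The backticks around `22_age_uv` quote an alias that starts with a digit; the DataFrame version above happens to work without them, but quoting is the safer habit in raw SQL.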
Result
+--------+-----+------+----------+---+---------+
| ds|name_|age_uv|age_day_uv| cn|22_age_uv|
+--------+-----+------+----------+---+---------+
|20200406| null| 3| 4| 中文| 1|
|20200406| Kate| 1| 1| 中文| 1|
|20200406|Mi2ng| 1| 1| 中文| 0|
|20200405|Jack2| 2| 2| 中文| 1|
+--------+-----+------+----------+---+---------+
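The null under name_ is not missing data: cube("name") computes every grouping combination, so that row is the grand total across all names (a plain groupBy("name") would omit it). If the total row should show a label instead of null, one option is to overwrite the column before display. A minimal sketch, not part of the original example:

df.cube("name")
  .agg(countDistinct("age").as("age_uv"))
  // cube's grand-total row has name = null; label it explicitly
  .withColumn("name", coalesce(col("name"), lit("ALL")))
  .show()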