数据源 people.csv:
name;age;job
Jorge;30;Developer
Bob;32;Developer
Ani;11;Developer
Lily;11;Manager
Put;11;Developer
Alice;9;Manager
Alice;9;Manager
Alice;9;Manager
Alice;9;Manager
Alice;;Manager
Alice;9;
zhangsan;;
;21;
# coding:utf8
import time
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType
import pandas as pd
from pyspark.sql import functions as F
if __name__ == '__main__':
# Script entry point: create a SparkSession and load the sample CSV into a DataFrame.
# NOTE(review): the indentation of this guard's body appears to have been lost in
# this copy; the statements below are presumably meant to be nested under it — confirm.
#
# 0. Build the SparkSession, the entry-point object for the execution environment:
# application name "test", master "local[*]", and spark.sql.shuffle.partitions set to 2.
spark = SparkSession.builder.\
appName("test").\
master("local[*]").\
config("spark.sql.shuffle.partitions", 2).\
getOrCreate()
# Keep a handle to the SparkContext that backs the session.
sc = spark.sparkContext
# Read the CSV through the DataFrame reader with ';' as the field separator and the
# header option enabled (the first line presumably supplies the column names).
# No schema or inferSchema option is passed here, and the input path is relative.
df=spark.read.format("csv").option("sep",";").option("header",True).load("../../data/input/sql/people.csv")
# 数据清洗: 数据去重
# dropDuplicates 是Dat