from pyspark.sql import SparkSession

def create_session():
    # Build (or reuse) a SparkSession for this application.
    spk = SparkSession.builder \
        .appName("Corona_cases_statewise.com") \
        .getOrCreate()
    return spk

def create_RDD(sc_obj, data):
    # Distribute the local Python list as an RDD via the supplied SparkContext.
    rdd = sc_obj.parallelize(data)
    return rdd

if __name__ == "__main__":
    # Each tuple: (State, Cases, Recovered, Deaths).
    input_data = [("Uttar Pradesh", 122000, 89600, 12238),
                  ("Maharashtra", 454000, 380000, 67985),
                  ("Tamil Nadu", 115000, 102000, 13933),
                  ("Karnataka", 147000, 111000, 15306),
                  ("Kerala", 153000, 124000, 5259)]

    spark = create_session()
    sc = spark.sparkContext
    rd_df = create_RDD(sc, input_data)
    schema_lst = ["State", "Cases", "Recovered", "Deaths"]

    # Convert the RDD into a DataFrame with named columns.
    df = spark.createDataFrame(rd_df, schema_lst)
    df.printSchema()
    df.show()

    print("Retrieved Data is:-")
    # Collect the rows back to the driver and print the first three.
    for row in df.collect()[0:3]:
        print(row["State"], ",", str(row["Cases"]), ",",
              str(row["Recovered"]), ",", str(row["Deaths"]))
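
    # The explicit RDD step above is optional; as a minimal alternative
    # sketch, spark.createDataFrame also accepts the Python list of tuples
    # directly (df_direct is an illustrative name, not from the original):
    df_direct = spark.createDataFrame(input_data, schema_lst)
    df_direct.show()

    # Release cluster resources when finished.
    spark.stop()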