SCD Type 2 in Databricks Azure
IN THE SPARK FRAMEWORK
Pandas
Pandas is an open-source Python library. Pandas DataFrames look similar to tables and can be used to perform SQL-style data manipulation and handling.
Since Spark supports multiple programming languages, native Python code can be used within the Spark framework.
The code logic below may require changes to the Spark configuration if the data volume is large.
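As a minimal sketch of that interop (not taken from the original job), the snippet below reads the history table, converts it to a pandas DataFrame, and notes the session settings that typically need tuning for larger volumes. The path reuses the one from WRITE_TO_HIVE further down; the memory values are illustrative placeholders.
# Minimal pandas <-> Spark interop sketch; values here are illustrative, not from the original job
# Arrow-based conversion speeds up toPandas()/createDataFrame() (Spark 3.x property name shown)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
HIST_FULL_SDF = spark.read.parquet('hdfs:/data/hadoop/Hist_DB/Hist_Table/')
HIST_FULL_DF = HIST_FULL_SDF.toPandas()   # collects the full table onto the driver
# Because toPandas() materializes everything on the driver, size these at cluster launch:
#   spark.driver.memory          e.g. 16g   (placeholder)
#   spark.driver.maxResultSize   e.g. 8g    (placeholder)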
Code Highlights
Split the inactive records from the history table into a separate Dataframe for the overwrite at the end.
Identify the business key columns for new insert records using set operations on the defined columns (see the sketch after this list).
Identify the Type2 columns for update records by merging Dataframes.
Populate the required fields for the insert records.
Generate a unique ID, similar to a surrogate key generator.
Merge all the Dataframes into a single one to overwrite at the end with the values updated as per the delta table.
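A minimal sketch of the second and third highlights, assuming the delta arrives as BUS_UNT_DEL_DF and the active history as HIST_DF (names used later in the code); the merge-based comparison itself is not shown in the excerpt, so the intermediate CMPR_DF is an illustrative name.
# Hypothetical sketch: new business keys found with set operations
new_BU = list(set(BUS_UNT_DEL_DF.bus_unt_nm) - set(HIST_DF.bus_unt_nm))
# Hypothetical sketch: existing keys whose Type2 column (bus_unt_desc) changed,
# found by merging the delta against the history on the business key
CMPR_DF = BUS_UNT_DEL_DF.merge(HIST_DF[['bus_unt_nm', 'bus_unt_desc']],
                               on='bus_unt_nm', suffixes=('', '_hist'))
CHNGD_BU_DF = CMPR_DF.loc[CMPR_DF.bus_unt_desc != CMPR_DF.bus_unt_desc_hist,
                          ['bus_unt_nm', 'bus_unt_desc']]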
import datetime
import pandas as pd
from decimal import *
##########################################
# Use this DF containing the active records for processing; during the overwrite, merge the history data with HIST_INACTV_DF
HIST_DF = HIST_FULL_DF.loc[MASK, :]
HIST_INACTV_DF = HIST_FULL_DF.loc[~MASK, :]
###########################################################################
###### Functions ######
###########################################################################
def POPULATE_FLDS_INSERT(DF, ID):
###########################################################################
# THIS function populates all the fields for inserting new records into the Hist_Table hive table
# INPUT   -- 1 Dataframe and BU_ID. 1) DF with the new BU NAME and BU DESC 2) Value of the last BU_ID generated
# RETURNS -- 1 Dataframe. DF with all fields populated for insert
###########################################################################
    global last_ID                        # keep the last generated ID visible to the caller
    BU_ID = [(ID + i + 1) for i in range(len(DF))]
    last_ID = BU_ID[-1]
    DF = DF.assign(bus_unt_id=BU_ID, row_efctv_to_tmsp='2200-04-11 18:47:16',
                   crtd_by_nm='Type2Job.py', updtd_by_nm='Type2Job.py', is_curr_row_ind='Y')
    x = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
    DF = DF.assign(crtd_tmsp=x, updtd_tmsp=x, row_efctv_from_tmsp=x)
    DF = DF[['bus_unt_id', 'bus_unt_nm', 'bus_unt_desc', 'crtd_by_nm', 'crtd_tmsp', 'updtd_by_nm',
             'updtd_tmsp', 'row_efctv_from_tmsp', 'row_efctv_to_tmsp', 'is_curr_row_ind']]
    return DF
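As a usage illustration (the sample rows are made up), the function is called with the new-key rows and the last surrogate key, and returns fully populated insert rows:
# Hypothetical usage of POPULATE_FLDS_INSERT; the sample data is illustrative
NEW_BU_DF = pd.DataFrame({'bus_unt_nm': ['FINANCE'], 'bus_unt_desc': ['Finance business unit']})
DF_HIVE_INSRT = POPULATE_FLDS_INSERT(NEW_BU_DF, last_ID)   # IDs continue from last_ID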
#######################################################################################
def EXPIRE_AND_INSERT_UPDT(HIS_DF, DEL_DF):   # function name assumed; the original def line is not shown in the source
#######################################################################################
# THIS function identifies and expires records that need an update, and subsequently creates new records with the updates
# INPUT   -- 2 Dataframes. 1) HIS_DF of the History table 2) DEL_DF of the Delta table
# RETURNS -- 1 Dataframe. DF with the old records that needed an update expired, and new rows inserted with the updates
#######################################################################################
    # Get the required columns into a DF for validation
    HIS_DF_TRIM = HIS_DF[['bus_unt_nm', 'bus_unt_desc']]
    # Type check: use the merge function to identify updated records (Type2 column - bus_unt_desc)
    global CHNGD_BU_DF
    CHNGD_BU_DF = pd.DataFrame()
    # identifies records with a mismatch in descr for a given BU name
    # When no updated records are found, return the HIS_DF that already has the latest BUS_UNT data
    if len(CHNGD_BU_DF) == 0:
        return HIS_DF
    else:
        CHNGD_BU = CHNGD_BU_DF.bus_unt_nm.tolist()
        EXPRD_BU_ID = HIS_DF['bus_unt_id'].loc[HIS_DF.bus_unt_nm.isin(CHNGD_BU)].tolist()
        HIS_DF.loc[HIS_DF.bus_unt_nm.isin(CHNGD_BU), 'row_efctv_to_tmsp'] = cur_tmsp
        # After the above call, INSRT_UPDTBU_DF will have new IDs generated for the Type2 update rows
        # create a DF with the old and new BU_ID
        global OLD_NEW_ID
        OLD_NEW_ID = pd.DataFrame()
        OLD_NEW_ID = INSRT_UPDTBU_DF[['bus_unt_id']]
        OLD_NEW_ID['EXPRD_BU_ID'] = EXPRD_BU_ID
        OLD_NEW_ID = OLD_NEW_ID.rename(columns={'bus_unt_id': 'new_bus_unt_id',
                                                'EXPRD_BU_ID': 'bus_unt_id'})
        HIS_DF = pd.concat([HIS_DF, INSRT_UPDTBU_DF], ignore_index=True)
        return HIS_DF
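The excerpt omits the lines inside this function that populate CHNGD_BU_DF, cur_tmsp, and INSRT_UPDTBU_DF; a plausible sketch, assuming the same helpers and naming, would be:
# Hypothetical reconstruction of the elided steps inside the update function
cur_tmsp = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())   # expiry timestamp for old rows
# New versions of the changed rows, with fresh surrogate keys from the generator above
INSRT_UPDTBU_DF = POPULATE_FLDS_INSERT(CHNGD_BU_DF, last_ID)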
def WRITE_TO_HIVE(DF, TB_NM):
###################################################################################
# THIS function converts a PANDAS DF to a SPARK DF and overwrites it to the HIVE table
# INPUT   -- 1) DF with the data to be loaded to the HIVE table 2) HDFS path name as a string
# RETURNS -- Not Applicable
###################################################################################
    DF = DF.astype('str')
    DF.columns = map(str.upper, DF.columns)
    # convert the pandas DF to a Spark DF
    DQ_SDF = spark.createDataFrame(DF)
    DQ_SDF.write.format("parquet").mode("overwrite").save(TB_NM)  # e.g. 'hdfs:/data/hadoop/Hist_DB/Hist_Table/'
    return
###########################################################################
###### Validation for new insert, update SCD records ######
###########################################################################
# Get the last ID from the HIST table for surrogate key generation
DQ_BUS_ID = HIST_FULL_DF.bus_unt_id.tolist()
DQ_BUS_ID.sort()
global last_ID
last_ID = DQ_BUS_ID[-1]
# Create a DF with the new BU NAME and DESCR from the CSV
BU_INSRT = BUS_UNT_DEL_DF.loc[BUS_UNT_DEL_DF.bus_unt_nm.isin(new_BU)]
# Append the new rows to the HIST_DF dataframe for the hive table insert and reset the index
HIST_DF = pd.concat([HIST_DF, DF_HIVE_INSRT], ignore_index=True)
HIVE_LOAD_DF = pd.concat([HIVE_LOAD_DF, HIST_INACTV_DF], ignore_index=True)
# The WRITE call is a MUST here, as this run has new records to be inserted into the HIVE table
print('************New Bus Unt Identified for this Run**************')
WRITE_TO_HIVE(HIVE_LOAD_DF, VAR_TB_NM)
if len(CHNGD_BU_DF) == 0: