Scenario 1:
Create a table in a Databricks workspace using Python and apply a transformation:
from io import StringIO
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Inline CSV sample data as a raw string (stands in for an uploaded file).
csv_data = """EMPLOYEE_ID,FIRST_NAME,LAST_NAME,SALARY
101,John,Doe,50000
102,Jane,Smith,60000
103,Ravi,Kumar,55000"""

# Parse the CSV text into a pandas DataFrame.
pdf = pd.read_csv(StringIO(csv_data))

# Get (or create) the Spark session; on Databricks this returns the
# cluster's existing session.
spark = SparkSession.builder.getOrCreate()

# Convert the pandas DataFrame to a Spark DataFrame.
df = spark.createDataFrame(pdf)

# Transform: add a constant Phone_Number column to every row.
df_transformed = df.withColumn("Phone_Number", lit("9999999999"))
df_transformed.show()
Scenario 2:
Save the data to Databricks local storage (DBFS), apply a transformation, and load the
transformed data back locally:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Get (or create) the Spark session.
spark = SparkSession.builder.getOrCreate()

# Temporary DBFS paths for input and output.
# NOTE(review): the original input file name was lost in extraction —
# "employees.csv" is assumed from the output path; confirm the real name.
input_path = "dbfs:/tmp/employees.csv"
output_path = "dbfs:/tmp/employees_transformed"

# Step 1: Read the CSV from DBFS, treating the first row as a header.
df = spark.read.option("header", True).csv(input_path)

# Step 2: Transform — add a constant Phone_Number column.
df_transformed = df.withColumn("Phone_Number", lit("9999999999"))

# Step 3: Write the transformed data back to the DBFS tmp directory,
# replacing any previous run's output.
df_transformed.write.mode("overwrite").option("header", True).csv(output_path)
print("Transformed file saved to:", output_path)
Scenario 3:
Extract data from AWS S3, apply a transformation in the Databricks workspace,
and load the result back to AWS S3:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Step 1: AWS credentials.
# SECURITY: the original version hardcoded a live-looking access key and
# secret key in this file. Never commit credentials — that pair must be
# considered leaked and rotated in the AWS console. Read them from the
# environment instead (or better, use a Databricks secret scope).
access_key = os.environ["AWS_ACCESS_KEY_ID"]
secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]

# Step 2: Define S3 input/output paths.
# NOTE(review): the original input object name was lost in extraction —
# "customers.csv" is assumed from the output bucket name; confirm it.
input_path = "s3a://3marchtest/customers.csv"
output_path = "s3a://3marchtestoutput/customers_transformed"

# Step 3: Set up the Spark session and point the Hadoop S3A connector
# at AWS with the credentials above.
spark = SparkSession.builder.appName("S3DatabricksDemo").getOrCreate()
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", access_key)
hadoop_conf.set("fs.s3a.secret.key", secret_key)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.path.style.access", "true")

# Step 4: Read the CSV from S3 with a header row and inferred column types.
df = spark.read.option("header", True).option("inferSchema", True).csv(input_path)

# Step 5: Add a constant Phone_Number column.
df_transformed = df.withColumn("Phone_Number", lit("9999999999"))

# Step 6: Write the transformed data back to S3, overwriting prior output.
df_transformed.write.mode("overwrite").option("header", True).csv(output_path)
print(" Data written to:", output_path)