Merging database-style data frames
Inner Join (Default Merge)
import pandas as pd
# Two sample frames sharing an 'ID' key; df1 carries one extra row (ID 4).
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
})
df2 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Salary': [50000, 60000, 70000],
})
# Inner join (merge's default): only IDs present in both frames survive.
merged_df = df1.merge(df2, on='ID', how='inner')
print(merged_df)
Output:
nginx
Copy
ID Name Age Salary
0 1 Alice 25 50000
1 2 Bob 30 60000
2 3 Charlie 35 70000
Left Join
# Left join: every row of df1 is kept; Salary is NaN where df2 has no match.
merged_df_left = df1.merge(df2, how='left', on='ID')
print(merged_df_left)
Output:
pgsql
Copy
ID Name Age Salary
0 1 Alice 25 50000.0
1 2 Bob 30 60000.0
2 3 Charlie 35 70000.0
3 4 David 40 NaN
Right Join
# Right join: every row of df2 is kept, matched against df1 on 'ID'.
merged_df_right = df1.merge(df2, how='right', on='ID')
print(merged_df_right)
Output:
nginx
Copy
ID Name Age Salary
0 1 Alice 25 50000
1 2 Bob 30 60000
2 3 Charlie 35 70000
Outer Join
# Outer join: the union of IDs from both frames; gaps are filled with NaN.
merged_df_outer = df1.merge(df2, how='outer', on='ID')
print(merged_df_outer)
Output:
pgsql
Copy
ID Name Age Salary
0 1 Alice 25 50000.0
1 2 Bob 30 60000.0
2 3 Charlie 35 70000.0
3 4 David 40 NaN
Merging on Multiple Columns
# Both frames carry the same two key columns: 'ID' and 'Department'.
df1 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Department': ['HR', 'Finance', 'IT'],
    'Employee': ['Alice', 'Bob', 'Charlie'],
})
df2 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Department': ['HR', 'Finance', 'IT'],
    'Salary': [50000, 60000, 70000],
})
# A row matches only when BOTH 'ID' and 'Department' agree.
merged_df_multi = df1.merge(df2, on=['ID', 'Department'])
print(merged_df_multi)
Output:
nginx
Copy
ID Department Employee Salary
0 1 HR Alice 50000
1 2 Finance Bob 60000
2 3 IT Charlie 70000
Merging with Different Column Names
# The join key is called 'EmployeeID' on the left and 'ID' on the right.
df1 = pd.DataFrame({
    'EmployeeID': [1, 2, 3],
    'EmployeeName': ['Alice', 'Bob', 'Charlie'],
})
df2 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Salary': [50000, 60000, 70000],
})
# Name the key column on each side explicitly; both key columns are kept.
merged_df_diff_names = df1.merge(df2, left_on='EmployeeID', right_on='ID')
print(merged_df_diff_names)
Output:
nginx
Copy
EmployeeID EmployeeName ID Salary
0 1 Alice 1 50000
1 2 Bob 2 60000
2 3 Charlie 3 70000
Concatenating along an axis
Concatenating Vertically (Row-wise)
import pandas as pd
# Two frames with identical columns, to be stacked on top of each other.
df1 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie'],
})
df2 = pd.DataFrame({
    'ID': [4, 5, 6],
    'Name': ['David', 'Eve', 'Frank'],
})
# Row-wise concatenation; ignore_index=True renumbers the result 0..n-1.
concatenated_df = pd.concat([df1, df2], axis='index', ignore_index=True)
print(concatenated_df)
Output:
nginx
Copy
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie
3 4 David
4 5 Eve
5 6 Frank
Concatenating Horizontally (Column-wise)
# Same default index (0, 1, 2) on both frames, but disjoint columns.
df1 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie'],
})
df2 = pd.DataFrame({
    'Age': [25, 30, 35],
    'Salary': [50000, 60000, 70000],
})
# Column-wise concatenation: rows are aligned on the index labels.
concatenated_df_horizontal = pd.concat([df1, df2], axis='columns')
print(concatenated_df_horizontal)
Output:
nginx
Copy
ID Name Age Salary
0 1 Alice 25 50000
1 2 Bob 30 60000
2 3 Charlie 35 70000
Concatenating with Different Indexes
# Two frames whose index labels only partly overlap and whose columns differ.
df1 = pd.DataFrame(
    {
        'ID': [1, 2],
        'Name': ['Alice', 'Bob'],
    },
    index=[0, 1],
)
df2 = pd.DataFrame(
    {
        'Age': [25, 30],
        'Salary': [50000, 60000],
    },
    index=[1, 2],
)
# Row-wise concatenation; ignore_index=True discards the original labels
# and renumbers the four resulting rows 0..3.
concatenated_df_diff_index = pd.concat([df1, df2], axis='index', ignore_index=True)
print(concatenated_df_diff_index)
Output:
pgsql
Copy
ID Name Age Salary
0 1.0 Alice NaN NaN
1 2.0 Bob NaN NaN
2 NaN NaN 25.0 50000.0
3 NaN NaN 30.0 60000.0
Concatenating with Keys (Creating a MultiIndex)
# Label each input frame's rows with a key, producing a two-level row index
# (outer level: the key, inner level: the original index label).
concatenated_df_keys = pd.concat([df1, df2], keys=['df1', 'df2'], axis='index')
print(concatenated_df_keys)
Output:
pgsql
Copy
ID Name Age Salary
df1 0 1.0 Alice NaN NaN
1 2.0 Bob NaN NaN
df2 1 NaN NaN 25.0 50000.0
2 NaN NaN 30.0 60000.0
Concatenating with Mismatched Columns
# The two frames have no column in common but share the default index (0, 1).
df1 = pd.DataFrame({
    'ID': [1, 2],
    'Name': ['Alice', 'Bob'],
})
df2 = pd.DataFrame({
    'Age': [25, 30],
    'Salary': [50000, 60000],
})
# Column-wise concatenation places the frames side by side, aligned on index.
concatenated_df_mismatched = pd.concat([df1, df2], axis='columns')
print(concatenated_df_mismatched)
Output:
pgsql
Copy
ID Name Age Salary
0 1 Alice 25 50000
1 2 Bob 30 60000
Merging on index
Simple Merge on Index
import pandas as pd
# Both frames are labelled with the same string index: 'a', 'b', 'c'.
df1 = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
    },
    index=['a', 'b', 'c'],
)
df2 = pd.DataFrame(
    {
        'Salary': [50000, 60000, 70000],
        'Department': ['HR', 'Finance', 'IT'],
    },
    index=['a', 'b', 'c'],
)
# Use the row index of both frames as the join key instead of a column.
merged_df = df1.merge(df2, left_index=True, right_index=True)
print(merged_df)
Output:
css
Copy
Name Age Salary Department
a Alice 25 50000 HR
b Bob 30 60000 Finance
c Charlie 35 70000 IT
Merge on Index with Different Column Names
# The frames share index labels but have no column names in common.
df1 = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
    },
    index=['a', 'b', 'c'],
)
df2 = pd.DataFrame(
    {
        'Salary': [50000, 60000, 70000],
        'Department': ['HR', 'Finance', 'IT'],
    },
    index=['a', 'b', 'c'],
)
# Join purely on index labels; the columns of both frames are carried over.
merged_df_diff_columns = df1.merge(df2, left_index=True, right_index=True)
print(merged_df_diff_columns)
Output:
css
Copy
Name Age Salary Department
a Alice 25 50000 HR
b Bob 30 60000 Finance
c Charlie 35 70000 IT
Merge on Index with how Parameter
# df2 lacks the 'c' label, so the two indexes only partly overlap.
df1 = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
    },
    index=['a', 'b', 'c'],
)
df2 = pd.DataFrame(
    {
        'Salary': [50000, 60000],
        'Department': ['HR', 'Finance'],
    },
    index=['a', 'b'],
)
# Left join on the index: all of df1's labels are kept, and df2's columns
# are NaN for labels it does not contain.
merged_left = df1.merge(df2, how='left', left_index=True, right_index=True)
print(merged_left)
Output:
Copy
Name Age Salary Department
a Alice 25 50000.0 HR
b Bob 30 60000.0 Finance
c Charlie 35 NaN NaN
Merge with outer Join on Index
# Outer join on the index: the union of labels from both frames is kept.
merged_outer = df1.merge(df2, how='outer', left_index=True, right_index=True)
print(merged_outer)
Output:
Copy
Name Age Salary Department
a Alice 25 50000.0 HR
b Bob 30 60000.0 Finance
c Charlie 35 NaN NaN
Merge on Index and Column (Multi-key Merge)
# Two frames that share both their index labels and an 'Age' column.
df1 = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
    },
    index=['a', 'b', 'c'],
)
df2 = pd.DataFrame(
    {
        'Salary': [50000, 60000, 70000],
        'Department': ['HR', 'Finance', 'IT'],
        'Age': [25, 30, 35],
    },
    index=['a', 'b', 'c'],
)
# Merging on both the index and a column.
# pd.merge() rejects `on=` combined with `left_index=True, right_index=True`
# (it raises MergeError), so the index is first turned into an ordinary
# column named 'index', used as a join key together with 'Age', and then
# restored as the (unnamed) row index of the result.
merged_df = (
    pd.merge(df1.reset_index(), df2.reset_index(), on=['index', 'Age'])
    .set_index('index')
    .rename_axis(None)
)
print(merged_df)
Output:
css
Copy
Name Age Salary Department
a Alice 25 50000 HR
b Bob 30 60000 Finance
c Charlie 35 70000 IT
Reshaping and pivoting
Pivoting Data to Wide Format
import pandas as pd
# Long-format readings: one row per (Date, City) pair.
df = pd.DataFrame({
    'Date': ['2025-03-01', '2025-03-01', '2025-03-02', '2025-03-02'],
    'City': ['New York', 'Los Angeles', 'New York', 'Los Angeles'],
    'Temperature': [58, 70, 60, 72],
})
# Reshape to wide format: one row per Date, one column per City, with the
# Temperature readings as cell values.
pivoted_df = pd.pivot(df, index='Date', columns='City', values='Temperature')
print(pivoted_df)
Output:
sql
Copy
City Los Angeles New York
Date
2025-03-01 70 58
2025-03-02 72 60
Melting DataFrames with melt()
# Wide-format readings: one column per city.
df_wide = pd.DataFrame({
    'Date': ['2025-03-01', '2025-03-02'],
    'New York': [58, 60],
    'Los Angeles': [70, 72],
})
# Unpivot to long format: 'Date' is kept as the identifier, the city column
# names go into a 'City' column and their values into 'Temperature'.
melted_df = df_wide.melt(id_vars=['Date'], var_name='City', value_name='Temperature')
print(melted_df)
Output:
yaml
Copy
Date City Temperature
0 2025-03-01 New York 58
1 2025-03-02 New York 60
2 2025-03-01 Los Angeles 70
3 2025-03-02 Los Angeles 72
Stacking DataFrame
# One row per city with two numeric attributes.
df = pd.DataFrame({
    'City': ['New York', 'Los Angeles', 'Chicago'],
    'Population': [8175133, 3792621, 2695598],
    'Area': [789, 503, 589],
})
# Use 'City' as the row index so that only the numeric columns get stacked.
df = df.set_index('City')
# stack() moves the column labels into an inner index level, giving a Series
# indexed by (City, column name).
stacked_df = df.stack()
print(stacked_df)
Output:
mathematica
Copy
City
New York Population 8175133
Area 789
Los Angeles Population 3792621
Area 503
Chicago Population 2695598
Area 589
dtype: int64
stack(): Converts columns into rows, resulting in a hierarchical index.
Unstacking DataFrame
# unstack() pivots the innermost index level back out into columns,
# reversing the earlier stack().
unstacked_df = stacked_df.unstack(level=-1)
print(unstacked_df)
Output:
sql
Copy
City Population Area
New York 8175133 789
Los Angeles 3792621 503
Chicago 2695598 589
Reshaping with pivot_table()
# Long-format readings with two measurements per (Date, City) pair.
df = pd.DataFrame({
    'Date': ['2025-03-01', '2025-03-01', '2025-03-02', '2025-03-02'],
    'City': ['New York', 'Los Angeles', 'New York', 'Los Angeles'],
    'Temperature': [58, 70, 60, 72],
    'Humidity': [60, 50, 65, 55],
})
# pivot_table aggregates while reshaping: here, the mean Temperature for each
# Date (rows) and City (columns).
pivot_table_df = pd.pivot_table(
    df,
    index='Date',
    columns='City',
    values='Temperature',
    aggfunc='mean',
)
print(pivot_table_df)
Output:
sql
Copy
City Los Angeles New York
Date
2025-03-01 70.0 58.0
2025-03-02 72.0 60.0
Handling Missing Data with pivot_table()
# Sample data with a gap: New York has no Temperature reading on 2025-03-02,
# and Los Angeles has no row at all for that date.
df_with_missing = pd.DataFrame({
    'Date': ['2025-03-01', '2025-03-01', '2025-03-02'],
    'City': ['New York', 'Los Angeles', 'New York'],
    'Temperature': [58, 70, None],
    'Humidity': [60, 50, 65],
})
# Pivot with mean aggregation. By default (dropna=True) pivot_table discards
# groups whose aggregated values are all NaN, which would silently remove the
# 2025-03-02 row; dropna=False keeps it so the missing data stays visible.
pivot_table_missing = df_with_missing.pivot_table(
    index='Date',
    columns='City',
    values='Temperature',
    aggfunc='mean',
    dropna=False,
)
print(pivot_table_missing)
Output:
pgsql
Copy
City Los Angeles New York
Date
2025-03-01 70.0 58.0
2025-03-02 NaN NaN