"""
Created on Tue May 9 10:16:28 2023
@author: student
2) Perform the following operations using Python on the given data
sets:
a. Data cleaning
b. Data integration
c. Data transformation
d. Error correcting
e. Data model building
"""
import pandas as pd
import numpy as np
#Treat '??' and '????' entries as missing values (NaN) while reading
cars_data=pd.read_csv('Toyota.csv',index_col=0,na_values=['??','????'])
cars_data1=cars_data.copy(deep=True)
#Observe shape and values of dataset
cars_data.shape
cars_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Price      1436 non-null   int64
 1   Age        1336 non-null   float64
 2   KM         1421 non-null   float64
 3   FuelType   1336 non-null   object
 4   HP         1430 non-null   float64
 5   MetColor   1286 non-null   float64
 6   Automatic  1436 non-null   int64
 7   CC         1436 non-null   int64
 8   Doors      1436 non-null   object
 9   Weight     1436 non-null   int64
dtypes: float64(4), int64(4), object(2)
memory usage: 123.4+ KB
##-----------DATA CLEANING---------------------
cars_data.describe()
              Price          Age             KM           HP     MetColor
count   1436.000000  1336.000000    1421.000000  1430.000000  1286.000000
mean   10730.824513    55.672156   68647.239972   101.478322     0.674961
std     3626.964585    18.589804   37333.023589    14.768255     0.468572
min     4350.000000     1.000000       1.000000    69.000000     0.000000
25%     8450.000000    43.000000   43210.000000    90.000000     0.000000
50%     9900.000000    60.000000   63634.000000   110.000000     1.000000
75%    11950.000000    70.000000   87000.000000   110.000000     1.000000
max    32500.000000    80.000000  243000.000000   192.000000     1.000000

         Automatic           CC      Weight
count  1436.000000  1436.000000  1436.00000
mean      0.055710  1566.827994  1072.45961
std       0.229441   187.182436    52.64112
min       0.000000  1300.000000  1000.00000
25%       0.000000  1400.000000  1040.00000
50%       0.000000  1600.000000  1070.00000
75%       0.000000  1600.000000  1085.00000
max       1.000000  2000.000000  1615.00000
#Check count of missing values present in each column
cars_data.isnull().sum()
Price 0
Age 100
KM 15
FuelType 100
HP 6
MetColor 150
Automatic 0
CC 0
Doors 0
Weight 0
dtype: int64
import warnings
warnings.filterwarnings('ignore')
#Replacing NaN values for numeric variables by the mean / median of that variable
cars_data['Age'].fillna(cars_data['Age'].mean())
cars_data['Age'].fillna(cars_data['Age'].mean(),inplace=True)
cars_data['HP'].fillna(cars_data['HP'].mean(),inplace=True)
cars_data['KM'].fillna(cars_data['KM'].median(),inplace=True)
cars_data.isnull().sum()
Price 0
Age 0
KM 0
FuelType 100
HP 0
MetColor 150
Automatic 0
CC 0
Doors 0
Weight 0
dtype: int64
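# A more general sketch of the same idea (illustrative only, applied to a
# throwaway copy of cars_data1 so the steps below are unchanged): fill every
# numeric column with its median in one pass.
tmp_num=cars_data1.copy()
num_cols=tmp_num.select_dtypes(include='number').columns
tmp_num[num_cols]=tmp_num[num_cols].fillna(tmp_num[num_cols].median())
tmp_num[num_cols].isnull().sum()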
#Remove duplicate records (drop_duplicates() returns the de-duplicated frame; here all 1436 rows are unique)
cars_data.drop_duplicates()
      Price        Age       KM FuelType     HP  MetColor  Automatic    CC
0     13500  23.000000  46986.0   Diesel   90.0       1.0          0  2000
1     13750  23.000000  72937.0   Diesel   90.0       1.0          0  2000
2     13950  24.000000  41711.0   Diesel   90.0       NaN          0  2000
3     14950  26.000000  48000.0   Diesel   90.0       0.0          0  2000
4     13750  30.000000  38500.0   Diesel   90.0       0.0          0  2000
...     ...        ...      ...      ...    ...       ...        ...   ...
1431   7500  55.672156  20544.0   Petrol   86.0       1.0          0  1300
1432  10845  72.000000  63634.0   Petrol   86.0       0.0          0  1300
1433   8500  55.672156  17016.0   Petrol   86.0       0.0          0  1300
1434   7250  70.000000  63634.0      NaN   86.0       1.0          0  1300
1435   6950  76.000000      1.0   Petrol  110.0       0.0          0  1600

      Doors  Weight
0     three    1165
1         3    1165
2         3    1165
3         3    1165
4         3    1170
...     ...     ...
1431      3    1025
1432      3    1015
1433      3    1015
1434      3    1015
1435      5    1114

[1436 rows x 10 columns]
#Replacing NaN values for categorical variables
cars_data['FuelType'].value_counts()
FuelType
Petrol 1177
Diesel 144
CNG 15
Name: count, dtype: int64
#Fill with the mode (most frequent category), i.e. the first index of the count series
cars_data['FuelType'].fillna(cars_data['FuelType'].value_counts().index[0],inplace=True)
cars_data['MetColor'].fillna(cars_data['MetColor'].value_counts().index[0],inplace=True)
#Check that no missing values remain in any column
cars_data.isnull().sum()
Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64
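# An equivalent sketch using scikit-learn (assumed to be installed): the
# SimpleImputer with strategy='most_frequent' fills each column with its mode;
# applied here to a throwaway copy of the untouched data.
from sklearn.impute import SimpleImputer
tmp_cat=cars_data1[['FuelType','MetColor']].copy()
tmp_cat[:]=SimpleImputer(strategy='most_frequent').fit_transform(tmp_cat)
tmp_cat.isnull().sum()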
#---- DATA INTEGRATION----------------
d1={'RollNo':[1,2,3,4,5,6,7,8,9,10],
'StudentName':['Raj','Sunil','Ram','Mahesh','John','Deepak',
'Pooja','Gaurav','Nikita','Ganesh'],
'Age':[22,23,22,22,23,22,22,22,23,23]}
df1=pd.DataFrame(d1)
d2={'RollNo':[1,3,8,9,10,11,12,13,14,15],
'StudentName':['Raj','Ram','Gaurav','Nikita','Ganesh','Anil',
'Gitesh','Sudip','Sameer','Hrutvik'],
'Age':[22,22,22,23,22,22,23,22,22,22]}
df2=pd.DataFrame(d2)
merged=pd.merge(df1,df2) #Integrate common records - inner join on all shared columns
#Merging records for common roll numbers in both datasets
merged_df=pd.merge(df1,df2, on='RollNo') #Inner join on RollNo only
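# Other common integration patterns, sketched for comparison (the column
# suffixes below are hypothetical labels): keep all roll numbers from both
# sources, or stack the two datasets vertically and drop exact duplicates.
outer_df=pd.merge(df1,df2, on='RollNo', how='outer', suffixes=('_set1','_set2'))
stacked_df=pd.concat([df1,df2], ignore_index=True).drop_duplicates()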
#---- DATA TRANSFORMATION-----------
cars_data.dtypes
#Converting datatypes explicitly
cars_data['MetColor']=cars_data['MetColor'].astype('object')
cars_data['Automatic']=cars_data['Automatic'].astype('object')
cars_data.dtypes
cars_data['Price']=cars_data['Price'].astype('float')
cars_data['KM']=cars_data['KM'].astype('int64')
cars_data.dtypes
Price float64
Age float64
KM int64
FuelType object
HP float64
MetColor object
Automatic object
CC int64
Doors object
Weight int64
dtype: object
'''
# Define transformation functions
Normalization scales an attribute's values so that they fall in a small range,
such as 0 to 1 or -1 to 1, and is commonly applied before classification algorithms.
Three common ways are: 1. Decimal Scaling Normalization
                       2. Min-Max Normalization
                       3. Z-Score Normalization
'''
# 1. Decimal Scaling Normalization
cars_data['Price'].max() # 5 digit value
## Price is on the scale of ten thousands, so we can divide by 100000 to bring all values into the 0 to 1 range
def DScale_normalize(x):
return x / 100000
cars_data['Price'] =DScale_normalize(cars_data['Price'])
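# A more general sketch of decimal scaling (illustrative, not applied here):
# divide by 10**j, where j is the number of digits in the largest absolute
# value, so the result always falls strictly between -1 and 1.
def DScale_normalize_general(x):
    j = int(np.floor(np.log10(x.abs().max()))) + 1
    return x / (10 ** j)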
#2. Min-Max Normalization
def MinMax_normalize(x):
return (x - x.min()) /(x.max()-x.min())
# Apply the transformation function to a column KM
MinMax_normalize(cars_data['KM'])
cars_data['KM'] =MinMax_normalize(cars_data['KM'])
#3. Z Score Normalization
def ZScore_normalize(x):
return (x - x.mean()) / x.std()
# Apply the transformation function to a column
ZScore_normalize(cars_data['HP'])
cars_data['HP'] =ZScore_normalize(cars_data['HP'])
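# The same two normalizations with scikit-learn (assumed to be installed),
# applied to a throwaway copy of the raw data; these scalers ignore NaN entries.
from sklearn.preprocessing import MinMaxScaler, StandardScaler
tmp_scaled=cars_data1.copy()
tmp_scaled['KM_minmax']=MinMaxScaler().fit_transform(tmp_scaled[['KM']]).ravel()
# Note: StandardScaler uses the population std (ddof=0), while x.std() in
# ZScore_normalize uses the sample std (ddof=1), so results differ slightly.
tmp_scaled['HP_z']=StandardScaler().fit_transform(tmp_scaled[['HP']]).ravel()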
## get dummies for categorical data
d={'Student':['raj','sunil','raam','mahesh','jon','deepk','pooja'],
'age':[22,23,22,22,23,22,22],'Grade':['C','A','A','B','A','C','A']}
df=pd.DataFrame(d)
df_onGrade=pd.get_dummies(df,columns=['Grade'])
cars_sub1=cars_data[['Price','Age','FuelType','Automatic']]
dummy_df1=pd.get_dummies(cars_sub1,columns=['FuelType'])
dummy_df2=pd.get_dummies(cars_sub1,columns=['FuelType','Automatic'])
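# Optional refinements of the same encoding (illustrative): drop one dummy per
# variable to avoid redundant columns, and request integer 0/1 values.
dummy_df3=pd.get_dummies(cars_sub1,columns=['FuelType','Automatic'],
                         drop_first=True,dtype=int)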
#---- DATA ERROR CORRECTION
## Replace string entries to set a uniform numeric format
print(np.unique(cars_data['Doors'])) # mixed values: '2','3','4','5','five','four','three'
['2' '3' '4' '5' 'five' 'four' 'three']
cars_data['Doors'].replace('three',3,inplace=True)
cars_data['Doors'].replace('four',4,inplace=True)
cars_data['Doors'].replace('five',5,inplace=True)
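# Follow-up sketch: once every entry is numeric, the whole column can be cast
# to a single integer dtype and re-checked (expect the unique values 2 3 4 5).
cars_data['Doors']=cars_data['Doors'].astype('int64')
print(np.unique(cars_data['Doors']))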
# Other corrections include converting percent values to probabilities and vice versa, etc.
# Use the matplotlib library for visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Re-read the data, treating '??' and '????' entries as missing values (NaN)
cars_data=pd.read_csv('Toyota.csv',index_col=0,na_values=['??','????'])
cars_data1=cars_data.copy(deep=True)
# Observe shape and values of dataset
cars_data.shape
cars_data.info()
cars_data.isnull().sum()
# Removing rows that contain NaN values
cars_data.dropna(axis=0,inplace=True)
cars_data.size
cars_data.shape
(1096, 10)
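# Less aggressive alternatives, sketched on the untouched copy: keep rows with
# at least 8 non-null values, or drop a row only when key columns are missing.
cars_data1.dropna(thresh=8).shape
cars_data1.dropna(subset=['FuelType','KM']).shape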
## ---- Data Visualization using matplotlib library -------------------
## SCATTER PLOT
plt.scatter(cars_data['Age'],cars_data['Price'],c='blue')
plt.title("Scatter Plot Car Price vs Age")
plt.xlabel('Age in months')
plt.ylabel('Price in Dollars')
Text(0, 0.5, 'Price in Dollars')
## HISTOGRAM
plt.hist(cars_data["KM"])
# histogram with default arguments
plt.hist(cars_data['KM'],color='blue', edgecolor='white',bins=5)
plt.hist(cars_data['KM'],color='blue', edgecolor='white',bins=8)
# bins specify the count of distribution range
plt.title("Histogram of Kilometer run")
plt.xlabel('Kilometers')
plt.ylabel('Frequency')
plt.show()
'''
plt.show() starts an event loop, looks for all currently active figure objects,
and opens one or more interactive windows that display your figure or figures.
'''
## BAR PLOT
# Setting counts, fuelTypes and index based on the dataset
cars_data['FuelType'].value_counts() # get counts of the categorical variable
counts=cars_data['FuelType'].value_counts()
fuelTypes=('Petrol', 'Diesel','CNG')
index=np.arange(len(fuelTypes))
#counts=[50,100,75]
plt.bar(index,counts,color=['red','green','cyan'])
plt.title("Bar Plot of Fuel Type")
plt.xlabel('Fuel Used')
plt.ylabel('Frequency')
# Bar label
#plt.xticks(index,fuelTypes)
plt.xticks(index,fuelTypes,rotation=90)
plt.show()
## ---- Data Visualization using seaborn library -------------------
# SCATTER PLOT using seaborn regplot() and lmplot() method
sns.set(style='darkgrid')
sns.regplot(x=cars_data['Age'],y=cars_data['Price'])
sns.regplot(fit_reg=False,x=cars_data['Age'],y=cars_data['Price'],marker='+')
<Axes: xlabel='Age', ylabel='Price'>
##
sns.lmplot(x='Age',y='Price', data=cars_data)
sns.lmplot(x='Age',y='Price', data=cars_data,hue='FuelType')
<seaborn.axisgrid.FacetGrid at 0x260ea5a51d0>
##-------HISTOGRAM using seaborn distplot() ----------------
# Histogram with default kernel density estimate
sns.distplot(cars_data['Age'])
# sns.distplot(cars_data['FuelType']) would fail: distplot() needs a numeric column
sns.distplot(cars_data['KM'],bins=10)
<Axes: xlabel='KM', ylabel='Density'>
# Histogram without kernel density estimate
sns.distplot(cars_data['Age'],kde=False)
<Axes: xlabel='Age'>
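# sns.distplot() is deprecated in recent seaborn releases; a minimal sketch of
# the equivalent plots with histplot() (assumes seaborn >= 0.11):
sns.histplot(cars_data['Age'], kde=True)   # histogram with a density curve
sns.histplot(cars_data['KM'], bins=10)     # histogram without KDE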
## --------BAR PLOT using seaborn countplot()
sns.countplot(x='FuelType',data=cars_data)
<Axes: xlabel='FuelType', ylabel='count'>
#We can make a grouped bar plot by setting hue to another categorical variable
# Grouped bar plot of FuelType and Automatic
sns.countplot(x='FuelType',data=cars_data,hue='Automatic')
<Axes: xlabel='FuelType', ylabel='count'>
##--------- BOX PLOT using seaborn boxplot()-----
sns.boxplot(y=cars_data['Price'])
# box plot for numerical data vs categorical data
sns.boxplot(y=cars_data['Price'], x=cars_data['FuelType'])
<Axes: xlabel='FuelType', ylabel='Price'>
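##--------- DATA MODEL BUILDING (task e) -----
# A minimal sketch, assuming scikit-learn is available: fit a linear regression
# that predicts Price from a few numeric attributes of the cleaned dataset.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X=cars_data[['Age','KM','HP','Weight']]
y=cars_data['Price']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)
model=LinearRegression().fit(X_train,y_train)
print(model.score(X_test,y_test))   # R^2 on the held-out 20% split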