AMDA Practical - A048
Shubham Phatangare
Practical 1 Data preprocessing on the Walmart dataset
#Step 1
#Read the Walmart dataset
a=read.csv("C:\\Users\\ADMIN\\Downloads\\walmart (1).csv")
#Step 2
#checking dimension of the data
dim(a)
## [1] 421570 12
#Step 3a
#checking missing values
sapply(a,function(x) sum(is.na(x)))
#Method 1 - deletion of the variable with excessive missing values (column 10)
library(dplyr) #select() comes from dplyr
df=select(a,-10)
#Method 2-MICE
#install.packages("mice")
library(mice)
md.pattern(df)
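#md.pattern() only displays the missingness pattern; to actually impute with mice,
#a minimal sketch (m and seed are illustrative choices, not from the original):
imp=mice(df,m=5,seed=101,printFlag=FALSE) #default per-column imputation methods
df_complete=complete(imp,1) #extract the first completed dataset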
#Step 3b
#outliers
par(mfrow=c(2,4))
boxplot(df$Store,main="STORES")
boxplot(df$Dept,main="DEPARTMENT")
boxplot(df$Size,main="SIZE OF STORE")
boxplot(df$Temperature,main="TEMPERATURE")
boxplot(df$Fuel_Price,main="FUEL PRICE")
boxplot(df$CPI,main="CONSUMER PRICE INDEX")
boxplot(df$Unemployment,main="UNEMPLOYMENT RATE")
boxplot(df$Weekly_Sales,main="WEEKLY SALES")
#Treating outliers
#Quantile based flooring and capping
outlier_norm=function(x){
qntile=quantile(x,probs=c(.25,.75),na.rm=T) #Q1 and Q3
caps=quantile(x,probs=c(.05,.95),na.rm=T) #replacement values: 5th and 95th percentiles
H=1.5*IQR(x,na.rm=T) #whisker length
x[x<(qntile[1]-H)]=caps[1] #floor low outliers at the 5th percentile
x[x>(qntile[2]+H)]=caps[2] #cap high outliers at the 95th percentile
return(x)
}
#replacing outliers
df$Unemployment=outlier_norm(df$Unemployment)
par(mfrow=c(1,1))
boxplot(df$Unemployment,data=df,main="UNEMPLOYMENT RATE")
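#The same capping can be extended to other numeric columns; a non-destructive
#example on a copy (an optional step, not part of the original practical):
tmp=outlier_norm(df$Temperature)
boxplot(tmp,main="TEMPERATURE (capped)")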
#Step 3c
#Checking for normality of dependent continuous variable
#install.packages("moments")
library(moments)
#Filled density plot
d=density(df$Weekly_Sales)
d
##
## Call:
## density.default(x = df$Weekly_Sales)
##
## Data: df$Weekly_Sales (421570 obs.); Bandwidth 'bw' = 913
##
## x y
## Min. : -7728 Min. :0.000e+00
## 1st Qu.:168164 1st Qu.:0.000e+00
## Median :344055 Median :5.700e-10
## Mean :344055 Mean :1.420e-06
## 3rd Qu.:519947 3rd Qu.:4.364e-08
## Max. :695838 Max. :7.422e-05
plot(d,main="Weekly Sales")
polygon(d,col="red",border="blue")
skewness(df$Weekly_Sales)
## [1] 3.261997
#As an example, examine the skewness of Unemployment and try transformations
d=density(df$Unemployment)
plot(d,main="Unemployment")
polygon(d,col="red",border="blue")
skewness(df$Unemployment)
## [1] 0.7303465
#Transformation
#log
df$Unemploymentlog=log(df$Unemployment)
skewness(df$Unemploymentlog)
## [1] 0.012207
d=density(df$Unemploymentlog)
plot(d,main="Unemploymentlog")
polygon(d,col="red",border="blue")
#power transform - square root
df$Unemploymentsrt=sqrt(df$Unemployment)
skewness(df$Unemploymentsrt)
## [1] 0.373224
d=density(df$Unemploymentsrt)
plot(d,main="Unemploymentsrt")
polygon(d,col="red",border="blue")
#power transform - inverse cube root
df$Unemploymentcube=(df$Unemployment)^(-1/3)
skewness(df$Unemploymentcube)
## [1] 0.228456
d=density(df$Unemploymentcube)
plot(d,main="Unemploymentcube")
polygon(d,col="red",border="blue")
#Box-Cox style power transform via the car package
library(car)
s=powerTransform(df$Unemployment)
summary(s)
#apply the power estimated by powerTransform (lambda of approximately -0.02)
df$UnemploymentPT=df$Unemployment^(-0.02)
skewness(df$UnemploymentPT)
## [1] 0.002248032
d=density(df$UnemploymentPT)
plot(d,main="UnemploymentPT")
polygon(d,col="red",border="blue")
#QQ-plot
qqnorm(df$Unemployment,pch=1,frame=FALSE)
qqline(df$Unemployment,col="steelblue",lwd=2)
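#For comparison (an added check, not in the original): the QQ-plot of the
#power-transformed variable should follow the reference line more closely
qqnorm(df$UnemploymentPT,pch=1,frame=FALSE)
qqline(df$UnemploymentPT,col="steelblue",lwd=2)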
#Step 3d
#Inconsistency
#Recode the logical IsHoliday flag as 0/1 and tabulate (a plausible
#reconstruction; the code producing the output below was not shown):
df$IsHoliday=ifelse(df$IsHoliday==TRUE,1,0)
table(df$IsHoliday)
##
## 0 1
## 391909 29661
#One-hot encode the remaining categorical variables
library(caret) #dummyVars() comes from caret
dmy=dummyVars(" ~ .",data=df,fullRank=T)
df=data.frame(predict(dmy,newdata=df))
str(df)
#Standardize the continuous predictors (z-scores)
df$Temperature=scale(df$Temperature)
df$Fuel_Price=scale(df$Fuel_Price)
df$CPI=scale(df$CPI)
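#Quick sanity check (an addition): each scaled column should have mean ~0 and sd ~1
round(c(mean(df$CPI),sd(df$CPI)),4)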
#Step 5
#Feature engineering
a$Date=as.Date(a$Date) #Date comes in as character; convert first (assumes ISO yyyy-mm-dd; adjust the format argument if not)
a$year=format(a$Date,"%Y")
a$day=format(a$Date,"%d")
a$month=format(a$Date,"%m")
#Step 6
#train-test split
set.seed(101) #set a seed so the same sample can be reproduced later
#Select 70% of the rows of the data as the training sample
sample=sample.int(n=nrow(df),size=floor(.70*nrow(df)),replace=F)
train=df[sample,]
test=df[-sample,]
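#Check the split sizes (should be roughly 70/30)
nrow(train)
nrow(test)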
Practical 2 Univariate and bivariate analysis on the UScereal data
library(MASS)
library(lattice)
data=UScereal
#Univariate Analysis
plot(data$mfr)
histogram(data$mfr)
#Barchart
table=table(data$mfr)
barplot(table,main="manufacturer",xlab="manufacturer",ylab="count",col="blue")
#Piechart
table1=table(data$vitamins)
pie(table1,main="vitamins",col=rainbow(length(table1)))
#Boxplot
par(mfrow=c(1,2))
boxplot(data$calories,main="For calories",col="pink") #Positively skewed
boxplot(data$carbo,main="For carbs",col="yellow") #Symmetric
#Histogram
par(mfrow=c(1,1))
hist(data$fibre,col="orange",border="black",xlab="fibre",prob=T,main="Histogram of fibre")
lines(density(data$fibre),lwd=2,col="black")
#Linechart
plot(data$sugars,type="o",col="red",xlab="sugar",main="Sugar")
#Bivariate Analysis
#Quantitative vs Quantitative
#scatterplot
plot(data$calories,data$fat,xlab="calories",ylab="fat",main="calories vs fat",lwd=2)
plot(data$calories,data$sugars,pch=data$shelf)
legend("topright",c("Shelf 1","Shelf 2","Shelf 3"),cex=0.8,pch=1:3)
library(reshape2)
library(ggplot2)
dat=data.frame(data$protein,data$fat)
ggplot(melt(dat),aes(value,color=variable))+geom_density()
#Quantitative vs Qualitative
boxplot(data$calories~data$mfr)
#n is positively skewed and r is negatively skewed
#Correlation plot
num=Filter(is.numeric,data) #keep only the numeric columns (avoids shadowing base c())
#install.packages("corrplot")
library(corrplot)
library(RColorBrewer)
m=cor(num)
corrplot(m,type="upper",order="hclust",col=brewer.pal(n=8,name="RdYlBu"))
#Correlation plot with histogram
#install.packages("psych")
library(psych)
pairs.panels(dat,density=T,ellipses=F)
Practical 3 Factor analysis on the mtcars data
#install.packages("FactoMineR")
library(FactoMineR)
library(psych)
cars=mtcars
summary(cars)
#checking correlation
cordata=cor(cars)
cordata
#A strong correlation structure indicates that factor analysis may be useful;
#check the determinant of the correlation matrix
det(cordata)
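#Two common adequacy checks before factoring (added here as a sketch): the KMO
#measure of sampling adequacy and Bartlett's test of sphericity, both from psych
KMO(cordata)
cortest.bartlett(cordata,n=nrow(cars))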
library(ggplot2)
fafitfree=fa(cordata,nfactors=ncol(cordata),rotate="none")
n_factors=length(fafitfree$e.values)
scree=data.frame(factor_n=as.factor(1:n_factors),eigenvalue=fafitfree$e.values)
ggplot(scree,aes(x=factor_n,y=eigenvalue,group=1))+geom_point()+geom_line()+
xlab("no. of factors")+ylab("initial eigenvalue")+labs(title="scree plot")
fa.none=fa(cordata,nfactors=3,n.obs=NA,n.iter=1,rotate="varimax",scores="regression",fm="pa",max.iter=50)
fa.none
fa.diagram(fa.none) #black lines: positively related to that factor; red lines: negatively related
factor.scores(cars,fa.none,Phi=NULL,method="Bartlett",rho=NULL,impute="none")
Practical 4 Discriminant analysis
#1)View Dataset
View(iris)
str(iris)
#2)Draw scatter plots along with the correlation coefficients for each pair of
#variables other than "Species"
library(psych)
pairs.panels(iris[1:4],gap=0,bg=c("red","green","purple")[iris$Species],pch=21)
#p-value<0.05
#Reject H0, i.e. the population variances are not equal
library(MASS)
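#The split into "training" and "testing" used below was omitted; a plausible
#reconstruction (the seed and the 60/40 ratio are assumptions):
set.seed(123)
ind=sample(2,nrow(iris),replace=TRUE,prob=c(0.6,0.4))
training=iris[ind==1,]
testing=iris[ind==2,]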
#LDA
linear=lda(Species~.,training)
linear
linear$prior
linear$counts
linear$scaling
p=predict(linear,training)
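#Stacked histograms of the first linear discriminant by class (an optional
#visual check, not in the original)
ldahist(p$x[,1],g=training$Species)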
#6)Obtain the confusion matrix and calculate accuracy for train as well as test data
library(caret)
library(e1071)
confusionMatrix(p$class,training$Species)
tab=table(Predicted=p$class,Actual=training$Species) #confusion matrix for the training data
sum(diag(tab))/sum(tab) #accuracy
## [1] 0.9767442
p2=predict(linear,testing)$class
tab2=table(Predicted=p2,Actual=testing$Species)
tab2
## Actual
## Predicted setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 22 0
## virginica 0 1 24
sum(diag(tab2))/sum(tab2)
## [1] 0.984375
Practical 5 Cluster analysis
#USArrests data cluster analysis
View(USArrests)
d=scale(USArrests)
"We scale the data so that factors which have large values won't influence"
singlec=hclust(dist(d),method="single")
plot(singlec) #2 clusters
completec=hclust(dist(d),method="complete")
plot(completec) #4 clusters
#K-means cluster analysis
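#Sanity check on the number of clusters (an added step, not in the original):
#elbow plot of the total within-cluster sum of squares
wss=sapply(1:10,function(k) kmeans(d,centers=k,nstart=10)$tot.withinss)
plot(1:10,wss,type="b",xlab="number of clusters k",ylab="total within-cluster SS")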
kmc=kmeans(d,centers=3)
kmc
library(fpc)
library(prabclus)
plotcluster(d,kmc$cluster)
library(cluster)
clusplot(d,kmc$cluster,color=T,shade=T,labels=2,lines=0)
#Dunn Index-value between 0 to infinity, higher the better
library(clValid)
avgc=hclust(dist(d),method="average") #average linkage clustering (needed for the cut below)
cc=cutree(avgc,3)
dunn(dist(d),cc)
## [1] 0.2214287
#Ward's method
wm=hclust(dist(d),method="ward.D")
plot(wm)
rect.hclust(wm,4)
profiling=cutree(wm,4)
tab=data.frame(d,profiling)
View(tab)
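#Profile the clusters on the original scale (an added step): variable means per cluster
aggregate(USArrests,by=list(cluster=profiling),FUN=mean)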
"K-means clustering does involve a random selectional process for the initial
centroid
guesses, so you may get different results from different runs."
## [1] "K-means clustering does involve a random selectional process for the
initial centroid\nguesses, so you may get different results from different
runs."