
AMDA-Practical

Shubham Phatangare

Roll No: A048

SAP ID: 86072300048

Practical 1
#Step 1
a=read.csv("C:\\Users\\ADMIN\\Downloads\\walmart (1).csv")

#Step 2
#checking dimension of the data
dim(a)

## [1] 421570 12

#Getting data types of all variables


sapply(a,class)

#changing dtype of date variable from character to date


a$Date=as.Date(a$Date,"%d-%m-%Y")
sapply(a,class)

#checking the structure of data


str(a)

#Step 3a
#checking missing values
sapply(a,function(x) sum(is.na(x)))
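The same check can be expressed as a share of rows, which is easier to read at this scale; a one-line sketch:

round(colMeans(is.na(a))*100,2) #percentage of missing values per column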

#Treating missing values


#Method 1
library(dplyr)

#deletion of variable
df=select(a,-10)

#replacing with mean or median for numeric variable


df$Temperature[is.na(df$Temperature)]=round(mean(df$Temperature,na.rm=TRUE))
df$CPI[is.na(df$CPI)]=round(mean(df$CPI,na.rm=TRUE))

#replacing missing value for logical/categorical variable


table(df$IsHoliday) #if FALSE is more frequent, replace missing values with FALSE, and vice versa
df$IsHoliday[is.na(df$IsHoliday)]=FALSE
str(df)

my_mode=function(x){ #mode function for categorical/logical variables
  unique_x=unique(x) #unique values of x
  tabulate_x=tabulate(match(x,unique_x)) #frequency of each unique value
  unique_x[tabulate_x==max(tabulate_x)] #value(s) with the highest frequency
}
df$IsHoliday[is.na(df$IsHoliday)]=my_mode(df$IsHoliday)

#Method 2-MICE
#install.packages("mice")
library(mice)
md.pattern(df)
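md.pattern only displays the missingness pattern; a minimal imputation sketch with mice follows, run on a small random subsample of the numeric columns for speed (the subsample size, method and seed are illustrative, not part of the original script):

num_sample=df[sample(nrow(df),5000),sapply(df,is.numeric)] #hypothetical subsample of numeric columns
imp=mice(num_sample,m=5,method="pmm",maxit=5,seed=123) #predictive mean matching, 5 imputed datasets
df_imputed=complete(imp,1) #extract the first completed dataset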


#Step 3b

#outliers
par(mfrow=c(2,4))
boxplot(df$Store,data=df,main="STORES")
boxplot(df$Dept,data=df,main="DEPARTMENT")
boxplot(df$Size,data=df,main="SIZE OF STORE")
boxplot(df$Temperature,data=df,main="TEMPERATURE")
boxplot(df$Fuel_Price,data=df,main="FUEL PRICE")
boxplot(df$CPI,data=df,main="CONSUMER PRICE INDEX")
boxplot(df$Unemployment,data=df,main="UNEMPLOYMENT RATE")
boxplot(df$Weekly_Sales,data=df,main="WEEKLY SALES")

#Treating outliers
#Quantile based flooring and capping
outlier_norm=function(x){
qntile=quantile(x,probs=c(.25,.75))
caps=quantile(x,probs=c(.05,.95))
H=1.5*IQR(x,na.rm=T)
x[x<(qntile[1]-H)]=caps[1]
x[x>(qntile[2]+H)]=caps[2]
return(x)
}

#replacing outliers
df$Unemployment=outlier_norm(df$Unemployment)
par(mfrow=c(1,1))
boxplot(df$Unemployment,data=df,main="UNEMPLOYMENT RATE")
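The same flooring and capping can be applied to other continuous variables; a short sketch (the column list is an assumption, choose whichever variables show outliers in the boxplots above):

for(col in c("Temperature","Fuel_Price","CPI")){
  df[[col]]=outlier_norm(df[[col]]) #floor/cap outliers column by column
}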
#Step 3c
#Checking for normality of dependent continuous variable
#install.packages("moments")
library(moments)
#Filled density plot
d=density(df$Weekly_Sales)
d

##
## Call:
## density.default(x = df$Weekly_Sales)
##
## Data: df$Weekly_Sales (421570 obs.); Bandwidth 'bw' = 913
##
## x y
## Min. : -7728 Min. :0.000e+00
## 1st Qu.:168164 1st Qu.:0.000e+00
## Median :344055 Median :5.700e-10
## Mean :344055 Mean :1.420e-06
## 3rd Qu.:519947 3rd Qu.:4.364e-08
## Max. :695838 Max. :7.422e-05

plot(d,main="Weekly Sales")
polygon(d,col="red",border="blue")
skewness(df$Weekly_Sales)

## [1] 3.261997

#Our dependent variable Weekly_Sales is heavily right-skewed (skewness ~ 3.26) and is better suited to non-linear modelling.

#To illustrate normality checks and transformations we use the Unemployment variable below.

#for example
d=density(df$Unemployment)
plot(d,main="Unemployment")
polygon(d,col="red",border="blue")
skewness(df$Unemployment)

## [1] 0.7303465

#Transformation
#log
df$Unemploymentlog=log(df$Unemployment)
skewness(df$Unemploymentlog)

## [1] 0.012207

d=density(df$Unemploymentlog)
plot(d,main="Unemploymentlog")
polygon(d,col="red",border="blue")
#power transformation: square root
df$Unemploymentsrt=sqrt(df$Unemployment)
skewness(df$Unemploymentsrt)

## [1] 0.373224

d=density(df$Unemploymentsrt)
plot(d,main="Unemploymentsrt")
polygon(d,col="red",border="blue")
#power transformation: reciprocal cube root
df$Unemploymentcube=(df$Unemployment)^(-1/3)
skewness(df$Unemploymentcube)

## [1] 0.228456

d=density(df$Unemploymentcube)
plot(d,main="Unemploymentcube")
polygon(d,col="red",border="blue")
#car library
library(car)

s=powerTransform(df$Unemployment)
summary(s)

df$UnemploymentPT=df$Unemployment^(-0.02)
skewness(df$UnemploymentPT)

## [1] 0.002248032

d=density(df$UnemploymentPT)
plot(d,main="UnemploymentPT")
polygon(d,col="red",border="blue")
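Instead of hard-coding the exponent -0.02, the estimated lambda can be taken directly from the powerTransform fit; a sketch assuming car's coef() and bcPower() helpers (the new column name is illustrative):

lambda=coef(s) #estimated Box-Cox lambda from powerTransform
df$UnemploymentBC=bcPower(df$Unemployment,lambda) #apply the Box-Cox transformation
skewness(df$UnemploymentBC)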
#QQ-plot
qqnorm(df$Unemployment,pch=1,frame=FALSE)
qqline(df$Unemployment,col="steelblue",lwd=2)
#Step 3d
#Inconsistency

#Encoding the categorical variables


#Label Encoding
df$IsHoliday=ifelse(df$IsHoliday=="TRUE",1,0)
table(df$IsHoliday)

##
## 0 1
## 391909 29661

#One hot encoding(creating dummy variables)


library(caret)

dmy=dummyVars(" ~ .",data=df,fullRank=T)
df=data.frame(predict(dmy,newdata=df))
str(df)

#Scaling data: when variables are measured in different units, we scale them so that no single variable distorts the analysis.

df$Temperature=scale(df$Temperature)
df$Fuel_Price=scale(df$Fuel_Price)
df$CPI=scale(df$CPI)

#Step 5
#Feature engineering
a$year=format(a$Date,"%Y")
a$day=format(a$Date,"%d")
a$month=format(a$Date,"%m")
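Other calendar features can be derived from the Date column in the same way; a small sketch using base R helpers (the new column names are illustrative):

a$weekday=weekdays(a$Date) #day of the week
a$quarter=quarters(a$Date) #calendar quarter (Q1-Q4)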

#Step 6
#train-test split
set.seed(101) #Set seed so that same sample can be reproduced in future also
#Now selecting 70% of data as sample from total 'n' rows of the data
sample=sample.int(n=nrow(df),size=floor(.70*nrow(df)),replace=F)
train=df[sample,]
test=df[-sample,]
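An alternative split that stratifies on the outcome can be made with caret's createDataPartition (caret is already loaded above); this is a sketch, not part of the original workflow:

idx=createDataPartition(df$Weekly_Sales,p=0.7,list=FALSE) #row indices for a 70% training sample
train2=df[idx,]
test2=df[-idx,]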
Practical 2 Univariate and bivariate analysis on UScereal data
library(MASS)

library(lattice)
data=UScereal

#Univariate Analysis
plot(data$mfr)

histogram(data$mfr)
#Barchart
mfr_table=table(data$mfr)
barplot(mfr_table,main="manufacturer",xlab="manufacturer",ylab="count",col="blue")
#Piechart
table1=table(data$vitamins)
pie(table1,main="vitamins",col=rainbow(length(table1)))
#Boxplot
par(mfrow=c(1,2))
boxplot(data$calories,main="For calories",col="pink") #Positively skewed
boxplot(data$carbo,main="For carbs",col="yellow") #Symmetric

#Histogram
par(mfrow=c(1,1))
hist(data$fibre,col="orange",border="black",xlab="fibre",prob=T,main="Histogram of fibre")
lines(density(data$fibre),lwd=2,col="black")
#Linechart
plot(data$sugars,type="o",col="red",xlab="sugar",main="Sugar")
#Bivariate Analysis

#Qualitative vs Qualitative

#multiple bar diagram


counts=table(data$vitamins,data$mfr)
barplot(counts,main="Vitamins by different
manufactures",beside=T,legend=rownames(counts),col=5:7)

#stacked bar diagram


counts=table(data$shelf,data$mfr)
barplot(counts,main="Shelves by different
manufactures",beside=F,legend=rownames(counts),col=5:7)
#Quantitative vs Quantitative

#scatterplot
plot(data$calories,data$fat,xlab="calories",ylab="fat",main="calories vs fat",lwd=2)
plot(data$calories,data$sugars,pch=data$shelf)
legend("topright",c("Shelf 1","Shelf 2","Shelf 3"),cex=0.8,pch=1:3)
library(reshape2)

## Warning: package 'reshape2' was built under R version 4.3.2

library(ggplot2)
dat=data.frame(data$protein,data$fat)
ggplot(melt(dat),aes(value,color=variable))+geom_density()

## No id variables; using all as measure variables

#Quantitative vs Qualitative
boxplot(data$calories~data$mfr)
#Calories for manufacturer N are positively skewed while those for R are negatively skewed
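Numerical summaries by group complement the boxplot; a quick sketch:

tapply(data$calories,data$mfr,summary) #summary of calories per manufacturer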

#Correlation plot
numdata=Filter(is.numeric,data) #keep only numeric columns
#install.packages("corrplot")
library(corrplot)

## corrplot 0.92 loaded

library(RColorBrewer)
m=cor(numdata)
corrplot(m,type="upper",order="hclust",col=brewer.pal(n=8,name="RdYlBu"))
#Correlation plot with histogram
#install.packages("psych")
library(psych)

## Warning: package 'psych' was built under R version 4.3.3

##
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
##
##     %+%, alpha

## The following object is masked from 'package:car':
##
##     logit

pairs.panels(dat,density=T,ellipses=F)
Practical 3 Run Factor analysis on mtcars data
#install.packages("FactoMineR")
library(FactoMineR)
library(psych)
cars=mtcars
summary(cars)

str(cars) #checking for quantitative and qualitative data

cars=cars[,c(-8,-9)] #removing qualitative data


str(cars)

#checking correlation
cordata=cor(cars)
cordata

#Bartlett's test of sphericity


#Bartlett's test of sphericity tests the hypothesis that the correlation matrix is an identity matrix.

#H0:Variables are uncorrelated


cortest.bartlett(cordata) #p value is very small so we reject H0

#This indicates that factor analysis may be useful for our data.

#Checking the determinant of the correlation matrix
det(cordata)

#The determinant is positive (the correlation matrix is not singular), so factor analysis is applicable


#KMO test: To check for sample adequacy
#above 0.9=superb; 0.8-0.9=great; 0.7-0.8=good; 0.5-0.7=mediocre; <0.5=unacceptable
KMO(cordata)

library(ggplot2)
fafitfree=fa(cordata,nfactors=ncol(cordata),rotate="none")
n_factors=length(fafitfree$e.values)
scree=data.frame(factor_n=as.factor(1:n_factors),eigenvalue=fafitfree$e.values)
ggplot(scree,aes(x=factor_n,y=eigenvalue,group=1))+geom_point()+geom_line()+
xlab("no. of factors")+ylab("initial eigenvalue")+labs(title="scree plot")
fa.none=fa(cordata,nfactors=3,n.obs=NA,n.iter=1,rotate="varimax",scores="regression",fm="pa",max.iter=50)
fa.none

fa.diagram(fa.none) #black lines show positive loadings and red lines negative loadings on a factor
factor.scores(cars,fa.none,Phi=NULL,method="Bartlett",rho=NULL,impute="none")
Practical 4 Discriminant analysis
#1)View Dataset
View(iris)
str(iris)

#2)Draw scatter plots along with the correlation coefficients for each pair of variables other than "Species"

library(psych)
pairs.panels(iris[1:4],gap=0,bg=c("red","green","purple")[iris$Species],pch=21)

#3)Partition data into train (60%) and test (40%)


set.seed(555)
ind=sample(2,nrow(iris),replace=TRUE,prob=c(0.6,0.4))
training=iris[ind==1,]
testing=iris[ind==2,]

#4)Run discriminant analysis on train data with "Species" as the dependent variable
library(biotools)
res=boxM(iris[,1:4],iris[,"Species"]) #Box's M test compares the covariance matrices across groups
res
#Bartlett's test for homogeneity of variances
#H0:Population variances are equal
bartlett.test(c(iris$Sepal.Length+iris$Sepal.Width+iris$Petal.Length+iris$Petal.Width)~iris$Species)

#pvalue<0.05
#Reject H0 i.e.populaton variances are not equal

library(MASS)
#LDA

linear=lda(Species~.,training)
linear

#The output gives the prior probabilities, group means and the coefficients of the two linear discriminant functions (LD1 and LD2):

linear$prior

##     setosa versicolor  virginica
##  0.3837209  0.3139535  0.3023256

linear$counts

linear$scaling

p=predict(linear,training)

#5)Draw histograms to check how well the discriminant functions separate the 3 classes
ldahist(data=p$x[,1],g=training$Species)
ldahist(data=p$x[,2],g=training$Species)
#In the first plot, LD1 separates all 3 species better than LD2

#6)Obtain the confusion matrix and calculate accuracy for train as well as test data
library(caret)
library(e1071)
confusionMatrix(p$class,training$Species)

tab=table(Predicted=p$class,Actual=training$Species) #confusion matrix for train data
sum(diag(tab))/sum(tab) #accuracy

## [1] 0.9767442

p2=predict(linear,testing)$class
tab2=table(Predicted=p2,Actual=testing$Species)
tab2

## Actual
## Predicted setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 22 0
## virginica 0 1 24

sum(diag(tab2))/sum(tab2)

## [1] 0.984375
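Since Box's M test above rejected equal covariance matrices, quadratic discriminant analysis is a common alternative to LDA; a brief sketch using MASS::qda, not part of the original solution:

quad=qda(Species~.,training) #quadratic discriminant analysis on the training data
pq=predict(quad,testing)$class
mean(pq==testing$Species) #test accuracy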
Practical 5 Cluster analysis
#USArrests data cluster analysis
View(USArrests)
d=scale(USArrests)

"We scale the data so that factors which have large values won't influence"

#Hierarchical clustering analysis

avgc=hclust(dist(d),method="average") #default is complete linkage


plot(avgc) #3 clusters

singlec=hclust(dist(d),method="single")
plot(singlec) #2 clusters
completec=hclust(dist(d),method="complete")
plot(completec) #4 clusters
#K-means cluster analysis

kmc=kmeans(d,centers=3)
kmc
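The choice of k = 3 can be sanity-checked with the elbow method; a minimal sketch computing the total within-cluster sum of squares for k = 1 to 10:

wss=sapply(1:10,function(k) kmeans(d,centers=k,nstart=25)$tot.withinss) #within-cluster SS per k
plot(1:10,wss,type="b",xlab="number of clusters k",ylab="total within-cluster SS")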

library(fpc)
library(prabclus)

plotcluster(d,kmc$cluster)

library(cluster)

clusplot(d,kmc$cluster,color=T,shade=T,labels=2,lines=0)
#Dunn Index-value between 0 to infinity, higher the better

library(clValid)

cc=cutree(avgc,3)
dunn(dist(d),cc)

## [1] 0.2214287

#dist(d)=Distance matrix computation
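Average silhouette width is another internal validity measure (the cluster package is loaded above); a sketch for the 3-cluster average-linkage solution:

sil=silhouette(cc,dist(d)) #silhouette widths for the 3-cluster solution
mean(sil[,"sil_width"]) #closer to 1 means better-separated clusters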

#Ward's method
wm=hclust(dist(d),method="ward.D")
plot(wm)

rect.hclust(wm,4)
profiling=cutree(wm,4)
tab=data.frame(d,profiling)
View(tab)
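Cluster profiles can then be summarized by averaging the original (unscaled) variables within each cluster; a short sketch:

aggregate(USArrests,by=list(cluster=profiling),FUN=mean) #mean crime rates per cluster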

"K-means clustering does involve a random selectional process for the initial
centroid
guesses, so you may get different results from different runs."

## [1] "K-means clustering does involve a random selectional process for the
initial centroid\nguesses, so you may get different results from different
runs."
