
AMDA-Practical

Shubham Phatangare

Roll No: A048

SAP ID: 86072300048

Practical 1
#Step 1
a=read.csv("C:\\Users\\ADMIN\\Downloads\\walmart (1).csv")

#Step 2
#checking dimension of the data
dim(a)

## [1] 421570 12

#Getting data types of all variables


sapply(a,class)

#changing dtype of date variable from character to date


a$Date=as.Date(a$Date,"%d-%m-%Y")
sapply(a,class)

#checking the structure of data


str(a)

#Step 3a
#checking missing values
sapply(a,function(x) sum(is.na(x)))
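The same check can be expressed as a share of rows, which is easier to read at this scale; a one-line sketch:

round(colMeans(is.na(a))*100,2) #percentage of missing values per column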

#Treating missing values


#Method 1
library(dplyr)

#deletion of variable
df=select(a,-10)

#replacing with mean or median for numeric variable


df$Temperature[is.na(df$Temperature)]=round(mean(df$Temperature,na.rm=TRUE))
df$CPI[is.na(df$CPI)]=round(mean(df$CPI,na.rm=TRUE))

#replacing missing value for logical/categorical variable


table(df$IsHoliday) #if FALSE is more frequent, replace missing values with FALSE, and vice versa
df$IsHoliday[is.na(df$IsHoliday)]=FALSE
str(df)

my_mode=function(x){ #mode function for categorical/logical variables
  unique_x=unique(x) #unique values of x
  tabulate_x=tabulate(match(x,unique_x)) #frequency of each unique value
  unique_x[tabulate_x==max(tabulate_x)] #value(s) with the highest frequency
}
df$IsHoliday[is.na(df$IsHoliday)]=my_mode(df$IsHoliday)

#Method 2-MICE
#install.packages("mice")
library(mice)
md.pattern(df)
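md.pattern only displays the missingness pattern; a minimal imputation sketch with mice follows, run on a small random subsample of the numeric columns for speed (the subsample size, method and seed are illustrative, not part of the original script):

num_sample=df[sample(nrow(df),5000),sapply(df,is.numeric)] #hypothetical subsample of numeric columns
imp=mice(num_sample,m=5,method="pmm",maxit=5,seed=123) #predictive mean matching, 5 imputed datasets
df_imputed=complete(imp,1) #extract the first completed dataset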


#Step 3b

#outliers
par(mfrow=c(2,4))
boxplot(df$Store,data=df,main="STORES")
boxplot(df$Dept,data=df,main="DEPARTMENT")
boxplot(df$Size,data=df,main="SIZE OF STORE")
boxplot(df$Temperature,data=df,main="TEMPERATURE")
boxplot(df$Fuel_Price,data=df,main="FUEL PRICE")
boxplot(df$CPI,data=df,main="CONSUMER PRICE INDEX")
boxplot(df$Unemployment,data=df,main="UNEMPLOYMENT RATE")
boxplot(df$Weekly_Sales,data=df,main="WEEKLY SALES")

#Treating outliers
#Quantile based flooring and capping
outlier_norm=function(x){
qntile=quantile(x,probs=c(.25,.75))
caps=quantile(x,probs=c(.05,.95))
H=1.5*IQR(x,na.rm=T)
x[x<(qntile[1]-H)]=caps[1]
x[x>(qntile[2]+H)]=caps[2]
return(x)
}

#replacing outliers
df$Unemployment=outlier_norm(df$Unemployment)
par(mfrow=c(1,1))
boxplot(df$Unemployment,data=df,main="UNEMPLOYMENT RATE")
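The same flooring and capping can be applied to other continuous variables; a short sketch (the column list is an assumption, choose whichever variables show outliers in the boxplots above):

for(col in c("Temperature","Fuel_Price","CPI")){
  df[[col]]=outlier_norm(df[[col]]) #floor/cap outliers column by column
}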
#Step 3c
#Checking for normality of dependent continuous variable
#install.packages("moments")
library(moments)
#Filled density plot
d=density(df$Weekly_Sales)
d

##
## Call:
## density.default(x = df$Weekly_Sales)
##
## Data: df$Weekly_Sales (421570 obs.); Bandwidth 'bw' = 913
##
## x y
## Min. : -7728 Min. :0.000e+00
## 1st Qu.:168164 1st Qu.:0.000e+00
## Median :344055 Median :5.700e-10
## Mean :344055 Mean :1.420e-06
## 3rd Qu.:519947 3rd Qu.:4.364e-08
## Max. :695838 Max. :7.422e-05

plot(d,main="Weekly Sales")
polygon(d,col="red",border="blue")
skewness(df$Weekly_Sales)

## [1] 3.261997

#Our dependent variable Weekly_Sales is heavily right-skewed (skewness ~ 3.26) and is better suited to non-linear modelling.

#To illustrate normality checks and transformations we use the Unemployment variable below.

#for example
d=density(df$Unemployment)
plot(d,main="Unemployment")
polygon(d,col="red",border="blue")
skewness(df$Unemployment)

## [1] 0.7303465

#Transformation
#log
df$Unemploymentlog=log(df$Unemployment)
skewness(df$Unemploymentlog)

## [1] 0.012207

d=density(df$Unemploymentlog)
plot(d,main="Unemploymentlog")
polygon(d,col="red",border="blue")
#power transformation: square root
df$Unemploymentsrt=sqrt(df$Unemployment)
skewness(df$Unemploymentsrt)

## [1] 0.373224

d=density(df$Unemploymentsrt)
plot(d,main="Unemploymentsrt")
polygon(d,col="red",border="blue")
#power transformation: reciprocal cube root
df$Unemploymentcube=(df$Unemployment)^(-1/3)
skewness(df$Unemploymentcube)

## [1] 0.228456

d=density(df$Unemploymentcube)
plot(d,main="Unemploymentcube")
polygon(d,col="red",border="blue")
#car library
library(car)

s=powerTransform(df$Unemployment)
summary(s)

df$UnemploymentPT=df$Unemployment^(-0.02)
skewness(df$UnemploymentPT)

## [1] 0.002248032

d=density(df$UnemploymentPT)
plot(d,main="UnemploymentPT")
polygon(d,col="red",border="blue")
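Instead of hard-coding the exponent -0.02, the estimated lambda can be taken directly from the powerTransform fit; a sketch assuming car's coef() and bcPower() helpers (the new column name is illustrative):

lambda=coef(s) #estimated Box-Cox lambda from powerTransform
df$UnemploymentBC=bcPower(df$Unemployment,lambda) #apply the Box-Cox transformation
skewness(df$UnemploymentBC)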
#QQ-plot
qqnorm(df$Unemployment,pch=1,frame=FALSE)
qqline(df$Unemployment,col="steelblue",lwd=2)
#Step 3d
#Inconsistency

#Encoding the categorical variables


#Label Encoding
df$IsHoliday=ifelse(df$IsHoliday=="TRUE",1,0)
table(df$IsHoliday)

##
## 0 1
## 391909 29661

#One hot encoding(creating dummy variables)


library(caret)

dmy=dummyVars(" ~ .",data=df,fullRank=T)
df=data.frame(predict(dmy,newdata=df))
str(df)

#Scaling data: when variables are measured in different units, we scale them so that no single variable distorts the analysis.

df$Temperature=scale(df$Temperature)
df$Fuel_Price=scale(df$Fuel_Price)
df$CPI=scale(df$CPI)

#Step 5
#Feature engineering
a$year=format(a$Date,"%Y")
a$day=format(a$Date,"%d")
a$month=format(a$Date,"%m")
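Other calendar features can be derived from the Date column in the same way; a small sketch using base R helpers (the new column names are illustrative):

a$weekday=weekdays(a$Date) #day of the week
a$quarter=quarters(a$Date) #calendar quarter (Q1-Q4)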

#Step 6
#train-test split
set.seed(101) #Set seed so that same sample can be reproduced in future also
#Now selecting 70% of data as sample from total 'n' rows of the data
sample=sample.int(n=nrow(df),size=floor(.70*nrow(df)),replace=F)
train=df[sample,]
test=df[-sample,]
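An alternative split that stratifies on the outcome can be made with caret's createDataPartition (caret is already loaded above); this is a sketch, not part of the original workflow:

idx=createDataPartition(df$Weekly_Sales,p=0.7,list=FALSE) #row indices for a 70% training sample
train2=df[idx,]
test2=df[-idx,]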
Practical 2 Univariate and bivariate analysis on UScereal data
library(MASS)

library(lattice)
data=UScereal

#Univariate Analysis
plot(data$mfr)

histogram(data$mfr)
#Barchart
mfr_table=table(data$mfr)
barplot(mfr_table,main="manufacturer",xlab="manufacturer",ylab="count",col="blue")
#Piechart
table1=table(data$vitamins)
pie(table1,main="vitamins",col=rainbow(length(table1)))
#Boxplot
par(mfrow=c(1,2))
boxplot(data$calories,main="For calories",col="pink") #Positively skewed
boxplot(data$carbo,main="For carbs",col="yellow") #Symmetric

#Histogram
par(mfrow=c(1,1))
hist(data$fibre,col="orange",border="black",xlab="fibre",prob=T,main="Histogram of fibre")
lines(density(data$fibre),lwd=2,col="black")
#Linechart
plot(data$sugars,type="o",col="red",xlab="sugar",main="Sugar")
#Bivariate Analysis

#Qualitative vs Qualitative

#multiple bar diagram


counts=table(data$vitamins,data$mfr)
barplot(counts,main="Vitamins by different
manufactures",beside=T,legend=rownames(counts),col=5:7)

#stacked bar diagram


counts=table(data$shelf,data$mfr)
barplot(counts,main="Shelves by different
manufactures",beside=F,legend=rownames(counts),col=5:7)
#Quantitative vs Quantitative

#scatterplot
plot(data$calories,data$fat,xlab="calories",ylab="fat",main="calories vs fat",lwd=2)
plot(data$calories,data$sugars,pch=data$shelf)
legend("topright",c("Shelf 1","Shelf 2","Shelf 3"),cex=0.8,pch=1:3)
library(reshape2)

## Warning: package 'reshape2' was built under R version 4.3.2

library(ggplot2)
dat=data.frame(data$protein,data$fat)
ggplot(melt(dat),aes(value,color=variable))+geom_density()

## No id variables; using all as measure variables

#Quantitative vs Qualitative
boxplot(data$calories~data$mfr)
#Calories for manufacturer N are positively skewed while those for R are negatively skewed
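Numerical summaries by group complement the boxplot; a quick sketch:

tapply(data$calories,data$mfr,summary) #summary of calories per manufacturer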

#Correlation plot
numdata=Filter(is.numeric,data) #keep only numeric columns
#install.packages("corrplot")
library(corrplot)

## corrplot 0.92 loaded

library(RColorBrewer)
m=cor(numdata)
corrplot(m,type="upper",order="hclust",col=brewer.pal(n=8,name="RdYlBu"))
#Correlation plot with histogram
#install.packages("psych")
library(psych)

## Warning: package 'psych' was built under R version 4.3.3

##
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
##
##     %+%, alpha

## The following object is masked from 'package:car':
##
##     logit

pairs.panels(dat,density=T,ellipses=F)
Practical 3 Run Factor analysis on mtcars data
#install.packages("FactoMineR")
library(FactoMineR)
library(psych)
cars=mtcars
summary(cars)

str(cars) #checking for quantitative and qualitative data

cars=cars[,c(-8,-9)] #removing qualitative data


str(cars)

#checking correlation
cordata=cor(cars)
cordata

#Bartlett's test of sphericity


#Bartlett's test of sphericity tests the hypothesis that the correlation matrix is an identity matrix.

#H0:Variables are uncorrelated


cortest.bartlett(cordata) #p value is very small so we reject H0

#This indicates that factor analysis may be useful for our data.

#Checking the determinant of the correlation matrix
det(cordata)

#The determinant is positive (the correlation matrix is not singular), so factor analysis is applicable


#KMO test: To check for sample adequacy
#above 0.9=superb; 0.8-0.9=great; 0.7-0.8=good; 0.5-0.7=mediocre; <0.5=unacceptable
KMO(cordata)

library(ggplot2)
fafitfree=fa(cordata,nfactors=ncol(cordata),rotate="none")
n_factors=length(fafitfree$e.values)
scree=data.frame(factor_n=as.factor(1:n_factors),eigenvalue=fafitfree$e.values)
ggplot(scree,aes(x=factor_n,y=eigenvalue,group=1))+geom_point()+geom_line()+
xlab("no. of factors")+ylab("initial eigenvalue")+labs(title="scree plot")
fa.none=fa(cordata,nfactors=3,n.obs=NA,n.iter=1,rotate="varimax",scores="regression",fm="pa",max.iter=50)
fa.none

fa.diagram(fa.none) #black lines show positive loadings and red lines negative loadings on a factor
factor.scores(cars,fa.none,Phi=NULL,method="Bartlett",rho=NULL,impute="none")
Practical 4 Discriminant analysis
#1)View Dataset
View(iris)
str(iris)

#2)Draw scatter plots along with the correlation coefficients for each pair of variables other than "Species"

library(psych)
pairs.panels(iris[1:4],gap=0,bg=c("red","green","purple")[iris$Species],pch=21)

#3)Partition data into train (60%) and test (40%)


set.seed(555)
ind=sample(2,nrow(iris),replace=TRUE,prob=c(0.6,0.4))
training=iris[ind==1,]
testing=iris[ind==2,]

#4)Run discriminant analysis on train data with "Species" as the dependent variable
library(biotools)
res=boxM(iris[,1:4],iris[,"Species"]) #Box's M test compares the covariance matrices across groups
res
#Bartlett's test for homogeneity of variances
#H0:Population variances are equal
bartlett.test(c(iris$Sepal.Length+iris$Sepal.Width+iris$Petal.Length+iris$Petal.Width)~iris$Species)

#pvalue<0.05
#Reject H0 i.e.populaton variances are not equal

library(MASS)
#LDA

linear=lda(Species~.,training)
linear

#The output gives the prior probabilities, group means and the coefficients of the two linear discriminant functions (LD1 and LD2):

linear$prior

##     setosa versicolor  virginica
##  0.3837209  0.3139535  0.3023256

linear$counts

linear$scaling

p=predict(linear,training)

#5)Draw histograms to check how well the discriminant functions separate the 3 classes
ldahist(data=p$x[,1],g=training$Species)
ldahist(data=p$x[,2],g=training$Species)
#In the first plot, LD1 separates all 3 species better than LD2

#6)Obtain the confusion matrix and calculate accuracy for train as well as test data
library(caret)
library(e1071)
confusionMatrix(p$class,training$Species)

tab=table(Predicted=p$class,Actual=training$Species) #confusion matrix for train data
sum(diag(tab))/sum(tab) #accuracy

## [1] 0.9767442

p2=predict(linear,testing)$class
tab2=table(Predicted=p2,Actual=testing$Species)
tab2

## Actual
## Predicted setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 22 0
## virginica 0 1 24

sum(diag(tab2))/sum(tab2)

## [1] 0.984375
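Since Box's M test above rejected equal covariance matrices, quadratic discriminant analysis is a common alternative to LDA; a brief sketch using MASS::qda, not part of the original solution:

quad=qda(Species~.,training) #quadratic discriminant analysis on the training data
pq=predict(quad,testing)$class
mean(pq==testing$Species) #test accuracy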
Practical 5 Cluster analysis
#USArrests data cluster analysis
View(USArrests)
d=scale(USArrests)

"We scale the data so that factors which have large values won't influence"

#Hierarchical clustering analysis

avgc=hclust(dist(d),method="average") #default is complete linkage


plot(avgc) #3 clusters

singlec=hclust(dist(d),method="single")
plot(singlec) #2 clusters
completec=hclust(dist(d),method="complete")
plot(completec) #4 clusters
#K-means cluster analysis

kmc=kmeans(d,centers=3)
kmc
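The choice of k = 3 can be sanity-checked with the elbow method; a minimal sketch computing the total within-cluster sum of squares for k = 1 to 10:

wss=sapply(1:10,function(k) kmeans(d,centers=k,nstart=25)$tot.withinss) #within-cluster SS per k
plot(1:10,wss,type="b",xlab="number of clusters k",ylab="total within-cluster SS")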

library(fpc)
library(prabclus)

plotcluster(d,kmc$cluster)

library(cluster)

clusplot(d,kmc$cluster,color=T,shade=T,labels=2,lines=0)
#Dunn Index-value between 0 to infinity, higher the better

library(clValid)

cc=cutree(avgc,3)
dunn(dist(d),cc)

## [1] 0.2214287

#dist(d)=Distance matrix computation
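Average silhouette width is another internal validity measure (the cluster package is loaded above); a sketch for the 3-cluster average-linkage solution:

sil=silhouette(cc,dist(d)) #silhouette widths for the 3-cluster solution
mean(sil[,"sil_width"]) #closer to 1 means better-separated clusters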

#Ward's method
wm=hclust(dist(d),method="ward.D")
plot(wm)

rect.hclust(wm,4)
profiling=cutree(wm,4)
tab=data.frame(d,profiling)
View(tab)
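Cluster profiles can then be summarized by averaging the original (unscaled) variables within each cluster; a short sketch:

aggregate(USArrests,by=list(cluster=profiling),FUN=mean) #mean crime rates per cluster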

"K-means clustering does involve a random selectional process for the initial
centroid
guesses, so you may get different results from different runs."

## [1] "K-means clustering does involve a random selectional process for the
initial centroid\nguesses, so you may get different results from different
runs."
