Variable Selection
YIK LUN, KEI
[email protected]
This paper is a lab from the book An Introduction to Statistical Learning
with Applications in R. All R code and comments below belong to the
book and its authors.
Best Subset Selection
library(ISLR)
library(leaps)
# Drop rows containing missing values, then report the remaining dimensions.
Hitters <- na.omit(Hitters)
dim(Hitters)
## [1] 263  20
# Subset selection of Salary on all remaining columns, allowing models of up
# to 19 variables; keep the summary and list the components it provides.
regfit.full <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
reg.summary <- summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
RSS
# Line plot of the RSS values stored in the summary, one per model size.
plot(
  reg.summary$rss,
  xlab = "Number of Variables",
  ylab = "RSS",
  type = "l"
)
3.6e+07
3.2e+07
2.8e+07
RSS
2.4e+07
10
15
Number of Variables
Adjusted R-squared
# Find the model size with the largest adjusted R-squared and mark it on the
# curve. The index is computed once and reused, so the highlighted point stays
# correct if the data or the fit change (the original hard-coded 11).
best.adjr2 <- which.max(reg.summary$adjr2)
best.adjr2
## [1] 11
plot(
  reg.summary$adjr2,
  xlab = "Number of Variables",
  ylab = "Adjusted RSq",
  type = "l"
)
points(best.adjr2, reg.summary$adjr2[best.adjr2], col = "red", cex = 2, pch = 20)
0.50
0.45
0.40
0.35
Adjusted RSq
10
15
Number of Variables
Cp
# Find the model size with the smallest Cp and mark it on the curve. The index
# is computed once and reused, so the highlighted point stays correct if the
# data or the fit change (the original hard-coded 10).
best.cp <- which.min(reg.summary$cp)
best.cp
## [1] 10
plot(
  reg.summary$cp,
  xlab = "Number of Variables",
  ylab = "Cp",
  type = "l"
)
points(best.cp, reg.summary$cp[best.cp], col = "red", cex = 2, pch = 20)
100
80
60
20
40
Cp
10
15
Number of Variables
BIC
# Find the model size with the smallest BIC and mark it on the curve. The index
# is computed once and reused, so the highlighted point stays correct if the
# data or the fit change (the original hard-coded 6).
best.bic <- which.min(reg.summary$bic)
best.bic
## [1] 6
plot(
  reg.summary$bic,
  xlab = "Number of Variables",
  ylab = "BIC",
  type = "l"
)
points(best.bic, reg.summary$bic[best.bic], col = "red", cex = 2, pch = 20)
90
110
150
130
BIC
10
Number of Variables
Selected Variables
# Plot the fitted subsets object, ordering the models by R-squared.
plot(regfit.full, scale = "r2")
15
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
r2
0.55
0.55
0.55
0.55
0.55
0.55
0.54
0.54
0.54
0.54
0.53
0.53
0.51
0.51
0.49
0.48
0.45
0.43
0.32
# Plot the fitted subsets object, ordering the models by adjusted R-squared.
plot(regfit.full, scale = "adjr2")
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
adjr2
0.52
0.52
0.52
0.52
0.52
0.52
0.52
0.52
0.51
0.51
0.51
0.51
0.5
0.5
0.48
0.47
0.45
0.42
0.32
# Plot the fitted subsets object, ordering the models by Cp.
plot(regfit.full, scale = "Cp")
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
Cp
5
5.9
6.2
7.3
7.4
8.9
10
12
13
14
14
16
18
20
22
28
39
51
100
# Plot the fitted subsets object, ordering the models by BIC.
plot(regfit.full, scale = "bic")
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
bic
150
150
150
150
140
140
140
140
140
130
130
130
120
120
110
110
100
96
91
# Coefficients of the six-variable model (the size selected by BIC).
coef(regfit.full, id = 6)
##  (Intercept)        AtBat         Hits        Walks         CRBI
##   91.5117981   -1.8685892    7.6043976    3.6976468    0.6430169
##    DivisionW      PutOuts
## -122.9515338    0.2643076
Forward Stepwise Selection
# Same model search as before, but using the forward selection method.
regfit.fwd <- regsubsets(
  Salary ~ .,
  data = Hitters,
  nvmax = 19,
  method = "forward"
)
summary(regfit.fwd)
##
##
##
##
##
##
##
##
##
##
Subset selection object
Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
19 Variables (and intercept)
Forced in Forced out
AtBat
FALSE
FALSE
Hits
FALSE
FALSE
HmRun
FALSE
FALSE
Runs
FALSE
FALSE
RBI
FALSE
FALSE
Walks
FALSE
FALSE
9
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
Years
FALSE
FALSE
CAtBat
FALSE
FALSE
CHits
FALSE
FALSE
CHmRun
FALSE
FALSE
CRuns
FALSE
FALSE
CRBI
FALSE
FALSE
CWalks
FALSE
FALSE
LeagueN
FALSE
FALSE
DivisionW
FALSE
FALSE
PutOuts
FALSE
FALSE
Assists
FALSE
FALSE
Errors
FALSE
FALSE
NewLeagueN
FALSE
FALSE
1 subsets of each size up to 19
Selection Algorithm: forward
AtBat Hits HmRun Runs RBI Walks Years
1 ( 1 ) " "
" " " "
" " " " " "
" "
2 ( 1 ) " "
"*" " "
" " " " " "
" "
3 ( 1 ) " "
"*" " "
" " " " " "
" "
4 ( 1 ) " "
"*" " "
" " " " " "
" "
5 ( 1 ) "*"
"*" " "
" " " " " "
" "
6 ( 1 ) "*"
"*" " "
" " " " "*"
" "
7 ( 1 ) "*"
"*" " "
" " " " "*"
" "
8 ( 1 ) "*"
"*" " "
" " " " "*"
" "
9 ( 1 ) "*"
"*" " "
" " " " "*"
" "
10 ( 1 ) "*"
"*" " "
" " " " "*"
" "
11 ( 1 ) "*"
"*" " "
" " " " "*"
" "
12 ( 1 ) "*"
"*" " "
"*" " " "*"
" "
13 ( 1 ) "*"
"*" " "
"*" " " "*"
" "
14 ( 1 ) "*"
"*" "*"
"*" " " "*"
" "
15 ( 1 ) "*"
"*" "*"
"*" " " "*"
" "
16 ( 1 ) "*"
"*" "*"
"*" "*" "*"
" "
17 ( 1 ) "*"
"*" "*"
"*" "*" "*"
" "
18 ( 1 ) "*"
"*" "*"
"*" "*" "*"
"*"
19 ( 1 ) "*"
"*" "*"
"*" "*" "*"
"*"
CRBI CWalks LeagueN DivisionW PutOuts
1 ( 1 ) "*" " "
" "
" "
" "
2 ( 1 ) "*" " "
" "
" "
" "
3 ( 1 ) "*" " "
" "
" "
"*"
4 ( 1 ) "*" " "
" "
"*"
"*"
5 ( 1 ) "*" " "
" "
"*"
"*"
6 ( 1 ) "*" " "
" "
"*"
"*"
7 ( 1 ) "*" "*"
" "
"*"
"*"
8 ( 1 ) "*" "*"
" "
"*"
"*"
9 ( 1 ) "*" "*"
" "
"*"
"*"
10 ( 1 ) "*" "*"
" "
"*"
"*"
11 ( 1 ) "*" "*"
"*"
"*"
"*"
12 ( 1 ) "*" "*"
"*"
"*"
"*"
13 ( 1 ) "*" "*"
"*"
"*"
"*"
14 ( 1 ) "*" "*"
"*"
"*"
"*"
15 ( 1 ) "*" "*"
"*"
"*"
"*"
16 ( 1 ) "*" "*"
"*"
"*"
"*"
17 ( 1 ) "*" "*"
"*"
"*"
"*"
18 ( 1 ) "*" "*"
"*"
"*"
"*"
10
CAtBat CHits CHmRun CRuns
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
"*"
" "
"*"
"*"
"*"
" "
"*"
"*"
"*"
" "
"*"
"*"
"*"
" "
"*"
"*"
"*"
"*"
"*"
Assists Errors NewLeagueN
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
"*"
" "
" "
"*"
" "
" "
"*"
" "
" "
"*"
"*"
" "
"*"
"*"
" "
"*"
"*"
" "
"*"
"*"
" "
"*"
"*"
"*"
"*"
"*"
"*"
## 19
( 1 ) "*"
"*"
"*"
"*"
"*"
"*"
"*"
"*"
150
150
150
140
140
140
140
140
140
130
130
130
120
120
110
110
100
96
91
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
bic
# Plot the forward-selection fit, ordering the models by BIC.
plot(regfit.fwd, scale = "bic")
Backward Stepwise Selection
# Same model search as before, but using the backward selection method.
regfit.bwd <- regsubsets(
  Salary ~ .,
  data = Hitters,
  nvmax = 19,
  method = "backward"
)
summary(regfit.bwd)
##
##
##
##
##
##
##
##
##
##
##
##
##
Subset selection object
Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
19 Variables (and intercept)
Forced in Forced out
AtBat
FALSE
FALSE
Hits
FALSE
FALSE
HmRun
FALSE
FALSE
Runs
FALSE
FALSE
RBI
FALSE
FALSE
Walks
FALSE
FALSE
Years
FALSE
FALSE
CAtBat
FALSE
FALSE
CHits
FALSE
FALSE
11
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
##
CHmRun
FALSE
FALSE
CRuns
FALSE
FALSE
CRBI
FALSE
FALSE
CWalks
FALSE
FALSE
LeagueN
FALSE
FALSE
DivisionW
FALSE
FALSE
PutOuts
FALSE
FALSE
Assists
FALSE
FALSE
Errors
FALSE
FALSE
NewLeagueN
FALSE
FALSE
1 subsets of each size up to 19
Selection Algorithm: backward
AtBat Hits HmRun Runs RBI Walks Years
1 ( 1 ) " "
" " " "
" " " " " "
" "
2 ( 1 ) " "
"*" " "
" " " " " "
" "
3 ( 1 ) " "
"*" " "
" " " " " "
" "
4 ( 1 ) "*"
"*" " "
" " " " " "
" "
5 ( 1 ) "*"
"*" " "
" " " " "*"
" "
6 ( 1 ) "*"
"*" " "
" " " " "*"
" "
7 ( 1 ) "*"
"*" " "
" " " " "*"
" "
8 ( 1 ) "*"
"*" " "
" " " " "*"
" "
9 ( 1 ) "*"
"*" " "
" " " " "*"
" "
10 ( 1 ) "*"
"*" " "
" " " " "*"
" "
11 ( 1 ) "*"
"*" " "
" " " " "*"
" "
12 ( 1 ) "*"
"*" " "
"*" " " "*"
" "
13 ( 1 ) "*"
"*" " "
"*" " " "*"
" "
14 ( 1 ) "*"
"*" "*"
"*" " " "*"
" "
15 ( 1 ) "*"
"*" "*"
"*" " " "*"
" "
16 ( 1 ) "*"
"*" "*"
"*" "*" "*"
" "
17 ( 1 ) "*"
"*" "*"
"*" "*" "*"
" "
18 ( 1 ) "*"
"*" "*"
"*" "*" "*"
"*"
19 ( 1 ) "*"
"*" "*"
"*" "*" "*"
"*"
CRBI CWalks LeagueN DivisionW PutOuts
1 ( 1 ) " " " "
" "
" "
" "
2 ( 1 ) " " " "
" "
" "
" "
3 ( 1 ) " " " "
" "
" "
"*"
4 ( 1 ) " " " "
" "
" "
"*"
5 ( 1 ) " " " "
" "
" "
"*"
6 ( 1 ) " " " "
" "
"*"
"*"
7 ( 1 ) " " "*"
" "
"*"
"*"
8 ( 1 ) "*" "*"
" "
"*"
"*"
9 ( 1 ) "*" "*"
" "
"*"
"*"
10 ( 1 ) "*" "*"
" "
"*"
"*"
11 ( 1 ) "*" "*"
"*"
"*"
"*"
12 ( 1 ) "*" "*"
"*"
"*"
"*"
13 ( 1 ) "*" "*"
"*"
"*"
"*"
14 ( 1 ) "*" "*"
"*"
"*"
"*"
15 ( 1 ) "*" "*"
"*"
"*"
"*"
16 ( 1 ) "*" "*"
"*"
"*"
"*"
17 ( 1 ) "*" "*"
"*"
"*"
"*"
18 ( 1 ) "*" "*"
"*"
"*"
"*"
19 ( 1 ) "*" "*"
"*"
"*"
"*"
12
CAtBat CHits CHmRun CRuns
" "
" "
" "
"*"
" "
" "
" "
"*"
" "
" "
" "
"*"
" "
" "
" "
"*"
" "
" "
" "
"*"
" "
" "
" "
"*"
" "
" "
" "
"*"
" "
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
" "
" "
"*"
"*"
"*"
" "
"*"
"*"
"*"
" "
"*"
"*"
"*"
" "
"*"
"*"
"*"
" "
"*"
"*"
"*"
"*"
"*"
Assists Errors NewLeagueN
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
"*"
" "
" "
"*"
" "
" "
"*"
" "
" "
"*"
"*"
" "
"*"
"*"
" "
"*"
"*"
" "
"*"
"*"
" "
"*"
"*"
"*"
"*"
"*"
"*"
"*"
"*"
"*"
150
150
140
140
140
140
140
140
130
130
130
120
120
120
110
110
100
96
89
(Intercept)
AtBat
Hits
HmRun
Runs
RBI
Walks
Years
CAtBat
CHits
CHmRun
CRuns
CRBI
CWalks
LeagueN
DivisionW
PutOuts
Assists
Errors
NewLeagueN
bic
# Plot the backward-selection fit, ordering the models by BIC.
plot(regfit.bwd, scale = "bic")
Validation Set Approach
# Validation-set approach: randomly flag each row as training (TRUE) or test
# (FALSE), fit the subset models on the training rows only, and compute the
# test-set mean squared error for every model size from 1 to 19.
set.seed(1)
# `replace = TRUE` is spelled out: the original `rep=T` depended on partial
# argument matching and on `T`, which can be reassigned.
train <- sample(c(TRUE, FALSE), nrow(Hitters), replace = TRUE)
test <- !train
regfit.best <- regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19)
# Design matrix for the test rows; its column names are used to pick out the
# predictors that appear in each fitted model.
test.mat <- model.matrix(Salary ~ ., data = Hitters[test, ])
val.errors <- rep(NA, 19)
for (i in seq_len(19)) {
  coefi <- coef(regfit.best, id = i)
  # Predictions = selected design-matrix columns times their coefficients.
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((Hitters$Salary[test] - pred)^2)
}
val.errors
## [1] 220968.0 169157.1 178518.2 163426.1 168418.1 171270.6 162377.1
## [8] 157909.3 154055.7 148162.1 151156.4 151742.5 152214.5 157358.7
## [15] 158541.4 158743.3 159972.7 159859.8 160105.6
13
# Model size with the smallest validation error.
which.min(val.errors)
## [1] 10
# Coefficients of the ten-variable model from the training-row fit.
coef(regfit.best, id = 10)
## (Intercept)       AtBat        Hits       Walks      CAtBat       CHits
## -80.2751499  -1.4683816   7.1625314   3.6430345  -0.1855698   1.1053238
##      CHmRun      CWalks     LeagueN   DivisionW     PutOuts
##   1.3844863  -0.7483170  84.5576103 -53.0289658   0.2381662
# Refit on every row of Hitters and read off the ten-variable model.
regfit.best <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
coef(regfit.best, id = 10)
##  (Intercept)        AtBat         Hits        Walks       CAtBat
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798
##        CRuns         CRBI       CWalks    DivisionW      PutOuts
##    1.4082490    0.7743122   -0.8308264 -112.3800575    0.2973726
##      Assists
##    0.2831680
Cross-Validation
# Prediction method for fitted subset-selection objects.
#
# object:  fitted object whose stored call has the model formula as its
#          second element.
# newdata: data frame used to build the design matrix.
# id:      model identifier passed on to coef().
# ...:     accepted for compatibility with the generic; not used.
#
# Returns the product of the selected design-matrix columns and the
# coefficients of the chosen model.
predict.regsubsets <- function(object, newdata, id, ...) {
  # The formula is recovered from the call recorded on the fitted object.
  model_formula <- as.formula(object$call[[2]])
  design <- model.matrix(model_formula, newdata)
  beta <- coef(object, id = id)
  # Keep only the columns that belong to the selected model.
  design[, names(beta)] %*% beta
}
# k-fold cross-validation: assign every row a random fold label, then for each
# fold fit the subset models on the other folds and record the held-out mean
# squared error for each model size (rows = folds, columns = sizes 1 to 19).
k <- 10
set.seed(1)
folds <- sample(1:k, nrow(Hitters), replace = TRUE)
cv.errors <- matrix(
  NA, k, 19,
  dimnames = list(paste(1:k), paste(1:19))
)
for (j in 1:k) {
  in_fold <- folds == j
  held_out <- Hitters[in_fold, ]
  best.fit <- regsubsets(Salary ~ ., data = Hitters[!in_fold, ], nvmax = 19)
  for (i in 1:19) {
    pred <- predict(best.fit, held_out, id = i)
    cv.errors[j, i] <- mean((held_out$Salary - pred)^2)
  }
}
# Average over folds to get one error per model size.
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
##        1        2        3        4        5        6        7        8
## 160093.5 140196.8 153117.0 151159.3 146841.3 138302.6 144346.2 130207.7
##        9       10       11       12       13       14       15       16
## 129459.6 125334.7 125153.8 128273.5 133461.0 133974.6 131825.7 131882.8
##       17       18       19
## 132750.9 133096.2 132804.7
14
125000 135000 145000 155000
mean.cv.errors
# Plot the averaged cross-validation errors as points joined by lines.
plot(mean.cv.errors, type = "b")
10
15
Index
# Fit on every row of Hitters and read off the eleven-variable model.
reg.best <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
coef(reg.best, id = 11)
##  (Intercept)        AtBat         Hits        Walks       CAtBat
##  135.7512195   -2.1277482    6.9236994    5.6202755   -0.1389914
##        CRuns         CRBI       CWalks      LeagueN    DivisionW
##    1.4553310    0.7852528   -0.8228559   43.1116152 -111.1460252
##      PutOuts      Assists
##    0.2894087    0.2688277
Reference:
James, Gareth, et al. An Introduction to Statistical Learning. New
York: Springer, 2013.
15