uscrime <- read.csv("./data 11.1/uscrime.txt",sep = "")
full<-lm(Crime~., data=uscrime) # full model
null<-lm(Crime~1,data=uscrime) # null model
combo<-step(null, direction='both',scope=formula(full)) # stepwise regression
## Start: AIC=561.02
## Crime ~ 1
##
## Df Sum of Sq RSS AIC
## + Po1 1 3253302 3627626 532.94
## + Po2 1 3058626 3822302 535.39
## + Wealth 1 1340152 5540775 552.84
## + Prob 1 1257075 5623853 553.54
## + Pop 1 783660 6097267 557.34
## + Ed 1 717146 6163781 557.85
## + M.F 1 314867 6566061 560.82
## <none> 6880928 561.02
## + LF 1 245446 6635482 561.32
## + Ineq 1 220530 6660397 561.49
## + U2 1 216354 6664573 561.52
## + Time 1 154545 6726383 561.96
## + So 1 56527 6824400 562.64
## + M 1 55084 6825844 562.65
## + U1 1 17533 6863395 562.90
## + NW 1 7312 6873615 562.97
##
## Step: AIC=532.94
## Crime ~ Po1
##
## Df Sum of Sq RSS AIC
## + Ineq 1 739819 2887807 524.22
## + M 1 616741 3010885 526.18
## + M.F 1 250522 3377104 531.57
## + NW 1 232434 3395192 531.82
## + So 1 219098 3408528 532.01
## + Wealth 1 180872 3446754 532.53
## <none> 3627626 532.94
## + Po2 1 146167 3481459 533.00
## + Prob 1 92278 3535348 533.72
## + LF 1 77479 3550147 533.92
## + Time 1 43185 3584441 534.37
## + U2 1 17848 3609778 534.70
## + Pop 1 5666 3621959 534.86
## + U1 1 2878 3624748 534.90
## + Ed 1 767 3626859 534.93
## - Po1 1 3253302 6880928 561.02
##
## Step: AIC=524.22
## Crime ~ Po1 + Ineq
##
## Df Sum of Sq RSS AIC
## + Ed 1 587050 2300757 515.53
## + M.F 1 454545 2433262 518.17
## + Prob 1 280690 2607117 521.41
## + LF 1 260571 2627236 521.77
## + Wealth 1 213937 2673871 522.60
## + M 1 181236 2706571 523.17
## + Pop 1 130377 2757430 524.04
## <none> 2887807 524.22
## + NW 1 36439 2851369 525.62
## + So 1 33738 2854069 525.66
## + Po2 1 30673 2857134 525.71
## + U1 1 2309 2885498 526.18
## + Time 1 497 2887310 526.21
## + U2 1 253 2887554 526.21
## - Ineq 1 739819 3627626 532.94
## - Po1 1 3772590 6660397 561.49
##
## Step: AIC=515.53
## Crime ~ Po1 + Ineq + Ed
##
## Df Sum of Sq RSS AIC
## + M 1 239405 2061353 512.37
## + Prob 1 234981 2065776 512.47
## + M.F 1 117026 2183731 515.08
## <none> 2300757 515.53
## + Wealth 1 79540 2221218 515.88
## + U2 1 62112 2238646 516.25
## + Time 1 61770 2238987 516.26
## + Po2 1 42584 2258174 516.66
## + Pop 1 39319 2261438 516.72
## + U1 1 7365 2293392 517.38
## + LF 1 7254 2293503 517.39
## + NW 1 4210 2296547 517.45
## + So 1 4135 2296622 517.45
## - Ed 1 587050 2887807 524.22
## - Ineq 1 1326101 3626859 534.93
## - Po1 1 3782666 6083423 559.23
##
## Step: AIC=512.37
## Crime ~ Po1 + Ineq + Ed + M
##
## Df Sum of Sq RSS AIC
## + Prob 1 258063 1803290 508.08
## + U2 1 200988 1860365 509.55
## + Wealth 1 163378 1897975 510.49
## <none> 2061353 512.37
## + M.F 1 74398 1986955 512.64
## + U1 1 50835 2010518 513.20
## + Po2 1 45392 2015961 513.32
## + Time 1 42746 2018607 513.39
## + NW 1 16488 2044865 513.99
## + Pop 1 8101 2053251 514.19
## + So 1 3189 2058164 514.30
## + LF 1 2988 2058365 514.30
## - M 1 239405 2300757 515.53
## - Ed 1 645219 2706571 523.17
## - Ineq 1 864671 2926024 526.83
## - Po1 1 4000849 6062202 561.07
##
## Step: AIC=508.08
## Crime ~ Po1 + Ineq + Ed + M + Prob
##
## Df Sum of Sq RSS AIC
## + U2 1 192233 1611057 504.79
## + Wealth 1 86490 1716801 507.77
## + M.F 1 84509 1718781 507.83
## <none> 1803290 508.08
## + U1 1 52313 1750977 508.70
## + Pop 1 47719 1755571 508.82
## + Po2 1 37967 1765323 509.08
## + So 1 21971 1781320 509.51
## + Time 1 10194 1793096 509.82
## + LF 1 990 1802301 510.06
## + NW 1 797 1802493 510.06
## - Prob 1 258063 2061353 512.37
## - M 1 262486 2065776 512.47
## - Ed 1 598315 2401605 519.55
## - Ineq 1 968199 2771489 526.28
## - Po1 1 3268577 5071868 554.69
##
## Step: AIC=504.79
## Crime ~ Po1 + Ineq + Ed + M + Prob + U2
##
## Df Sum of Sq RSS AIC
## <none> 1611057 504.79
## + Wealth 1 59910 1551147 505.00
## + U1 1 54830 1556227 505.16
## + Pop 1 51320 1559737 505.26
## + M.F 1 30945 1580112 505.87
## + Po2 1 25017 1586040 506.05
## + So 1 17958 1593098 506.26
## + LF 1 13179 1597878 506.40
## + Time 1 7159 1603898 506.58
## + NW 1 359 1610698 506.78
## - U2 1 192233 1803290 508.08
## - Prob 1 249308 1860365 509.55
## - M 1 400611 2011667 513.22
## - Ed 1 776207 2387264 521.27
## - Ineq 1 949221 2560278 524.56
## - Po1 1 2817067 4428124 550.31
So stepwise regression leaves us with a six-variable model (Po1, Ineq, Ed, M, Prob, U2). Its coefficients, p-values, and other summary output are shown below.
summary(lm(Crime~Po1+Ineq+Ed+M+Prob+U2, data=uscrime))
##
## Call:
## lm(formula = Crime ~ Po1 + Ineq + Ed + M + Prob + U2, data = uscrime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -470.68 -78.41 -19.68 133.12 556.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5040.50 899.84 -5.602 1.72e-06 ***
## Po1 115.02 13.75 8.363 2.56e-10 ***
## Ineq 67.65 13.94 4.855 1.88e-05 ***
## Ed 196.47 44.75 4.390 8.07e-05 ***
## M 105.02 33.30 3.154 0.00305 **
## Prob -3801.84 1528.10 -2.488 0.01711 *
## U2 89.37 40.91 2.185 0.03483 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 200.7 on 40 degrees of freedom
## Multiple R-squared: 0.7659, Adjusted R-squared: 0.7307
## F-statistic: 21.81 on 6 and 40 DF, p-value: 3.418e-11
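A note on the AIC values in the step trace: for lm fits, step() ranks models with extractAIC(), which computes $n\log(\mathrm{RSS}/n) + 2k$ and drops additive constants, so it will not match AIC() exactly. As a quick sanity check (a one-line sketch), we can reproduce the final trace value of 504.79:
extractAIC(combo) # returns c(edf, AIC); here 47*log(1611057/47) + 2*7 = 504.79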
Since lasso and elastic net both use the glmnet package, I will fit them together and include ridge as well for completeness. In glmnet, alpha = 1 gives the lasso, alpha = 0 gives ridge, and any value in between (0 < alpha < 1) gives an elastic net.
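Concretely, for a quantitative response glmnet minimizes

$$\frac{1}{2n}\sum_{i=1}^{n}\left(y_i-\beta_0-x_i^{\top}\beta\right)^2+\lambda\left[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2+\alpha\lVert\beta\rVert_1\right],$$

so alpha = 1 leaves only the L1 (lasso) penalty, alpha = 0 leaves only the squared-L2 (ridge) penalty, and intermediate values blend the two.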
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.1.3
# prepare data in the matrix format and partition into predictor and response variables
X <- as.matrix(uscrime[,1:15])
y <- uscrime$Crime
# split data into 80% training and 20% test sets
set.seed(1)
train <- sample(nrow(uscrime), floor(nrow(uscrime) * 0.8)) # indices of the randomly chosen training observations (floor makes the truncation to 37 rows explicit)
test <- (-train) # negative indices select every observation not in the training set
lasso <- glmnet(X[train,],y[train],alpha=1) # standardize = TRUE by default
ridge <- glmnet(X[train,],y[train],alpha=0)
elastic <- glmnet(X[train,],y[train],alpha=0.5) # alpha = 0.5 for initial exploration; the best alpha is tuned by cross-validation below
# plot solution paths
par(mfrow=c(1,3))
plot(lasso, xvar = "lambda", main='lasso')
plot(ridge, xvar = "lambda", main='ridge')
plot(elastic, xvar = "lambda", main='elastic net')
The solution paths show that as lambda (the penalty on the coefficients) increases, the coefficients shrink and the number of nonzero coefficients falls (except under ridge, which never forces a coefficient exactly to 0).
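We can also confirm this numerically: a fitted glmnet object stores the number of nonzero coefficients at each lambda in its df component. A minimal sketch (note lambdas are stored largest-first):
path_df <- data.frame(lambda = lasso$lambda, nonzero = lasso$df) # lasso path
head(path_df) # largest penalties: few or no nonzero coefficients
tail(path_df) # smallest penalties: most coefficients nonzero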
To find the best lambda values for each model and the best alpha value for the elastic model, we perform cross validation using MSE as the metric.
# randomly assign each training observation to a fold; fixing foldid keeps the fold assignments identical across alpha values, so the CV errors are directly comparable
# the data set is small (47 rows, of which the training set is 80%), so use 5-fold CV instead of the default 10-fold
set.seed(1)
foldid <- sample(1:5, size = length(y[train]), replace = TRUE)
for (i in 0:10) { # loop over alpha = 0, 0.1, 0.2,..., 0.9, 1
nam = paste("cv", i, sep="")
assign(nam, cv.glmnet(X[train,], y[train], foldid=foldid, alpha=i/10))
}
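As a stylistic aside (the code below keeps using cv0, ..., cv10): the same loop can avoid assign()/get() by collecting the fits in a named list, which is easier to iterate over.
# equivalent loop, storing the 11 CV fits in a list instead of the global environment
cv_fits <- lapply(0:10, function(i) cv.glmnet(X[train,], y[train], foldid=foldid, alpha=i/10))
names(cv_fits) <- paste0("cv", 0:10) # e.g. cv_fits$cv10 is the lasso fit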
# plot CV MSE as a function of log(lambda)
par(mfrow=c(1,3))
plot(cv10, main='lasso')
plot(cv0, main='ridge')
plot(cv5, main='elastic net') # using alpha = 0.5 for elastic net just for plotting purposes
We can see that the plots for lasso and elastic net (alpha = 0.5) are similar. The two vertical dotted lines in each plot mark the lambda that gives the minimum MSE (lambda.min) and the largest lambda whose MSE is within one standard error of that minimum (the 1-SE rule, lambda.1se). If we want the lowest possible error when predicting new data, we choose lambda.min; if we are willing to trade some predictive power for a simpler model, we choose lambda.1se. To keep the comparison fair across all models (ridge does no variable selection), we will use lambda.min. For lasso (alpha=1) the minimum MSE is obtained at lambda = 5.9213329, and for ridge (alpha=0) at lambda = 16.8641684. For elastic net, we still need to choose alpha and lambda jointly from a grid.
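Had we preferred the sparser 1-SE model, its coefficients are one call away; a quick sketch using the lasso CV fit:
coef(cv10, s = "lambda.1se") # typically sparser than the model at lambda.min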
plot(log(cv1$lambda), cv1$cvm, pch = 19, col = "red",
xlab = "log(Lambda)", ylab = cv1$name, ylim = c(50000,100000))
points(log(cv2$lambda), cv2$cvm, pch = 19, col = "orange")
points(log(cv3$lambda), cv3$cvm, pch = 19, col = "yellow")
points(log(cv4$lambda), cv4$cvm, pch = 19, col = "green")
points(log(cv5$lambda), cv5$cvm, pch = 19, col = "cyan")
points(log(cv6$lambda), cv6$cvm, pch = 19, col = "blue")
points(log(cv7$lambda), cv7$cvm, pch = 19, col = "purple")
points(log(cv8$lambda), cv8$cvm, pch = 19, col = "grey")
points(log(cv9$lambda), cv9$cvm, pch = 19, col = "black")
lines(log(cv0$lambda), cv0$cvm, lwd=5, col = "darkolivegreen")
lines(log(cv10$lambda), cv10$cvm, lwd=5, col = "deeppink")
legend("topleft",legend=c("alpha=0.1","alpha=0.2","alpha=0.3","alpha=0.4","alpha=0.5","alpha=0.6","alpha=0.7","alpha=0.8","alpha=0.9","alpha=0(ridge)","alpha=1(lasso)"),pch=19,col=c("red","orange","yellow","green","cyan","blue","purple","grey","black","darkolivegreen","deeppink"))
The plot above shows the CV MSE across all alpha and lambda values (we compare alpha values from 0.1 to 0.9 for elastic net; ridge and lasso are included for completeness). Among the elastic-net candidates, the black points (alpha=0.9) achieve the lowest MSE. Below I compile these numbers into a table for a clearer comparison.
cv_table<-data.frame(alpha = seq(0,1,0.1), lambda.min=NA, MSE = NA) # construct the data frame
for (i in 1:nrow(cv_table)){ # loop over each CV result, assign minimum lambda and MSE values to the corresponding position in the data frame
data<-get(paste("cv", i-1, sep=""))
cv_table[i,2]<-data$lambda.min
cv_table[i,3]<-min(data$cvm)
}
knitr::kable(cv_table)
| alpha | lambda.min |      MSE |
|------:|-----------:|---------:|
|   0.0 |  16.864168 | 60143.57 |
|   0.1 |  14.667594 | 58533.83 |
|   0.2 |  14.065564 | 56648.41 |
|   0.3 |  11.294681 | 55383.16 |
|   0.4 |  10.203363 | 54408.71 |
|   0.5 |   8.958542 | 53500.42 |
|   0.6 |   8.193324 | 52713.39 |
|   0.7 |   7.707569 | 52148.28 |
|   0.8 |   6.744122 | 51691.27 |
|   0.9 |   6.579259 | 51301.22 |
|   1.0 |   5.921333 | 50984.63 |
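Rather than reading the table by eye, the winning elastic-net row can be pulled out programmatically (restricting to 0 < alpha < 1, since ridge and lasso are treated as separate models):
en <- cv_table[cv_table$alpha > 0 & cv_table$alpha < 1, ] # elastic-net rows only
en[which.min(en$MSE), ] # alpha = 0.9, lambda.min = 6.579259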
Consistent with our visual observations, the best elastic net model is obtained with alpha = 0.9, lambda = 6.5792588. Now we use the held-out test set to obtain unbiased estimates of each model's out-of-sample MSE.
lasso_pred<-predict(lasso,s=cv10$lambda.min,newx=X[test,])
lasso_mse<-mean((lasso_pred-y[test])^2) # test set MSE
ridge_pred<-predict(ridge,s=cv0$lambda.min,newx=X[test,])
ridge_mse<-mean((ridge_pred-y[test])^2) # test set MSE
elastic <- glmnet(X[train,],y[train],alpha=0.9) # refit the elastic net model using the best alpha
elastic_pred<-predict(elastic, s=cv9$lambda.min,newx=X[test,])
elastic_mse<-mean((elastic_pred-y[test])^2) # test set MSE
test_table<-data.frame(Model=c("lasso","ridge","elastic net (alpha=0.9)"), MSE=c(lasso_mse,ridge_mse,elastic_mse))
knitr::kable(test_table)
| Model                   |      MSE |
|:------------------------|---------:|
| lasso                   | 112967.7 |
| ridge                   | 122433.4 |
| elastic net (alpha=0.9) | 111940.6 |
All three models perform comparably on the test set. The test MSEs are noticeably higher than the CV MSEs; some gap is expected, and with only about ten test observations these estimates are also quite noisy.
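For context, these errors can be benchmarked against a naive baseline; a quick sketch that predicts the training-set mean for every test observation:
naive_mse <- mean((mean(y[train]) - y[test])^2) # any model worth keeping should beat this comfortably
naive_mse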
Finally, we refit our models on the full data set using the values of lambda and alpha chosen by CV, and examine the coefficient estimates.
lasso_final<-glmnet(X,y,alpha=1)
coef(lasso_final, s = cv10$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -5.544638e+03
## M 7.798691e+01
## So 3.804884e+01
## Ed 1.419571e+02
## Po1 1.005304e+02
## Po2 .
## LF .
## M.F 1.861589e+01
## Pop -2.069447e-01
## NW 1.020458e+00
## U1 -3.118374e+03
## U2 1.145867e+02
## Wealth 3.265464e-02
## Ineq 5.591344e+01
## Prob -3.794745e+03
## Time .
ridge_final<-glmnet(X,y,alpha=0)
coef(ridge_final, s = cv0$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -5.579394e+03
## M 7.065651e+01
## So 7.950176e+01
## Ed 1.149492e+02
## Po1 5.508492e+01
## Po2 3.772236e+01
## LF 4.177259e+02
## M.F 2.280885e+01
## Pop -1.826204e-01
## NW 2.824573e+00
## U1 -3.335806e+03
## U2 1.188814e+02
## Wealth 4.545313e-02
## Ineq 4.394975e+01
## Prob -4.029724e+03
## Time 2.821382e-01
elastic_final<-glmnet(X,y,alpha=0.9)
coef(elastic_final, s = cv9$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -5.528557e+03
## M 7.728112e+01
## So 3.941912e+01
## Ed 1.404083e+02
## Po1 9.981025e+01
## Po2 .
## LF .
## M.F 1.898726e+01
## Pop -1.787778e-01
## NW 1.116745e+00
## U1 -3.112638e+03
## U2 1.137022e+02
## Wealth 3.219933e-02
## Ineq 5.500634e+01
## Prob -3.802170e+03
## Time .
As expected, ridge shrinks coefficients but zeroes out none of them. Lasso and elastic net (alpha=0.9) produce similar coefficient estimates, and both drop the same three variables (Po2, LF, and Time).
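To see this at a glance, the three coefficient vectors can be placed side by side (a sketch; cbind works on the sparse matrices that coef() returns, since glmnet attaches the Matrix package):
coef_compare <- cbind(coef(lasso_final, s = cv10$lambda.min),
                      coef(ridge_final, s = cv0$lambda.min),
                      coef(elastic_final, s = cv9$lambda.min))
colnames(coef_compare) <- c("lasso", "ridge", "elastic") # label the columns
coef_compare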