38  R11: Ridge and Lasso

Author

Brian Powers

Published

November 29, 2023

38.1 K-Fold Cross-Validation for Ridge Regression

kfoldCV.ridge <- function(K, lambdas, dataset, responseVar){
  m <- length(lambdas)
  
  #idx is a shuffled vector of row numbers
  idx <- sample(1:nrow(dataset))
  #folds partitions the row indices
  folds <- split(idx, as.factor(1:K))

  #a data frame to store the validation error for each fold/model combination
  results <- data.frame(fold = rep(1:K, rep(m,K)),
                        model = rep(1:m, K),
                        error = 0)    
  for(k in 1:K){
    #split the data into training and testing sets
    training <- dataset[-folds[[k]],]
    testing <- dataset[folds[[k]],]
    #fit a ridge model on the training set for every lambda at once
    ridge_models <- lm.ridge(reformulate(".", responseVar), training, lambda=lambdas);

    #go through each model and estimate MSE on the testing data
    for(f in 1:m){
      coeff <- coef(ridge_models)[f,]
      
      Y <- testing[,c(responseVar)] 
      X <- cbind( 1, testing[,names(dataset) != responseVar])

      Y.hat <- as.numeric(coeff) %*% as.matrix(t(X))
            
      #calculate the average squared error on the testing data
      results[results$fold == k & results$model == f, "error"] <- mean((Y-Y.hat)^2)
    }
  }
  #aggregate over each model, averaging the error
  aggregated <- aggregate(error~model, data=results, FUN="mean")
  #produces a simple line & dot plot
  plot(error ~ sqrt(lambdas), type="b", data=aggregated, ylab="MSE")
#  lines(error ~ model, data=aggregated)
  #report the index of the best model and the corresponding lambda
  best <- which.min(aggregated$error)
  print(best)
  print(lambdas[best])
  return(aggregated)
}

38.2 Ridge Regression

library(MASS);
Warning: package 'MASS' was built under R version 4.2.3
lambda_vals <- seq(0,10,1)^2; # Choose lambdas to try.
# lm.ridge needs:
# 1) a model (mpg~. says to model mpg as an intercept
#         plus a coefficient for every other variable in the data frame)
# 2) a data set (mtcars, of course)
# 3) a value for lambda. lambda=0 is the default,
#         and recovers classic linear regression.
#         But we can also pass a whole vector of lambdas, like we are about to do,
#         and lm.ridge will fit a separate model for each.
# See ?lm.ridge for details.
ridge_models <- lm.ridge(mpg~., mtcars, lambda=lambda_vals);

# Naively plotting this object shows us how the different coefficients
# change as lambda changes.
plot( ridge_models );

kfoldCV.ridge(32, lambda_vals, mtcars, "mpg")

[1] 5
[1] 16
   model     error
1      1 12.181558
2      2  9.452096
3      3  7.936502
4      4  7.393522
5      5  7.283203
6      6  7.441021
7      7  7.809000
8      8  8.361948
9      9  9.080131
10    10  9.940278
11    11 10.914771
ridge_models$coef
              0          1          4          9         16         25
cyl  -0.1958895 -0.2854708 -0.5066139 -0.6161717 -0.6591445 -0.6683601
disp  1.6267230  0.2846046 -0.3725236 -0.5766167 -0.6490976 -0.6685263
hp   -1.4496794 -1.0078499 -0.8528542 -0.8129980 -0.7775318 -0.7375242
drat  0.4142235  0.4865947  0.5232729  0.5453579  0.5549330  0.5519234
wt   -3.5780149 -2.3702010 -1.6545051 -1.3466497 -1.1634754 -1.0318544
qsec  1.4440470  0.8663632  0.4563819  0.3199939  0.2816562  0.2727392
vs    0.1576353  0.1858566  0.2665591  0.3379935  0.3882118  0.4182555
am    1.2377648  1.1337179  0.9930509  0.8779217  0.7836540  0.7053996
gear  0.4759507  0.4979561  0.4415798  0.4078556  0.3938022  0.3845387
carb -0.3170292 -0.9153711 -1.0545303 -0.9681697 -0.8536815 -0.7501713
             36         49         64         81        100
cyl  -0.6584602 -0.6371174 -0.6089930 -0.5771490 -0.5436589
disp -0.6618424 -0.6410597 -0.6125306 -0.5799961 -0.5457984
hp   -0.6949141 -0.6515838 -0.6087390 -0.5671623 -0.5273766
drat  0.5389018  0.5188717  0.4944121  0.4675133  0.4396305
wt   -0.9277904 -0.8409367 -0.7659920 -0.6999756 -0.6410990
qsec  0.2700141  0.2662930  0.2599270  0.2510841  0.2404236
vs    0.4315668  0.4322241  0.4239420  0.4097144  0.3918058
am    0.6390688  0.5816137  0.5309703  0.4857926  0.4451943
gear  0.3737460  0.3599879  0.3436658  0.3256615  0.3068348
carb -0.6631350 -0.5906396 -0.5296824 -0.4777057 -0.4327987
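
The cross-validation above picks the 5th candidate, lambda = 16. As a minimal follow-up sketch (the refit and the object name best.ridge are additions for illustration, not part of the original workflow), we can fit the ridge model once at that lambda and read the coefficients back on the original scale with coef():

#refit at the CV-selected lambda and inspect the coefficients
best.ridge <- lm.ridge(mpg ~ ., mtcars, lambda = 16)
coef(best.ridge)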

38.3 Ridge Regression With Standardized Data

One problem we will encounter with Ridge regression (and LASSO) is that the penalty treats a large coefficient as evidence that the predictor is important. This is not necessarily true: a coefficient can be large simply because of the units the variable is measured in.

Consider a model of mpg based on weight in pounds.

lm(mpg~ I(1000*wt), data=mtcars)

Call:
lm(formula = mpg ~ I(1000 * wt), data = mtcars)

Coefficients:
 (Intercept)  I(1000 * wt)  
   37.285126     -0.005344  

When weight is measured in pounds (multiplying wt by 1000), the coefficient is about -0.005. But if we leave wt in thousands of pounds:

lm(mpg~ wt, data=mtcars)

Call:
lm(formula = mpg ~ wt, data = mtcars)

Coefficients:
(Intercept)           wt  
     37.285       -5.344  

The coefficient is 1000 times as large! For this reason it is a good idea to standardize your data before applying a shrinkage method. Standardizing a column means subtracting its mean and dividing by its standard deviation: \[ Z_{i,j} = \frac{X_{i,j} - \bar{X}_j}{S_{j}}\] There is no need to standardize the response variable.

mtcars.std <- mtcars
#let's standardize the quantitative predictors
stdcols <- c("cyl","disp","hp","drat","wt","qsec","gear","carb")
for(col in stdcols){
  xbar <- mean(mtcars.std[,col])
  s <- sd(mtcars.std[,col])
  mtcars.std[,col] <- (mtcars.std[,col]-xbar)/s
}
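
Base R's scale() does the same centering and rescaling in one step. A minimal equivalent sketch (mtcars.std2 is just an illustrative name, not used elsewhere):

#scale() centers and rescales each selected column; the result matches the loop above
mtcars.std2 <- mtcars
mtcars.std2[, stdcols] <- scale(mtcars[, stdcols])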

lambda_vals <- seq(0,10,1)^2; # Choose lambdas to try.
ridge_models <- lm.ridge(mpg~., mtcars.std, lambda=lambda_vals);
plot( ridge_models );

kfoldCV.ridge(32, lambda_vals, mtcars.std, "mpg")

[1] 5
[1] 16
   model     error
1      1 12.181558
2      2  9.452096
3      3  7.936502
4      4  7.393522
5      5  7.283203
6      6  7.441021
7      7  7.809000
8      8  8.361948
9      9  9.080131
10    10  9.940278
11    11 10.914771

38.4 LASSO Regression

library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1-8
kfoldCV.LASSO <- function(K, lambdas, dataset, responseVar){
  m <- length(lambdas)
  
  #idx is a shuffled vector of row numbers
  idx <- sample(1:nrow(dataset))
  #folds partitions the row indices
  folds <- split(idx, as.factor(1:K))

  #a data frame to store the validation error for each fold/model combination
  results <- data.frame(fold = rep(1:K, rep(m,K)),
                        model = rep(1:m, K),
                        error = 0)    
  for(k in 1:K){
    #split the data into training and testing sets
    training <- dataset[-folds[[k]],]
    testing <- dataset[folds[[k]],]
    #go through each model (one glmnet fit per lambda) and estimate MSE
    for(f in 1:m){
      #fit the LASSO on the training data at this lambda
      mtc_lasso_lambda <- glmnet(training[,names(dataset) != responseVar], training[,c(responseVar)], alpha = 1, lambda=lambdas[f]);
      coeffs <- as.vector(coef(mtc_lasso_lambda))
      y.mtc.predict <- coeffs %*% t(cbind(1,testing[,names(dataset) != responseVar]))

      #calculate the average squared error on the testing data
      results[results$fold == k & results$model == f, "error"] <- mean((y.mtc.predict-testing[,c(responseVar)])^2)
    }
  }
  #aggregate over each model, averaging the error
  aggregated <- aggregate(error~model, data=results, FUN="mean")
  #produces a simple line & dot plot
  plot(error ~ lambdas, type="b", data=aggregated, ylab="MSE")
#  print(which(aggregated$error == min(aggregated$error)))
  print(paste("best lambdas:", paste(lambdas[which(aggregated$error == min(aggregated$error))], collapse=",")))
  return(aggregated)
}

#LASSO mpg on mtcars

lambda_vals <- c(0,.1,.2,.3,.5,.7, 1, 1.5, 2, 2.5, 3, 3.5, 4, 5)
kfoldCV.LASSO(32, lambda_vals, mtcars.std, "mpg")

[1] "best lambdas: 0.7"
   model     error
1      1 12.139522
2      2  9.401635
3      3  9.208783
4      4  8.945935
5      5  8.314812
6      6  8.013806
7      7  8.357635
8      8  9.960995
9      9 12.241522
10    10 14.956380
11    11 18.398062
12    12 22.475571
13    13 27.129122
14    14 36.870189
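
glmnet also ships a built-in K-fold search, cv.glmnet, which can serve as a sanity check on the hand-rolled function above. A minimal sketch (assuming the same standardized data and lambda grid; cv.fit is an illustrative name):

#let cv.glmnet choose lambda by 10-fold CV over the same candidate values
cv.fit <- cv.glmnet(as.matrix(mtcars.std[, -1]), mtcars.std$mpg,
                    alpha = 1, lambda = lambda_vals, nfolds = 10)
cv.fit$lambda.min   #lambda with the smallest cross-validated MSE
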
LASSO.fits <- glmnet(mtcars[,-1], mtcars[,1], alpha=1, lambda=lambda_vals)
plot(LASSO.fits, label=TRUE, xvar="lambda")

LASSO_coeff<- t(coef(LASSO.fits))
#colnames(LASSO_coeff) <- c("intercept",names(mtcars)[-1])
LASSO_coeff
14 x 11 sparse Matrix of class "dgCMatrix"
  [[ suppressing 11 column names '(Intercept)', 'cyl', 'disp' ... ]]
                                                                     
s0  20.58164  .         .           .           .          -0.1526208
s1  24.29182 -0.2320135 .           .           .          -0.8596190
s2  26.21643 -0.3916771 .           .           .          -1.1507654
s3  28.14090 -0.5512556 .           .           .          -1.4420334
s4  30.06537 -0.7108337 .           .           .          -1.7333021
s5  31.87369 -0.8000158 .          -0.002226427 .          -2.0223414
s6  33.59349 -0.8355844 .          -0.006175353 .          -2.3084431
s7  35.31356 -0.8713207 .          -0.010120873 .          -2.5944626
s8  36.34612 -0.8930789 .          -0.012481712 .          -2.7659212
s9  35.87185 -0.8564098 .          -0.014070252 0.07915497 -2.6716570
s10 32.27520 -0.7185199 .          -0.014111007 0.40765131 -2.5704279
s11 26.92701 -0.5030523 .          -0.013211244 0.60633782 -2.6290418
s12 20.12195 -0.2164116 .          -0.013112947 0.77468208 -2.6356369
s13 12.32700 -0.1194297 0.01369736 -0.021693022 0.78186836 -3.7504495
                                                         
s0  .          .           .         .          .        
s1  .          .           .         .          .        
s2  .          .           .         .          .        
s3  .          .           .         .          .        
s4  .          .           .         .          .        
s5  .          .           .         .          .        
s6  .          .           .         .          .        
s7  .          .           .         .          .        
s8  .          .           .         .          .        
s9  .          .           0.4869054 .         -0.1085726
s10 0.08195105 0.003182207 1.1364003 .         -0.2812566
s11 0.26537365 0.068664832 1.6929202 .         -0.3422762
s12 0.45898572 0.123968843 2.1142688 0.3046746 -0.4637967
s13 0.82638838 0.316450552 2.5192826 0.6476344 -0.1854400

38.5 Boston Dataset Example

data("Boston")
library(glmnet)

#Standardize Variables
Boston.std <- Boston
for(i in 1:(ncol(Boston.std)-1)){
  Boston.std[,i] <- (Boston.std[,i]-mean(Boston.std[,i]))/sd(Boston.std[,i])
}

lambda_vals <- seq(0,5,length.out=10)^2; # Choose lambdas to try.

ridge_models <- lm.ridge(medv ~ . , Boston.std, lambda=lambda_vals);
plot( ridge_models );

kfoldCV.ridge(4, lambda_vals, Boston.std, "medv")

[1] 5
[1] 4.938272
   model    error
1      1 23.76252
2      2 23.75683
3      3 23.74260
4      4 23.72693
5      5 23.71822
6      6 23.72360
7      7 23.74769
8      8 23.79281
9      9 23.85964
10    10 23.94792
#LASSO 
lambda_vals <- seq(0,.05,length.out=10)

LASSO.fits <- glmnet(Boston.std[,names(Boston.std)!="medv"], Boston[,"medv"], alpha=1, lambda=lambda_vals)
plot(LASSO.fits, label=TRUE, xvar="lambda")

kfoldCV.LASSO(10, lambda_vals, Boston.std, "medv")
Warning in split.default(idx, as.factor(1:K)): data length is not a multiple of
split variable

[1] "best lambdas: 0.0277777777777778"
   model    error
1      1 24.46258
2      2 24.44899
3      3 24.43303
4      4 24.41995
5      5 24.41240
6      6 24.41130
7      7 24.41413
8      8 24.42939
9      9 24.44388
10    10 24.46682
resultsTable <- t(as.matrix(coef(LASSO.fits)))
row.names(resultsTable) <- sort(lambda_vals,decreasing=TRUE)
round(resultsTable,3)
                    (Intercept)   crim    zn indus  chas    nox    rm   age
0.05                     22.533 -0.783 0.891 0.000 0.674 -1.799 2.747 0.000
0.0444444444444444       22.533 -0.799 0.907 0.000 0.677 -1.814 2.744 0.000
0.0388888888888889       22.533 -0.816 0.927 0.000 0.678 -1.838 2.735 0.000
0.0333333333333333       22.533 -0.832 0.947 0.000 0.680 -1.863 2.726 0.000
0.0277777777777778       22.533 -0.849 0.967 0.000 0.682 -1.887 2.718 0.000
0.0222222222222222       22.533 -0.865 0.987 0.000 0.684 -1.911 2.708 0.000
0.0166666666666667       22.533 -0.882 1.007 0.000 0.686 -1.935 2.700 0.000
0.0111111111111111       22.533 -0.898 1.029 0.031 0.686 -1.968 2.693 0.000
0.00555555555555556      22.533 -0.913 1.053 0.084 0.684 -2.006 2.687 0.000
0                        22.533 -0.928 1.078 0.138 0.683 -2.049 2.680 0.017
                       dis   rad    tax ptratio black  lstat
0.05                -2.789 1.914 -1.424  -1.988 0.806 -3.732
0.0444444444444444  -2.818 1.971 -1.471  -1.992 0.810 -3.730
0.0388888888888889  -2.858 2.052 -1.537  -2.000 0.815 -3.730
0.0333333333333333  -2.898 2.131 -1.601  -2.007 0.820 -3.730
0.0277777777777778  -2.937 2.207 -1.662  -2.013 0.824 -3.730
0.0222222222222222  -2.978 2.287 -1.727  -2.020 0.829 -3.731
0.0166666666666667  -3.017 2.363 -1.788  -2.027 0.834 -3.730
0.0111111111111111  -3.050 2.456 -1.874  -2.037 0.839 -3.733
0.00555555555555556 -3.079 2.556 -1.974  -2.049 0.845 -3.737
0                   -3.100 2.658 -2.074  -2.061 0.851 -3.745
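
As a closing sketch (best.lambda and boston.lasso are illustrative names; 0.0278 is the CV winner reported above, rounded), refitting once at the selected lambda shows which predictors LASSO drops entirely; consistent with the table, indus and age are zero there.

#single refit at the CV-chosen lambda; indus and age are shrunk to zero
best.lambda <- 0.0278
boston.lasso <- glmnet(as.matrix(Boston.std[, names(Boston.std) != "medv"]),
                       Boston.std$medv, alpha = 1, lambda = best.lambda)
coef(boston.lasso)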