36  Bias Variance Tradeoff

Author

Brian Powers

Published

April 25, 2024

36.1 Bias and Variance of a Linear Model

We will sample from a population, fit a linear model, and then predict 1000 out-of-sample data points. We will repeat this process many times, so that in the end we have 1000 model fits, each making predictions for the same 1000 points.

The true model is

\[Y_i = 10 + 2X_i + \epsilon_i, \text{ where }\epsilon_i\sim N(0,2^2)\]

A simple function will generate as much data as we need, so that we can estimate the out-of-sample error of the model precisely.

generate_data <- function(n){
  X <- rnorm(n, 5, 4)      # predictor: X ~ N(5, 4^2)
  eps <- rnorm(n, 0, 2)    # noise: epsilon ~ N(0, 2^2)
  Y <- 10 + 2*X + eps      # true regression function plus noise
  return(data.frame(X,Y))
}
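As a quick sanity check, fitting the model to one very large sample from this generator should recover the true intercept and slope almost exactly (the sample size of 1e5 below is arbitrary):

set.seed(1)
big <- generate_data(1e5)        # a single very large sample
coef(lm(Y ~ X, data = big))      # should be close to the true values (10, 2)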

The question is this: what is the expected model error of this linear model, fit to a sample of size 32, for this population? We'll start by generating 1000 out-of-sample data points, which we can take as representative of the population of out-of-sample data.
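To be precise about the target, the expected model error here is the expected squared prediction error, averaging over the randomness of the training sample used to fit the model as well as over a new observation from the population:

\[\text{Err} = E\big[(Y - \hat{f}(X))^2\big]\]

where \(\hat{f}\) is the least-squares fit from a sample of size 32 and \((X, Y)\) is an independent draw from the same population.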

For 1000 Monte Carlo replicates we’ll

  1. Generate a random sample of size 32
  2. Fit a linear model to the data
  3. Make predictions for all 1000 out-of-sample data points
  4. Calculate the average squared error on those predictions

This will give us a very good estimate of the model error, which we can then use as a benchmark for K-fold cross-validation in the next phase of this example.

Note: this code can take a little while to run.

NMC <- 1000
nOut <- 1000
outData <- generate_data(nOut)

results <- data.frame('rep' = rep(1:NMC, nOut),
                      'i' = rep(1:nOut, each=NMC),
                      'X' = rep(outData$X, each=NMC),
                      'Y' = rep(outData$Y, each=NMC),
                      'beta0' = rep(0,nOut*NMC),
                      'beta1' = rep(0,nOut*NMC),
                      'Yhat' = rep(0,nOut*NMC),
                      'error'= rep(0,nOut*NMC))

for(k in 1:NMC){
  sampleData <- generate_data(32)              # draw a fresh training sample of size 32
  modelFit <- lm(Y~X, data=sampleData)         # fit the simple linear regression
  yhat <- predict(modelFit, newdata=outData)   # predict the fixed out-of-sample points
  results[results$rep==k, "beta0"] <- coef(modelFit)[1]
  results[results$rep==k, "beta1"] <- coef(modelFit)[2]
  results[results$rep==k, "Yhat"] <- yhat
}
results$error <- results$Y - results$Yhat

predVar <- as.numeric(mean(aggregate(error~rep, results, FUN="var")$error))
predBias2 <- mean(aggregate(error~rep, results, FUN="mean")$error)^2
predErr <- mean(results$error^2)
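These three lines use the elementary decomposition of a mean squared error into variance plus squared bias: writing \(e = Y - \hat{Y}\) for the prediction error,

\[E[e^2] = \text{Var}(e) + \big(E[e]\big)^2.\]

Here predVar averages the within-replicate variance of the errors, predBias2 squares the overall mean error, and predErr is the plain mean of the squared errors.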

The variance of the prediction errors is 4.2607881.

The squared bias of the prediction errors is 0.0012517.

The mean square error is 4.3787505.

Var + Bias^2 = 4.2620398.

A great explanation of the Bias-Variance Tradeoff

Now we will estimate the training and testing error with K-fold cross-validation on samples of size 32, for various numbers of folds, and compare the cross-validation estimates against the benchmark computed above.
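For a given number of folds K, the cross-validation estimate computed in the code below is the average of the per-fold mean squared errors:

\[\widehat{\text{CV}}_K = \frac{1}{K}\sum_{k=1}^{K}\frac{1}{|F_k|}\sum_{i\in F_k}\big(y_i - \hat{y}_i^{(-k)}\big)^2\]

where \(F_k\) is the set of observations in fold k and \(\hat{y}_i^{(-k)}\) is the prediction for observation i from the model fit with fold k held out.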

NMC <- 500
ks <- c(2,4,8,16,32)
crossValidationResults <- data.frame('k' = ks,
                                     trainingError = 0,
                                     testingError = 0,
                                     testingVariance = 0,
                                     testingBias2 = 0,
                                     estimationError=0)

for(K in ks){
  trnErr <- numeric(NMC)   # average training error for each Monte Carlo replicate
  tstErr <- numeric(NMC)   # average testing (cross-validation) error for each replicate

  for(j in 1:NMC){
    set.seed(j) #same data for each replicate j, reused for every value of K
    sampleData <- generate_data(32)

    #idx is a shuffled vector of row numbers
    idx <- sample(1:nrow(sampleData))
    #folds partitions the row indices
    folds <- split(idx, as.factor(1:K))
  
    trainingErrors <- numeric(K)
    testingErrors <- numeric(K)
    
    for(k in 1:K){ 
      training <- sampleData[-folds[[k]],]
      testing <- sampleData[folds[[k]],]
      #estimate MSE
      #fit the model to the training data
      modelFit <- lm(Y~X, data=training)
      #calculate the average squared error on the testing data
      trainingErrors[k] <- mean(resid(modelFit)^2)
      testingErrors[k] <- mean((predict(modelFit, newdata=testing) - testing$Y)^2)
    }
    
    trnErr[j] <- mean(trainingErrors)
    tstErr[j] <- mean(testingErrors)
  }
  crossValidationResults[crossValidationResults$k==K, "trainingError"] <- mean(trnErr)
  crossValidationResults[crossValidationResults$k==K, "testingError"] <- mean(tstErr)
  
  crossValidationResults[crossValidationResults$k==K, "testingVariance"] <- var(tstErr)
  crossValidationResults[crossValidationResults$k==K, "testingBias2"] <- mean(tstErr-4)^2   # squared bias of the CV estimates relative to sigma^2 = 4
  crossValidationResults[crossValidationResults$k==K, "estimationError"] <- mean((tstErr-4)^2)
}
crossValidationResults
   k trainingError testingError testingVariance testingBias2 estimationError
1  2      3.477456     4.498287        1.992548   0.24829035        2.236853
2  4      3.635586     4.312318        1.441599   0.09754247        1.536259
3  8      3.680593     4.274024        1.316525   0.07508933        1.388982
4 16      3.699758     4.249614        1.250197   0.06230698        1.310004
5 32      3.708385     4.234627        1.229465   0.05505002        1.282056
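The last three columns describe how the NMC cross-validation estimates (the vector tstErr) behave relative to the reference value of 4 (\(\sigma^2 = 2^2\)) used in the code: up to the denominator convention in var(), the mean squared estimation error splits into variance plus squared bias,

\[E\big[(\widehat{\text{CV}}_K - 4)^2\big] = \text{Var}\big(\widehat{\text{CV}}_K\big) + \big(E[\widehat{\text{CV}}_K] - 4\big)^2,\]

which is why estimationError is approximately testingVariance + testingBias2 in each row of the table.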
plot(testingVariance~k, data=crossValidationResults, type="l", main="Bias Variance Decomposition - Estimation of MSE", ylim=c(0, max(crossValidationResults$estimationError)), ylab="", xlab="k (folds)")
lines(testingBias2~k, data=crossValidationResults, col="blue")
lines(estimationError~k, data=crossValidationResults, col="red")
legend(0,1,legend=c("Variance","Squared Bias","Error"), col=c("black","blue","red"), lty=1)

plot(trainingError~k, data=crossValidationResults, type="l", ylim=c(3,4.6), main="Error vs K", ylab="mean sq err", xlab="k (folds)")
lines(testingError~k, data=crossValidationResults, col="blue")
abline(h=2^2, col="red")
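As an optional cross-check, a comparable K-fold estimate can be obtained with cv.glm from the boot package (assuming boot is installed); an lm fit is a Gaussian glm, and delta[1] is the raw cross-validation estimate of the mean squared error. The choice of K = 8 below is arbitrary:

library(boot)
set.seed(1)
dat <- generate_data(32)
fit <- glm(Y ~ X, data = dat)       # same model as lm(Y ~ X, ...), gaussian family
cv.glm(dat, fit, K = 8)$delta[1]    # 8-fold CV estimate of out-of-sample MSE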

36.2 Another Example - A Multiple Regression Model

As before, we will sample from a population, fit a linear model, and then predict 1000 out-of-sample data points, repeating the process 1000 times so that we have 1000 model fits, each making predictions for the same 1000 points.

The true model is

\[Y_i = 10 + 2X_{1,i} + 5X_{2,i} - 3X_{3,i} + \epsilon_i, \text{ where }\epsilon_i\sim N(0,2^2)\]

This time the model will be fit to a larger sample, of size 128.

generate_data <- function(n){
  X1 <- rnorm(n, 5, 4)     # predictors: X1 ~ N(5, 4^2), X2 ~ N(1, 4^2), X3 ~ N(3, 4^2)
  X2 <- rnorm(n, 1, 4)
  X3 <- rnorm(n, 3, 4)
  eps <- rnorm(n, 0, 2)    # noise: epsilon ~ N(0, 2^2)
  Y <- 10 + 2*X1 + 5*X2 - 3*X3 + eps   # true regression function plus noise
  return(data.frame(X1,X2,X3,Y))
}

NMC <- 1000
nOut <- 1000
outData <- generate_data(nOut)
n <- 128

results <- data.frame('rep' = rep(1:NMC, nOut),
                      'i' = rep(1:nOut, each=NMC),
                      'Y' = rep(outData$Y, each=NMC),
                      'Yhat' = rep(0,nOut*NMC),
                      'error'= rep(0,nOut*NMC))

for(k in 1:NMC){
  sampleData <- generate_data(n)               # draw a fresh training sample of size n = 128
  modelFit <- lm(Y~., data=sampleData)         # fit the multiple regression on all predictors
  yhat <- predict(modelFit, newdata=outData)   # predict the fixed out-of-sample points
  results[results$rep==k,"Yhat"] <- yhat
}
results$error <- results$Y - results$Yhat


predVar <- mean(aggregate(error~rep, results, FUN="var")$error)   # within-replicate variance of the prediction errors, as in the first example
predBias2 <- mean(aggregate(error~i, results, FUN="mean")$error)^2
predErr <- mean(results$error^2)

print(paste("Variance =",predVar))
[1] "Variance = 580.633211183573"
print(paste("Squared Bias = ", predBias2))
[1] "Squared Bias =  0.00160131358719143"
print(paste("Mean Square Error = ",predErr))
[1] "Mean Square Error =  4.36704941289589"
print(paste("Var + Bias^2 = ",predVar+predBias2))
[1] "Var + Bias^2 =  580.634812497161"
print(paste("estimate of sigma = ",sqrt(predErr)))
[1] "estimate of sigma =  2.08974864825794"

Now we will estimate the testing error and training error for various numbers of folds.

NMC <- 250
ks <- c(2,4,8,16,32,64,128)
crossValidationResults <- data.frame('k' = ks,
                                     trainingError = 0,
                                     testingError = 0,
                                     testingVariance = 0,
                                     testingBias2 = 0,
                                     estimationError=0)

for(K in ks){
  trnErr <- numeric(NMC)   # average training error for each Monte Carlo replicate
  tstErr <- numeric(NMC)   # average testing (cross-validation) error for each replicate

  for(j in 1:NMC){
    set.seed(j) #same data for each replicate j, reused for every value of K
    sampleData <- generate_data(n)

    #idx is a shuffled vector of row numbers
    idx <- sample(1:nrow(sampleData))
    #folds partitions the row indices
    folds <- split(idx, as.factor(1:K))
  
    trainingErrors <- numeric(K)
    testingErrors <- numeric(K)
    
    for(k in 1:K){ 
      training <- sampleData[-folds[[k]],]
      testing <- sampleData[folds[[k]],]
      #estimate MSE
      #fit the model to the training data
      modelFit <- lm(Y~., data=training)
      #calculate the average squared error on the testing data
      trainingErrors[k] <- mean(resid(modelFit)^2)
      testingErrors[k] <- mean((predict(modelFit, newdata=testing) - testing$Y)^2)
    }
    
    trnErr[j] <- mean(trainingErrors)
    tstErr[j] <- mean(testingErrors)
  }
  crossValidationResults[crossValidationResults$k==K, "trainingError"] <- mean(trnErr)
  crossValidationResults[crossValidationResults$k==K, "testingError"] <- mean(tstErr)
  crossValidationResults[crossValidationResults$k==K, "testingVariance"] <- var(tstErr)
  crossValidationResults[crossValidationResults$k==K, "testingBias2"] <- mean(tstErr-4)^2   # squared bias of the CV estimates relative to sigma^2 = 4
  crossValidationResults[crossValidationResults$k==K, "estimationError"] <- mean((tstErr-4)^2)
}
crossValidationResults
    k trainingError testingError testingVariance testingBias2 estimationError
1   2      3.750027     4.286584       0.4030953   0.08213047       0.4836134
2   4      3.836878     4.183352       0.3323707   0.03361782       0.3646590
3   8      3.861163     4.154025       0.3210524   0.02372356       0.3434918
4  16      3.870692     4.144120       0.3167117   0.02077066       0.3362155
5  32      3.875014     4.139478       0.3156883   0.01945414       0.3338797
6  64      3.877074     4.137198       0.3139791   0.01882324       0.3315464
7 128      3.878085     4.134802       0.3129656   0.01817168       0.3298854
plot(testingVariance~k, data=crossValidationResults, type="l", main="Bias Variance Decomposition - Estimation of MSE", ylim=c(0, max(crossValidationResults$estimationError)), ylab="", xlab="k (folds)")
lines(testingBias2~k, data=crossValidationResults, col="blue")
lines(estimationError~k, data=crossValidationResults, col="red")
legend(0,1,legend=c("Variance","Squared Bias","Error"), col=c("black","blue","red"), lty=1)

plot(trainingError~k, data=crossValidationResults, type="l", ylim=range(crossValidationResults[,2:3]), main="Error vs K", ylab="mean sq err",xlab="k (folds)")
lines(testingError~k, data=crossValidationResults, col="blue")
abline(h=predErr, col="red")
text(2, crossValidationResults$testingError[1],"Validation",adj=c(0,1))
text(2, crossValidationResults$trainingError[1],"Training",adj=c(0,0))
text(2, 4,"Model Error",adj=c(0,0))