Chapter 1 Linear Regression

1.1 Linear Regression

## Inner product of two vectors
inner.prod <- function(x, y) {
  return(sum(x * y))
}

linear <- function(X, y) {
  n <- nrow(X)
  p <- ncol(X)
  X <- as.matrix(X)
  x.bar <- array(dim = p)     ## column means of X
  for (j in 1:p)
    x.bar[j] <- mean(X[, j])

  ## Center each column of X
  for (j in 1:p)
    X[, j] <- X[, j] - x.bar[j]

  ## Center y
  y <- as.vector(y)
  y.bar <- mean(y)
  y <- y - y.bar

  ## Least-squares solution for the centered data
  beta <- as.vector(solve(t(X) %*% X) %*% t(X) %*% y)
  beta.0 <- y.bar - sum(x.bar * beta)   ## recover the intercept
  return(list(beta = beta, beta.0 = beta.0))
}
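As a quick sanity check (added here, not part of the text), the sketch below applies linear() to simulated data with known coefficients and compares the result with lm(); the seed, sample size, and coefficients are arbitrary.

set.seed(123)                   ## arbitrary seed, for reproducibility only
X <- matrix(rnorm(100 * 2), 100, 2)
y <- 1 + 2 * X[, 1] - 3 * X[, 2] + rnorm(100)
linear(X, y)                    ## beta close to (2, -3), beta.0 close to 1
coef(lm(y ~ X))                 ## should agree with linear() up to rounding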

1.2 Subderivative

Example 1

curve(x ^ 2 - 3 * x + abs(x), -2, 2, main = "y = x^2 - 3x + |x|")
points(1, -1, col = "red", pch = 16)

curve(x ^ 2 + x + 2 * abs(x), -2, 2, main = "y = x^2 + x + 2|x|")
points(0, 0, col = "red", pch = 16)
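In both plots the red point marks the minimizer: where the |x| term makes the function non-differentiable at x = 0, the minimum is attained exactly where 0 belongs to the subdifferential. A quick numerical confirmation (added here) with optimize():

optimize(function(x) x ^ 2 - 3 * x + abs(x), c(-2, 2))  ## minimum near x = 1, value -1
optimize(function(x) x ^ 2 + x + 2 * abs(x), c(-2, 2))  ## minimum near x = 0, value 0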

1.3 Lasso

## Soft-thresholding operator: sign(x) * max(|x| - lambda, 0)
soft.th <- function(lambda, x) {
  return(sign(x) * pmax(abs(x) - lambda, 0))
}
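A quick numerical check (added here): values within lambda of zero are set exactly to zero, and everything else is shrunk toward zero by lambda.

soft.th(5, c(-10, -6, -2, 0, 2, 6, 10))
## [1] -5 -1  0  0  0  1  5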

curve(soft.th(5, x), -10, 10, main = "soft.th(lambda, x)")
segments(-5, -4, -5, 4, lty = 5, col = "blue")
segments(5, -4, 5, 4, lty = 5, col = "blue")
text(-0.2, 1, "lambda = 5", cex = 1.5, col = "red")

linear.lasso <- function(X, y, lambda = 0, beta = rep(0, ncol(X))) {
  n <- nrow(X)
  p <- ncol(X)
  res <- centralize(X, y)   ## centering and standardization (see centralize() below)
  X <- res$X
  y <- res$y
  eps <- 1
  beta.old <- beta
  while (eps > 0.001) {    ## iterate coordinate descent until convergence
    for (j in 1:p) {
      r <- y - as.matrix(X[, -j]) %*% beta[-j]
      beta[j] <- soft.th(lambda, sum(r * X[, j]) / n) / (sum(X[, j] * X[, j]) / n) 
    }
    eps <- max(abs(beta - beta.old))
    beta.old <- beta
  }
  beta <- beta / res$X.sd   ## rescale the coefficients back to the original scale of X
  beta.0 <- res$y.bar - sum(res$X.bar * beta)   ## recover the intercept
  return(list(beta = beta, beta.0 = beta.0))
}
centralize <- function(X, y, standardize = TRUE) {
  X <- as.matrix(X)
  n <- nrow(X)
  p <- ncol(X)
  X.bar <- array(dim = p)          ## mean of each column of X
  X.sd <- array(dim = p)           ## sd of each column of X
  for (j in 1:p) {
    X.bar[j] <- mean(X[, j])
    X[, j] <- X[, j] - X.bar[j]    ## center each column of X
    X.sd[j] <- sqrt(var(X[, j]))
    if (standardize)
      X[, j] <- X[, j] / X.sd[j]   ## standardize each column of X
  }
  if (is.matrix(y)) {      ## if y is a matrix
    K <- ncol(y)
    y.bar <- array(dim = K)        ## mean of each column of y
    for (k in 1:K) {
      y.bar[k] <- mean(y[, k])
      y[, k] <- y[, k] - y.bar[k]  ## center each column of y
    }
  } else {                         ## if y is a vector
    y.bar <- mean(y)
    y <- y - y.bar                 ## center y
  }
  return(list(X = X, y = y, X.bar = X.bar, X.sd = X.sd, y.bar = y.bar))
}
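Two sanity checks (added here, not from the text): after centralize(), every column of X should have mean zero and, with standardize = TRUE, sample standard deviation one; and linear.lasso() with lambda = 0 should essentially reproduce the least-squares coefficients. The simulated data are arbitrary.

set.seed(42)                          ## arbitrary seed, for reproducibility only
X <- matrix(rnorm(100 * 3), 100, 3)
y <- as.vector(X %*% c(1, -2, 3) + rnorm(100))
res <- centralize(X, y)
colMeans(res$X)                       ## all numerically zero
apply(res$X, 2, sd)                   ## all one (up to floating point)
linear.lasso(X, y, lambda = 0)$beta   ## close to coef(lm(y ~ X))[-1]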

Example 2

df <- read.table("crime.txt")
x <- df[, 3:7]
y <- df[, 1]
p <- ncol(x)
lambda.seq <- seq(0, 200, 0.1)
plot(lambda.seq, xlim = c(0, 200), ylim = c(-10, 20), xlab = "lambda", ylab = "beta",
     main = "Coefficients for each lambda", type = "n")
r <- length(lambda.seq)
coef.seq <- array(dim = c(r, p))
for (i in 1:r)
  coef.seq[i, ] <- linear.lasso(x, y, lambda.seq[i])$beta
for (j in 1:p)
  lines(lambda.seq, coef.seq[, j], col = j)
legend("topright",
       legend = c("annual police funding", "% of people 25 years+ with 4 yrs. of high school",
                  "% of 16--19 year-olds not in highschool and not highschool graduates",
                  "% of people 25 years+ with at least 4 years of college"),
       col = 1:p, lwd = 2, cex = .8)

warm.start <- function(X, y, lambda.max = 100) {
  dec <- round(lambda.max / 50)            ## step chosen to give about 50 lambda values
  lambda.seq <- seq(lambda.max, 1, -dec)   ## decreasing sequence of lambda
  r <- length(lambda.seq)
  p <- ncol(X)
  coef.seq <- matrix(nrow = r, ncol = p)
  coef.seq[1, ] <- linear.lasso(X, y, lambda.seq[1])$beta
  for (k in 2:r)   ## warm start: initialize each fit at the previous solution
    coef.seq[k, ] <- linear.lasso(X, y, lambda.seq[k], coef.seq[(k - 1), ])$beta
  return(coef.seq)
}
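To see what the warm start buys, it can be timed against a cold-start variant (a hypothetical helper, added here) in which every fit restarts coordinate descent from beta = 0 instead of the previous solution; the warm-started version typically needs far fewer iterations per lambda.

cold.start <- function(X, y, lambda.max = 100) {
  dec <- round(lambda.max / 50)
  lambda.seq <- seq(lambda.max, 1, -dec)
  coef.seq <- matrix(nrow = length(lambda.seq), ncol = ncol(X))
  for (k in seq_along(lambda.seq))
    coef.seq[k, ] <- linear.lasso(X, y, lambda.seq[k])$beta   ## beta always starts at 0
  return(coef.seq)
}
## e.g., compare system.time(warm.start(X, y, 200)) with system.time(cold.start(X, y, 200))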

Example 3

crime <- read.table("crime.txt")
X <- crime[, 3:7]
y <- crime[, 1] 
coef.seq <- warm.start(X, y, 200)
p <- ncol(X)
lambda.max <- 200
dec <- round(lambda.max / 50)
lambda.seq <- seq(lambda.max, 1, -dec)
plot(log(lambda.seq), coef.seq[, 1], xlab = "log(lambda)", ylab = "Coefficients",
     ylim = c(min(coef.seq), max(coef.seq)), type = "n")
for (j in 1:p)
  lines(log(lambda.seq), coef.seq[, j], col = j)

Example 4

library(glmnet)
library(MASS)
df <- Boston
x <- as.matrix(df[, 1:13])
y <- df[, 14]
fit <- glmnet(x, y)
plot(fit, xvar = "lambda", main = "BOSTON")

1.4 Ridge

ridge <- function(X, y, lambda = 0) {
  X <- as.matrix(X)
  p <- ncol(X)
  n <- length(y)
  res <- centralize(X, y)
  X <- res$X
  y <- res$y
  ## The ridge estimate itself is the single line below; the factor n in
  ## n * lambda matches the (1/n)-scaled loss used by linear.lasso()
  beta <- drop(solve(t(X) %*% X + n * lambda * diag(p)) %*% t(X) %*% y)
  beta <- beta / res$X.sd  ## rescale each coefficient back to the original scale of X
  beta.0 <- res$y.bar - sum(res$X.bar * beta)
  return(list(beta = beta, beta.0 = beta.0))
}

Example 5

df <- read.table("crime.txt")
x <- df[, 3:7]
y <- df[, 1]
p <- ncol(x)
lambda.seq <- seq(0, 100, 0.1)
plot(lambda.seq, xlim = c(0, 100), ylim = c(-10, 20), xlab = "lambda", ylab = "beta",
     main = "Coefficients for each lambda", type = "n")
r <- length(lambda.seq)
coef.seq <- array(dim = c(r, p))
for (i in 1:r)
  coef.seq[i, ] <- ridge(x, y, lambda.seq[i])$beta
for (j in 1:p)
  lines(lambda.seq, coef.seq[, j], col = j)
legend("topright",
       legend = c("annual police funding", "% of people 25 years+ with 4 yrs. of high school",
                  "% of 16--19 year-olds not in highschool and not highschool graduates",
                  "% of people 25 years+ with at least 4 years of college"),
       col = 1:p, lwd = 2, cex = .8)
crime <- read.table("crime.txt")
X <- crime[, 3:7]
y <- crime[, 1]
linear(X, y)
## $beta
## [1] 10.9806703 -6.0885294  5.4803042  0.3770443  5.5004712
## 
## $beta.0
## [1] 489.6486
ridge(X, y)
## $beta
## [1] 10.9806703 -6.0885294  5.4803042  0.3770443  5.5004712
## 
## $beta.0
## [1] 489.6486
ridge(X, y, 200)
## $beta
## [1]  0.055231573 -0.019372823  0.076321840 -0.016784731 -0.006907194
## 
## $beta.0
## [1] 716.4355

1.5 Comparing Lasso and Ridge

Example 6

R2 <- function(x, y) {
  y.hat <- lm(y ~ x)$fitted.values
  y.bar <- mean(y)
  RSS <- sum((y - y.hat) ^ 2)
  TSS <- sum((y - y.bar) ^ 2)
  return(1 - RSS / TSS)
}
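A quick check (added here) that R2() agrees with the R-squared reported by summary.lm(); the simulated data are arbitrary.

set.seed(1)                                 ## arbitrary seed, for reproducibility only
x <- matrix(rnorm(100 * 2), 100, 2)
y <- x[, 1] + rnorm(100)
c(R2(x, y), summary(lm(y ~ x))$r.squared)   ## the two values should coincide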

vif <- function(x) {
  p <- ncol(x)
  values <- array(dim = p)
  for (j in 1:p)   ## VIF of column j: 1 / (1 - R^2 of x_j regressed on the other columns)
    values[j] <- 1 / (1 - R2(x[, -j], x[, j]))
  return(values)
}

library(MASS)
x <- as.matrix(Boston)
vif(x)
##  [1] 1.831537 2.352186 3.992503 1.095223 4.586920 2.260374 3.100843 4.396007
##  [9] 7.808198 9.205542 1.993016 1.381463 3.581585 3.855684

Example 7

n <- 500
x <- array(dim = c(n, 6))
z <- array(dim = c(n, 2))
for (i in 1:2)
  z[, i] <- rnorm(n)                 ## two independent latent factors
y <- 3 * z[, 1] - 1.5 * z[, 2] + 2 * rnorm(n)
for (j in 1:3)
  x[, j] <- z[, 1] + rnorm(n) / 5    ## X1--X3: noisy copies of z1, highly correlated
for (j in 4:6)
  x[, j] <- z[, 2] + rnorm(n) / 5    ## X4--X6: noisy copies of z2
glm.fit <- glmnet(x, y)
plot(glm.fit)
legend("topleft", legend = c("X1", "X2", "X3", "X4", "X5", "X6"), col = 1:6, lwd = 2, cex = .8)

1.6 Elastic Net

linear.lasso <- function(X, y, lambda = 0, beta = rep(0, ncol(X)), alpha = 1) {  ## alpha = 1: lasso; alpha = 0: ridge
  X <- as.matrix(X)
  n <- nrow(X)
  p <- ncol(X)
  X.bar <- array(dim = p)
  for (j in 1:p) {
    X.bar[j] <- mean(X[, j])
    X[, j] <- X[, j] - X.bar[j]
  }
  y.bar <- mean(y)
  y <- y - y.bar
  scale <- array(dim = p)
  for (j in 1:p) {
    scale[j] <- sqrt(sum(X[, j] ^ 2) / n)
    X[, j] <- X[, j] / scale[j]
  }
  eps <- 1
  beta.old <- beta
  while (eps > 0.001) {
    for (j in 1:p) {
      r <- y - as.matrix(X[, -j]) %*% beta[-j]
      beta[j] <- soft.th(lambda * alpha, sum(r * X[, j]) / n) /              ## elastic net update:
        (sum(X[, j] * X[, j]) / n + lambda * (1 - alpha))                    ## L1 and L2 terms mixed by alpha
    }
    eps <- max(abs(beta - beta.old))
    beta.old <- beta
  }
  for (j in 1:p)
    beta[j] <- beta[j] / scale[j]
  beta.0 <- y.bar - sum(X.bar * beta)
  return(list(beta = beta, beta.0 = beta.0))
}
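As an illustration (added here, not part of the text; it assumes the collinear x and y generated in Example 7 are still in the workspace), lowering alpha mixes in the ridge penalty, which tends to spread weight across the three correlated copies of each latent factor rather than selecting just one of them. The value lambda = 0.5 is arbitrary.

linear.lasso(x, y, lambda = 0.5, alpha = 1)$beta    ## pure lasso
linear.lasso(x, y, lambda = 0.5, alpha = 0.5)$beta  ## elastic net mix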

1.7 Setting the Lambda Value

Example 9

library(glmnet)
df <- read.table("crime.txt")
X <- as.matrix(df[, 3:7])
y <- as.vector(df[, 1])
cv.fit <- cv.glmnet(X, y)
plot(cv.fit)

lambda.min <- cv.fit$lambda.min
lambda.min
## [1] 15.15854
fit <- glmnet(X, y, lambda = lambda.min)
fit$beta
## 5 x 1 sparse Matrix of class "dgCMatrix"
##           s0
## V3 10.053897
## V4 -3.071651
## V5  3.280323
## V6  .       
## V7  .
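Besides lambda.min, cv.glmnet() also stores lambda.1se, the largest lambda whose cross-validation error is within one standard error of the minimum; a short sketch (added here) of refitting with it, which typically yields a sparser, more conservative model:

lambda.1se <- cv.fit$lambda.1se
glmnet(X, y, lambda = lambda.1se)$beta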

Example 10

n <- 500
x <- array(dim = c(n, 6))
z <- array(dim = c(n, 2))
for (i in 1:2)
  z[, i] <- rnorm(n)
y <- 3 * z[, 1] - 1.5 * z[, 2] + 2 * rnorm(n)
for (j in 1:3)
  x[, j] <- z[, 1] + rnorm(n) / 5
for (j in 4:6)
  x[, j] <- z[, 2] + rnorm(n) / 5
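## Note: cv.glmnet() assigns folds at random, so the alpha.min and
## lambda.min found below vary from run to run; call set.seed() first
## to reproduce a particular result.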
best.score <- Inf
for (alpha in seq(0, 1, 0.01)) {
  res <- cv.glmnet(x, y, alpha = alpha)
  lambda <- res$lambda.min
  min.cvm <- min(res$cvm)
  if (min.cvm < best.score) {
    alpha.min <- alpha
    lambda.min <- lambda
    best.score <- min.cvm
  }
}
alpha.min
## [1] 0.13
lambda.min
## [1] 0.1528932
glmnet(x, y, alpha = alpha.min, lambda = lambda.min)$beta
## 6 x 1 sparse Matrix of class "dgCMatrix"
##            s0
## V1  1.1101385
## V2  0.8095265
## V3  0.9133333
## V4 -0.4002594
## V5 -0.3476525
## V6 -0.6198445
glm.fit <- glmnet(x, y, alpha = alpha.min)
plot(glm.fit)