1.3 Lasso
## Soft-thresholding operator: shrinks each element of x toward zero by
## lambda, mapping any value with |x| <= lambda exactly to zero.
## Vectorized in x; this is the proximal operator of the L1 penalty.
soft.th <- function(lambda, x) {
  shrunk <- abs(x) - lambda
  shrunk[shrunk < 0] <- 0
  sign(x) * shrunk
}
## Visualize the soft-thresholding function for lambda = 5 on [-10, 10].
curve(soft.th(5, x), -10, 10, main = "soft.th(lambda, x)")
## Dashed blue vertical guides at x = -lambda and x = +lambda, where the
## output transitions away from zero.
segments(-5, -4, -5, 4, lty = 5, col = "blue")
segments(5, -4, 5, 4, lty = 5, col = "blue")
text(-0.2, 1, "lambda = 5", cex = 1.5, col = "red")
## Lasso for linear regression, solved by coordinate descent.
##
## X: design matrix (centered/standardized internally via centralize()).
## y: response vector.
## lambda: L1 regularization weight.
## beta: initial coefficient vector, enabling warm starts.
## Returns list(beta, beta.0): coefficients mapped back to the original
## column scales, plus the intercept.
linear.lasso <- function(X, y, lambda = 0, beta = rep(0, ncol(X))) {
  n <- nrow(X)
  p <- ncol(X)
  centered <- centralize(X, y)  # centering/standardization (see below)
  X <- centered$X
  y <- centered$y
  ## Sweep over coordinates until the largest single-sweep change in any
  ## coefficient drops to the tolerance 0.001 or below.
  repeat {
    beta.prev <- beta
    for (j in seq_len(p)) {
      ## Partial residual: y minus the fit from all coordinates but j.
      partial.res <- y - as.matrix(X[, -j]) %*% beta[-j]
      beta[j] <- soft.th(lambda, sum(partial.res * X[, j]) / n) /
        (sum(X[, j] * X[, j]) / n)
    }
    if (max(abs(beta - beta.prev)) <= 0.001) break
  }
  beta <- beta / centered$X.sd  # undo the column scaling
  beta.0 <- centered$y.bar - sum(centered$X.bar * beta)
  list(beta = beta, beta.0 = beta.0)
}
## Center (and optionally standardize) the columns of X, and center y.
## y may be a vector (single response) or a matrix (one response per
## column). Returns the transformed X and y together with the column
## means (X.bar), column sample standard deviations (X.sd), and the
## response mean(s) (y.bar), so that coefficients fit on the transformed
## data can be mapped back to the original scale.
## Note: X.sd uses the sample standard deviation (n - 1 denominator).
centralize <- function(X, y, standardize = TRUE) {
  X <- as.matrix(X)
  p <- ncol(X)
  X.bar <- array(dim = p)
  X.sd <- array(dim = p)
  for (j in seq_len(p)) {
    X.bar[j] <- mean(X[, j])
    X[, j] <- X[, j] - X.bar[j]   # center the column
    X.sd[j] <- sqrt(var(X[, j]))  # sample sd of the centered column
    if (standardize == TRUE)
      X[, j] <- X[, j] / X.sd[j]  # scale to unit sample sd
  }
  if (is.matrix(y)) {
    ## Multi-response case: center each response column separately.
    K <- ncol(y)
    y.bar <- array(dim = K)
    for (k in seq_len(K)) {
      y.bar[k] <- mean(y[, k])
      y[, k] <- y[, k] - y.bar[k]
    }
  } else {
    ## Single response vector.
    y.bar <- mean(y)
    y <- y - y.bar
  }
  list(X = X, y = y, X.bar = X.bar, X.sd = X.sd, y.bar = y.bar)
}
Example 2
## Example 2: lasso coefficient paths on the crime data.
## Requires "crime.txt" in the working directory; each lambda on the grid
## gets a fresh (cold-start) lasso fit.
df <- read.table("crime.txt")
x <- df[, 3:7]  # predictor columns
y <- df[, 1]    # response (column 1)
p <- ncol(x)
lambda.seq <- seq(0, 200, 0.1)
## Empty canvas (type = "n"); paths are drawn by lines() below.
plot(lambda.seq, xlim = c(0, 200), ylim = c(-10, 20), xlab = "lambda", ylab = "beta",
main = "Coefficients for each lambda", type = "n", col = "red")
r <- length(lambda.seq)
coef.seq <- array(dim = c(r, p))
## Row i holds the lasso coefficients at lambda.seq[i].
for (i in 1:r)
coef.seq[i, ] <- linear.lasso(x, y, lambda.seq[i])$beta
## One path per predictor, colored by column index.
for (j in 1:p) {
par(new = TRUE)
lines(lambda.seq, coef.seq[, j], col = j)
}
## NOTE(review): the legend lists only 4 labels while p = 5 predictors
## are plotted — confirm whether one label is missing.
legend("topright",
legend = c("annual police funding", "% of people 25 years+ with 4 yrs. of high school",
"% of 16--19 year-olds not in highschool and not highschool graduates",
"% of people 25 years+ with at least 4 years of college"),
col = 1:p, lwd = 2, cex = .8)
## Warm-start lasso path: fit linear.lasso() for a decreasing sequence of
## lambda values, initializing each fit with the solution from the
## previous (larger) lambda. Warm starts typically converge much faster
## than refitting from zero at every lambda.
##
## X, y: data passed through to linear.lasso().
## lambda.max: largest lambda; the grid decreases from lambda.max to 1.
## Returns a matrix with one row of coefficients per lambda value.
warm.start <- function(X, y, lambda.max = 100) {
  ## Step size for the lambda grid. Guard against a zero step (which
  ## makes seq() fail) when lambda.max < 25.
  dec <- max(1, round(lambda.max / 50))
  lambda.seq <- seq(lambda.max, 1, -dec)
  r <- length(lambda.seq)
  p <- ncol(X)
  coef.seq <- matrix(nrow = r, ncol = p)
  ## First fit starts from the default all-zero coefficient vector.
  coef.seq[1, ] <- linear.lasso(X, y, lambda.seq[1])$beta
  ## Subsequent fits start from the previous solution. The r > 1 guard
  ## avoids 2:r counting down (2, 1) when the grid has a single lambda.
  if (r > 1) {
    for (k in 2:r)
      coef.seq[k, ] <- linear.lasso(X, y, lambda.seq[k], coef.seq[(k - 1), ])$beta
  }
  return(coef.seq)
}
Example 3
## Example 3: warm-start lasso path on the crime data, plotted against
## log(lambda). Requires "crime.txt" in the working directory.
crime <- read.table("crime.txt")
X <- crime[, 3:7]
y <- crime[, 1]
coef.seq <- warm.start(X, y, 200)
p <- ncol(X)
## Reconstruct the same lambda grid that warm.start() builds internally.
lambda.max <- 200
dec <- round(lambda.max / 50)
lambda.seq <- seq(lambda.max, 1, -dec)
## Empty canvas sized to the full coefficient range; paths drawn below.
plot(log(lambda.seq), coef.seq[, 1], xlab = "log(lambda)", ylab = "Coefficients",
ylim = c(min(coef.seq), max(coef.seq)), type = "n")
for (j in 1:p)
lines(log(lambda.seq), coef.seq[, j], col = j)
Example 4
## Example 4: lasso path on the Boston housing data using glmnet.
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.3
## Loading required package: Matrix
## Loaded glmnet 4.0-2
library(MASS)
df <- Boston
x <- as.matrix(df[, 1:13])  # first 13 columns as predictors
y <- df[, 14]               # column 14 as the response
fit <- glmnet(x, y)  # glmnet default alpha = 1 (lasso) over a lambda grid
plot(fit, xvar = "lambda", main = "BOSTON")
1.4 Ridge
## Ridge regression: beta = (X'X + n * lambda * I)^{-1} X'y on the
## centered/standardized data, then mapped back to the original scale.
##
## X: design matrix, y: response vector, lambda: L2 regularization weight.
## Returns list(beta, beta.0): coefficients on the original column scales
## plus the intercept.
ridge <- function(X, y, lambda = 0) {
  X <- as.matrix(X)
  p <- ncol(X)
  n <- length(y)
  res <- centralize(X, y)  # centering/standardization helper
  X <- res$X
  y <- res$y
  ## The ridge estimate is this single linear solve. crossprod(X) forms
  ## X'X efficiently, and solve(A, b) avoids the explicit matrix inverse
  ## (faster and numerically more accurate than solve(A) %*% b).
  beta <- drop(solve(crossprod(X) + n * lambda * diag(p), crossprod(X, y)))
  beta <- beta / res$X.sd  # recover coefficients on the original scale
  beta.0 <- res$y.bar - sum(res$X.bar * beta)
  return(list(beta = beta, beta.0 = beta.0))
}
Example 5
## Example 5: ridge coefficient paths on the crime data over a lambda
## grid. Requires "crime.txt" in the working directory.
df <- read.table("crime.txt")
x <- df[, 3:7]  # predictor columns
y <- df[, 1]    # response (column 1)
p <- ncol(x)
lambda.seq <- seq(0, 100, 0.1)
## Empty canvas (type = "n"); paths are drawn by lines() below.
plot(lambda.seq, xlim = c(0, 100), ylim = c(-10, 20), xlab = "lambda", ylab = "beta",
main = "The coefficients for each lambda", type = "n", col = "red")
r <- length(lambda.seq)
coef.seq <- array(dim = c(r, p))
## Row i holds the ridge coefficients at lambda.seq[i].
## (Fixed: assignment used `=` here; `<-` is the R convention.)
for (i in 1:r)
coef.seq[i, ] <- ridge(x, y, lambda.seq[i])$beta
for (j in 1:p) {
par(new = TRUE)
lines(lambda.seq, coef.seq[, j], col = j)
}
## NOTE(review): the legend lists only 4 labels while p = 5 predictors
## are plotted — confirm whether one label is missing.
legend("topright",
legend = c("annual police funding", "% of people 25 years+ with 4 yrs. of high school",
"% of 16--19 year-olds not in highschool and not highschool graduates",
"% of people 25 years+ with at least 4 years of college"),
col = 1:p, lwd = 2, cex = .8)
## Compare unpenalized fits and ridge on the crime data. With lambda = 0
## ridge reproduces the least-squares solution exactly (identical output
## below); with lambda = 200 the coefficients are shrunk heavily toward 0.
crime <- read.table("crime.txt")
X <- crime[, 3:7]
y <- crime[, 1]
## NOTE(review): linear() is not defined in the visible portion of this
## file — presumably a least-squares helper defined elsewhere; confirm.
linear(X, y)
## $beta
## [1] 10.9806703 -6.0885294 5.4803042 0.3770443 5.5004712
##
## $beta.0
## [1] 489.6486
ridge(X, y)
## $beta
## [1] 10.9806703 -6.0885294 5.4803042 0.3770443 5.5004712
##
## $beta.0
## [1] 489.6486
ridge(X, y, 200)
## $beta
## [1] 0.055231573 -0.019372823 0.076321840 -0.016784731 -0.006907194
##
## $beta.0
## [1] 716.4355
1.5 Comparing Lasso and Ridge
Example 6
## Coefficient of determination (R^2) of the least-squares regression of
## y on x. x may be a vector or a matrix of predictors; y is the response.
## Returns 1 - RSS / TSS for the lm() fit of y ~ x.
R2 <- function(x, y) {
  fitted <- lm(y ~ x)$fitted.values
  residual.ss <- sum((y - fitted) ^ 2)
  total.ss <- sum((y - mean(y)) ^ 2)
  1 - residual.ss / total.ss
}
## Variance inflation factor for each column of x:
##   VIF_j = 1 / (1 - R^2_j),
## where R^2_j is from regressing column j on all remaining columns.
## Returns one VIF value per column of x.
vif <- function(x) {
  num.cols <- ncol(x)
  inflation <- array(dim = num.cols)
  for (col in seq_len(num.cols)) {
    others <- x[, -col]
    inflation[col] <- 1 / (1 - R2(others, x[, col]))
  }
  inflation
}
## Example 6: VIFs for every column of the Boston data. Each column is
## regressed on all of the others (including the last column), so 14
## values are produced.
library(MASS)
x <- as.matrix(Boston)
vif(x)
## [1] 1.831537 2.352186 3.992503 1.095223 4.586920 2.260374 3.100843 4.396007
## [9] 7.808198 9.205542 1.993016 1.381463 3.581585 3.855684
Example 7
## Example 7: six highly correlated predictors built from two latent
## factors, to show how the lasso path behaves under collinearity.
## NOTE(review): no set.seed() call, so the fit differs across runs.
n <- 500
x <- array(dim = c(n, 6))
z <- array(dim = c(n, 2))
## Two independent latent factors.
for (i in 1:2)
z[, i] <- rnorm(n)
## Response depends only on the latent factors plus noise.
y <- 3 * z[, 1] - 1.5 * z[, 2] + 2 * rnorm(n)
## Columns 1-3 are noisy copies of z1; columns 4-6 of z2.
for (j in 1:3)
x[, j] <- z[, 1] + rnorm(n) / 5
for (j in 4:6)
x[, j] <- z[, 2] + rnorm(n) / 5
glm.fit <- glmnet(x, y)
plot(glm.fit)
legend("topleft", legend = c("X1", "X2", "X3", "X4", "X5", "X6"), col = 1:6, lwd = 2, cex = .8)
1.6 Elastic Net
## Elastic net for linear regression, solved by coordinate descent.
## alpha = 1 gives the lasso, alpha = 0 gives ridge; intermediate values
## mix the L1 and L2 penalties.
##
## X: design matrix (centered and scaled internally).
## y: response vector.
## lambda: overall regularization weight.
## beta: initial coefficient vector, enabling warm starts.
## alpha: L1/L2 mixing parameter in [0, 1].
## Returns list(beta, beta.0): coefficients mapped back to the original
## column scales, plus the intercept.
linear.lasso <- function(X, y, lambda = 0, beta = rep(0, ncol(X)), alpha = 1) {
  X <- as.matrix(X)
  n <- nrow(X)
  p <- ncol(X)
  ## Center each predictor column and the response.
  X.bar <- array(dim = p)
  for (j in seq_len(p)) {
    X.bar[j] <- mean(X[, j])
    X[, j] <- X[, j] - X.bar[j]
  }
  y.bar <- mean(y)
  y <- y - y.bar
  ## Scale each column by its root mean square (so sum(X[, j]^2) / n == 1
  ## after scaling) — note the population (n) denominator here, unlike
  ## centralize() which uses the sample sd.
  scale <- array(dim = p)
  for (j in seq_len(p)) {
    scale[j] <- sqrt(sum(X[, j] ^ 2) / n)
    X[, j] <- X[, j] / scale[j]
  }
  ## Coordinate descent: sweep until the largest single-sweep change in
  ## any coefficient drops to the tolerance 0.001 or below.
  repeat {
    beta.prev <- beta
    for (j in seq_len(p)) {
      partial.res <- y - as.matrix(X[, -j]) %*% beta[-j]
      ## The L1 part enters through soft-thresholding; the L2 part adds
      ## lambda * (1 - alpha) to the denominator.
      beta[j] <- soft.th(lambda * alpha, sum(partial.res * X[, j]) / n) /
        (sum(X[, j] * X[, j]) / n + lambda * (1 - alpha))
    }
    if (max(abs(beta - beta.prev)) <= 0.001) break
  }
  beta <- beta / scale  # map back to the original column scales
  beta.0 <- y.bar - sum(X.bar * beta)
  list(beta = beta, beta.0 = beta.0)
}
1.7 Setting the lambda value
Example 9
## Example 9: choose lambda by cross-validation with cv.glmnet on the
## crime data, then refit at the CV-optimal lambda. Requires "crime.txt".
library(glmnet)
df <- read.table("crime.txt")
X <- as.matrix(df[, 3:7])
y <- as.vector(df[, 1])
cv.fit <- cv.glmnet(X, y)  # cross-validation over glmnet's lambda grid
plot(cv.fit)
lambda.min <- cv.fit$lambda.min  # lambda with the smallest CV error
lambda.min
## [1] 15.15854
## Refit at the selected lambda; two coefficients are zeroed out ("." in
## the sparse-matrix printout below).
fit <- glmnet(X, y, lambda = lambda.min)
fit$beta
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## V3 10.053897
## V4 -3.071651
## V5 3.280323
## V6 .
## V7 .
Example 10
## Example 10: jointly tune alpha and lambda for the elastic net by grid
## search over alpha, using cv.glmnet's cross-validated error at each
## alpha. Same simulated collinear design as Example 7.
## NOTE(review): no set.seed() call, so alpha.min / lambda.min vary
## across runs; the printed values are one realization.
n <- 500
x <- array(dim = c(n, 6))
z <- array(dim = c(n, 2))
for (i in 1:2)
z[, i] <- rnorm(n)
y <- 3 * z[, 1] - 1.5 * z[, 2] + 2 * rnorm(n)
for (j in 1:3)
x[, j] <- z[, 1] + rnorm(n) / 5
for (j in 4:6)
x[, j] <- z[, 2] + rnorm(n) / 5
best.score <- Inf
## For each alpha on the grid, take the CV-minimizing lambda; keep the
## (alpha, lambda) pair with the lowest CV error overall.
for (alpha in seq(0, 1, 0.01)) {
res <- cv.glmnet(x, y, alpha = alpha)
lambda <- res$lambda.min
min.cvm <- min(res$cvm)
if (min.cvm < best.score) {
alpha.min <- alpha
lambda.min <- lambda
best.score <- min.cvm
}
}
alpha.min
## [1] 0.13
lambda.min
## [1] 0.1528932
## Refit once at the selected (alpha, lambda) pair.
glmnet(x, y, alpha = alpha.min, lambda = lambda.min)$beta
## 6 x 1 sparse Matrix of class "dgCMatrix"
## s0
## V1 1.1101385
## V2 0.8095265
## V3 0.9133333
## V4 -0.4002594
## V5 -0.3476525
## V6 -0.6198445
## Full path at the selected alpha for plotting.
glm.fit <- glmnet(x, y, alpha = alpha.min)
plot(glm.fit)