第1章 線形回帰

1.1 線形回帰

## Inner (dot) product of two equal-length numeric vectors.
inner.prod <- function(x, y) {
  sum(x * y)
}

## Ordinary least squares via centering.
## Fits y ~ X by solving the normal equations on the centered data,
## then recovers the intercept for the original (uncentered) variables.
## Returns list(beta = slope vector, beta.0 = intercept).
linear <- function(X, y) {
  X <- as.matrix(X)
  y <- as.vector(y)
  ## Means used for centering (and later for the intercept).
  x.bar <- colMeans(X)
  y.bar <- mean(y)
  ## Center predictors and response.
  X.c <- sweep(X, 2, x.bar)
  y.c <- y - y.bar
  ## Normal equations on the centered data give the slopes.
  beta <- as.vector(solve(t(X.c) %*% X.c) %*% t(X.c) %*% y.c)
  ## Intercept so that the fit passes through the mean point.
  list(beta = beta, beta.0 = y.bar - sum(x.bar * beta))
}

1.2 劣微分

例1

## Example 1: convex functions with an |x| term (subdifferential demo).
## f(x) = x^2 - 3x + |x|; the red point (1, -1) lies on the curve at its
## minimum, where 0 belongs to the subdifferential.
curve(x ^ 2 - 3 * x + abs(x), -2, 2, main = "y = x^2 - 3x + |x|")
points(1, -1, col = "red", pch = 16)

## f(x) = x^2 + x + 2|x|; the red point (0, 0) marks the minimum at the
## kink, where the subdifferential is an interval containing 0.
curve(x ^ 2 + x + 2 * abs(x), -2, 2, main = "y = x^2 + x + 2|x|")
points(0, 0, col = "red", pch = 16)

1.3 Lasso

## Soft-thresholding operator (elementwise):
## maps x to 0 when |x| <= lambda, otherwise shrinks it toward 0 by lambda.
soft.th <- function(lambda, x) {
  shrunk <- abs(x) - lambda
  shrunk[shrunk < 0] <- 0
  sign(x) * shrunk
}

## Plot the soft-thresholding function for lambda = 5: identically zero on
## [-5, 5], shifted toward zero by 5 outside that interval.
curve(soft.th(5, x), -10, 10, main = "soft.th(lambda, x)")
## Dashed blue guides at x = -5 and x = +5, the thresholding boundaries.
segments(-5, -4, -5, 4, lty = 5, col = "blue")
segments(5, -4, 5, 4, lty = 5, col = "blue")
text(-0.2, 1, "lambda = 5", cex = 1.5, col = "red")

## Lasso by coordinate descent.
## X, y    : design matrix and response (centered/standardized internally).
## lambda  : L1 penalty weight.
## beta    : initial coefficient vector (enables warm starts).
## Returns list(beta, beta.0) on the ORIGINAL variable scales.
linear.lasso <- function(X, y, lambda = 0, beta = rep(0, ncol(X))) {
  n <- nrow(X)
  p <- ncol(X)
  res <- centralize(X, y)   ## center y, center/standardize X (see below)
  X <- res$X
  y <- res$y
  beta.old <- beta
  ## Cycle through the coordinates until the largest update is small.
  repeat {
    for (j in 1:p) {
      ## Partial residual with predictor j removed ...
      r <- y - as.matrix(X[, -j]) %*% beta[-j]
      ## ... then soft-threshold its univariate least-squares update.
      beta[j] <- soft.th(lambda, sum(r * X[, j]) / n) / (sum(X[, j] * X[, j]) / n)
    }
    eps <- max(abs(beta - beta.old))
    beta.old <- beta
    if (eps <= 0.001) break
  }
  beta <- beta / res$X.sd   ## undo the column standardization
  beta.0 <- res$y.bar - sum(res$X.bar * beta)
  list(beta = beta, beta.0 = beta.0)
}
## Center the columns of X and the response y; optionally scale each column
## of X by its sample standard deviation. Returns the transformed data plus
## the statistics needed to undo the transformation:
##   list(X, y, X.bar = column means, X.sd = column sds, y.bar = mean(s) of y)
centralize <- function(X, y, standardize = TRUE) {
  X <- as.matrix(X)
  p <- ncol(X)
  X.bar <- array(dim = p)  # column means of X
  X.sd <- array(dim = p)   # sds of the centered columns (n - 1 denominator)
  for (j in seq_len(p)) {
    X.bar[j] <- mean(X[, j])
    centered <- X[, j] - X.bar[j]
    X.sd[j] <- sqrt(var(centered))
    ## Standardize only when requested; otherwise keep the centered column.
    X[, j] <- if (standardize) centered / X.sd[j] else centered
  }
  if (is.matrix(y)) {
    ## Multi-response case: center every column of y.
    y.bar <- as.vector(colMeans(y))
    y <- sweep(y, 2, y.bar)
  } else {
    ## Single response vector.
    y.bar <- mean(y)
    y <- y - y.bar
  }
  list(X = X, y = y, X.bar = X.bar, X.sd = X.sd, y.bar = y.bar)
}

例2

## Example 2: lasso coefficient paths for the crime data.
## NOTE(review): assumes crime.txt column 1 is the response and columns
## 3-7 are the predictors -- confirm against the data's documentation.
df <- read.table("crime.txt")
x <- df[, 3:7]
y <- df[, 1]
p <- ncol(x)
## Dense grid of penalty values for the paths.
lambda.seq <- seq(0, 200, 0.1)
## Empty frame (type = "n"); the paths are drawn with lines() below.
plot(lambda.seq, xlim = c(0, 200), ylim = c(-10, 20), xlab = "lambda", ylab = "beta",
     main = "各 lambda についての各係数の値", type = "n", col = "red")
r <- length(lambda.seq)
coef.seq <- array(dim = c(r, p))
## Refit the lasso at every lambda on the grid (no warm starts here).
for (i in 1:r)
  coef.seq[i, ] <- linear.lasso(x, y, lambda.seq[i])$beta
## One colored path per coefficient.
for (j in 1:p) {
  par(new = TRUE)
  lines(lambda.seq, coef.seq[, j], col = j)
}
legend("topright",
       legend = c("警察への年間資金", "25 歳以上で高校を卒業した人の割合",
                  "16-19 歳で高校に通っていない人の割合", "18-24 歳で大学生の割合",
                  "25 歳以上で 4 年制大学を卒業した人の割合"),
       col = 1:p, lwd = 2, cex = .8)

## Pathwise lasso with warm starts: solve the lasso on a decreasing grid
## of lambda values, initializing each fit at the previous (larger-lambda)
## solution so the coordinate descent converges quickly.
##
## X, y       : design matrix and response, forwarded to linear.lasso().
## lambda.max : largest penalty on the grid; the grid decreases to 1 in
##              steps of roughly lambda.max / 50.
## Returns an r x p matrix; row k holds the coefficients for lambda.seq[k].
warm.start <- function(X, y, lambda.max = 100) {
  ## Guard with max(1, .): for lambda.max <= 25, round(lambda.max / 50)
  ## is 0 and seq() would fail with an invalid zero increment.
  dec <- max(1, round(lambda.max / 50))
  lambda.seq <- seq(lambda.max, 1, -dec)
  r <- length(lambda.seq)
  p <- ncol(X)
  coef.seq <- matrix(nrow = r, ncol = p)
  ## First fit starts from the default initial value (all zeros).
  coef.seq[1, ] <- linear.lasso(X, y, lambda.seq[1])$beta
  ## Remaining fits are warm-started at the previous row. The r > 1 guard
  ## avoids 2:r running backwards (and out of bounds) when the grid has a
  ## single lambda.
  if (r > 1) {
    for (k in 2:r)
      coef.seq[k, ] <- linear.lasso(X, y, lambda.seq[k], coef.seq[(k - 1), ])$beta
  }
  return(coef.seq)
}

例3

## Example 3: lasso paths via warm starts, plotted against log(lambda).
crime <- read.table("crime.txt")
X <- crime[, 3:7]
y <- crime[, 1] 
## Coefficient matrix: one row per lambda on warm.start's internal grid.
coef.seq <- warm.start(X, y, 200)
p <- ncol(X)
## Rebuild the same lambda grid that warm.start used internally.
lambda.max <- 200
dec <- round(lambda.max / 50)
lambda.seq <- seq(lambda.max, 1, -dec)
## Empty frame sized to the coefficient range; paths drawn below.
plot(log(lambda.seq), coef.seq[, 1], xlab = "log(lambda)", ylab = "係数",
     ylim = c(min(coef.seq), max(coef.seq)), type = "n")
for (j in 1:p)
  lines(log(lambda.seq), coef.seq[, j], col = j)

例4

## Example 4: the same coefficient-path plot using the glmnet package
## on the Boston housing data (response: column 14, medv).
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.3
## Loading required package: Matrix
## Loaded glmnet 4.0-2
library(MASS)
df <- Boston
x <- as.matrix(df[, 1:13])
y <- df[, 14]
## Fit the full lasso path and plot it against log(lambda).
fit <- glmnet(x, y)
plot(fit, xvar = "lambda", main = "BOSTON")

1.4 Ridge

## Ridge regression with penalty weight lambda.
## The data are centered/standardized via centralize(); the closed-form
## estimate is computed on that scale and then mapped back, so the returned
## list(beta, beta.0) refers to the original variables.
ridge <- function(X, y, lambda = 0) {
  X <- as.matrix(X)
  n <- length(y)
  p <- ncol(X)
  scaled <- centralize(X, y)   ## center y, center/standardize X
  Xs <- scaled$X
  ys <- scaled$y
  ## The whole ridge computation is this single closed-form expression.
  beta <- drop(solve(t(Xs) %*% Xs + n * lambda * diag(p)) %*% t(Xs) %*% ys)
  beta <- beta / scaled$X.sd   ## undo the column standardization
  beta.0 <- scaled$y.bar - sum(scaled$X.bar * beta)
  list(beta = beta, beta.0 = beta.0)
}

例5

## Example 5: ridge coefficient paths for the crime data (compare with the
## lasso paths of Example 2 -- ridge shrinks but never zeroes coefficients).
df <- read.table("crime.txt")
x <- df[, 3:7]
y <- df[, 1]
p <- ncol(x)
## Penalty grid for the paths.
lambda.seq <- seq(0, 100, 0.1)
## Empty frame (type = "n"); paths are drawn with lines() below.
plot(lambda.seq, xlim = c(0, 100), ylim = c(-10, 20), xlab = "lambda", ylab = "beta",
     main = "各 lambda についての各係数の値", type = "n", col = "red") 
r <- length(lambda.seq)
coef.seq <- array(dim = c(r, p))
## Refit the ridge estimate at every lambda on the grid.
for (i in 1:r)
  coef.seq[i, ] = ridge(x, y, lambda.seq[i])$beta
## One colored path per coefficient.
for (j in 1:p) {
  par(new = TRUE)
  lines(lambda.seq, coef.seq[, j], col = j)
}
legend("topright",
       legend = c("警察への年間資金", "25 歳以上で高校を卒業した人の割合",
                  "16-19 歳で高校に通っていない人の割合", "18-24 歳で大学生の割合",
                  "25 歳以上で 4 年制大学を卒業した人の割合"),
       col = 1:p, lwd = 2, cex = .8)

## Compare OLS with ridge on the crime data. With lambda = 0 ridge
## reproduces the OLS estimates; with lambda = 200 the coefficients are
## heavily shrunk toward zero. The ## lines are pasted console output.
crime <- read.table("crime.txt")
X <- crime[, 3:7]
y <- crime[, 1]
linear(X, y)
## $beta
## [1] 10.9806703 -6.0885294  5.4803042  0.3770443  5.5004712
## 
## $beta.0
## [1] 489.6486
ridge(X, y)
## $beta
## [1] 10.9806703 -6.0885294  5.4803042  0.3770443  5.5004712
## 
## $beta.0
## [1] 489.6486
ridge(X, y, 200)
## $beta
## [1]  0.055231573 -0.019372823  0.076321840 -0.016784731 -0.006907194
## 
## $beta.0
## [1] 716.4355

1.5 LassoとRidgeを比較して

例6

## Coefficient of determination (R^2) for the least-squares regression
## of y on x: 1 - RSS / TSS.
R2 <- function(x, y) {
  fitted <- lm(y ~ x)$fitted.values
  resid.ss <- sum((y - fitted) ^ 2)       # residual sum of squares
  total.ss <- sum((y - mean(y)) ^ 2)      # total sum of squares
  1 - resid.ss / total.ss
}

## Variance inflation factor of each column of x: 1 / (1 - R^2) from the
## regression of that column on all remaining columns.
vif <- function(x) {
  one.out <- function(j) 1 / (1 - R2(x[, -j], x[, j]))
  vapply(seq_len(ncol(x)), one.out, numeric(1))
}

## Example 6: VIFs for every column of the Boston data (a VIF well above 1
## indicates collinearity with the other columns). ## lines are pasted output.
library(MASS)
x <- as.matrix(Boston)
vif(x)
##  [1] 1.831537 2.352186 3.992503 1.095223 4.586920 2.260374 3.100843 4.396007
##  [9] 7.808198 9.205542 1.993016 1.381463 3.581585 3.855684

例7

## Example 7: lasso with strongly correlated predictors. Columns 1-3 are
## noisy copies of latent z1, columns 4-6 of latent z2; the response depends
## only on z1 and z2. (Uses rnorm, so results vary run to run.)
n <- 500
x <- array(dim = c(n, 6))
z <- array(dim = c(n, 2))
for (i in 1:2)
  z[, i] <- rnorm(n)
y <- 3 * z[, 1] - 1.5 * z[, 2] + 2 * rnorm(n)
## Each predictor = latent factor + small noise, so groups are correlated.
for (j in 1:3)
  x[, j] <- z[, 1] + rnorm(n) / 5
for (j in 4:6)
  x[, j] <- z[, 2] + rnorm(n) / 5
glm.fit <- glmnet(x, y)
plot(glm.fit)
legend("topleft", legend = c("X1", "X2", "X3", "X4", "X5", "X6"), col = 1:6, lwd = 2, cex = .8)

1.6 elasticネット

## Elastic-net by coordinate descent (redefines the earlier linear.lasso):
## alpha = 1 gives the lasso, alpha = 0 gives ridge, values in between mix
## the L1 and L2 penalties. Coefficients are returned on the original scales.
linear.lasso <- function(X, y, lambda = 0, beta = rep(0, ncol(X)), alpha = 1) {
  X <- as.matrix(X)
  n <- nrow(X)
  p <- ncol(X)
  X.bar <- array(dim = p)   # column means, kept for the intercept
  scale <- array(dim = p)   # per-column scale factors, undone at the end
  ## Center each column and scale it to unit (1/n) norm.
  for (j in seq_len(p)) {
    X.bar[j] <- mean(X[, j])
    X[, j] <- X[, j] - X.bar[j]
    scale[j] <- sqrt(sum(X[, j] ^ 2) / n)
    X[, j] <- X[, j] / scale[j]
  }
  y.bar <- mean(y)
  y <- y - y.bar
  beta.old <- beta
  ## Coordinate descent until the largest single-coefficient change is small.
  repeat {
    for (j in seq_len(p)) {
      ## Partial residual without predictor j, then the elastic-net update:
      ## soft-threshold at lambda*alpha, damp by the L2 term lambda*(1-alpha).
      r <- y - as.matrix(X[, -j]) %*% beta[-j]
      beta[j] <- soft.th(lambda * alpha, sum(r * X[, j]) / n) /
        (sum(X[, j] * X[, j]) / n + lambda * (1 - alpha))
    }
    eps <- max(abs(beta - beta.old))
    beta.old <- beta
    if (eps <= 0.001) break
  }
  beta <- beta / scale      ## back to the original variable scales
  beta.0 <- y.bar - sum(X.bar * beta)
  list(beta = beta, beta.0 = beta.0)
}

1.7 lambdaの値の設定

例9

## Example 9: choose lambda by cross-validation with cv.glmnet on the
## crime data, then refit at the CV-optimal lambda. ## lines are output.
library(glmnet)
df <- read.table("crime.txt")
X <- as.matrix(df[, 3:7])
y <- as.vector(df[, 1])
## K-fold CV over glmnet's lambda path; plot shows CV error vs log(lambda).
cv.fit <- cv.glmnet(X, y)
plot(cv.fit)

## Lambda minimizing the cross-validated error.
lambda.min <- cv.fit$lambda.min
lambda.min
## [1] 21.99244
## Refit at that single lambda; two coefficients are zeroed out.
fit <- glmnet(X, y, lambda = lambda.min)
fit$beta
## 5 x 1 sparse Matrix of class "dgCMatrix"
##           s0
## V3  9.497979
## V4 -2.309352
## V5  3.209057
## V6  .       
## V7  .

例10

## Example 10: grid-search alpha (and lambda via CV) for the elastic net on
## the correlated-predictor data of Example 7. Random data: exact numbers in
## the pasted output below will vary between runs.
n <- 500
x <- array(dim = c(n, 6))
z <- array(dim = c(n, 2))
for (i in 1:2)
  z[, i] <- rnorm(n)
y <- 3 * z[, 1] - 1.5 * z[, 2] + 2 * rnorm(n)
## Two groups of correlated predictors built from the latent z1, z2.
for (j in 1:3)
  x[, j] <- z[, 1] + rnorm(n) / 5
for (j in 4:6)
  x[, j] <- z[, 2] + rnorm(n) / 5
## Keep the (alpha, lambda) pair with the smallest cross-validated error.
best.score <- Inf
for (alpha in seq(0, 1, 0.01)) {
  res <- cv.glmnet(x, y, alpha = alpha)
  lambda <- res$lambda.min
  min.cvm <- min(res$cvm)
  if (min.cvm < best.score) {
    alpha.min <- alpha
    lambda.min <- lambda
    best.score <- min.cvm
  }
}
alpha.min
## [1] 0.07
lambda.min
## [1] 0.1928796
## Coefficients at the selected (alpha, lambda): similar values within each
## correlated group, unlike the pure lasso.
glmnet(x, y, alpha = alpha.min, lambda = lambda.min)$beta
## 6 x 1 sparse Matrix of class "dgCMatrix"
##            s0
## V1  1.0566878
## V2  1.0194753
## V3  0.9797196
## V4 -0.5967331
## V5 -0.4250913
## V6 -0.4287616
## Full path at the selected alpha.
glm.fit <- glmnet(x, y, alpha = alpha.min)
plot(glm.fit)