第3章 リサンプリング

第2章より

knn.1 <- function(x, y, z, k) {
  x <- as.matrix(x)
  n <- nrow(x)
  p <- ncol(x)
  dis <- array(dim = n)
  for (i in 1:n)
    dis[i] <- norm(z - x[i, ], "2")
  S <- order(dis)[1:k]  # 距離の小さいk個の添え字iの集合
  u <- sort(table(y[S]), decreasing = TRUE)  # k個の中の最頻のy[i]と頻度

  # タイブレーキングの処理
  while (length(u) > 1 && u[1] == u[2]) {
    k <- k - 1
    S <- order(dis)[1:k]
    u <- sort(table(y[S]), decreasing = TRUE)
  }

  return(names(u)[1])
}
knn <- function(x, y, z, k) {
  n <- nrow(z)
  w <- array(dim = n)
  for (i in 1:n)
    w[i] <- knn.1(x, y, z[i, ], k)
  return(w)
}

3.1 クロスバリデーション

cv.linear <- function(X, y, k) {
  n <- length(y)
  m <- n / k  # kはnを割り切ることを仮定
  S <- 0
  for (j in 1:k) {
    # n組のデータのうち,どのデータをテスト用に使うかを指定
    test <- ((j - 1) * m + 1):(j * m)
    # テストで用いるデータ以外で,係数betaを推定する
    beta <- solve(t(X[-test, ]) %*% X[-test, ]) %*% t(X[-test, ]) %*% y[-test]
    # 係数betaに,テスト用のデータをあてはめて評価する
    e <- y[test] - X[test, ] %*% beta
    S <- S + drop(t(e) %*% e)
  }
  return(S / n)
}

例39

# データ生成
n <- 100
p <- 5
X <- matrix(rnorm(n * p), ncol = p)
X <- cbind(1, X)
beta <- rnorm(p + 1)
beta[c(2, 3)] <- 0
eps <- rnorm(n)
y <- X %*% beta + eps

# クロスバリデーションによる評価
cv.linear(X[, c(1, 4, 5, 6)], y, 10)
## [1] 1.154165
cv.linear(X[, c(1, 2, 3, 4)], y, 10)
## [1] 1.690943
cv.linear(X, y, 10)
## [1] 1.159602
n <- 100
p <- 5
X <- matrix(rnorm(n * p), ncol = p)
X <- cbind(1, X)
beta <- rnorm(p + 1)
beta[c(2, 3)] <- 0
U <- NULL
V <- NULL
for (j in 1:100) {
  eps <- rnorm(n)
  y <- X %*% beta + eps
  U <- c(U, cv.linear(X[, c(1, 4, 5, 6)], y, 10))
  V <- c(V, cv.linear(X, y, 10))
}
plot(U, V,
     xlab = "cv.linear(X[, c(1, 4, 5)], y, 10)",
     ylab = "cv.linear(X, y, 10)",
     main = "変数を多く選びすぎて過学習")
abline(a = 0, b = 1, col = "red")

例40

# データ生成
plot(0, 0, xlab = "k", ylab = "CVの値",
     xlim = c(2, n), ylim = c(0.3, 1.5), type = "n")
for (j in 2:11) {
  n <- 100
  p <- 5
  X <- matrix(rnorm(n * p), ncol = p)
  X <- cbind(1, X)
  beta <- rnorm(p + 1)
  eps <- rnorm(n)
  y <- X %*% beta + eps
  U <- NULL
  V <- NULL
  for (k in 2:n) {
    if (n %% k == 0) {
      U <- c(U, k)
      V <- c(V, cv.linear(X, y, k))
    }
  }
  lines(U, V, col = j)
}

例41

df <- iris
df <- df[sample(1:150, 150, replace = FALSE), ]
n <- nrow(df)
U <- NULL
V <- NULL
for (k in 1:10) {
  top.seq <- 1 + seq(0, 135, 15)
  S <- 0
  for (top in top.seq) {
    index <- top:(top + 14)
    knn.ans <- knn(df[-index, 1:4], df[-index, 5], df[index, 1:4], k)
    ans <- df[index, 5]
    S <- S + sum(knn.ans != ans)
  }
  S <- S / n
  U <- c(U, k)
  V <- c(V, S)
}
plot(0, 0, type = "n", xlab = "k", ylab = "誤り率",
     xlim = c(1, 10), ylim = c(0, 0.1), main = "cvによる誤り率の評価")
lines(U, V, col = "red")

3.2 線形回帰の場合の公式

cv.fast <- function(X, y, k) {
  n <- length(y)
  m <- n / k
  H <- X %*% solve(t(X) %*% X) %*% t(X)
  I <- diag(rep(1, n))
  e <- (I - H) %*% y
  I <- diag(rep(1, m))
  S <- 0
  for (j in 1:k) {
    test <- ((j - 1) * m + 1):(j * m)
    S <- S + norm(solve(I - H[test, test]) %*% e[test], "2") ^ 2
  }
  return(S / n)
}

例42

## データ生成
n <- 1000
p <- 5
beta <- rnorm(p + 1)
x <- matrix(rnorm(n * p), ncol = p)
X <- cbind(rep(1, n), x)
y <- X %*% beta + rnorm(n)

plot(0, 0, xlab = "k", ylab = "実行時間",
     xlim = c(2, n), ylim = c(0, 0.5), type = "n")
U <- NULL
V <- NULL
for (k in 10:n) {
  if (n %% k == 0) {
    t <- proc.time()[3]
    cv.fast(X, y, k)
    U <- c(U, k)
    V <- c(V, (proc.time()[3] - t))
  }
}
lines(U, V, col = "blue")
U <- NULL
V <- NULL
for (k in 10:n) {
  if (n %% k == 0) {
    t <- proc.time()[3]
    cv.linear(X, y, k)
    U <- c(U, k)
    V <- c(V, (proc.time()[3] - t))
  }
}
lines(U, V, col = "red")
legend("topleft", legend = c("cv.linear", "cv.fast"),
       col = c("red", "blue"), lty = 1)

3.3 ブートストラップ

bt <- function(df, f, r) {
  m <- nrow(df)
  org <- f(df, 1:m)
  u <- array(dim = r)
  for (j in 1:r) {
    index <- sample(1:m, m, replace = TRUE)
    u[j] <- f(df, index)
  }
  return(list(original = org, bias = mean(u) - org, stderr = sd(u)))
}

例43

func.1 <- function(data, index) {
  X <- data$X[index]
  Y <- data$Y[index]
  return((var(Y) - var(X)) / (var(X) + var(Y) - 2 * cov(X, Y)))
}

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
bt(Portfolio, func.1, 1000)
## $original
## [1] 0.1516641
## 
## $bias
## [1] 0.01280344
## 
## $stderr
## [1] 0.1841733

例44

df <- read.table("crime.txt")
for (j in 1:3) {
  func.2 <- function(data, index) {
    return(coef(lm(V1 ~ V3 + V4, data = data, subset = index))[j])
  }
  print(bt(df, func.2, 1000))
}
## $original
## (Intercept) 
##     621.426 
## 
## $bias
## (Intercept) 
##    35.38289 
## 
## $stderr
## [1] 212.9903
## 
## $original
##       V3 
## 11.85833 
## 
## $bias
##        V3 
## -0.697993 
## 
## $stderr
## [1] 3.480477
## 
## $original
##        V4 
## -5.973412 
## 
## $bias
##         V4 
## -0.2838705 
## 
## $stderr
## [1] 3.263297
summary(lm(V1 ~ V3 + V4, data = df))
## 
## Call:
## lm(formula = V1 ~ V3 + V4, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -356.90 -162.92  -60.86  100.69  784.30 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  621.426    222.685   2.791  0.00758 ** 
## V3            11.858      2.568   4.618 3.02e-05 ***
## V4            -5.973      3.561  -1.677  0.10013    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 246.6 on 47 degrees of freedom
## Multiple R-squared:  0.3247, Adjusted R-squared:  0.296 
## F-statistic:  11.3 on 2 and 47 DF,  p-value: 9.838e-05