第3章 リサンプリング(問題32~39)

35

n <- 1000
p <- 10
x <- matrix(rnorm(n * p), nrow = n, ncol = p)
x <- cbind(rep(1, n), x)
beta <- rnorm(p + 1)
y <- x %*% beta + rnorm(n) * 0.2

# 通常のクロスバリデーションによる方法
cv.linear <- function(X, y, k) {
  n <- length(y)
  m <- n / k
  S <- 0
  for (j in 1:k) {
    test <- ((j - 1) * m + 1):(j * m)
    beta <- ## 空欄(1) ##
    e <- y[test] - X[test, ] %*% beta
    S <- S + drop(t(e) %*% e)
  }
  return(S/n)
}

# 公式を用いた方法
cv.fast <- function(X, y, k) {
  n <- length(y)
  m <- n / k
  H <- X %*% solve(t(X) %*% X) %*% t(X)
  I <- diag(rep(1, n))
  e <- (I - H) %*% y
  I <- diag(rep(1, m))
  sum=0
  for (j in 1:k) {
    test <- ((j - 1) * m + 1):(j * m)
    sum <- sum + ## 空欄(2) ##
  }
  return(sum / n)
}
plot(0, 0, xlab = "k", ylab = "実行時間",
     xlim = c(2, n), ylim = c(0, 0.5), type = "n")
U <- NULL
V <- NULL
for (k in 10:n) {
  if (n %% k == 0) {
    t <- proc.time()[3]
    cv.fast(x, y, k)
    U <- c(U, k)
    V <- c(V, (proc.time()[3] - t))
  }
}
## 空欄 数行 ##
legend("topleft", legend = c("cv.linear", "cv.fast"),
       col = c("red", "blue"), lty = 1)

36

# データ生成
n <- 100
p <- 5
plot(0, 0, xlab = "k", ylab = "CVの値",
     xlim = c(2, n), ylim = c(0.3, 1.5), type = "n")
for (j in 2:11) {
  X <- matrix(rnorm(n * p), ncol = p)
  X <- cbind(1, X)
  beta <- rnorm(p + 1)
  eps <- rnorm(n)
  y <- X %*% beta + eps
  U <- NULL
  V <- NULL
  for (k in 2:n) {
    if (n %% k == 0) {
      ## 空欄 ##
    }
  }
  lines(U, V, col = j)
}

37

df <- iris
df <- df[sample(1:150, 150, replace = FALSE), ]
n <- nrow(df)
U <- NULL
V <- NULL
for (k in 1:10) {
  top.seq <- 1 + seq(0, 135, 10)
  S <- 0
  for (top in top.seq) {
    index <- ## 空欄(1) ##
    knn.ans <- knn(df[-index, 1:4], df[-index, 5], df[index, 1:4], k)
    ans <- ## 空欄(2) ##
    S <- S + sum(knn.ans != ans)
  }
  S <- S / n
  U <- c(U, k)
  V <- c(V, S)
}
plot(0, 0, type = "n", xlab = "k", ylab = "誤り率",
     xlim = c(1, 10), ylim = c(0, 0.1), main = "cvによる誤り率の評価")
lines(U, V, col = "red")

38

func.1 <- function(data, index) {
  X <- data$X[index]
  Y <- data$Y[index]
  return((var(Y) - var(X)) / (var(X) + var(Y) - 2 * cov(X, Y)))
}

bt <- function(df, func, r) {
  m <- nrow(df)
  org <- ## 空欄(1) ##
  u <- array(dim = r)
  for (j in 1:r) {
    index <- sample(## 空欄(2) ##)
    u[j] <- func.1(df, index)
  }
  return(list(original = org, bias = mean(u) - org, stderr = sd(u)))
}

library(ISLR)

bt(Portfolio, func.1, 1000)  # 実行例

39

df <- read.table("crime.txt")
for (j in 1:3) {
  func.2 <- function(data, index) {
    return(coef(lm(V1 ~ V3 + V4,
                   data = ## 空欄(1) ##, subset = ## 空欄(2) ##))[j])
  }
  print(bt(df, func.2, 1000))
}
summary(lm(V1 ~ V3 + V4, data = df))