第6章 非線形回帰(問題57~68)

59

# 乱数によるデータの生成
n <- 100
x <- rnorm(n) * 2 * pi
y <- sin(x) + 0.2 * rnorm(n)

# 区切り点の属性
col.set <- c("red", "green", "blue")
K.set <- c(5, 7, 9)

for (k in 1:3) {
  K <- K.set[k]
  knots <- seq(-2 * pi, 2 * pi, length = K)
  X <- matrix(nrow = n, ncol = K + 4)
  for (i in 1:n) {
    X[i, 1] <- 1
    X[i, 2] <- x[i]
    X[i, 3] <- x[i] ^ 2
    X[i, 4] <- x[i] ^ 3
    for (j in 1:K)
      X[i, j + 4] <- max((x[i] - knots[j]) ^ 3, 0)
  }
  beta <- solve(t(X) %*% X) %*% t(X) %*% y  # betaの推定

  f <- function(x) {
    ## 空欄[関数fの定義] ##
  }

  u.seq <- seq(-5, 5, 0.02)
  v.seq <- NULL
  for (u in u.seq)
    v.seq <- c(v.seq, f(u))
  plot(u.seq, v.seq, type = "l", col = col.set[k], yaxt = "n",
       xlab = "x", ylab = "f(x)")
  par(new = TRUE)
}
legend(-2.2, 1, paste0("K = ", K.set), lty = 1, col = col.set)
points(x, y)

61

# データ生成
n <- 100
x <- rnorm(n) * 2 * pi
y <- sin(x) + 0.2 * rnorm(n)

K <- 11
knots <- seq(-5, 5, length = K)
X <- matrix(nrow = n, ncol = K + 4)
for (i in 1:n) {
  X[i, 1] <- 1
  X[i, 2] <- x[i]
  X[i, 3] <- x[i] ^ 2
  X[i, 4] <- x[i] ^ 3
  for (j in 1:K)
    X[i, j + 4] <- max((x[i] - knots[j]) ^ 3, 0)
}
beta <- solve(t(X) %*% X) %*% t(X) %*% y

# スプラインの関数
f <- function(x) {
  S <- beta[1] + beta[2] * x + beta[3] * x ^ 2 + beta[4] * x ^ 3
  for (j in 1:K)
    S <- S + beta[j + 4] * max((x - knots[j]) ^ 3, 0)
  return(S)
}

d <- function(j, x, knots) {
  ## 空欄[関数dの定義] ##
}

h <- function(j, x, knots) {
  ## 空欄[関数hの定義] ##
}

X <- matrix(nrow = n, ncol = K)
X[, 1] <- 1
for (j in 2:K)
  for (i in 1:n)
    X[i, j] <- h(j, x[i], knots)
gamma <- solve(t(X) %*% X) %*% t(X) %*% y

# 自然なスプラインの関数
g <- function(x){
  ## 空欄[関数gの定義] ##
}

# 関数をグラフとして描画
u.seq <- seq(-6, 6, 0.02)
v.seq <- NULL
for (u in u.seq)
  v.seq <- c(v.seq,f(u))
plot(u.seq, v.seq, type = "l", col = "blue", yaxt = "n",
     xlab = "x", ylab = "f(x), g(x)")
par(new = TRUE)
w.seq <- NULL
for (u in u.seq)
  w.seq <- c(w.seq, g(u))
plot(u.seq, w.seq, type = "l", col = "red", yaxt = "n", xlab = "", ylab = "")
par(new = TRUE)
legend(-3.7, 1.1, c("スプライン", "自然なスプライン"),
       lty = 1, col = c("blue", "red"))
points(x, y)
abline(v = knots, lty = 3)
abline(v = c(-5, 5), lwd = 2)
title("K = 11")

64

# データ生成
n <- 100
x <- runif(n, -5, 5)
y <- x + sin(x) * 2 + rnorm(n)

index <- order(x)
x <- x[index]
y <- y[index]
X <- matrix(nrow = n, ncol = n)
X[, 1] <- 1

# 行列Xの生成
for (j in 2:n)
  for (i in 1:n)
    X[i, j] <- h(j, x[i], x)

# 行列Gの生成
GG <- G(x)

lambda.set <- c(1, 30, 80)
col.set <- c("red", "blue", "green")
for (i in 1:3) {
  lambda <- lambda.set[i]
  gamma <- ## 空欄 ##

  g <- function(u) {
    S <- gamma[1]
    for (j in 2:n)
      S <- S + gamma[j] * h(j, u, x)
    return(S)
  }

  u.seq <- seq(-8, 8, 0.02)
  v.seq <- NULL
  for (u in u.seq)
    v.seq <- c(v.seq, g(u))
  plot(u.seq, v.seq, type = "l", yaxt = "n",
       xlab = "x", ylab = "g(x)", ylim = c(-8, 8), col = col.set[i])
  par(new = TRUE)
}
points(x, y)
legend("topleft", paste0("lambda = ", lambda.set), col = col.set, lty = 1)
title("平滑化スプライン (n = 100)")

65

cv.ss.fast <- function(X, y, lambda, G, k) {
  n <- length(y)
  m <- n / k
  H <- X %*% solve(t(X) %*% X + lambda * G) %*% t(X)
  df <- ## 空欄(1) ##
  I <- diag(rep(1, n))
  e <- (I - H) %*% y
  I <- diag(rep(1, m))
  S <- 0
  for (j in 1:k) {
    test <- ((j - 1) * m + 1):(j * m)
    S <- S + norm(solve(I - H[test, test]) %*% e[test], "2") ^ 2
  }
  return(list(score = S / n, df = df))
}

n <- 100
x <- runif(n, -5, 5)
y <- x - 0.02 * sin(x) - 0.1 * rnorm(n)
index <- order(x)
x <- x[index]
y <- y[index]
X <- matrix(nrow = n, ncol = n)
X[, 1] <- 1
for (j in 2:n)
  for (i in 1:n)
    X[i, j] <- h(j, x[i], x)
GG <- G(x)
u <- seq(1, 50)
v <- NULL
w <- NULL
for (lambda in u) {
  result <- cv.ss.fast(## 空欄(2) ##, n)
  v <- c(v, result$df)
  w <- c(w, result$score)
}
plot(v, w, type = "l", col = "red",
     xlab = "有効自由度", ylab = "CVによる予測誤差")
title("有効自由度とCVによる予測誤差")

66

# データ生成
n <- 250
x <- 2 * rnorm(n)
y <- sin(2 * pi * x) + rnorm(n) / 4

D <- function(t) {
  ## 空欄(1) ##
}

K <- function(x, y, lambda) {
  ## 空欄(2) ##
}

# 関数定義f
f <- function(z, lambda) {
  S <- 0
  T <- 0
  for (i in 1:n) {
    S <- S + K(x[i], z, lambda) * y[i]
    T <- T + K(x[i], z, lambda)
  }
  return(S / T)
}

# lambda = 0.05, 0.25の曲線を図示
plot(seq(-3, 3, length = 10), seq(-2, 3, length = 10),
     type = "n", xlab = "x", ylab = "y")
points(x, y)
xx <- seq(-3, 3, 0.1)
yy <- NULL
for (zz in xx)
  yy <- c(yy, f(zz, 0.05))
lines(xx, yy, col = "green")
yy <- NULL
for (zz in xx)
  yy <- c(yy, f(zz, 0.25))
lines(xx, yy, col = "blue")

# 最適なlambdaの値を計算
m <- n / 10
lambda.seq <- seq(0.05, 1, 0.01)
SS.min <- Inf
for (lambda in lambda.seq) {
  SS <- 0
  for (k in 1:10) {
    test <- ((k - 1) * m + 1):(k * m)
    train <- setdiff(1:n, test)
    for (j in test) {
      u <- 0
      v <- 0
      for (i in train) {
        kk <- K(x[i], x[j], lambda)
        u <- u + kk * y[i]
        v <- v + kk
      }
      if (v == 0) {
        index <- order(abs(x[j] - x[-j]))[1]
        z <- y[index]
      } else {
        z <- u / v
      }
      SS <- SS + (y[j] - z) ^ 2
    }
  }
  if (SS < SS.min) {
    SS.min <- SS
    lambda.best <- lambda
  }
}

yy <- NULL
for (zz in xx)
  yy <- c(yy, f(zz, lambda.best))
lines(xx, yy, col = "red")
title("Nadaraya-Watson推定量")
legend("topleft", legend = paste0("lambda = ", c(0.05, 0.25, "lambda.best")),
       lwd = 1, col = c("green", "blue", "red"))

67

local <- function(x, y, z = x) {
  X <- cbind(rep(1, n), x)
  yy <- NULL
  beta.hat <- array(dim = 2)
  for (u in z) {
    w <- array(dim = n)
    for (i in 1:n)
      w[i] <- K(x[i], u, lambda = 1)
    W <- ## 空欄(1) ##
    beta.hat <- ## 空欄(2) ##
    yy <- c(yy, beta.hat[1] + beta.hat[2] * u)
  }
  return(yy)
}

# データ生成
n <- 30
x <- runif(n) * 2 * pi - pi
y <- sin(x) + rnorm(n)
plot(x, y)

m <- 200
U <- seq(-pi, pi, pi / m)
V <- local(x, y, U)
lines(U, V, col = "red", type = "l")
title("局所線形回帰 (p = 1, N = 30)")

68

poly <- function(x, y, z = x) {
  n <- length(x)
  m <- length(z)
  X <- cbind(rep(1, n), x, x^2, x^3)
  yy <- array(dim = n)
  beta.hat <- array(dim = 4)
  beta.hat <- solve(t(X) %*% X) %*% t(X) %*% y
  X <- cbind(rep(1, m), z, z^2, z^3)
  yy <- ## 空欄(1) ##
  return(yy)
}

# データ生成
n <- 30
x <- runif(n) * 2 * pi - pi
y <- sin(x) + rnorm(n)
plot(x, y)

y.1 <- 0
y.2 <- 0
for (k in 1:10) {
  y.1 <- poly(x, y - y.2)
  y.2 <- ## 空欄(2) ##
}
z <- seq(-2, 2, 0.1)
par(mfrow = c(1, 2))
plot(z, poly(x, y.1, z), type = "l", xlab = "x", ylab = "f(x)",
     main = "多項式回帰(3次)", col = "green")
plot(z, local(x, y.2, z), type = "l", xlab = "x", ylab = "f(x)",
     main = "局所線形回帰", col = "green")