第4章カーネル計算の実際

4.1 カーネルRidge回帰

alpha <- function(k, x, y) {
  n <- length(x)
  K <- matrix(0, n, n)
  for (i in 1:n)
    for (j in 1:n)
      K[i, j] <- k(x[i], x[j])
  return(solve(K + 10^(-5)*diag(n)) %*% y)  # K に 10^(-5)I を加えて正則にする
}

例63

k.p <- function(x, y) {  # カーネル定義
  return((sum(x*y)+1)^3)
}

k.g <- function(x, y) {  # カーネル定義
  return(exp(-(x-y)^2/2))
}

lambda <- 0.1

# データ生成
n <- 50
x <- rnorm(n)
y <- 1 + x + x^2 + rnorm(n)

alpha.p <- alpha(k.p, x, y)
alpha.g <- alpha(k.g, x, y)
z <- sort(x)
u <- array(n)
v <- array(n)
for (j in 1:n) {
  S <- 0
  for (i in 1:n)
    S <- S + alpha.p[i] * k.p(x[i], z[j])
  u[j] <- S
  S <- 0
  for (i in 1:n)
    S <- S + alpha.g[i] * k.g(x[i], z[j])
  v[j] <- S
}
plot(z, u, type = "l", xlim = c(-1, 1), xlab = "x", ylab = "y", ylim = c(-1, 5),
     col = "red", main = "カーネル回帰")
lines(z, v, col = "blue")
points(x, y)
legend("topleft", legend = c("多項式カーネル", "Gauss カーネル"), 
       col = c("red", "blue"), lty = 1)

alpha <- function(k, x, y) {
  n <- length(x)
  K <- matrix(0, n, n)
  for (i in 1:n)
    for (j in 1:n)
      K[i, j] <- k(x[i], x[j])
  return(solve(K + lambda * diag(n)) %*% y)
}

例64

k.p <- function(x, y) {  # カーネル定義
  return((sum(x*y)+1)^3)
}

k.g <- function(x, y) {  # カーネル定義
  return(exp(-(x-y)^2/2))
}

lambda <- 0.1

# データ生成
n <- 50
x <- rnorm(n)
y <- 1 + x + x^2 + rnorm(n)

alpha.p <- alpha(k.p, x, y)
alpha.g <- alpha(k.g, x, y)
z <- sort(x)
u <- array(n)
v <- array(n)
for (j in 1:n) {
  S <- 0
  for (i in 1:n)
    S <- S + alpha.p[i] * k.p(x[i], z[j])
  u[j] <- S
  S <- 0
  for (i in 1:n)
    S <- S + alpha.g[i] * k.g(x[i], z[j])
  v[j] <- S
}
plot(z, u, type = "l", xlim = c(-1, 1), xlab = "x", ylab = "y",
     ylim = c(-1, 5), col = "red", main = "Kernel Ridge")
lines(z, v, col = "blue")
points(x, y)

4.2 カーネル主成分分析

kernel.pca.train <- function(x, k) {
  n <- nrow(x)
  K <- matrix(0, n, n)
  S <- rep(0, n)
  T <- rep(0, n)
  for (i in 1:n)
    for(j in 1:n)
      K[i, j] <- k(x[i, ], x[j, ])
  for (i in 1:n)
    S[i] <- sum(K[i, ])
  for (j in 1:n)
    T[j] <- sum(K[, j])
  U <- sum(K)
  for (i in 1:n)
    for (j in 1:n)
      K[i, j] <- K[i, j] - S[i]/n - T[j]/n + U/n^2
  res <- eigen(K)
  alpha <- matrix(0, n, n)
  for (i in 1:n)
    alpha[, i] <- res$vector[, i] / res$value[i] ^ 0.5
  return(alpha)
}

kernel.pca.test <- function(x, k, alpha, m, z) {
  n <- nrow(x)
  pca <- array(0, dim = m)
  for (i in 1:n)
    pca <- pca + alpha[i, 1:m] * k(x[i, ], z)
  return(pca) 
}

例65

# k <- function(x, y) {
#   return(sum(x*y))
# }

sigma.2 <- 0.01

k <- function(x, y) {
  return(exp(-norm(x-y, "2")^2 / 2 / sigma.2))
}

x <- as.matrix(USArrests)
n <- nrow(x)
p <- ncol(x)
alpha <- kernel.pca.train(x, k)
z <- array(dim = c(n, 2))
for (i in 1:n)
  z[i, ] <- kernel.pca.test(x, k, alpha, 2, x[i, ])
min.1 <- min(z[, 1])
min.2 <- min(z[, 2])
max.1 <- max(z[, 1])
max.2 <- max(z[, 2]) 
plot(0, xlim = c(min.1, max.1), ylim = c(min.2, max.2),
     xlab = "First", ylab = "Second", cex.lab = 0.75, cex.axis = 0.75,
     main = "Kernel PCA (Gauss 0.01)")
for (i in 1:n)
  if (i != 5)
    text(z[i, 1], z[i, 2], labels = i, cex = 0.5)
text(z[5, 1], z[5, 2], 5, col = "red")

# 通常の PCA の場合、スコアは下記 1 行で求めることができる。
z <- prcomp(x)$x[, 1:2]

4.3 カーネルSVM

例66

library(quadprog)

K.linear <- function(x, y) {
  return(t(x) %*% y)
}

K.poly <- function(x, y) {
  return((1 + t(x) %*% y)^2)
}

svm.2 <- function(X, y, C, K) {  # 関数名を svm.2 とした
  eps <- 0.0001
  n <- nrow(X)
  Dmat <- matrix(nrow = n, ncol = n)
  Kmat <- matrix(nrow = n, ncol = n)
  for (i in 1:n)
    for (j in 1:n) {
      Dmat[i, j] <- K(X[i, ], X[j, ]) * y[i] * y[j]
      Kmat <- K(X[i, ], X[j, ])
  }
  Dmat <- Dmat + eps * diag(n)
  dvec <- rep(1, n)
  Amat <- matrix(nrow = (2*n + 1), ncol = n)
  Amat[1, ] <- y
  Amat[2:(n + 1), 1:n] <- -diag(n)
  Amat[(n + 2):(2*n + 1), 1:n] <- diag(n)
  Amat <- t(Amat)
  bvec <- c(0, -C*rep(1, n), rep(0, n))
  meq <- 1
  alpha <- solve.QP(Dmat, dvec, Amat, bvec = bvec, meq = 1)$solution
  index <- (1:n)[0 < alpha & alpha < C]
  beta <- drop(Kmat %*% (alpha * y))
  beta.0 <- mean(y[index] - beta[index])
  return(list(alpha = alpha, beta.0 = beta.0))
}

# 関数の定義
plot.kernel <- function(K, lty) {  # 引数で、線の種類を指定する lty
  qq <- svm.2(X, y, 0.1, K)
  alpha <- qq$alpha
  beta.0 <- qq$beta.0
  
  f <- function(u, v) {
    x <- c(u, v)
    S <- beta.0
    for (i in 1:n)
      S <- S + alpha[i] * y[i] * K(X[i, ], x)
    return(S)
  }
  
  # z は、f(x, y) での方向の高さを求める。これから、輪郭を求めることができる。
  u <- seq(-2, 2, .1)
  v <- seq(-2, 2, .1)
  w <- array(dim = c(41, 41))
  for (i in 1:41)
    for (j in 1:41)
      w[i, j] <- f(u[i], v[j])
  contour(u, v, w, level = 0, add = TRUE, lty = lty)
}

# 実行
a <- rnorm(1)
b <- rnorm(1)
n <- 100
X <- matrix(rnorm(n * 2), ncol = 2, nrow = n)
y <- sign(a*X[, 1] + b*X[, 2] + 0.3*rnorm(n))
plot(-3:3, -3:3, xlab = "X[, 1]", ylab = "X[, 2]", type = "n")
for (i in 1:n) {
  if (y[i] == 1) {
    points(X[i, 1], X[i, 2], col = "red")
  } else {
    points(X[i, 1], X[i, 2], col = "blue")
  }
}
plot.kernel(K.linear, 1)
plot.kernel(K.poly, 2)

4.4 スプライン

# d, h で基底を求める関数を構成している
d <- function(j, x, knots) {
  K <- length(knots)
  return((max((x - knots[j])^3, 0) - max((x - knots[K])^3, 0)) /
         (knots[K] - knots[j]))
}

h <- function(j, x, knots) {
  K <- length(knots)
  if (j == 1) {
    return(1)
  } else if (j == 2) {
    return(x)
  } else {
    return(d(j-2, x, knots) - d(K-1, x, knots))
  }
}

# G は、2 回微分した関数を積分した値になっている。
G <- function(x) {  # x の各値が昇順になっていることを仮定している
  n <- length(x)
  g <- matrix(0, nrow = n, ncol = n)
  for (i in 3:(n-1)) {
    for (j in i:n) {
      g[i, j] = 12 * (x[n] - x[n-1]) * (x[n-1] - x[j-2]) * (x[n-1] - x[i-2]) /
        (x[n] - x[i-2]) / (x[n] - x[j-2]) +
        (12 * x[n-1] + 6 * x[j-2] - 18 * x[i-2]) *
        (x[n-1] - x[j-2])^2 / (x[n] - x[i-2]) / (x[n] - x[j-2])
      g[j, i] = g[i, j]
    }
  }
  return(g)
}

# メインの処理

# データ生成
n <- 100
x <- runif(n, -5, 5)
y <- x + sin(x)*2 + rnorm(n)

index <- order(x)
x <- x[index]
y <- y[index]
X <- matrix(nrow = n, ncol = n)
X[, 1] <- 1

# 行列 X の生成
for (j in 2:n)
  for (i in 1:n)
    X[i, j] <- h(j, x[i], x)

# 行列 G の生成
GG = G(x)

lambda.set <- c(1, 30, 80)
col.set <- c("red", "blue", "green")
for (i in 1:3) {
  lambda <- lambda.set[i]
  gamma <- solve(t(X) %*% X + lambda * GG) %*% t(X) %*% y

  g <- function(u) {
    S <- gamma[1]
    for (j in 2:n)
      S <- S + gamma[j] * h(j, u, x)
    return(S)
  }

  u.seq <- seq(-8, 8, 0.02)
  v.seq <- NULL
  for (u in u.seq)
    v.seq <- c(v.seq, g(u))
  plot(u.seq, v.seq, type = "l", yaxt = "n", xlab = "x", ylab = "g(x)",
       ylim = c(-8, 8), col = col.set[i])
  par(new = TRUE)
}
points(x, y)
legend("topleft", paste0("lambda = ", lambda.set), col = col.set, lty = 1)
title("平滑化スプライン (n = 100)")

4.5 Random Fourier Features

例68

sigma <- 10
sigma2 <- sigma^2

k <- function(x, y) {
  return(exp(-(x-y)^2 / 2 / sigma2))
}

z <- function(x) {
  return(sqrt(2/m) * cos(w*x + b))
}

zz <- function(x, y) {
  return(sum(z(x) * z(y)))
}

u <- matrix(0, 1000, 3)
m_seq <- c(20, 100, 400)
for (i in 1:1000) {
  x <- rnorm(1)
  y <- rnorm(1)
  for (j in 1:3) {
    m <- m_seq[j]
    w <- rnorm(m) / sigma
    b <- runif(m) * 2 * pi
    u[i, j] <- zz(x, y) - k(x, y)
  }
}
boxplot(u[, 1], u[, 2], u[, 3], ylab = "k(x, y)との差異",
        names = paste0("m = ", m_seq), col = c("red", "blue", "green"),
        main = "RFF によるカーネルの近似")

例69

sigma <- 10
sigma2 <- sigma^2

# 関数 z
m <- 20
w <- rnorm(m) / sigma
b <- runif(m) * 2 * pi

z <- function(u, m) {
  return(sqrt(2/m) * cos(w*u + b))
}

# ガウスカーネル
k.g <- function(x, y) {
  return(exp(-(x-y)^2 / 2 / sigma2))
}

# データ生成
n <- 200
x <- rnorm(n) / 2
y <- 1 + 5*sin(x/10) + 5*x^2 + rnorm(n)  
x.min <- min(x)
x.max <- max(x)
y.min <- min(y)
y.max <- max(y)
lambda <- 0.001  # lambda = 0.9 も

# 低ランク近似の関数
alpha.rff <- function(x, y, m) {
  n <- length(x)
  Z <- array(dim = c(n, m))
  for (i in 1:n)
    Z[i, ] <- z(x[i], m)
  beta <- solve(t(Z) %*% Z + lambda * diag(m)) %*% t(Z) %*% y
  return(as.vector(beta))
}

# 通常の関数
alpha <- function(k, x, y) {
  n <- length(x)
  K <- matrix(0, n, n)
  for (i in 1:n)
    for (j in 1:n)
      K[i, j] <- k(x[i], x[j])
  alpha <- solve(K + lambda * diag(n)) %*% y
  return(as.vector(alpha))
}

# 数値で比較する
alpha.hat <- alpha(k.g, x, y)
beta.hat <- alpha.rff(x, y, m)
r <- sort(x)
u <- array(n)
v <- array(n)
for (j in 1:n) {
  S <- 0
  for (i in 1:n)
    S <- S + alpha.hat[i] * k.g(x[i], r[j])
  u[j] <- S
  v[j] <- sum(beta.hat * z(r[j], m))
}
plot(r, u, type = "l", xlim = c(x.min, x.max), ylim = c(y.min, y.max),
     xlab = "x", ylab = "y", col = "red",
     main = "lambda = 10^(-4), m = 20, n = 200")
lines(r, v, col = "blue")
points(x, y)
legend("topleft", lwd = 1, c("近似なし", "近似あり"), col = c("red", "blue"))

4.6 Nystrom近似

例70

sigma2 <- 1

k.g <- function(x, y) {
  return(exp(-(x-y)^2 / 2 / sigma2))
}

# データ生成
n <- 300
x <- rnorm(n) / 2
y <- 3 - 2*x^2 + 3*x^3 + 2*rnorm(n)

lambda <- 10 ^ (-5)  # lambda = 0.9も
m <- 10

# 低ランク近似の関数
alpha.m <- function(k, x, y, m) {
  n <- length(x)
  K <- matrix(0, n, n)
  for (i in 1:n)
    for (j in 1:n)
      K[i, j] <- k(x[i], x[j])
  A <- svd(K[1:m, 1:m])
  u <- array(dim = c(n, m))
  for (i in 1:m)
    for (j in 1:n)
      u[j, i] <- sqrt(m/n) * sum(K[j, 1:m] * A$u[1:m, i]) / A$d[i]
  mu <- A$d * n / m
  R <- sqrt(mu[1]) * u[, 1]
  for (i in 2:m)
    R <- cbind(R, sqrt(mu[i]) * u[, i])
  alpha <- (diag(n) - R %*% solve(t(R) %*% R + lambda * diag(m)) %*% t(R)) %*% y / lambda
  return(as.vector(alpha))
}

# 通常の関数
alpha <- function(k, x, y) {
  n <- length(x)
  K <- matrix(0, n, n)
  for (i in 1:n)
    for (j in 1:n)
      K[i, j] <- k(x[i], x[j])
  alpha <- solve(K + lambda * diag(n)) %*% y
  return(as.vector(alpha))
}

# 数値で比較する
alpha.1 <- alpha(k.g, x, y)
alpha.2 <- alpha.m(k.g, x, y, m)
z <- sort(x)
u <- array(n)
v <- array(n)
for (j in 1:n) {
  S <- 0
  for (i in 1:n)
    S <- S + alpha.1[i] * k.g(x[i], z[j])
  u[j] <- S
  S <- 0
  for (i in 1:n)
    S <- S + alpha.2[i] * k.g(x[i], z[j])
  v[j] <- S
}
plot(z, u, type = "l", xlim = c(-1, 1), xlab = "x", ylab = "y", ylim = c(-1, 5),
     col = "red", main = "Kernel Ridge")
lines(z, v, col = "blue")
points(x, y)
legend("topleft", lwd = 1, c("近似なし", "近似あり"), col = c("red", "blue"))

4.7 不完全Cholesky分解

im.ch <- function(A, m = ncol(A)) {
  n <- ncol(A)
  R <- matrix(0, n, n)
  P <- diag(n)
  for (i in 1:n)
    R[i, i] <- sqrt(A[i, i])
  max.R <- 0
  for (i in 1:n) {
    if (R[i, i] > max.R) {
      k <- i
      max.R <- R[i, i]
    }
  }
  R[1, 1] <- max.R
  if (k != 1) {
    w <- A[, k]
    A[, k] <- A[, 1]
    A[, 1] <- w
    w <- A[k, ]
    A[k, ] <- A[1, ]
    A[1, ] <- w
    P[1, 1] <- 0
    P[k, k] <- 0
    P[1, k] <- 1
    P[k, 1] <- 1
  }
  for (i in 2:n)
    R[i, 1] <- A[i, 1] / R[1, 1]
  if (m > 1) {
    for (i in 2:m) {
      for (j in i:n)
        R[j, j] = sqrt(A[j, j] - sum(R[j, 1:(i-1)]^2))
      max.R <- 0
      for (j in i:n) {
        if (R[j, j] > max.R) {
          k <- j
          max.R <- R[j, j]
        }
      }
      R[i, i] <- max.R
      if (k != i) {
        w <- R[i, 1:(i-1)]
        R[i, 1:(i-1)] <- R[k, 1:(i-1)]
        R[k, 1:(i-1)] <- w
        w <- A[, k]
        A[, k] <- A[, i]
        A[, i] <- w
        w <- A[k, ]
        A[k, ] <- A[i, ]
        A[i, ] <- w
        Q <- diag(n)
        Q[i, i] <- 0
        Q[k, k] <- 0
        Q[i, k] <- 1
        Q[k, i] <- 1
        P <- P %*% Q
      }
      if (i < n) {
        for (j in (i+1):n) {
          R[j, i] <- (A[j, i] - sum(R[i, 1:(i-1)] * R[j, 1:(i-1)])) / R[i, i]
        }
      }
    }
  }
  if (m < n)
    for (i in (m+1):n)
      R[, i] <- 0
  return(list(P = P, R = R))
}

# データ生成
n <- 4
range <- -5:5
D <- matrix(sample(range, n*n, replace = TRUE), n, n)
A <- t(D) %*% D

# 実行例
im.ch(A)

## $P
##      [,1] [,2] [,3] [,4]
## [1,]    0    0    1    0
## [2,]    1    0    0    0
## [3,]    0    0    0    1
## [4,]    0    1    0    0
## 
## $R
##           [,1]       [,2]       [,3]     [,4]
## [1,]  7.549834  0.0000000  0.0000000 0.000000
## [2,]  2.251705  5.4708157  0.0000000 0.000000
## [3,] -6.490209  0.2950264  3.1289219 0.000000
## [4,] -3.576237 -0.1731677 -0.3705549 1.021386

L <- im.ch(A)$R
L %*% t(L)

##      [,1] [,2] [,3] [,4]
## [1,]   57   17  -49  -27
## [2,]   17   35  -13   -9
## [3,]  -49  -13   52   22
## [4,]  -27   -9   22   14

P <- im.ch(A)$P
t(P) %*% A %*% P

##      [,1] [,2] [,3] [,4]
## [1,]   57   17  -49  -27
## [2,]   17   35  -13   -9
## [3,]  -49  -13   52   22
## [4,]  -27   -9   22   14

P %*% (L %*% t(L)) %*% t(P)  # A に一致する

##      [,1] [,2] [,3] [,4]
## [1,]   52  -49   22  -13
## [2,]  -49   57  -27   17
## [3,]   22  -27   14   -9
## [4,]  -13   17   -9   35

im.ch(A, 2)

## $P
##      [,1] [,2] [,3] [,4]
## [1,]    0    0    0    1
## [2,]    1    0    0    0
## [3,]    0    0    1    0
## [4,]    0    1    0    0
## 
## $R
##           [,1]       [,2] [,3] [,4]
## [1,]  7.549834  0.0000000    0    0
## [2,]  2.251705  5.4708157    0    0
## [3,] -3.576237 -0.1731677    0    0
## [4,] -6.490209  0.2950264    0    0

L <- im.ch(A, 2)$R
L %*% t(L)

##      [,1] [,2]      [,3]      [,4]
## [1,]   57   17 -27.00000 -49.00000
## [2,]   17   35  -9.00000 -13.00000
## [3,]  -27   -9  12.81946  23.15944
## [4,]  -49  -13  23.15944  42.20985

P <- im.ch(A, 2)$P
t(P) %*% A %*% P

##      [,1] [,2] [,3] [,4]
## [1,]   57   17  -27  -49
## [2,]   17   35   -9  -13
## [3,]  -27   -9   14   22
## [4,]  -49  -13   22   52

P %*% (L %*% t(L)) %*% t(P)  # A の低ランク近似

##           [,1] [,2]      [,3] [,4]
## [1,]  42.20985  -49  23.15944  -13
## [2,] -49.00000   57 -27.00000   17
## [3,]  23.15944  -27  12.81946   -9
## [4,] -13.00000   17  -9.00000   35

第4章 カーネル計算の実際

4.1 カーネルRidge回帰

例63

例64

4.2 カーネル主成分分析

例65

4.3 カーネルSVM

例66

4.4 スプライン

4.5 Random Fourier Features

例68

例69

4.6 Nystrom近似

例70

4.7 不完全Cholesky分解

第4章カーネル計算の実際