第9章 教師なし学習(問題88~100)

88

k.means <- function(X, K, iteration = 20) {
  # K-means clustering: alternately recompute centers and reassign points.
  # X: n x p data matrix; K: number of clusters; iteration: fixed sweep count.
  # Returns list(clusters = cluster label per row of X,
  #              scores   = total within-cluster sum of squares per sweep).
  n <- nrow(X)
  p <- ncol(X)
  center <- array(dim = c(K, p))
  y <- sample(1:K, n, replace = TRUE)  # random initial cluster labels
  scores <- NULL
  for (h in 1:iteration) {
    # Update step: each center becomes the mean of its assigned points.
    for (k in 1:K) {
      if (sum(y[] == k) == 0) {
        center[k, ] <- Inf  # empty cluster: push its center out of reach
      } else {
        for (j in 1:p)
          center[k, j] <- mean(X[y == k, j])
      }
    }
    # Assignment step: move each point to its nearest center, and
    # accumulate the objective so the caller can watch convergence
    # (the original returned `scores` without ever computing it).
    S.total <- 0
    for (i in 1:n) {
      S.min <- Inf
      for (k in 1:K) {
        S <- sum((X[i, ] - center[k, ]) ^ 2)
        if (S < S.min) {
          S.min <- S
          y[i] <- k
        }
      }
      S.total <- S.total + S.min
    }
    scores <- c(scores, S.total)
  }
  return(list(clusters = y, scores = scores))
}

# Generate a 1000 x 2 standard-normal data matrix.
n <- 1000
p <- 2
X <- matrix(rnorm(p * n), nrow = n, ncol = p)

# Obtain a cluster label for every sample.
y <- k.means(X, 5)$clusters

# Draw the points, one color per cluster.
plot(-3:3, -3:3, xlab = "第1成分", ylab = "第2成分", type = "n")
points(X[, 1], X[, 2], col = y + 1)

92

hc <- function(X, dd = "complete") {
  # Agglomerative hierarchical clustering.
  # X: n x p data matrix; dd: linkage name ("complete", "single",
  # "centroid", "average"), dispatched to the matching dist.* helper
  # defined elsewhere in this file.
  # Returns a list where cluster[[K]] is the partition into K clusters
  # (each cluster a list of row indices of X).
  n <- nrow(X)
  index <- list()
  for (i in 1:n)
    index[[i]] <- list(i)  # start from n singleton clusters
  cluster <- list()
  for (k in n:2) {
    # Find the closest pair among the current k clusters.
    dist.min <- Inf
    for (i in 1:(k - 1)) {
      for (j in (i + 1):k) {
        i.0 <- unlist(index[[i]])
        j.0 <- unlist(index[[j]])
        # BUG FIX: switch on the linkage argument dd; the original
        # switched on `dist`, which is the base R distance *function*.
        d <- switch(dd,
                    "complete" = dist.complete(X[i.0, ], X[j.0, ]),
                    "single" = dist.single(X[i.0, ], X[j.0, ]),
                    "centroid" = dist.centroid(X[i.0, ], X[j.0, ]),
                    "average" = dist.average(X[i.0, ], X[j.0, ]))
        if (d < dist.min) {
          dist.min <- d
          i.1 <- i
          j.1 <- j
        }
      }
    }
    # Merge cluster j.1 into i.1, then shift the tail left to close
    # the gap left at position j.1.
    index[[i.1]] <- append(index[[i.1]], index[[j.1]])
    if (j.1 < k)
      for (h in (j.1 + 1):k)
        index[[h - 1]] <- index[[h]]
    index[[k]] <- NULL
    cluster[[k - 1]] <- index  # snapshot: k - 1 clusters remain
  }
  return(cluster)
}

# Generate a 200 x 2 standard-normal data matrix.
n <- 200
p <- 2
X <- matrix(rnorm(n * p), nrow = n, ncol = p)

# Full merge history; cluster[[K]] holds the K-cluster partition.
cluster <- hc(X)

# 2 x 2 panel: show the partition at K = 3, 5, 7, 9 clusters.
par(mfrow = c(2, 2))
for (K in c(3, 5, 7, 9)) {
  grp <- cluster[[K]]
  plot(-3:3, -3:3, xlab = "第1成分", ylab = "第2成分", type = "n",
       main = paste("K = ", K))
  for (k in 1:K) {
    z <- unlist(grp[[k]])
    x <- X[z, 1]
    y <- X[z, 2]
    points(x, y, col = k + 1)
  }
}

95

# Compare the hand-made pca() (defined elsewhere in this file) with
# base R's prcomp() on random data.
n <- 100
p <- 5
x <- matrix(rnorm(n * p), nrow = n, ncol = p)

# Our implementation.
pca(x)$lambdas   # component variances
pca(x)$vectors   # component directions
pca(x)$centers   # column means used for centering

# Reference implementation; fields should agree (directions up to sign).
pr <- prcomp(x)
pr$rotation
pr$sdev ^ 2
pr$center
names(pr)

96

# Generate correlated 2-D data with cor(x, y) = a = 0.7.
n <- 100
a <- 0.7
b <- sqrt(1 - a ^ 2)  # so that y has unit variance
u <- rnorm(n)
v <- rnorm(n)
x <- u
y <- u * a + v * b
plot(x, y)
plot(x, y, xlim = c(-4, 4), ylim = c(-4, 4))

# Principal-component directions are the columns of the rotation matrix.
# (Renamed from T: T masks the TRUE shorthand.)
rot <- prcomp(cbind(x, y))$rotation

# The two slopes multiply to -1: the components are orthogonal.
rot[2, 1] / rot[1, 1] * rot[2, 2] / rot[1, 2]

# Lines through the origin along the first (red) and second (blue)
# principal components; the data are approximately mean-zero.
abline(0, rot[2, 1] / rot[1, 1], col = "red")
abline(0, rot[2, 2] / rot[1, 2], col = "blue")

98

# Problem 98: PCA on the USArrests data (50 states x 4 crime variables).
# NOTE(review): USArrests ships with base R's datasets package; library(ISLR)
# follows the ISLR textbook setup but does not appear strictly necessary for
# this dataset — confirm before removing.
library(ISLR)

# scale = TRUE standardizes each variable to unit variance before the PCA,
# so no single variable dominates through its units.
pr.out <- prcomp(USArrests, scale = TRUE)

99

# Variance explained by each principal component (squared singular values).
pr.var <- pr.out$sdev ^ 2
# Proportion of variance explained: each component's share of the total.
pve <- pr.var / sum(pr.var)
# Left: per-component PVE; right: cumulative PVE (scree plots).
par(mfrow = c(1, 2))
plot(pve, xlab = "主成分", ylab = "寄与率", ylim = c(0, 1), type = "b")
plot(cumsum(pve), xlab = "主成分", ylab = "累積の寄与率",
     ylim = c(0, 1), type = "b")

100

pca.regression <- function(X, y, m) {
  # Principal-component regression: regress y on the first m PC scores,
  # then map the coefficients back onto the original predictors.
  # X: n x p predictor matrix (assumed column-centered),
  # y: centered response, m: number of components to keep (1 <= m <= p).
  # Returns list(theta = coefficients on the score matrix Z,
  #              beta  = implied coefficients on the columns of X).
  pr <- prcomp(X)
  # drop = FALSE keeps Z and phi as matrices even when m = 1.
  Z <- pr$x[, 1:m, drop = FALSE]           # n x m score matrix
  phi <- pr$rotation[, 1:m, drop = FALSE]  # p x m loading matrix
  # Least-squares fit of y on the scores.
  theta <- solve(t(Z) %*% Z) %*% t(Z) %*% y
  # beta was referenced in the return value but never computed originally.
  beta <- phi %*% theta
  return(list(theta = theta, beta = beta))
}
# Generate centered predictors and a centered response.
n <- 100
p <- 5
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
for (j in seq_len(p))
  X[, j] <- X[, j] - mean(X[, j])
y <- X[, 1] + X[, 2] + X[, 3] + X[, 4] + X[, 5] + rnorm(n)
y <- y - mean(y)

# Run: with all p components kept, PCR reproduces the ordinary
# least-squares fit, which the last line computes for comparison.
pca.regression(X, y, 3)
pca.regression(X, y, 5)$beta
solve(t(X) %*% X) %*% t(X) %*% y