9.1 \(K\)-meansクラスタリング
k.means <- function(X, K, iteration = 20) {
n <- nrow(X)
p <- ncol(X)
center <- array(dim = c(K, p))
y <- sample(1:K, n, replace = TRUE)
scores <- NULL #
for (h in 1:iteration) {
for (k in 1:K) {
if (sum(y[] == k) == 0) {
# sum(y[] == k)でy[i] = kなるiの個数を意味する。
# サンプルを含まないクラスタは消滅
center[k, ] <- Inf
} else {
for (j in 1:p)
center[k, j] <- mean(X[y[] == k, j])
}
}
S.total <- 0 #
for (i in 1:n) {
S.min <- Inf
for (k in 1:K) {
S <- sum((X[i, ] - center[k, ]) ^ 2)
if (S < S.min) {
S.min <- S
y[i] <- k
}
}
S.total <- S.total + S.min #
}
scores <- c(scores, S.total) #
}
return(list(clusters = y, scores = scores))
}
例80
# データ生成
p <- 2
n <- 1000
X <- matrix(rnorm(p * n), nrow = n, ncol = p)
# 各サンプルのクラスタを得る
y <- k.means(X, 5)$clusters
# クラスタごとに色を変えて,点を描く
plot(-3:3, -3:3, xlab = "第1成分", ylab = "第2成分", type = "n")
points(X[, 1], X[, 2], col = y + 1)
例82
# データ生成
p <- 2
n <- 1000
X <- matrix(rnorm(p * n), nrow = n, ncol = p)
input <- 1:20
output <- k.means(X, 5)$scores
plot(input, log(output), ylim = c(6.2, 7.5),
xlab = "繰り返し回数", ylab = "log(スコア)",
type = "l", col = 1, main = "初期値ごとに,スコアの変化をみる")
for (r in 2:10) {
output <- k.means(X, 5)$scores
lines(input, log(output), col = r)}
9.2 階層的クラスタリング
dist.complete <- function(x, y) {
x <- as.matrix(x)
y <- as.matrix(y)
r <- nrow(x)
s <- nrow(y)
dist.max <- 0
for (i in 1:r) {
for (j in 1:s) {
d <- norm(x[i, ] - y[j, ], "2")
if (d > dist.max)
dist.max <- d
}
}
return(dist.max)
}
dist.single <- function(x, y) {
x <- as.matrix(x)
y <- as.matrix(y)
r <- nrow(x)
s <- nrow(y)
dist.min <- Inf
for (i in 1:r) {
for (j in 1:s) {
d <- norm(x[i, ] - y[j, ], "2")
if (d < dist.min)
dist.min <- d
}
}
return(dist.min)
}
dist.centroid <- function(x, y) {
x <- as.matrix(x)
y <- as.matrix(y)
r <- nrow(x)
s <- nrow(y)
x.bar <- 0
for (i in 1:r)
x.bar <- x.bar + x[i, ]
x.bar <- x.bar/r
y.bar <- 0
for (i in 1:s)
y.bar <- y.bar + y[i, ]
y.bar <- y.bar / s
return(norm(x.bar - y.bar, "2") ^ 2)
}
dist.average <- function(x, y) {
x <- as.matrix(x)
y <- as.matrix(y)
r <- nrow(x)
s <- nrow(y)
S <- 0
for (i in 1:r) {
for (j in 1:s) {
d <- norm(x[i, ] - y[j, ], "2")
S <- S + d
}
}
return(S / r / s)
}
hc <- function(X, dd = "complete") {
n <- nrow(X)
index <- list()
for (i in 1:n)
index[[i]] <- list(i)
cluster <- list()
for (k in n:2) {
dist.min <- Inf
for (i in 1:(k - 1)) {
for (j in (i + 1):k) {
i.0 <- unlist(index[[i]])
j.0 <- unlist(index[[j]])
d <- switch(dd,
"complete" = dist.complete(X[i.0, ], X[j.0, ]),
"single" = dist.single(X[i.0, ], X[j.0, ]),
"centroid" = dist.centroid(X[i.0, ], X[j.0, ]),
"average" = dist.average(X[i.0, ], X[j.0, ]))
if (d < dist.min) {
dist.min <- d
i.1 <- i
j.1 <- j
}
}
}
index[[i.1]] <- append(index[[i.1]], index[[j.1]])
if (j.1 < k)
for (h in (j.1 + 1):k)
index[[h - 1]] <- index[[h]]
index[[k]]=NULL
cluster[[k-1]]=index
}
return(cluster)
}
例83
# データ生成
n <- 200
p <- 2
X <- matrix(rnorm(n * p), ncol = p, nrow = n)
cluster <- hc(X)
par(mfrow = c(2, 2))
for (K in c(3, 5, 7, 9)) {
grp <- cluster[[K]]
plot(-3:3, -3:3, xlab = "第1成分", ylab = "第2成分",
type = "n", main = paste("K = ", K))
for (k in 1:K) {
z <- unlist(grp[[k]])
x <- X[z, 1]
y <- X[z, 2]
points(x, y, col = k + 1)
}
}
# データ生成
n <- 100
p <- 2
K <- 7
X <- matrix(rnorm(n * p), ncol = p, nrow = n)
for (d in c("complete", "single", "centroid", "average")) {
cluster <- hc(X, dd = d)
grp <- cluster[[K]]
plot(-3:3, -3:3, xlab = "第1成分", ylab = "第2成分", type = "n", main = d)
for (k in 1:K) {
z <- unlist(grp[[k]])
x <- X[z, 1]
y <- X[z, 2]
points(x, y, col = k + 1)
}
}
par(mfrow = c(1, 1))
例85
n <- 100
x <- matrix(rnorm(n * 2), ncol = 2)
par(mfrow = c(2, 2))
hc.complete <- hclust(dist(x), method = "complete")
plot(hc.complete)
hc.single <- hclust(dist(x), method = "single")
plot(hc.single)
hc.centroid <- hclust(dist(x), method = "centroid")
plot(hc.centroid)
hc.average <- hclust(dist(x), method = "average")
plot(hc.average)
9.3 主成分分析
pca <- function(x) {
n <- nrow(x)
p <- ncol(x)
center <- array(dim = p)
for (j in 1:p)
center[j] <- mean(x[, j])
for (j in 1:p)
x[, j] <- x[, j] - center[j]
sigma <- t(x) %*% x / n
lambda <- eigen(sigma)$values
phi <- eigen(sigma)$vectors
return(list(lambdas = lambda, vectors = phi, centers = center))
}
例86
n <- 100
p <- 5
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
pca(x)$lambdas
## [1] 1.2667853 1.0748733 1.0037504 0.9334545 0.5990494
pca(x)$vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.4385672 0.6905285 0.4430918 -0.01409345 -0.3664699
## [2,] -0.3564274 0.5700479 -0.6670760 -0.28478816 -0.1480210
## [3,] -0.4201573 -0.2820769 0.1556820 0.03707042 -0.8475195
## [4,] 0.4805386 -0.3225419 -0.1579887 -0.77621617 -0.1938491
## [5,] 0.5226556 -0.1208929 -0.5563110 0.56108049 -0.2965178
pca(x)$centers
## [1] 0.05804285 0.12217964 0.11283619 0.09725292 0.14572367
prcomp(x)$rotation
## PC1 PC2 PC3 PC4 PC5
## [1,] -0.4385672 -0.6905285 -0.4430918 -0.01409345 -0.3664699
## [2,] 0.3564274 -0.5700479 0.6670760 -0.28478816 -0.1480210
## [3,] 0.4201573 0.2820769 -0.1556820 0.03707042 -0.8475195
## [4,] -0.4805386 0.3225419 0.1579887 -0.77621617 -0.1938491
## [5,] -0.5226556 0.1208929 0.5563110 0.56108049 -0.2965178
(prcomp(x)$sdev) ^ 2
## [1] 1.2795812 1.0857306 1.0138893 0.9428834 0.6051005
prcomp(x)$center
## [1] 0.05804285 0.12217964 0.11283619 0.09725292 0.14572367
names(prcomp(x))
## [1] "sdev" "rotation" "center" "scale" "x"
pr.var <- (prcomp(x)$sdev) ^ 2
pve <- pr.var / sum(pr.var)
par(mfrow = c(1, 2))
plot(pve, xlab = "主成分", ylab = "寄与率", ylim = c(0, 1), type = "b")
plot(cumsum(pve), xlab = "主成分", ylab = "累積の寄与率",
ylim = c(0, 1), type = "b")
例87
n <- 100
a <- 0.7
b <- sqrt(1 - a ^ 2)
u <- rnorm(n)
v <- rnorm(n)
x <- u
y <- u * a + v * b
plot(x, y)
plot(x, y, xlim = c(-4, 4), ylim = c(-4, 4))
T <- prcomp(cbind(x, y))$rotation
T[2, 1] / T[1, 1] * T[2, 2] / T[1, 2]
## [1] -1
abline(0, T[2, 1] / T[1, 1], col = "red")
abline(0, T[2, 2] / T[1, 2], col = "blue")
例88
pr.out <- prcomp(USArrests, scale = TRUE)
biplot(pr.out)
pr.out$x <- -pr.out$x
pr.out$rotation <- -pr.out$rotation
biplot(pr.out)
例89
library(MASS)
z <- as.matrix(Boston)
y <- k.means(z,5)$clusters
w <- prcomp(z)$x[, 1:2]
plot(w, col = y + 1, xlab = "第1主成分", ylab = "第2主成分",
main = "Bostonデータのクラスタリング")
pca.regression <- function(X, y, m) {
pr <- prcomp(X)
Z <- pr$x[, 1:m]
phi <- pr$rotation[, 1:m]
theta <- solve(t(Z) %*% Z) %*% t(Z) %*% y
beta <- phi %*% theta
return(list(theta = theta, beta = beta))
}
例90
# データ生成
n <- 100
p <- 5
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
for (j in 1:p)
X[, j] <- X[, j] - mean(X[, j])
y <- X[, 1] + X[, 2] + X[, 3] + X[, 4] + X[, 5] + rnorm(n)
y <- y - mean(y)
# 実行
pca.regression(X, y, 3)
## $theta
## [,1]
## PC1 0.5181417
## PC2 -1.1049787
## PC3 1.1818900
##
## $beta
## [,1]
## [1,] 1.2962792
## [2,] -0.1341256
## [3,] 0.1453994
## [4,] 0.8155858
## [5,] 0.7082811
pca.regression(X, y, 5)$beta
## [,1]
## [1,] 1.1840991
## [2,] 1.0791514
## [3,] 0.9567213
## [4,] 0.9105458
## [5,] 0.8674473
solve(t(X) %*% X) %*% t(X) %*% y
## [,1]
## [1,] 1.1840991
## [2,] 1.0791514
## [3,] 0.9567213
## [4,] 0.9105458
## [5,] 0.8674473
付録 プログラム
hc.dendroidgram <- function(cluster, dd = "complete") {
index <- unlist(cluster[[1]])
y <- unlist(index)
n <- length(y)
height <- array(dim = n)
z <- matrix(0, ncol = 5, nrow = n)
for (i in 1:n)
index[[i]] <- list(y[i])
for (i in 1:n)
height[i] <- 0
for (k in n:2) {
dist.min <- Inf
for (i in 1:(k - 1)) {
i.0 <- unlist(index[[i]])
j.0 <- unlist(index[[i + 1]])
d <- switch(dd,
"complete" = dist.complete(X[i.0, ], X[j.0, ]),
"single" = dist.single(X[i.0, ], X[j.0, ]),
"centroid" = dist.centroid(X[i.0, ], X[j.0, ]),
"average" = dist.average(X[i.0, ], X[j.0, ]))
if (d < dist.min) {
dist.min <- d
i.1 <- i
j.1 <- i + 1
}
}
i <- 0
for (h in 1:i.1)
i <- i + length(index[[h]])
j <- i + length(index[[j.1]])
z[k, 1] <- i - length(index[[i.1]]) / 2 + 0.5
z[k, 2] <- j - length(index[[j.1]]) / 2 + 0.5
z[k, 3] <- height[i.1]
z[k, 4] <- height[j.1]
z[k, 5] <- dist.min
index[[i.1]] <- append(index[[i.1]], index[[j.1]])
if (j.1 < k) {
for (h in (j.1 + 1):k) {
index[[h - 1]] <- index[[h]]
height[h - 1] <- height[h]
}
}
index[[k]] <- NULL
height[i.1] <- dist.min
}
plot(1:n, 1:n, ylim = c(0, 100), xlab = "", ylab = "", type = "n",
xaxt = "n", yaxt = "n", bty = "n", main = dd)
r <- z[2, 5] / 100
for (k in n:2)
z[k, 3:5] <- z[k, 3:5] / r
for (k in n:2) {
segments(z[k, 1], z[k, 3], z[k, 1], z[k, 5])
segments(z[k, 1], z[k, 5], z[k, 2], z[k, 5])
segments(z[k, 2], z[k, 5], z[k, 2], z[k, 4])
}
for (i in 1:n)
text(i, 0, y[i])
}
# データ生成
n <- 30
p <- 3
X <- matrix(rnorm(n * p), ncol = p, nrow = n)
par(mfrow = c(2, 2))
for (d in c("complete", "single", "centroid", "average")) {
cluster <- hc(X, dd = d)
hc.dendroidgram(cluster, dd = d)
}