88
k.means <- function(X, K, iteration = 20) {
n <- nrow(X)
p <- ncol(X)
center <- array(dim = c(K, p))
y <- sample(1:K, n, replace = TRUE) # Clusters are stored in vector y
for (h in 1:iteration) {
for (k in 1:K) {
if (sum(y[] == k) == 0) {
center[k, ] <- Inf
} else {
for (j in 1:p)
center[k, j] <- mean(X[y[]==k,j])
}
}
for (i in 1:n) {
S.min <- Inf
for (k in 1:K) {
S <- sum((X[i, ] - center[k, ]) ^ 2)
if (S < S.min) {
S.min <- S
y[i] = k
}
}
}
}
return(list(clusters = y))
}
# Data Generation
p <- 2
n <- 1000
X <- matrix(rnorm(p * n), nrow = n, ncol = p)
# Get the cluster for each sample
y <- k.means(X, 5)$clusters
# Draw points with different colors for each cluster
plot(-3:3, -3:3, xlab = "First Component", ylab = "Second Component", type = "n")
points(X[, 1], X[, 2], col = y + 1)
89
k.means <- function(X, K, iteration = 20) {
n <- nrow(X)
p <- ncol(X)
center <- array(dim = c(K, p))
y <- sample(1:K, n, replace = TRUE)
scores <- NULL #
for (h in 1:iteration) {
for (k in 1:K) {
if (sum(y[] == k) == 0) {
# sum(y[] == k) means the number of i for which y[i] = k.
# Clusters without samples disappear.
center[k, ] <- Inf
} else {
for (j in 1:p)
center[k, j] <- mean(X[y[] == k, j])
}
}
S.total <- 0 #
for (i in 1:n) {
S.min <- Inf
for (k in 1:K) {
S <- sum((X[i, ] - center[k, ]) ^ 2)
if (S < S.min) {
S.min <- S
y[i] <- k
}
}
S.total <- S.total + S.min #
}
scores <- c(scores, S.total) #
}
return(list(clusters = y, scores = scores))
}
p <- 2
n <- 1000
X <- matrix(rnorm(p * n), nrow = n, ncol = p)
input <- 1:20
output <- k.means(X, 5)$scores
plot(input, log(output), ylim = c(6.2, 7.5),
xlab = "Number of Iterations", ylab = "log(Score)",
type = "l", col = 1, main = "Change of Score by Initial Value")
for (r in 2:10) {
output <- k.means(X, 5)$scores
lines(input, log(output), col = r)}
91
dist.complete <- function(x, y) {
x <- as.matrix(x)
y <- as.matrix(y)
r <- nrow(x)
s <- nrow(y)
dist.max <- 0
for (i in 1:r) {
for (j in 1:s) {
d <- norm(x[i, ] - y[j, ], "2")
if (d > dist.max)
dist.max <- d
}
}
return(dist.max)
}
dist.single <- function(x, y) {
x <- as.matrix(x)
y <- as.matrix(y)
r <- nrow(x)
s <- nrow(y)
dist.min <- Inf
for (i in 1:r) {
for (j in 1:s) {
d <- norm(x[i, ] - y[j, ], "2")
if (d < dist.min)
dist.min <- d
}
}
return(dist.min)
}
dist.centroid <- function(x, y) {
x <- as.matrix(x)
y <- as.matrix(y)
r <- nrow(x)
s <- nrow(y)
x.bar <- 0
for (i in 1:r)
x.bar <- x.bar + x[i, ]
x.bar <- x.bar/r
y.bar <- 0
for (i in 1:s)
y.bar <- y.bar + y[i, ]
y.bar <- y.bar / s
return(norm(x.bar - y.bar, "2") ^ 2)
}
dist.average <- function(x, y) {
x <- as.matrix(x)
y <- as.matrix(y)
r <- nrow(x)
s <- nrow(y)
S <- 0
for (i in 1:r) {
for (j in 1:s) {
d <- norm(x[i, ] - y[j, ], "2")
S <- S + d
}
}
return(S / r / s)
}
92
hc <- function(X, dd = "complete") {
n <- nrow(X)
index <- list()
for (i in 1:n)
index[[i]] <- list(i)
cluster <- list()
for (k in n:2) {
dist.min <- Inf
for (i in 1:(k - 1)) {
for (j in (i + 1):k) {
i.0 <- unlist(index[[i]])
j.0 <- unlist(index[[j]])
d <- switch(dd,
"complete" = dist.complete(X[i.0, ], X[j.0, ]),
"single" = dist.single(X[i.0, ], X[j.0, ]),
"centroid" = dist.centroid(X[i.0, ], X[j.0, ]),
"average" = dist.average(X[i.0, ], X[j.0, ]))
if (d < dist.min) {
dist.min <- d
i.1 <- i
j.1 <- j
}
}
}
index[[i.1]] <- append(index[[i.1]], index[[j.1]])
if (j.1 < k)
for (h in (j.1 + 1):k)
index[[h - 1]] <- index[[h]];
index[[k]] <- NULL
cluster[[k-1]] <- index
}
return(cluster)
}
# Data generation
n <- 200
p <- 2
X <- matrix(rnorm(n * p), ncol = p, nrow = n)
cluster <- hc(X)
par(mfrow = c(2, 2))
for (K in c(3, 5, 7, 9)) {
grp <- cluster[[K]]
plot(-3:3, -3:3, xlab = "First Component", ylab = "Second Component", type = "n",
main = paste("K = ", K))
for (k in 1:K) {
z <- unlist(grp[[k]])
x <- X[z, 1]
y <- X[z, 2]
points(x, y, col = k + 1)
}
}
95
pca <- function(x) {
n <- nrow(x)
p <- ncol(x)
center <- array(dim = p)
for (j in 1:p)
center[j] <- mean(x[, j])
for (j in 1:p)
x[, j] <- x[, j] - center[j]
sigma <- t(x) %*% x / n
lambda <- eigen(sigma)$values
phi <- eigen(sigma)$vectors
return(list(lambdas = lambda, vectors = phi, centers = center))
}
n <- 100
p <- 5
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
pca(x)$lambdas
## [1] 1.4412754 1.1654096 1.0815836 0.9172694 0.8015015
pca(x)$vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.37362104 -0.11094664 -0.01548308 0.91537382 0.09974573
## [2,] -0.04808686 0.77826104 -0.55002009 0.13380391 -0.26752909
## [3,] -0.92343481 -0.03707471 0.08246759 0.36600624 0.07163898
## [4,] -0.07098181 -0.56739319 -0.81833058 -0.05547607 0.01685282
## [5,] 0.01801953 0.24226230 -0.14412611 -0.08455251 0.95554320
pca(x)$centers
## [1] -0.14380981 -0.22338892 0.02329906 -0.07199732 0.12934625
prcomp(x)$rotation
## PC1 PC2 PC3 PC4 PC5
## [1,] -0.37362104 0.11094664 0.01548308 0.91537382 0.09974573
## [2,] 0.04808686 -0.77826104 0.55002009 0.13380391 -0.26752909
## [3,] 0.92343481 0.03707471 -0.08246759 0.36600624 0.07163898
## [4,] 0.07098181 0.56739319 0.81833058 -0.05547607 0.01685282
## [5,] -0.01801953 -0.24226230 0.14412611 -0.08455251 0.95554320
(prcomp(x)$sdev) ^ 2
## [1] 1.4558337 1.1771814 1.0925087 0.9265347 0.8095975
prcomp(x)$center
## [1] -0.14380981 -0.22338892 0.02329906 -0.07199732 0.12934625
names(prcomp(x))
## [1] "sdev" "rotation" "center" "scale" "x"
tmp <- x
96
# Data generation
n <- 100
a <- 0.7
b <- sqrt(1 - a ^ 2)
u <- rnorm(n)
v <- rnorm(n)
x <- u
y <- u * a + v * b
plot(x, y)
plot(x, y, xlim = c(-4, 4), ylim = c(-4, 4))
T <- prcomp(cbind(x, y))$rotation
T[2, 1] / T[1, 1] * T[2, 2] / T[1, 2]
## [1] -1
abline(0, T[2, 1] / T[1, 1], col = "red")
abline(0, T[2, 2] / T[1, 2], col = "blue")
98
library(ISLR)
pr.out <- prcomp(USArrests, scale = TRUE)
names(pr.out)
## [1] "sdev" "rotation" "center" "scale" "x"
biplot(pr.out)
pr.out$x <- -pr.out$x
biplot(pr.out)
99
x <- tmp
pr.var <- (prcomp(x)$sdev) ^ 2
pve <- pr.var / sum(pr.var)
par(mfrow = c(1, 2))
plot(pve, xlab = "Principal Component", ylab = "Proportion of Variance Explained", ylim = c(0, 1) , type = "b")
plot(cumsum(pve), xlab = "Principal Component", ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
100
pca.regression <- function(X, y, m) {
pr <- prcomp(X)
Z <- pr$x[, 1:m]
phi <- pr$rotation[, 1:m]
theta <- solve(t(Z) %*% Z) %*% t(Z) %*% y
beta <- phi %*% theta
return(list(theta = theta, beta = beta))
}
# Data generation
n <- 100
p <- 5
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
for (j in 1:p)
X[, j] <- X[, j] - mean(X[, j])
y <- X[, 1] + X[, 2] + X[, 3] + X[, 4] + X[, 5] + rnorm(n)
y <- y - mean(y)
# Execution
pca.regression(X, y, 3)
## $theta
## [,1]
## PC1 0.009617015
## PC2 -0.555852443
## PC3 -0.857067962
##
## $beta
## [,1]
## [1,] -0.34940275
## [2,] 0.02726620
## [3,] 0.72741034
## [4,] -0.09073358
## [5,] 0.61923001
pca.regression(X, y, 5)$beta
## [,1]
## [1,] 1.1215221
## [2,] 1.0641805
## [3,] 1.1639081
## [4,] 0.9486829
## [5,] 1.0430943
solve(t(X) %*% X) %*% t(X) %*% y
## [,1]
## [1,] 1.1215221
## [2,] 1.0641805
## [3,] 1.1639081
## [4,] 0.9486829
## [5,] 1.0430943