88

k.means <- function(X, K, iteration = 20) {
  n <- nrow(X)
  p <- ncol(X)
  center <- array(dim = c(K, p))
  y <- sample(1:K, n, replace = TRUE)  # Clusters are stored in vector y
  for (h in 1:iteration) {
    for (k in 1:K) {
      if (sum(y[] == k) == 0) {
        center[k, ] <- Inf
      } else {
        for (j in 1:p)
          center[k, j] <- mean(X[y[]==k,j])
      }
    }
    for (i in 1:n) {
      S.min <- Inf
      for (k in 1:K) {
        S <- sum((X[i, ] - center[k, ]) ^ 2)
        if (S < S.min) {
          S.min <- S
          y[i] = k
        }
      }
    }
  }
  return(list(clusters = y))
}

# Data Generation
p <- 2
n <- 1000
X <- matrix(rnorm(p * n), nrow = n, ncol = p)

# Get the cluster for each sample
y <- k.means(X, 5)$clusters

# Draw points with different colors for each cluster
plot(-3:3, -3:3, xlab = "First Component", ylab = "Second Component", type = "n")
points(X[, 1], X[, 2], col = y + 1)

89

k.means <- function(X, K, iteration = 20) {
  n <- nrow(X)
  p <- ncol(X)
  center <- array(dim = c(K, p))
  y <- sample(1:K, n, replace = TRUE)
  scores <- NULL            #
  for (h in 1:iteration) {
    for (k in 1:K) {
      if (sum(y[] == k) == 0) {
        # sum(y[] == k) means the number of i for which y[i] = k.
        # Clusters without samples disappear.
        center[k, ] <- Inf
      } else {
        for (j in 1:p)
          center[k, j] <- mean(X[y[] == k, j])
      }
    }
    S.total <- 0                        #
    for (i in 1:n) {
      S.min <- Inf
      for (k in 1:K) {
        S <- sum((X[i, ] - center[k, ]) ^ 2)
        if (S < S.min) {
          S.min <- S
          y[i] <- k
        }
      }
      S.total <- S.total + S.min        #
    }
    scores <- c(scores, S.total)        #
  }
  return(list(clusters = y, scores = scores))
}

p <- 2
n <- 1000
X <- matrix(rnorm(p * n), nrow = n, ncol = p)

input <- 1:20
output <- k.means(X, 5)$scores
plot(input, log(output), ylim = c(6.2, 7.5),
     xlab = "Number of Iterations", ylab = "log(Score)",
     type = "l", col = 1, main = "Change of Score by Initial Value")
for (r in 2:10) {
  output <- k.means(X, 5)$scores
  lines(input, log(output), col = r)}

91

dist.complete <- function(x, y) {
  x <- as.matrix(x)
  y <- as.matrix(y)
  r <- nrow(x)
  s <- nrow(y)
  dist.max <- 0
  for (i in 1:r) {
    for (j in 1:s) {
      d <- norm(x[i, ] - y[j, ], "2")
      if (d > dist.max)
        dist.max <- d
    }
  }
  return(dist.max)
}

dist.single <- function(x, y) {
  x <- as.matrix(x)
  y <- as.matrix(y)
  r <- nrow(x)
  s <- nrow(y)
  dist.min <- Inf
  for (i in 1:r) {
    for (j in 1:s) {
      d <- norm(x[i, ] - y[j, ], "2")
      if (d < dist.min)
        dist.min <- d
    }
  }
  return(dist.min)
}

dist.centroid <- function(x, y) {
  x <- as.matrix(x)
  y <- as.matrix(y)
  r <- nrow(x)
  s <- nrow(y)
  x.bar <- 0
  for (i in 1:r)
    x.bar <- x.bar + x[i, ]
  x.bar <- x.bar/r
  y.bar <- 0
  for (i in 1:s)
    y.bar <- y.bar + y[i, ]
  y.bar <- y.bar / s
  return(norm(x.bar - y.bar, "2") ^ 2)
}

dist.average <- function(x, y) {
  x <- as.matrix(x)
  y <- as.matrix(y)
  r <- nrow(x)
  s <- nrow(y)
  S <- 0
  for (i in 1:r) {
    for (j in 1:s) {
      d <- norm(x[i, ] - y[j, ], "2")
      S <- S + d
    }
  }
  return(S / r / s)
}

92

hc <- function(X, dd = "complete") {
  n <- nrow(X)
  index <- list()
  for (i in 1:n)
    index[[i]] <- list(i)
  cluster <- list()
  for (k in n:2) {
    dist.min <- Inf
    for (i in 1:(k - 1)) {
      for (j in (i + 1):k) {
        i.0 <- unlist(index[[i]])
        j.0 <- unlist(index[[j]])
        d <- switch(dd,
                    "complete" = dist.complete(X[i.0, ], X[j.0, ]),
                    "single" = dist.single(X[i.0, ], X[j.0, ]),
                    "centroid" = dist.centroid(X[i.0, ], X[j.0, ]),
                    "average" = dist.average(X[i.0, ], X[j.0, ]))
        if (d < dist.min) {
          dist.min <- d
          i.1 <- i
          j.1 <- j
        }
      }
    }
    index[[i.1]] <- append(index[[i.1]], index[[j.1]])
    if (j.1 < k)
      for (h in (j.1 + 1):k)
        index[[h - 1]] <- index[[h]];
    index[[k]] <- NULL
    cluster[[k-1]] <- index
  }
  return(cluster)
}

# Data generation
n <- 200
p <- 2
X <- matrix(rnorm(n * p), ncol = p, nrow = n)

cluster <- hc(X)
par(mfrow = c(2, 2))
for (K in c(3, 5, 7, 9)) {
  grp <- cluster[[K]]
  plot(-3:3, -3:3, xlab = "First Component", ylab = "Second Component", type = "n",
       main = paste("K = ", K))
  for (k in 1:K) {
    z <- unlist(grp[[k]])
    x <- X[z, 1]
    y <- X[z, 2]
    points(x, y, col = k + 1)
  }
}

95

pca <- function(x) {
  n <- nrow(x)
  p <- ncol(x)
  center <- array(dim = p)
  for (j in 1:p)
    center[j] <- mean(x[, j])
  for (j in 1:p)
    x[, j] <- x[, j] - center[j]
  sigma <- t(x) %*% x / n
  lambda <- eigen(sigma)$values
  phi <- eigen(sigma)$vectors
  return(list(lambdas = lambda, vectors = phi, centers = center))
}

n <- 100
p <- 5
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
pca(x)$lambdas
## [1] 1.4412754 1.1654096 1.0815836 0.9172694 0.8015015
pca(x)$vectors
##             [,1]        [,2]        [,3]        [,4]        [,5]
## [1,]  0.37362104 -0.11094664 -0.01548308  0.91537382  0.09974573
## [2,] -0.04808686  0.77826104 -0.55002009  0.13380391 -0.26752909
## [3,] -0.92343481 -0.03707471  0.08246759  0.36600624  0.07163898
## [4,] -0.07098181 -0.56739319 -0.81833058 -0.05547607  0.01685282
## [5,]  0.01801953  0.24226230 -0.14412611 -0.08455251  0.95554320
pca(x)$centers
## [1] -0.14380981 -0.22338892  0.02329906 -0.07199732  0.12934625
prcomp(x)$rotation
##              PC1         PC2         PC3         PC4         PC5
## [1,] -0.37362104  0.11094664  0.01548308  0.91537382  0.09974573
## [2,]  0.04808686 -0.77826104  0.55002009  0.13380391 -0.26752909
## [3,]  0.92343481  0.03707471 -0.08246759  0.36600624  0.07163898
## [4,]  0.07098181  0.56739319  0.81833058 -0.05547607  0.01685282
## [5,] -0.01801953 -0.24226230  0.14412611 -0.08455251  0.95554320
(prcomp(x)$sdev) ^ 2
## [1] 1.4558337 1.1771814 1.0925087 0.9265347 0.8095975
prcomp(x)$center
## [1] -0.14380981 -0.22338892  0.02329906 -0.07199732  0.12934625
names(prcomp(x))
## [1] "sdev"     "rotation" "center"   "scale"    "x"
tmp <- x

96

# Data generation
n <- 100
a <- 0.7
b <- sqrt(1 - a ^ 2)
u <- rnorm(n)
v <- rnorm(n)
x <- u
y <- u * a + v * b
plot(x, y)
plot(x, y, xlim = c(-4, 4), ylim = c(-4, 4))
T <- prcomp(cbind(x, y))$rotation
T[2, 1] / T[1, 1] * T[2, 2] / T[1, 2]
## [1] -1
abline(0, T[2, 1] / T[1, 1], col = "red")
abline(0, T[2, 2] / T[1, 2], col = "blue")

98

library(ISLR)

pr.out <- prcomp(USArrests, scale = TRUE)

names(pr.out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
biplot(pr.out)
pr.out$x <- -pr.out$x
biplot(pr.out)

99

x <- tmp
pr.var <- (prcomp(x)$sdev) ^ 2
pve <- pr.var / sum(pr.var)
par(mfrow = c(1, 2))
plot(pve, xlab = "Principal Component", ylab = "Proportion of Variance Explained", ylim = c(0, 1) , type = "b")
plot(cumsum(pve), xlab = "Principal Component", ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

100

pca.regression <- function(X, y, m) {
  pr <- prcomp(X)
  Z <- pr$x[, 1:m]
  phi <- pr$rotation[, 1:m]
  theta <- solve(t(Z) %*% Z) %*% t(Z) %*% y
  beta <- phi %*% theta
  return(list(theta = theta, beta = beta))
}
# Data generation
n <- 100
p <- 5
X <- matrix(rnorm(n * p), nrow = n, ncol = p)
for (j in 1:p)
  X[, j] <- X[, j] - mean(X[, j])
y <- X[, 1] + X[, 2] + X[, 3] + X[, 4] + X[, 5] + rnorm(n)
y <- y - mean(y)

# Execution
pca.regression(X, y, 3)
## $theta
##             [,1]
## PC1  0.009617015
## PC2 -0.555852443
## PC3 -0.857067962
## 
## $beta
##             [,1]
## [1,] -0.34940275
## [2,]  0.02726620
## [3,]  0.72741034
## [4,] -0.09073358
## [5,]  0.61923001
pca.regression(X, y, 5)$beta
##           [,1]
## [1,] 1.1215221
## [2,] 1.0641805
## [3,] 1.1639081
## [4,] 0.9486829
## [5,] 1.0430943
solve(t(X) %*% X) %*% t(X) %*% y
##           [,1]
## [1,] 1.1215221
## [2,] 1.0641805
## [3,] 1.1639081
## [4,] 0.9486829
## [5,] 1.0430943