69
# Most frequent value of y, returned as a character string: the name of
# the largest cell of table(y), with ties going to the first level.
# NOTE: shadows base::mode(), as in the original.
mode <- function(y) {
  counts <- table(y)
  names(counts)[which.max(counts)]
}
# Squared-loss impurity: residual sum of squares of y around its mean.
# Evaluates to 0 for an empty vector (sum over zero terms).
sq.loss <- function(y) {
  sum((y - mean(y))^2)
}
# Misclassification impurity: number of elements of y that disagree
# with the majority label (as computed by the sibling mode() above).
mis.match <- function(y) {
  majority <- mode(y)
  sum(y != majority)
}
70
# Find the best axis-aligned binary split of the rows in S.
# x: feature matrix; y: response vector; f: impurity function (smaller
#    is better) that must accept an empty vector, since the first
#    candidate may put every row on one side.
# S: row indices to split; m: number of candidate features, sampled at
#    random when m < ncol(x) (default: all features).
#    BUG FIX: m was accepted but ignored; it now subsamples features,
#    matching the random-forest variant of branch() used later.
# Each candidate threshold is x[i, j] for i in S: rows with
# x[k, j] < x[i, j] go left, the rest go right.
# Returns NULL for empty S; otherwise a list with the pivot row i,
# feature j, left/right index vectors, and their impurity scores.
branch <- function(x, y, f, S, m = ncol(x)) {
  n <- length(S)
  p <- ncol(x)
  if (n == 0)
    return(NULL)
  # Random-forest style feature subsampling.
  if (m < p) {
    features <- sample(1:p, m, replace = FALSE)
  } else {
    features <- 1:p
  }
  best.score <- Inf
  info <- NULL  # BUG FIX: stays NULL (instead of an undefined-variable
                # error) if every candidate scores Inf/NaN
  for (j in features) {
    col <- x[S, j]
    for (i in S) {
      # Vectorized split (replaces the O(|S|) inner k-loop that grew
      # left/right with c()); preserves the original ordering of S.
      th <- x[i, j]
      left <- S[col < th]
      right <- S[col >= th]
      L <- f(y[left])
      R <- f(y[right])
      score <- L + R
      if (score < best.score) {
        best.score <- score
        info <- list(i = i, j = j, left = left, right = right,
                     score = best.score, left.score = L, right.score = R)
      }
    }
  }
  return(info)
}
# Demo: best squared-loss split of 10 random rows of a 100 x 5 Gaussian
# design (the ## block below records one observed run).
n <- 100
p <- 5
x <- matrix(rnorm(n * p), nrow = n, ncol = p)
y <- rnorm(n)
S <- sample(1:n, 10, replace = FALSE)
branch(x, y, sq.loss, S, m = ncol(x))
## $i
## [1] 89
##
## $j
## [1] 5
##
## $left
## [1] 63 77 52 50 44 73
##
## $right
## [1] 37 89 26 84
##
## $score
## [1] 6.390496
##
## $left.score
## [1] 5.994746
##
## $right.score
## [1] 0.3957497
71
# Grow a decision tree on (x, y) by greedy top-down splitting.
# f selects the impurity criterion by name: "sq.loss" (regression),
# "mis.match", "gini", or "entropy" (classification).
# NOTE(review): gini/entropy are not defined in this file -- they must
# be supplied elsewhere before calling with those names; verify.
# alpha: minimum impurity decrease required to accept a split.
# n.min: nodes smaller than this are made terminal.
# m: number of candidate features per split (forwarded to branch()).
# Returns a list of vertices; each has $parent, $set, $left/$right
# child ids (0 for terminals) and either $j/$th (internal split) or
# $j == 0 with $center (terminal prediction).
dt <- function(x, y, f = "sq.loss", alpha = 0, n.min = 1, m = ncol(x)) {
  # Map the criterion name to the impurity function used while growing.
  if (f == "sq.loss") {
    g <- sq.loss
  } else if (f == "mis.match") {
    g <- mis.match
  } else if (f == "gini") {
    g <- gini
  } else {
    g <- entropy
  }
  n <- length(y)
  # Depth-first growth with an explicit stack; the root holds all rows.
  stack <- list()
  stack[[1]] <- list(parent = 0, set = 1:n, score = g(y))
  vertex <- list()
  k <- 0
  while (length(stack) > 0) {
    r <- length(stack)
    node <- stack[[r]]
    stack <- stack[-r] # POP
    k <- k + 1 # id of the PUSHED node = k is assigned only after being POPPED
    res <- branch(x, y, g, node$set, m)
    # Make a terminal (j = 0) when the impurity gain is below alpha,
    # the node is too small, or the best split leaves one side empty.
    if (node$score - res$score < alpha || length(node$set) < n.min ||
        length(res$left) == 0 || length(res$right) == 0) {
      vertex[[k]] <- list(parent = node$parent, j = 0, set = node$set)
    } else {
      vertex[[k]] <- list(parent = node$parent, set = node$set,
                          th = x[res$i, res$j], j = res$j)
      # Push right below left so the left child is popped -- and hence
      # numbered -- first; every left subtree gets smaller ids.
      stack[[r]] <- list(parent = k, set = res$right,
                         score = res$right.score) # PUSH
      stack[[r + 1]] <- list(parent = k, set = res$left,
                             score = res$left.score) # PUSH
    }
  }
  # Majority label for classification leaves (shadows top-level mode()).
  mode <- function(y) {
    return(names(sort(table(y), decreasing = TRUE))[1])
  }
  # Wire up child pointers; terminals keep left = right = 0.
  r <- length(vertex)
  for (h in 1:r) {
    vertex[[h]]$left <- 0
    vertex[[h]]$right <- 0
  }
  # Visiting ids from high to low, the first child seen for a parent is
  # the one with the larger id, i.e. its right child.
  # BUG FIX: when the tree is a single terminal node (r == 1), r:2
  # counts upward as c(1, 2) and indexed past the end of vertex; guard.
  if (r >= 2) {
    for (h in r:2) {
      pa <- vertex[[h]]$parent
      if (vertex[[pa]]$right == 0) {
        vertex[[pa]]$right <- h
      } else {
        vertex[[pa]]$left <- h
      }
    }
  }
  # Assign the predicted value (center) to each terminal node:
  # mean for regression, majority class otherwise.
  if (f == "sq.loss") {
    g <- mean
  } else {
    g <- mode
  }
  for (h in 1:r)
    if (vertex[[h]]$j == 0)
      vertex[[h]]$center <- g(y[vertex[[h]]$set])
  return(vertex)
}
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Fit a classification tree to iris (misclassification impurity,
# minimum node size 5) and draw it with igraph.
x=as.matrix(iris[,1:4]); y= as.vector(iris[,5])
vertex = dt(x,y,"mis.match", n.min = 5)
r <- length(vertex)
col <- array(dim = r)
edge.list <- matrix(nrow = r, ncol = 2)
for (h in 1:r)
col[h] <- vertex[[h]]$j # Identify the variable used for branching of the vertex by color
for (h in 1:r)
edge.list[h, ] <- c(vertex[[h]]$parent, h)
edge.list <- edge.list[-1, ] # Store the set of edges in edge.list; drop row 1 (root's dummy parent 0)
g <- graph_from_edgelist(edge.list)
V(g)$color <- col
plot(g, layout = layout.reingold.tilford(g, root = 1))
72
library(MASS)
library(igraph)
# Predict for a single observation u by walking the tree from the root:
# at each internal node compare u[j] against the threshold th and
# descend left/right; return the center of the terminal reached.
value <- function(u, vertex) {
  node <- 1
  repeat {
    v <- vertex[[node]]
    if (v$j == 0)
      return(v$center)
    node <- if (u[v$j] < v$th) v$left else v$right
  }
}
# Select the pruning parameter alpha by 10-fold cross-validation on the
# first 100 rows of the Boston housing data (13 features, target medv).
df <- Boston[1:100, ]
n <- nrow(df)
p <- ncol(df)
x <- as.matrix(df[, 1:13])
y <- as.vector(df[, 14])
alpha.seq <- seq(0, 1.5, 0.1)
s <- floor(n / 10) # fold size
m <- length(vertex) # NOTE(review): reads the iris tree from above and is never used below -- apparently leftover
out <- NULL
for (alpha in alpha.seq) {
SS <- 0
for (h in 1:10) { # 10-fold CV
test <- (h * s - s + 1):(h * s)
train <- setdiff(1:n, test)
vertex <- dt(x[train, ], y[train], alpha = alpha)
for (t in test)
SS <- SS + (y[t] - value(x[t, ], vertex)) ^ 2
}
out <- c(out, SS / 100) # mean squared error over all n = 100 held-out rows
}
plot(alpha.seq, out, type = "l", ylim = c(10.5, 12),
xlab = "alpha", ylab = "Squared Error",
main = "Optimal alpha by CV (Boston dataset, N = 100)")
73
# Random-forest variant of branch(): best axis-aligned split of the
# rows in S, considering only m randomly sampled features (default: all).
# x: feature matrix; y: response; f: impurity function (smaller is
#    better), must accept an empty vector; S: row indices to split.
# Rows with x[k, j] < x[i, j] go left, the rest right, for each
# candidate pivot i in S and feature j.
# Returns NULL for empty S; otherwise a list with pivot row i, feature
# j, left/right index vectors, and their impurity scores.
branch <- function(x, y, f, S, m = ncol(x)) { # Setting the value of m, default is p
  n <- length(S)
  p <- ncol(x)
  if (n == 0)
    return(NULL)
  # FIX: the candidate-feature set was stored in `T`, shadowing the
  # built-in TRUE alias; renamed.
  if (m < p) {
    features <- sample(1:p, m, replace = FALSE)
  } else {
    features <- 1:p
  }
  best.score <- Inf
  info <- NULL  # FIX: stays NULL (instead of an undefined-variable
                # error) if every candidate scores Inf/NaN
  for (j in features) { # Choose the best variable among the sampled features
    col <- x[S, j]
    for (i in S) {
      # Vectorized split (replaces the O(|S|) inner k-loop that grew
      # left/right with c()); preserves the original ordering of S.
      th <- x[i, j]
      left <- S[col < th]
      right <- S[col >= th]
      L <- f(y[left])
      R <- f(y[right])
      score <- L + R
      if (score < best.score) {
        best.score <- score
        info <- list(i = i, j = j, left = left, right = right,
                     score = best.score, left.score = L, right.score = R)
      }
    }
  }
  return(info)
}
# Cumulative random-forest accuracy curve.
# z: (n.trees x n.test) matrix of per-tree class predictions; row b is
#    tree b's prediction for each test point.
# truth: true labels of the test points. The default reproduces the old
#    behavior, which read the globals y and B and hard-coded 50 test
#    points / offset 100; sizes are now taken from dim(z) instead.
# Returns a vector whose b-th entry is the number of test points whose
# majority vote over the first b trees matches truth.
rf <- function(z, truth = y[100 + seq_len(ncol(z))]) {
  n.trees <- nrow(z)
  n.test <- ncol(z)
  # Majority label of a vote vector; which.max on table() breaks ties
  # the same way as the top-level mode() (first level wins).
  majority <- function(v) {
    tab <- table(v)
    names(tab)[which.max(tab)]
  }
  correct <- numeric(n.trees) # preallocated (was grown with c())
  for (b in seq_len(n.trees)) {
    hits <- 0
    for (i in seq_len(n.test))
      hits <- hits + (majority(z[1:b, i]) == truth[i])
    correct[b] <- hits
  }
  return(correct)
}
# Random forest on iris: shuffle the data, train B bootstrap trees on
# 100 rows, and track test accuracy on the remaining 50 as trees are
# added, for each feature-subsample size m = 4, 3, 2, 1.
df <- iris
n <- nrow(df)
p <- ncol(df)
index <- sample(1:n, n, replace = FALSE) # shuffle rows once
x <- as.matrix(df[index, 1:4])
y <- as.vector(df[index, 5])
train <- 1:100
test <- 101:150
B <- 100 # number of bootstrap trees
z <- array(dim = c(B, 50)) # z[b, i]: tree b's predicted class for test point i
m <- 4 # features considered per split (m = p is plain bagging)
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE) # bootstrap sample of the training rows
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z4 <- z
# Change m = 4 to m = 3, m = 2, m = 1, and store in z3, z2, z1 respectively
m <- 3
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE)
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z3 <- z
m <- 2
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE)
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z2 <- z
m <- 1
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE)
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z1 <- z
# Plot cumulative correct counts; the +-0.1 vertical offsets keep
# overlapping curves distinguishable.
plot(1:B, rf(z4) - 0.1, type = "l", ylim = c(0, 50), col = 2,
xlab = "Number of Tree Generations", ylab = "Number of Correct Answers/50",
main = "Random Forest")
lines(1:B, rf(z3), col = 3)
lines(1:B, rf(z2) + 0.1, col = 4)
lines(1:B, rf(z1) - 0.1, col = 5)
legend("bottomright", legend = c("m = 4", "m = 3", "m = 2", "m = 1"),
col = c(2, 3, 4, 5), lty = 1)
74
library(MASS)
library(gbm)
## Warning: package 'gbm' was built under R version 4.2.2
## Loaded gbm 2.1.8.1
# Gradient boosting (gbm) on Boston: test squared error versus number
# of trees, for interaction depths 1-3.
train <- 1:200
test <- 201:300
boston.test <- Boston[test, 14] # true medv on the test rows
MAX <- 5000
# Tree counts to evaluate: dense at the start, then every 50 trees.
nn <- c(seq(1, 9, 1), seq(10, 90, 10), seq(100, MAX, 50))
# Empty frame (type = "n"); curves are added one depth at a time below.
plot(nn, nn / MAX * 80, type = "n",
xlab = "Number of Trees Generated", ylab = "Squared Error on Test Data")
d <- 1:3 # NOTE(review): unused -- the depth is taken directly from i below
color <- c("blue", "green", "red")
for (i in 1:3) {
x <- NULL
y <- NULL
# NOTE(review): this refits from scratch for every n; gbm can predict
# at several n.trees from a single MAX-tree fit, which is much faster.
for (n in nn) {
boost.boston <- gbm(medv~., data = Boston[train, ],
distribution = "gaussian", n.trees = n,
shrinkage = 0.001, interaction.depth = i)
yhat.boost <- predict(boost.boston, n.trees = n,
newdata = Boston[test,])
S <- mean((yhat.boost - boston.test) ^ 2)
x <- c(x, n)
y <- c(y, S)
}
lines(x, y, col = color[i])
}
legend("topright", legend = c("d = 1", "d = 2", "d = 3"),
col = color, lwd = 2, cex = .8)