第7章 決定木(問題69~74)
69
mode <- function(y) {
return(names(sort(table(y), decreasing = TRUE))[1])
}
70
branch <- function(x, y, f, S, m = ncol(x)) {
n <- length(S)
p <- ncol(x)
if (n == 0)
return(NULL)
best.score <- Inf
for (j in 1:p) {
for (i in S) {
left <- NULL
right <- NULL
for (k in S) {
if (x[k, j] < x[i, j]) {
left <- c(left, k)
} else {
## 空欄(1) ##
}
}
L <- f(y[left])
R <- f(y[right])
score <- ## 空欄(2) ##
if (score < best.score) {
best.score <- score
info <- list(i = i, j = j, left = left, right = right,
score = best.score, left.score = L, right.score = R)
}
}
}
return(info)
}
n <- 100
p <- 5
x <- matrix(rnorm(n * p), nrow = n, ncol = p)
y <- rnorm(n)
S <- sample(1:n, 10, replace = FALSE)
branch(x, y, sq.loss, S, m = ncol(x))
71
dt <- function(x, y, f = "sq.loss", alpha = 0, n.min = 1, m = ncol(x)) {
if (f == "sq.loss") {
g <- sq.loss
} else if (f == "mis.match") {
g <- mis.match
} else if (f == "gini") {
g <- gini
} else {
g <- entropy
}
n <- length(y)
stack <- list()
stack[[1]] <- list(parent = 0, set = 1:n, score = g(y))
vertex <- list()
k <- 0
while (length(stack) > 0) {
r <- length(stack)
node <- stack[[r]]
stack <- stack[-r] # POP
k <- k + 1 # PUSHされたノードのid = kは,POPされてはじめて付与
res <- branch(x, y, g, node$set, m)
if (node$score - res$score < alpha || length(node$set) < n.min ||
length(res$left) == 0 || length(res$right) == 0) {
vertex[[k]] <- list(parent = node$parent, j = 0, set = node$set)
} else {
vertex[[k]] <- list(parent = node$parent, set = node$set,
th = x[res$i, res$j], j = res$j)
stack[[r]] <- list(parent = k, set = res$right,
score = res$right.score) # PUSH
stack[[r + 1]] <- list(parent = k, set = res$left,
score=res$left.score) # PUSH
}
}
mode <- function(y) {
return(names(sort(table(y), decreasing = TRUE))[1])
}
# 端点にその値(center),内点にその左右の子のID (left, right)
r <- length(vertex)
for (h in 1:r) {
vertex[[h]]$left <- 0
vertex[[h]]$right <- 0
}
for (h in r:2) {
pa <- vertex[[h]]$parent
if (vertex[[pa]]$right == 0) {
vertex[[pa]]$right <- h
} else {
vertex[[pa]]$left <- h
}
}
if (f == "sq.loss") {
g <- mean
} else {
g <- mode
}
for (h in 1:r)
if (vertex[[h]]$j == 0)
vertex[[h]]$center <- g(y[vertex[[h]]$set])
return(vertex)
}
library(igraph)
r <- length(vertex)
col <- array(dim = r)
edge.list <- matrix(nrow = r, ncol = 2)
for (h in 1:r)
col[h] <- vertex[[h]]$j # 頂点の分岐で用いる変数を色で識別
for (h in 1:r)
edge.list[h, ] <- c(vertex[[h]]$parent, h)
edge.list <- edge.list[-1, ] # 辺の集合をedge.listに格納
g <- graph_from_edgelist(edge.list)
V(g)$color <- col
plot(g, layout = layout.reingold.tilford(g, root = 1))
72
value <- function(u, vertex) {
r <- 1
while (vertex[[r]]$j != 0) {
if (u[vertex[[r]]$j] < vertex[[r]]$th) {
r <- vertex[[r]]$left
} else {
r <- vertex[[r]]$right
}
}
return(vertex[[r]]$center)
}
df <- Boston[1:100, ]
n <- nrow(df)
p <- ncol(df)
x <- as.matrix(df[, 1:13])
y <- as.vector(df[, 14])
alpha.seq <- seq(0, 1.5, 0.1)
s <- floor(n / 10)
m <- length(vertex)
out <- NULL
for (alpha in alpha.seq) {
SS <- 0
for (h in 1:10) { # 10-fold CV
## 空欄 ## <- (h * s - s + 1):(h * s)
## 空欄 ## <- setdiff(1:n, ## 空欄 ##)
vertex <- dt(x[## 空欄 ##, ], y[## 空欄 ##], alpha = alpha)
for (t in ## 空欄 ##)
SS <- SS + (y[t] - value(x[t, ], vertex)) ^ 2
}
out <- c(out, SS / 100)
}
plot(alpha.seq, out, type = "l", ylim = c(10.5, 12),
xlab = "alpha", ylab = "二乗誤差",
main = "CVで最適なalpha(Bostonデータセット,N = 100)")
73
branch <- function(x, y, f, S, m = ncol(x)){ # mの値の設定, デフォルトはp
n <- length(S)
p <- ncol(x)
if (n == 0)
return(NULL)
best.score <- Inf
if (m < p) {
## 空欄 ##
}
for (j in T) { # Tの中で最適な変数を選ぶ
for (i in S) {
left <- NULL
right <- NULL
for (k in S) {
if (x[k, j] < x[i, j]) {
left <- c(left, k)
} else {
right <- c(right, k)
}
}
L <- f(y[left])
R <- f(y[right])
score <- L + R
if (score < best.score) {
best.score <- score
info <- list(i = i, j = j, left = left, right = right,
score = best.score, left.score = L, right.score = R)
}
}
}
return(info)
}
rf <- function(z) {
zz <- array(dim = c(B, 50))
zzz <- NULL
for (b in 1:B) {
for (i in 1:50)
zz[b, i] <- (mode(z[1:b, i]) == y[i + 100])
zzz <- c(zzz, sum(zz[b, ]))
}
return(zzz)
}
df <- iris
n <- nrow(df)
p <- ncol(df)
index <- sample(1:n, n, replace = FALSE)
x <- as.matrix(df[index, 1:4])
y <- as.vector(df[index, 5])
train <- 1:100
test <- 101:150
B <- 100
z <- array(dim = c(B, 50))
m <- 4
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE)
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z4 <- z
# m = 4をm = 3, m = 2, m = 1に変えて,それぞれz3, z2, z1に格納する
plot(1:B, rf(z4) - 0.1, type = "l", ylim = c(0, 50), col = 2,
xlab = "木の生成回数", ylab = "正答回数/50回",
main = "ランダムフォレスト")
lines(1:B, rf(z3), col = 3)
lines(1:B, rf(z2) + 0.1, col = 4)
lines(1:B, rf(z1) - 0.1, col = 5)
legend("bottomright", legend = c("m = 4", "m = 3", "m = 2", "m = 1"),
col = c(2, 3, 4, 5), lty = 1)
74
library(MASS)
library(gbm)
train <- 1:200
test <- 201:300
boston.test <- Boston[test, 14]
MAX <- 5000
nn <- c(seq(1, 9, 1), seq(10, 90, 10), seq(100, MAX, 50))
plot(nn, nn / MAX * 80, type = "n",
xlab = "生成した木の個数", ylab = "テストデータでの二乗誤差")
d <- 1:3
color <- c("blue", "green", "red")
for (i in 1:3) {
x <- NULL
y <- NULL
for (n in nn) {
boost.boston <- gbm(medv~., data = Boston[train, ],
distribution = "gaussian", n.trees = n,
shrinkage = 0.001, interaction.depth = ## 空欄(1) ##)
yhat.boost <- predict(boost.boston, n.trees = n,
newdata = ## 空欄(2) ##)
S <- mean((yhat.boost - boston.test) ^ 2)
x <- c(x, n)
y <- c(y, S)
}
lines(x, y, col = color[i])
}
legend("topright", legend = c("d = 1", "d = 2", "d = 3"),
col = color, lwd = 2, cex = .8)