sq.loss <- function(y) {
y.bar <- mean(y)
return(sum((y - y.bar) ^ 2))
}
branch <- function(x, y, f, S, m = ncol(x)) { # Sはサンプル集合の添え字
n <- length(S)
p <- ncol(x)
if (n == 0)
return(NULL)
best.score <- Inf
for (j in 1:p) {
for (i in S) {
left <- NULL
right <- NULL
for (k in S) {
if (x[k, j] < x[i, j]) {
left <- c(left, k)
} else {
right <- c(right, k)
}
}
L <- f(y[left])
R <- f(y[right])
score <- L + R
if (score < best.score) {
best.score <- score
info <- list(i = i, j = j, left = left, right = right,
score = best.score, left.score = L, right.score = R)
}
}
}
return(info)
}
dt <- function(x, y, f = "sq.loss", alpha = 0, n.min = 1, m = ncol(x)) {
if (f == "sq.loss") {
g <- sq.loss
} else if (f == "mis.match") {
g <- mis.match
} else if (f == "gini") {
g <- gini
} else {
g <- entropy
}
n <- length(y)
stack <- list()
stack[[1]] <- list(parent = 0, set = 1:n, score = g(y))
vertex <- list()
k <- 0
while (length(stack) > 0) {
r <- length(stack)
node <- stack[[r]]
stack <- stack[-r] # POP
k <- k + 1 # PUSHされた頂点のid = kは,POPされてはじめて付与
res <- branch(x, y, g, node$set, m) # 最適な分割に関する情報
if (node$score - res$score < alpha || length(node$set) < n.min ||
length(res$left) == 0 || length(res$right) == 0) {
vertex[[k]] <- list(parent = node$parent, j = 0, set = node$set)
} else {
vertex[[k]] <- list(parent = node$parent, set = node$set,
th = x[res$i, res$j], j = res$j)
stack[[r]] <- list(parent = k, set = res$right,
score = res$right.score) # PUSH
stack[[r + 1]] <- list(parent = k, set = res$left,
score=res$left.score) # PUSH
}
}
# 最頻値
mode <- function(y) {
return(names(sort(table(y), decreasing = TRUE))[1])
}
# 端点にその値(center),内点にその左右の子のID (left, right)
r <- length(vertex)
for (h in 1:r) {
vertex[[h]]$left <- 0
vertex[[h]]$right <- 0
}
for (h in r:2) {
pa <- vertex[[h]]$parent
if (vertex[[pa]]$right == 0) {
vertex[[pa]]$right <- h
} else {
vertex[[pa]]$left <- h
}
}
if (f == "sq.loss") {
g <- mean
} else {
g <- mode
}
for (h in 1:r)
if (vertex[[h]]$j == 0)
vertex[[h]]$center <- g(y[vertex[[h]]$set])
return(vertex)
}
library(MASS)
library(igraph)
## Warning: package 'igraph' was built under R version 4.0.3
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
x <- as.matrix(Boston[, 1:13])
y <- as.vector(Boston[, 14])
vertex <- dt(x, y, n.min = 50)
# グラフの出力
r <- length(vertex)
col <- array(dim =r)
edge.list <- matrix(nrow = r, ncol = 2)
for (h in 1:r)
col[h] <- vertex[[h]]$j
for (h in 1:r)
edge.list[h, ] <- c(vertex[[h]]$parent, h)
edge.list <- edge.list[-1, ]
g <- graph_from_edgelist(edge.list)
V(g)$color <- col
plot(g, layout = layout.reingold.tilford(g, root = 1))
# 表の出力
VAR <- NULL
TH <- NULL
for (h in 1:r) {
if (vertex[[h]]$j != 0) {
j <- vertex[[h]]$j
th <- vertex[[h]]$th
VAR <- c(VAR, j)
TH <- c(TH, th)
}
}
cbind(VAR, TH)
## VAR TH
## [1,] 6 6.94300
## [2,] 13 14.43000
## [3,] 8 1.41300
## [4,] 6 6.54600
## [5,] 13 7.60000
## [6,] 10 223.00000
## [7,] 6 6.08300
## [8,] 7 69.50000
## [9,] 8 4.49860
## [10,] 13 10.15000
## [11,] 10 273.00000
## [12,] 1 7.02259
## [13,] 5 0.53800
## [14,] 13 19.01000
## [15,] 7 85.70000
## [16,] 5 0.61400
## [17,] 13 19.77000
## [18,] 6 7.45400
value <- function(u, vertex) {
r <- 1
while (vertex[[r]]$j != 0) {
if (u[vertex[[r]]$j] < vertex[[r]]$th) {
r <- vertex[[r]]$left
} else {
r <- vertex[[r]]$right
}
}
return(vertex[[r]]$center)
}
df <- Boston[1:100, ]
n <- nrow(df)
p <- ncol(df)
x <- as.matrix(df[, 1:13])
y <- as.vector(df[, 14])
alpha.seq <- seq(0, 1.5, 0.1)
s <- floor(n / 10)
m <- length(vertex)
out <- NULL
for (alpha in alpha.seq) {
SS <- 0
for (h in 1:10) { # 10-fold CV
test <- (h * s - s + 1):(h * s)
train <- setdiff(1:n, test)
vertex <- dt(x[train, ], y[train], alpha = alpha)
for (t in test)
SS <- SS + (y[t] - value(x[t, ], vertex)) ^ 2
}
out <- c(out, SS / 100)
}
plot(alpha.seq, out, type = "l", ylim = c(10.5, 12),
xlab = "alpha", ylab = "二乗誤差",
main = "CVで最適なalpha(Bostonデータセット,N = 100)")
# 最頻値
mode <- function(y) {
return(names(sort(table(y), decreasing = TRUE))[1])
}
# 誤り率
mis.match <- function(y) {
y.hat <- mode(y)
return(sum(y != y.hat))
}
# Gini
gini <- function(y) {
n <- length(y)
if (n == 0)
return(0)
z <- as.vector(table(y))
m <- length(z)
T <- 0
for (j in 1:m)
T <- T + z[j] * (n - z[j])
return(T/n)
}
# エントロピー
entropy <- function(y) {
n <- length(y)
if (n == 0)
return(0)
z <- as.vector(table(y))
m <- length(z)
T <- 0
for (j in 1:m)
if (z[j] != 0)
T <- T + z[j] * log(n / z[j])
return(T)
}
df <- iris
x <- as.matrix(df[, 1:4])
y <- as.matrix(df[, 5])
vertex <- dt(x, y, "mis.match", n.min = 4)
m <- length(vertex)
u <- NULL
v <- NULL
for (h in 1:m) {
if (vertex[[h]]$j == 0) {
w <- y[vertex[[h]]$set]
u <- c(u, rep(mode(w), length(w)))
v <- c(v, w)
}
}
table(u, v)
## v
## u setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 0
## virginica 0 2 50
col <- array(dim = m)
edge.list <- matrix(nrow = m, ncol = 2)
for (h in 1:m)
col[h] <- vertex[[h]]$j
for (h in 1:m)
edge.list[h, ] <- c(vertex[[h]]$parent, h)
edge.list <- edge.list[-1, ]
library(igraph)
g <- graph_from_edgelist(edge.list)
V(g)$color <- col
V(g)$name <- ""
plot(g, pin = c(4, 4), layout = layout.reingold.tilford(g, root = 1))
title("誤り率")
# Giniとエントロピーも同様に実行した(mis.matchをgini, entropyに)
df <- iris
n <- nrow(df)
p <- ncol(df)
index <- sample(1:n, n, replace = FALSE)
x <- as.matrix(df[index, 1:4])
y <- as.matrix(df[index, 5])
n.min.seq <- 1:10
s <- 15
out <- NULL
for (n.min in n.min.seq) {
SS <- 0
for (h in 10:1) {
test <- (15 * h - 14):(15 * h)
train <- setdiff(1:n, test)
vertex <- dt(x[train, ], y[train], "mis.match", n.min,p)
for (i in test)
SS <- SS + as.integer(value(x[i, ], vertex) != y[i])
}
print(SS)
}
## [1] 10
## [1] 12
## [1] 12
## [1] 12
## [1] 12
## [1] 12
## [1] 12
## [1] 12
## [1] 18
## [1] 18
par(mfrow = c(2, 4))
# データ生成
n <- 200
p <- 5
x <- matrix(rnorm(n * p), nrow = n, ncol = p)
beta <- rnorm(p)
y <- abs(round(x %*% beta + rnorm(n))) + 1
for (h in 1:8) {
index <- sample(1:n, n, replace = TRUE)
x <- x[index, ]
y <- y[index]
vertex <- dt(x, y, "mis.match", n.min = 6)
r <- length(vertex)
col <- array(dim = r)
edge.list <- matrix(nrow = r, ncol = 2)
for (h in 1:r)
col[h] <- vertex[[h]]$j
for (h in 1:r)
edge.list[h, ] <- c(vertex[[h]]$parent, h)
edge.list <- edge.list[-1, ]
g <- graph_from_edgelist(edge.list)
V(g)$color <- col
V(g)$name <- ""
plot(g, edge.arrow.size = 0.1, layout = layout.reingold.tilford(g, root = 1))
}
par(mfrow = c(1, 1))
branch <- function(x, y, f, S, m = ncol(x)){ # mの値の設定, デフォルトはp
n <- length(S)
p <- ncol(x)
if (n == 0)
return(NULL)
best.score <- Inf
if (m < p) { # ここが違う
T <- sample(1:p, m, replace = FALSE) # 〃
} else { # 〃
T <- 1:p # 〃
} # 〃
for (j in T) { # Tの中で最適な変数を選ぶ
for (i in S) {
left <- NULL
right <- NULL
for (k in S) {
if (x[k, j] < x[i, j]) {
left <- c(left, k)
} else {
right <- c(right, k)
}
}
L <- f(y[left])
R <- f(y[right])
score <- L + R
if (score < best.score) {
best.score <- score
info <- list(i = i, j = j, left = left, right = right,
score = best.score, left.score = L, right.score = R)
}
}
}
return(info)
}
rf <- function(z) {
zz <- array(dim = c(B, 50))
zzz <- NULL
for (b in 1:B) {
for (i in 1:50)
zz[b, i] <- (mode(z[1:b, i]) == y[i + 100])
zzz <- c(zzz, sum(zz[b, ]))
}
return(zzz)
}
set.seed(101)
df <- iris
n <- nrow(df)
p <- ncol(df)
index <- sample(1:n, n, replace = FALSE)
x <- as.matrix(df[index, 1:4])
y <- as.vector(df[index, 5])
train <- 1:100
test <- 101:150
B <- 100
z <- array(dim = c(B, 50))
m <- 4
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE)
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z4 <- z
# m = 4をm = 3, m = 2, m = 1に変えて,それぞれz3, z2, z1に格納
# この部分は書籍に不掲載
m <- 3
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE)
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z3 <- z
m <- 2
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE)
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z2 <- z
m <- 1
for (b in 1:B) {
index <- sample(train, 100, replace = TRUE)
vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
for (i in test)
z[b, i - 100] <- value(x[i, ], vertex)
}
z1 <- z
# ここまで
plot(1:B, rf(z4) - 0.1, type = "l", ylim = c(0, 50), col = 2,
xlab = "木の生成回数", ylab = "正答回数/50回",
main = "ランダムフォレスト")
lines(1:B, rf(z3), col = 3)
lines(1:B, rf(z2) + 0.1, col = 4)
lines(1:B, rf(z1) - 0.1, col = 5)
legend("bottomright", legend = c("m = 4", "m = 3", "m = 2", "m = 1"),
col = c(2, 3, 4, 5), lty = 1)
b.dt <- function(x, y, d, f = "sq.loss") {
n <- nrow(x)
if (f == "sq.loss") {
g <- sq.loss
} else if (f == "mis.match") {
g <- mis.match
} else if (f == "gini") {
g <- gini
} else {
g <- entropy
}
vertex <- list()
vertex[[1]] <- list(parent = 0, set = 1:n, score = g(y), j = 0)
while (length(vertex) <= 2 * d - 1) { #
r <- length(vertex) #
gain.max <- -Inf #
for (h in 1:r) { #
if (vertex[[h]]$j == 0) { #
res <- branch(x, y, g, vertex[[h]]$set) #
gain <- vertex[[h]]$score - res$score #
if (gain > gain.max) { #
gain.max <- gain #
h.max <- h #
res.max <- res #
} #
} #
} #
vertex[[h.max]]$th <- x[res.max$i, res.max$j]
vertex[[h.max]]$j <- res.max$j
vertex[[r + 1]] <- list(parent = h.max, set = res.max$left,
score = res.max$left.score, j = 0)
vertex[[r + 2]] <- list(parent = h.max, set = res.max$right,
score = res.max$right.score, j = 0)
}
r <- 2 * d + 1 #
for (h in 1:r) {
vertex[[h]]$left <- 0
vertex[[h]]$right <- 0
}
for (h in r:2) {
pa <- vertex[[h]]$parent
if (vertex[[pa]]$right == 0) {
vertex[[pa]]$right <- h
} else {
vertex[[pa]]$left <- h
}
if (vertex[[h]]$right == 0 & vertex[[h]]$left == 0)
vertex[[h]]$j <- 0
}
mode <- function(y) {
return(names(sort(table(y), decreasing = TRUE))[1])
}
if (f == "sq.loss") {
g <- mean
} else {
g <- mode
}
for (h in 1:r)
if (vertex[[h]]$j == 0)
vertex[[h]]$center <- g(y[vertex[[h]]$set])
return(vertex)
}
\(*\) このプログラムは書籍とは異なっています。
library(MASS)
df <- Boston
x <- as.matrix(df[, 1:13])
y <- as.matrix(df[, 14])
train <- 1:200
test <- 201:300
B <- 200
lambda <- 0.1
d <- 1
trees <- list()
r <- y[train]
for (b in 1:B) {
trees[[b]] <- b.dt(x[train, ], r, d)
for (i in train)
r[i] <- r[i] - lambda * value(x[i, ], trees[[b]])
}
z <- array(0, dim = c(B, 600))
for (i in test)
z[1, i] <- lambda * value(x[i, ], trees[[1]])
for (b in 2:B)
for (i in test)
z[b, i] <- z[b - 1, i] + lambda * value(x[i, ], trees[[b]])
out <- NULL
for (b in 1:B)
out <- c(out, sum((y[test] - z[b, test]) ^ 2) / length(test))
out1 <- out
# d = 2, d = 3でも実行する
d <- 2
trees <- list()
r <- y[train]
for (b in 1:B) {
trees[[b]] <- b.dt(x[train, ], r, d)
for (i in train)
r[i] <- r[i] - lambda * value(x[i, ], trees[[b]])
}
z <- array(0, dim = c(B, 600))
for (i in test)
z[1, i] <- lambda * value(x[i, ], trees[[1]])
for (b in 2:B)
for (i in test)
z[b, i] <- z[b - 1, i] + lambda * value(x[i, ], trees[[b]])
out <- NULL
for (b in 1:B)
out <- c(out, sum((y[test] - z[b, test]) ^ 2) / length(test))
out2 <- out
# d = 3
d <- 3
trees <- list()
r <- y[train]
for (b in 1:B) {
trees[[b]] <- b.dt(x[train, ], r, d)
for (i in train)
r[i] <- r[i] - lambda * value(x[i, ], trees[[b]])
}
z <- array(0, dim = c(B, 600))
for (i in test)
z[1, i] <- lambda * value(x[i, ], trees[[1]])
for (b in 2:B)
for (i in test)
z[b, i] <- z[b - 1, i] + lambda * value(x[i, ], trees[[b]])
out <- NULL
for (b in 1:B)
out <- c(out, sum((y[test] - z[b, test]) ^ 2) / length(test))
out3 <- out
plot(21:100, xlab = "生成した木の個数", ylab = "テストデータでの二乗誤差",
type = "n", xlim = c(20, 100), ylim = c(0, 35), main = "ブースティング")
lines(21:100, out1[21:100], col = "red")
lines(21:100, out2[21:100], col = "blue")
lines(21:100, out3[21:100], col = "green")
legend("topright", legend = c("d = 1", "d = 2", "d = 3"),
col = c("red", "blue", "green"), lty = 1)
library(MASS)
library(gbm)
## Warning: package 'gbm' was built under R version 4.0.5
## Loaded gbm 2.1.8
train <- 1:200
test <- 201:300
boston.test <- Boston[test, 14]
MAX <- 5000
nn <- c(seq(1, 9, 1), seq(10, 90, 10), seq(100, MAX, 50))
plot(nn, nn / MAX * 80, type = "n",
xlab = "生成した木の個数", ylab = "テストデータでの二乗誤差")
d <- 1:3
color <- c("blue", "green", "red")
for (i in 1:3) {
x <- NULL
y <- NULL
for (n in nn) {
boost.boston <- gbm(medv~., data = Boston[train, ],
distribution = "gaussian", n.trees = n,
shrinkage = 0.001, interaction.depth = i)
yhat.boost <- predict(boost.boston, n.trees = n, newdata = Boston[test, ])
S <- mean((yhat.boost - boston.test) ^ 2)
x <- c(x, n)
y <- c(y, S)
}
lines(x, y, col = color[i])
}
legend("topright", legend = c("d = 1", "d = 2", "d = 3"),
col = color, lwd = 2, cex = .8)