第7章 決定木(問題69~74)

69

mode <- function(y) {
  return(names(sort(table(y), decreasing = TRUE))[1])
}

70

branch <- function(x, y, f, S, m = ncol(x)) {
  n <- length(S)
  p <- ncol(x)
  if (n == 0)
    return(NULL)
  best.score <- Inf
  for (j in 1:p) {
    for (i in S) {
      left <- NULL
      right <- NULL
      for (k in S) {
        if (x[k, j] < x[i, j]) {
          left <- c(left, k)
        } else {
          ## 空欄(1) ##
        }
      }
      L <- f(y[left])
      R <- f(y[right])
      score <- ## 空欄(2) ##
      if (score < best.score) {
        best.score <- score
        info <- list(i = i, j = j, left = left, right = right,
                     score = best.score, left.score = L, right.score = R)
      }
    }
  }
  return(info)
}

n <- 100
p <- 5
x <- matrix(rnorm(n * p), nrow = n, ncol = p)
y <- rnorm(n)
S <- sample(1:n, 10, replace = FALSE)
branch(x, y, sq.loss, S, m = ncol(x))

71

dt <- function(x, y, f = "sq.loss", alpha = 0, n.min = 1, m = ncol(x)) {
  if (f == "sq.loss") {
    g <- sq.loss
  } else if (f == "mis.match") {
    g <- mis.match
  } else if (f == "gini") {
    g <- gini
  } else {
    g <- entropy
  }
  n <- length(y)
  stack <- list()
  stack[[1]] <- list(parent = 0, set = 1:n, score = g(y))
  vertex <- list()
  k <- 0
  while (length(stack) > 0) {
    r <- length(stack)
    node <- stack[[r]]
    stack <- stack[-r]  # POP
    k <- k + 1  # PUSHされたノードのid = kは,POPされてはじめて付与
    res <- branch(x, y, g, node$set, m)
    if (node$score - res$score < alpha || length(node$set) < n.min ||
        length(res$left) == 0 || length(res$right) == 0) {
      vertex[[k]] <- list(parent = node$parent, j = 0, set = node$set)
    } else {
      vertex[[k]] <- list(parent = node$parent, set = node$set,
                          th = x[res$i, res$j], j = res$j)
      stack[[r]] <- list(parent = k, set = res$right,
                         score = res$right.score)  # PUSH
      stack[[r + 1]] <- list(parent = k, set = res$left,
                             score=res$left.score)  # PUSH
    }
  }
  
  mode <- function(y) {
    return(names(sort(table(y), decreasing = TRUE))[1])
  }
  
  # 端点にその値(center),内点にその左右の子のID (left, right)
  r <- length(vertex)
  for (h in 1:r) {
    vertex[[h]]$left <- 0
    vertex[[h]]$right <- 0
  }
  for (h in r:2) {
    pa <- vertex[[h]]$parent
    if (vertex[[pa]]$right == 0) {
      vertex[[pa]]$right <- h
    } else {
      vertex[[pa]]$left <- h
    }
  }
  if (f == "sq.loss") {
    g <- mean
  } else {
    g <- mode
  }
  for (h in 1:r)
    if (vertex[[h]]$j == 0)
      vertex[[h]]$center <- g(y[vertex[[h]]$set])
  return(vertex)
}
library(igraph)

r <- length(vertex)
col <- array(dim = r)
edge.list <- matrix(nrow = r, ncol = 2)
for (h in 1:r)
  col[h] <- vertex[[h]]$j  # 頂点の分岐で用いる変数を色で識別
for (h in 1:r)
  edge.list[h, ] <- c(vertex[[h]]$parent, h)
edge.list <- edge.list[-1, ]  # 辺の集合をedge.listに格納

g <- graph_from_edgelist(edge.list)
V(g)$color <- col
plot(g, layout = layout.reingold.tilford(g, root = 1))

72

value <- function(u, vertex) {
  r <- 1
  while (vertex[[r]]$j != 0) {
    if (u[vertex[[r]]$j] < vertex[[r]]$th) {
      r <- vertex[[r]]$left
    } else {
      r <- vertex[[r]]$right
    }
  }
  return(vertex[[r]]$center)
}

df <- Boston[1:100, ]
n <- nrow(df)
p <- ncol(df)
x <- as.matrix(df[, 1:13])
y <- as.vector(df[, 14])
alpha.seq <- seq(0, 1.5, 0.1)
s <- floor(n / 10)
m <- length(vertex)
out <- NULL
for (alpha in alpha.seq) {
  SS <- 0
  for (h in 1:10) {  # 10-fold CV
    ## 空欄 ## <- (h * s - s + 1):(h * s)
    ## 空欄 ## <- setdiff(1:n, ## 空欄 ##)
    vertex <- dt(x[## 空欄 ##, ], y[## 空欄 ##], alpha = alpha)
    for (t in ## 空欄 ##)
      SS <- SS + (y[t] - value(x[t, ], vertex)) ^ 2
  }
  out <- c(out, SS / 100)
}
plot(alpha.seq, out, type = "l", ylim = c(10.5, 12),
     xlab = "alpha", ylab = "二乗誤差",
     main = "CVで最適なalpha(Bostonデータセット,N = 100)")

73

branch <- function(x, y, f, S, m = ncol(x)){  # mの値の設定, デフォルトはp
  n <- length(S)
  p <- ncol(x)
  if (n == 0)
    return(NULL)
  best.score <- Inf
  if (m < p) {
    ## 空欄 ##
  }
  for (j in T) {  # Tの中で最適な変数を選ぶ
    for (i in S) {
      left <- NULL
      right <- NULL
      for (k in S) {
        if (x[k, j] < x[i, j]) {
          left <- c(left, k)
        } else {
          right <- c(right, k)
        }
      }
      L <- f(y[left])
      R <- f(y[right])
      score <- L + R
      if (score < best.score) {
        best.score <- score
        info <- list(i = i, j = j, left = left, right = right,
                     score = best.score, left.score = L, right.score = R)
      }
    }
  }
  return(info)
}

rf <- function(z) {
  zz <- array(dim = c(B, 50))
  zzz <- NULL
  for (b in 1:B) {
    for (i in 1:50)
      zz[b, i] <- (mode(z[1:b, i]) == y[i + 100])
    zzz <- c(zzz, sum(zz[b, ]))
  }
  return(zzz)
}

df <- iris
n <- nrow(df)
p <- ncol(df)
index <- sample(1:n, n, replace = FALSE)
x <- as.matrix(df[index, 1:4])
y <- as.vector(df[index, 5])
train <- 1:100
test <- 101:150
B <- 100
z <- array(dim = c(B, 50))
m <- 4
for (b in 1:B) {
  index <- sample(train, 100, replace = TRUE)
  vertex <- dt(x[index, ], y[index], "mis.match", n.min = 2, m = m)
  for (i in test)
    z[b, i - 100] <- value(x[i, ], vertex)
}
z4 <- z
# m = 4をm = 3, m = 2, m = 1に変えて,それぞれz3, z2, z1に格納する
plot(1:B, rf(z4) - 0.1, type = "l", ylim = c(0, 50), col = 2,
     xlab = "木の生成回数", ylab = "正答回数/50回",
     main = "ランダムフォレスト")
lines(1:B, rf(z3), col = 3)
lines(1:B, rf(z2) + 0.1, col = 4)
lines(1:B, rf(z1) - 0.1, col = 5)
legend("bottomright", legend = c("m = 4", "m = 3", "m = 2", "m = 1"),
       col = c(2, 3, 4, 5), lty = 1)

74

library(MASS)
library(gbm)

train <- 1:200
test <- 201:300
boston.test <- Boston[test, 14]
MAX <- 5000
nn <- c(seq(1, 9, 1), seq(10, 90, 10), seq(100, MAX, 50))
plot(nn, nn / MAX * 80, type = "n",
     xlab = "生成した木の個数", ylab = "テストデータでの二乗誤差")
d <- 1:3
color <- c("blue", "green", "red")
for (i in 1:3) {
  x <- NULL
  y <- NULL
  for (n in nn) {
    boost.boston <- gbm(medv~., data = Boston[train, ],
                        distribution = "gaussian", n.trees = n,
                        shrinkage = 0.001, interaction.depth = ## 空欄(1) ##)
    yhat.boost <- predict(boost.boston, n.trees = n,
                          newdata = ## 空欄(2) ##)
    S <- mean((yhat.boost - boston.test) ^ 2)
    x <- c(x, n)
    y <- c(y, S)
  }
  lines(x, y, col = color[i])
}
legend("topright", legend = c("d = 1", "d = 2", "d = 3"),
       col = color, lwd = 2, cex = .8)