Statistical Learning with Math and R, Springer

Chapter 8 Decision Trees

8.1 Decision Trees for Regression

sq.loss=function(y){y.bar=mean(y); return(sum((y-y.bar)^2))} ## residual sum of squares within a node
branch=function(x,y,f,S,m=ncol(x)){ ## S is the set of sample indices
  n=length(S)
  p=ncol(x)
  if(n==0)return(NULL)
  best.score=Inf
  for(j in 1:p)for(i in S){
    left=NULL
    right=NULL
    for(k in S)if(x[k,j]<x[i,j])left=c(left,k) else right=c(right,k)
    L=f(y[left])
    R=f(y[right])
    score=L+R
    if(score<best.score){
      best.score=score;
      info=list(i=i, j=j, left=left, right=right, score=best.score,left.score=L, right.score=R)
    }
  }
  return(info)
}
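
As a quick sanity check (our own toy example, not from the text), branch should separate the two constant groups and attain score 0:

x.toy=matrix(1:5,ncol=1); y.toy=c(1,1,5,5,5)
res=branch(x.toy, y.toy, sq.loss, 1:5)
res$i; res$j        ## split at sample i=3 on variable j=1
res$left; res$right ## left={1,2}, right={3,4,5}; res$score is 0
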
dt=function(x,y,f="sq.loss",alpha=0, n.min=1, m=ncol(x)){
  if(f=="sq.loss")g=sq.loss 
  else if(f=="mis.match") g=mis.match 
  else if(f=="gini") g=gini 
  else g=entropy
  n=length(y)
  stack=list()
  stack[[1]]=list(parent=0, set=1:n, score=g(y))
  vertex=list()
  k=0
  while(length(stack)>0){
    r=length(stack)
    node=stack[[r]]
    stack=stack[-r] ## POP
    k=k+1 ## A node pushed onto the stack receives its ID k when it is popped.
    res=branch(x, y, g, node$set, m) ## Information on the optimal split
    if(node$score-res$score<alpha||length(node$set)<n.min||length(res$left)==0||length(res$right)==0){
      vertex[[k]]=list(parent=node$parent, j=0, set=node$set)
    } 
    else{
      vertex[[k]]=list(parent=node$parent,set=node$set, th=x[res$i,res$j],j=res$j)
      stack[[r]]=list(parent=k, set=res$right, score=res$right.score) ## PUSH
      stack[[r+1]]=list(parent=k, set=res$left, score=res$left.score) ## PUSH
    }
  }
  ## Each terminal node receives a center value; each internal node receives the IDs of its left and right children.
  mode=function(y) names(sort(table(y),decreasing=TRUE))[1] ## The most frequent value
  r=length(vertex)
  for(h in 1:r){vertex[[h]]$left=0; vertex[[h]]$right=0}
  for(h in r:2){
    pa=vertex[[h]]$parent
    if(vertex[[pa]]$right==0)vertex[[pa]]$right=h 
    else vertex[[pa]]$left=h
  }
  if(f=="sq.loss") g=mean else g=mode
  for(h in 1:r)if(vertex[[h]]$j==0)vertex[[h]]$center=g(y[vertex[[h]]$set])
  return(vertex)
}
library(MASS); library(igraph)
x=as.matrix(Boston[,1:13])
y=as.vector(Boston[,14])
vertex=dt(x,y,n.min=50)
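
Each element of vertex describes one node: an internal node carries the split variable j and threshold th (and, after post-processing, the child IDs left and right), while a terminal node has j=0 and a fitted value center. For instance (our own check):

vertex[[1]]$j; vertex[[1]]$th ## variable and threshold of the root split
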
## Graph Output 
r=length(vertex)
col=array(dim=r)
edge.list=matrix(nrow=r,ncol=2)
for(h in 1:r)col[h]=vertex[[h]]$j
for(h in 1:r)edge.list[h,]=c(vertex[[h]]$parent,h)
edge.list=edge.list[-1,]
g=graph_from_edgelist(edge.list)
V(g)$color=col
plot(g, layout = layout.reingold.tilford(g, root=1))

## Table Output
VAR=NULL; TH=NULL
for(h in 1:r)if(vertex[[h]]$j!=0){
  VAR=c(VAR,vertex[[h]]$j)  ## split variable of each internal node
  TH=c(TH,vertex[[h]]$th)   ## its threshold, stored by dt()
}
cbind(VAR,TH)
## (18 rows: the split variable VAR and threshold TH of each internal node)
VAR
##  [1]  6 13  8  6 13 10  6  7  8 13 10  1  5 13  7  5 13  6
value=function(u,vertex){ ## Follow the splits from the root down to a terminal node
  r=1
  while(vertex[[r]]$j!=0){
    if(u[vertex[[r]]$j]<vertex[[r]]$th)r=vertex[[r]]$left else r=vertex[[r]]$right
  }
  return(vertex[[r]]$center)
}
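
For example (our own check), the fitted value of the tree grown above for the first Boston sample:

value(x[1,], vertex) ## descends the splits and returns the mean of the leaf
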
df=Boston[1:100,]
n=nrow(df)
p=ncol(df)
x=as.matrix(df[,1:13])
y=as.vector(df[,14])
alpha.seq=seq(0,1.5,0.1)
s=floor(n/10)
out=NULL
for(alpha in alpha.seq){
  SS=0
  for(h in 1:10){ ## 10-fold CV
    test=(h*s-s+1):(h*s)
    train=setdiff(1:n,test)
    vertex=dt(x[train,],y[train],alpha=alpha)
    for(t in test)SS=SS+(y[t]-value(x[t,],vertex))^2
  }
  out=c(out,SS/100) ## average over the n=100 held-out predictions
}
plot(alpha.seq,out,type="l",ylim=c(10.5,12), xlab="alpha",
     ylab="Squared Error", main="Optimal alpha via CV (Boston dataset, N=100)")

8.2 Decision Trees for Classification

mode=function(y)names(sort(table(y),decreasing=TRUE))[1] ## Most Frequent value
## Error rate
mis.match=function(y){y.hat=mode(y); return(sum(y!=y.hat))}
## Gini
gini=function(y){
  n=length(y)
  if(n==0)return(0)
  z=as.vector(table(y))
  m=length(z)
  T=0; for(j in 1:m)T=T+z[j]*(n-z[j])/n
  return(T/n) ## Gini index: sum_j p_j*(1-p_j)
}
## Entropy
entropy=function(y){
  n=length(y)
  if(n==0)return(0)
  z=as.vector(table(y))
  m=length(z)
  T=0; for(j in 1:m)if(z[j]!=0)T=T+z[j]*log(n/z[j]) ## n times sum_j p_j*log(1/p_j)
  return(T)
}
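
As a quick numerical check (our own example), the three criteria applied to a small label vector with class counts (2, 3):

y.toy=c("a","a","b","b","b")
mis.match(y.toy) ## 2: the two "a"s disagree with the majority label "b"
gini(y.toy)      ## (2*3/5 + 3*2/5)/5 = 0.48
entropy(y.toy)   ## 2*log(5/2) + 3*log(5/3) = 3.365
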
df=iris
x=as.matrix(df[,1:4])
y=as.matrix(df[,5])
vertex=dt(x,y,"mis.match",n.min=4)
m=length(vertex)
u=NULL
v=NULL
for(h in 1:m)if(vertex[[h]]$j==0){
  w=y[vertex[[h]]$set]; u=c(u,rep(mode(w),length(w))); v=c(v,w)
}
table(u,v)
##             v
## u            setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         0
##   virginica       0          2        50
col=array(dim=m); edge.list=matrix(nrow=m,ncol=2)
for(h in 1:m)col[h]=vertex[[h]]$j
for(h in 1:m)edge.list[h,]=c(vertex[[h]]$parent,h) 
edge.list=edge.list[-1,]
library(igraph)
g=graph_from_edgelist(edge.list) 
V(g)$color=col
V(g)$name=""
plot(g, pin=c(4,4), layout = layout.reingold.tilford(g, root=1)); title("Error Rate")

## For the Gini index and entropy, we proceed in the same way, replacing mis.match with gini or entropy (see the sketch below).
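
For example (our own sketch), only the criterion string changes:

vertex.gini=dt(x, y, "gini", n.min=4)    ## Gini index as the impurity
vertex.ent=dt(x, y, "entropy", n.min=4)  ## entropy as the impurity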

df=iris; n=nrow(df)
p=ncol(df)
index=sample(1:n,n,replace=FALSE)
x=as.matrix(df[index,1:4])
y=as.matrix(df[index,5])
n.min.seq=1:10
s=15
out=NULL
for(n.min in n.min.seq){
  SS=0
  for(h in 10:1){
    test=(s*h-s+1):(s*h) ## the h-th block of s=15 samples
    train=setdiff(1:n,test)
    vertex=dt(x[train,],y[train],"mis.match",n.min=n.min)
    for(i in test)SS=SS+as.integer(value(x[i,],vertex)!=y[i])
  }
  print(SS)
}
## [1] 12
## [1] 13
## [1] 13
## [1] 13
## [1] 13
## [1] 13
## [1] 13
## [1] 13
## [1] 15
## [1] 19

8.3 Bagging

par(mfrow=c(2,4))
n=200; p=5; x=matrix(rnorm(n*p),nrow=n,ncol=p)
beta=rnorm(p); y=abs(round(x%*%beta+rnorm(n)))+1 ## Data Generation
for(b in 1:8){
  index=sample(1:n,n,replace=TRUE)
  x.b=x[index,] ## Resample from the original data, not from the previous resample
  y.b=y[index]
  vertex=dt(x.b,y.b,"mis.match",n.min=6)
  r=length(vertex)
  col=array(dim=r)
  edge.list=matrix(nrow=r,ncol=2)
  for(h in 1:r)col[h]=vertex[[h]]$j
  for(h in 1:r)edge.list[h,]=c(vertex[[h]]$parent,h)
  edge.list=edge.list[-1,]
  g=graph_from_edgelist(edge.list)
  V(g)$color=col
  V(g)$name=""
  plot(g, layout = layout.reingold.tilford(g, root=1))
}

par(mfrow=c(1,1))
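
The figure only displays how the eight bootstrap trees differ; to classify, bagging aggregates them by majority vote. A minimal sketch (ours), reusing x, y, dt, value, and mode from above; the names pred and y.hat are our own:

B=10; pred=matrix(nrow=B,ncol=n)
for(b in 1:B){ ## one tree per bootstrap resample
  index=sample(1:n,n,replace=TRUE)
  vertex=dt(x[index,],y[index],"mis.match",n.min=6)
  for(i in 1:n)pred[b,i]=value(x[i,],vertex)
}
y.hat=apply(pred,2,mode)  ## majority vote across the B trees
mean(y.hat==as.vector(y)) ## training accuracy of the bagged classifier
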

8.4 Random Forest

branch=function(x,y,f,S, m=ncol(x)){ ## Setting of m (the default value is p)
  n=length(S)
  p=ncol(x)
  if(n==0)return(NULL)
  best.score=Inf
  if(m<p) T=sample(1:p, m, replace=FALSE) 
  else T=1:p ## The main difference
  for(j in T)for(i in S){ ## Choose optimum variables in T
    left=NULL; right=NULL
    for(k in S)if(x[k,j]<x[i,j])left=c(left,k) else right=c(right,k)
    L=f(y[left]); R=f(y[right]); score=L+R
    if(score<best.score){
      best.score=score;
      info=list(i=i, j=j, left=left, right=right, score=best.score,
      left.score=L, right.score=R)
    }
  }
  return(info)
}
rf=function(z){ ## For each b, count the test samples (out of 50) classified correctly by majority vote over the first b trees
  zz=array(dim=c(B,50))
  zzz=NULL
  for(b in 1:B){
    for(i in 1:50)zz[b,i]=(mode(z[1:b,i])==y[i+100])
    zzz=c(zzz,sum(zz[b,]))
  }
  return(zzz)
}
set.seed(101)
df=iris
n=nrow(df)
p=ncol(df) 
index=sample(1:n,n,replace=FALSE)
x=as.matrix(df[index,1:4])
y=as.vector(df[index,5])
train=1:100; test=101:150
B=100
z=array(dim=c(B,50))
m=4
for(b in 1:B){
  index=sample(train, 100, replace=TRUE)
  vertex=dt(x[index,], y[index], "mis.match", n.min=2, m=m)
  for(i in test)z[b,i-100]=value(x[i,],vertex)
}
z4=z
## We repeat the same procedure with m=3, m=2, and m=1, storing the results in z3, z2, and z1.
m=3
for(b in 1:B){
  index=sample(train, 100, replace=TRUE)
  vertex=dt(x[index,], y[index], "mis.match", n.min=2, m=m)
  for(i in test)z[b,i-100]=value(x[i,],vertex)
}
z3=z
m=2
for(b in 1:B){
  index=sample(train, 100, replace=TRUE)
  vertex=dt(x[index,], y[index], "mis.match", n.min=2, m=m)
  for(i in test)z[b,i-100]=value(x[i,],vertex)
}
z2=z
m=1
for(b in 1:B){
  index=sample(train, 100, replace=TRUE)
  vertex=dt(x[index,], y[index], "mis.match", n.min=2, m=m)
  for(i in test)z[b,i-100]=value(x[i,],vertex)
}
z1=z
## The small +-0.1 offsets keep overlapping curves distinguishable
plot(1:B,rf(z4)-0.1,type="l",ylim=c(0,50),col=2,xlab="Number of generated trees",
  ylab="Correct answers out of 50 test samples", main="Random Forest")
lines(1:B,rf(z3),col=3)
lines(1:B,rf(z2)+0.1,col=4) 
lines(1:B,rf(z1)-0.1,col=5)
legend("bottomright",legend=c("m=4","m=3","m=2","m=1"),col=c(2,3,4,5),lty=1)
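
The last entry of each curve gives the final performance; for instance (our own check):

rf(z4)[B]/50 ## fraction of the 50 test samples correct using all B trees, m=4
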

8.5 Boosting

b.dt=function(x,y,d,f="sq.loss"){
  n=nrow(x)
  if(f=="sq.loss")g=sq.loss 
  else if(f=="mis.match") g=mis.match 
  else if(f=="gini")g=gini 
  else g=entropy
  vertex=list()
  vertex[[1]]=list(parent=0, set=1:n, score=g(y), j=0)
  while(length(vertex)<=2*d-1){ ## Repeat until the tree has d internal nodes
    r=length(vertex); gain.max=-Inf
    for(h in 1:r)if(vertex[[h]]$j==0){ ## Among the current leaves,
      res=branch(x,y,g,vertex[[h]]$set) ## evaluate the best split of each
      gain=vertex[[h]]$score-res$score
      if(gain>gain.max){gain.max=gain; h.max=h; res.max=res} ## and keep the most beneficial one
    }
    vertex[[h.max]]$th=x[res.max$i,res.max$j]; vertex[[h.max]]$j=res.max$j
    vertex[[r+1]]=list(parent=h.max, set=res.max$left,
    score=res.max$left.score, j=0)
    vertex[[r+2]]=list(parent=h.max, set=res.max$right,
    score=res.max$right.score, j=0)
  }
  r=2*d+1 ## The finished tree has 2d+1 nodes: d internal and d+1 terminal
  for(h in 1:r){vertex[[h]]$left=0; vertex[[h]]$right=0}
  for(h in r:2){
    pa=vertex[[h]]$parent
    if(vertex[[pa]]$right==0)vertex[[pa]]$right=h else vertex[[pa]]$left=h
    if(vertex[[h]]$right==0&vertex[[h]]$left==0)vertex[[h]]$j=0
  }
  mode=function(y) names(sort(table(y),decreasing=TRUE))[1]
  if(f=="sq.loss") g=mean else g=mode
  for(h in 1:r)if(vertex[[h]]$j==0)vertex[[h]]$center=g(y[vertex[[h]]$set])
  return(vertex)
}
library(MASS)
df=Boston; x=as.matrix(df[,1:13])
y=as.matrix(df[,14])
train=1:200
test=201:300
B=200
lambda=0.1; d=1
trees=list(); r=y[train]
for(b in 1:B){
  trees[[b]]=b.dt(x[train,],r,d)
  for(i in train)r[i]=r[i]-lambda*value(x[i,],trees[[b]])
}
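
Each round fits a small tree (d splits) to the current residuals, and the update r[i]=r[i]-lambda*value(x[i,],trees[[b]]) shrinks its contribution, so the final prediction is lambda times the sum of the B tree outputs. The test-set code below accumulates this sum incrementally; as a standalone sketch (ours; predict.boost is a hypothetical helper, not from the text):

predict.boost=function(u,trees,lambda){ ## shrunken sum over all fitted trees
  s=0
  for(b in 1:length(trees))s=s+lambda*value(u,trees[[b]])
  return(s)
}
predict.boost(x[201,],trees,lambda) ## equals z[B,201] computed below
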
z=array(0,dim=c(B,600)) ## columns indexed by sample ID; only the test columns 201:300 are used
for(i in test)z[1,i]=lambda*value(x[i,],trees[[1]])
for(b in 2:B)for(i in test)z[b,i]=z[b-1,i]+lambda*value(x[i,],trees[[b]])
out=NULL
for(b in 1:B)out=c(out,sum((y[test]-z[b,test])^2)/length(test))
out1=out

# The same procedure is run for d=2 and d=3
d=2;trees=list(); r=y[train]
for(b in 1:B){
  trees[[b]]=b.dt(x[train,],r,d)
  for(i in train)r[i]=r[i]-lambda*value(x[i,],trees[[b]])
}
z=array(0,dim=c(B,600))
for(i in test)z[1,i]=lambda*value(x[i,],trees[[1]])
for(b in 2:B)for(i in test)z[b,i]=z[b-1,i]+lambda*value(x[i,],trees[[b]])
out=NULL; 
for(b in 1:B)out=c(out,sum((y[test]-z[b,test])^2)/length(test))
out2=out

# d=3
d=3;trees=list(); r=y[train]
for(b in 1:B){
  trees[[b]]=b.dt(x[train,],r,d)
  for(i in train)r[i]=r[i]-lambda*value(x[i,],trees[[b]])
}
z=array(0,dim=c(B,600))
for(i in test)z[1,i]=lambda*value(x[i,],trees[[1]])
for(b in 2:B)for(i in test)z[b,i]=z[b-1,i]+lambda*value(x[i,],trees[[b]])
out=NULL; 
for(b in 1:B)out=c(out,sum((y[test]-z[b,test])^2)/length(test))
out3=out
plot(21:100, xlab="Number of generated trees", ylab="Squared loss on the test data",
     type="n", xlim=c(20,100), ylim=c(0,35), main="Boosting")
lines(21:100, out1[21:100],col="red")
lines(21:100, out2[21:100],col="blue")
lines(21:100, out3[21:100],col="green")
legend("topright",legend=c("d=1","d=2","d=3"),col=c("red","blue","green"),lty=1)