第7章 決定木¶

70¶

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
from numpy.random import randn  # 正規乱数
%matplotlib inline
# import japanize_matplotlib
In [ ]:
# Anacondaの場合は下記
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
In [ ]:
def sq_loss(y):
    if len(y) == 0:
        return (0)
    else:
        y_bar = np.mean(y)
        return (np.linalg.norm(y-y_bar)**2)
In [ ]:
def branch(x, y, S, rf=0):
    if rf == 0:
        m = x.shape[1]
    if x.shape[0] == 0:
        return ([0, 0, 0, 0, 0, 0, 0])
    best_score = np.inf
    for j in range(x.shape[1]):
        for i in S:
            left = []
            right = []
            for k in S:
                if x[k, j] < x[i, j]:
                    left.append(k)
                else:
                    right.append(k)
            left_score = f(y[left])
            right_score = f(y[right])
            score = left_score + right_score
            if score < best_score:
                best_score = score
                i_1 = i
                j_1 = j
                left_1 = left
                right_1 = right
                left_score_1 = left_score
                right_score_1 = right_score
    return [i_1, j_1, left_1, right_1, best_score, left_score_1, right_score_1]
In [ ]:
f = sq_loss
n = 100
p = 5
x = randn(n, p)
y = randn(n)
S = np.random.choice(n, 10, replace=False)
branch(x, y, S)

71¶

In [ ]:
class Stack:
    def __init__(self, parent, set, score):
        self.parent = parent
        self.set = set
        self.score = score
In [ ]:
class Node:
    def __init__(self, parent, j, th, set):
        self.parent = parent
        self.j = j
        self.th = th
        self.set = set
In [ ]:
def dt(x, y, alpha=0, n_min=1, rf=0):
    if rf == 0:
        m = x.shape[1]
    # 1個からなるstackを構成。決定木を初期化
    stack = [Stack(0, list(range(x.shape[0])), f(y))]  # 関数 fは、大域
    node = []
    k = -1
    # stackの最後の要素を取り出して、決定木を更新する。
    while len(stack) > 0:
        popped = stack.pop()
        k = k+1
        i, j, left, right, score, left_score, right_score = branch(x, y, popped.set, rf)
        if popped.score-score < alpha or len(popped.set) < n_min or len(left) == 0 or len(right) == 0:
            node.append(Node(popped.parent, -1, 0, popped.set))
        else:
            node.append(Node(popped.parent, j, x[i, j], popped.set))
            stack.append(Stack(k, right, right_score))
            stack.append(Stack(k, left, left_score))
    # これより下でnode.left, node.rightの値を設定する
    for h in range(k, -1, -1):
        node[h].left = 0
        node[h].right = 0
    for h in range(k, 0, -1):
        pa = node[h].parent
        if node[pa].right == 0:
            node[pa].right = h
        else:
            node[pa].left = h
    # これより下で、node.centerの値を計算する
    if f == sq_loss:
        g = np.mean
    else:
        g = mode_max
    for h in range(k+1):
        if node[h].j == -1:
            node[h].center = g(y[node[h].set])
        else:
            node[h].center = 0
    return (node)
In [ ]:
df = np.loadtxt("boston.txt", delimiter="\t")
X = np.array(df[:, [0, 2, 4, 5, 6, 7, 9, 10, 11, 12]])
p = X.shape[1]
y = np.array(df[:, 13])
f = sq_loss
node = dt(X, y, n_min=50)
len(node)
In [ ]:
!pip uninstall igraph -y
!pip uninstall python-igraph -y
!pip install python-igraph
!pip install cairocffi
!pip install pycairo
In [ ]:
from igraph import *
In [ ]:
def draw_graph(node):
    r = len(node)
    col = []
    for h in range(r):
        col.append(node[h].j)
    colorlist = ["#ffffff", "#fff8ff", "#fcf9ce", "#d6fada", "#d7ffff",
                 "#d9f2f8", "#fac8be", "#ffebff", "#ffffe0", "#fdf5e6",
                 "#fac8be", "#f8ecd5", "#ee82ee"]
    color = [colorlist[col[i]] for i in range(r)]
    edge = []
    for h in range(1, r):
        edge.append([node[h].parent, h])
        g = Graph(edges=edge, directed=True)
        layout = g.layout_reingold_tilford(root=[0])
    out = plot(g, vertex_size=15, layout=layout, bbox=(300, 300),
               vertex_label=list(range(r)), vertex_color=color)
    return (out)
In [ ]:
draw_graph(node)

72¶

In [ ]:
def value(u, node):
    r = 0
    while node[r].j != -1:
        if u[node[r].j] < node[r].th:
            r = node[r].left
        else:
            r = node[r].right
    return (node[r].center)
In [ ]:
n = 100
df = np.loadtxt("boston.txt", delimiter="\t")
X = np.array(df[0:n, [0, 2, 4, 5, 6, 7, 9, 10, 11, 12]])
p = X.shape[1]
y = np.array(df[0:n, 13])
f = sq_loss
alpha_seq = np.arange(0, 1.5, 0.1)
s = np.int(n/10)
out = []
for alpha in alpha_seq:
    SS = 0
    for h in range(10):
        test = list(range(h*s, h*s+s))
        train = list(set(range(n)) - set(test))
        node = dt(X[train, :], y[train], alpha=alpha)
        for t in test:
            SS = SS + (y[t] - value(X[t, :], node))**2
    print(SS / n)
    out.append(SS / n)
plt.plot(alpha_seq, out)
plt.xlabel("alpha")
plt.ylabel("2乗誤差")
plt.title("CVで最適なalpha (N=100)")
In [ ]:
df = np.loadtxt("boston.txt", delimiter="\t")
X = np.array(df[0:n, [0, 2, 4, 5, 6, 7, 9, 10, 11, 12]])
p = X.shape[1]
y = np.array(df[0:n, 13])
n_min_seq = np.arange(1, 13, 1)
s = np.int(n / 10)
out = []
for n_min in n_min_seq:
    SS = 0
    for h in range(10):
        test = list(range(h*s, h*s+s))
        train = list(set(range(n)) - set(test))
        node = dt(X[train, :], y[train], n_min=n_min)
        for t in test:
            SS = SS + (y[t] - value(X[t, :], node))**2
    print(SS / n)
    out.append(SS / n)
plt.plot(n_min_seq, out)
plt.xlabel("n_min")
plt.ylabel("2乗誤差")
plt.title("CVで最適なn_min (N=100)")

73¶

In [ ]:
def branch(x, y, S, rf=0):                                   #
    if rf == 0:                                              #
        T = np.arange(x.shape[1])                            #
    else:                                                    #
        T = np.random.choice(x.shape[1], rf, replace=False)  #
    if x.shape[0] == 0:
        return ([0, 0, 0, 0, 0, 0, 0])
    best_score = np.inf
    for j in T:                                              #
        for i in S:
            left = []
            right = []
            for k in S:
                if x[k, j] < x[i, j]:
                    left.append(k)
                else:
                    right.append(k)
            left_score = f(y[left])
            right_score = f(y[right])
            score = left_score + right_score
            if score < best_score:
                best_score = score
                i_1 = i
                j_1 = j
                left_1 = left
                right_1 = right
                left_score_1 = left_score
                right_score_1 = right_score
    return [i_1, j_1, left_1, right_1, best_score, left_score_1, right_score_1]
In [ ]:
def rf(z):
    z = np.array(z, dtype=np.int64)
    zz = []
    for b in range(B):
        u = sum([mode_max(z[range(b+1), i]) == y[i+100] for i in range(50)])
        zz.append(u)
    return (zz)
In [ ]:
def freq(y):
    y = list(y)
    return ([y.count(i) for i in set(y)])
In [ ]:
def mode(y):
    n = len(y)
    if n == 0:
        return (0)
    return (max(freq(y)))
In [ ]:
def mis_match(y):
    return (len(y)-mode(y))
In [ ]:
def mode_max(y):
    if len(y) == 0:
        return (-np.inf)
    count = np.bincount(y)
    return (np.argmax(count))
In [ ]:
from sklearn.datasets import load_iris
In [ ]:
iris = load_iris()
iris.target_names
f = mis_match
n = iris.data.shape[0]
order = np.random.choice(n, n, replace=False)  # 並び替える
X = iris.data[order, :]
y = iris.target[order]
train = list(range(100))
test = list(range(100, 150))
B = 100
plt.ylim([35, 55])
m_seq = [1, 2, 3, 4]
c_seq = ["r", "b", "g", "y"]
label_seq = ["m=1", "m=2", "m=3", "m=4"]
plt.xlabel("繰り返し数 b")
plt.ylabel("テスト50データでの正答数")
plt.title("ランダムフォレスト")
for m in m_seq:
    z = np.zeros((B, 50))
    for b in range(B):
        index = np.random.choice(train, 100, replace=True)
        node = dt(X[index, :], y[index], n_min=2, rf=m)
        for i in test:
            z[b, i-100] = value(X[i, ], node)
    plt.plot(list(range(B)), np.array(rf(z))-0.2*(m-2),
             label=label_seq[m-1], linewidth=0.8, c=c_seq[m-1])
plt.legend(loc="lower right")
plt.axhline(y=50, c="b", linewidth=0.5, linestyle="dashed")

74¶

In [ ]:
import lightgbm as lgb
In [ ]:
df = np.loadtxt("boston.txt", delimiter="\t")
X = np.array(df[:, 0:12])
p = X.shape[1]
y = np.array(df[:, 13])
train = list(range(200))
test = list(range(200, 300))
B = 200
lgb_train = lgb.Dataset(X[train, :], y[train])
lgb_eval = lgb.Dataset(X[test, :], y[test], reference=lgb_train)

B = 5000
nn_seq = list(range(1, 10, 1)) + list(range(10, 91, 10)) + list(range(100, B, 50))
out_set = []
for d in range(1, 4):
    lgbm_params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": d+1,
        "learning_rate": 0.001
    }
    out = []
    for nn in nn_seq:
        model = lgb.train(lgbm_params, lgb_train,
                          valid_sets=lgb_eval, num_boost_round=nn)
        z = model.predict(X[test, :], num_iteration=model.best_iteration)
        out.append(sum((z-y[test])**2) / 100)
    out_set.append(out)
In [ ]:
# グラフで表示
plt.ylim([0, 80])
c_seq = ["r", "b", "g"]
label_seq = ["d=1", "d=2", "d=3"]
plt.xlabel("生成した木の個数")
plt.ylabel("テストデータでの二乗誤差")
plt.title("lightgbm パッケージ (lambda=0.001)")
for d in range(1, 4):
    plt.plot(nn_seq, out_set[d-1], label=label_seq[d-1],
             linewidth=0.8, c=c_seq[d-1])
plt.legend(loc="upper right")