import numpy as np
import matplotlib.pyplot as plt
import scipy
from numpy.random import randn # 正規乱数
%matplotlib inline
# import japanize_matplotlib
# Anacondaの場合は下記
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
def sq_loss(y):
if len(y) == 0:
return (0)
else:
y_bar = np.mean(y)
return (np.linalg.norm(y-y_bar)**2)
def branch(x, y, S, rf=0):
if rf == 0:
m = x.shape[1]
if x.shape[0] == 0:
return ([0, 0, 0, 0, 0, 0, 0])
best_score = np.inf
for j in range(x.shape[1]):
for i in S:
left = []
right = []
for k in S:
if x[k, j] < x[i, j]:
left.append(k)
else:
right.append(k)
left_score = f(y[left])
right_score = f(y[right])
score = left_score + right_score
if score < best_score:
best_score = score
i_1 = i
j_1 = j
left_1 = left
right_1 = right
left_score_1 = left_score
right_score_1 = right_score
return [i_1, j_1, left_1, right_1, best_score, left_score_1, right_score_1]
f = sq_loss
n = 100
p = 5
x = randn(n, p)
y = randn(n)
S = np.random.choice(n, 10, replace=False)
branch(x, y, S)
class Stack:
def __init__(self, parent, set, score):
self.parent = parent
self.set = set
self.score = score
class Node:
def __init__(self, parent, j, th, set):
self.parent = parent
self.j = j
self.th = th
self.set = set
def dt(x, y, alpha=0, n_min=1, rf=0):
if rf == 0:
m = x.shape[1]
# 1個からなるstackを構成。決定木を初期化
stack = [Stack(0, list(range(x.shape[0])), f(y))] # 関数 fは、大域
node = []
k = -1
# stackの最後の要素を取り出して、決定木を更新する。
while len(stack) > 0:
popped = stack.pop()
k = k+1
i, j, left, right, score, left_score, right_score = branch(x, y, popped.set, rf)
if popped.score-score < alpha or len(popped.set) < n_min or len(left) == 0 or len(right) == 0:
node.append(Node(popped.parent, -1, 0, popped.set))
else:
node.append(Node(popped.parent, j, x[i, j], popped.set))
stack.append(Stack(k, right, right_score))
stack.append(Stack(k, left, left_score))
# これより下でnode.left, node.rightの値を設定する
for h in range(k, -1, -1):
node[h].left = 0
node[h].right = 0
for h in range(k, 0, -1):
pa = node[h].parent
if node[pa].right == 0:
node[pa].right = h
else:
node[pa].left = h
# これより下で、node.centerの値を計算する
if f == sq_loss:
g = np.mean
else:
g = mode_max
for h in range(k+1):
if node[h].j == -1:
node[h].center = g(y[node[h].set])
else:
node[h].center = 0
return (node)
df = np.loadtxt("boston.txt", delimiter="\t")
X = np.array(df[:, [0, 2, 4, 5, 6, 7, 9, 10, 11, 12]])
p = X.shape[1]
y = np.array(df[:, 13])
f = sq_loss
node = dt(X, y, n_min=50)
len(node)
!pip uninstall igraph -y
!pip uninstall python-igraph -y
!pip install python-igraph
!pip install cairocffi
!pip install pycairo
from igraph import *
def draw_graph(node):
r = len(node)
col = []
for h in range(r):
col.append(node[h].j)
colorlist = ["#ffffff", "#fff8ff", "#fcf9ce", "#d6fada", "#d7ffff",
"#d9f2f8", "#fac8be", "#ffebff", "#ffffe0", "#fdf5e6",
"#fac8be", "#f8ecd5", "#ee82ee"]
color = [colorlist[col[i]] for i in range(r)]
edge = []
for h in range(1, r):
edge.append([node[h].parent, h])
g = Graph(edges=edge, directed=True)
layout = g.layout_reingold_tilford(root=[0])
out = plot(g, vertex_size=15, layout=layout, bbox=(300, 300),
vertex_label=list(range(r)), vertex_color=color)
return (out)
draw_graph(node)
def value(u, node):
r = 0
while node[r].j != -1:
if u[node[r].j] < node[r].th:
r = node[r].left
else:
r = node[r].right
return (node[r].center)
n = 100
df = np.loadtxt("boston.txt", delimiter="\t")
X = np.array(df[0:n, [0, 2, 4, 5, 6, 7, 9, 10, 11, 12]])
p = X.shape[1]
y = np.array(df[0:n, 13])
f = sq_loss
alpha_seq = np.arange(0, 1.5, 0.1)
s = np.int(n/10)
out = []
for alpha in alpha_seq:
SS = 0
for h in range(10):
test = list(range(h*s, h*s+s))
train = list(set(range(n)) - set(test))
node = dt(X[train, :], y[train], alpha=alpha)
for t in test:
SS = SS + (y[t] - value(X[t, :], node))**2
print(SS / n)
out.append(SS / n)
plt.plot(alpha_seq, out)
plt.xlabel("alpha")
plt.ylabel("2乗誤差")
plt.title("CVで最適なalpha (N=100)")
df = np.loadtxt("boston.txt", delimiter="\t")
X = np.array(df[0:n, [0, 2, 4, 5, 6, 7, 9, 10, 11, 12]])
p = X.shape[1]
y = np.array(df[0:n, 13])
n_min_seq = np.arange(1, 13, 1)
s = np.int(n / 10)
out = []
for n_min in n_min_seq:
SS = 0
for h in range(10):
test = list(range(h*s, h*s+s))
train = list(set(range(n)) - set(test))
node = dt(X[train, :], y[train], n_min=n_min)
for t in test:
SS = SS + (y[t] - value(X[t, :], node))**2
print(SS / n)
out.append(SS / n)
plt.plot(n_min_seq, out)
plt.xlabel("n_min")
plt.ylabel("2乗誤差")
plt.title("CVで最適なn_min (N=100)")
def branch(x, y, S, rf=0): #
if rf == 0: #
T = np.arange(x.shape[1]) #
else: #
T = np.random.choice(x.shape[1], rf, replace=False) #
if x.shape[0] == 0:
return ([0, 0, 0, 0, 0, 0, 0])
best_score = np.inf
for j in T: #
for i in S:
left = []
right = []
for k in S:
if x[k, j] < x[i, j]:
left.append(k)
else:
right.append(k)
left_score = f(y[left])
right_score = f(y[right])
score = left_score + right_score
if score < best_score:
best_score = score
i_1 = i
j_1 = j
left_1 = left
right_1 = right
left_score_1 = left_score
right_score_1 = right_score
return [i_1, j_1, left_1, right_1, best_score, left_score_1, right_score_1]
def rf(z):
z = np.array(z, dtype=np.int64)
zz = []
for b in range(B):
u = sum([mode_max(z[range(b+1), i]) == y[i+100] for i in range(50)])
zz.append(u)
return (zz)
def freq(y):
y = list(y)
return ([y.count(i) for i in set(y)])
def mode(y):
n = len(y)
if n == 0:
return (0)
return (max(freq(y)))
def mis_match(y):
return (len(y)-mode(y))
def mode_max(y):
if len(y) == 0:
return (-np.inf)
count = np.bincount(y)
return (np.argmax(count))
from sklearn.datasets import load_iris
iris = load_iris()
iris.target_names
f = mis_match
n = iris.data.shape[0]
order = np.random.choice(n, n, replace=False) # 並び替える
X = iris.data[order, :]
y = iris.target[order]
train = list(range(100))
test = list(range(100, 150))
B = 100
plt.ylim([35, 55])
m_seq = [1, 2, 3, 4]
c_seq = ["r", "b", "g", "y"]
label_seq = ["m=1", "m=2", "m=3", "m=4"]
plt.xlabel("繰り返し数 b")
plt.ylabel("テスト50データでの正答数")
plt.title("ランダムフォレスト")
for m in m_seq:
z = np.zeros((B, 50))
for b in range(B):
index = np.random.choice(train, 100, replace=True)
node = dt(X[index, :], y[index], n_min=2, rf=m)
for i in test:
z[b, i-100] = value(X[i, ], node)
plt.plot(list(range(B)), np.array(rf(z))-0.2*(m-2),
label=label_seq[m-1], linewidth=0.8, c=c_seq[m-1])
plt.legend(loc="lower right")
plt.axhline(y=50, c="b", linewidth=0.5, linestyle="dashed")
import lightgbm as lgb
df = np.loadtxt("boston.txt", delimiter="\t")
X = np.array(df[:, 0:12])
p = X.shape[1]
y = np.array(df[:, 13])
train = list(range(200))
test = list(range(200, 300))
B = 200
lgb_train = lgb.Dataset(X[train, :], y[train])
lgb_eval = lgb.Dataset(X[test, :], y[test], reference=lgb_train)
B = 5000
nn_seq = list(range(1, 10, 1)) + list(range(10, 91, 10)) + list(range(100, B, 50))
out_set = []
for d in range(1, 4):
lgbm_params = {
"objective": "regression",
"metric": "rmse",
"num_leaves": d+1,
"learning_rate": 0.001
}
out = []
for nn in nn_seq:
model = lgb.train(lgbm_params, lgb_train,
valid_sets=lgb_eval, num_boost_round=nn)
z = model.predict(X[test, :], num_iteration=model.best_iteration)
out.append(sum((z-y[test])**2) / 100)
out_set.append(out)
# グラフで表示
plt.ylim([0, 80])
c_seq = ["r", "b", "g"]
label_seq = ["d=1", "d=2", "d=3"]
plt.xlabel("生成した木の個数")
plt.ylabel("テストデータでの二乗誤差")
plt.title("lightgbm パッケージ (lambda=0.001)")
for d in range(1, 4):
plt.plot(nn_seq, out_set[d-1], label=label_seq[d-1],
linewidth=0.8, c=c_seq[d-1])
plt.legend(loc="upper right")