Chapter 9: Unsupervised Learning

88

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from numpy.random import randn  # standard normal random numbers
%matplotlib inline
# import japanize_matplotlib  # for Google Colab
In [ ]:
# For Anaconda, the following registers a Japanese-capable font
# (only needed when plot labels are written in Japanese)
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
In [ ]:
def k_means(X, K, iteration=20):
    n, p = X.shape
    center = np.zeros((K, p))
    y = np.random.choice(K, n, replace=True)  # random initial assignment
    scores = []
    for h in range(iteration):
        # Update each cluster center; an empty cluster is pushed to infinity
        # so that no sample gets assigned to it
        for k in range(K):
            if np.sum(y == k) == 0:
                center[k, 0] = np.inf
            else:
                for j in range(p):
                    center[k, j] = np.mean(X[y == k, j])
        # Reassign each sample to its nearest center
        S_total = 0
        for i in range(n):
            S_min = np.inf
            for k in range(K):
                S = np.sum((X[i, ] - center[k, ])**2)
                if S < S_min:
                    S_min = S
                    y[i] = k
            S_total += S_min
        scores.append(S_total)  # within-cluster sum of squares at this iteration
    return {"clusters": y, "scores": scores}
In [ ]:
n = 1000
K = 5
p = 2
X = randn(n, p)  # generate data
y = k_means(X, K)["clusters"]  # cluster label for each sample
# Plot the points, colored by cluster
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel("Component 1")
plt.ylabel("Component 2")
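The "scores" list returned by k_means records the within-cluster sum of squares after each iteration. A minimal sketch (reusing the X and K above) that plots it to confirm the score decreases monotonically:

In [ ]:
res = k_means(X, K)
plt.plot(np.arange(1, len(res["scores"]) + 1), res["scores"])
plt.xlabel("Iteration")
plt.ylabel("Within-cluster sum of squares")  # should decrease at every step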

89

In [ ]:
def dist_complete(x, y):
    # complete linkage: the largest pairwise distance between the two clusters
    r = x.shape[0]
    s = y.shape[0]
    dist_max = 0
    for i in range(r):
        for j in range(s):
            d = np.linalg.norm(x[i, ] - y[j, ])
            if d > dist_max:
                dist_max = d
    return dist_max
In [ ]:
def dist_single(x, y):
    # single linkage: the smallest pairwise distance between the two clusters
    r = x.shape[0]
    s = y.shape[0]
    dist_min = np.inf
    for i in range(r):
        for j in range(s):
            d = np.linalg.norm(x[i, ] - y[j, ])
            if d < dist_min:
                dist_min = d
    return dist_min
In [ ]:
def dist_centroid(x, y):
    # centroid linkage: the distance between the two cluster means
    r = x.shape[0]
    s = y.shape[0]
    x_bar = 0
    for i in range(r):
        x_bar = x_bar + x[i, ]
    x_bar = x_bar / r
    y_bar = 0
    for i in range(s):
        y_bar = y_bar + y[i, ]
    y_bar = y_bar / s
    return (np.linalg.norm(x_bar-y_bar))
In [ ]:
def dist_average(x, y):
    # average linkage: the mean of all pairwise distances
    r = x.shape[0]
    s = y.shape[0]
    S = 0
    for i in range(r):
        for j in range(s):
            S = S + np.linalg.norm(x[i, ]-y[j, ])
    return (S / r / s)
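Since single, average, and complete linkage are the minimum, mean, and maximum of the same set of pairwise distances, single linkage can never exceed average linkage, which can never exceed complete linkage. A quick sanity check on random data (a sketch, not from the text):

In [ ]:
x = randn(10, 2)
y = randn(15, 2)
dist_single(x, y) <= dist_average(x, y) <= dist_complete(x, y)  # True: min <= mean <= max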
In [ ]:
import copy
In [ ]:
def hc(X, dd="complete"):
    # bottom-up (agglomerative) hierarchical clustering with the chosen linkage dd
    n = X.shape[0]
    index = [[i] for i in range(n)]
    cluster = [[] for i in range(n-1)]
    for k in range(n, 1, -1):
        dist_min = np.inf
        for i in range(k-1):
            for j in range(i+1, k):
                i_0 = index[i]
                j_0 = index[j]
                if dd == "complete":
                    d = dist_complete(X[i_0, ], X[j_0, ])
                elif dd == "single":
                    d = dist_single(X[i_0, ], X[j_0, ])
                elif dd == "centroid":
                    d = dist_centroid(X[i_0, ], X[j_0, ])
                elif dd == "average":
                    d = dist_average(X[i_0, ], X[j_0, ])
                if d < dist_min:
                    dist_min = d
                    i_1 = i    # index of the cluster that absorbs the merge
                    j_1 = j    # index of the cluster being merged in
        index[i_1].extend(index[j_1])  # merge the two clusters
        if j_1 < k:                    # close the gap left by the merged cluster
            for h in range(j_1+1, k, 1):
                index[h-1] = index[h]
        index2 = copy.deepcopy(index[0:(k-1)])
        # deepcopy, since index itself is overwritten on every iteration
        cluster[k-2].extend(index2)
    return cluster
# Reading the result from the bottom up, one merge occurs at each step
In [ ]:
n = 200
p = 2
X = randn(n, p)
cluster = hc(X, "complete")
K = [2, 4, 6, 8]  # corresponds to 3, 5, 7, 9 clusters (cluster[m] holds m+1 clusters)
for i in range(4):
    grp = cluster[K[i]]  # extract the clustering with K[i]+1 clusters from the full result
    plt.subplot(2, 2, i+1)
    for k in range(len(grp)):
        x = X[grp[k], 0]
        y = X[grp[k], 1]
        plt.scatter(x, y, s=5)
    plt.text(2, 2, "K={}".format(K[i]+1), fontsize=12)
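For comparison (a sketch, not the text's implementation): scipy.cluster.hierarchy implements the same linkage criteria, and its dendrogram function visualizes the merge order that hc computes by hand:

In [ ]:
from scipy.cluster.hierarchy import linkage, dendrogram
Z = linkage(X, method="complete")  # same criterion as hc(X, "complete")
plt.figure(figsize=(7, 4))
dendrogram(Z)
plt.show()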

95

In [ ]:
def pca(X):
    n, p = X.shape
    center = np.average(X, 0)
    X = X - center  # center each column
    Sigma = X.T@X/n
    lam, phi = np.linalg.eig(Sigma)  # eigenvalues, eigenvectors
    index = np.argsort(-lam)  # sort in descending order
    lam = lam[index]
    phi = phi[:, index]
    return {"lam": lam, "vectors": phi, "centers": center}
In [ ]:
X = randn(100, 5)
res = pca(X)
print(res["lam"])
print(res["vectors"])
res["centers"]
In [ ]:
from sklearn.decomposition import PCA
In [ ]:
pca = PCA()  # note: this rebinds the name pca, shadowing the function defined above
pca.fit(X)  # fit
score = pca.fit_transform(X)  # principal component scores (rows: samples, columns: components)
print(score[0:5, ])
pca.mean_    # same as "centers" above
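Note that sklearn's explained_variance_ divides by n-1 where the hand-rolled pca divides by n, so the eigenvalues agree up to that factor. A sketch (assuming res from the pca call above is still in scope):

In [ ]:
n0 = X.shape[0]
np.allclose(res["lam"] * n0 / (n0 - 1), pca.explained_variance_)  # True up to the n/(n-1) factor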

96

In [ ]:
n = 100
a = 0.7
b = np.sqrt(1-a**2)
u = randn(n)
v = randn(n)
x = u
y = u*a+v*b
plt.scatter(x, y)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
D = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), 1)
pca.fit(D)
T = pca.components_
T[0, 1] / T[0, 0] * T[1, 1] / T[1, 0]   # equals -1: the principal component vectors (loadings) are orthogonal
In [ ]:
def f_1(x):
    # line through the origin in the direction of the first principal component
    y = T[0, 1] / T[0, 0] * x
    return y
In [ ]:
def f_2(x):
    # line through the origin in the direction of the second principal component
    y = T[1, 1] / T[1, 0] * x
    return y
In [ ]:
x_seq = np.arange(-4, 4, 0.5)
plt.scatter(x, y, c="black")
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.plot(x_seq, f_1(x_seq))
plt.plot(x_seq, f_2(x_seq))
plt.gca().set_aspect("equal", adjustable="box")
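The orthogonality can also be checked directly (a sketch): the rows of pca.components_ form an orthonormal basis, so T @ T.T should be the identity:

In [ ]:
np.allclose(T @ T.T, np.eye(2))  # True: the loading vectors are orthonormal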

98

In [ ]:
import pandas as pd
In [ ]:
USA = pd.read_csv("USArrests.csv", header=0, index_col=0)
X = (USA - np.average(USA, 0)) / np.std(USA, 0)  # standardize each column
index = USA.index
col = USA.columns
pca = PCA(n_components=2)
pca.fit(X)
score = pca.fit_transform(X)
vector = pca.components_
print(vector)
print(vector.shape[1])  # number of original variables
evr = pca.explained_variance_ratio_
evr
In [ ]:
plt.figure(figsize=(7, 7))
for i in range(score.shape[0]):
    plt.scatter(score[i, 0], score[i, 1], s=5)
    plt.annotate(index[i], xy=(score[i, 0], score[i, 1]))
for j in range(vector.shape[1]):
    plt.arrow(0, 0, vector[0, j]*2, vector[1, j]*2, color="red")  # the factor 2 sets the arrow length; arbitrary
    plt.text(vector[0, j]*2, vector[1, j]*2, col[j], color="red")

99

In [ ]:
# Redefine the pca function: the name pca was rebound to a PCA instance above
def pca(X):
    n, p = X.shape
    center = np.average(X, 0)
    X = X - center                  # center each column
    Sigma = X.T@X/n
    lam, phi = np.linalg.eig(Sigma)  # eigenvalues, eigenvectors
    index = np.argsort(-lam)        # sort in descending order
    lam = lam[index]
    phi = phi[:, index]
    return {"lam": lam, "vectors": phi, "centers": center}
In [ ]:
df = np.loadtxt("boston.txt", delimiter="\t")
res = pca(df)
evr = (res["lam"]/np.sum(res["lam"]))[0:5]  # 各主成分の寄与率
evr
In [ ]:
plt.plot(np.arange(1, 6), evr)
plt.scatter(np.arange(1, 6), evr)
plt.xticks(np.arange(1, 6))
plt.ylim(0, 1)
plt.xlabel("Principal component")
plt.ylabel("Proportion of variance explained")
plt.show()  # render this figure before starting the next one

plt.plot(np.arange(1, 6), np.cumsum(evr))
plt.scatter(np.arange(1, 6), np.cumsum(evr))
plt.xticks(np.arange(1, 6))
plt.ylim(0, 1)
plt.xlabel("Principal component")
plt.ylabel("Cumulative proportion of variance explained")

100

In [ ]:
def pca_regression(X, y, m):
    pca = PCA(n_components=m)
    pca.fit(X)
    Z = pca.fit_transform(X)  # scores (rows: samples, columns: components)
    phi = pca.components_     # loadings (rows: components, columns: variables)
    theta = np.linalg.inv(Z.T@Z)@Z.T@y  # regress y on the component scores
    beta = phi.T@theta                  # coefficients on the original variables
    return {"theta": theta, "beta": beta}
In [ ]:
n = 100
p = 5
X = randn(n, p)
X = X - np.average(X, 0)  # center the columns
y = X[:, 0] + X[:, 1] + X[:, 2] + X[:, 3] + X[:, 4] + randn(n)
y = y - np.mean(y)
print(pca_regression(X, y, 3))
print(pca_regression(X, y, 5)["beta"])  # with m = p, identical to the least-squares solution below
np.linalg.inv(X.T@X) @ X.T @ y
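Because beta = phi.T @ theta and Z = X @ phi.T for centered X, the fitted values X @ beta and Z @ theta coincide for any m. A final sketch (reusing the X and y above):

In [ ]:
res = pca_regression(X, y, 3)
Z = PCA(n_components=3).fit_transform(X)
np.allclose(X @ res["beta"], Z @ res["theta"])  # True: identical fitted values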