import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from numpy.random import randn  # standard normal random numbers
%matplotlib inline
# import japanize_matplotlib  # for Google Colab
# for Anaconda, use the following
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
def k_means(X, K, iteration=20):
    n, p = X.shape
    center = np.zeros((K, p))
    y = np.random.choice(K, n, replace=True)  # random initial assignment
    scores = []
    for h in range(iteration):
        # update each cluster center to the mean of its members
        for k in range(K):
            if np.sum(y == k) == 0:
                center[k, :] = np.inf  # empty cluster: never selected below
            else:
                for j in range(p):
                    center[k, j] = np.mean(X[y == k, j])
        # reassign each sample to its nearest center
        S_total = 0
        for i in range(n):
            S_min = np.inf
            for k in range(K):
                S = np.sum((X[i, :] - center[k, :])**2)
                if S < S_min:
                    S_min = S
                    y[i] = k
            S_total += S_min
        scores.append(S_total)
    return {"clusters": y, "scores": scores}
n = 1000
K = 5
p = 2
X = randn(n, p)  # generate data
y = k_means(X, K)["clusters"]  # cluster label for each sample
# plot the points, colored by cluster
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel("First component")
plt.ylabel("Second component")
def dist_complete(x, y):
    # complete linkage: maximum distance over all pairs
    r = x.shape[0]
    s = y.shape[0]
    dist_max = 0
    for i in range(r):
        for j in range(s):
            d = np.linalg.norm(x[i, :] - y[j, :])
            if d > dist_max:
                dist_max = d
    return dist_max
def dist_single(x, y):
    # single linkage: minimum distance over all pairs
    r = x.shape[0]
    s = y.shape[0]
    dist_min = np.inf
    for i in range(r):
        for j in range(s):
            d = np.linalg.norm(x[i, :] - y[j, :])
            if d < dist_min:
                dist_min = d
    return dist_min
def dist_centroid(x, y):
    # centroid linkage: distance between the cluster means
    r = x.shape[0]
    s = y.shape[0]
    x_bar = 0
    for i in range(r):
        x_bar = x_bar + x[i, :]
    x_bar = x_bar / r
    y_bar = 0
    for i in range(s):
        y_bar = y_bar + y[i, :]
    y_bar = y_bar / s
    return np.linalg.norm(x_bar - y_bar)
def dist_average(x, y):
    # average linkage: mean distance over all pairs
    r = x.shape[0]
    s = y.shape[0]
    S = 0
    for i in range(r):
        for j in range(s):
            S = S + np.linalg.norm(x[i, :] - y[j, :])
    return S / r / s
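# These four linkages can be cross-checked against SciPy's pairwise distance
# matrix (a sketch; cdist lives in scipy.spatial.distance): complete, single,
# and average linkage are the max, min, and mean of the pairwise distances,
# and centroid linkage is the distance between the column means. x_chk and
# y_chk are throwaway names for this check.
from scipy.spatial.distance import cdist
x_chk = randn(10, 2)
y_chk = randn(8, 2)
D_chk = cdist(x_chk, y_chk)
print(np.isclose(dist_complete(x_chk, y_chk), D_chk.max()))
print(np.isclose(dist_single(x_chk, y_chk), D_chk.min()))
print(np.isclose(dist_average(x_chk, y_chk), D_chk.mean()))
print(np.isclose(dist_centroid(x_chk, y_chk),
                 np.linalg.norm(x_chk.mean(axis=0) - y_chk.mean(axis=0))))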
import copy
def hc(X, dd="complete"):
    n = X.shape[0]
    index = [[i] for i in range(n)]
    cluster = [[] for i in range(n - 1)]
    for k in range(n, 1, -1):
        # find the closest pair among the current k clusters
        dist_min = np.inf
        for i in range(k - 1):
            for j in range(i + 1, k):
                i_0 = index[i]
                j_0 = index[j]
                if dd == "complete":
                    d = dist_complete(X[i_0, :], X[j_0, :])
                elif dd == "single":
                    d = dist_single(X[i_0, :], X[j_0, :])
                elif dd == "centroid":
                    d = dist_centroid(X[i_0, :], X[j_0, :])
                elif dd == "average":
                    d = dist_average(X[i_0, :], X[j_0, :])
                if d < dist_min:
                    dist_min = d
                    i_1 = i  # index of the list merged into
                    j_1 = j  # index of the list newly merged
        index[i_1].extend(index[j_1])  # merge
        if j_1 < k:  # shift the entries after the merged one forward by one
            for h in range(j_1 + 1, k, 1):
                index[h - 1] = index[h]
        index2 = copy.deepcopy(index[0:(k - 1)])
        # deep copy, because index itself is mutated on later iterations
        cluster[k - 2].extend(index2)
    return cluster
# reading the result from the bottom up, one merge occurs at each step
n = 200
p = 2
X = randn(n, p)
cluster = hc(X, "complete")
K = [2, 4, 6, 8]  # cluster counts 3, 5, 7, 9
for i in range(4):
    grp = cluster[K[i]]  # clustering with K[i]+1 clusters, from the full result
    plt.subplot(2, 2, i + 1)
    for k in range(len(grp)):
        x = X[grp[k], 0]
        y = X[grp[k], 1]
        plt.scatter(x, y, s=5)
    plt.text(2, 2, "K={}".format(K[i] + 1), fontsize=12)
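# Cross-check with SciPy's agglomerative clustering (a sketch; linkage and
# fcluster are in scipy.cluster.hierarchy). For complete linkage the flat
# clustering at K clusters should coincide with ours up to a relabeling, so
# we compare the sorted cluster sizes; ties in merge order could still make
# them differ occasionally.
from scipy.cluster.hierarchy import linkage, fcluster
Z = linkage(X, method="complete")
for K_i in [3, 5, 7, 9]:
    labels = fcluster(Z, t=K_i, criterion="maxclust")
    sizes_scipy = sorted(np.bincount(labels)[1:], reverse=True)
    sizes_ours = sorted([len(g) for g in cluster[K_i - 1]], reverse=True)
    print(K_i, list(sizes_scipy) == sizes_ours)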
def pca(X):
    n, p = X.shape
    center = np.average(X, 0)
    X = X - center  # center each column
    Sigma = X.T @ X / n
    lam, phi = np.linalg.eig(Sigma)  # eigenvalues, eigenvectors
    index = np.argsort(-lam)  # sort in descending order
    lam = lam[index]
    phi = phi[:, index]
    return {"lam": lam, "vectors": phi, "centers": center}
X = randn(100, 5)
res = pca(X)
res["lam"]
res["vectors"]
res["centers"]
from sklearn.decomposition import PCA
pca = PCA()  # note: this binding shadows the pca() function defined above
pca.fit(X)  # run
score = pca.fit_transform(X)  # principal component scores (rows: samples, columns: components)
score[0:5, :]
pca.mean_  # same as centers above
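# Note on scaling (a quick check): sklearn estimates the variance with
# denominator n-1, while pca() above divides by n, so the eigenvalues agree
# only after rescaling by (n-1)/n. Here n = 100.
np.allclose(pca.explained_variance_ * 99 / 100, res["lam"])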
n = 100
a = 0.7
b = np.sqrt(1-a**2)
u = randn(n)
v = randn(n)
x = u
y = u*a+v*b
plt.scatter(x, y)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
D = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), 1)
pca.fit(D)
T = pca.components_
T[0, 1] / T[0, 0] * T[1, 1] / T[1, 0]  # product of the two slopes is -1: the principal component vectors (loadings) are orthogonal
def f_1(x):
    y = T[0, 1] / T[0, 0] * x
    return y
def f_2(x):
    y = T[1, 1] / T[1, 0] * x
    return y
x_seq = np.arange(-4, 4, 0.5)
plt.scatter(x, y, c="black")
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.plot(x_seq, f_1(x_seq))
plt.plot(x_seq, f_2(x_seq))
plt.gca().set_aspect("equal", adjustable="box")
import pandas as pd
USA = pd.read_csv("USArrests.csv", header=0, index_col=0)
X = (USA-np.average(USA, 0))/np.std(USA, 0)
index = USA.index
col = USA.columns
pca = PCA(n_components=2)
pca.fit(X)
score = pca.fit_transform(X)
vector = pca.components_
vector
vector.shape[1]
evr = pca.explained_variance_ratio_
evr
plt.figure(figsize=(7, 7))
for i in range(score.shape[0]):
    plt.scatter(score[i, 0], score[i, 1], s=5)
    plt.annotate(index[i], xy=(score[i, 0], score[i, 1]))
for j in range(vector.shape[1]):
    plt.arrow(0, 0, vector[0, j] * 2, vector[1, j] * 2, color="red")  # the factor 2 sets the arrow length; any value works
    plt.text(vector[0, j] * 2, vector[1, j] * 2, col[j], color="red")
# re-define pca(), since the name was shadowed by the sklearn PCA instances above
def pca(X):
    n, p = X.shape
    center = np.average(X, 0)
    X = X - center  # center each column
    Sigma = X.T @ X / n
    lam, phi = np.linalg.eig(Sigma)  # eigenvalues, eigenvectors
    index = np.argsort(-lam)  # sort in descending order
    lam = lam[index]
    phi = phi[:, index]
    return {"lam": lam, "vectors": phi, "centers": center}
df = np.loadtxt("boston.txt", delimiter="\t")
res = pca(df)
evr = (res["lam"]/np.sum(res["lam"]))[0:5] # 各主成分の寄与率
evr
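# The ratios do not depend on the n vs. n-1 denominator (it cancels), so
# they should match sklearn's explained_variance_ratio_ (a quick check,
# re-using the PCA class imported earlier; pca_sk is a name used only here).
pca_sk = PCA().fit(df)
np.allclose(pca_sk.explained_variance_ratio_[0:5], evr)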
plt.plot(np.arange(1, 6), evr)
plt.scatter(np.arange(1, 6), evr)
plt.xticks(np.arange(1, 6))
plt.ylim(0, 1)
plt.xlabel("主成分")
plt.ylabel("寄与率")
plt.plot(np.arange(1, 6), np.cumsum(evr))
plt.scatter(np.arange(1, 6), np.cumsum(evr))
plt.xticks(np.arange(1, 6))
plt.ylim(0, 1)
plt.xlabel("主成分")
plt.ylabel("累積寄与率")
def pca_regression(X, y, m):
    pca = PCA(n_components=m)
    pca.fit(X)
    Z = pca.fit_transform(X)  # scores (rows: samples, columns: components)
    phi = pca.components_  # rows: components, columns: variables
    theta = np.linalg.inv(Z.T@Z)@Z.T@y  # regress y on the scores
    beta = phi.T@theta  # map back to coefficients on the original variables
    return {"theta": theta, "beta": beta}
n = 100
p = 5
X = randn(n, p)
X = X - np.average(X, 0)
y = X[:, 0] + X[:, 1] + X[:, 2] + X[:, 3] + X[:, 4] + randn(n)
y = y - np.mean(y)
pca_regression(X, y, 3)
pca_regression(X, y, 5)["beta"]
np.linalg.inv(X.T@X) @ X.T @ y  # ordinary least squares, for comparison
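# With m = p = 5, PCR retains all components, so its beta should coincide
# with the least-squares solution above (a quick check):
np.allclose(pca_regression(X, y, 5)["beta"], np.linalg.inv(X.T@X) @ X.T @ y)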