In [ ]:
import copy
from numpy.random import randn # Normal distribution random numbers
import pandas as pd
from sklearn.decomposition import PCA
from scipy import stats
import scipy
import matplotlib.pyplot as plt
import numpy as np
In [ ]:
def k_means(X, K, iteration=20):
    n, p = X.shape
    center = np.zeros((K, p))
    y = np.random.choice(K, n, replace=True)  # Random initial assignment
    scores = []
    for h in range(iteration):
        # Update each cluster center (an empty cluster is sent to infinity)
        for k in range(K):
            if np.sum(y == k) == 0:
                center[k, 0] = np.inf
            else:
                for j in range(p):
                    center[k, j] = np.mean(X[y == k, j])
        # Reassign each point to its nearest center
        S_total = 0
        for i in range(n):
            S_min = np.inf
            for k in range(K):
                S = np.sum((X[i, :] - center[k, :])**2)
                if S < S_min:
                    S_min = S
                    y[i] = k
            S_total += S_min
        scores.append(S_total)
    return {"clusters": y, "scores": scores}
In [ ]:
n = 1000
K = 5
p = 2
X = randn(n, p) # Data Generation
y = k_means(X, 5)["clusters"] # Obtain the clusters for each X
# Plot the points, changing the color for each cluster
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel("First")
plt.ylabel("Second")
Out[ ]:
Text(0, 0.5, 'Second')
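A hedged way to choose K (an addition, not part of the original text) is the elbow heuristic: run k_means over a range of K and look for a bend in the final score. On pure Gaussian noise like this X no clear elbow is expected, but the sketch shows the mechanics:
In [ ]:
finals = [k_means(X, k)["scores"][-1] for k in range(1, 10)]
plt.plot(range(1, 10), finals)
plt.xlabel("K")
plt.ylabel("Final within-cluster sum of squares")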
In [ ]:
def dist_complete(x, y):
    # Complete linkage: the largest pairwise distance between the clusters
    r = x.shape[0]
    s = y.shape[0]
    dist_max = 0
    for i in range(r):
        for j in range(s):
            d = np.linalg.norm(x[i, :] - y[j, :])
            if d > dist_max:
                dist_max = d
    return dist_max
In [ ]:
def dist_single(x, y):
    # Single linkage: the smallest pairwise distance between the clusters
    r = x.shape[0]
    s = y.shape[0]
    dist_min = np.inf
    for i in range(r):
        for j in range(s):
            d = np.linalg.norm(x[i, :] - y[j, :])
            if d < dist_min:
                dist_min = d
    return dist_min
In [ ]:
def dist_centroid(x, y):
    # Centroid linkage: the distance between the cluster means
    r = x.shape[0]
    s = y.shape[0]
    x_bar = 0
    for i in range(r):
        x_bar = x_bar + x[i, :]
    x_bar = x_bar / r
    y_bar = 0
    for i in range(s):
        y_bar = y_bar + y[i, :]
    y_bar = y_bar / s
    return np.linalg.norm(x_bar - y_bar)
In [ ]:
def dist_average(x, y):
    # Average linkage: the mean of all pairwise distances between the clusters
    r = x.shape[0]
    s = y.shape[0]
    S = 0
    for i in range(r):
        for j in range(s):
            S = S + np.linalg.norm(x[i, :] - y[j, :])
    return S / (r * s)
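As a quick sanity check (illustrative, not from the original), the four distances can be compared on two small point clouds. Single linkage is always the smallest of the three pairwise-based distances and complete linkage the largest, and the centroid distance never exceeds the average distance:
In [ ]:
a = randn(5, 2)
b = randn(6, 2) + 3  # A second, shifted cloud
for f in [dist_single, dist_centroid, dist_average, dist_complete]:
    print(f.__name__, f(a, b))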
In [ ]:
def hc(X, dd="complete"):
    n = X.shape[0]
    index = [[i] for i in range(n)]  # Each point starts as its own cluster
    cluster = [[] for i in range(n-1)]
    for k in range(n, 1, -1):
        # Find the pair of clusters with the smallest inter-cluster distance
        dist_min = np.inf
        for i in range(k-1):
            for j in range(i+1, k):
                i_0 = index[i]
                j_0 = index[j]
                if dd == "complete":
                    d = dist_complete(X[i_0, :], X[j_0, :])
                elif dd == "single":
                    d = dist_single(X[i_0, :], X[j_0, :])
                elif dd == "centroid":
                    d = dist_centroid(X[i_0, :], X[j_0, :])
                elif dd == "average":
                    d = dist_average(X[i_0, :], X[j_0, :])
                if d < dist_min:
                    dist_min = d
                    i_1 = i  # Index of the cluster absorbing the other
                    j_1 = j  # Index of the cluster being merged in
        index[i_1].extend(index[j_1])  # Merge the pair
        if j_1 < k:  # Shift the entries after the merged cluster forward by one
            for h in range(j_1+1, k, 1):
                index[h-1] = index[h]
        index2 = copy.deepcopy(index[0:(k-1)])  # Deep copy to avoid later overwriting
        cluster[k-2].extend(index2)
    return cluster  # Reading from the bottom up, clusters merge one pair at a time
In [ ]:
n = 200
p = 2
X = randn(n, p)
cluster = hc(X, "complete")
K = [2, 4, 6, 8]  # cluster[K[i]] holds the partition into K[i]+1 clusters: 3, 5, 7, 9
for i in range(4):
    grp = cluster[K[i]]  # Extract the partition with K[i]+1 clusters from the full result
    plt.subplot(2, 2, i+1)
    for k in range(len(grp)):
        x = X[grp[k], 0]
        y = X[grp[k], 1]
        plt.scatter(x, y, s=5)
    plt.text(2, 2, "K={}".format(K[i]+1), fontsize=12)
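For larger data, SciPy's implementation of the same linkages is much faster. A cross-check (this cell is an addition, and assumes only the standard scipy.cluster.hierarchy API):
In [ ]:
from scipy.cluster.hierarchy import linkage, fcluster
Z = linkage(X, method="complete")                # Same complete linkage on the same X
labels = fcluster(Z, t=5, criterion="maxclust")  # Cut the tree into 5 clusters
plt.scatter(X[:, 0], X[:, 1], c=labels, s=5)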
In [ ]:
def pca(X):
    n, p = X.shape
    center = np.average(X, 0)
    X = X - center  # Center each column
    Sigma = X.T @ X / n
    lam, phi = np.linalg.eig(Sigma)  # Eigenvalues and eigenvectors
    index = np.argsort(-lam)  # Sort in descending order of eigenvalue
    lam = lam[index]
    phi = phi[:, index]
    return {'lam': lam, 'vectors': phi, 'centers': center}
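Because Sigma is symmetric, its eigenvectors can be taken orthonormal. A small added sanity check (illustrative data, not from the original):
In [ ]:
res = pca(randn(50, 3))
# The loading matrix should satisfy phi.T @ phi = I (up to rounding)
print(np.allclose(res['vectors'].T @ res['vectors'], np.eye(3)))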
In [ ]:
X = randn(100, 5)
res = pca(X)
res['lam']
res['vectors']
res['centers']
Out[ ]:
array([-0.09954019, 0.06386149, -0.0486636 , 0.06827612, 0.10803061])
In [ ]:
pca = PCA()  # Note: this rebinds the name pca from the function above to a PCA instance
pca.fit(X) # Execute
score = pca.fit_transform(X) # Principal component scores (rows: n, columns: principal components)
score[0:5,]
pca.mean_ # Same as centers above
Out[ ]:
array([-0.09954019, 0.06386149, -0.0486636 , 0.06827612, 0.10803061])
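The loadings can be compared too, with one caveat (this check is an addition): each eigenvector is only determined up to sign, so compare absolute values. Note also that the hand-rolled pca divides by n while sklearn divides by n-1, which rescales the eigenvalues but leaves the eigenvectors unchanged.
In [ ]:
# Rows of components_ vs columns of res['vectors'], up to sign
print(np.allclose(np.abs(pca.components_), np.abs(res['vectors'].T)))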
In [ ]:
n = 100
a = 0.7
b = np.sqrt(1-a**2)
u = randn(n)
v = randn(n)
x = u
y = u*a+v*b
plt.scatter(x, y)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
D = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), 1)
pca.fit(D)
T = pca.components_
T[0, 1]/T[0, 0]*T[1, 1]/T[1, 0] # The two loading vectors are orthogonal, so the product of their slopes is -1
Out[ ]:
-1.0
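The same orthogonality can be verified directly (an added one-liner): the dot product of the two loading vectors is numerically zero.
In [ ]:
print(T[0, :] @ T[1, :])  # Approximately 0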
In [ ]:
def f_1(x):
    # Line through the origin along the first principal component
    y = T[0, 1] / T[0, 0] * x
    return y
In [ ]:
def f_2(x):
    # Line through the origin along the second principal component
    y = T[1, 1] / T[1, 0] * x
    return y
In [ ]:
x_seq = np.arange(-4, 4, 0.5)
plt.scatter(x, y, c="black")
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.plot(x_seq, f_1(x_seq))
plt.plot(x_seq, f_2(x_seq))
plt.gca().set_aspect('equal', adjustable='box')
In [ ]:
USA = pd.read_csv('USArrests.csv', header=0, index_col=0)
X = (USA-np.average(USA, 0))/np.std(USA, 0)  # Standardize each column
index = USA.index
col = USA.columns
pca = PCA(n_components=2)
pca.fit(X)
score = pca.fit_transform(X)
vector = pca.components_
vector
vector.shape[1]
evr = pca.explained_variance_ratio_
evr
Out[ ]:
array([0.62006039, 0.24744129])
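Together the first two components account for roughly 87% of the total variance (0.620 + 0.247), which is what justifies projecting the 50 states onto the two-dimensional biplot below; a one-line check (added):
In [ ]:
print(np.sum(evr))  # Cumulative contribution rate of the first two components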
In [ ]:
plt.figure(figsize=(7, 7))
for i in range(score.shape[0]):
    plt.scatter(score[i, 0], score[i, 1], s=5)
    plt.annotate(index[i], xy=(score[i, 0], score[i, 1]))
for j in range(vector.shape[1]):
    plt.arrow(0, 0, vector[0, j]*2, vector[1, j]*2, color="red")  # The factor 2 only scales the arrows for visibility
    plt.text(vector[0, j]*2, vector[1, j]*2, col[j], color="red")
In [ ]:
# Redefine pca: the name was rebound to a PCA instance in the cells above
def pca(X):
    n, p = X.shape
    center = np.average(X, 0)
    X = X - center  # Center each column
    Sigma = X.T @ X / n
    lam, phi = np.linalg.eig(Sigma)  # Eigenvalues and eigenvectors
    index = np.argsort(-lam)  # Sort in descending order of eigenvalue
    lam = lam[index]
    phi = phi[:, index]
    return {'lam': lam, 'vectors': phi, 'centers': center}
In [ ]:
df = np.loadtxt("boston.txt", delimiter="\t")
res = pca(df)
evr = (res['lam']/np.sum(res['lam']))[0:5] # Contribution rate of each principal component
evr
Out[ ]:
array([0.80457219, 0.16270557, 0.02140833, 0.00695759, 0.00200539])
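As a cross-check (an addition; pca_b is an illustrative name), sklearn reports the same contribution rates for the Boston data, since the ratio is unaffected by whether the covariance uses n or n-1:
In [ ]:
pca_b = PCA()
pca_b.fit(df)
print(pca_b.explained_variance_ratio_[0:5])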
In [ ]:
plt.plot(np.arange(1, 6), evr)
plt.scatter(np.arange(1, 6), evr)
plt.xticks(np.arange(1, 6))
plt.ylim(0, 1)
plt.xlabel("Principal Component")
plt.ylabel("Contribution Rate")
Out[ ]:
Text(0, 0.5, 'Contribution Rate')
In [ ]:
plt.plot(np.arange(1, 6), np.cumsum(evr))
plt.scatter(np.arange(1, 6), np.cumsum(evr))
plt.xticks(np.arange(1, 6))
plt.ylim(0, 1)
plt.xlabel("Principal Component")
plt.ylabel("Cumulative Contribution Rate")
Out[ ]:
Text(0, 0.5, 'Cumulative Contribution Rate')
In [ ]:
def pca_regression(X, y, m):
    pca = PCA(n_components=m)
    Z = pca.fit_transform(X)  # Scores: rows are observations, columns are components
    phi = pca.components_     # Loadings: rows are components, columns are variables
    theta = np.linalg.inv(Z.T @ Z) @ Z.T @ y  # Least squares of y on the scores
    beta = phi.T @ theta  # Map back to coefficients of the original variables
    return {'theta': theta, 'beta': beta}
In [ ]:
n = 100
p = 5
X = randn(n, p)
X = X-np.average(X, 0)
y = X[:, 0]+X[:, 1]+X[:, 2]+X[:, 3]+X[:, 4]+randn(n)
y = y-np.mean(y)
pca_regression(X, y, 3)
pca_regression(X, y, 5)['beta']
np.linalg.inv(X.T@X)@X.T@y
Out[ ]:
array([1.07565883, 0.91903011, 0.97885506, 1.17373711, 1.03563185])
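The same computation can be cross-checked with an sklearn pipeline (an illustrative addition; make_pipeline and LinearRegression are standard sklearn tools, not part of the original code). Since X and y are centered, the pipeline's coefficients on the score scale should match theta, and with m = p the recovered beta coincides with ordinary least squares, as the output above shows.
In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
pcr = make_pipeline(PCA(n_components=3), LinearRegression())
pcr.fit(X, y)
print(pcr.named_steps['linearregression'].coef_)  # Coefficients on the scores
print(pca_regression(X, y, 3)['theta'])           # Should agree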