In [ ]:
import copy
from numpy.random import randn # Normal distribution random numbers
import pandas as pd
from sklearn.decomposition import PCA
from scipy import stats
import scipy
import matplotlib.pyplot as plt
import numpy as np
In [ ]:
def k_means(X, K, iteration=20):
    n, p = X.shape
    center = np.zeros((K, p))
    y = np.random.choice(K, n, replace=True)  # Random initial assignment
    scores = []
    for h in range(iteration):
        # Update each cluster center (an empty cluster is sent to infinity)
        for k in range(K):
            if np.sum(y == k) == 0:
                center[k, 0] = np.inf
            else:
                for j in range(p):
                    center[k, j] = np.mean(X[y == k, j])
        # Reassign each point to its nearest center
        S_total = 0
        for i in range(n):
            S_min = np.inf
            for k in range(K):
                S = np.sum((X[i, :] - center[k, :])**2)
                if S < S_min:
                    S_min = S
                    y[i] = k
            S_total += S_min
        scores.append(S_total)
    return {"clusters": y, "scores": scores}
In [ ]:
n = 1000
K = 5
p = 2
X = randn(n, p) # Data Generation
y = k_means(X, 5)["clusters"] # Obtain the clusters for each X
# Plot the points, changing the color for each cluster
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel("First")
plt.ylabel("Second")
Out[ ]:
Text(0, 0.5, 'Second')
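A hedged way to choose K (an addition, not part of the original text) is the elbow heuristic: run k_means over a range of K and look for a bend in the final score. On pure Gaussian noise like this X no clear elbow is expected, but the sketch shows the mechanics:
In [ ]:
finals = [k_means(X, k)["scores"][-1] for k in range(1, 10)]
plt.plot(range(1, 10), finals)
plt.xlabel("K")
plt.ylabel("Final within-cluster sum of squares")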
In [ ]:
def dist_complete(x, y):
    # Complete linkage: the largest pairwise distance between the clusters
    r = x.shape[0]
    s = y.shape[0]
    dist_max = 0
    for i in range(r):
        for j in range(s):
            d = np.linalg.norm(x[i, :] - y[j, :])
            if d > dist_max:
                dist_max = d
    return dist_max
In [ ]:
def dist_single(x, y):
    # Single linkage: the smallest pairwise distance between the clusters
    r = x.shape[0]
    s = y.shape[0]
    dist_min = np.inf
    for i in range(r):
        for j in range(s):
            d = np.linalg.norm(x[i, :] - y[j, :])
            if d < dist_min:
                dist_min = d
    return dist_min
In [ ]:
def dist_centroid(x, y):
    # Centroid linkage: the distance between the cluster means
    r = x.shape[0]
    s = y.shape[0]
    x_bar = 0
    for i in range(r):
        x_bar = x_bar + x[i, :]
    x_bar = x_bar / r
    y_bar = 0
    for i in range(s):
        y_bar = y_bar + y[i, :]
    y_bar = y_bar / s
    return np.linalg.norm(x_bar - y_bar)
In [ ]:
def dist_average(x, y):
    # Average linkage: the mean of all pairwise distances between the clusters
    r = x.shape[0]
    s = y.shape[0]
    S = 0
    for i in range(r):
        for j in range(s):
            S = S + np.linalg.norm(x[i, :] - y[j, :])
    return S / (r * s)
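As a quick sanity check (illustrative, not from the original), the four distances can be compared on two small point clouds. Single linkage is always the smallest of the three pairwise-based distances and complete linkage the largest, and the centroid distance never exceeds the average distance:
In [ ]:
a = randn(5, 2)
b = randn(6, 2) + 3  # A second, shifted cloud
for f in [dist_single, dist_centroid, dist_average, dist_complete]:
    print(f.__name__, f(a, b))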
In [ ]:
def hc(X, dd="complete"):
    n = X.shape[0]
    index = [[i] for i in range(n)]  # Each point starts as its own cluster
    cluster = [[] for i in range(n-1)]
    for k in range(n, 1, -1):
        # Find the pair of clusters with the smallest inter-cluster distance
        dist_min = np.inf
        for i in range(k-1):
            for j in range(i+1, k):
                i_0 = index[i]
                j_0 = index[j]
                if dd == "complete":
                    d = dist_complete(X[i_0, :], X[j_0, :])
                elif dd == "single":
                    d = dist_single(X[i_0, :], X[j_0, :])
                elif dd == "centroid":
                    d = dist_centroid(X[i_0, :], X[j_0, :])
                elif dd == "average":
                    d = dist_average(X[i_0, :], X[j_0, :])
                if d < dist_min:
                    dist_min = d
                    i_1 = i  # Index of the cluster absorbing the other
                    j_1 = j  # Index of the cluster being merged in
        index[i_1].extend(index[j_1])  # Merge the pair
        if j_1 < k:  # Shift the entries after the merged cluster forward by one
            for h in range(j_1+1, k, 1):
                index[h-1] = index[h]
        index2 = copy.deepcopy(index[0:(k-1)])  # Deep copy to avoid later overwriting
        cluster[k-2].extend(index2)
    return cluster  # Reading from the bottom up, clusters merge one pair at a time
In [ ]:
n = 200
p = 2
X = randn(n, p)
cluster = hc(X, "complete")
K = [2, 4, 6, 8]  # cluster[K[i]] holds the partition into K[i]+1 clusters: 3, 5, 7, 9
for i in range(4):
    grp = cluster[K[i]]  # Extract the partition with K[i]+1 clusters from the full result
    plt.subplot(2, 2, i+1)
    for k in range(len(grp)):
        x = X[grp[k], 0]
        y = X[grp[k], 1]
        plt.scatter(x, y, s=5)
    plt.text(2, 2, "K={}".format(K[i]+1), fontsize=12)
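For larger data, SciPy's implementation of the same linkages is much faster. A cross-check (this cell is an addition, and assumes only the standard scipy.cluster.hierarchy API):
In [ ]:
from scipy.cluster.hierarchy import linkage, fcluster
Z = linkage(X, method="complete")                # Same complete linkage on the same X
labels = fcluster(Z, t=5, criterion="maxclust")  # Cut the tree into 5 clusters
plt.scatter(X[:, 0], X[:, 1], c=labels, s=5)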
In [ ]:
def pca(X):
    n, p = X.shape
    center = np.average(X, 0)
    X = X - center  # Center each column
    Sigma = X.T @ X / n
    lam, phi = np.linalg.eig(Sigma)  # Eigenvalues and eigenvectors
    index = np.argsort(-lam)  # Sort in descending order of eigenvalue
    lam = lam[index]
    phi = phi[:, index]
    return {'lam': lam, 'vectors': phi, 'centers': center}
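Because Sigma is symmetric, its eigenvectors can be taken orthonormal. A small added sanity check (illustrative data, not from the original):
In [ ]:
res = pca(randn(50, 3))
# The loading matrix should satisfy phi.T @ phi = I (up to rounding)
print(np.allclose(res['vectors'].T @ res['vectors'], np.eye(3)))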
In [ ]:
X = randn(100, 5)
res = pca(X)
res['lam']
res['vectors']
res['centers']
Out[ ]:
array([-0.09954019, 0.06386149, -0.0486636 , 0.06827612, 0.10803061])
In [ ]:
pca = PCA()  # Note: this rebinds the name pca from the function above to a PCA instance
pca.fit(X) # Execute
score = pca.fit_transform(X) # Principal component scores (rows: n, columns: principal components)
score[0:5,]
pca.mean_ # Same as centers above
Out[ ]:
array([-0.09954019, 0.06386149, -0.0486636 , 0.06827612, 0.10803061])
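The loadings can be compared too, with one caveat (this check is an addition): each eigenvector is only determined up to sign, so compare absolute values. Note also that the hand-rolled pca divides by n while sklearn divides by n-1, which rescales the eigenvalues but leaves the eigenvectors unchanged.
In [ ]:
# Rows of components_ vs columns of res['vectors'], up to sign
print(np.allclose(np.abs(pca.components_), np.abs(res['vectors'].T)))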
In [ ]:
n = 100
a = 0.7
b = np.sqrt(1-a**2)
u = randn(n)
v = randn(n)
x = u
y = u*a+v*b
plt.scatter(x, y)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
D = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), 1)
pca.fit(D)
T = pca.components_
T[0, 1]/T[0, 0]*T[1, 1]/T[1, 0] # The two loading vectors are orthogonal, so the product of their slopes is -1
Out[ ]:
-1.0
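The same orthogonality can be verified directly (an added one-liner): the dot product of the two loading vectors is numerically zero.
In [ ]:
print(T[0, :] @ T[1, :])  # Approximately 0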
In [ ]:
def f_1(x):
    # Line through the origin along the first principal component
    y = T[0, 1] / T[0, 0] * x
    return y
In [ ]:
def f_2(x):
    # Line through the origin along the second principal component
    y = T[1, 1] / T[1, 0] * x
    return y
In [ ]:
x_seq = np.arange(-4, 4, 0.5)
plt.scatter(x, y, c="black")
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.plot(x_seq, f_1(x_seq))
plt.plot(x_seq, f_2(x_seq))
plt.gca().set_aspect('equal', adjustable='box')
In [ ]:
USA = pd.read_csv('USArrests.csv', header=0, index_col=0)
X = (USA-np.average(USA, 0))/np.std(USA, 0)  # Standardize each column
index = USA.index
col = USA.columns
pca = PCA(n_components=2)
pca.fit(X)
score = pca.fit_transform(X)
vector = pca.components_
vector
vector.shape[1]
evr = pca.explained_variance_ratio_
evr
Out[ ]:
array([0.62006039, 0.24744129])
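Together the first two components account for roughly 87% of the total variance (0.620 + 0.247), which is what justifies projecting the 50 states onto the two-dimensional biplot below; a one-line check (added):
In [ ]:
print(np.sum(evr))  # Cumulative contribution rate of the first two components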
In [ ]:
plt.figure(figsize=(7, 7))
for i in range(score.shape[0]):
    plt.scatter(score[i, 0], score[i, 1], s=5)
    plt.annotate(index[i], xy=(score[i, 0], score[i, 1]))
for j in range(vector.shape[1]):
    plt.arrow(0, 0, vector[0, j]*2, vector[1, j]*2, color="red")  # The factor 2 only scales the arrows for visibility
    plt.text(vector[0, j]*2, vector[1, j]*2, col[j], color="red")
In [ ]:
# Redefine pca: the name was rebound to a PCA instance in the cells above
def pca(X):
    n, p = X.shape
    center = np.average(X, 0)
    X = X - center  # Center each column
    Sigma = X.T @ X / n
    lam, phi = np.linalg.eig(Sigma)  # Eigenvalues and eigenvectors
    index = np.argsort(-lam)  # Sort in descending order of eigenvalue
    lam = lam[index]
    phi = phi[:, index]
    return {'lam': lam, 'vectors': phi, 'centers': center}
In [ ]:
df = np.loadtxt("boston.txt", delimiter="\t")
res = pca(df)
evr = (res['lam']/np.sum(res['lam']))[0:5] # Contribution rate of each principal component
evr
Out[ ]:
array([0.80457219, 0.16270557, 0.02140833, 0.00695759, 0.00200539])
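As a cross-check (an addition; pca_b is an illustrative name), sklearn reports the same contribution rates for the Boston data, since the ratio is unaffected by whether the covariance uses n or n-1:
In [ ]:
pca_b = PCA()
pca_b.fit(df)
print(pca_b.explained_variance_ratio_[0:5])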
In [ ]:
plt.plot(np.arange(1, 6), evr)
plt.scatter(np.arange(1, 6), evr)
plt.xticks(np.arange(1, 6))
plt.ylim(0, 1)
plt.xlabel("Principal Component")
plt.ylabel("Contribution Rate")
Out[ ]:
Text(0, 0.5, 'Contribution Rate')
In [ ]:
plt.plot(np.arange(1, 6), np.cumsum(evr))
plt.scatter(np.arange(1, 6), np.cumsum(evr))
plt.xticks(np.arange(1, 6))
plt.ylim(0, 1)
plt.xlabel("Principal Component")
plt.ylabel("Cumulative Contribution Rate")
Out[ ]:
Text(0, 0.5, 'Cumulative Contribution Rate')
In [ ]:
def pca_regression(X, y, m):
    pca = PCA(n_components=m)
    Z = pca.fit_transform(X)  # Scores: rows are observations, columns are components
    phi = pca.components_     # Loadings: rows are components, columns are variables
    theta = np.linalg.inv(Z.T @ Z) @ Z.T @ y  # Least squares of y on the scores
    beta = phi.T @ theta  # Map back to coefficients of the original variables
    return {'theta': theta, 'beta': beta}
In [ ]:
n = 100
p = 5
X = randn(n, p)
X = X-np.average(X, 0)
y = X[:, 0]+X[:, 1]+X[:, 2]+X[:, 3]+X[:, 4]+randn(n)
y = y-np.mean(y)
pca_regression(X, y, 3)
pca_regression(X, y, 5)['beta']
np.linalg.inv(X.T@X)@X.T@y
Out[ ]:
array([1.07565883, 0.91903011, 0.97885506, 1.17373711, 1.03563185])
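The same computation can be cross-checked with an sklearn pipeline (an illustrative addition; make_pipeline and LinearRegression are standard sklearn tools, not part of the original code). Since X and y are centered, the pipeline's coefficients on the score scale should match theta, and with m = p the recovered beta coincides with ordinary least squares, as the output above shows.
In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
pcr = make_pipeline(PCA(n_components=3), LinearRegression())
pcr.fit(X, y)
print(pcr.named_steps['linearregression'].coef_)  # Coefficients on the scores
print(pca_regression(X, y, 3)['theta'])           # Should agree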