# 第1章のプログラムは、事前に下記が実行されていることを仮定する。
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("seaborn-ticks")

1.1 行列の正定値性¶

n = 3
B = np.random.normal(size=n**2).reshape(3, 3)
A = np.dot(B.T, B)
values, vectors = np.linalg.eig(A)
print("values:\n", values, "\n\nvectors:\n", vectors, "\n")

values:
 [5.66888609 0.47730883 0.81278299] 

vectors:
 [[-0.20918868  0.78107155 -0.58835986]
 [ 0.96046159  0.05107156 -0.2736882 ]
 [ 0.18372161  0.62234952  0.76087282]]

S = []
for i in range(10):
    z = np.random.normal(size = n)
    y = np.squeeze(z.T.dot(A.dot(z)))
    S.append(y)
    if (i+1) % 5 == 0:
        print("S[%d:%d]:"%((i-4),i), S[i-4:i])

S[0:4]: [0.6011394976389837, 3.0782724565351343, 22.538486464027656, 13.78817857366506]
S[5:9]: [9.026645257123285, 3.9696972174185294, 2.057700793662757, 3.3906306502223753]

1.2 カーネル¶

n = 250
x = 2 * np.random.normal(size = n)
y = np.sin(2 * np.pi * x) + np.random.normal(size = n) / 4

def D(t):
    return np.maximum(0.75 * (1 - t**2), 0)

def k(x, y, lam):
    return D(np.abs((x - y) / lam))

def f(z, lam):
    S = 0; T = 0
    for i in range(n):
        S = S + k(x[i], z, lam) * y[i]
        T = T + k(x[i], z, lam)
    return S / T

plt.figure(num=1, figsize=(15, 8),dpi=80)
plt.xlim(-3, 3); plt.ylim(-2, 3)
plt.xticks(fontsize = 14); plt.yticks(fontsize = 14)

plt.scatter(x, y, facecolors='none', edgecolors = "k", marker = "o")

xx = np.arange(-3, 3, 0.1)
yy = [[] for _ in range(3)]
lam = [0.05, 0.35, 0.50]
color = ["g", "b", "r"]
for i in range(3):
    for zz in xx:
        yy[i].append(f(zz, lam[i]))
    plt.plot(xx, yy[i], c = color[i], label = lam[i])

plt.legend(loc = "upper left", frameon = True, prop={'size':14})
plt.title("Nadaraya-Watson Estimator", fontsize = 20)

<ipython-input-4-93d7004f2e14>:16: RuntimeWarning: invalid value encountered in double_scalars
  return S / T

Text(0.5, 1.0, 'Nadaraya-Watson Estimator')

1.3 正定値カーネル¶

def K(x, y, sigma2):
    return np.exp(-np.linalg.norm(x - y)**2/2/sigma2)

def F(z, sigma2):        # 関数定義 f
    S=0; T=0
    for i in range(n):
        S = S + K(x[i], z, sigma2) * y[i]
        T = T + K(x[i], z, sigma2)
    return S / T

n = 100
x = 2 * np.random.normal(size = n)
y = np.sin(2 * np.pi * x) + np.random.normal(size = n) / 4  # データ生成
#  sigma2 = 0.01, 0.001 の曲線の図示
plt.figure(num=1, figsize=(15, 8),dpi=80)
plt.scatter(x, y, facecolors='none', edgecolors = "k", marker = "o")
plt.xlim(-3, 3)
plt.ylim(-2, 3)
plt.xticks(fontsize = 14); plt.yticks(fontsize = 14)

xx = np.arange(-3, 3, 0.1)
yy = [[] for _ in range(2)]
sigma2 = [0.001, 0.01]
color = ["g", "b"]
for i in range(2):
    for zz in xx:
        yy[i].append(F(zz, sigma2[i]))
    plt.plot(xx, yy[i], c = color[i], label = sigma2[i])
plt.legend(loc = "upper left", frameon = True, prop={'size':20})
plt.title("Nadaraya-Watson Estimator", fontsize = 20)

Text(0.5, 1.0, 'Nadaraya-Watson Estimator')

m = int(n / 10)
sigma2_seq = np.arange(0.001, 0.01, 0.001)
SS_min = np.inf
for sigma2 in sigma2_seq:
    SS = 0
    for k in range(10):
        test = range(k*m, (k+1)*m)
        train = [x for x in range(n) if x not in test]
        for j in test:
            u, v = 0, 0
            for i in train:
                kk = K(x[i], x[j], sigma2)
                u = u + kk * y[i]
                v = v + kk
            if v != 0:
                z = u/v
                SS = SS + (y[j] - z)**2
    if SS < SS_min:
        SS_min = SS
        sigma2_best = sigma2
print("Best sigma2 = ", sigma2_best)

Best sigma2 =  0.003

SS

15.941362514277072

plt.figure(num = 1, figsize=(15, 8),dpi = 80)
plt.scatter(x, y, facecolors = 'none', edgecolors = "k", marker = "o")
plt.xlim(-3, 3)
plt.ylim(-2, 3)
plt.xticks(fontsize = 14); plt.yticks(fontsize = 14)

xx = np.arange(-3, 3, 0.1)
yy = [[] for _ in range(3)]
sigma2 = [0.001, 0.01, sigma2_best]
labels = [0.001, 0.01, "sigma2_best"]
color = ["g", "b", "r"]

for i in range(3):
    for zz in xx:
        yy[i].append(F(zz, sigma2[i]))
    plt.plot(xx, yy[i], c = color[i], label = labels[i])
plt.legend(loc = "upper left", frameon = True, prop={'size': 20})
plt.title("Nadaraya-Watson Estimator", fontsize = 20)

Text(0.5, 1.0, 'Nadaraya-Watson Estimator')

1.6 文字列、木、グラフのカーネル¶

def string_kernel(x, y):
    m, n = len(x), len(y)
    S = 0
    for i in range(m):
        for j in range(i, m):
            for k in range(n):
                if x[(i-1):j] == y[(k-1):(k+j-i)]:
                    S = S + 1
    return S

C = ["a", "b", "c"]
m = 10
w = np.random.choice(C, m, replace = True)
x = ""
for i in range(m):
    x = x + w[i]
n = 12
w = np.random.choice(C, n, replace = True)
y = ""
for i in range(n):
    y = y + w[i]

x

'ababbcaaac'

y

'ccbcbcaaacaa'

string_kernel(x,y)

58

def C(i, j):
    S, T = s[i], t[j]
    if S[0] != T[0]:
        return 0
    if S[1] is None:
        return 0
    if T[1] is None:
        return 0
    if len(S[1]) != len(T[1]):
        return 0
    U = []
    for x in S[1]:
        U.append(s[x][0])
        U1 = sorted(U)
    V = []
    for y in T[1]:
        V.append(t[y][0])
        V1 = sorted(V)
    m = len(U)
    for h in range(m):
        if U1[h] != V1[h]:
            return 0
    U2 = np.array(S[1])[np.argsort(U)]
    V2 = np.array(T[1])[np.argsort(V)]
    W = 1
    for h in range(m):
        W = W * (1 + C(U2[h], V2[h]))
        return W

def k(s, t):
    m, n = len(s), len(t)
    kernel = 0
    for i in range(m):
        for j in range(n):
            if C(i, j) > 0:
                kernel = kernel + C(i, j)
    return kernel

s = [[] for _ in range(6)]
s[0] = ["G", [1, 3]]; s[1] = ["T", [2]];  s[2] = ["C", None]
s[3] = ["A", [4, 5]]; s[4] = ["C", None]; s[5] = ["T", None]

t = [[] for _ in range(9)]
t[0] = ["G", [1, 4]];  t[1] = ["A", [2, 3]];  t[2] = ["C", None]
t[3] = ["T", None];    t[4] = ["T", [5, 6]];  t[5] = ["C", None]
t[6] = ["A", [7, 8]];  t[7] = ["C", None];    t[8] = ["T", None]

for i in range(6):
    for j in range(9):
        if C(i, j) > 0:
            print(i, j, C(i, j))

0 0 2
3 1 1
3 6 1

k(s, t)

4

def k(s, p):
    return prob(s, p)/ len(node)

def prob(s, p):
    if len(node[s[0]]) == 0:
        return 0
    if len(s) == 1:
        return p
    m = len(s)
    S = (1 - p) / len(node[s[0]]) * prob(s[1:m], p)
    return S

node = [[] for _ in range(5)]
node[0] = [2, 4]; node[1] = [4]; node[2] = [1, 5]
node[3] = [1, 5]; node[4] = [3]
k([0, 3, 2, 4, 2], 1 / 3)

0.0016460905349794243

def k(x, y, lam):
    return D(np.abs((x - y) / lam))

n = 250
x = 2 * np.random.normal(size = n)
y = np.sin(2 * np.pi * x) + np.random.normal(size = n) / 4


plt.figure(num=1, figsize=(15, 8),dpi=80)
plt.xlim(-3, 3); plt.ylim(-2, 3)
plt.xticks(fontsize = 14); plt.yticks(fontsize = 14)

plt.scatter(x, y, facecolors='none', edgecolors = "k", marker = "o")

xx = np.arange(-3, 3, 0.1)
yy = [[] for _ in range(3)]
lam = [0.05, 0.35, 0.50]
color = ["g", "b", "r"]
for i in range(3):
    for zz in xx:
        yy[i].append(f(zz, lam[i]))
    plt.plot(xx, yy[i], c = color[i], label = lam[i])

plt.legend(loc = "upper left", frameon = True, prop={'size':14})
plt.title("Nadaraya-Watson Estimator", fontsize = 20)

<ipython-input-17-de07a120e21c>:18: RuntimeWarning: invalid value encountered in double_scalars
  return S / T

Text(0.5, 1.0, 'Nadaraya-Watson Estimator')

def K(x, y, sigma2):
    return np.exp(-np.linalg.norm(x - y)**2/2/sigma2)

n = 100
x = 2 * np.random.normal(size = n)
y = np.sin(2 * np.pi * x) + np.random.normal(size = n) / 4

m = int(n / 10)
sigma2_seq = np.arange(0.001, 0.01, 0.001)
SS_min = np.inf
for sigma2 in sigma2_seq:
    SS = 0
    for k in range(10):
        test = range(k*m,(k+1)*m)
        train = [x for x in range(n) if x not in test]
        for j in test:
            u, v = 0, 0
            for i in train:
                kk = K(x[i], x[j], sigma2)
                u = u + kk * y[i]
                v = v + kk
            if not(v==0):
                z=u/v
                SS = SS + (y[j] - z)**2
    if SS < SS_min:
        SS_min = SS
        sigma2_best = sigma2
print("Best sigma2 = ", sigma2_best)

def string_kernel(x, y):
    m, n = len(x), len(y)
    S = 0
    for i in range(m):
        for j in range(i, m):
            for k in range(n):
                if x[(i-1):j] == y[(k-1):(k+j-i)]:
                    S = S + 1
    return S

def k(s, p):
    return prob(s, p)/ len(node)

def prob(s, p):
    if len(node[s[0]]) == 0:
        return 0
    if len(s) == 1:
        return p
    m = len(s)
    S = (1 - p) / len(node[s[0]]) * prob(s[1:m], p)
    return S