# The programs in Chapter 5 assume that the following imports and definitions have been executed first
import numpy as np
from scipy.stats import gaussian_kde
import itertools
import math
import matplotlib.pyplot as plt
from matplotlib import style
style.use("seaborn-ticks")
sigma = 1
def k(x, y):
    return np.exp(-(x - y)**2 / sigma**2)
# Data Generation
n = 100
xx = np.random.randn(n)
yy = np.random.randn(n) # When the distributions are equal
# yy = 2 * np.random.randn(n) # When the distributions are not equal
x = xx
y = yy
# Calculation of the null distribution (swap a random half of the two samples)
T = []
for h in range(100):
    index1 = np.random.choice(n, size=n // 2, replace=False)
    index2 = [i for i in range(n) if i not in index1]
    xp = list(xx[index2]) + list(yy[index1])  # permuted samples (kept separate from x, y)
    yp = list(xx[index1]) + list(yy[index2])
    S = 0
    for i in range(n):
        for j in range(n):
            if i != j:
                S = S + k(xp[i], xp[j]) + k(yp[i], yp[j]) \
                    - k(xp[i], yp[j]) - k(xp[j], yp[i])
    T.append(S / n / (n - 1))
v = np.quantile(T, 0.95)
# Calculation of the test statistic on the original samples x, y
S = 0
for i in range(n):
    for j in range(n):
        if i != j:
            S = S + k(x[i], x[j]) + k(y[i], y[j]) \
                - k(x[i], y[j]) - k(x[j], y[i])
u = S / n / (n - 1)
# Illustration of graphs
x = np.linspace(min(min(T), u, v), max(max(T), u, v), 200)
density = gaussian_kde(T)
plt.plot(x, density(x))
plt.axvline(x=u, c="r", linestyle="--")
plt.axvline(x=v, c="b")
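# A minimal finishing sketch (assumes the program is run as a standalone script):
# u is the observed MMD statistic and v the 95% permutation quantile, so
# H0: P = Q is rejected at the 5% level when u exceeds v.
print("reject H0 (P = Q):", u > v)
plt.show()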
sigma = 1
def k(x, y):
    return np.exp(-(x - y)**2 / sigma**2)
# Data Generation
n = 100
x = np.random.randn(n)
y = np.random.randn(n) # When the distributions are equal
# y = 2 * np.random.randn(n) # When the distributions are not equal
# Calculation of the null distribution
K = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        K[i, j] = k(x[i], x[j]) + k(y[i], y[j]) - k(x[i], y[j]) - k(x[j], y[i])
lam, vec = np.linalg.eigh(K)    # K is symmetric, so eigh returns real eigenvalues
lam = np.sort(lam)[::-1] / n    # descending order so that the r largest are used below
r = 20
z = []
for h in range(10000):
    z.append(np.longdouble(1 / n * np.sum(lam[0:r] *
             (np.random.chisquare(df=1, size=r) - 1))))
v = np.quantile(z, 0.95)
# Calculation of statistics
S = 0
for i in range(n - 1):
    for j in range(i + 1, n):
        S = S + k(x[i], x[j]) + k(y[i], y[j]) - k(x[i], y[j]) - k(x[j], y[i])
u = np.longdouble(S / n / (n - 1))
# Illustration of graphs
x = np.linspace(min(min(z), u, v), max(max(z), u, v), 200)
density = gaussian_kde(z)
plt.plot(x, density(x))
plt.axvline(x=v, c="r", linestyle="--")
plt.axvline(x=u, c="b")
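# Finishing sketch, as above: v is the 95% point of the simulated null (the truncated
# eigenvalue/chi-squared approximation), so H0: P = Q is rejected when u > v.
print("reject H0 (P = Q):", u > v)
plt.show()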
def HSIC_1(x, y, k_x, k_y):
    # HSIC estimate computed term by term with explicit loops
    n = len(x)
    S = 0
    for i in range(n):
        for j in range(n):
            S = S + k_x(x[i], x[j]) * k_y(y[i], y[j])
    T = 0
    for i in range(n):
        T_1 = 0
        for j in range(n):
            T_1 = T_1 + k_x(x[i], x[j])
        T_2 = 0
        for l in range(n):
            T_2 = T_2 + k_y(y[i], y[l])
        T = T + T_1 * T_2
    U = 0
    for i in range(n):
        for j in range(n):
            U = U + k_x(x[i], x[j])
    V = 0
    for i in range(n):
        for j in range(n):
            V = V + k_y(y[i], y[j])
    return S / n**2 - 2 * T / n**3 + U * V / n**4
def HSIC_1(x, y, k_x, k_y):
    # Equivalent HSIC estimate via Gram matrices (overrides the loop version above)
    n = len(x)
    K_x = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            K_x[i, j] = k_x(x[i], x[j])
    K_y = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            K_y[i, j] = k_y(y[i], y[j])
    E = np.ones((n, n))
    H = np.identity(n) - E / n
    return np.trace(K_x.dot(H).dot(K_y).dot(H)) / n**2
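# The matrix form equals the loop version above: both compute the V-statistic
# estimate tr(K_x H K_y H) / n^2, where H = I - (1/n) * 11^T is the centering matrix.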
def k_x(x, y):
    return np.exp(-np.linalg.norm(x - y)**2 / 2)
k_y = k_x
k_z = k_x
n = 100
for a in [0, 0.1, 0.2, 0.4, 0.6, 0.8]:  # a is the correlation coefficient
    x = np.random.randn(n)
    z = np.random.randn(n)
    y = a * x + np.sqrt(1 - a**2) * z
    print(HSIC_1(x, y, k_x, k_y))
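# Since the dependence between x and y strengthens as a grows (a = 0 gives
# independence), the printed HSIC values should tend to increase with a.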
def HSIC_2(x, y, z, k_x, k_y, k_z):
    # HSIC between x and the pair (y, z), using the product kernel k_y * k_z on (y, z)
    n = len(x)
    S = 0
    for i in range(n):
        for j in range(n):
            S = S + k_x(x[i], x[j]) * k_y(y[i], y[j]) * k_z(z[i], z[j])
    T = 0
    for i in range(n):
        T_1 = 0
        for j in range(n):
            T_1 = T_1 + k_x(x[i], x[j])
        T_2 = 0
        for l in range(n):
            T_2 = T_2 + k_y(y[i], y[l]) * k_z(z[i], z[l])
        T = T + T_1 * T_2
    U = 0
    for i in range(n):
        for j in range(n):
            U = U + k_x(x[i], x[j])
    V = 0
    for i in range(n):
        for j in range(n):
            V = V + k_y(y[i], y[j]) * k_z(z[i], z[j])
    return S / n**2 - 2 * T / n**3 + U * V / n**4
def cc(x, y):
    # Empirical cross moment sum(x_i * y_i) / n (the covariance for centered data)
    return np.sum(np.dot(x.T, y)) / len(x)
def f(u, v):
    # Least-squares residual of regressing u on v
    return u - cc(u, v) / cc(v, v) * v
# Data Generation
n = 30
x = np.random.randn(n)**2 - np.random.randn(n)**2
y = 2 * x + np.random.randn(n)**2 - np.random.randn(n)**2
z = x + y + np.random.randn(n)**2 - np.random.randn(n)**2
x = x - np.mean(x)
y = y - np.mean(y)
z = z - np.mean(z)
# Estimate the top
x_y = f(x, y)
y_z = f(y, z)
z_x = f(z, x)
x_z = f(x, z)
z_y = f(z, y)
y_x = f(y, x)
v1 = HSIC_2(x, y_x, z_x, k_x, k_y, k_z)
v2 = HSIC_2(y, z_y, x_y, k_y, k_z, k_x)
v3 = HSIC_2(z, x_z, y_z, k_z, k_x, k_y)
if v1 < v2:
    if v1 < v3:
        top = 1
    else:
        top = 3
else:
    if v2 < v3:
        top = 2
    else:
        top = 3
# Estimate the bottom
x_yz = f(x_y, z_y)
y_zx = f(y_z, x_z)
z_xy = f(z_x, y_x)
if top == 1:
    # Compare the orders x -> y -> z and x -> z -> y
    v1 = HSIC_1(y_x, z_xy, k_y, k_z)
    v2 = HSIC_1(z_x, y_zx, k_z, k_y)
    if v1 < v2:
        middle = 2
        bottom = 3
    else:
        middle = 3
        bottom = 2
if top == 2:
    # Compare the orders y -> z -> x and y -> x -> z
    v1 = HSIC_1(z_y, x_yz, k_z, k_x)
    v2 = HSIC_1(x_y, z_xy, k_x, k_z)
    if v1 < v2:
        middle = 3
        bottom = 1
    else:
        middle = 1
        bottom = 3
if top == 3:
    # Compare the orders z -> x -> y and z -> y -> x
    v1 = HSIC_1(x_z, y_zx, k_x, k_y)
    v2 = HSIC_1(y_z, x_yz, k_y, k_x)
    if v1 < v2:
        middle = 1
        bottom = 2
    else:
        middle = 2
        bottom = 1
# Output results
print("top =", top)
print("middle =", middle)
print("bottom =", bottom)
# Permute x to display the null distribution of HSIC
# Data Generation
x = np.random.randn(n)
y = np.random.randn(n)
u = HSIC_1(x, y, k_x, k_y)
# Permute x to construct the null distribution
m = 100
w = []
for i in range(m):
    x = x[np.random.choice(n, n, replace=False)]  # random permutation of x
    w.append(HSIC_1(x, y, k_x, k_y))
# Set rejection area
v = np.quantile(w, 0.95)
x = np.linspace(min(min(w), u, v), max(max(w), u, v), 200)
# Display in graph
density = gaussian_kde(w)
plt.plot(x, density(x))
plt.axvline(x=v, c="r", linestyle="--")
plt.axvline(x=u, c="b")
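# Finishing sketch: v is the 95% quantile of the permutation null, so independence
# of x and y is rejected at the 5% level when the observed HSIC value u exceeds v.
print("reject H0 (independence):", u > v)
plt.show()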
def h(i, j, q, r, x, y, k_x, k_y):
    # Kernel of the HSIC U-statistic, symmetrized over the orderings of the four indices
    M = list(itertools.permutations([i, j, q, r], 4))
    m = len(M)
    S = 0
    for a in range(m):
        t = M[a][0]
        u = M[a][1]
        v = M[a][2]
        w = M[a][3]
        S = S + k_x(x[t], x[u]) * k_y(y[t], y[u]) \
            + k_x(x[t], x[u]) * k_y(y[v], y[w]) \
            - 2 * k_x(x[t], x[u]) * k_y(y[t], y[v])
    return S / m
def HSIC_U(x, y, k_x, k_y):
    # Unbiased (U-statistic) estimate of HSIC: average of h over all index quadruples
    n = len(x)
    M = list(itertools.combinations(range(n), 4))
    m = len(M)
    S = 0
    for a in range(m):
        S = S + h(M[a][0], M[a][1], M[a][2], M[a][3], x, y, k_x, k_y)
    return S / math.comb(n, 4)
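# Note on cost: HSIC_U enumerates all C(n, 4) index quadruples (about 3.9 million when
# n = 100) and averages 24 orderings for each, so the pure-Python loops below can take
# a long time to run; a smaller n may be preferable for a quick check.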
sigma = 1
def k(x, y):
    return np.exp(-(x - y)**2 / sigma**2)
k_x = k; k_y = k
# Data Generation
n = 100; x = np.random.randn(n)
a = 0 # When it is independent
# a = 0.2 # Correlation coefficient 0.2
y = a * x + np.sqrt(1 - a**2) * np.random.randn(n)
# y = np.random.randn(n) * 2  # y independent of x, with a different scale
# Calculation of the null distribution
K_x = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        K_x[i, j] = k_x(x[i], x[j])
K_y = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        K_y[i, j] = k_y(y[i], y[j])
F = np.zeros(n)
for i in range(n):
    F[i] = np.sum(K_x[i, :]) / n   # row means of K_x
G = np.zeros(n)
for i in range(n):
    G[i] = np.sum(K_y[i, :]) / n   # row means of K_y
H = np.sum(F) / n                  # grand mean of K_x
I = np.sum(G) / n                  # grand mean of K_y
K = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        K[i, j] = (K_x[i, j] - F[i] - F[j] + H) * (K_y[i, j] - G[i] - G[j] + I) / 6
r = 20
lam, vec = np.linalg.eigh(K)   # K is symmetric
lam = np.sort(lam)[::-1] / n   # descending order so that the r largest are used
print(lam)
z = []
for s in range(10000):
    z.append(1 / n * (np.sum(lam[0:r] * (np.random.chisquare(df=1, size=r) - 1))))
v = np.quantile(z, 0.95)
# Calculation of statistics
u = HSIC_U(x, y, k_x, k_y)
# Illustration of graphs
x = np.linspace(min(min(z), u, v), max(max(z), u, v), 200)
density = gaussian_kde(z)
plt.plot(x, density(x))
plt.axvline(x=v, c="r", linestyle="--")
plt.axvline(x=u, c="b")
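# Finishing sketch: reject independence of x and y at the 5% level when the
# U-statistic u exceeds the 95% quantile v of the simulated null distribution.
print("reject H0 (independence):", u > v)
plt.show()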