Chapter 3 Classification¶
20¶
In [ ]:
from sklearn.datasets import load_iris
from numpy.random import randn
from scipy import stats
import scipy
import matplotlib.pyplot as plt
import numpy as np
In [ ]:
def f(x):  # logistic (sigmoid) function; beta_0 and beta are set globally below
    return np.exp(beta_0+beta*x)/(1+np.exp(beta_0+beta*x))
In [ ]:
beta_0 = 0
beta_seq = np.array([0, 0.2, 0.5, 1, 2, 10])
x_seq = np.arange(-10, 10, 0.1)
plt.xlabel("x")
plt.ylabel("P(Y=1|x)")
plt.title("Logistic Curve")
for i in range(beta_seq.shape[0]):
    beta = beta_seq[i]
    p = f(x_seq)
    plt.plot(x_seq, p, label='{}'.format(beta))
plt.legend(loc='upper left')
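As a sanity check (an addition, not part of the original exercise), the same curves can be reproduced with scipy.special.expit, SciPy's numerically stable sigmoid:

In [ ]:
from scipy.special import expit
np.allclose(f(x_seq), expit(beta_0+beta*x_seq))  # True: f is the standard sigmoid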
22 (a)¶
In [ ]:
def f(x):
    return x**2-1

def df(x):
    return 2*x
x_seq = np.arange(-1, 5, 0.1)
f_x = f(x_seq)
plt.plot(x_seq, f_x)
plt.axhline(y=0, c="black", linewidth=0.5)
plt.xlabel("x")
plt.ylabel("f(x)")
x = 4
for i in range(10):
    X = x
    Y = f(x)          # point (X, Y) before the update
    x = x-f(x)/df(x)  # Newton update: follow the tangent line down to the x-axis
    y = f(x)          # value after the update
    plt.plot([X, x], [Y, 0], c="black", linewidth=0.8)
    plt.plot([X, X], [Y, 0], c="black", linestyle="dashed", linewidth=0.8)
plt.scatter(x, 0, c="red")
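For comparison (an addition), scipy.optimize.newton runs the same iteration and, from the same starting point, should land on the same root x = 1:

In [ ]:
from scipy.optimize import newton
newton(f, 4, fprime=df)  # converges to the root x = 1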
22 (b)¶
In [ ]:
def f(z):
    return z[0]**2+z[1]**2-1

def dfx(z):
    return 2*z[0]

def dfy(z):
    return 2*z[1]

def g(z):
    return z[0]+z[1]

def dgx(z):
    return 1

def dgy(z):
    return 1

z = np.array([3, 4])  # initial value
for i in range(10):
    J = np.array([[dfx(z), dfy(z)], [dgx(z), dgy(z)]])  # Jacobian of (f, g)
    z = z-np.linalg.inv(J)@np.array([f(z), g(z)])       # Newton update for the system
z
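As a cross-check (an addition), scipy.optimize.fsolve solves the same system; both should reach one of the two intersection points (±1/√2, ∓1/√2) of the circle and the line:

In [ ]:
from scipy.optimize import fsolve
fsolve(lambda z: [f(z), g(z)], [3, 4])  # should agree with z above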
24¶
In [ ]:
N = 1000
p = 2
X = randn(N, p)
X = np.insert(X, 0, 1, axis=1)
beta = randn(p+1)
y = []
prob = 1/(1+np.exp(X@beta))  # P(Y=-1|x), so P(Y=1|x) = exp(X beta)/(1+exp(X beta))
for i in range(N):
    if (np.random.rand(1) > prob[i]):
        y.append(1)
    else:
        y.append(-1)
y = np.array(y)
# Data generation ends here
beta # Check
In [ ]:
# Maximum likelihood estimation
beta = np.inf
gamma = randn(p+1) # Initial value of beta
print(gamma)
while (np.sum((beta-gamma)**2) > 0.001):
    beta = gamma
    s = X@beta
    v = np.exp(-s*y)
    u = (y*v)/(1+v)
    w = v/((1+v)**2)   # weights
    W = np.diag(w)
    z = s+u/w          # working response
    gamma = np.linalg.inv(X.T@W@X)@X.T@W@z  # weighted least-squares step
print(gamma)
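The estimate can be compared (an addition, assuming scikit-learn is available) with an almost unpenalized LogisticRegression; since X already contains the intercept column, sklearn's own intercept is disabled:

In [ ]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=1e10, fit_intercept=False)  # large C ~ no regularization
model.fit(X, y)
model.coef_  # should be close to gamma above and to the true beta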
26¶
In [ ]:
# Data generation
n = 100
x = np.concatenate([randn(n)+1, randn(n)-1], 0)
y = np.concatenate([np.ones(n), -np.ones(n)], 0)
train = np.random.choice(2*n, int(n), replace=False) # Indices of training data
test = list(set(range(2*n))-set(train)) # Indices of test data
X = np.insert(x[train].reshape(-1, 1), 0, 1, axis=1)  # insert a column of ones to the left of x, making it 2 columns
Y = y[train]
# Several runs may be needed: convergence depends on the initial value of gamma
p = 1
beta = [0, 0]
gamma = randn(p+1)
print(gamma)
while (np.sum((beta-gamma)**2) > 0.001):
    beta = gamma
    s = X@beta
    v = np.exp(-s*Y)
    u = (Y*v)/(1+v)
    w = v/((1+v)**2)
    W = np.diag(w)
    z = s+u/w
    gamma = np.linalg.inv(X.T@W@X)@X.T@W@z
print(gamma)
In [ ]:
def table_count(m, u, v):
    n = u.shape[0]
    count = np.zeros([m, m])
    for i in range(n):
        count[int(u[i]), int(v[i])] += 1
    return count
ans = y[test] # Correct answer
pred = np.sign(gamma[0]+x[test]*gamma[1]) # Prediction
ans = (ans+1)/2 # Change from -1,1 to 0,1
pred = (pred+1)/2 # Change from -1,1 to 0,1
table_count(2, ans, pred)
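For reference (an addition), scikit-learn's confusion_matrix produces the same table, with rows as true labels and columns as predictions:

In [ ]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ans, pred)  # matches table_count(2, ans, pred)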
28¶
In [ ]:
# True parameters
mu_1 = np.array([2, 2])
sigma_1 = 2
sigma_2 = 2
rho_1 = 0
mu_2 = np.array([-3, -3])
sigma_3 = 1
sigma_4 = 1
rho_2 = -0.8
# Data generation based on true parameters
n = 100
u = randn(n)
v = randn(n)
x_1 = sigma_1*u+mu_1[0]
y_1 = (rho_1*u+np.sqrt(1-rho_1**2)*v)*sigma_2+mu_1[1]
u = randn(n)
v = randn(n)
x_2 = sigma_3*u+mu_2[0]
y_2 = (rho_2*u+np.sqrt(1-rho_2**2)*v)*sigma_4+mu_2[1]
# Estimate parameters from data
mu_1 = np.average((x_1, y_1), 1)
mu_2 = np.average((x_2, y_2), 1)
df = np.array([x_1, y_1])
mat = np.cov(df, rowvar=1)
inv_1 = np.linalg.inv(mat)
de_1 = np.linalg.det(mat)
inv_2 = inv_1  # both classes share the covariance estimated from class 1, so the boundary is linear
de_2 = de_1
# Substitute estimated parameters into the distribution formula
def f(x, mu, inv, de):
    return -0.5*(x-mu).T@inv@(x-mu)-0.5*np.log(de)

def f_1(u, v):
    return f(np.array([u, v]), mu_1, inv_1, de_1)

def f_2(u, v):
    return f(np.array([u, v]), mu_2, inv_2, de_2)
# Create contour data
# Draw boundary line where this value is 0
pi_1 = 0.5
pi_2 = 0.5
u = v = np.linspace(-6, 6, 50)
m = len(u)
w = np.zeros([m, m])
for i in range(m):
    for j in range(m):
        w[i, j] = np.log(pi_1)+f_1(u[i], v[j])-np.log(pi_2)-f_2(u[i], v[j])
# Plot the boundary (the zero contour) and the data
plt.contour(u, v, w.T, levels=[0], colors=['black'])  # w.T because w[i, j] was filled at (x=u[i], y=v[j])
plt.scatter(x_1, y_1, c="red")
plt.scatter(x_2, y_2, c="blue")
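As a rough cross-check (an addition; the cell above reuses the class-1 covariance for both classes, which sklearn's LDA approximates with a pooled covariance), the fitted linear boundary should look similar to the contour:

In [ ]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
XX = np.vstack([np.column_stack([x_1, y_1]), np.column_stack([x_2, y_2])])
yy = np.concatenate([np.ones(n), -np.ones(n)])
lda = LinearDiscriminantAnalysis().fit(XX, yy)
lda.coef_, lda.intercept_  # coefficients of a linear boundary comparable to the one plotted above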
29¶
In [ ]:
iris = load_iris()
iris.target_names
x = iris.data
y = iris.target
n = len(x)
train = np.random.choice(n, int(n/2), replace=False)
test = list(set(range(n))-set(train))
# Estimate parameters
X = x[train, :]
Y = y[train]
mu = []
covv = []
for j in range(3):
    xx = X[Y == j, :]
    mu.append(np.mean(xx, 0))
    covv.append(np.cov(xx, rowvar=0))
# Define distribution formulas with estimated parameters
def f(w, mu, inv, de):
    return -0.5*(w-mu).T@inv@(w-mu)-0.5*np.log(de)

def g(v, j):
    return f(v, mu[j], np.linalg.inv(covv[j]), np.linalg.det(covv[j]))

z = []
for i in test:
    # add the log prior probabilities (0.5, 0.25, 0.25) to the log densities
    a = np.log(0.5)+g(x[i, ], 0)
    b = np.log(0.25)+g(x[i, ], 1)
    c = np.log(0.25)+g(x[i, ], 2)
    z.append(np.argmax([a, b, c]))  # the class with the largest posterior
u = y[test]
count = np.zeros([3, 3])
for i in range(int(n/2)):
    count[u[i], z[i]] += 1
count
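The result can be checked (an addition, assuming scikit-learn) against QuadraticDiscriminantAnalysis fitted with the identical priors:

In [ ]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis(priors=[0.5, 0.25, 0.25]).fit(X, Y)
table_count(3, y[test], qda.predict(x[test, :]))  # should resemble count above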
30¶
In [ ]:
def knn_1(x, y, z, k):
    x = np.array(x)
    y = np.array(y)
    dis = []
    for i in range(x.shape[0]):
        dis.append(np.linalg.norm(z-x[i, ]))
    S = np.argsort(dis)[0:k]                         # indices of the k nearest points
    u = np.bincount(y[S])                            # count the class frequencies
    m = [i for i, x in enumerate(u) if x == max(u)]  # indices of the mode(s)
    # Tie-breaking: shrink k until the mode is unique
    while (len(m) > 1):
        k = k-1
        S = S[0:k]
        u = np.bincount(y[S])
        m = [i for i, x in enumerate(u) if x == max(u)]
    return m[0]
Generalization¶
In [ ]:
def knn(x, y, z, k):
    w = []
    for i in range(z.shape[0]):
        w.append(knn_1(x, y, z[i, ], k))
    return w
In [ ]:
iris = load_iris()
iris.target_names
x = iris.data
y = iris.target
n = x.shape[0]
train = np.random.choice(n, int(n/2), replace=False)
test = list(set(range(n))-set(train))
w = knn(x[train,], y[train], x[test,], k=3)
table_count(3, y[test], w)
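For comparison (an addition), scikit-learn's KNeighborsClassifier fits the same model; a few cells may differ because its tie-breaking is not the shrink-k rule used in knn_1:

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
knn_sk = KNeighborsClassifier(n_neighbors=3).fit(x[train, ], y[train])
table_count(3, y[test], knn_sk.predict(x[test, ]))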
In [ ]:
N_0 = 10000
N_1 = 1000
mu_1 = 1
mu_0 = -1  # diseased: label 1, healthy: label 0
var_1 = 1  # used as standard deviations below (both equal 1, so the distinction is harmless)
var_0 = 1
x = np.random.normal(mu_0, var_0, N_0)
y = np.random.normal(mu_1, var_1, N_1)
theta_seq = np.exp(np.arange(-10, 100, 0.1))
U = []
V = []
for i in range(len(theta_seq)):
    u = np.sum(stats.norm.pdf(x, mu_1, var_1)/stats.norm.pdf(x, mu_0, var_0) > theta_seq[i])/N_0  # healthy judged diseased (false positive rate)
    v = np.sum(stats.norm.pdf(y, mu_1, var_1)/stats.norm.pdf(y, mu_0, var_0) > theta_seq[i])/N_1  # diseased judged diseased (true positive rate)
    U.append(u)
    V.append(v)
AUC = 0  # area under the ROC curve
for i in range(len(theta_seq)-1):
    AUC = AUC+np.abs(U[i+1]-U[i])*V[i]
plt.plot(U, V)
plt.xlabel("False Positive")
plt.ylabel("True Positive")
plt.title("ROC Curve")
plt.text(0.3, 0.5, 'AUC={}'.format(AUC), fontsize=15)
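The area can be verified (an addition, assuming scikit-learn) with roc_auc_score; because the likelihood ratio here is exp(2t), monotone in the observed value, ranking by the raw observations yields the same ROC curve:

In [ ]:
from sklearn.metrics import roc_auc_score
labels = np.concatenate([np.zeros(N_0), np.ones(N_1)])
scores = np.concatenate([x, y])  # likelihood ratio exp(2t) is monotone in t
roc_auc_score(labels, scores)    # should be close to the AUC above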