import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from numpy.random import randn
# import japanize_matplotlib # Google Colabの場合
%matplotlib inline
# Anacondaの場合は下記
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
def f(x):
return np.exp(beta_0 + beta*x) / (1+np.exp(beta_0 + beta*x))
beta_0 = 0
beta_seq = np.array([0, 0.2, 0.5, 1, 2, 10])
x_seq = np.arange(-10, 10, 0.1)
plt.xlabel("x")
plt.ylabel("P(Y=1|x)")
plt.title("ロジスティック曲線")
for i in range(beta_seq.shape[0]):
beta = beta_seq[i]
p = f(x_seq)
plt.plot(x_seq, p, label="{}".format(beta))
plt.legend(loc="upper left")
def f(x):
return x**2-1
def df(x):
return 2*x
x_seq = np.arange(-1, 5, 0.1)
f_x = f(x_seq)
plt.plot(x_seq, f_x)
plt.axhline(y=0, c="black", linewidth=0.5)
plt.xlabel("x")
plt.ylabel("f(x)")
x = 4
for i in range(10):
X = x
Y = f(x) # X,Y更新前の点
x = x - f(x)/df(x) # x 更新後
y = f(x) # y 更新後
plt.plot([X, x], [Y, 0], c="black", linewidth=0.8)
plt.plot([X, X], [Y, 0], c="black", linestyle="dashed", linewidth=0.8)
plt.scatter(x, 0, c="red")
def f(z):
return z[0]**2 + z[1]**2 - 1
def dfx(z):
return 2*z[0]
def dfy(z):
return 2*z[1]
def g(z):
return z[0] + z[1]
def dgx(z):
return 1
def dgy(z):
return 1
z = np.array([3, 4]) # 初期値
for i in range(10):
J = np.array([[dfx(z), dfy(z)], [dgx(z), dgy(z)]])
z = z - np.linalg.inv(J)@np.array([f(z), g(z)])
z
z
N = 1000
p = 2
X = randn(N, p)
X = np.insert(X, 0, 1, axis=1)
beta = randn(p+1)
y = []
prob = 1 / (1+np.exp(X@beta))
for i in range(N):
if np.random.rand(1) > prob[i]:
y.append(1)
else :
y.append(-1)
# データの生成ここまで
beta # 確認
# 最尤推定
beta = np.inf
gamma = randn(p+1) # betaの初期値
print(gamma)
while np.sum((beta-gamma)**2) > 0.001:
beta = gamma
s = X@beta
v = np.exp(-s*y)
u = (y*v) / (1+v)
w = v / ((1+v)**2)
W = np.diag(w)
z = s + u / w
gamma = np.linalg.inv(X.T@W@X)@X.T@W@z
print(gamma)
# データの生成
n = 100
x = np.concatenate([randn(n)+1, randn(n)-1], 0)
y = np.concatenate([np.ones(n), -np.ones(n)], 0)
train = np.random.choice(2*n, int(n), replace=False) # 訓練データの添え字
test = list(set(range(2*n))-set(train)) # テストデータの添え字
X = np.insert(x[train].reshape(-1, 1), 0, 1, axis=1)
Y = y[train]
# すべて1の列をxの左において、2列とした
# gammaの初期値によっては収束しないので、複数回施行することがある
p = 1
beta = [0, 0]
gamma = randn(p+1)
print(gamma)
while np.sum((beta-gamma)**2) > 0.001:
beta = gamma
s = X@beta
v = np.exp(-s*Y)
u = (Y*v) / (1+v)
w = v / ((1+v)**2)
W = np.diag(w)
z = s + u/w
gamma = np.linalg.inv(X.T@W@X)@X.T@W@z
print(gamma)
def table_count(m, u, v):
n = u.shape[0]
count = np.zeros([m, m])
for i in range(n):
count[int(u[i]), int(v[i])] += 1
return (count)
ans = y[test] # 正解
pred = np.sign(gamma[0] + x[test]*gamma[1]) # 予測
ans = (ans+1) / 2 # -1,1から0,1になおす
pred = (pred+1) / 2 # -1,1から0,1になおす
table_count(2, ans, pred)
# 真のパラメータ
mu_1 = np.array([2, 2])
sigma_1 = 2
sigma_2 = 2
rho_1 = 0
mu_2 = np.array([-3, -3])
sigma_3 = 1
sigma_4 = 1
rho_2 = -0.8
# 真のパラメータに基づいてデータを発生
n = 100
u = randn(n)
v = randn(n)
x_1 = sigma_1*u + mu_1[0]
y_1 = (rho_1*u + np.sqrt(1-rho_1**2)*v)*sigma_2 + mu_1[1]
u = randn(n)
v = randn(n)
x_2 = sigma_3*u + mu_2[0]
y_2 = (rho_2*u + np.sqrt(1-rho_2**2)*v)*sigma_4 + mu_2[1]
# データからパラメータを推定
mu_1 = np.average((x_1, y_1), 1)
mu_2 = np.average((x_2, y_2), 1)
df = np.array([x_1, y_1])
mat = np.cov(df, rowvar=1)
inv_1 = np.linalg.inv(mat)
de_1 = np.linalg.det(mat) #
inv_2 = inv_1
de_2 = de_1 #
# 推定されたパラメータを分布の式に代入
def f(x, mu, inv, de):
return (-0.5*(x-mu).T@inv@(x-mu) - 0.5*np.log(de))
def f_1(u, v):
return f(np.array([u, v]), mu_1, inv_1, de_1)
def f_2(u, v):
return f(np.array([u, v]), mu_2, inv_2, de_2)
# 等高線データを作成
# この値が0の箇所に境界線をひく
pi_1 = 0.5
pi_2 = 0.5
u = v = np.linspace(-6, 6, 50)
m = len(u)
w = np.zeros([m, m])
for i in range(m):
for j in range(m):
w[i, j] = np.log(pi_1)+f_1(u[i], v[j])-np.log(pi_2)-f_2(u[i], v[j])
# 境界線とデータをプロット
plt.contour(u, v, w, levels=0, colors=["black"])
plt.scatter(x_1, y_1, c="red")
plt.scatter(x_2, y_2, c="blue")
from sklearn.datasets import load_iris
iris = load_iris()
iris.target_names
x = iris.data
y = iris.target
n = len(x)
train = np.random.choice(n, int(n/2), replace=False)
test = list(set(range(n)) - set(train))
# パラメータを推定する
X = x[train, :]
Y = y[train]
mu = []
covv = []
for j in range(3):
xx = X[Y == j, :]
mu.append(np.mean(xx, 0))
covv.append(np.cov(xx, rowvar=0))
# 推定されたパラメータを代入する分布の定義式
def f(w, mu, inv, de):
return - 0.5 * (w-mu).T@inv@(w-mu) - 0.5*np.log(de)
def g(v, j):
return f(v, mu[j], np.linalg.inv(covv[j]), np.linalg.det(covv[j]))
z = []
for i in test:
a = 0.5 * g(x[i, ], 0)
b = 0.25 * g(x[i, ], 1)
c = 0.25 * g(x[i, ], 2)
if a < b:
if b < c:
z.append(2)
else:
z.append(1)
else:
z.append(0)
u = y[test]
count = np.zeros([3, 3])
for i in range(int(n/2)):
count[u[i], z[i]] += 1
count
def knn_1(x, y, z, k):
x = np.array(x)
y = np.array(y)
dis = []
for i in range(x.shape[0]):
dis.append(np.linalg.norm(z - x[i, ]))
S = np.argsort(dis)[0:k] # 距離が近いk個のindex
u = np.bincount(y[S]) # 度数を数える
m = [i for i, x in enumerate(u) if x == max(u)] # 最頻値のindex
# タイブレーキングの処理(最頻値が2個以上ある場合)
while len(m) > 1:
k = k - 1
S = S[0:k]
u = np.bincount(y[S])
m = [i for i, x in enumerate(u) if x == max(u)] # 最頻値のindex
return m[0]
# 一般化
def knn(x, y, z, k):
w = []
for i in range(z.shape[0]):
w.append(knn_1(x, y, z[i, ], k))
return w
from sklearn.datasets import load_iris
iris = load_iris()
iris.target_names
x = iris.data
y = iris.target
n = x.shape[0]
train = np.random.choice(n, int(n/2), replace=False)
test = list(set(range(n))-set(train))
w = knn(x[train, ], y[train], x[test, ], k=3)
table_count(3, y[test], w)
N_0 = 10000
N_1 = 1000
mu_1 = 1
mu_0 = -1 # 病気の人1,正常な人0
var_1 = 1
var_0 = 1
x = np.random.normal(mu_0, var_0, N_0)
y = np.random.normal(mu_1, var_1, N_1)
theta_seq = np.exp(np.arange(-10, 100, 0.1))
U = []
V = []
for i in range(len(theta_seq)):
u = np.sum((stats.norm.pdf(x, mu_1, var_1)/stats.norm.pdf(x, mu_0, var_0))
> theta_seq[i])/N_0
# 病気でない人を病気とみなす
v = np.sum((stats.norm.pdf(y, mu_1, var_1)/stats.norm.pdf(y, mu_0, var_0))
> theta_seq[i])/N_1
# 病気の人を病気とみなす
U.append(u)
V.append(v)
AUC = 0 # 面積を求める
for i in range(len(theta_seq)-1):
AUC = AUC + np.abs(U[i+1]-U[i])*V[i]
plt.plot(U, V)
plt.xlabel("False Positive")
plt.ylabel("True Positive")
plt.title("ROC曲線")
plt.text(0.3, 0.5, "AUC={}".format(AUC), fontsize=15)