import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from numpy.random import randn
%matplotlib inline
# import japanize_matplotlib # Google Colabの場合
# Anacondaの場合は下記
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
def cv_linear(X, y, K):
n = len(y)
m = int(n/K)
S = 0
for j in range(K):
test = list(range(j*m, (j+1)*m)) # テストデータの添え字
train = list(set(range(n))-set(test)) # 訓練データの添え字
beta = np.linalg.inv(X[train, ].T@X[train, ])@X[train, ].T@y[train]
e = y[test] - X[test, ]@beta
S = S + np.linalg.norm(e)**2
return S/n
def cv_fast(X, y, k):
n = len(y)
m = n / k
H = X@np.linalg.inv(X.T@X)@X.T
J = np.diag(np.repeat(1, n))
e = (J - H)@y
K = np.diag(np.repeat(1, m))
S = 0
for j in range(k):
test = np.arange(j*m, (j+1)*m, 1, dtype=int)
S = S + (np.linalg.inv(K - H[test, :][:, test])@e[test]).T@np.linalg.inv(K - H[test, test])@e[test]
return S/n
n = 1000
p = 5
X = np.insert(randn(n, p), 0, 1, axis=1)
beta = randn(p+1).reshape(-1, 1)
y = X@beta + 0.2*randn(n).reshape(-1, 1)
y = y[:, 0]
import time
U_l = []
V_l = []
U_f = []
V_f = []
for k in range(2, n+1, 1):
if n % k == 0:
t1 = time.time() # 処理前の時刻
cv_linear(X, y, k)
t2 = time.time() # 処理後の時刻
U_l.append(k)
V_l.append(t2 - t1)
t1 = time.time()
cv_fast(X, y, k)
t2 = time.time()
U_f.append(k)
V_f.append(t2-t1)
plt.plot(U_l, V_l, c="red", label="cv_linear")
plt.plot(U_f, V_f, c="blue", label="cv_fast")
plt.legend()
plt.xlabel("k")
plt.ylabel("実行時間")
plt.title("cv_fastとcv_linearの比較")
n = 100
p = 5
plt.ylim(0.3, 1.5)
plt.xlabel("k")
plt.ylabel("CVの値")
for j in range(2, 11, 1):
X = randn(n, p)
X = np.insert(X, 0, 1, axis=1)
beta = randn(p+1)
y = X@beta + randn(n)
U = []
V = []
for k in range(2, n+1, 1):
if n % k == 0:
U.append(k)
V.append(cv_linear(X, y, k))
plt.plot(U, V)
from sklearn.datasets import load_iris
def knn_1(x, y, z, k): # 再掲
x = np.array(x)
n, p = x.shape
dis = np.zeros(n)
for i in range(n):
dis[i] = (z-x[i, ]).T@(z-x[i, ])
S = np.argsort(dis)[0:k] # 距離が近いk個のindex
u = np.bincount(y[S]) # 度数を数える
m = [i for i, x in enumerate(u) if x == max(u)] # 最頻値のindex
# タイブレーキングの処理(最頻値が2個以上ある場合)
while len(m) > 1:
k = k - 1
S = S[0:k]
u = np.bincount(y[S])
m = [i for i, x in enumerate(u) if x == max(u)] # 最頻値のindex
return m[0]
def knn(x, y, z, k):
n = z.shape[0]
w = np.zeros(n)
for i in range(n):
w[i] = knn_1(x, y, z[i, ], k)
return w
iris = load_iris()
iris.target_names
x = iris.data
y = iris.target
n = x.shape[0]
index = np.random.choice(n, n, replace=False) # 並び替える
x = x[index, ]
y = y[index]
U = []
V = []
top_seq = list(range(0, 150, 15))
for k in range(1, 11, 1):
S = 0
for top in top_seq:
test = list(range(top, top+15))
train = list(set(range(150))-set(test))
knn_ans = knn(x[train, ], y[train], x[test, ], k=k)
ans = y[test]
S = S + np.sum(knn_ans != ans)
S = S / n
U.append(k)
V.append(S)
plt.plot(U, V)
plt.xlabel("K")
plt.ylabel(" 誤り率")
plt.title("CV による誤り率の評価")
def bt(df, f, r):
m = df.shape[0]
org = f(df, np.arange(0, m, 1))
u = []
for j in range(r):
index = np.random.choice(m, m, replace=True)
u.append(f(df, index))
return {"original":org, "bias":np.mean(u)-org,
"stderr":np.std(u, ddof=1)}
def func_1(data, index):
X = data[index, 0]
Y = data[index, 1]
return ((np.var(Y, ddof=1)-np.cov(X, Y))
/ (np.var(X, ddof=1)+np.var(Y, ddof=1)-2*np.cov(X, Y)[0, 1]))
Portfolio = np.loadtxt("Portfolio.csv", delimiter=",", skiprows=1)
bt(Portfolio, func_1, 1000)
from sklearn import linear_model
df = np.loadtxt("crime.txt", delimiter="\t")
reg = linear_model.LinearRegression()
X = df[:, 2:4]
y = df[:, 0]
reg.fit(X, y)
reg.coef_
for j in range(3):
def func_2(data, index):
X = data[index, 2:4]
y = data[index, 0]
reg.fit(X, y)
if j == 0:
return reg.intercept_
else:
return reg.coef_[j-1]
print(bt(df, func_2, 1000))
import statsmodels.api as sm
n = X.shape[0]
X = np.insert(X, 0, 1, axis=1)
model = sm.OLS(y, X)
res = model.fit()
print(res.summary())