Chapter 3: Resampling

35

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from numpy.random import randn
%matplotlib inline
# import japanize_matplotlib  # for Google Colab
In [ ]:
# For Anaconda, use the following
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
In [ ]:
def cv_linear(X, y, K):
    n = len(y)
    m = n // K                                     # size of each fold
    S = 0
    for j in range(K):
        test = list(range(j*m, (j+1)*m))           # indices of the test data
        train = list(set(range(n)) - set(test))    # indices of the training data
        beta = np.linalg.inv(X[train].T @ X[train]) @ X[train].T @ y[train]
        e = y[test] - X[test] @ beta               # test residuals
        S = S + np.linalg.norm(e)**2
    return S / n
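
A minimal sanity check of cv_linear on a tiny synthetic regression (the data, seed, and the choice K = 5 below are illustrative assumptions, not part of the text):

In [ ]:
# Toy check of cv_linear: small design matrix with an intercept column
rng = np.random.default_rng(0)
X_toy = np.insert(rng.standard_normal((20, 2)), 0, 1, axis=1)
y_toy = X_toy @ np.array([1.0, 2.0, -1.0]) + 0.1 * rng.standard_normal(20)
cv_linear(X_toy, y_toy, 5)  # average squared prediction error over 5 folds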
In [ ]:
def cv_fast(X, y, k):
    n = len(y)
    m = n // k                                    # size of each fold
    H = X @ np.linalg.inv(X.T @ X) @ X.T          # hat matrix
    I_n = np.eye(n)
    e = (I_n - H) @ y                             # residuals of the full-sample fit
    I_m = np.eye(m)
    S = 0
    for j in range(k):
        test = np.arange(j*m, (j+1)*m)            # indices of the test data
        A = np.linalg.inv(I_m - H[np.ix_(test, test)])
        u = A @ e[test]                           # test residuals of the fit without this fold
        S = S + u.T @ u
    return S / n
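
The speed-up in cv_fast rests on a standard least-squares identity (a sketch of the reasoning, added as a note): with the hat matrix $H = X(X^\top X)^{-1}X^\top$ and full-sample residuals $e = (I_n - H)y$, the test residuals of the fit that leaves out a fold $S$ of size $m$ are $(I_m - H_{SS})^{-1} e_S$, so

$$\mathrm{CV} = \frac{1}{n} \sum_{j=1}^{k} \left\| (I_m - H_{S_j S_j})^{-1} e_{S_j} \right\|^2 .$$

Only one $(p+1)\times(p+1)$ inversion for $H$ and one $m \times m$ inversion per fold are needed, instead of refitting the regression $k$ times.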
In [ ]:
n = 1000
p = 5
X = np.insert(randn(n, p), 0, 1, axis=1)
beta = randn(p+1).reshape(-1, 1)
y = X@beta + 0.2*randn(n).reshape(-1, 1)
y = y[:, 0]
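
A quick consistency check (not in the original text): the two implementations should agree on this data up to floating-point error.

In [ ]:
# cv_linear and cv_fast should return (almost) the same value for the same k
print(cv_linear(X, y, 10), cv_fast(X, y, 10))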
In [ ]:
import time
U_l = []
V_l = []
U_f = []
V_f = []
for k in range(2, n+1, 1):
    if n % k == 0:
        t1 = time.time()  # time before the call
        cv_linear(X, y, k)
        t2 = time.time()  # time after the call
        U_l.append(k)
        V_l.append(t2 - t1)
        t1 = time.time()
        cv_fast(X, y, k)
        t2 = time.time()
        U_f.append(k)
        V_f.append(t2-t1)
plt.plot(U_l, V_l, c="red", label="cv_linear")
plt.plot(U_f, V_f, c="blue", label="cv_fast")
plt.legend()
plt.xlabel("k")
plt.ylabel("実行時間")
plt.title("cv_fastとcv_linearの比較")

36

In [ ]:
n = 100
p = 5
plt.ylim(0.3, 1.5)
plt.xlabel("k")
plt.ylabel("CVの値")
for j in range(2, 11, 1):
    X = randn(n, p)
    X = np.insert(X, 0, 1, axis=1)
    beta = randn(p+1)
    y = X@beta + randn(n)
    U = []
    V = []
    for k in range(2, n+1, 1):
        if n % k == 0:
            U.append(k)
            V.append(cv_linear(X, y, k))
    plt.plot(U, V)

37

In [ ]:
from sklearn.datasets import load_iris
In [ ]:
def knn_1(x, y, z, k):   # repeated from an earlier chapter
    x = np.array(x)
    n, p = x.shape
    dis = np.zeros(n)
    for i in range(n):
        dis[i] = (z - x[i, ]).T @ (z - x[i, ])
    S = np.argsort(dis)[0:k]  # indices of the k nearest points
    u = np.bincount(y[S])     # count the frequency of each class
    m = [i for i, v in enumerate(u) if v == max(u)]  # indices of the most frequent classes
    # Tie-breaking (when two or more classes are most frequent)
    while len(m) > 1:
        k = k - 1
        S = S[0:k]
        u = np.bincount(y[S])
        m = [i for i, v in enumerate(u) if v == max(u)]  # indices of the most frequent classes
    return m[0]
In [ ]:
def knn(x, y, z, k):
    # Apply knn_1 to each row of z
    n = z.shape[0]
    w = np.zeros(n)
    for i in range(n):
        w[i] = knn_1(x, y, z[i, ], k)
    return w
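
A minimal usage sketch of knn / knn_1 on hand-made data (the points and labels below are arbitrary illustrations):

In [ ]:
# Two well-separated clusters; classify two query points with k = 3
x_toy = np.array([[0, 0], [0, 1], [1, 0], [5, 5], [5, 6], [6, 5]])
y_toy = np.array([0, 0, 0, 1, 1, 1])
z_toy = np.array([[0.2, 0.4], [5.5, 5.2]])
knn(x_toy, y_toy, z_toy, k=3)  # expected: array([0., 1.])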
In [ ]:
iris = load_iris()
iris.target_names
x = iris.data
y = iris.target
n = x.shape[0]
index = np.random.choice(n, n, replace=False)  # shuffle the samples
x = x[index, ]
y = y[index]
In [ ]:
U = []
V = []
top_seq = list(range(0, 150, 15))
for k in range(1, 11, 1):
    S = 0
    for top in top_seq:
        test = list(range(top, top+15))
        train = list(set(range(150))-set(test))
        knn_ans = knn(x[train, ], y[train], x[test, ], k=k)
        ans = y[test]
        S = S + np.sum(knn_ans != ans)
    S = S / n
    U.append(k)
    V.append(S)
plt.plot(U, V)
plt.xlabel("K")
plt.ylabel(" 誤り率")
plt.title("CV による誤り率の評価")

38

In [ ]:
def bt(df, f, r):
    m = df.shape[0]
    org = f(df, np.arange(0, m, 1))                   # statistic on the original sample
    u = []
    for j in range(r):
        index = np.random.choice(m, m, replace=True)  # resample the rows with replacement
        u.append(f(df, index))
    return {"original": org, "bias": np.mean(u) - org,
            "stderr": np.std(u, ddof=1)}
In [ ]:
def func_1(data, index):
    X = data[index, 0]
    Y = data[index, 1]
    return ((np.var(Y, ddof=1) - np.cov(X, Y)[0, 1])
            / (np.var(X, ddof=1) + np.var(Y, ddof=1) - 2*np.cov(X, Y)[0, 1]))
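
Written out, the statistic that func_1 returns is the variance-minimizing portfolio weight

$$\hat\alpha = \frac{\hat\sigma_Y^2 - \hat\sigma_{XY}}{\hat\sigma_X^2 + \hat\sigma_Y^2 - 2\hat\sigma_{XY}},$$

i.e. the value of $\alpha$ that minimizes $\mathrm{Var}(\alpha X + (1-\alpha)Y)$, with the variances and covariance replaced by their sample estimates.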
In [ ]:
Portfolio = np.loadtxt("Portfolio.csv", delimiter=",", skiprows=1)
bt(Portfolio, func_1, 1000)

39

In [ ]:
from sklearn import linear_model
In [ ]:
df = np.loadtxt("crime.txt", delimiter="\t")
reg = linear_model.LinearRegression()
X = df[:, 2:4]
y = df[:, 0]
reg.fit(X, y)
reg.coef_
In [ ]:
for j in range(3):
    def func_2(data, index):
        X = data[index, 2:4]
        y = data[index, 0]
        reg.fit(X, y)
        if j == 0:
            return reg.intercept_   # j = 0: bootstrap the intercept
        else:
            return reg.coef_[j-1]   # j = 1, 2: bootstrap each slope coefficient
    print(bt(df, func_2, 1000))
In [ ]:
import statsmodels.api as sm
In [ ]:
n = X.shape[0]
X = np.insert(X, 0, 1, axis=1)
model = sm.OLS(y, X)
res = model.fit()
print(res.summary())
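
For comparison with the bootstrap output above, res.bse contains the model-based standard errors reported in the summary table (this comparison cell is an added note, not from the text):

In [ ]:
# Standard errors of (intercept, coefficient 1, coefficient 2) from the OLS fit
print(res.bse)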