Joe Suzuki, 「統計的機械学習の数理 with Python 100問」 (Kyoritsu Shuppan)

Chapter 3: Resampling

3.1 Cross-Validation

In [1]:
!pip install japanize_matplotlib
In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#import japanize_matplotlib
import scipy
from scipy import stats
from numpy.random import randn
from sklearn.datasets import load_iris
from sklearn import linear_model
In [3]:
# On Anaconda, use the following (and keep import japanize_matplotlib commented out)
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
In [4]:
def cv_linear(X, y, K):
    n = len(y)
    m = n // K  # fold size (assumes K divides n)
    S = 0
    for j in range(K):
        test = list(range(j * m, (j + 1) * m))  # indices of the test fold
        train = list(set(range(n)) - set(test))  # indices of the training data
        beta = np.linalg.inv(X[train].T @ X[train]) @ X[train].T @ y[train]  # least-squares fit on the training data
        e = y[test] - X[test] @ beta  # prediction errors on the test fold
        S += np.linalg.norm(e) ** 2
    return S / n
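As a cross-check, the same score can be computed with scikit-learn. The sketch below (cv_linear_sklearn is a name introduced here, not the book's) matches cv_linear exactly whenever K divides n, since KFold without shuffling also uses contiguous folds:

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

def cv_linear_sklearn(X, y, K):
    # Same quantity as cv_linear, via scikit-learn's KFold.
    # fit_intercept=False because X already carries an intercept column.
    S = 0
    for train, test in KFold(n_splits=K).split(X):
        reg = LinearRegression(fit_intercept=False).fit(X[train], y[train])
        S += np.sum((y[test] - reg.predict(X[test])) ** 2)
    return S / len(y)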
In [5]:
# Evaluating linear regression by cross-validation
n = 100
p = 5
X = randn(n, p)
X = np.insert(X, 0, 1, axis=1)  # add an intercept column

beta = randn(p + 1)
beta[[1, 2]] = 0  # set the coefficients of variables 1 and 2 to zero
y = X @ beta + randn(n)

print(cv_linear(X[:, [0, 3, 4, 5]], y, 10))  # intercept plus the relevant variables 3, 4, 5
1.188747460207503
In [6]:
print(cv_linear(X, y, 10))
1.2422746229745045
In [7]:
# Simulating the effect of variable selection
U = []
V = []
for j in range(100):
    y = X @ beta + randn(n)
    U.append(cv_linear(X[:, [0, 3, 4, 5]], y, 10))  # relevant variables only
    V.append(cv_linear(X, y, 10))  # all variables

x_seq = np.linspace(0.7, 1.5, 100)
y_seq = x_seq  # the line y = x
plt.plot(x_seq, y_seq, c="red")
plt.scatter(U, V)
plt.xlabel("Squared error with variables 3, 4, 5 selected")
plt.ylabel("Squared error with all variables selected")
plt.title("Overfitting from selecting too many variables")
plt.show()
In [8]:
n = 100
p = 5

plt.ylim(0.3, 1.5)
plt.xlabel("k")
plt.ylabel("CVの値")

for j in range(2, 11):
    X = np.random.randn(n, p)
    X = np.insert(X, 0, 1, axis=1)
    beta = np.random.randn(p + 1)
    y = X @ beta + np.random.randn(n)
    U = []
    V = []
    for k in range(2, n + 1):
        if n % k == 0:  # use only those k that divide n evenly
            U.append(k)
            V.append(cv_linear(X, y, k))
    plt.plot(U, V)
plt.show()
In [9]:
def knn_1(x, y, z, k):
    x = np.array(x)
    n, p = x.shape
    dis = np.zeros(n)
    for i in range(n):
        dis[i] = np.sum((z - x[i]) ** 2)
    S = np.argsort(dis)[:k]  # indices of the k nearest points
    u = np.bincount(y[S])  # class frequencies among those k neighbors
    m = [i for i, v in enumerate(u) if v == max(u)]  # most frequent classes
    # Tie-breaking: while two or more classes are tied, drop the
    # farthest neighbor and recount
    while len(m) > 1:
        k -= 1
        S = S[:k]
        u = np.bincount(y[S])
        m = [i for i, v in enumerate(u) if v == max(u)]
    return m[0]

def knn(x, y, z, k):
    n = z.shape[0]
    w = np.zeros(n)
    for i in range(n):
        w[i] = knn_1(x, y, z[i], k)
    return w
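For reference, scikit-learn provides the same classifier as KNeighborsClassifier. A minimal cross-check sketch (np and knn come from the cells above; the two implementations can disagree only on ties, which they break differently):

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

iris = load_iris()
clf = KNeighborsClassifier(n_neighbors=3).fit(iris.data, iris.target)
# fraction of points on which the two implementations agree
print(np.mean(clf.predict(iris.data) == knn(iris.data, iris.target, iris.data, 3)))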
In [10]:
# K-nearest neighbors and its evaluation
iris = load_iris()
x = iris.data
y = iris.target

n = x.shape[0]
index = np.random.choice(n, n, replace=False)  # random permutation (shuffle) of the samples
x = x[index]
y = y[index]
In [11]:
U = []
V = []
top_seq = list(range(0, 150, 15))  # fold start positions: 10 folds of size 15

for k in range(1, 11):
    S = 0
    for top in top_seq:
        test = list(range(top, top + 15))
        train = list(set(range(150)) - set(test))
        knn_ans = knn(x[train], y[train], x[test], k=k)
        ans = y[test]
        S += np.sum(knn_ans != ans)  # count the misclassified points
    S /= n
    U.append(k)
    V.append(S)

plt.plot(U, V)
plt.xlabel("K")
plt.ylabel("誤り率")
plt.title("CVによる誤り率の評価")
plt.show()

The curve above varies considerably from run to run, depending on the data (i.e., on the random shuffle).
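To make it reproducible, one can fix a seed before the shuffle in In [10], e.g. np.random.seed(0) (an arbitrary choice).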

3.2 Fast CV
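The speed-up rests on a standard least-squares identity. With the hat matrix $H = X(X^\top X)^{-1}X^\top$ and the full-sample residual $e = (I - H)y$, the residuals on a fold $S$ of the model fitted *without* fold $S$ are

$$e_S^{(-S)} = (I - H_{S,S})^{-1} e_S,$$

so cross-validation needs only one fit of the full model plus a small matrix inversion per fold, rather than a refit per fold.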

In [12]:
def cv_fast(X, y, k):
    n = len(y)
    m = n // k  # fold size (assumes k divides n)
    H = X @ np.linalg.inv(X.T @ X) @ X.T  # hat matrix, computed once
    I = np.eye(n)
    e = (I - H) @ y  # residuals of the full-sample fit
    S = 0
    for j in range(k):
        test = np.arange(j * m, (j + 1) * m, dtype=int)
        H_test = H[test, :][:, test]
        I_test = np.eye(len(test))
        # out-of-fold residuals for fold j, obtained without refitting
        S += np.linalg.norm(np.linalg.inv(I_test - H_test) @ e[test]) ** 2
    return S / n
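As a quick sanity check (a sketch with freshly generated data; X_chk and y_chk are names introduced here), the two functions should agree up to floating-point error:

X_chk = np.insert(np.random.randn(100, 5), 0, 1, axis=1)
y_chk = X_chk @ np.random.randn(6) + np.random.randn(100)
print(cv_linear(X_chk, y_chk, 10), cv_fast(X_chk, y_chk, 10))  # nearly identical values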
In [13]:
# Generating data
n = 1000
p = 5
beta = np.random.randn(p + 1)
X = np.random.randn(n, p)
X = np.insert(X, 0, 1, axis=1)
y = X @ beta + np.random.randn(n)
In [14]:
import time
In [15]:
# Comparing execution times
# uses the cv_linear function defined above
U_l = []
V_l = []
U_f = []
V_f = []
for k in range(2, n + 1):
    if n % k == 0:  # only those k that divide n evenly
        t1 = time.time()
        cv_linear(X, y, k)
        t2 = time.time()
        U_l.append(k)
        V_l.append(t2 - t1)

        t1 = time.time()
        cv_fast(X, y, k)
        t2 = time.time()
        U_f.append(k)
        V_f.append(t2 - t1)
plt.plot(U_l, V_l, c="red", label="cv_linear")
plt.plot(U_f, V_f, c="blue", label="cv_fast")
plt.legend()
plt.xlabel("k")
plt.ylabel("実行時間")
plt.title("cv_fastとcv_linearの比較")
plt.show()
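The gap widens with k because cv_linear solves a fresh least-squares problem for every fold, whereas cv_fast computes the hat matrix once and then only inverts a small (n/k) × (n/k) matrix per fold; at k = n (leave-one-out) those inversions are of 1 × 1 matrices.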

3.3 The Bootstrap

In [16]:
def bt(df, f, r):
    # Bootstrap: draw r resamples of the rows of df with replacement
    # and evaluate the statistic f(df, index) on each
    m = df.shape[0]
    org = f(df, np.arange(0, m, 1))  # the statistic on the original sample
    u = []
    for j in range(r):
        index = np.random.choice(m, m, replace=True)
        u.append(f(df, index))
    return {'original': org, 'bias': np.mean(u) - org, 'stderr': np.std(u, ddof=1)}
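A minimal usage sketch (toy data introduced here, not the book's): the bootstrap standard error of the sample mean of 100 i.i.d. N(0, 1) draws should come out near $1/\sqrt{100} = 0.1$:

z = np.random.randn(100).reshape(-1, 1)  # toy data, one column
print(bt(z, lambda data, index: np.mean(data[index, 0]), 1000))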
In [17]:
Portfolio = np.loadtxt("Portfolio.csv", delimiter=",", skiprows=1)

def func_1(data, index):
    X = data[index, 0]
    Y = data[index, 1]
    return ((np.var(Y, ddof=1) - np.cov(X, Y)[0, 1])
            / (np.var(X, ddof=1) + np.var(Y, ddof=1) - 2 * np.cov(X, Y)[0, 1]))

print(bt(Portfolio, func_1, 1000))
{'original': 0.5758320746483814, 'bias': 0.00012144294352134377, 'stderr': 0.09126002930178}
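Here func_1 estimates the variance-minimizing allocation $\alpha = \dfrac{\sigma_Y^2 - \sigma_{XY}}{\sigma_X^2 + \sigma_Y^2 - 2\sigma_{XY}}$ of a two-asset portfolio $\alpha X + (1 - \alpha)Y$; the output shows a negligible bias and a bootstrap standard error of about 0.09.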
In [18]:
# Reading crime.txt
df = np.loadtxt("crime.txt", delimiter="\t")

# Regressing column 0 on columns 2 and 3
reg = linear_model.LinearRegression()
X = df[:, 2:4]
y = df[:, 0]
reg.fit(X, y)
reg.coef_
Out[18]:
array([11.8583308 , -5.97341169])
In [19]:
for j in range(3):
    def func_2(data, index):
        X = data[index, 2:4]
        y = data[index, 0]
        reg.fit(X, y)
        # j = 0: intercept; j = 1, 2: the two slope coefficients
        return reg.intercept_ if j == 0 else reg.coef_[j - 1]

    print(bt(df, func_2, 1000))
{'original': 621.4260363802891, 'bias': 28.226570563414157, 'stderr': 218.18753270819937}
{'original': 11.858330796711092, 'bias': -0.774499087671785, 'stderr': 3.513675713220029}
{'original': -5.973411688164964, 'bias': -0.3258931651270611, 'stderr': 3.3842577856846328}
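These bootstrap standard errors can be compared with the classical standard errors reported by ordinary least squares below.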
In [20]:
import statsmodels.api as sm
In [21]:
n = X.shape[0]
X = np.insert(X, 0, 1, axis=1)
model = sm.OLS(y, X)
res = model.fit()
print(res.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.296
Method:                 Least Squares   F-statistic:                     11.30
Date:                Sun, 25 Feb 2024   Prob (F-statistic):           9.84e-05
Time:                        14:45:55   Log-Likelihood:                -344.79
No. Observations:                  50   AIC:                             695.6
Df Residuals:                      47   BIC:                             701.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        621.4260    222.685      2.791      0.008     173.441    1069.411
x1            11.8583      2.568      4.618      0.000       6.692      17.024
x2            -5.9734      3.561     -1.677      0.100     -13.138       1.191
==============================================================================
Omnibus:                       14.866   Durbin-Watson:                   1.581
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               16.549
Skew:                           1.202   Prob(JB):                     0.000255
Kurtosis:                       4.470   Cond. No.                         453.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
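The bootstrap standard errors (218.2, 3.51, 3.38) and the OLS ones (222.7, 2.57, 3.56) agree roughly, but the discrepancy for x1 suggests that the assumptions behind the OLS formulas (correctly specified, homoskedastic errors) may not hold exactly here, which is precisely the situation in which the bootstrap is useful.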