鈴木讓 (Joe Suzuki), 「統計的機械学習の数理 with Python 100問」 (共立出版 / Kyoritsu Shuppan)
!pip install japanize_matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#import japanize_matplotlib
import scipy
from scipy import stats
from numpy.random import randn
from sklearn.datasets import load_iris
from sklearn import linear_model
# On Anaconda, use the following instead (and comment out import japanize_matplotlib)
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
def cv_linear(X, y, K):
    n = len(y)
    m = int(n / K)  # fold size (assumes K divides n)
    S = 0
    for j in range(K):
        test = list(range(j * m, (j + 1) * m))   # indices of the test data
        train = list(set(range(n)) - set(test))  # indices of the training data
        beta = np.linalg.inv(X[train].T @ X[train]) @ X[train].T @ y[train]
        e = y[test] - X[test] @ beta
        S += np.linalg.norm(e) ** 2
    return S / n
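As a cross-check (an addition, not in the original notebook), the same score can be computed with scikit-learn; this is a minimal sketch assuming K divides n, with fit_intercept=False because X already carries a column of ones, and KFold's default contiguous, unshuffled splits matching the folds used by cv_linear.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

def cv_sklearn(X, y, K):
    # Mean squared CV error; with equal-sized folds, the mean of the
    # per-fold MSEs equals the overall MSE computed by cv_linear
    scores = cross_val_score(LinearRegression(fit_intercept=False), X, y,
                             cv=KFold(n_splits=K),
                             scoring="neg_mean_squared_error")
    return -scores.mean()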
# Evaluating linear regression by cross-validation
n = 100
p = 5
X = randn(n, p)
X = np.insert(X, 0, 1, axis=1)  # add an intercept column
beta = randn(p + 1)
beta[[1, 2]] = 0  # set some coefficients to 0
y = X @ beta + randn(n)
print(cv_linear(X[:, [0, 3, 4, 5]], y, 10))  # intercept + the three relevant variables
1.188747460207503
print(cv_linear(X, y, 10))
1.2422746229745045
# Simulating the effect of variable selection
U = []
V = []
for j in range(100):
    y = X @ beta + randn(n)
    U.append(cv_linear(X[:, [0, 3, 4, 5]], y, 10))
    V.append(cv_linear(X, y, 10))
x_seq = np.linspace(0.7, 1.5, 100)
y_seq = x_seq  # the line y = x
plt.plot(x_seq, y_seq, c="red")
plt.scatter(U, V)
plt.xlabel("Squared error when variables 4, 5, 6 are chosen")
plt.ylabel("Squared error when all variables are chosen")
plt.title("Overfitting from selecting too many variables")
plt.show()
n = 100
p = 5
plt.ylim(0.3, 1.5)
plt.xlabel("k")
plt.ylabel("CV value")
for j in range(2, 11):
    X = np.random.randn(n, p)
    X = np.insert(X, 0, 1, axis=1)
    beta = np.random.randn(p + 1)
    y = X @ beta + np.random.randn(n)
    U = []
    V = []
    for k in range(2, n + 1):
        if n % k == 0:
            U.append(k)
            V.append(cv_linear(X, y, k))
    plt.plot(U, V)
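One curve is drawn per simulated data set; only the divisors of n = 100 appear on the horizontal axis, and the right end, k = n, corresponds to leave-one-out CV.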
def knn_1(x, y, z, k):
    x = np.array(x)
    n, p = x.shape
    dis = np.zeros(n)
    for i in range(n):
        dis[i] = np.sum((z - x[i]) ** 2)
    S = np.argsort(dis)[:k]  # indices of the k nearest points
    u = np.bincount(y[S])    # frequency of each class label
    m = [i for i, v in enumerate(u) if v == max(u)]  # indices of the modal classes
    # Tie-breaking: if two or more classes attain the maximum frequency,
    # shrink k until the mode becomes unique
    while len(m) > 1:
        k -= 1
        S = S[:k]
        u = np.bincount(y[S])
        m = [i for i, v in enumerate(u) if v == max(u)]
    return m[0]
def knn(x, y, z, k):
    # Apply knn_1 to each row of z
    n = z.shape[0]
    w = np.zeros(n)
    for i in range(n):
        w[i] = knn_1(x, y, z[i], k)
    return w
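For reference (an addition, not from the book), scikit-learn's KNeighborsClassifier implements the same Euclidean nearest-neighbor vote; its tie-breaking differs from the k-shrinking rule above, so individual predictions may occasionally disagree.
from sklearn.neighbors import KNeighborsClassifier

def knn_sklearn(x, y, z, k):
    # k-NN majority vote with sklearn's own tie handling
    return KNeighborsClassifier(n_neighbors=k).fit(x, y).predict(z)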
# k-NN and its evaluation by cross-validation
iris = load_iris()
x = iris.data
y = iris.target
n = x.shape[0]
index = np.random.choice(n, n, replace=False)  # shuffle the rows
x = x[index]
y = y[index]
U = []
V = []
top_seq = list(range(0, 150, 15))  # fold starts: 10 folds of 15 samples
for k in range(1, 11):
    S = 0
    for top in top_seq:
        test = list(range(top, top + 15))
        train = list(set(range(150)) - set(test))
        knn_ans = knn(x[train], y[train], x[test], k=k)
        ans = y[test]
        S += np.sum(knn_ans != ans)
    S /= n
    U.append(k)
    V.append(S)
plt.plot(U, V)
plt.xlabel("K")
plt.ylabel("誤り率")
plt.title("CVによる誤り率の評価")
plt.show()
↑ The result varies quite a bit depending on the data (i.e., on the shuffle).
def cv_fast(X, y, k):
    n = len(y)
    m = int(n / k)  # fold size (assumes k divides n)
    H = X @ np.linalg.inv(X.T @ X) @ X.T  # hat matrix of the full data
    I = np.eye(n)  # identity matrix
    e = (I - H) @ y  # residuals from the single full-data fit
    S = 0
    for j in range(k):
        test = np.arange(j * m, (j + 1) * m, dtype=int)
        H_test = H[test, :][:, test]
        I_test = np.eye(len(test))  # identity matrix of the test-set size
        # CV residuals of this fold are (I - H_tt)^{-1} e_t, so the fold's
        # squared error is the squared norm of that vector
        r = np.linalg.solve(I_test - H_test, e[test])
        S += r @ r
    return S / n
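cv_fast rests on the identity y_test − X_test β̂(−test) = (I − H_test)⁻¹ e_test for each held-out block, so the whole K-fold score follows from a single least-squares fit. As a sanity check (an addition, assuming k divides n), it should reproduce cv_linear up to floating-point error:
np.random.seed(0)  # hypothetical toy data for the check
X0 = np.insert(np.random.randn(60, 3), 0, 1, axis=1)
y0 = X0 @ np.random.randn(4) + np.random.randn(60)
print(cv_linear(X0, y0, 10))  # the two printed values should coincide
print(cv_fast(X0, y0, 10))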
# Generating the data
n = 1000
p = 5
beta = np.random.randn(p + 1)
X = np.random.randn(n, p)
X = np.insert(X, 0, 1, axis=1)
y = X @ beta + np.random.randn(n)
import time
# Comparing execution times
# cv_linear is the function defined above
U_l = []
V_l = []
U_f = []
V_f = []
for k in range(2, n + 1):
    if n % k == 0:
        t1 = time.time()
        cv_linear(X, y, k)
        t2 = time.time()
        U_l.append(k)
        V_l.append(t2 - t1)
        t1 = time.time()
        cv_fast(X, y, k)
        t2 = time.time()
        U_f.append(k)
        V_f.append(t2 - t1)
plt.plot(U_l, V_l, c="red", label="cv_linear")
plt.plot(U_f, V_f, c="blue", label="cv_fast")
plt.legend()
plt.xlabel("k")
plt.ylabel("実行時間")
plt.title("cv_fastとcv_linearの比較")
plt.show()
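cv_linear refits the regression k times, so its cost grows with k, whereas cv_fast computes the hat matrix and the residuals once and only solves one (n/k) × (n/k) system per fold, which is why its curve should stay comparatively flat as k grows.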
def bt(df, f, r):
    # Bootstrap: report the statistic f on the original sample, together with
    # bootstrap estimates of its bias and standard error over r resamples
    m = df.shape[0]
    org = f(df, np.arange(0, m, 1))  # statistic on the original sample
    u = []
    for j in range(r):
        index = np.random.choice(m, m, replace=True)  # resample with replacement
        u.append(f(df, index))
    return {'original': org, 'bias': np.mean(u) - org, 'stderr': np.std(u, ddof=1)}
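A minimal check of bt (an addition, not from the book): for the sample mean, the bootstrap standard error should come out close to the usual estimate s/√m.
np.random.seed(0)  # hypothetical toy data for the check
toy = np.random.randn(200, 1)
print(bt(toy, lambda d, i: np.mean(d[i, 0]), 1000))
print(np.std(toy[:, 0], ddof=1) / np.sqrt(200))  # analytic stderr of the mean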
Portfolio = np.loadtxt("Portfolio.csv", delimiter=",", skiprows=1)  # two columns X, Y; header row skipped
def func_1(data, index):
    # Variance-minimizing investment fraction:
    # alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 cov(X, Y))
    X = data[index, 0]
    Y = data[index, 1]
    return (np.var(Y, ddof=1) - np.cov(X, Y)[0, 1]) / \
           (np.var(X, ddof=1) + np.var(Y, ddof=1) - 2 * np.cov(X, Y)[0, 1])
print(bt(Portfolio, func_1, 1000))
{'original': 0.5758320746483814, 'bias': 0.00012144294352134377, 'stderr': 0.09126002930178}
# Load crime.txt
df = np.loadtxt("crime.txt", delimiter="\t")
# Set up the linear regression model
reg = linear_model.LinearRegression()
X = df[:, 2:4]
y = df[:, 0]
reg.fit(X, y)
reg.coef_
array([11.8583308 , -5.97341169])
for j in range(3):
    def func_2(data, index):
        # j = 0: intercept; j = 1, 2: slope coefficients
        X = data[index, 2:4]
        y = data[index, 0]
        reg.fit(X, y)
        return reg.intercept_ if j == 0 else reg.coef_[j - 1]
    print(bt(df, func_2, 1000))
{'original': 621.4260363802891, 'bias': 28.226570563414157, 'stderr': 218.18753270819937}
{'original': 11.858330796711092, 'bias': -0.774499087671785, 'stderr': 3.513675713220029}
{'original': -5.973411688164964, 'bias': -0.3258931651270611, 'stderr': 3.3842577856846328}
import statsmodels.api as sm
n = X.shape[0]
X = np.insert(X, 0, 1, axis=1)
model = sm.OLS(y, X)
res = model.fit()
print(res.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.296
Method:                 Least Squares   F-statistic:                     11.30
Date:                Sun, 25 Feb 2024   Prob (F-statistic):           9.84e-05
Time:                        14:45:55   Log-Likelihood:                -344.79
No. Observations:                  50   AIC:                             695.6
Df Residuals:                      47   BIC:                             701.3
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        621.4260    222.685      2.791      0.008     173.441    1069.411
x1            11.8583      2.568      4.618      0.000       6.692      17.024
x2            -5.9734      3.561     -1.677      0.100     -13.138       1.191
==============================================================================
Omnibus:                       14.866   Durbin-Watson:                   1.581
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               16.549
Skew:                           1.202   Prob(JB):                     0.000255
Kurtosis:                       4.470   Cond. No.                         453.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
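Note that the bootstrap standard errors obtained above (about 218, 3.5, and 3.4) are of the same order as the OLS standard errors in this summary (222.685, 2.568, 3.561); the latter rely on the covariance assumption stated in the notes, whereas the bootstrap does not.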