import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from numpy.random import randn
# import japanize_matplotlib
%matplotlib inline
# Anacondaの場合は下記
import matplotlib
from matplotlib import font_manager
matplotlib.rc("font", family="BIZ UDGothic")
def min_sq(x, y): # 最小二乗法の切片と傾きを求める関数
x_bar, y_bar = np.mean(x), np.mean(y)
beta_1 = np.dot(x-x_bar, y-y_bar)/np.linalg.norm(x-x_bar)**2
beta_0 = y_bar-beta_1*x_bar
return [beta_1, beta_0]
N = 100 # サンプルサイズをNとする
a = randn(1) + 2 # 傾き
b = randn(1) # 切片
x = randn(N)
y = a*x + b + randn(N) # ここまで人工データの生成
a1, b1 = min_sq(x, y) # 回帰係数・切片
xx = x - np.mean(x)
yy = y - np.mean(y) # 中心化
a2, b2 = min_sq(xx, yy) # 中心化後の回帰係数・切片
x_seq = np.arange(-5, 5, 0.1)
y_pre = x_seq*a1 + b1
yy_pre = x_seq*a2 + b2
plt.scatter(x, y, c="black")
plt.axhline(y=0, c="black", linewidth=0.5)
plt.axvline(x=0, c="black", linewidth=0.5)
plt.plot(x_seq, y_pre, c="blue", label=" 中心化前")
plt.plot(x_seq, yy_pre, c="orange", label=" 中心化後")
plt.legend(loc="upper left")
N = 100
x = randn(N)
y = randn(N)
beta_1, beta_0 = min_sq(x, y)
RSS = np.linalg.norm(y-beta_0-beta_1*x)**2
RSE = np.sqrt(RSS/(N-1-1))
B_0 = (np.linalg.norm(x)**2/N) / np.linalg.norm(x-np.mean(x))**2
B_1 = 1 / np.linalg.norm(x-np.mean(x))**2
se_0 = RSE*np.sqrt(B_0)
se_1 = RSE*np.sqrt(B_1)
t_0 = beta_0 / se_0
t_1 = beta_1 / se_1
p_0 = 2 * (1-stats.t.cdf(np.abs(t_0), N-2))
p_1 = 2 * (1-stats.t.cdf(np.abs(t_1), N-2))
beta_0, se_0, t_0, p_0 # 切片
beta_1, se_1, t_1, p_1 # 回帰係数
from sklearn import linear_model
reg = linear_model.LinearRegression()
x = x.reshape(-1, 1)
# sklearnでは配列のサイズを明示する必要がある
y = y.reshape(-1, 1)
# 片方の次元を設定し, もう片方を-1にすると自動でしてくれる
reg.fit(x, y) # 実行
reg.coef_, reg.intercept_ # 回帰係数 beta_1, 切片 beta_0
import statsmodels.api as sm
X = np.insert(x, 0, 1, axis=1)
model = sm.OLS(y, X)
res = model.fit()
print(res.summary())
N = 100
r = 1000
T = []
for i in range(r):
x = randn(N)
y = randn(N)
beta_1, beta_0 = min_sq(x, y)
pre_y = beta_0 + beta_1*x # yの予測値
RSS = np.linalg.norm(y-pre_y) ** 2
RSE = np.sqrt(RSS/(N-1-1))
B_1 = 1 / np.linalg.norm(x-np.mean(x))**2
se_1 = RSE*np.sqrt(B_1)
T.append(beta_1/se_1)
plt.hist(T, bins=20, range=(-3, 3), density=True)
x = np.linspace(-4, 4, 400)
plt.plot(x, stats.t.pdf(x, 98))
plt.title(" 帰無仮説が成立する場合 ")
plt.xlabel(" tの値 ")
plt.ylabel(" 確率密度 ")
def R2(x, y):
n = x.shape[0]
xx = np.insert(x, 0, 1, axis=1)
beta = np.linalg.inv(xx.T@xx)@xx.T@y
y_hat = np.dot(xx, beta)
y_bar = np.mean(y)
RSS = np.linalg.norm(y - y_hat) ** 2
TSS = np.linalg.norm(y - y_bar) ** 2
return 1 - RSS/TSS
N = 100
m = 2
x = randn(N, m)
y = randn(N)
R2(x, y)
# 1変量ならば, 決定係数は相関係数の2乗
x = randn(N, 1)
y = randn(N)
R2(x, y)
X = np.loadtxt("boston.txt", delimiter="\t")
def VIF(x):
p = x.shape[1]
values = []
for j in range(p):
S = list(set(range(p))-{j})
values.append(1/(1 - R2(x[:, S], x[:, j])))
return values
VIF(X)
N = 100
p = 1
X = randn(N, p)
X = np.insert(X, 0, 1, axis=1)
beta = np.array([1, 1])
epsilon = randn(N)
y = X@beta + epsilon
# 関数f(x),g(x)を定義
U = np.linalg.inv(X.T@X)
beta_hat = U@X.T@y
RSS = np.linalg.norm(y - X@beta_hat) ** 2
RSE = np.sqrt(RSS / (N-p-1))
alpha = 0.05
def f(x):
x = np.array([1, x])
# stats.t.ppf(0.975, df=N-p-1)
# 累積確率が1-alpha/2となる点
range = stats.t.ppf(0.975, df=N-p-1) * RSE * np.sqrt(x@U@x.T)
lower = x@beta_hat - range
upper = x@beta_hat + range
return ([lower, upper])
def g(x):
x = np.array([1, x])
# stats.t.ppf(0.975,df=N-p-1) # 累積確率が1-alpha/2となる点
range = stats.t.ppf(0.975, df=N-p-1) * RSE * np.sqrt(1+x@U@x.T)
lower = x@beta_hat - range
upper = x@beta_hat + range
return ([lower, upper])
# 例
stats.t.ppf(0.975, df=1) # 確率pに対応する点
x_seq = np.arange(-10, 10, 0.1)
# 信頼区間
lower_seq1 = []
upper_seq1 = []
for i in range(len(x_seq)):
lower_seq1.append(f(x_seq[i])[0])
upper_seq1.append(f(x_seq[i])[1])
# 予測区間
lower_seq2 = []
upper_seq2 = []
for i in range(len(x_seq)):
lower_seq2.append(g(x_seq[i])[0])
upper_seq2.append(g(x_seq[i])[1])
yy = beta_hat[0] + beta_hat[1]*x_seq
plt.xlim(np.min(x_seq), np.max(x_seq))
plt.ylim(np.min(lower_seq1), np.max(upper_seq1))
plt.plot(x_seq, yy, c="black")
plt.plot(x_seq, lower_seq1, c="blue")
plt.plot(x_seq, upper_seq1, c="red")
plt.plot(x_seq, lower_seq2, c="blue", linestyle="dashed")
plt.plot(x_seq, upper_seq2, c="red", linestyle="dashed")
plt.xlabel("x")
plt.ylabel("y")