Chapter 4 Resampling
In [ ]:
import time

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from numpy.random import randn
from sklearn import linear_model
from sklearn.datasets import load_iris
In [ ]:
def cv_linear(X, y, K):
    n = len(y)
    m = int(n/K)                               # Fold size (assumes K divides n)
    S = 0
    for j in range(K):
        test = list(range(j*m, (j+1)*m))       # Indices of test data
        train = list(set(range(n))-set(test))  # Indices of training data
        beta = np.linalg.inv(X[train, ].T@X[train, ])@X[train, ].T@y[train]
        e = y[test]-X[test, ]@beta             # Test residuals for this fold
        S = S+np.linalg.norm(e)**2
    return S/n
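Setting K equal to the sample size gives leave-one-out cross-validation as a special case. A minimal sketch on synthetic data (the names X_demo and y_demo are ours, chosen for illustration):
In [ ]:
# LOOCV as the special case K = n (sketch)
np.random.seed(0)
X_demo = np.insert(randn(20, 2), 0, 1, axis=1)
y_demo = X_demo@randn(3)+0.1*randn(20)
cv_linear(X_demo, y_demo, 20)   # each fold holds out exactly one observation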
In [ ]:
def cv_fast(X, y, k):
    n = len(y)
    m = n // k                          # Fold size (assumes k divides n)
    H = X@np.linalg.inv(X.T@X)@X.T      # Hat matrix, computed once
    e = (np.eye(n)-H)@y                 # Residuals of the fit on all n points
    I_m = np.eye(m)
    S = 0
    for j in range(k):
        test = np.arange(j*m, (j+1)*m)  # Indices of test data
        # H[np.ix_(test, test)] is the m x m submatrix of H for this fold
        # (indexing H[test, test] would extract only its diagonal)
        A = np.linalg.inv(I_m-H[np.ix_(test, test)])@e[test]
        S = S+A.T@A                     # = ||(I - H_SS)^{-1} e_S||^2
    return S/n
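cv_fast rests on the identity y_S - X_S beta_{-S} = (I - H_SS)^{-1} e_S, where e = (I - H)y are the residuals of the full-data fit and H_SS is the submatrix of the hat matrix for fold S: every fold reuses the single fit instead of refitting. As a quick numerical check (a sketch; the seed and sizes are arbitrary), the two functions should agree up to rounding:
In [ ]:
# Sanity check (sketch): the shortcut must reproduce cv_linear
np.random.seed(0)
X_chk = np.insert(randn(60, 3), 0, 1, axis=1)
y_chk = X_chk@randn(4)+0.1*randn(60)
print(cv_linear(X_chk, y_chk, 10))
print(cv_fast(X_chk, y_chk, 10))   # should match up to floating-point error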
In [ ]:
n = 1000
p = 5
X = np.insert(randn(n, p), 0, 1, axis=1)
beta = randn(p+1).reshape(-1, 1)
y = X@beta+0.2*randn(n).reshape(-1, 1)
y = y[:, 0]
U_l = []
V_l = []
U_f = []
V_f = []
for k in range(2, n+1):
    if n % k == 0:              # Use only values of k that divide n
        t1 = time.time()        # Time before processing
        cv_linear(X, y, k)
        t2 = time.time()        # Time after processing
        U_l.append(k)
        V_l.append(t2-t1)
        t1 = time.time()
        cv_fast(X, y, k)
        t2 = time.time()
        U_f.append(k)
        V_f.append(t2-t1)
plt.plot(U_l, V_l, c="red", label="cv_linear")
plt.plot(U_f, V_f, c="blue", label="cv_fast")
plt.legend()
plt.xlabel("k")
plt.ylabel("Execution time")
plt.title("Comparison of cv_fast and cv_linear")
Out[ ]:
Text(0.5, 1.0, 'Comparison of cv_fast and cv_linear')
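The gap in the plot is expected: cv_linear refits the regression k times, so its cost grows roughly linearly in k, while cv_fast computes the hat matrix once and then only inverts k small blocks of size n/k, which gets cheaper as k grows.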
In [ ]:
n = 100
p = 5
plt.ylim(0.3, 1.5)
plt.xlabel("k")
plt.ylabel("CV value")
for j in range(2, 11):
    X = randn(n, p)
    X = np.insert(X, 0, 1, axis=1)
    beta = randn(p+1)
    y = X@beta+randn(n)
    U = []
    V = []
    for k in range(2, n+1):
        if n % k == 0:
            U.append(k)
            V.append(cv_linear(X, y, k))
    plt.plot(U, V)
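For reference, the same CV value can be obtained with scikit-learn's KFold splitter, which with shuffling disabled produces the same contiguous folds as cv_linear when K divides n (a sketch; cv_sklearn is our name):
In [ ]:
from sklearn.model_selection import KFold

def cv_sklearn(X, y, K):
    # Same contiguous folds as cv_linear when K divides n (shuffle is off)
    S = 0
    for train, test in KFold(n_splits=K).split(X):
        beta = np.linalg.lstsq(X[train], y[train], rcond=None)[0]
        S = S+np.linalg.norm(y[test]-X[test]@beta)**2
    return S/len(y)
# e.g. cv_sklearn(X, y, 10) should agree with cv_linear(X, y, 10)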
In [ ]:
def knn_1(x, y, z, k):  # Reprint
    x = np.array(x)
    n, p = x.shape
    dis = np.zeros(n)
    for i in range(n):
        dis[i] = (z-x[i, ]).T@(z-x[i, ])           # Squared distance to z
    S = np.argsort(dis)[0:k]                       # Indices of k nearest points
    u = np.bincount(y[S])                          # Count frequencies
    m = [i for i, c in enumerate(u) if c == max(u)]  # Indices of mode
    # Tie-breaking process (if there are multiple modes)
    while len(m) > 1:
        k = k-1
        S = S[0:k]
        u = np.bincount(y[S])
        m = [i for i, c in enumerate(u) if c == max(u)]  # Indices of mode
    return m[0]
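A toy call (a sketch with made-up points) illustrates the interface: the query point lies inside the cluster of class 0, so knn_1 should return 0.
In [ ]:
# Toy example (sketch): two well-separated clusters, query near class 0
x_toy = np.array([[0, 0], [1, 0], [0, 1], [5, 5], [6, 5]])
y_toy = np.array([0, 0, 0, 1, 1])
knn_1(x_toy, y_toy, np.array([0.2, 0.1]), k=3)   # expected output: 0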
In [ ]:
def knn(x, y, z, k):
    n = z.shape[0]
    w = np.zeros(n)
    for i in range(n):
        w[i] = knn_1(x, y, z[i, ], k)
    return w

iris = load_iris()
iris.target_names
x = iris.data
y = iris.target
n = x.shape[0]
index = np.random.choice(n, n, replace=False)  # Shuffle
x = x[index, ]
y = y[index]
U = []
V = []
top_seq = list(range(0, 150, 15))              # 10 folds of 15 observations
for k in range(1, 11):
    S = 0
    for top in top_seq:
        test = list(range(top, top+15))
        train = list(set(range(150))-set(test))
        knn_ans = knn(x[train, ], y[train], x[test, ], k=k)
        ans = y[test]
        S = S+np.sum(knn_ans != ans)           # Misclassifications in this fold
    S = S/n
    U.append(k)
    V.append(S)
plt.plot(U, V)
plt.xlabel("K")
plt.ylabel("Error rate")
plt.title("Evaluation of error rate by CV")
Out[ ]:
Text(0.5, 1.0, 'Evaluation of error rate by CV')
In [ ]:
def bt(df, f, r):
    m = df.shape[0]
    org = f(df, np.arange(m))          # Statistic on the original sample
    u = []
    for j in range(r):
        index = np.random.choice(m, m, replace=True)  # Resample with replacement
        u.append(f(df, index))
    return {'original': org, 'bias': np.mean(u)-org,
            'stderr': np.std(u, ddof=1)}
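Before applying bt to real data, a quick plausibility check (a sketch; toy and f_mean are our names): for the sample mean, the bootstrap standard error should come out close to the classical formula s/sqrt(m).
In [ ]:
# Plausibility check (sketch): bootstrap stderr of the mean vs. s/sqrt(m)
np.random.seed(1)
toy = randn(100, 1)

def f_mean(data, index):
    return np.mean(data[index, 0])

print(bt(toy, f_mean, 1000))
print(np.std(toy[:, 0], ddof=1)/np.sqrt(100))   # classical standard error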
In [ ]:
def func_1(data, index):
    X = data[index, 0]
    Y = data[index, 1]
    # Note: [0, 1] picks the covariance out of the 2 x 2 matrix that
    # np.cov returns; without it the statistic would be a matrix
    return ((np.var(Y, ddof=1)-np.cov(X, Y)[0, 1])
            / (np.var(X, ddof=1)+np.var(Y, ddof=1)-2*np.cov(X, Y)[0, 1]))

Portfolio = np.loadtxt("Portfolio.csv", delimiter=",", skiprows=1)
bt(Portfolio, func_1, 1000)
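The statistic computed by func_1 is the variance-minimizing allocation between two assets, alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 cov(X, Y)). The point of the bootstrap here is to attach a standard error to the estimate of alpha, for which no simple closed-form expression is available.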
In [ ]:
df = np.loadtxt("crime.txt", delimiter="\t")
reg = linear_model.LinearRegression()
X = df[:, 2:4]
y = df[:, 0]
reg.fit(X, y)
reg.coef_
for j in range(3):
    def func_2(data, index):
        X = data[index, 2:4]
        y = data[index, 0]
        reg.fit(X, y)
        if j == 0:
            return reg.intercept_
        else:
            return reg.coef_[j-1]
    print(bt(df, func_2, 1000))
n = X.shape[0]
X = np.insert(X, 0, 1, axis=1)
model = sm.OLS(y, X)
res = model.fit()
print(res.summary())
{'original': 621.4260363802889, 'bias': 32.57770500564709, 'stderr': 217.39552106861717}
{'original': 11.858330796711092, 'bias': -0.594184885663191, 'stderr': 3.552971537466119}
{'original': -5.97341168816496, 'bias': -0.4252908416704404, 'stderr': 3.19690795881968}
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.296
Method:                 Least Squares   F-statistic:                     11.30
Date:                Tue, 04 Jun 2024   Prob (F-statistic):           9.84e-05
Time:                        12:11:58   Log-Likelihood:                -344.79
No. Observations:                  50   AIC:                             695.6
Df Residuals:                      47   BIC:                             701.3
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        621.4260    222.685      2.791      0.008     173.441    1069.411
x1            11.8583      2.568      4.618      0.000       6.692      17.024
x2            -5.9734      3.561     -1.677      0.100     -13.138       1.191
==============================================================================
Omnibus:                       14.866   Durbin-Watson:                   1.581
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               16.549
Skew:                           1.202   Prob(JB):                     0.000255
Kurtosis:                       4.470   Cond. No.                         453.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
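The bootstrap standard errors (roughly 217, 3.6, and 3.2) are broadly consistent with the OLS std err column (222.7, 2.568, 3.561). Some discrepancy is expected: the bootstrap does not rely on the homoskedastic linear-model assumptions behind the OLS formulas.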