Chapter 4 Resampling
In [ ]:
import time

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from numpy.random import randn
from sklearn import linear_model
from sklearn.datasets import load_iris
In [ ]:
def cv_linear(X, y, K):
    n = len(y)
    m = int(n/K)                               # Fold size (assumes K divides n)
    S = 0
    for j in range(K):
        test = list(range(j*m, (j+1)*m))       # Indices of test data
        train = list(set(range(n))-set(test))  # Indices of training data
        beta = np.linalg.inv(X[train, ].T@X[train, ])@X[train, ].T@y[train]
        e = y[test]-X[test, ]@beta             # Test residuals for this fold
        S = S+np.linalg.norm(e)**2
    return S/n
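Setting K equal to the sample size gives leave-one-out cross-validation as a special case. A minimal sketch on synthetic data (the names X_demo and y_demo are ours, chosen for illustration):
In [ ]:
# LOOCV as the special case K = n (sketch)
np.random.seed(0)
X_demo = np.insert(randn(20, 2), 0, 1, axis=1)
y_demo = X_demo@randn(3)+0.1*randn(20)
cv_linear(X_demo, y_demo, 20)   # each fold holds out exactly one observation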
In [ ]:
def cv_fast(X, y, k):
    n = len(y)
    m = n // k                          # Fold size (assumes k divides n)
    H = X@np.linalg.inv(X.T@X)@X.T      # Hat matrix, computed once
    e = (np.eye(n)-H)@y                 # Residuals of the fit on all n points
    I_m = np.eye(m)
    S = 0
    for j in range(k):
        test = np.arange(j*m, (j+1)*m)  # Indices of test data
        # H[np.ix_(test, test)] is the m x m submatrix of H for this fold
        # (indexing H[test, test] would extract only its diagonal)
        A = np.linalg.inv(I_m-H[np.ix_(test, test)])@e[test]
        S = S+A.T@A                     # = ||(I - H_SS)^{-1} e_S||^2
    return S/n
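cv_fast rests on the identity y_S - X_S beta_{-S} = (I - H_SS)^{-1} e_S, where e = (I - H)y are the residuals of the full-data fit and H_SS is the submatrix of the hat matrix for fold S: every fold reuses the single fit instead of refitting. As a quick numerical check (a sketch; the seed and sizes are arbitrary), the two functions should agree up to rounding:
In [ ]:
# Sanity check (sketch): the shortcut must reproduce cv_linear
np.random.seed(0)
X_chk = np.insert(randn(60, 3), 0, 1, axis=1)
y_chk = X_chk@randn(4)+0.1*randn(60)
print(cv_linear(X_chk, y_chk, 10))
print(cv_fast(X_chk, y_chk, 10))   # should match up to floating-point error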
In [ ]:
n = 1000
p = 5
X = np.insert(randn(n, p), 0, 1, axis=1)
beta = randn(p+1).reshape(-1, 1)
y = X@beta+0.2*randn(n).reshape(-1, 1)
y = y[:, 0]
U_l = []
V_l = []
U_f = []
V_f = []
for k in range(2, n+1):
    if n % k == 0:              # Use only values of k that divide n
        t1 = time.time()        # Time before processing
        cv_linear(X, y, k)
        t2 = time.time()        # Time after processing
        U_l.append(k)
        V_l.append(t2-t1)
        t1 = time.time()
        cv_fast(X, y, k)
        t2 = time.time()
        U_f.append(k)
        V_f.append(t2-t1)
plt.plot(U_l, V_l, c="red", label="cv_linear")
plt.plot(U_f, V_f, c="blue", label="cv_fast")
plt.legend()
plt.xlabel("k")
plt.ylabel("Execution time")
plt.title("Comparison of cv_fast and cv_linear")
Out[ ]:
Text(0.5, 1.0, 'Comparison of cv_fast and cv_linear')
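The gap in the plot is expected: cv_linear refits the regression k times, so its cost grows roughly linearly in k, while cv_fast computes the hat matrix once and then only inverts k small blocks of size n/k, which gets cheaper as k grows.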
In [ ]:
n = 100
p = 5
plt.ylim(0.3, 1.5)
plt.xlabel("k")
plt.ylabel("CV value")
for j in range(2, 11):
    X = randn(n, p)
    X = np.insert(X, 0, 1, axis=1)
    beta = randn(p+1)
    y = X@beta+randn(n)
    U = []
    V = []
    for k in range(2, n+1):
        if n % k == 0:
            U.append(k)
            V.append(cv_linear(X, y, k))
    plt.plot(U, V)
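For reference, the same CV value can be obtained with scikit-learn's KFold splitter, which with shuffling disabled produces the same contiguous folds as cv_linear when K divides n (a sketch; cv_sklearn is our name):
In [ ]:
from sklearn.model_selection import KFold

def cv_sklearn(X, y, K):
    # Same contiguous folds as cv_linear when K divides n (shuffle is off)
    S = 0
    for train, test in KFold(n_splits=K).split(X):
        beta = np.linalg.lstsq(X[train], y[train], rcond=None)[0]
        S = S+np.linalg.norm(y[test]-X[test]@beta)**2
    return S/len(y)
# e.g. cv_sklearn(X, y, 10) should agree with cv_linear(X, y, 10)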
In [ ]:
def knn_1(x, y, z, k):  # Reprint
    x = np.array(x)
    n, p = x.shape
    dis = np.zeros(n)
    for i in range(n):
        dis[i] = (z-x[i, ]).T@(z-x[i, ])           # Squared distance to z
    S = np.argsort(dis)[0:k]                       # Indices of k nearest points
    u = np.bincount(y[S])                          # Count frequencies
    m = [i for i, c in enumerate(u) if c == max(u)]  # Indices of mode
    # Tie-breaking process (if there are multiple modes)
    while len(m) > 1:
        k = k-1
        S = S[0:k]
        u = np.bincount(y[S])
        m = [i for i, c in enumerate(u) if c == max(u)]  # Indices of mode
    return m[0]
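A toy call (a sketch with made-up points) illustrates the interface: the query point lies inside the cluster of class 0, so knn_1 should return 0.
In [ ]:
# Toy example (sketch): two well-separated clusters, query near class 0
x_toy = np.array([[0, 0], [1, 0], [0, 1], [5, 5], [6, 5]])
y_toy = np.array([0, 0, 0, 1, 1])
knn_1(x_toy, y_toy, np.array([0.2, 0.1]), k=3)   # expected output: 0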
In [ ]:
def knn(x, y, z, k):
    n = z.shape[0]
    w = np.zeros(n)
    for i in range(n):
        w[i] = knn_1(x, y, z[i, ], k)
    return w

iris = load_iris()
iris.target_names
x = iris.data
y = iris.target
n = x.shape[0]
index = np.random.choice(n, n, replace=False)  # Shuffle
x = x[index, ]
y = y[index]
U = []
V = []
top_seq = list(range(0, 150, 15))              # 10 folds of 15 observations
for k in range(1, 11):
    S = 0
    for top in top_seq:
        test = list(range(top, top+15))
        train = list(set(range(150))-set(test))
        knn_ans = knn(x[train, ], y[train], x[test, ], k=k)
        ans = y[test]
        S = S+np.sum(knn_ans != ans)           # Misclassifications in this fold
    S = S/n
    U.append(k)
    V.append(S)
plt.plot(U, V)
plt.xlabel("K")
plt.ylabel("Error rate")
plt.title("Evaluation of error rate by CV")
Out[ ]:
Text(0.5, 1.0, 'Evaluation of error rate by CV')
In [ ]:
def bt(df, f, r):
    m = df.shape[0]
    org = f(df, np.arange(m))          # Statistic on the original sample
    u = []
    for j in range(r):
        index = np.random.choice(m, m, replace=True)  # Resample with replacement
        u.append(f(df, index))
    return {'original': org, 'bias': np.mean(u)-org,
            'stderr': np.std(u, ddof=1)}
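Before applying bt to real data, a quick plausibility check (a sketch; toy and f_mean are our names): for the sample mean, the bootstrap standard error should come out close to the classical formula s/sqrt(m).
In [ ]:
# Plausibility check (sketch): bootstrap stderr of the mean vs. s/sqrt(m)
np.random.seed(1)
toy = randn(100, 1)

def f_mean(data, index):
    return np.mean(data[index, 0])

print(bt(toy, f_mean, 1000))
print(np.std(toy[:, 0], ddof=1)/np.sqrt(100))   # classical standard error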
In [ ]:
def func_1(data, index):
    X = data[index, 0]
    Y = data[index, 1]
    # Note: [0, 1] picks the covariance out of the 2 x 2 matrix that
    # np.cov returns; without it the statistic would be a matrix
    return ((np.var(Y, ddof=1)-np.cov(X, Y)[0, 1])
            / (np.var(X, ddof=1)+np.var(Y, ddof=1)-2*np.cov(X, Y)[0, 1]))

Portfolio = np.loadtxt("Portfolio.csv", delimiter=",", skiprows=1)
bt(Portfolio, func_1, 1000)
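The statistic computed by func_1 is the variance-minimizing allocation between two assets, alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 cov(X, Y)). The point of the bootstrap here is to attach a standard error to the estimate of alpha, for which no simple closed-form expression is available.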
In [ ]:
df = np.loadtxt("crime.txt", delimiter="\t")
reg = linear_model.LinearRegression()
X = df[:, 2:4]
y = df[:, 0]
reg.fit(X, y)
reg.coef_
for j in range(3):
    def func_2(data, index):
        X = data[index, 2:4]
        y = data[index, 0]
        reg.fit(X, y)
        if j == 0:
            return reg.intercept_
        else:
            return reg.coef_[j-1]
    print(bt(df, func_2, 1000))
n = X.shape[0]
X = np.insert(X, 0, 1, axis=1)
model = sm.OLS(y, X)
res = model.fit()
print(res.summary())
{'original': 621.4260363802889, 'bias': 32.57770500564709, 'stderr': 217.39552106861717}
{'original': 11.858330796711092, 'bias': -0.594184885663191, 'stderr': 3.552971537466119}
{'original': -5.97341168816496, 'bias': -0.4252908416704404, 'stderr': 3.19690795881968}
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.296
Method:                 Least Squares   F-statistic:                     11.30
Date:                Tue, 04 Jun 2024   Prob (F-statistic):           9.84e-05
Time:                        12:11:58   Log-Likelihood:                -344.79
No. Observations:                  50   AIC:                             695.6
Df Residuals:                      47   BIC:                             701.3
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        621.4260    222.685      2.791      0.008     173.441    1069.411
x1            11.8583      2.568      4.618      0.000       6.692      17.024
x2            -5.9734      3.561     -1.677      0.100     -13.138       1.191
==============================================================================
Omnibus:                       14.866   Durbin-Watson:                   1.581
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               16.549
Skew:                           1.202   Prob(JB):                     0.000255
Kurtosis:                       4.470   Cond. No.                         453.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
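The bootstrap standard errors (roughly 217, 3.6, and 3.2) are broadly consistent with the OLS std err column (222.7, 2.568, 3.561). Some discrepancy is expected: the bootstrap does not rely on the homoskedastic linear-model assumptions behind the OLS formulas.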