Chapter 4 Resampling

4.1 Cross-Validation

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
from scipy import stats
from numpy.random import randn
In [21]:
def cv_linear(X,y,K):
    n=len(y);m=int(n/K)
    S=0
    for j in range(K):
        test=list(range(j*m,(j+1)*m)) # Indexes of test data 
        train=list(set(range(n))-set(test))     # Indexes of training data
        beta=np.linalg.inv(X[train,].T@X[train,])@X[train,].T@y[train]
        e=y[test]-X[test,]@beta
        S=S+np.linalg.norm(e)**2
    return S/n
In [22]:
n=100;p=5
X=randn(n,p)
X=np.insert(X, 0, 1, axis=1)
In [23]:
beta=randn(p+1)
beta[[1,2]]=0
y=X@beta+randn(n)
In [24]:
cv_linear(X[:,[0,3,4,5]],y,10)
Out[24]:
0.9480948826645423
In [25]:
cv_linear(X,y,10)
Out[25]:
0.9677344631743608
In [26]:
n=100;p=5
X=randn(n,p)
X=np.insert(X, 0, 1, axis=1)
beta=randn(p+1);beta[[1,2]]=0
In [27]:
U=[]; V=[] 
for j in range(100):
    y=X@beta+randn(n)
    U.append(cv_linear(X[:,[0,3,4,5]],y,10))
    V.append(cv_linear(X,y,10))
In [29]:
x_seq=np.linspace(0.7,1.5,100)
y=x_seq
plt.plot(x_seq,y,c="red")
plt.scatter(U,V)
plt.xlabel(" Sq err when variables 4,5,6 are chosen")
plt.ylabel(" Sq err when all are chosen")
plt.title(" Over-fitting due to choosing too many")
Out[29]:
Text(0.5, 1.0, ' Over-fitting due to choosing too many')
In [11]:
n=100;p=5
plt.ylim(0.3,1.5)
plt.xlabel("k")
plt.ylabel("CV value")
for j in range(2,11,1):
    X=randn(n,p)
    X=np.insert(X, 0, 1, axis=1)
    beta=randn(p+1)
    y=X@beta+randn(n)
    U = []; V = []
    for k in range(2,n+1,1):
        if n%k==0:
            U.append(k); V.append(cv_linear(X,y,k))
    plt.plot(U,V)        
In [14]:
def knn_1(x,y,z,k):   # this function appeared in Chapter 3
    x=np.array(x)
    n,p=x.shape
    dis=np.zeros(n)
    for i in range(n):
        dis[i]=(z-x[i,]).T@(z-x[i,])
    S=np.argsort(dis)[0:k]  
    u=np.bincount(y[S])     
    m = [i for i, x in enumerate(u) if x == max(u)] 
    # tie-breaking
    while (len(m)>1):
        k=k-1
        S=S[0:k]
        u=np.bincount(y[S])
        m = [i for i, x in enumerate(u) if x == max(u)] 
    return m[0]
def knn(x,y,z,k):
    n=z.shape[0]
    w=np.zeros(n)
    for i in range(n):
        w[i]=knn_1(x,y,z[i,],k)
    return w
In [38]:
from sklearn.datasets import load_iris
iris = load_iris()
iris.target_names
x=iris.data
y=iris.target
In [39]:
n=x.shape[0]
index=np.random.choice(n, n, replace=False) # The order is randomly chosen
x=x[index,]
y=y[index]
In [40]:
U = []
V = []
top_seq=list(range(0,150,15))    
for k in range(1,11,1):
    S=0
    for top in top_seq:
        test=list(range(top,top+15))
        train=list(set(range(150))-set(test))
        knn_ans=knn(x[train,],y[train],x[test,],k=k)
        ans=y[test]
        S=S+np.sum(knn_ans!=ans)
    S=S/n
    U.append(k)
    V.append(S)
plt.plot(U,V)
plt.xlabel("K")
plt.ylabel(" Err Rate")
plt.title("Evaluation of the error rate via CV")
Out[40]:
Text(0.5, 1.0, 'Evaluation of the error rate via CV')

4.2 CV fast

In [41]:
def cv_fast(X,y,k):
    n=len(y)
    m=n/k
    H=X@np.linalg.inv(X.T@X)@X.T
    I=np.diag(np.repeat(1,n))
    e=(I-H)@y
    I=np.diag(np.repeat(1,m))
    S=0
    for j in range(k):
        test=np.arange(j*m,(j+1)*m,1,dtype=int)
        S=S+(np.linalg.inv(I-H[test,:][:,test])@e[test]).T@np.linalg.inv(I-H[test,test])@e[test]
    return S/n
In [42]:
cv_fast(x,y,10)
Out[42]:
0.05256865629641018
In [43]:
n=1000;p=5
beta=randn(p+1)
x=randn(n,p)
X=np.insert(x, 0, 1, axis=1)
In [44]:
y=X@beta+randn(n)
In [45]:
import time
In [46]:
U_l=[]; V_l=[]; U_f=[]; V_f=[]
for k in range(2,n+1,1):
    if n%k==0:
        t1=time.time() # Time before processing
        cv_linear(X,y,k)
        t2=time.time() # Time after processing
        U_l.append(k); V_l.append(t2-t1)
        t1=time.time() 
        cv_fast(X,y,k)
        t2=time.time() 
        U_f.append(k); V_f.append(t2-t1)
plt.plot(U_l,V_l,c="red",label="cv_linear")
plt.plot(U_f,V_f,c="blue",label="cv_fast")
plt.legend()  
plt.xlabel("k")
plt.ylabel("execution time")
plt.title("Comparizon cv_fast and cv_linear")
Out[46]:
Text(0.5, 1.0, 'Comparizon cv_fast and cv_linear')

4.3 Bootstrap

In [47]:
def bt(df,f,r):
    m=df.shape[0]
    org=f(df,np.arange(0,m,1))
    u=[]
    for j in range(r):
        index=np.random.choice(m, m, replace=True) 
        u.append(f(df,index))
    return {'original':org, 'bias':np.mean(u)-org, 'stderr':np.std(u,ddof=1)}
In [48]:
Portfolio=np.loadtxt("Portfolio.csv",delimiter=",",skiprows=1)
In [49]:
def func_1(data,index):
    X=data[index,0];Y=data[index,1]
    return (np.var(Y,ddof=1)-np.var(X,ddof=1))/(np.var(X,ddof=1)+np.var(Y,ddof=1)-2*np.cov(X,Y)[0,1])
In [50]:
bt(Portfolio,func_1,1000)
Out[50]:
{'original': 0.15166414929676297,
 'bias': 0.005545792343392725,
 'stderr': 0.1814999537124061}
In [51]:
df=np.loadtxt("crime.txt",delimiter="\t")
In [52]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
X=df[:,2:4]
y=df[:,0]
reg.fit(X,y)
reg.coef_
Out[52]:
array([11.8583308 , -5.97341169])
In [53]:
for j in range(3):
    def func_2(data,index):
        X=data[index,2:4];y=data[index,0]
        reg.fit(X,y)
        if j==0:
            return reg.intercept_
        else:
            return reg.coef_[j-1]
    print (bt(df,func_2,1000))
{'original': 621.4260363802891, 'bias': 30.172008959354798, 'stderr': 222.6344532866674}
{'original': 11.858330796711092, 'bias': -0.5184069354638225, 'stderr': 3.576224091887945}
{'original': -5.973411688164964, 'bias': -0.21528977532006532, 'stderr': 3.1531006150614473}
In [56]:
import statsmodels.api as sm
n=X.shape[0]
X=np.insert(X, 0, 1, axis=1)
model = sm.OLS(y,X)
res = model.fit()
print(res.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.992
Method:                 Least Squares   F-statistic:                 2.623e+04
Date:                Wed, 11 Aug 2021   Prob (F-statistic):               0.00
Time:                        17:28:32   Log-Likelihood:                 165.76
No. Observations:                1000   AIC:                            -319.5
Df Residuals:                     994   BIC:                            -290.1
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7578      0.003    232.452      0.000       0.751       0.764
x1             0.7578      0.003    232.452      0.000       0.751       0.764
x2            -1.0043      0.007   -153.629      0.000      -1.017      -0.991
x3             0.3905      0.007     59.330      0.000       0.378       0.403
x4             0.8134      0.007    121.025      0.000       0.800       0.827
x5             0.2600      0.007     39.353      0.000       0.247       0.273
x6            -1.8426      0.007   -277.069      0.000      -1.856      -1.830
==============================================================================
Omnibus:                        2.473   Durbin-Watson:                   1.991
Prob(Omnibus):                  0.290   Jarque-Bera (JB):                2.543
Skew:                          -0.040   Prob(JB):                        0.280
Kurtosis:                       3.234   Cond. No.                     2.86e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.46e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.