import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
from scipy import stats
from numpy.random import randn
def cv_linear(X,y,K):
n=len(y);m=int(n/K)
S=0
for j in range(K):
test=list(range(j*m,(j+1)*m)) # Indexes of test data
train=list(set(range(n))-set(test)) # Indexes of training data
beta=np.linalg.inv(X[train,].T@X[train,])@X[train,].T@y[train]
e=y[test]-X[test,]@beta
S=S+np.linalg.norm(e)**2
return S/n
n=100;p=5
X=randn(n,p)
X=np.insert(X, 0, 1, axis=1)
beta=randn(p+1)
beta[[1,2]]=0
y=X@beta+randn(n)
cv_linear(X[:,[0,3,4,5]],y,10)
cv_linear(X,y,10)
n=100;p=5
X=randn(n,p)
X=np.insert(X, 0, 1, axis=1)
beta=randn(p+1);beta[[1,2]]=0
U=[]; V=[]
for j in range(100):
y=X@beta+randn(n)
U.append(cv_linear(X[:,[0,3,4,5]],y,10))
V.append(cv_linear(X,y,10))
x_seq=np.linspace(0.7,1.5,100)
y=x_seq
plt.plot(x_seq,y,c="red")
plt.scatter(U,V)
plt.xlabel(" Sq err when variables 4,5,6 are chosen")
plt.ylabel(" Sq err when all are chosen")
plt.title(" Over-fitting due to choosing too many")
n=100;p=5
plt.ylim(0.3,1.5)
plt.xlabel("k")
plt.ylabel("CV value")
for j in range(2,11,1):
X=randn(n,p)
X=np.insert(X, 0, 1, axis=1)
beta=randn(p+1)
y=X@beta+randn(n)
U = []; V = []
for k in range(2,n+1,1):
if n%k==0:
U.append(k); V.append(cv_linear(X,y,k))
plt.plot(U,V)
def knn_1(x,y,z,k): # this function appeared in Chapter 3
x=np.array(x)
n,p=x.shape
dis=np.zeros(n)
for i in range(n):
dis[i]=(z-x[i,]).T@(z-x[i,])
S=np.argsort(dis)[0:k]
u=np.bincount(y[S])
m = [i for i, x in enumerate(u) if x == max(u)]
# tie-breaking
while (len(m)>1):
k=k-1
S=S[0:k]
u=np.bincount(y[S])
m = [i for i, x in enumerate(u) if x == max(u)]
return m[0]
def knn(x,y,z,k):
n=z.shape[0]
w=np.zeros(n)
for i in range(n):
w[i]=knn_1(x,y,z[i,],k)
return w
from sklearn.datasets import load_iris
iris = load_iris()
iris.target_names
x=iris.data
y=iris.target
n=x.shape[0]
index=np.random.choice(n, n, replace=False) # The order is randomly chosen
x=x[index,]
y=y[index]
U = []
V = []
top_seq=list(range(0,150,15))
for k in range(1,11,1):
S=0
for top in top_seq:
test=list(range(top,top+15))
train=list(set(range(150))-set(test))
knn_ans=knn(x[train,],y[train],x[test,],k=k)
ans=y[test]
S=S+np.sum(knn_ans!=ans)
S=S/n
U.append(k)
V.append(S)
plt.plot(U,V)
plt.xlabel("K")
plt.ylabel(" Err Rate")
plt.title("Evaluation of the error rate via CV")
def cv_fast(X,y,k):
n=len(y)
m=n/k
H=X@np.linalg.inv(X.T@X)@X.T
I=np.diag(np.repeat(1,n))
e=(I-H)@y
I=np.diag(np.repeat(1,m))
S=0
for j in range(k):
test=np.arange(j*m,(j+1)*m,1,dtype=int)
S=S+(np.linalg.inv(I-H[test,:][:,test])@e[test]).T@np.linalg.inv(I-H[test,test])@e[test]
return S/n
cv_fast(x,y,10)
n=1000;p=5
beta=randn(p+1)
x=randn(n,p)
X=np.insert(x, 0, 1, axis=1)
y=X@beta+randn(n)
import time
U_l=[]; V_l=[]; U_f=[]; V_f=[]
for k in range(2,n+1,1):
if n%k==0:
t1=time.time() # Time before processing
cv_linear(X,y,k)
t2=time.time() # Time after processing
U_l.append(k); V_l.append(t2-t1)
t1=time.time()
cv_fast(X,y,k)
t2=time.time()
U_f.append(k); V_f.append(t2-t1)
plt.plot(U_l,V_l,c="red",label="cv_linear")
plt.plot(U_f,V_f,c="blue",label="cv_fast")
plt.legend()
plt.xlabel("k")
plt.ylabel("execution time")
plt.title("Comparizon cv_fast and cv_linear")
def bt(df,f,r):
m=df.shape[0]
org=f(df,np.arange(0,m,1))
u=[]
for j in range(r):
index=np.random.choice(m, m, replace=True)
u.append(f(df,index))
return {'original':org, 'bias':np.mean(u)-org, 'stderr':np.std(u,ddof=1)}
Portfolio=np.loadtxt("Portfolio.csv",delimiter=",",skiprows=1)
def func_1(data,index):
X=data[index,0];Y=data[index,1]
return (np.var(Y,ddof=1)-np.var(X,ddof=1))/(np.var(X,ddof=1)+np.var(Y,ddof=1)-2*np.cov(X,Y)[0,1])
bt(Portfolio,func_1,1000)
df=np.loadtxt("crime.txt",delimiter="\t")
from sklearn import linear_model
reg = linear_model.LinearRegression()
X=df[:,2:4]
y=df[:,0]
reg.fit(X,y)
reg.coef_
for j in range(3):
def func_2(data,index):
X=data[index,2:4];y=data[index,0]
reg.fit(X,y)
if j==0:
return reg.intercept_
else:
return reg.coef_[j-1]
print (bt(df,func_2,1000))
import statsmodels.api as sm
n=X.shape[0]
X=np.insert(X, 0, 1, axis=1)
model = sm.OLS(y,X)
res = model.fit()
print(res.summary())