Chapter 1 Linear Regression

1.1 Linear Regression

In [3]:
import copy
import japanize_matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy
from matplotlib.pyplot import imshow
from numpy.random import randn
from scipy import stats
C:\Users\prof-\anaconda3\lib\site-packages\japanize_matplotlib\japanize_matplotlib.py:15: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  font_list = font_manager.createFontList(font_files)
In [4]:
def linear(X, y):
    p = X.shape[1]
    x_bar = np.zeros(p)
    for j in range(p):
        x_bar[j] = np.mean(X[:, j])
    for j in range(p):
        X[:, j] = X[:, j] - x_bar[j]      # Centralize X
    y_bar = np.mean(y)
    y = y - y_bar                         # Centralize y
    beta = np.dot(
        np.linalg.inv(np.dot(X.T, X)),
        np.dot(X.T, y)
    )
    beta_0 = y_bar - np.dot(x_bar, beta)
    return beta, beta_0

1.2 Subderivative

In [46]:
x = np.arange(-2, 2, 0.1)
y = x**2 - 3 * x + np.abs(x)
plt.plot(x, y)
plt.scatter(1, -1, c="red")
plt.title("$y = x^2 - 3x + |x|$")
Out[46]:
Text(0.5, 1.0, '$y = x^2 - 3x + |x|$')
In [47]:
x = np.arange(-2, 2, 0.1)
y = x**2 + x + 2 * np.abs(x)
plt.plot(x, y)
plt.scatter(0, 0, c="red")
plt.title("$y = x^2 + x + 2|x|$")
Out[47]:
Text(0.5, 1.0, '$y = x^2 + x + 2|x|$')

1.3 Lasso

In [48]:
def soft_th(lam, x):
    return np.sign(x) * np.maximum(np.abs(x) - lam, np.zeros(1))
In [49]:
x = np.arange(-10, 10, 0.1)
y = soft_th(5, x)
plt.plot(x, y, c="black")
plt.title(r"${\cal S}_\lambda(x)$", size=24)
plt.plot([-5, -5], [-4, 4], c="blue", linestyle="dashed")
plt.plot([5, 5], [-4, 4], c="blue", linestyle="dashed")
plt.text(-2, 1, r"$\lambda=5$", c="red", size=16)
Out[49]:
Text(-2, 1, '$\\lambda=5$')
In [50]:
def linear_lasso(X, y, lam=0, beta=None):
    n, p = X.shape
    if beta is None:
        beta = np.zeros(p)
    X, y, X_bar, X_sd, y_bar = centralize(X, y)   # Centralize
    eps = 1
    beta_old = copy.copy(beta)
    while eps > 0.00001:    # Wait the convergence of this loop
        for j in range(p):
            r = y
            for k in range(p):
                if j != k:
                    r = r - X[:, k] * beta[k]
            z = (np.dot(r, X[:, j]) / n) / (np.dot(X[:, j], X[:, j]) / n)
            beta[j] = soft_th(lam, z)
        eps = np.linalg.norm(beta - beta_old, 2)
        beta_old = copy.copy(beta)
    beta = beta / X_sd   # Recover the coefficients to before normalization
    beta_0 = y_bar - np.dot(X_bar, beta)
    return beta, beta_0
In [51]:
def centralize(X0, y0, standardize=True):
    X = copy.copy(X0)
    y = copy.copy(y0)
    n, p = X.shape
    X_bar = np.zeros(p)                   # Mean of each column of X
    X_sd = np.zeros(p)                    # SD of each column of X
    for j in range(p):
        X_bar[j] = np.mean(X[:, j])
        X[:, j] = X[:, j] - X_bar[j]      # Centralize each column of X
        X_sd[j] = np.std(X[:, j])
        if standardize is True:
            X[:, j] = X[:, j] / X_sd[j]   # Centralize each column of X
    if np.ndim(y) == 2:
        K = y.shape[1]
        y_bar = np.zeros(K)               # Mean of y
        for k in range(K):
            y_bar[k] = np.mean(y[:, k])
            y[:, k] = y[:, k] - y_bar[k]  # Centralize y
    else:                                 # when y is a vector
        y_bar = np.mean(y)
        y = y - y_bar
    return X, y, X_bar, X_sd, y_bar
In [52]:
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
p = X.shape[1]
y = df[:, 0]
lambda_seq = np.arange(0, 200, 0.1)
plt.xlim(0, 200)
plt.ylim(-10, 20)
plt.xlabel(r"$\lambda$")
plt.ylabel(r"$\beta$")
plt.title(r"Coefficients for each $\lambda$")
labels = ["annual police funding", "\% of people 25 years+ with 4 yrs. of high school",
                  "\% of 16--19 year-olds not in highschool and not highschool graduates",
                  "\% of 18--24 years in college",
                  "\% of people 25 years+ with at least 4 years of college"]
r = len(lambda_seq)
coef_seq = np.zeros((r, p))
for i in range(r):
    coef_seq[i, :], _ = linear_lasso(X, y, lambda_seq[i])
for j in range(p):
    plt.plot(lambda_seq, coef_seq[:, j], label=labels[j])
plt.legend(loc="upper right")
Out[52]:
<matplotlib.legend.Legend at 0x16b68a12f10>
In [53]:
def warm_start(X, y, lambda_max=100):
    dec = np.round(lambda_max / 50)
    lambda_seq = np.arange(lambda_max, 1, -dec)
    r = len(lambda_seq)
    p = X.shape[1]
    beta = np.zeros(p)
    coef_seq = np.zeros((r, p))
    for k in range(r):
        beta, _ = linear_lasso(X, y, lambda_seq[k], beta)
        coef_seq[k, :] = beta
    return coef_seq
In [54]:
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
p = X.shape[1]
y = df[:, 0]
coef_seq = warm_start(X, y, 200)
lambda_max = 200; dec = round(lambda_max / 50)
lambda_seq = np.arange(lambda_max, 1, -dec)
plt.ylim(np.min(coef_seq), np.max(coef_seq))
plt.xlabel(r"$\lambda$")
plt.ylabel("Coefficients")
plt.xlim(0, 200)
plt.ylim(-10, 20)
for j in range(p):
    plt.plot(lambda_seq, coef_seq[:, j])
In [24]:
from sklearn.datasets import load_boston
boston = load_boston()
x = boston.data
y = boston.target
n, p = x.shape
lambda_seq = np.arange(0.0001, 20, 0.01)
r = len(lambda_seq)
plt.xlim(-5, 2)
plt.ylim(-18, 6)
plt.xlabel(r"$\log \lambda$")
plt.ylabel("Coefficients")
plt.title("BOSTON")
coef_seq = np.zeros((r, p))
for i in range(r):
    coef_seq[i, :], _ = linear_lasso(x, y, lam=lambda_seq[i])
for j in range(p):
    plt.plot(np.log(lambda_seq), coef_seq[:, j])

1.4 Ridge

In [25]:
def ridge(X, y, lam=0):
    n, p = X.shape
    X, y, X_bar, X_sd, y_bar = centralize(X, y)
    beta = np.dot(
        np.linalg.inv(np.dot(X.T, X) + n * lam * np.eye(p)),
        np.dot(X.T, y)
    )
    beta = beta / X_sd
    beta_0 = y_bar - np.dot(X_bar, beta)
    return beta, beta_0
In [34]:
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
p = X.shape[1]
y = df[:, 0]
lambda_seq = np.arange(0, 200, 0.1)
plt.xlim(0, 100)
plt.ylim(-10, 20)
plt.xlabel(r"$\lambda$")
plt.ylabel(r"$\beta$")
plt.title(r"The Coefficients for each $\lambda$")
labels = ["annual police funding", "\% of people 25 years+ with 4 yrs. of high school",
                  "\% of 16--19 year-olds not in highschool and not highschool graduates",
                  "\% of 18--24 years in college",
                  "\% of people 25 years+ with at least 4 years of college"]
r = len(lambda_seq)
beta = np.zeros(p)
coef_seq = np.zeros((r, p))
for i in range(r):
    beta, beta_0 = ridge(X, y, lambda_seq[i])
    coef_seq[i, :] = beta
for j in range(p):
    plt.plot(lambda_seq, coef_seq[:, j], label=labels[j])
plt.legend(loc="upper right")
Out[34]:
<matplotlib.legend.Legend at 0x16b67986ee0>
In [36]:
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
y = df[:, 0]
linear(X, y)
Out[36]:
(array([10.98067026, -6.08852939,  5.4803042 ,  0.37704431,  5.50047122]),
 489.64859696903335)
In [38]:
ridge(X,y)
Out[38]:
(array([10.98067026, -6.08852939,  5.4803042 ,  0.37704431,  5.50047122]),
 717.96)
In [37]:
ridge(X, y, 200)
Out[37]:
(array([ 0.0563518 , -0.01976397,  0.07786309, -0.0171218 , -0.0070393 ]),
 717.96)

1.5 A Comparison between Lasso and Ridge

In [55]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
In [56]:
def R2(x, y):
    model = LinearRegression()
    model.fit(x, y)           # Train model
    y_hat = model.predict(x)  # Display predictions
    y_bar = np.mean(y)
    RSS = np.dot(y - y_hat, y - y_hat)
    TSS = np.dot(y - y_bar, y - y_bar)
    return 1 - RSS / TSS
In [57]:
def vif(x):
    p = x.shape[1]
    values = np.zeros(p)
    for j in range(p):
        ind = [i for i in range(p) if i != j]
        values[j] = 1 / (1 - R2(x[:, ind], x[:, j]))
    return values
In [58]:
boston = load_boston()
n = boston.data.shape[0]
z = np.concatenate([boston.data, boston.target.reshape([n, 1])], 1)
vif(z)
Out[58]:
array([1.83153668, 2.35218589, 3.99250315, 1.09522267, 4.58692024,
       2.26037436, 3.10084282, 4.39600725, 7.80819843, 9.20554209,
       1.99301566, 1.38146295, 3.5815848 , 3.85568427])
In [59]:
n = 500
x = np.zeros((n, 6))
z = np.zeros((n, 5))
for k in range(2):
    z[:, k] = np.random.randn(n)
y = 3 * z[:, 0] - 1.5 * z[:, 1] + 2 * np.random.randn(n)
for j in range(3):
    x[:, j] = z[:, 0] + np.random.randn(n) / 5
for j in range(3, 6):
    x[:, j] = z[:, 1] + np.random.randn(n) / 5
lambda_seq = np.arange(0.1, 20, 0.1)
p = 6
r = len(lambda_seq)
coef_seq = np.zeros((r, p))
cols = ["blue", "red", "green", "yellow", "purple", "orange"]
for i in range(r):
    coef_seq[i, :], _ = linear_lasso(x, y, lambda_seq[i])
for j in range(p):
    plt.plot(-np.log(lambda_seq), coef_seq[:, j] + 0.01 * j,
             c=cols[j], label="X"+str(j+1))
plt.xlabel(r"$-\log \lambda$")
plt.ylabel(r"$\beta$")
plt.legend(loc="upper left")
plt.title("Lasso")
Out[59]:
Text(0.5, 1.0, 'Lasso')

1.6 elastic net

In [60]:
def elastic_net(X, y, lam=0, alpha=1, beta=None):                     #
    n, p = X.shape
    if beta is None:
        beta = np.zeros(p)
    X, y, X_bar, X_sd, y_bar = centralize(X, y)   # Centralize
    eps = 1
    beta_old = copy.copy(beta)
    while eps > 0.00001:    # Wait Convergence
        for j in range(p):
            r = y
            for k in range(p):
                if j != k:
                    r = r - X[:, k] * beta[k]
            z = ((np.dot(r, X[:, j]) / n)                             ##
                 / (np.dot(X[:, j], X[:, j]) / n + (1-alpha) * lam))  ##
            beta[j] = soft_th(lam * alpha, z)                         ##
        eps = np.linalg.norm(beta - beta_old, 2)
        beta_old = copy.copy(beta)
    beta = beta / X_sd   # Recover to before Normalization
    beta_0 = y_bar - np.dot(X_bar, beta)
    return beta, beta_0

1.7 About How to Set the Value of $\lambda$

In [61]:
def cv_linear_lasso(x, y, alpha=1, k=10):
    lam_max = np.max(np.dot(x.T, y) / np.dot(x.T, x))
    lam_seq = np.array(range(100))**3 / 1000000 * lam_max
    n = len(y)
    m = int(n / k)
    r = n % k
    S_min = np.inf
    for lam in lam_seq:
        S = 0
        for i in range(k):
            if i < k - r:
                index = list(range(i*m, i*m + m))
            else:
                index = list(range(i*m + (i-k+r), i*m + (m+i-k+r+1)))
                # when k cannot divide n
            _index = list(set(range(n)) - set(index))
            beta, beta0 = elastic_net(x[_index, ], y[_index], lam, alpha)
            z = np.linalg.norm((y[index] - beta0 - np.dot(x[index], beta)), 2)
            S = S + z**2
        if S < S_min:
            S_min = S.copy()
            lam_best = lam.copy()
            beta0_best = beta0.copy()
            beta_best = beta.copy()
    return lam_best, beta0_best, beta_best, S_min
In [67]:
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
p = X.shape[1]
y = df[:, 0]
lam, beta0, beta, S = cv_linear_lasso(X, y)
print(lam)
print(beta0)
print(beta)
coef_seq = warm_start(X, y, 200)
lambda_max = 200; dec = round(lambda_max / 50)
lambda_seq = np.arange(lambda_max, 1, -dec)
plt.ylim(np.min(coef_seq), np.max(coef_seq))
plt.xlabel(r"$\lambda$")
plt.ylabel("Coefficients")
plt.xlim(0, 200)
plt.ylim(-10, 20)
plt.axvline(x=lam, ymin=-10, ymax=10, linestyle="dotted")
for j in range(p):
    plt.plot(lambda_seq, coef_seq[:, j])
21.98869501247591
526.7401583585395
[ 9.49890024 -3.79670916  2.65033978 -0.          0.        ]
In [68]:
def cv_elastic_net(x, y, k=10):
    alpha_seq = np.array(range(100)) / 100
    S_min = np.inf
    for alpha in alpha_seq:
        lam, beta0, beta, S = cv_linear_lasso(x, y, alpha)
        if S < S_min:
            S_min = S
            alpha_best = alpha
            lam_best = lam
            beta0_best = beta0
            beta_best = beta
    return alpha_best, lam_best, beta0_best, beta_best, S_min
In [69]:
n = 500
x = np.zeros((n, 6))
z = np.zeros((n, 5))
for k in range(2):
    z[:, k] = np.random.randn(n)
y = 3 * z[:, 0] - 1.5 * z[:, 1] + 2 * np.random.randn(n)
for j in range(3):
    x[:, j] = z[:, 0] + np.random.randn(n) / 5
for j in range(3, 6):
    x[:, j] = z[:, 1] + np.random.randn(n) / 5
In [70]:
alpha, lam, beta0, beta, S = cv_elastic_net(x, y)
print(alpha)
print(lam)
print(beta0)
print(beta)
# It takes several minutes
0.0
0.07590257357648734
-0.021804445490204702
[ 1.04538639  0.97448876  0.90746206 -0.24814186 -0.70324433 -0.65441206]
In [71]:
cv_linear_lasso(x, y, 0.25)
Out[71]:
(0.07590257357648734,
 -0.021141212576098356,
 array([ 1.06274028,  0.97123503,  0.89170521, -0.1899063 , -0.73052604,
        -0.67545142]),
 2150.747447553806)
In [72]:
cv_linear_lasso(x, y, 0.5)
Out[72]:
(0.07590257357648734,
 -0.020071465723548412,
 array([ 1.08970757,  0.96685379,  0.86796379, -0.10525936, -0.77281394,
        -0.70904113]),
 2155.0042478218843)
In [73]:
cv_linear_lasso(x, y, 0.75)
Out[73]:
(0.05533297613725927,
 -0.01805901322571632,
 array([ 1.15886964,  0.97006673,  0.81622541, -0.        , -0.84073609,
        -0.76191917]),
 2160.4311415511224)
In [74]:
cv_linear_lasso(x, y, 1)
Out[74]:
(0.009487821697060918,
 -0.016906086179744526,
 array([ 1.24108803,  0.99480948,  0.7561871 ,  0.        , -0.86949559,
        -0.78222113]),
 2165.5356806249383)