import copy
import japanize_matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy
from matplotlib.pyplot import imshow
from numpy.random import randn
from scipy import stats
def linear(X, y):
p = X.shape[1]
x_bar = np.zeros(p)
for j in range(p):
x_bar[j] = np.mean(X[:, j])
for j in range(p):
X[:, j] = X[:, j] - x_bar[j] # Centralize X
y_bar = np.mean(y)
y = y - y_bar # Centralize y
beta = np.dot(
np.linalg.inv(np.dot(X.T, X)),
np.dot(X.T, y)
)
beta_0 = y_bar - np.dot(x_bar, beta)
return beta, beta_0
x = np.arange(-2, 2, 0.1)
y = x**2 - 3 * x + np.abs(x)
plt.plot(x, y)
plt.scatter(1, -1, c="red")
plt.title("$y = x^2 - 3x + |x|$")
x = np.arange(-2, 2, 0.1)
y = x**2 + x + 2 * np.abs(x)
plt.plot(x, y)
plt.scatter(0, 0, c="red")
plt.title("$y = x^2 + x + 2|x|$")
def soft_th(lam, x):
return np.sign(x) * np.maximum(np.abs(x) - lam, np.zeros(1))
x = np.arange(-10, 10, 0.1)
y = soft_th(5, x)
plt.plot(x, y, c="black")
plt.title(r"${\cal S}_\lambda(x)$", size=24)
plt.plot([-5, -5], [-4, 4], c="blue", linestyle="dashed")
plt.plot([5, 5], [-4, 4], c="blue", linestyle="dashed")
plt.text(-2, 1, r"$\lambda=5$", c="red", size=16)
def linear_lasso(X, y, lam=0, beta=None):
n, p = X.shape
if beta is None:
beta = np.zeros(p)
X, y, X_bar, X_sd, y_bar = centralize(X, y) # Centralize
eps = 1
beta_old = copy.copy(beta)
while eps > 0.00001: # Wait the convergence of this loop
for j in range(p):
r = y
for k in range(p):
if j != k:
r = r - X[:, k] * beta[k]
z = (np.dot(r, X[:, j]) / n) / (np.dot(X[:, j], X[:, j]) / n)
beta[j] = soft_th(lam, z)
eps = np.linalg.norm(beta - beta_old, 2)
beta_old = copy.copy(beta)
beta = beta / X_sd # Recover the coefficients to before normalization
beta_0 = y_bar - np.dot(X_bar, beta)
return beta, beta_0
def centralize(X0, y0, standardize=True):
X = copy.copy(X0)
y = copy.copy(y0)
n, p = X.shape
X_bar = np.zeros(p) # Mean of each column of X
X_sd = np.zeros(p) # SD of each column of X
for j in range(p):
X_bar[j] = np.mean(X[:, j])
X[:, j] = X[:, j] - X_bar[j] # Centralize each column of X
X_sd[j] = np.std(X[:, j])
if standardize is True:
X[:, j] = X[:, j] / X_sd[j] # Centralize each column of X
if np.ndim(y) == 2:
K = y.shape[1]
y_bar = np.zeros(K) # Mean of y
for k in range(K):
y_bar[k] = np.mean(y[:, k])
y[:, k] = y[:, k] - y_bar[k] # Centralize y
else: # when y is a vector
y_bar = np.mean(y)
y = y - y_bar
return X, y, X_bar, X_sd, y_bar
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
p = X.shape[1]
y = df[:, 0]
lambda_seq = np.arange(0, 200, 0.1)
plt.xlim(0, 200)
plt.ylim(-10, 20)
plt.xlabel(r"$\lambda$")
plt.ylabel(r"$\beta$")
plt.title(r"Coefficients for each $\lambda$")
labels = ["annual police funding", "\% of people 25 years+ with 4 yrs. of high school",
"\% of 16--19 year-olds not in highschool and not highschool graduates",
"\% of 18--24 years in college",
"\% of people 25 years+ with at least 4 years of college"]
r = len(lambda_seq)
coef_seq = np.zeros((r, p))
for i in range(r):
coef_seq[i, :], _ = linear_lasso(X, y, lambda_seq[i])
for j in range(p):
plt.plot(lambda_seq, coef_seq[:, j], label=labels[j])
plt.legend(loc="upper right")
def warm_start(X, y, lambda_max=100):
dec = np.round(lambda_max / 50)
lambda_seq = np.arange(lambda_max, 1, -dec)
r = len(lambda_seq)
p = X.shape[1]
beta = np.zeros(p)
coef_seq = np.zeros((r, p))
for k in range(r):
beta, _ = linear_lasso(X, y, lambda_seq[k], beta)
coef_seq[k, :] = beta
return coef_seq
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
p = X.shape[1]
y = df[:, 0]
coef_seq = warm_start(X, y, 200)
lambda_max = 200; dec = round(lambda_max / 50)
lambda_seq = np.arange(lambda_max, 1, -dec)
plt.ylim(np.min(coef_seq), np.max(coef_seq))
plt.xlabel(r"$\lambda$")
plt.ylabel("Coefficients")
plt.xlim(0, 200)
plt.ylim(-10, 20)
for j in range(p):
plt.plot(lambda_seq, coef_seq[:, j])
from sklearn.datasets import load_boston
boston = load_boston()
x = boston.data
y = boston.target
n, p = x.shape
lambda_seq = np.arange(0.0001, 20, 0.01)
r = len(lambda_seq)
plt.xlim(-5, 2)
plt.ylim(-18, 6)
plt.xlabel(r"$\log \lambda$")
plt.ylabel("Coefficients")
plt.title("BOSTON")
coef_seq = np.zeros((r, p))
for i in range(r):
coef_seq[i, :], _ = linear_lasso(x, y, lam=lambda_seq[i])
for j in range(p):
plt.plot(np.log(lambda_seq), coef_seq[:, j])
def ridge(X, y, lam=0):
n, p = X.shape
X, y, X_bar, X_sd, y_bar = centralize(X, y)
beta = np.dot(
np.linalg.inv(np.dot(X.T, X) + n * lam * np.eye(p)),
np.dot(X.T, y)
)
beta = beta / X_sd
beta_0 = y_bar - np.dot(X_bar, beta)
return beta, beta_0
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
p = X.shape[1]
y = df[:, 0]
lambda_seq = np.arange(0, 200, 0.1)
plt.xlim(0, 100)
plt.ylim(-10, 20)
plt.xlabel(r"$\lambda$")
plt.ylabel(r"$\beta$")
plt.title(r"The Coefficients for each $\lambda$")
labels = ["annual police funding", "\% of people 25 years+ with 4 yrs. of high school",
"\% of 16--19 year-olds not in highschool and not highschool graduates",
"\% of 18--24 years in college",
"\% of people 25 years+ with at least 4 years of college"]
r = len(lambda_seq)
beta = np.zeros(p)
coef_seq = np.zeros((r, p))
for i in range(r):
beta, beta_0 = ridge(X, y, lambda_seq[i])
coef_seq[i, :] = beta
for j in range(p):
plt.plot(lambda_seq, coef_seq[:, j], label=labels[j])
plt.legend(loc="upper right")
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
y = df[:, 0]
linear(X, y)
ridge(X,y)
ridge(X, y, 200)
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
def R2(x, y):
model = LinearRegression()
model.fit(x, y) # Train model
y_hat = model.predict(x) # Display predictions
y_bar = np.mean(y)
RSS = np.dot(y - y_hat, y - y_hat)
TSS = np.dot(y - y_bar, y - y_bar)
return 1 - RSS / TSS
def vif(x):
p = x.shape[1]
values = np.zeros(p)
for j in range(p):
ind = [i for i in range(p) if i != j]
values[j] = 1 / (1 - R2(x[:, ind], x[:, j]))
return values
boston = load_boston()
n = boston.data.shape[0]
z = np.concatenate([boston.data, boston.target.reshape([n, 1])], 1)
vif(z)
n = 500
x = np.zeros((n, 6))
z = np.zeros((n, 5))
for k in range(2):
z[:, k] = np.random.randn(n)
y = 3 * z[:, 0] - 1.5 * z[:, 1] + 2 * np.random.randn(n)
for j in range(3):
x[:, j] = z[:, 0] + np.random.randn(n) / 5
for j in range(3, 6):
x[:, j] = z[:, 1] + np.random.randn(n) / 5
lambda_seq = np.arange(0.1, 20, 0.1)
p = 6
r = len(lambda_seq)
coef_seq = np.zeros((r, p))
cols = ["blue", "red", "green", "yellow", "purple", "orange"]
for i in range(r):
coef_seq[i, :], _ = linear_lasso(x, y, lambda_seq[i])
for j in range(p):
plt.plot(-np.log(lambda_seq), coef_seq[:, j] + 0.01 * j,
c=cols[j], label="X"+str(j+1))
plt.xlabel(r"$-\log \lambda$")
plt.ylabel(r"$\beta$")
plt.legend(loc="upper left")
plt.title("Lasso")
def elastic_net(X, y, lam=0, alpha=1, beta=None): #
n, p = X.shape
if beta is None:
beta = np.zeros(p)
X, y, X_bar, X_sd, y_bar = centralize(X, y) # Centralize
eps = 1
beta_old = copy.copy(beta)
while eps > 0.00001: # Wait Convergence
for j in range(p):
r = y
for k in range(p):
if j != k:
r = r - X[:, k] * beta[k]
z = ((np.dot(r, X[:, j]) / n) ##
/ (np.dot(X[:, j], X[:, j]) / n + (1-alpha) * lam)) ##
beta[j] = soft_th(lam * alpha, z) ##
eps = np.linalg.norm(beta - beta_old, 2)
beta_old = copy.copy(beta)
beta = beta / X_sd # Recover to before Normalization
beta_0 = y_bar - np.dot(X_bar, beta)
return beta, beta_0
def cv_linear_lasso(x, y, alpha=1, k=10):
lam_max = np.max(np.dot(x.T, y) / np.dot(x.T, x))
lam_seq = np.array(range(100))**3 / 1000000 * lam_max
n = len(y)
m = int(n / k)
r = n % k
S_min = np.inf
for lam in lam_seq:
S = 0
for i in range(k):
if i < k - r:
index = list(range(i*m, i*m + m))
else:
index = list(range(i*m + (i-k+r), i*m + (m+i-k+r+1)))
# when k cannot divide n
_index = list(set(range(n)) - set(index))
beta, beta0 = elastic_net(x[_index, ], y[_index], lam, alpha)
z = np.linalg.norm((y[index] - beta0 - np.dot(x[index], beta)), 2)
S = S + z**2
if S < S_min:
S_min = S.copy()
lam_best = lam.copy()
beta0_best = beta0.copy()
beta_best = beta.copy()
return lam_best, beta0_best, beta_best, S_min
df = np.loadtxt("crime.txt", delimiter="\t")
X = df[:, [i for i in range(2, 7)]]
p = X.shape[1]
y = df[:, 0]
lam, beta0, beta, S = cv_linear_lasso(X, y)
print(lam)
print(beta0)
print(beta)
coef_seq = warm_start(X, y, 200)
lambda_max = 200; dec = round(lambda_max / 50)
lambda_seq = np.arange(lambda_max, 1, -dec)
plt.ylim(np.min(coef_seq), np.max(coef_seq))
plt.xlabel(r"$\lambda$")
plt.ylabel("Coefficients")
plt.xlim(0, 200)
plt.ylim(-10, 20)
plt.axvline(x=lam, ymin=-10, ymax=10, linestyle="dotted")
for j in range(p):
plt.plot(lambda_seq, coef_seq[:, j])
def cv_elastic_net(x, y, k=10):
alpha_seq = np.array(range(100)) / 100
S_min = np.inf
for alpha in alpha_seq:
lam, beta0, beta, S = cv_linear_lasso(x, y, alpha)
if S < S_min:
S_min = S
alpha_best = alpha
lam_best = lam
beta0_best = beta0
beta_best = beta
return alpha_best, lam_best, beta0_best, beta_best, S_min
n = 500
x = np.zeros((n, 6))
z = np.zeros((n, 5))
for k in range(2):
z[:, k] = np.random.randn(n)
y = 3 * z[:, 0] - 1.5 * z[:, 1] + 2 * np.random.randn(n)
for j in range(3):
x[:, j] = z[:, 0] + np.random.randn(n) / 5
for j in range(3, 6):
x[:, j] = z[:, 1] + np.random.randn(n) / 5
alpha, lam, beta0, beta, S = cv_elastic_net(x, y)
print(alpha)
print(lam)
print(beta0)
print(beta)
# It takes several minutes
cv_linear_lasso(x, y, 0.25)
cv_linear_lasso(x, y, 0.5)
cv_linear_lasso(x, y, 0.75)
cv_linear_lasso(x, y, 1)