import copy
import japanize_matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy
from matplotlib.pyplot import imshow
from numpy.random import randn
from scipy import stats
# From Chapter 1
def soft_th(lam, x):
    return np.sign(x) * np.maximum(np.abs(x) - lam, np.zeros(1))
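# Quick check (illustrative, not in the original): soft_th sets components with
# |x| <= lam to zero and shrinks the remaining ones toward zero by lam.
print(soft_th(1.0, np.array([-3.0, -0.5, 0.0, 0.5, 3.0])))
# expected: [-2.  0.  0.  0.  2.]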
def linear_lasso(X, y, lam=0, beta=None):
    n, p = X.shape
    if beta is None:
        beta = np.zeros(p)
    X, y, X_bar, X_sd, y_bar = centralize(X, y)  # centering and scaling (see below)
    eps = 1
    beta_old = copy.copy(beta)
    while eps > 0.00001:  # iterate until this loop converges
        for j in range(p):
            r = y
            for k in range(p):
                if j != k:
                    r = r - X[:, k] * beta[k]
            z = (np.dot(r, X[:, j]) / n) / (np.dot(X[:, j], X[:, j]) / n)
            beta[j] = soft_th(lam, z)
        eps = np.linalg.norm(beta - beta_old, 2)
        beta_old = copy.copy(beta)
    beta = beta / X_sd  # rescale the coefficients back to the original (unstandardized) variables
    beta_0 = y_bar - np.dot(X_bar, beta)
    return beta, beta_0
def centralize(X0, y0, standardize=True):
    X = copy.copy(X0)
    y = copy.copy(y0)
    n, p = X.shape
    X_bar = np.zeros(p)                   # means of the columns of X
    X_sd = np.zeros(p)                    # standard deviations of the columns of X
    for j in range(p):
        X_bar[j] = np.mean(X[:, j])
        X[:, j] = X[:, j] - X_bar[j]      # center each column of X
        X_sd[j] = np.std(X[:, j])
        if standardize is True:
            X[:, j] = X[:, j] / X_sd[j]   # standardize each column of X
    if np.ndim(y) == 2:
        K = y.shape[1]
        y_bar = np.zeros(K)               # means of the columns of y
        for k in range(K):
            y_bar[k] = np.mean(y[:, k])
            y[:, k] = y[:, k] - y_bar[k]  # center each column of y
    else:                                 # y is a vector
        y_bar = np.mean(y)
        y = y - y_bar
    return X, y, X_bar, X_sd, y_bar
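# Minimal usage sketch of linear_lasso (illustrative; the data and names below
# are ours): with a moderate lambda, coefficients of the inactive variables are
# typically shrunk to exactly zero.
n_ex, p_ex = 100, 5
X_ex = np.random.randn(n_ex, p_ex)
beta_true = np.array([2.0, -1.5, 0.0, 0.0, 1.0])
y_ex = np.dot(X_ex, beta_true) + np.random.randn(n_ex)
beta_hat, beta_0_hat = linear_lasso(X_ex, y_ex, lam=0.5)
print(beta_hat, beta_0_hat)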
def W_linear_lasso(X, y, W, lam=0):
    n, p = X.shape
    X_bar = np.zeros(p)
    for k in range(p):
        X_bar[k] = np.sum(np.dot(W, X[:, k])) / np.sum(W)
        X[:, k] = X[:, k] - X_bar[k]
    y_bar = np.sum(np.dot(W, y)) / np.sum(W)
    y = y - y_bar
    L = np.linalg.cholesky(W)  # W is diagonal, so this is just the elementwise square root
    # L = np.sqrt(W)
    u = np.dot(L, y)
    V = np.dot(L, X)
    beta, beta_0 = linear_lasso(V, u, lam)
    beta_0 = y_bar - np.dot(X_bar, beta)
    return beta_0, beta
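# Quick consistency check (illustrative, not in the original): with W equal to
# the identity matrix, W_linear_lasso should reproduce linear_lasso (the return
# orders are (beta_0, beta) and (beta, beta_0), respectively). W_linear_lasso
# centers X in place, hence the .copy() calls; the names below are ours.
n_chk = 50
X_chk = np.random.randn(n_chk, 3)
y_chk = np.dot(X_chk, np.array([1.0, 0.0, -2.0])) + np.random.randn(n_chk)
print(linear_lasso(X_chk.copy(), y_chk.copy(), lam=0.1))
print(W_linear_lasso(X_chk.copy(), y_chk.copy(), np.identity(n_chk), lam=0.1))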
def f(x):
    # logistic (sigmoid) curve: P(Y=1|x) = exp(beta_0 + beta*x) / (1 + exp(beta_0 + beta*x))
    return np.exp(beta_0 + np.dot(beta, x)) / (1 + np.exp(beta_0 + np.dot(beta, x)))
beta_0 = 0
beta_seq = np.array([0, 0.2, 0.5, 1, 2, 10])
m = len(beta_seq)
x = np.arange(-10, 10)
for i in range(m):
    beta = beta_seq[i]
    plt.plot(x, f(x), label=beta_seq[i])
plt.title("Logistic curve")
plt.xlabel("$x$")
plt.ylabel("$P(Y=1|x)$")
plt.legend()
# Data generation
N = 100
p = 2
X = np.random.randn(N, p)
X = np.concatenate([np.ones(N).reshape(N, 1), X], axis=1)
beta = np.random.randn(p+1)
y = np.zeros(N)
s = np.dot(X, beta)
prob = 1 / (1 + np.exp(s))
for i in range(N):
    if np.random.rand(1) > prob[i]:
        y[i] = 1
    else:
        y[i] = -1
beta
# Computing the maximum likelihood estimate
beta_old = np.inf
gamma = np.random.randn(p + 1)
while np.sum((beta_old - gamma) ** 2) > 0.001:
    beta_old = gamma.copy()
    s = np.dot(X, beta_old)
    v = np.exp(-s * y)
    u = y * v / (1 + v)
    w = v / (1 + v) ** 2
    z = s + u / w
    W = np.diag(w)
    gamma = np.dot(np.linalg.inv(X.T @ W @ X), np.dot(X.T @ W, z))
    print(gamma)
beta  # the true value, which maximum likelihood is meant to recover
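# Optional cross-check (illustrative, not in the original; assumes scikit-learn
# is available): with a very weak penalty, LogisticRegression should give nearly
# the same coefficients as the Newton iteration above (labels are in {-1, +1}).
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1e10, fit_intercept=False)  # X already has a constant column
clf.fit(X, y)
print(clf.coef_)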
def logistic_lasso(X, y, lam):
    p = X.shape[1]  # number of columns of X, including the all-ones column
    beta = np.inf
    gamma = np.random.randn(p)
    while np.sum((beta - gamma) ** 2) > 0.001:
        beta = gamma.copy()
        s = np.dot(X, beta)
        v = np.exp(-s * y)
        u = y * v / (1 + v)
        w = v / (1 + v) ** 2
        z = s + u / w
        W = np.diag(w)
        gamma_0, gamma_1 = W_linear_lasso(X[:, range(1, p)], z, W, lam=lam)
        gamma = np.block([gamma_0, gamma_1]).copy()
        print(gamma)
    return gamma
N = 100
p = 2
X = np.random.randn(N, p)
X = np.concatenate([np.ones(N).reshape(N, 1), X], axis=1)
beta = np.random.randn(p + 1)
print("beta =")
print(beta)
y = np.ones(N)
s = np.dot(X, beta)
prob = 1 / (1 + np.exp(s))
for i in range(N):
    if np.random.rand(1) > prob[i]:
        y[i] = 1
    else:
        y[i] = -1
logistic_lasso(X, y, 0)
logistic_lasso(X, y, 0.1)
logistic_lasso(X, y, 0.2)
def table_count(m, u, v):
    # m x m contingency table counting the pairs (u[i], v[i])
    n = u.shape[0]
    count = np.zeros([m, m])
    for i in range(n):
        count[int(u[i]), int(v[i])] += 1
    return count
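# Quick illustration (not in the original): entry (i, j) counts how often
# u equals i while v equals j.
table_count(2, np.array([0, 1, 1, 0]), np.array([0, 1, 0, 0]))
# expected: [[2. 0.]
#            [1. 1.]]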
# Data generation
N = 100
p = 2
X = np.random.randn(N, p)
X = np.concatenate([np.ones(N).reshape(N, 1), X], axis=1)
beta = np.random.randn(p+1)
y = np.zeros(N)
s = np.dot(X, beta)
prob = 1 / (1 + np.exp(s))
for i in range(N):
    if np.random.rand(1) > prob[i]:
        y[i] = 1
    else:
        y[i] = -1
print(beta)
# Parameter estimation
beta_est = logistic_lasso(X, y, 0.1)
# Classification: draw a fresh set of responses from the same probabilities
for i in range(N):
    if np.random.rand(1) > prob[i]:
        y[i] = 1
    else:
        y[i] = -1
z = np.sign(np.dot(X, beta_est))  # classify as +1 if the linear predictor is positive, -1 if negative
table_count(2, (y + 1) / 2, (z + 1) / 2)
# Run on Linux (not available on Windows or Google Colaboratory)
import numpy as np
import matplotlib.pyplot as plt
import glmnet_python
from glmnet import glmnet
import sys
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef
from cvglmnetPlot import cvglmnetPlot
# Place "breastcancer.csv" in a folder on the Linux machine
x = np.loadtxt("breastcancer.csv",
               delimiter=",", skiprows=1, usecols=range(1000))
y = np.loadtxt("breastcancer.csv",
               delimiter=",", skiprows=1, dtype="unicode", usecols=1000)
n = len(y)
yy = np.ones(n)
for i in range(n):
    if y[i] == "control":
        yy[i] = 1
    else:
        yy[i] = -1
fit1 = cvglmnet(x=x.copy(), y=yy.copy(), ptype="deviance", family="binomial")
fit2 = cvglmnet(x=x.copy(), y=yy.copy(), ptype="class", family="binomial")
beta = cvglmnetCoef(fit1)
np.sum(beta != 0)
# Draw the CV plots
fig = plt.figure()
cvglmnetPlot(fit1)
fig.savefig("img1.png")
fig2 = plt.figure()
cvglmnetPlot(fit2)
fig2.savefig("img2.png")
# "img1.png" and "img2.png" are now in the folder on the Linux machine
def multi_lasso(X, y, lam):
    n, p = X.shape
    K = len(set(y))
    beta = np.ones((K, p))
    gamma = np.zeros((K, p))
    while np.linalg.norm(beta - gamma, 2) > 0.1:
        gamma = beta.copy()
        for k in range(K):
            r = 0
            for h in range(K):
                if k != h:
                    r = r + np.exp(np.dot(X, beta[h, :]))
            v = np.exp(np.dot(X, beta[k, :])) / r
            u = (y == k) - v / (1 + v)
            w = v / (1 + v) ** 2
            z = np.dot(X, beta[k, :]) + u / w
            beta_0, beta_1 = W_linear_lasso(X[:, range(1, p)],
                                            z, np.diag(w), lam=lam)
            beta[k, :] = np.block([beta_0, beta_1]).copy()
        for j in range(p):
            # subtract the per-column median so that the K coefficient vectors are identifiable
            med = np.median(beta[:, j])
            for h in range(K):
                beta[h, j] = beta[h, j] - med
    return beta
from sklearn.datasets import load_iris
iris = load_iris()
X = np.array(iris["data"])
y = np.array(iris["target"])
N = len(y)
X = np.concatenate([np.ones(N).reshape(N, 1), X], axis=1)
print(X.shape)
beta = multi_lasso(X, y, 0.01)
np.dot(X, beta.T)
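# Illustrative follow-up (not in the original; y_pred is our name): assign each
# observation to the class with the largest score and tabulate against the true labels.
y_pred = np.argmax(np.dot(X, beta.T), axis=1)
table_count(3, y, y_pred)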
# Run on Linux (not available on Windows or Google Colaboratory)
import numpy as np
import matplotlib.pyplot as plt
import glmnet_python
from glmnet import glmnet
import sys
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef
from cvglmnetPlot import cvglmnetPlot
from sklearn.datasets import load_iris
iris = load_iris()
X = np.array(iris["data"])
y = np.array(iris["target"], dtype="float64")
cvfit3 = cvglmnet(x=X.copy(), y=y.copy(),
                  ptype="deviance", family="multinomial")
lam_min = cvfit3["lambda_min"]
beta = cvglmnetCoef(cvfit3)
print(lam_min)
print(beta)
# Draw the CV plot
fig3 = plt.figure()
cvglmnetPlot(cvfit3)
fig3.savefig("img3.png")
K = 3
p = 5
n = 150
gamma = np.zeros((K, p))
for k in range(K):
    for j in range(p):
        gamma[k, j] = np.sum(beta[k][j])
v = np.zeros(n)
for i in range(n):
    max_value = -np.inf
    for k in range(K):
        value = gamma[k, 0] + np.dot(gamma[k, range(1, p)], X[i, :])
        if value > max_value:
            v[i] = k
            max_value = value
table_count(3, y, v)
def poisson_lasso(X, y, lam):
    p = X.shape[1]  # p includes the all-ones column
    beta = np.random.randn(p)
    gamma = np.random.randn(p)
    while np.sum((beta - gamma) ** 2) > 0.0001:
        beta = gamma
        s = np.dot(X, beta)
        w = np.exp(s)
        u = y - w
        z = s + u / w
        gamma_0, gamma_1 = W_linear_lasso(X[:, range(1, p)],
                                          z, np.diag(w), lam)
        gamma = np.block([gamma_0, gamma_1]).copy()
        print(gamma)
    return gamma
N = 100  # if lambda is too small, the iteration can diverge and no estimate is obtained
p = 3
X = np.random.randn(N, p)
X = np.concatenate([np.ones(N).reshape(N, 1), X], axis=1)
beta = np.random.randn(p + 1)
s = np.dot(X, beta)
y = np.random.poisson(lam=np.exp(s))
print(beta)
poisson_lasso(X, y, 2)
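# Illustrative follow-up (not in the original; gamma_hat is our name): compare
# the fitted Poisson means exp(X beta) with the observed counts for a few observations.
gamma_hat = poisson_lasso(X, y, 2)
print(np.exp(np.dot(X, gamma_hat))[:10])
print(y[:10])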
# Run on Linux (not available on Windows or Google Colaboratory)
import numpy as np
import matplotlib.pyplot as plt
import glmnet_python
from glmnet import glmnet
import sys
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef
from cvglmnetPlot import cvglmnetPlot
df = np.loadtxt("giants_2019.txt", delimiter="\t")
index = list(set(range(9)) - {1, 2})
X = np.array(df[:, index])
y = np.array(df[:, 1])
cvfit = cvglmnet(x=X.copy(), y=y.copy(), family="poisson")
cvglmnetCoef(cvfit)
import pandas as pd
df = pd.read_csv("kidney.csv")
df.drop(df.columns[0], axis=1, inplace=True)
df
def Surv(y, delta):
    # format survival times as strings, appending "+" to censored observations
    n = len(y)
    z = []
    for i in range(n):
        if delta[i] == 0:
            z.append(str(y[i]) + "+")
        else:
            z.append(str(y[i]))
    return z
y = df["time"]
delta = df["status"]
print(Surv(y, delta))
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
ax = None
for name, group in df.groupby("disease"):
    kmf.fit(group["time"], event_observed=group["status"],
            label="Disease type: " + str(name))
    if ax is None:
        ax = kmf.plot()
    else:
        ax = kmf.plot(ax=ax)
plt.title("Kaplan-Meier Curve")
plt.show()
def cox_lasso(X, y, delta, lam):
    delta[0] = 1  # treat the earliest observation as an event so that w[0] > 0
    n = len(y)
    p = X.shape[1]
    w = np.zeros(n)
    u = np.zeros(n)
    pi = np.zeros((n, n))
    beta = np.random.randn(p)
    gamma = np.zeros(p)
    while np.sum((beta - gamma) ** 2) > 10 ** (-4):
        beta = gamma.copy()
        s = np.dot(X, beta)
        v = np.exp(s)
        for i in range(n):
            for j in range(n):
                pi[i, j] = v[i] / np.sum(v[j:n])
        for i in range(n):
            u[i] = delta[i]
            w[i] = 0
            for j in range(i + 1):
                if delta[j] == 1:
                    u[i] = u[i] - pi[i, j]
                    w[i] = w[i] + pi[i, j] * (1 - pi[i, j])
        z = s + u / w
        W = np.diag(w)
        print(gamma)
        gamma_0, gamma = W_linear_lasso(X, z, W, lam=lam)
    return gamma
df = df.sort_values("time")
y = df["time"]
p = 4
delta = df["status"]
X = df[["age", "sex", "disease", "frail"]].copy()
size_mapping = {"GN": 1, "AN": 2, "PKD": 3, "Other": 0}
X["disease"] = df["disease"].map(size_mapping)
X = np.array(X)
y = np.array(y)
delta = np.array(delta)
cox_lasso(X, y, delta, 0)
cox_lasso(X, y, delta, 0.1)
cox_lasso(X, y, delta, 0.2)
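# Optional cross-check (illustrative sketch; df_check and its column names are
# ours, and a recent lifelines version is assumed): with lam=0 the estimate
# above should be close to an unpenalized Cox regression fit.
from lifelines import CoxPHFitter
df_check = pd.DataFrame(X, columns=["age", "sex", "disease", "frail"])
df_check["time"] = y
df_check["status"] = delta
cph = CoxPHFitter()
cph.fit(df_check, duration_col="time", event_col="status")
print(cph.params_)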
# Run on Linux (not available on Windows or Google Colaboratory)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glmnet_python
from glmnet import glmnet
from glmnetCoef import glmnetCoef
import sys
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef
from cvglmnetPlot import cvglmnetPlot
import os
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from lifelines import KaplanMeierFitter
# Run on Linux (not available on Windows or Google Colaboratory)
# Find the optimal coefficients (only 29 of the 7339 are nonzero)
base = importr("base")
base.load("LymphomaData.rda")
w = robjects.globalenv["patient.data"]
X = np.array(w[0]).T
y = np.array(w[1])
delta = np.array(w[2])
w = np.concatenate([y.reshape(240, 1), delta.reshape(240, 1)], axis=1)
fit = glmnet(x=X.copy(), y=w.copy(), family="cox")
beta = glmnetCoef(fit, s=np.float64([0.119787]))
print(np.sum(beta != 0))
# Run on Linux (not available on Windows or Google Colaboratory)
# Draw the Kaplan-Meier curve
z = np.sign(np.dot(X, beta))
df2 = pd.DataFrame(
    np.concatenate(
        [y.reshape(240, 1), delta.reshape(240, 1), z], axis=1))
df2.columns = ["time", "status", "sign"]
fig = plt.figure()
kmf = KaplanMeierFitter()
ax = None
for name, group in df2.groupby("sign"):
    kmf.fit(group["time"], event_observed=group["status"],
            label="z = " + str(name))
    if ax is None:
        ax = kmf.plot()
    else:
        ax = kmf.plot(ax=ax)
plt.title("Kaplan-Meier Curve")
fig.savefig("img7.png")