第2章 一般化線形回帰

21(c)

f <- function(x) {
  return(exp(beta.0 + beta * x) / (1 + exp(beta.0 + beta * x)))
}

beta.0 <- 0
beta.seq <- c(0, 0.2, 0.5, 1, 2, 10)
m <- length(beta.seq)
beta <- beta.seq[1]
plot(f, xlim = c(-10, 10), ylim = c(0, 1), 
     xlab = "x", ylab = "y", col = 1, main = "ロジスティック曲線")
for (i in 2:m) {
  beta <- ## 空欄(1) ##
  par(new = TRUE)
  plot(## 空欄(2) ##, xlim = c(-10, 10), ylim = c(0, 1),
       xlab = "", ylab = "", axes = FALSE, , col = i)
}
legend("topleft", legend = beta.seq, col = 1:length(beta.seq), lwd = 2, cex = .8)
par(new = FALSE)

22

## データ生成
N <- 1000
p <- 2
X <- matrix(rnorm(N * p), ncol = p)
X <- cbind(rep(1, N), X)
beta <- rnorm(p + 1)
y <- array(N)
s <- as.vector(X %*% beta)
prob <- 1 / (1 + exp(s))
for (i in 1:N) {
  if (runif(1) > prob[i]) {
    y[i] <- 1
  } else {
    y[i] <- -1
  }
}
beta

## 最尤推定値の計算
beta <- Inf
gamma <- rnorm(p + 1)
while (sum((beta - gamma) ^ 2) > 0.001) {
  beta <- gamma
  s <- as.vector(X %*% beta)
  v <- exp(-s * y)
  u <- y * v / (1 + v)
  w <- v / (1 + v) ^ 2
  z <- s + u / w
  W <- diag(w)
  gamma <- as.vector(solve(t(X) %*% W %*% X) %*% t(X) %*% W %*% z)
  print(gamma)
}

24

glm <- glmnet(x, y, lambda = lambda, family = "binomial")
df <- read.csv("breastcancer.csv")
x <- as.matrix(df[, 1:1000])
y <- as.vector(df[, 1001])
cv <- cv.glmnet(x, y, family = "binomial")
cv2 <- cv.glmnet(x, y, family = "binomial", type.measure = "class")
par(mfrow = c(1, 2))
plot(cv)
plot(cv2)
par(mfrow = c(1, 1))

26

y %*% rep(1, nc) でエラー: 適切な引数ではありません

library(glmnet)

df <- read.table("iris.txt", sep = ",")
x <- as.matrix(df[, 1:4])
y <- as.vector(df[, 5])
y <- as.numeric(y == "Iris-setosa")
cv <- cv.glmnet(x, y, family = "binomial")
cv2 <- cv.glmnet(x, y, family = "binomial", type.measure = "class")
par(mfrow = c(1, 2))
plot(cv)
plot(cv2)
par(mfrow = c(1, 1))
lambda <- cv$lambda.min
result <- glmnet(x, y, lambda = lambda, family = "binomial")
beta <- result$beta
beta.0 <- result$a0
f <- function(x) {
  return(exp(beta.0 + x %*% beta))
}
z <- array(dim = 150)
for (i in 1:150)
  z[i] <- drop(f(x[i, ]))
yy <- (z > 1)
sum(yy == y)
library(glmnet)

df <- read.table("iris.txt", sep = ",")
x <- as.matrix(df[, 1:4])
y <- as.vector(df[, 5])
n <- length(y)
u <- array(dim = n)
for (i in 1:n) {
  if (y[i] == "Iris-setosa") {
    u[i] <- 1
  } else if (y[i] == "Iris-versicolor") {
    u[i] <- 2 
  } else {
    u[i] <- 3
  }
}
u <- as.numeric(u)
cv <- cv.glmnet(x, u, family = "multinomial")
cv2 <- cv.glmnet(x, u, family = "multinomial", type.measure = "class")
par(mfrow = c(1, 2))
plot(cv)
plot(cv2)
par(mfrow = c(1, 1))
lambda <- cv$lambda.min
result <- glmnet(x, y, lambda = lambda, family = "multinomial")
beta <- result$beta
beta.0 <- result$a0
v <- array(dim = n)
for (i in 1:n) {
  max.value <- -Inf
  for (j in 1:3) {
    value <- ## 空欄 ##
    if (value > max.value) {
      v[i] <- j
      max.value <- value
    }
  }
}
sum(u == v)

28

## データ生成
N <- 1000
p <- 7
beta <- rnorm(p + 1)
X <- matrix(rnorm(N * p), ncol = p)
X <- cbind(rep(1, N), X)
s <- X %*% beta
y <- rpois(N, lambda = exp(s))
beta

## 最尤推定値の計算
lambda <- 100
beta <- Inf
gamma <- rnorm(p + 1)
while (sum((beta - gamma) ^ 2) > 0.01) {
  beta <- gamma
  s <- as.vector(X %*% beta)
  w <- ## 空欄(1) ##
  u <- ## 空欄(2) ##
  z <- ## 空欄(3) ##
  W <- diag(w)
  gamma <- coordinate(W, z, gamma)
  print(gamma)
}

29

df <- read.csv("baseball.csv", sep = "", , header = "FALSE")
df <- df[, -c(1, 2)]
df[[6]]

30

library(survival)
data(kidney)
names(kidney)
y <- kidney$time
delta <- kidney$status
Surv(y, delta)

30(b)

fit <- survfit(Surv(time, status) ~ disease, data = kidney)
plot(fit, xlab = "時間", ylab = "生存率", col = c("red", "green", "blue", "black"))
legend(300, 0.8, legend = c("その他", "GN", "AN", "PKD"), 
       lty = 1, col = c("red", "green", "blue", "black"))

## 以下も実行する
library(ranger)
library(ggplot2)
library(dplyr)
library(ggfortify)

autoplot(fit)

33(a)

library(survival)
load("LymphomaData.rda")
attach("LymphomaData.rda")
names(patient.data)
x <- t(patient.data$x)
y <- patient.data$time
delta <- patient.data$status
Surv(y, delta)
cv.fit <- cv.glmnet(x, Surv(y, delta), family = "cox")

33(b)

fit2 <- glmnet(x, Surv(y, delta), lambda = cv.fit$lambda.min, family = "cox")
z <- sign(drop(x %*% fit2$beta))
fit3 <- survfit(Surv(y, delta) ~ ## 空欄 ##)
autoplot(fit3)
mean(y[z == 1])
mean(y[z == -1]

34(a)

library(ElemStatLearn)
library(glmnet)
library(sparseSVM)

data(SAheart)
df <- SAheart
df[, 5] <- as.numeric(df[, 5])
x <- as.matrix(df[, 1:9])
y <- as.vector(df[, 10])
p <- 9
binom.fit <- glmnet(x, y, family = "binomial")
svm.fit <- sparseSVM(x, y)
par(mfrow = c(1, 2))
plot(binom.fit)
plot(svm.fit, xvar = "norm")
par(mfrow = c(1, 1))

## 出力は似ているが,凡例がないので,係数の値が近いかどうかわからない。
## そこで,自分でグラフを作成してみた。

coef.binom <- binom.fit$beta
coef.svm <- coef(svm.fit)[2:(p + 1), ]
norm.binom <- apply(abs(coef.binom), 2, sum)
norm.binom <- norm.binom / max(norm.binom) 
norm.svm <- apply(abs(coef.svm), 2, sum)
norm.svm <- norm.svm / max(norm.svm)
par(mfrow <- c(1, 2))
plot(norm.binom, xlim = c(0, 1), ylim = c(min(coef.binom), max(coef.binom)),
     main = "ロジスティック回帰", xlab = "ノルム", ylab = "係数", type = "n")
for (i in 1:p)
  lines(norm.binom, coef.binom[i, ], col = i)
legend("topleft", legend = colnames(df), col = 1:p, lwd = 2, cex = .8)
par(mfrow = c(1, 1))

34(b)

df <- read.csv("http://web.stanford.edu/~hastie/CASI_files/DATA/leukemia_big.csv")
dim(df)
names <- colnames(df)
x <- t(as.matrix(df))  #ゲノムデータでは,行と列が反対になっていることが多い
y <- as.numeric(substr(names, 1, 3) == "ALL")
p <- 7128
binom.fit <- glmnet(x, y, family = "binomial")
svm.fit <- sparseSVM(x, y)
coef.binom <- binom.fit$beta
coef.svm <- coef(svm.fit)[2:(p + 1), ]
norm.binom <- apply(abs(coef.binom), 2, sum)
norm.binom <- norm.binom / max(norm.binom) 
norm.svm <- apply(abs(coef.svm), 2, sum)
norm.svm <- norm.svm / max(norm.svm)