第2章 線形回帰(問題19~31)

20

f <- function(x) {
  return(exp(beta.0 + beta * x) / (1 + exp(beta.0 + beta * x)))
}

beta.0 <- 0
beta.seq <- c(0, 0.2, 0.5, 1, 2, 10)
m <- length(beta.seq)
beta <- beta.seq[1]
plot(f, xlim = c(-10, 10), ylim = c(0, 1),
     xlab = "x", ylab = "y", col = 1, main = "ロジスティック曲線")
for (i in 2:m) {
  beta <- beta.seq[i]
  par(new = TRUE)
  plot(f, xlim = c(-10, 10), ylim = c(0, 1),
       xlab = "", ylab = "", axes = FALSE, col = i)
}
legend("topleft", legend = beta.seq, col = 1:length(beta.seq),
       lwd = 2, cex = .8)
par(new = FALSE)

22

f <- function(z) {
  return(z[1] ^ 2 + z[2] ^ 2 - 1)
}

f.x <- function(z) {
  return(2 * z[1])
}

f.y <- function(z) {
  return(2 * z[2])
}

g <- function(z) {
  return(z[1] + z[2])
}

g.x <- function(z) {
  return(1)
}

g.y <- function(z) {
  return(1)
}

z <- c(1, 2)

24

# データ生成
N <- 1000
p <- 2
X <- matrix(rnorm(N * p), ncol = p)
X <- cbind(rep(1, N), X)
beta <- rnorm(p + 1)
y <- array(N)
s <- as.vector(X %*% beta)
prob <- 1 / (1 + exp(s))
for (i in 1:N) {
  if (runif(1) > prob[i]) {
    y[i] <- 1
  } else {
    y[i] <- -1
  }
}
beta

# 最尤推定
beta <- Inf
gamma <- rnorm(p + 1)
while (sum((beta - gamma) ^ 2) > 0.001) {
  beta <- gamma
  s <- as.vector(X %*% beta)
  v <- exp(-s * y)
  u <- ## 空欄(1) ##
  w <- ## 空欄(2) ##
  W <- diag(w)
  z <- ## 空欄(3) ##
  gamma <- as.vector(solve(t(X) %*% W %*% X) %*% t(X) %*% W %*% z)
  print(gamma)
}

26

n <- 100
x <- c(rnorm(n) + 1, rnorm(n) - 1)
y <- c(rep(1, n), rep(-1, n))
train <- sample(1:(2*n), n, replace = FALSE)
df <- data.frame(x, y)
x <- as.matrix(df[train, 1])
y <- as.vector(df[train, 2])
p <- 1
X <- cbind(1, x)
beta <- 0
gamma <- rnorm(p + 1)
while (sum((beta - gamma) ^ 2) > 0.001) {
  beta <- gamma
  s <- as.vector(X %*% beta)
  v <- exp(-s * y)
  u <- y * v / (1 + v)
  w <- v / (1 + v) ^ 2
  W <- diag(w)
  z <- s + u / w
  gamma <- as.vector(solve(t(X) %*% W %*% X) %*% t(X) %*% W %*% z)
  print(gamma)
}
x <- ## 空欄(1) ##
y <- as.vector(df[-train, 2])
z <- ## 空欄(2) ##
table(y, z) 

28

# データ生成
mu.1 <- 2
mu.2 <- 2
sigma.1 <- 2
sigma.2 <- 2
rho.1 <- 0

mu.3 <- -3
mu.4 <- -3
sigma.3 <- 1
sigma.4 <- 1
rho.2 <- -0.8

n <- 100

u <- rnorm(n)
v <- rnorm(n)
x.1 <- sigma.1 * u + mu.1
y.1 <- (rho.1 * u + sqrt(1 - rho.1 ^ 2) * v) * sigma.2 + mu.2

u <- rnorm(n)
v <- rnorm(n)
x.2 <- sigma.3 * u + mu.3
y.2 <- (rho.2 * u + sqrt(1 - rho.2 ^ 2) * v) * sigma.4 + mu.4

# 分布を推定して,境界線を引く
f <- function(w, mu, inv, de) {
  return(-0.5 * t(w - mu) %*% inv %*% (w - mu) - 0.5 * log(de)
}

mu.1 <- c(mean(x.1), mean(y.1))
mu.2 <- c(mean(x.2), mean(y.2))

df <- data.frame(x.1, y.1)  #
mat <- cov(df)              #
inv.1 <- solve(mat)         #
de.1 <- det(mat)            #

df <- data.frame(x.2, y.2)  #
mat <- cov(df)              #
inv.2 <- solve(mat)         #
de.2 <- det(mat)            #

f.1 <- function(u, v) {
  return(f(c(u, v), mu.1, inv.1, de.1))
}

f.2 <- function(u, v) {
  return(f(c(u, v), mu.2, inv.2, de.2))
}

u <- v <- seq(-6, 6, length = 50)
n <- length(u)
w <- array(dim = c(n, n))
pi.1 <- 0.5
pi.2 <- 0.5
for (i in 1:n)
  for (j in 1:n)
    w[i, j] <- pi.1 * f.1(u[i], v[j]) - pi.2 * f.2(u[i], v[j])
contour(u, v, w, level = 1)
points(x.1, y.1, col = "red")
points(x.2, y.2, col = "blue")

29

f <- function(w, mu, inv, de) {
  return(-0.5 * (w - mu) %*% inv %*% t(w - mu) - 0.5 * log(de))
}

df <- iris
df[[5]] <- c(rep(1, 50), rep(2, 50), rep(3, 50))
n <- nrow(df)
train <- sample(1:n, n / 2, replace = FALSE)
test <- setdiff(1:n, train)
mat <- as.matrix(df[train, ])
mu <- list()
covv <- list()
for (j in 1:3) {
  x <- mat[mat[, 5] == j, 1:4]
  mu[[j]] <- c(mean(x[, 1]), mean(x[, 2]), mean(x[, 3]), mean(x[, 4]))
  covv[[j]] <- cov(x)
}

g <- function(v, j) {
  return(f(v, mu[[j]], solve(covv[[j]]), det(covv[[j]])))
}

z <- array(dim = length(test))
for (i in test) {
  u <- as.matrix(df[i, 1:4])
  a <- g(u, 1)
  b <- g(u, 2)
  c <- g(u, 3)
  if (a < b) {
    if (b < c) {
      z[i] <- 3
    } else {
      z[i] <- 2
    }
  } else {
    if (a < c) {
      z[i] <- 3
    } else {
      z[i] <- 1
    }
  }
}
table(z[test], df[test, 5])

30

knn.1 <- function(x, y, z, k) {
  x <- as.matrix(x)
  n <- nrow(x)
  p <- ncol(x)
  dis <- array(dim = n)
  for (i in 1:n)
    dis[i] <- norm(z - x[i, ], "2")
  S <- order(dis)[1:k]  # 距離の小さいk個のiの集合
  u <- sort(table(y[S]), decreasing = TRUE)  # 最頻のy[i]と頻度
  v <- names(u)[1]  # 最頻のy[i]
  return(v)
}

31

N.0 <- 10000
N.1 <- 1000
mu.1 <- 1
mu.0 <- -1
var.1 <- 1
var.0 <- 1
x <- rnorm(N.0, mu.0, var.0)  # 病気でない人
y <- rnorm(N.1, mu.1, var.1)  # 病気の人
plot(1:1, 1:1, xlim = c(0, 1), ylim = c(0, 1),
     xlab = "False Positive", ylab = "True Positive",
     main = "ROC曲線", type = "n")
theta.seq <- exp(seq(-10, 100, 0.1))
U <- NULL
V <- NULL
for (theta in theta.seq) {
  # 病気でない人を病気とみなす
  u <- sum(dnorm(x, mu.1, var.1) / dnorm(x, mu.0, var.0) > theta) / N.0
  # 病気の人を病気とみなす
  v <- ## 空欄 ##
  U <- c(U, u)
  V <- c(V, v)
}
lines(U, V)
M <- length(theta.seq) - 1
AUC <- 0
for (i in 1:M)
  AUC <- AUC + abs(U[i + 1] - U[i]) * V[i]
text(0.5, 0.5, paste("AUC = ", AUC))