35
n <- 1000
p <- 10
x <- matrix(rnorm(n * p), nrow = n, ncol = p)
x <- cbind(rep(1, n), x)
beta <- rnorm(p + 1)
y <- x %*% beta + rnorm(n) * 0.2
# 通常のクロスバリデーションによる方法
cv.linear <- function(X, y, k) {
n <- length(y)
m <- n / k
S <- 0
for (j in 1:k) {
test <- ((j - 1) * m + 1):(j * m)
beta <- solve(t(X[-test,])%*% X[-test,])%*% t(X[-test,]) %*% y[-test,]
e <- y[test] - X[test, ] %*% beta
S <- S + drop(t(e) %*% e)
}
return(S/n)
}
# 公式を用いた方法
cv.fast <- function(X, y, k) {
n <- length(y)
m <- n / k
H <- X %*% solve(t(X) %*% X) %*% t(X)
I <- diag(rep(1, n))
e <- (I - H) %*% y
I <- diag(rep(1, m))
sum=0
for (j in 1:k) {
test <- ((j - 1) * m + 1):(j * m)
sum <- sum + norm(solve(I-H[test,test]) %*% e[test],"2")^2
}
return(sum / n)
}
cv.fast(x,y,10)
## [1] 0.04272494
cv.linear(x,y,10)
## [1] 0.04272494
plot(0, 0, xlab = "k", ylab = "実行時間",
xlim = c(2, n), ylim = c(0, 0.5), type = "n")
U <- NULL
V <- NULL
for (k in 10:n) {
if (n %% k == 0) {
t <- proc.time()[3]
cv.fast(x, y, k)
U <- c(U, k)
V <- c(V, (proc.time()[3] - t))
}
}
lines(U,V, col="blue")
U=NULL; V=NULL
for (k in 10:n) {
if (n %% k == 0) {
t <- proc.time()[3]
cv.linear(x, y, k)
U <- c(U, k)
V <- c(V, (proc.time()[3] - t))
}
}
lines(U,V, col="red")
legend("topleft", legend = c("cv.linear", "cv.fast"),
col = c("red", "blue"), lty = 1)
36
# データ生成
n <- 100
p <- 5
plot(0, 0, xlab = "k", ylab = "CVの値",
xlim = c(2, n), ylim = c(0.3, 1.5), type = "n")
for (j in 2:11) {
X <- matrix(rnorm(n * p), ncol = p)
X <- cbind(1, X)
beta <- rnorm(p + 1)
eps <- rnorm(n)
y <- X %*% beta + eps
U <- NULL
V <- NULL
for (k in 2:n) {
if (n %% k == 0) {
U = c(U,k);
V = c(V,cv.fast(X,y,k))
}
};
lines(U, V, col = j)
}
37
knn.1 <- function(x, y, z, k) {
x <- as.matrix(x)
n <- nrow(x)
p <- ncol(x)
dis <- array(dim = n)
for (i in 1:n)
dis[i] <- norm(z - x[i, ], "2")
S <- order(dis)[1:k] # 距離の小さいk個のiの集合
u <- sort(table(y[S]), decreasing = TRUE) # 最頻のy[i]と頻度
v <- names(u)[1] # 最頻のy[i]
return(v)
}
knn <- function(x, y, z, k) {
n <- nrow(z)
w <- array(dim = n)
for (i in 1:n)
w[i] <- knn.1(x, y, z[i, ], k)
return(w)
}
df <- iris
df <- df[sample(1:150, 150, replace = FALSE), ]
n <- nrow(df)
U <- NULL
V <- NULL
for (k in 1:10) {
top.seq <- 1 + seq(0, 135, 10)
S <- 0
for (top in top.seq) {
index <- top:(top+14)
knn.ans <- knn(df[-index, 1:4], df[-index, 5], df[index, 1:4], k)
ans <- df[index,5]
S <- S + sum(knn.ans != ans)
}
S <- S / n
U <- c(U, k)
V <- c(V, S)
}
plot(0, 0, type = "n", xlab = "k", ylab = "誤り率",
xlim = c(1, 10), ylim = c(0, 0.1), main = "cvによる誤り率の評価")
lines(U, V, col = "red")
38
func.1 <- function(data, index) {
X <- data$X[index]
Y <- data$Y[index]
return((var(Y) - var(X)) / (var(X) + var(Y) - 2 * cov(X, Y)))
}
bt <- function(df, func, r) {
m <- nrow(df)
org <- func(df,1:m)
u <- array(dim = r)
for (j in 1:r) {
index <- sample(1:m,m,replace=TRUE)
u[j] <- func(df, index)
}
return(list(original = org, bias = mean(u) - org, stderr = sd(u)))
}
library(ISLR)
bt(Portfolio, func.1, 1000) # 実行例
## $original
## [1] 0.1516641
##
## $bias
## [1] 0.006947538
##
## $stderr
## [1] 0.1848633
39
func.1 <- function(data, index) {
X <- data$X[index]
Y <- data$Y[index]
return((var(Y) - var(X)) / (var(X) + var(Y) - 2 * cov(X, Y)))
}
df <- read.table("crime.txt")
for (j in 1:3) {
func.2 <- function(data, index)coef(lm(V1 ~ V3 + V4, data = data, subset = index))[j]
print(bt(df, func.2, 1000))
}
## $original
## (Intercept)
## 621.426
##
## $bias
## (Intercept)
## 49.16254
##
## $stderr
## [1] 229.575
##
## $original
## V3
## 11.85833
##
## $bias
## V3
## -0.643803
##
## $stderr
## [1] 3.453364
##
## $original
## V4
## -5.973412
##
## $bias
## V4
## -0.1179438
##
## $stderr
## [1] 3.196886
summary(lm(V1 ~ V3 + V4, data = df))
##
## Call:
## lm(formula = V1 ~ V3 + V4, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -356.90 -162.92 -60.86 100.69 784.30
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 621.426 222.685 2.791 0.00758 **
## V3 11.858 2.568 4.618 3.02e-05 ***
## V4 -5.973 3.561 -1.677 0.10013
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 246.6 on 47 degrees of freedom
## Multiple R-squared: 0.3247, Adjusted R-squared: 0.296
## F-statistic: 11.3 on 2 and 47 DF, p-value: 9.838e-05