2. Linear Regression

3

N <- 100  # Sample size N

# Randomly generate coefficients of the line
a <- rnorm(1)
b <- rnorm(1)

# Randomly generate points around the line
x <- rnorm(N)
y <- a*x + b + rnorm(N)

plot(y ~ x)  # Plot points

abline(lm(y ~ x), col = "red")  # Fitted line
abline(h = 0)  # x-axis
abline(v = 0)  # y-axis

# Shift the centroid of the N points to the origin
x <- x - mean(x)
y <- y - mean(y)

abline(lm(y ~ x), col = "blue")  # Fitted line
abline(h = 0)  # x-axis
abline(v = 0)  # y-axis

legend("topleft", c("Before centering", "After centering"), lty = 1, col = c("red", "blue"))

remove(list = ls())

12

N <- 100
x <- rnorm(N)
y <- rnorm(N)
x.bar <- mean(x)
y.bar <- mean(y)
beta.0 <- sum(y.bar * sum(x^2) - x.bar * sum(x*y)) / sum((x - x.bar) ^ 2)
beta.1 <- sum((x - x.bar) * (y - y.bar)) / sum((x - x.bar) ^ 2)
RSS <- sum((y - beta.0 - beta.1*x) ^ 2)
RSE <- sqrt(RSS / (N - 1 - 1))
B.0 <- sum(x^2) / N / sum((x - x.bar)^2)
B.1 <- 1 / sum((x - x.bar) ^ 2)
se.0 <- RSE * sqrt(B.0)
se.1 <- RSE * sqrt(B.1)
t.0 <- beta.0 / se.0
t.1 <- beta.1 / se.1
p.0 <- 2 * (1 - pt(abs(t.0), N - 2))  # p-value (probability outside this value)
p.1 <- 2 * (1 - pt(abs(t.1), N - 2))  # p-value (probability outside this value)

beta.0

## [1] 0.09453751

se.0

## [1] 0.1032435

t.0

## [1] 0.9156748

p.0

## [1] 0.3620852

beta.1

## [1] -0.06989407

se.1

## [1] 0.1094299

t.1

## [1] -0.6387111

p.1

## [1] 0.5245009

lm(y ~ x)

## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##     0.09454     -0.06989

summary(lm(y ~ x))

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2361 -0.7286 -0.1229  0.6611  2.6811 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.09454    0.10324   0.916    0.362
## x           -0.06989    0.10943  -0.639    0.525
## 
## Residual standard error: 1.03 on 98 degrees of freedom
## Multiple R-squared:  0.004146,   Adjusted R-squared:  -0.006016 
## F-statistic: 0.408 on 1 and 98 DF,  p-value: 0.5245

13

N <- 100
r <- 1000
T <- NULL
for (i in 1:r) {
  x <- rnorm(N)
  y <- rnorm(N)
  x.bar <- mean(x)
  y.bar <- mean(y)
  fit <- lm(y ~ x)
  beta <- fit$coefficients
  RSS <- sum((y - fit$fitted.values) ^ 2)
  RSE <- sqrt(RSS / (N - 1 - 1))
  B.1 <- 1 / sum((x - x.bar) ^ 2)
  se.1 <- RSE * sqrt(B.1)
  T <- c(T, beta[2] / se.1)
}

hist(T, breaks = sqrt(r), probability = TRUE, xlab = "Value of t", ylab = "Probability Density",
     main = "Histogram of t values and theoretical values (red)")
curve(dt(x, N - 2), -3, 3, type = "l", col = "red", add = TRUE)

N <- 100
r <- 1000
T <- NULL
for (i in 1:r) {
  x <- rnorm(N)
  y <- 0.1 * x + rnorm(N)
  x.bar <- mean(x)
  y.bar <- mean(y)
  fit <- lm(y ~ x)
  beta <- fit$coefficients
  RSS <- sum((y - fit$fitted.values) ^ 2)
  RSE <- sqrt(RSS / (N - 1 - 1))
  B.1 <- 1 / sum((x - x.bar) ^ 2)
  se.1 <- RSE * sqrt(B.1)
  T <- c(T, beta[2] / se.1)
}

hist(T, breaks = sqrt(r), probability = TRUE, xlab = "Value of t", ylab = "Probability Density",
     main = "Histogram of t values and theoretical values (red) (executed after substitution)")
curve(dt(x, N - 2), -3, 3, type = "l", col = "red", add = TRUE)

15(d)

R2 <- function(x, y) {
  y.hat <- lm(y ~ x)$fitted.values
  y.bar <- mean(y)
  RSS <- sum((y - y.hat) ^ 2)
  TSS <- sum((y - y.bar) ^ 2)
  return(1 - RSS / TSS)
}

N <- 100
m <- 2
x <- matrix(rnorm(m * N), ncol = m)
y <- rnorm(N)
R2(x, y)

## [1] 0.02966563

N <- 100
m <- 1
x <- matrix(rnorm(m * N), ncol = m)
y <- rnorm(N)
R2(x, y)

## [1] 0.005950415

cor(x,y)^2

##             [,1]
## [1,] 0.005950415

16

library(MASS)
X <- as.matrix(Boston)
p <- ncol(X)
T <- NULL
for (j in 1:p)
  T <- c(T, 1/(1-R2(X[,-j],X[,j])))
T

##  [1] 1.831537 2.352186 3.992503 1.095223 4.586920 2.260374 3.100843 4.396007
##  [9] 7.808198 9.205542 1.993016 1.381463 3.581585 3.855684

18

# Generate data
N <- 100
p <- 1
X <- matrix(rnorm(N * p), ncol = p)
X <- cbind(rep(1, N), X)
beta <- rnorm(p + 1)
epsilon <- rnorm(N)
y <- X %*% beta + epsilon

# Define function f(x). U is the inverse matrix of t(X) %*% X
U <- solve(t(X) %*% X)
beta.hat <- U %*% t(X) %*% y
RSS <- sum((y - X %*% beta.hat) ^ 2)
RSE <- sqrt(RSS / (N - p - 1))
alpha <- 0.05

f <- function(x) {
  x <- cbind(1, x)
  range <- qt(df = N - p - 1, 1 - alpha / 2) * RSE * sqrt(x %*% U %*% t(x))
  return(list(lower = x %*% beta.hat - range, upper = x %*% beta.hat + range))
}
g <- function(x) {
  x <- cbind(1, x)
  range <- qt(df = N - p - 1, 1 - alpha / 2) * RSE * sqrt(1 + x %*% U %*% t(x))
  return(list(lower = x %*% beta.hat - range, upper = x %*% beta.hat + range))
}

# Display confidence intervals in a graph
x.seq <- seq(-10, 10, 0.1)
lower.seq <- NULL
for (x in x.seq)
  lower.seq <- c(lower.seq, f(x)$lower)
upper.seq <- NULL
for (x in x.seq)
  upper.seq <- c(upper.seq, f(x)$upper)
x.lim <- c(min(x.seq), max(x.seq))
y.lim <- c(min(lower.seq), max(upper.seq))
plot(x.seq, lower.seq, col = "blue", xlim = x.lim, ylim = y.lim,
     xlab = "x", ylab = "y", type = "l")
par(new = TRUE)
plot(x.seq, upper.seq, col = "red", xlim = x.lim, ylim = y.lim,
     xlab = "", ylab = "", type = "l", axes = FALSE)
par(new = TRUE)

lower.seq <- NULL
for (x in x.seq)
  lower.seq <- c(lower.seq, g(x)$lower)
upper.seq <- NULL
for (x in x.seq)
  upper.seq <- c(upper.seq, g(x)$upper)
x.lim <- c(min(x.seq), max(x.seq))
y.lim <- c(min(lower.seq), max(upper.seq))
plot(x.seq, lower.seq, col = "blue", xlim = x.lim, ylim = y.lim,
     xlab = "", ylab = "", type = "l", lty = 4, axes = FALSE)
par(new = TRUE)
plot(x.seq, upper.seq, col = "red", xlim = x.lim, ylim = y.lim,
     xlab = "", ylab = "", type = "l", lty = 4, axes = FALSE)
abline(beta.hat[1], beta.hat[2])

Summary (R Program)

Chapter 2 Linear Regression

3

12

13

15(d)

16

18