Chapter 2 Linear Regression

3

In [ ]:
import statsmodels.api as sm
from sklearn import linear_model
from numpy.random import randn
from scipy import stats
import scipy
import matplotlib.pyplot as plt
import numpy as np
In [ ]:
def min_sq(x, y):  # least-squares estimates of the slope and intercept
    x_bar, y_bar = np.mean(x), np.mean(y)
    beta_1 = np.dot(x-x_bar, y-y_bar)/np.linalg.norm(x-x_bar)**2
    beta_0 = y_bar-beta_1*x_bar
    return [beta_1, beta_0]
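As a quick sanity check (a sketch added here, not part of the original exercise), the estimates from min_sq should agree with NumPy's built-in degree-1 least-squares fit, np.polyfit:

In [ ]:
x_chk = randn(50)
y_chk = 3*x_chk+1+randn(50)
s1, i1 = min_sq(x_chk, y_chk)         # our slope and intercept
s2, i2 = np.polyfit(x_chk, y_chk, 1)  # NumPy's least-squares line (slope first)
np.isclose(s1, s2), np.isclose(i1, i2)  # expect (True, True)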
In [ ]:
N = 100         # sample size
a = randn(1)+2  # true slope
b = randn(1)    # true intercept
x = randn(N)
y = a*x+b+randn(N)       # generate artificial data
a1, b1 = min_sq(x, y)    # slope and intercept
xx = x-np.mean(x)
yy = y-np.mean(y)        # centering
a2, b2 = min_sq(xx, yy)  # slope and intercept after centering

x_seq = np.arange(-5, 5, 0.1)
y_pre = x_seq*a1+b1
yy_pre = x_seq*a2+b2
plt.scatter(x, y, c="black")
plt.axhline(y=0, c="black", linewidth=0.5)
plt.axvline(x=0, c="black", linewidth=0.5)
plt.plot(x_seq, y_pre, c="blue", label="Before centering")
plt.plot(x_seq, yy_pre, c="orange", label="After centering")
plt.legend(loc="upper left")
Out[ ]:
<matplotlib.legend.Legend at 0x7b7ea636c970>
[Figure: scatter plot of the data with the fitted lines before (blue) and after (orange) centering]
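A quick check (a sketch, not in the original code): centering leaves the slope essentially unchanged, while the intercept of the centered fit is zero up to floating-point error.

In [ ]:
a1-a2, b2  # slope difference ~0; centered intercept ~0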

12

In [ ]:
N = 100
x = randn(N)
y = randn(N)
beta_1, beta_0 = min_sq(x, y)
RSS = np.linalg.norm(y-beta_0-beta_1*x)**2  # residual sum of squares
RSE = np.sqrt(RSS/(N-1-1))                  # residual standard error, df = N-p-1 = N-2
B_0 = (np.linalg.norm(x)**2/N)/np.linalg.norm(x-np.mean(x))**2  # variance factor for the intercept
B_1 = 1/np.linalg.norm(x-np.mean(x))**2                         # variance factor for the slope
se_0 = RSE*np.sqrt(B_0)
se_1 = RSE*np.sqrt(B_1)
t_0 = beta_0/se_0  # t-statistics under H0: coefficient = 0
t_1 = beta_1/se_1
p_0 = 2*(1-stats.t.cdf(np.abs(t_0), N-2))  # two-sided p-values
p_1 = 2*(1-stats.t.cdf(np.abs(t_1), N-2))
beta_0, se_0, t_0, p_0  # intercept (not displayed; only the last expression is shown)
beta_1, se_1, t_1, p_1  # regression coefficient (slope)
Out[ ]:
(-0.014173518946334599,
 0.08960543865080017,
 -0.15817699416181621,
 0.874642812487499)
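The hand-computed slope statistics can be cross-checked against scipy.stats.linregress (a sketch, not part of the original exercise):

In [ ]:
lr = stats.linregress(x, y)
lr.slope, lr.stderr, lr.pvalue  # should match beta_1, se_1, p_1 above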
In [ ]:
reg = linear_model.LinearRegression()
x = x.reshape(-1, 1)  # sklearn expects 2-D arrays;
y = y.reshape(-1, 1)  # -1 lets NumPy infer that dimension from the other
reg.fit(x, y)  # Execute
reg.coef_, reg.intercept_  # Regression coefficient beta_1, intercept beta_0
Out[ ]:
(array([[-0.01417352]]), array([0.1119832]))
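As a quick consistency check (a sketch), predictions from the fitted sklearn model coincide with the line built from its coefficients:

In [ ]:
np.allclose(reg.predict(x), reg.intercept_+x*reg.coef_)  # expect True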
In [ ]:
X = np.insert(x, 0, 1, axis=1)
model = sm.OLS(y, X)
res = model.fit()
print(res.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.010
Method:                 Least Squares   F-statistic:                   0.02502
Date:                Tue, 04 Jun 2024   Prob (F-statistic):              0.875
Time:                        02:19:11   Log-Likelihood:                -131.99
No. Observations:                 100   AIC:                             268.0
Df Residuals:                      98   BIC:                             273.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1120      0.093      1.209      0.229      -0.072       0.296
x1            -0.0142      0.090     -0.158      0.875      -0.192       0.164
==============================================================================
Omnibus:                        0.089   Durbin-Watson:                   2.321
Prob(Omnibus):                  0.956   Jarque-Bera (JB):                0.218
Skew:                           0.060   Prob(JB):                        0.897
Kurtosis:                       2.805   Cond. No.                         1.17
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
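As a side note (a sketch, not part of the original exercise), statsmodels' sm.add_constant is equivalent to the np.insert call used above:

In [ ]:
res2 = sm.OLS(y, sm.add_constant(x)).fit()
res2.params  # same intercept and slope as in the summary above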

13

In [ ]:
N = 100
r = 1000
T = []
for i in range(r):
    x = randn(N)
    y = randn(N)
    beta_1, beta_0 = min_sq(x, y)
    pre_y = beta_0+beta_1*x  # Predicted value of y
    RSS = np.linalg.norm(y-pre_y)**2
    RSE = np.sqrt(RSS/(N-1-1))
    B_1 = 1/np.linalg.norm(x-np.mean(x))**2
    se_1 = RSE*np.sqrt(B_1)
    T.append(beta_1/se_1)
plt.hist(T, bins=20, range=(-3, 3), density=True)
x = np.linspace(-4, 4, 400)
plt.plot(x, stats.t.pdf(x, 98))
plt.title("When the null hypothesis holds")
plt.xlabel("Value of t")
plt.ylabel("Probability Density")
Out[ ]:
Text(0, 0.5, 'Probability Density')
[Figure: histogram of the t-values with the t-distribution density (98 degrees of freedom) overlaid]
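Because the data are generated under the null hypothesis beta_1 = 0, the empirical rejection rate at level 0.05 should be close to 0.05 (a sketch, not in the original code):

In [ ]:
crit = stats.t.ppf(0.975, df=N-2)  # two-sided 5% critical value
np.mean(np.abs(T) > crit)          # empirical rejection rate, roughly 0.05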

15

In [ ]:
def R2(x, y):  # coefficient of determination
    xx = np.insert(x, 0, 1, axis=1)       # add the intercept column
    beta = np.linalg.inv(xx.T@xx)@xx.T@y  # least-squares estimate
    y_hat = np.dot(xx, beta)
    y_bar = np.mean(y)
    RSS = np.linalg.norm(y-y_hat)**2      # residual sum of squares
    TSS = np.linalg.norm(y-y_bar)**2      # total sum of squares
    return 1-RSS/TSS


N = 100
m = 2
x = randn(N, m)
y = randn(N)
R2(x, y)  # not displayed: only the last expression in the cell is shown
# For a single covariate, R^2 equals the squared correlation coefficient
x = randn(N, 1)
y = randn(N)
R2(x, y)
Out[ ]:
0.00023430110506250656
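To verify the claim in the comment (a sketch): for a single covariate, R^2 equals the squared sample correlation computed with np.corrcoef.

In [ ]:
r = np.corrcoef(x[:, 0], y)[0, 1]  # sample correlation coefficient
R2(x, y), r**2                     # the two values agree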

16

In [ ]:
X = np.loadtxt("boston.txt", delimiter="\t")


def VIF(x):  # variance inflation factor: VIF_j = 1/(1-R_j^2)
    p = x.shape[1]
    values = []
    for j in range(p):
        S = list(set(range(p))-{j})                # all columns except j
        values.append(1/(1-R2(x[:, S], x[:, j])))  # regress column j on the rest
    return values


VIF(X)
Out[ ]:
[1.8315366837134723,
 2.352185889014946,
 3.9925031533175335,
 1.095222668768822,
 4.58692024225555,
 2.2603743566681325,
 3.1008428195459823,
 4.3960072515073945,
 7.808198432681462,
 9.205542091810138,
 1.9930156565532828,
 1.3814629538442613,
 3.5815848036702076,
 3.8556842688338224]
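The hand-rolled VIF can be cross-checked against statsmodels' variance_inflation_factor (a sketch; a constant column is prepended so that each auxiliary regression includes an intercept, matching R2 above):

In [ ]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
XX = sm.add_constant(X)  # column 0 is the constant
[variance_inflation_factor(XX, j) for j in range(1, XX.shape[1])]  # should match VIF(X)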

18

In [ ]:
N = 100
p = 1
X = randn(N, p)
X = np.insert(X, 0, 1, axis=1)
beta = np.array([1, 1])
epsilon = randn(N)
y = X@beta+epsilon
U = np.linalg.inv(X.T@X)               # (X^T X)^{-1}
beta_hat = U@X.T@y                     # least-squares estimate
RSS = np.linalg.norm(y-X@beta_hat)**2  # residual sum of squares
RSE = np.sqrt(RSS/(N-p-1))             # residual standard error
alpha = 0.05

# f(x): confidence interval for the mean response; g(x): prediction interval


def f(x):  # confidence interval for the mean response at x
    x = np.array([1, x])
    # stats.t.ppf(1-alpha/2, df=N-p-1): point where the cumulative probability is 1-alpha/2
    w = stats.t.ppf(1-alpha/2, df=N-p-1)*RSE*np.sqrt(x@U@x.T)
    return [x@beta_hat-w, x@beta_hat+w]


def g(x):  # prediction interval for a new observation at x
    x = np.array([1, x])
    w = stats.t.ppf(1-alpha/2, df=N-p-1)*RSE*np.sqrt(1+x@U@x.T)
    return [x@beta_hat-w, x@beta_hat+w]


# Example
stats.t.ppf(0.975, df=1)  # point with cumulative probability 0.975; returns 12.706204736432095
x_seq = np.arange(-10, 10, 0.1)
# Confidence interval
lower_seq1 = []
upper_seq1 = []
for i in range(len(x_seq)):
    lower_seq1.append(f(x_seq[i])[0])
    upper_seq1.append(f(x_seq[i])[1])
# Prediction interval
lower_seq2 = []
upper_seq2 = []
for i in range(len(x_seq)):
    lower_seq2.append(g(x_seq[i])[0])
    upper_seq2.append(g(x_seq[i])[1])
yy = beta_hat[0]+beta_hat[1]*x_seq
plt.xlim(np.min(x_seq), np.max(x_seq))
plt.ylim(np.min(lower_seq1), np.max(upper_seq1))
plt.plot(x_seq, yy, c="black")
plt.plot(x_seq, lower_seq1, c="blue")
plt.plot(x_seq, upper_seq1, c="red")
plt.plot(x_seq, lower_seq2, c="blue", linestyle="dashed")
plt.plot(x_seq, upper_seq2, c="red", linestyle="dashed")
plt.xlabel("x")
plt.ylabel("y")
Out[ ]:
Text(0, 0.5, 'y')
[Figure: fitted regression line (black) with 95% confidence interval bounds (solid) and prediction interval bounds (dashed)]
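The hand-computed intervals can be cross-checked with statsmodels (a sketch, not part of the original exercise): get_prediction returns the confidence interval for the mean response and, with obs=True, the prediction interval for a new observation.

In [ ]:
res = sm.OLS(y, X).fit()
pred = res.get_prediction(np.array([[1.0, 0.0]]))  # at x = 0
print(pred.conf_int(obs=False), f(0))  # confidence interval for the mean
print(pred.conf_int(obs=True), g(0))   # prediction interval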