# Simple Linear Regression
# Published: (publication date not recorded in this export)
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,summarize,poly)
# Quick numpy warm-up: build a small array and inspect it.
A = np.array([3,5,11])
dir(A)  # list the attributes/methods available on an ndarray
A.sum()  # sum of the elements (3 + 5 + 11 = 19)
# Load the Boston housing data and fit a simple linear regression of
# median home value (medv) on percent lower-status population (lstat),
# building the design matrix by hand.
Boston = load_data("Boston")
Boston.columns
#Boston?
X= pd.DataFrame({'intercept':np.ones(Boston.shape[0]),'lstat':Boston['lstat']}) # Adding the intercept column
X
Y=Boston['medv']  # response variable: median home value
model=sm.OLS(Y,X) #Create an OLS Regression model
results = model.fit()# Solves The OLS (Finds the most optimal coefficients) to minimize RSS
print(results.params)  # fitted intercept and slope
summarize(results)  # compact coefficient table (estimate, std err, t, p)
# Rebuild the same design matrix using ISLP's ModelSpec instead of by hand;
# MS adds the intercept column automatically.
design = MS(['lstat'])
design = design.fit(Boston) # fit the model
X= design.transform(Boston) # Transforms the raw data into a design matrix
X= design.fit_transform(Boston) # combine both methods
X
results.summary()  # full statsmodels regression report
results.params
# Predict medv at three new lstat values and compare confidence vs
# prediction intervals.
new_df = pd.DataFrame({'lstat':[5,10,15]}) #values we want to make prediction for
new_predictor = design.transform(new_df)  # apply the fitted design (adds intercept column)
new_predictor
new_prediction = results.get_prediction(new_predictor)
new_prediction.predicted_mean # gives the estimated mean for each of the new predictors
new_prediction.conf_int(alpha=0.05) # Confidence interval with 95% confidence
#each row is an interval for a value
new_prediction.conf_int(obs=True , alpha =0.05) # obs= True will compute the prediction interval
#As expected the prediction interval is wider than the confidence interval
def regression_line(ax, b, m, *args, **kwargs):
    """Draw the line y = m*x + b across the current x-limits of *ax*.

    Parameters
    ----------
    ax : matplotlib Axes (or compatible object) to draw on.
    b : float
        Intercept of the line.
    m : float
        Slope of the line.
    *args, **kwargs
        Forwarded unchanged to ``ax.plot`` (e.g. ``'r--', linewidth=3``).
    """
    xlim = ax.get_xlim()
    # Two points define the segment: evaluate the line at both x-limits.
    # (Removed leftover debug print of xlim.)
    ylim = [m * xlim[0] + b, m * xlim[1] + b]
    ax.plot(xlim, ylim, *args, **kwargs)
# Scatter plot of medv vs lstat with the fitted regression line overlaid.
ax= Boston.plot.scatter('lstat','medv')
print(results.params['intercept'])
print(results.params['lstat'])
regression_line(ax,results.params['intercept'],results.params['lstat'],'r--',linewidth=3)
#Noticing some non-linearity
# Residuals vs fitted values: a visible pattern here suggests non-linearity.
ax=subplots(figsize=(8,8))[1]
ax.scatter(results.fittedvalues, results.resid)
ax.set_xlabel("Fitted value")
ax.set_ylabel('Residual')
ax.axhline(0,c='k',ls='--'); # Adding horizontal line at 0 for reference
#Leverage statistics
# Plot the diagonal of the hat matrix (leverage) for every observation.
infl=results.get_influence()
ax=subplots(figsize=(5,5))[1]
print(X.shape[0])  # number of observations
ax.scatter(np.arange(X.shape[0]),infl.hat_matrix_diag)
ax.set_xlabel('Index')
ax.set_ylabel('Leverage')
value= np.argmax(infl.hat_matrix_diag)  # index of the highest-leverage observation
print(value)
print(results.get_influence().hat_matrix_diag)
