# Simple Linear Regression
# Published: (publication date not recorded in this export)
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,summarize,poly)
# Quick numpy warm-up: build a small array and inspect it.
A = np.array([3,5,11])
dir(A)  # list the attributes/methods available on an ndarray
A.sum()  # sum of the elements (3 + 5 + 11 = 19)
# Load the Boston housing data and fit a simple linear regression of
# median home value (medv) on percent lower-status population (lstat),
# building the design matrix by hand.
Boston = load_data("Boston")
Boston.columns
#Boston?
X= pd.DataFrame({'intercept':np.ones(Boston.shape[0]),'lstat':Boston['lstat']}) # Adding the intercept column
X
Y=Boston['medv']  # response variable: median home value
model=sm.OLS(Y,X) #Create an OLS Regression model
results = model.fit()# Solves The OLS (Finds the most optimal coefficients) to minimize RSS
print(results.params)  # fitted intercept and slope
summarize(results)  # compact coefficient table (estimate, std err, t, p)
# Rebuild the same design matrix using ISLP's ModelSpec instead of by hand;
# MS adds the intercept column automatically.
design = MS(['lstat'])
design = design.fit(Boston) # fit the model
X= design.transform(Boston) # Transforms the raw data into a design matrix
X= design.fit_transform(Boston) # combine both methods
X
results.summary()  # full statsmodels regression report
results.params
# Predict medv at three new lstat values and compare confidence vs
# prediction intervals.
new_df = pd.DataFrame({'lstat':[5,10,15]}) #values we want to make prediction for
new_predictor = design.transform(new_df)  # apply the fitted design (adds intercept column)
new_predictor
new_prediction = results.get_prediction(new_predictor)
new_prediction.predicted_mean # gives the estimated mean for each of the new predictors
new_prediction.conf_int(alpha=0.05) # Confidence interval with 95% confidence
#each row is an interval for a value
new_prediction.conf_int(obs=True , alpha =0.05) # obs= True will compute the prediction interval
#As expected the prediction interval is wider than the confidence interval
def regression_line(ax, b, m, *args, **kwargs):
    """Draw the line y = m*x + b across the current x-limits of *ax*.

    Parameters
    ----------
    ax : matplotlib Axes (or compatible object) to draw on.
    b : float
        Intercept of the line.
    m : float
        Slope of the line.
    *args, **kwargs
        Forwarded unchanged to ``ax.plot`` (e.g. ``'r--', linewidth=3``).
    """
    xlim = ax.get_xlim()
    # Two points define the segment: evaluate the line at both x-limits.
    # (Removed leftover debug print of xlim.)
    ylim = [m * xlim[0] + b, m * xlim[1] + b]
    ax.plot(xlim, ylim, *args, **kwargs)
# Scatter plot of medv vs lstat with the fitted regression line overlaid.
ax= Boston.plot.scatter('lstat','medv')
print(results.params['intercept'])
print(results.params['lstat'])
regression_line(ax,results.params['intercept'],results.params['lstat'],'r--',linewidth=3)
#Noticing some non-linearity
# Residuals vs fitted values: a visible pattern here suggests non-linearity.
ax=subplots(figsize=(8,8))[1]
ax.scatter(results.fittedvalues, results.resid)
ax.set_xlabel("Fitted value")
ax.set_ylabel('Residual')
ax.axhline(0,c='k',ls='--'); # Adding horizontal line at 0 for reference
#Leverage statistics
# Plot the diagonal of the hat matrix (leverage) for every observation.
infl=results.get_influence()
ax=subplots(figsize=(5,5))[1]
print(X.shape[0])  # number of observations
ax.scatter(np.arange(X.shape[0]),infl.hat_matrix_diag)
ax.set_xlabel('Index')
ax.set_ylabel('Leverage')
value= np.argmax(infl.hat_matrix_diag)  # index of the highest-leverage observation
print(value)
print(results.get_influence().hat_matrix_diag)
