Cross-Validation
Published:
Cross Validation and the Bootstrap
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS , summarize, poly)
from ISLP.models import sklearn_sm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (cross_validate, KFold, ShuffleSplit)
from sklearn.base import clone
from functools import partial
The Validation Set Approach
- Recall that the Validation Set Approach simply divides the $n$ observations into two parts
- Training Set
- Validation Set
# Load the Auto data set (392 observations) and hold out 196 of them
# as a validation set; random_state fixes the split for reproducibility.
Auto = load_data('Auto')
Auto_train, Auto_test = train_test_split(Auto,test_size=196,random_state=10)
- Loading the Auto data set, which contains $392$ observations
- Splitting the dataset into two equal parts
# Model spec with a single predictor, horsepower.
hp_mm = MS(['horsepower'])
design_train = hp_mm.fit_transform(Auto_train)
response_train = Auto_train['mpg']
# Fit mpg ~ horsepower by OLS on the training half and show the summary.
results = sm.OLS(response_train, design_train).fit()
summarize(results)
- After fitting a Simple Linear Regression model using the training set
- We validate the model using the test/validation set with the `predict()` method
# Build the test design matrix with the spec fitted on the training data.
X_test = hp_mm.transform(Auto_test)
y_test = Auto_test['mpg']
valid_pred = results.predict(X_test)
# Validation-set estimate of the test MSE.
((y_test - valid_pred) ** 2).mean()
- The test MSE is estimated to be around $23.060$
def evalMSE(terms, response, train, test):
    """Fit an OLS model on `train` and return its MSE on `test`.

    Parameters
    ----------
    terms : list
        Model-spec terms passed to ``MS`` (e.g. ``[poly('horsepower', 2)]``).
    response : str
        Name of the response column.
    train, test : pandas.DataFrame
        Training and validation data sets.

    Returns
    -------
    float
        Mean squared prediction error on `test`.
    """
    mm = MS(terms)
    X_train = mm.fit_transform(train)
    Y_train = train[response]
    # Reuse the spec fitted on the training data for the test design matrix.
    X_test = mm.transform(test)
    Y_test = test[response]
    results = sm.OLS(Y_train, X_train).fit()
    test_pred = results.predict(X_test)
    return np.mean((Y_test - test_pred) ** 2)
- Now we use the function to estimate the validation MSE for polynomial fits of degree $1$ through $10$
# Validation-set MSE for polynomial fits of degree 1 through 10.
MSE = np.zeros(10)
for idx, degree in enumerate(range(1, 11)):
    MSE[idx] = evalMSE([poly('horsepower', degree)], 'mpg', Auto_train, Auto_test)
MSE
- Running the function on different train and test sets will yield slightly different results each time
Cross-Validation
# Wrap the statsmodels OLS estimator so sklearn's CV utilities can drive it.
hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))
X, Y = Auto.drop(columns=['mpg']), Auto['mpg']
X
Y
# cv equal to n (one fold per observation) gives leave-one-out CV.
cv_results = cross_validate(hp_model, X, Y, cv=Auto.shape[0])
cv_err = np.mean(cv_results['test_score'])
cv_err
# List the keys available in the cross_validate result dict.
for key in cv_results:
    print(key)
- The `cv=Auto.shape[0]` argument sets the number of folds $K$; since we supplied all $n$ observations, this performs LOOCV
# LOOCV error for polynomial regressions of degree 1 through 7.
# The design matrix is built directly with np.power.outer: its columns
# are H**0 ... H**d, so the intercept column is included.
# (Removed a leftover debug print(X) that dumped the full matrix each pass.)
cv_error = np.zeros(7)
H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)
for i, d in enumerate(range(1, 8)):
    X = np.power.outer(H, np.arange(d + 1))
    M_CV = cross_validate(M, X, Y, cv=Auto.shape[0])
    cv_error[i] = np.mean(M_CV['test_score'])
cv_error
