Cross-Validation

Published:

Cross Validation and the Bootstrap

import numpy as np
import statsmodels.api as sm

from ISLP import load_data
from ISLP.models import (ModelSpec as MS , summarize, poly)
from ISLP.models import sklearn_sm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import (cross_validate, KFold, ShuffleSplit)
from sklearn.base import clone 

from functools import partial

The Validation Set Approach

  • Recall that the Validation Set Approach simply divides the $n$ observations into two parts
  • Training Set
  • Validation Set
# Load the Auto data set and hold out 196 observations as a validation set;
# random_state pins the split so the results are reproducible.
Auto = load_data('Auto')
Auto_train, Auto_test = train_test_split(Auto,test_size=196,random_state=10)
  • Loading the Auto data set, which contains $392$ observations
  • Splitting the data set into two equal halves of $196$ observations each
# Model specification: design matrix with an intercept plus horsepower.
hp_mm = MS(['horsepower'])

# Build the training design matrix and response from the training half only.
X_train = hp_mm.fit_transform(Auto_train)
Y_train = Auto_train['mpg']

# Fit OLS of mpg on horsepower and display the coefficient table.
results = sm.OLS(Y_train, X_train).fit()
summarize(results)
  • After fitting a Simple Linear Regression model using the training set
  • We validate the model using the test/Validate set with the predict() method
# Apply the transform learned on the training data to the held-out half,
# then estimate the test MSE from the out-of-sample predictions.
y_test = Auto_test['mpg']
X_test = hp_mm.transform(Auto_test)
valid_pred = results.predict(X_test)
np.mean(np.square(y_test - valid_pred))
  • The test MSE is estimated to be around $23.060$
def evalMSE(terms, response, train, test):
    """Fit an OLS model of `response` on `terms` using `train`,
    then return the mean squared prediction error on `test`.

    `terms` is a list of ModelSpec terms; `train`/`test` are DataFrames.
    """
    design = MS(terms)
    # fit_transform learns the design on the training data; transform
    # reuses that same design for the held-out data.
    fitted = sm.OLS(train[response], design.fit_transform(train)).fit()
    residuals = test[response] - fitted.predict(design.transform(test))
    return np.mean(residuals**2)
  • Now we use the function to estimate the validation MSE for polynomial fits of degrees $1$ through $10$
# Validation MSE for polynomial fits of horsepower, degrees 1 through 10.
MSE = np.array([
    evalMSE([poly('horsepower', degree)], 'mpg', Auto_train, Auto_test)
    for degree in range(1, 11)
])

MSE
  • Running the function on different train and test sets will yield slightly different results each time

Cross-Validation

# Wrap statsmodels OLS so it plugs into sklearn's cross_validate.
hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))

X = Auto.drop(columns=['mpg'])
Y = Auto['mpg']
X
Y
# cv equal to the number of observations gives leave-one-out CV.
cv_results = cross_validate(hp_model, X, Y, cv=Auto.shape[0])
cv_err = np.mean(cv_results['test_score'])
cv_err
# List the keys returned by cross_validate (fit_time, score_time, test_score).
for key in cv_results:
    print(key)
  • The cv=Auto.shape[0] argument sets the number of folds $K$; since we set it equal to all $n$ observations, this performs leave-one-out cross-validation (LOOCV)
# LOOCV error for polynomial regressions of degree 1 through 7.
cv_error = np.zeros(7)

H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)

for i, d in enumerate(range(1, 8)):
    # Columns H**0, H**1, ..., H**d form the degree-d polynomial design
    # matrix (H**0 supplies the intercept column).
    # NOTE: removed a leftover debug print(X) that dumped the full
    # design matrix on every iteration.
    X = np.power.outer(H, np.arange(d + 1))
    M_CV = cross_validate(M, X, Y, cv=Auto.shape[0])
    cv_error[i] = np.mean(M_CV['test_score'])

cv_error

Categories: