#!/usr/bin/env python
# coding: utf-8

# # Linear regression, ridge regression and polynomial features
# ## Case study with a simple one-dimensional function
#
# On the example of a simple sinusoidal one-dimensional function, we explore:
# - the idea of a [training / test (or validation) set split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) of the samples, and, based on this, we evaluate the predictive power of
# - [linear regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html),
# - [ridge regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html), which is an **L_2-regularized** linear model,
#
# and furthermore, we explore the effect of enriching the model by using [polynomial features](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html).

# First, we load the packages and functions that we will need.

# In[23]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# We define a function to sample $n=\text{'n_samples'}$ points $x_1,\ldots,x_n$ uniformly at random and define
# corresponding samples $y_i = f(x_i) + \epsilon_i$, where
# $f(x) = \sin(4x) + x$ and the $\epsilon_i$ are i.i.d. standard normal random variables.

# In[24]:

### function to create random samples from a wave-like signal
def make_wave(n_samples=100):
    rnd = np.random.RandomState(42)
    x = rnd.uniform(-3, 3, size=n_samples)
    x = np.sort(x)
    y_no_noise = np.sin(4 * x) + x
    y = y_no_noise + rnd.normal(size=len(x))
    return x.reshape(-1, 1), y


# Visualize the function f:
fn = lambda x: np.sin(4 * x) + x
line_xvalues = np.linspace(-3, 3, 1000).reshape(-1, 1)
plt.figure(figsize=(14, 8))
plt.plot(line_xvalues, fn(line_xvalues))
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.grid(True)

# We create a data set (consisting of $x$-coordinates in '$X$' and $y$-coordinates in '$y$') based on the above model, using 'nr_samples' samples.

# In[25]:

# %% Create data, create training and test data split
nr_samples = 50
X, y = make_wave(n_samples=nr_samples)
y = y.reshape((len(y), 1))

# We now use the method [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html?highlight=train_test#sklearn.model_selection.train_test_split) of scikit-learn to split the available samples into two groups, a training dataset and a test dataset. The share of samples in the training data is given by the choice of 'train_share', i.e., if $\text{train_share} = 0.6$, then 60% of the samples are used in the training dataset.
#
# From the sample variable $X$, we create a [pandas](https://pandas.pydata.org/docs/) DataFrame, as its methods are sometimes more convenient to use - for example, we can easily obtain the vectors of indices corresponding to the training and test set, respectively, as illustrated in the short sketch below. We define the corresponding index vectors 'id_train' and 'id_test'.
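# As a quick illustration of this index bookkeeping (a minimal sketch with a small
# hypothetical DataFrame, not part of the analysis itself): train_test_split keeps
# the original row labels of a DataFrame, so the .index attribute of each part tells
# us exactly which samples ended up in which set.

# In[ ]:

toy = pd.DataFrame({'x': [10, 20, 30, 40, 50]})
toy_train, toy_test = train_test_split(toy, train_size=0.6, random_state=0)
print(toy_train.index.tolist())   # row labels that went into the training part
print(toy_test.index.tolist())    # row labels that went into the test part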
# In[26]:

train_share = 0.6
dfX = pd.DataFrame(X)
# split data into a share of 'train_share' in the training data set and a share of 1-'train_share' in the test data set
X_train, X_test, y_train, y_test = train_test_split(dfX, y, train_size=train_share, random_state=25)
id_train = X_train.index          # obtain index of training set
X_train = X_train.to_numpy()
id_test = X_test.index            # obtain index of test set
X_test = X_test.to_numpy()

# The samples can be visualized as follows.

# In[32]:

# %% Plot training data
plt.figure(figsize=(14, 8))
plt.plot(X_train, y_train, 'o', c='blue')
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.legend(["training data"], loc=0, fontsize=8)
ax.grid(True)

# %% Plot training data and test data
plt.figure(figsize=(14, 8))
plt.plot(X_train, y_train, 'o', c='blue')
plt.plot(X_test, y_test, 'o', c='red')
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.legend(["training data", "test data"], loc=0, fontsize=8)
ax.grid(True)

# We now train a linear regression model on the samples of the training set, and evaluate the mean squared errors as well as the resulting $R^2$ coefficients. We also train a linear and a ridge regression model (the latter with $\alpha=\text{alphaval}$) on **polynomial features** of degree 'degree_poly'.

# We first fit the linear regression model to the training data with unmodified features. This corresponds to a truly linear model in the standard coordinate system.

# In[34]:

# %% Fit models to training data
lr = LinearRegression().fit(X_train, y_train)   # linear regression

# ### Generation of Polynomial Features
#
# Based on the wave-like behavior of the function $f$ that underlies the data generation process,
# we would like to have larger expressive power in the models that we use.
# For this reason, we use an appropriate method from sklearn's [preprocessing](https://scikit-learn.org/stable/modules/classes.html?highlight=preprocessing#module-sklearn.preprocessing) module to transform the $x$-samples into a vector of monomials, and run linear regression on these "enhanced" features derived from the samples.

# In[35]:

# %% Run linear regression with polynomial features of degree 'degree_poly':
degree_poly = 10
poly = PolynomialFeatures(degree_poly)                  # define polynomial data model
X_trainpoly = poly.fit_transform(X_train)               # obtain coordinates of polynomial features of training set
X_testpoly = poly.transform(X_test)                     # obtain coordinates of polynomial features of test set
polylr = LinearRegression().fit(X_trainpoly, y_train)   # perform linear regression with polynomial features

# Now, we use ridge regression on the polynomial features for a fixed regularization parameter $\alpha=\text{alphaval}$. Please note the options of [linear_models](https://scikit-learn.org/stable/modules/linear_model.html#linear-model).

# In[30]:

# %% Run ridge regression with polynomial features
alphaval = 10   # ridge regression parameter alpha
polyridge = Ridge(alpha=alphaval).fit(X_trainpoly, y_train)   # perform ridge regression with polynomial features
line = np.linspace(-8, 8, 1000).reshape(-1, 1)
line_poly = poly.transform(line)
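# As a side check (a minimal sketch, not part of the original analysis): with
# fit_intercept=False, ridge regression has the closed-form solution
# $w = (X^T X + \alpha I)^{-1} X^T y$, which we can compare numerically against
# scikit-learn's Ridge on the polynomial training features.

# In[ ]:

ridge_check = Ridge(alpha=alphaval, fit_intercept=False).fit(X_trainpoly, y_train)
XtX = X_trainpoly.T @ X_trainpoly
w_closed_form = np.linalg.solve(XtX + alphaval * np.eye(XtX.shape[0]),
                                X_trainpoly.T @ y_train)
print("max |difference| between sklearn and the closed form:",
      np.max(np.abs(ridge_check.coef_.ravel() - w_closed_form.ravel())))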
# We output the $R^2$ for the three considered models, for both training and test set, as well as the _mean squared errors_. To obtain simple, formatted output, it is convenient to create a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html?highlight=dataframe#pandas.DataFrame) that includes the text as a row description and the quantities as entries.

# In[36]:

text_trainerrors = ['Training error of linear regression:',
                    'Training error of linear regression, polynomial features:',
                    'Training error of ridge regression, alpha = ' + str(alphaval) + ', polynomial features:']
trainerrors_list = [mean_squared_error(lr.predict(X_train), y_train),
                    mean_squared_error(polylr.predict(X_trainpoly), y_train),
                    mean_squared_error(polyridge.predict(X_trainpoly), y_train)]
text_testerrors = ['Test error of linear regression:',
                   'Test error of linear regression, polynomial features:',
                   'Test error of ridge regression, alpha = ' + str(alphaval) + ', polynomial features:']
testerrors_list = [mean_squared_error(lr.predict(X_test), y_test),
                   mean_squared_error(polylr.predict(X_testpoly), y_test),
                   mean_squared_error(polyridge.predict(X_testpoly), y_test)]
text_r2 = ['R2 (train), linear regression:',
           'R2 (train) of linear regression, polynomial features:',
           'R2 (train) of ridge regression, alpha = ' + str(alphaval) + ', polynomial features:']
r2_train_list = [lr.score(X_train, y_train),
                 polylr.score(X_trainpoly, y_train),
                 polyridge.score(X_trainpoly, y_train)]
text_r2_test = ['R2 (test), linear regression:',
                'R2 (test) of linear regression, polynomial features:',
                'R2 (test) of ridge regression, alpha = ' + str(alphaval) + ', polynomial features:']
r2_test_list = [lr.score(X_test, y_test),
                polylr.score(X_testpoly, y_test),
                polyridge.score(X_testpoly, y_test)]

pd.options.display.max_colwidth = 100
print(pd.DataFrame(trainerrors_list, index=text_trainerrors, columns=['']))
print(pd.DataFrame(r2_train_list, index=text_r2, columns=['']))
print(pd.DataFrame(testerrors_list, index=text_testerrors, columns=['']))
print(pd.DataFrame(r2_test_list, index=text_r2_test, columns=['']))

# We summarize the resulting curves in a plot.

# In[37]:

# %% Plot summary
plt.figure(figsize=(14, 8))
plt.plot(X_train, y_train, 'o', c='blue')
plt.plot(X_test, y_test, 'o', c='red')
plt.plot(line, lr.predict(line))
plt.plot(line, polylr.predict(line_poly))
plt.plot(line, polyridge.predict(line_poly))
ax = plt.gca()
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.set_ylim(-6, 6)
ax.legend(["training data", "test data", "linear", "lin., poly. features", "ridge, alpha=" + str(alphaval)],
          loc=0, fontsize=10)
ax.grid(True)

# How do you interpret the accuracy scores above in view of this plot?
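# One way to relate the scores to the plot (a minimal sketch, not part of the original
# analysis): compare the magnitudes of the fitted coefficients. The unregularized
# polynomial fit typically uses much larger coefficients than the ridge fit, which is
# what lets its curve oscillate so strongly outside the range of the training data.

# In[ ]:

print("largest |coefficient|, linear regression on polynomial features:",
      np.max(np.abs(polylr.coef_)))
print("largest |coefficient|, ridge regression on polynomial features: ",
      np.max(np.abs(polyridge.coef_)))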
# # Exploring trade-offs of model complexity
#
# ## Linear regression on polynomial features of different degree
#
# We now consider the errors resulting from applying linear regression with polynomial features of different degrees, in order to explore the behavior of training and test errors for different model complexities.
#
# To this end, we write a snippet of code that creates two [numpy.ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray) objects 'train_errors' and 'test_errors' that contain the mean squared errors on both training and test set obtained by running linear regression on polynomial features of degree between 1 and 30.

# In[13]:

# %% Explore trade-off of model complexity
degree_range = range(1, 31)
train_errors = np.zeros((len(degree_range), ))
test_errors = np.zeros((len(degree_range), ))
for k, degree in enumerate(degree_range):
    poly = PolynomialFeatures(degree)                       # define polynomial data model
    X_trainpoly = poly.fit_transform(X_train)               # obtain coordinates of polynomial features of training set
    X_testpoly = poly.transform(X_test)                     # obtain coordinates of polynomial features of test set
    polylr = LinearRegression().fit(X_trainpoly, y_train)   # perform linear regression with polynomial features
    train_errors[k] = mean_squared_error(polylr.predict(X_trainpoly), y_train)
    test_errors[k] = mean_squared_error(polylr.predict(X_testpoly), y_test)

# Using this, we create a plot containing the training and test errors on the $y$-axis and the degree of the polynomial model on the $x$-axis.

# In[14]:

# %% Plot training and test errors for different model complexities
plt.figure()
plt.plot(degree_range, train_errors)
plt.plot(degree_range, test_errors)
ax = plt.gca()
ax.set_yscale('log')
ax.legend(["training data", "test data"], loc=0)
ax.set(xlabel='degree of polynomial', ylabel='mean squared error',
       title='Linear regression with polynomial features of different degree');

print("Degree resulting in smallest error on test data: %d" % (np.argmin(test_errors) + 1))

# We observe that for growing model complexity, the training error *decreases*, whereas the test error actually *increases*.
#
# Can you explain this behavior?

# ## Ridge regression on polynomial features with different regularization parameters
#
# Next, we focus on ridge regression on polynomial features of fixed degree, but using different regularization parameters (between $\alpha=10^{-5}$ and $\alpha=10^{9}$).

# In[15]:

import warnings
from scipy.linalg import LinAlgWarning
warnings.filterwarnings("ignore", category=LinAlgWarning, module='sklearn')

# In the following, we fix the polynomial degree to 11. For such polynomial features, write code that
# - uses the available training-test set split to run a [hyperparameter optimization](https://scikit-learn.org/stable/modules/grid_search.html#tuning-the-hyper-parameters-of-an-estimator) for ridge regression over $60$ different values of $\alpha$, logarithmically spaced between $10^{-5}$ and $10^{9}$. Relevant methods you might want to use are _PredefinedSplit_ and _GridSearchCV_.
# - Eventually, extract the training and test errors into the numpy.ndarrays 'train_errors_ridge' and 'test_errors_ridge'.

# In[39]:

# creating the polynomial features
degree = 11
poly = PolynomialFeatures(degree)
X_poly = poly.fit_transform(X)

# We first focus on a (simpler) hyperparameter optimization over the already existing training-test set split (rather than over several cross-validation folds). Strictly speaking, this is not good practice, since we use information from the test set to optimize a parameter of the learned model.

# In[52]:

test_fold = np.zeros((nr_samples, ))
test_fold[id_train] = -1
test_fold[id_test] = 0
from sklearn.model_selection import PredefinedSplit
cv = PredefinedSplit(test_fold)
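# A quick check of what this encoding does (a minimal sketch, not part of the original
# analysis): entries of 'test_fold' equal to -1 are never placed in a test fold, so the
# resulting splitter produces exactly one split whose test part is our original test set.

# In[ ]:

print("number of splits:", cv.get_n_splits())
for train_idx, test_idx in cv.split(X_poly):
    print("training samples:", len(train_idx), " test samples:", len(test_idx))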
# Using this existing split, we use hyperparameter optimization for ridge regression to optimize the parameter $\alpha$.

# In[53]:

alphas = np.logspace(-5, 9, num=60)   # create vector of logarithmically spaced values between 10^(-5) and 10^(9)
parameters = {'alpha': alphas}
gridsearch = GridSearchCV(Ridge(), param_grid=parameters, scoring='neg_mean_squared_error',
                          return_train_score=True, cv=cv)
gridsearch.fit(X_poly, y)
train_errors_ridge = -gridsearch.cv_results_['mean_train_score']
test_errors_ridge = -gridsearch.cv_results_['mean_test_score']

# Define a plotting function for training and test errors for different model complexities:

# In[54]:

def plot_train_test(alphas, train_error, test_error):
    plt.figure()
    plt.plot(alphas, train_error)
    plt.plot(alphas, test_error)
    ax = plt.gca()
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set(xlabel='alpha', ylabel='mean squared error',
           title='Ridge regression for different model complexities')
    ax.legend(["training data", "test data"], loc=0)

# Plot the resulting train/test errors:

# In[55]:

plot_train_test(alphas, train_errors_ridge, test_errors_ridge)

# Alternatively, we also perform [5-fold cross-validation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html?highlight=kfold#sklearn.model_selection.KFold) (see also [here](https://scikit-learn.org/stable/modules/cross_validation.html#k-fold)) over the same parameter $\alpha$, using the training set only.

# In[58]:

gridsearch_kfold = GridSearchCV(Ridge(), param_grid=parameters, scoring='neg_mean_squared_error',
                                return_train_score=True, cv=5)
X_trainpoly = poly.transform(X_train)
gridsearch_kfold.fit(X_trainpoly, y_train)
train_errors_ridge_kfold = -gridsearch_kfold.cv_results_['mean_train_score']
test_errors_ridge_kfold = -gridsearch_kfold.cv_results_['mean_test_score']

# Again, we plot the resulting train/test errors:

# In[59]:

plot_train_test(alphas, train_errors_ridge_kfold, test_errors_ridge_kfold)

# Comparing the last two plots, we see that for this data set, the two different cross-validation strategies result in basically the same behavior.

# Finally, let's plot the function obtained from the "optimal" ridge regression model (with the optimized regularization parameter $\alpha$), compared to the ridge regression models that we obtain if we choose $\alpha = 10^{-5}$ or $\alpha = 10^{9}$.

# In[17]:

# %% Plot optimal, under-fitted and over-fitted model
alpha_opt = gridsearch.best_params_['alpha']
alpha_overfit = alphas[0]     # smallest alpha: almost no regularization, over-fits
alpha_underfit = alphas[-1]   # largest alpha: very strong regularization, under-fits
X_trainpoly0 = poly.fit_transform(X_train)   # obtain coordinates of polynomial features of training set
line_poly = poly.transform(line)

rd = Ridge(alpha=alpha_opt).fit(X_trainpoly0, y_train)
plt.figure(figsize=(14, 8))
plt.plot(X_train, y_train, 'o', c='blue')
plt.plot(X_test, y_test, 'o', c='red')
plt.plot(line, rd.predict(line_poly))
rd = Ridge(alpha=alpha_overfit).fit(X_trainpoly0, y_train)
plt.plot(line, rd.predict(line_poly))
rd = Ridge(alpha=alpha_underfit).fit(X_trainpoly0, y_train)
plt.plot(line, rd.predict(line_poly))
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.legend(["training data", "test data", "optimized alpha",
           "alpha=" + str(alpha_overfit), "alpha=" + str(alpha_underfit)], loc=0)

print("'Optimal' regularization parameter alpha (i.e., resulting in smallest test error): %f" % alpha_opt)
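# To put numbers on what the plot shows (a minimal sketch, not part of the original
# analysis; 'X_testpoly11' is a helper variable introduced only here), we recompute the
# test error of the three ridge fits from the plot above.

# In[ ]:

X_testpoly11 = poly.transform(X_test)   # degree-11 polynomial features of the test set
for a in [alpha_opt, alpha_overfit, alpha_underfit]:
    rd_a = Ridge(alpha=a).fit(X_trainpoly0, y_train)
    print("alpha = %g, test MSE = %.3f" % (a, mean_squared_error(rd_a.predict(X_testpoly11), y_test)))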
# We observe that for the smallest regularization parameter $\alpha = 10^{-5}$ (which almost corresponds to a linear regression model without regularization), we can interpolate the training data points very well, at the cost of a large error on the test data points.
#
# On the other hand, for large regularization parameters such as $\alpha = 10^9$, we obtain a "simple" model which fits the training data much less well, with a test error of comparable size to its training error.
#
# The minimal error on the test data is achieved for a value of $\alpha$ in between.
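# As a closing check (a minimal sketch, not part of the original analysis), we can compare
# the value of $\alpha$ selected via the predefined training/test split with the one
# selected by 5-fold cross-validation on the training set; in line with the plots above,
# the two selections should be of a similar order of magnitude for this data set.

# In[ ]:

print("best alpha, predefined train/test split:", gridsearch.best_params_['alpha'])
print("best alpha, 5-fold CV on the training set:", gridsearch_kfold.best_params_['alpha'])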