#!/usr/bin/env python
# coding: utf-8

# # Linear regression, ridge regression and polynomial features
# ## Case study with a simple one-dimensional function
#
# Using the example of a simple sinusoidal one-dimensional function, we explore:
# - the idea of a [training / test (or validation) set split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) of samples, and based on this, we evaluate the predictive power of
# - [linear regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html),
# - [ridge regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html), which is an **L_2-regularized** linear model,
#
# and furthermore, we explore the effect of enriching the model by using [polynomial features](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html).

# First, we load the packages and functions that we will need.

# In[ ]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit  # used later for the hyperparameter search with a fixed training/test split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV


# We define a function to sample $n=\text{'n_samples'}$ points $x_1,\ldots,x_n$ uniformly at random and define
# corresponding samples $y_i=f(x_i) + \epsilon_i$, where
# $f(x)= \sin(4 x) + x$ and the $\epsilon_i$ are i.i.d. standard normal random variables.

# In[ ]:

### function to create random samples from a wave-like signal
def make_wave(n_samples=100):
    rnd = np.random.RandomState(42)
    x = rnd.uniform(-3, 3, size=n_samples)
    x = np.sort(x)
    y_no_noise = (np.sin(4 * x) + x)
    y = y_no_noise + rnd.normal(size=len(x))
    return x.reshape(-1, 1), y


# Visualize the function f:
fn = lambda x: np.sin(4 * x) + x
line_xvalues = np.linspace(-3, 3, 1000).reshape(-1, 1)
plt.figure(figsize=(14,8))
plt.plot(line_xvalues, fn(line_xvalues))
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.grid(True)


# We create a data set (consisting of $x$-coordinates in '$X$' and $y$-coordinates in '$y$') based on the above model, using 'nr_samples' samples.

# In[ ]:

# %% Create data, create training and test data split
nr_samples = 50
X, y = make_wave(n_samples=nr_samples)


# We now use the method [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html?highlight=train_test#sklearn.model_selection.train_test_split) of scikit-learn to split the available samples into two groups, a training dataset and a test dataset. The share of samples in the training data is given by the choice of 'train_share', i.e., if $\text{train_share} = 0.6$, then 60% of the samples are used in the training dataset.
#
# From the samples variable $X$, we create a [pandas](https://pandas.pydata.org/docs/) DataFrame, as its methods are sometimes more convenient to use - for example, we can easily obtain a vector of indices corresponding to the training and test set, respectively. We define the corresponding vectors 'id_train' and 'id_test'.
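# (As a small aside - not part of the exercise, and the names 'toy_df', 'toy_y', 'toy_train', 'toy_test' below are made up purely for illustration - the following toy cell sketches why this works: when train_test_split is given a DataFrame, the returned training and test parts keep the original row index.)

# In[ ]:

# Illustration only: train_test_split keeps the row index of a DataFrame input
toy_df = pd.DataFrame({'x': np.arange(10)})
toy_y = 2.0 * np.arange(10)
toy_train, toy_test, _, _ = train_test_split(toy_df, toy_y, train_size=0.6, random_state=0)
print(toy_train.index)  # row indices that ended up in the training part
print(toy_test.index)   # row indices that ended up in the test part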
# In[ ]:

train_share = 0.6
dfX = pd.DataFrame(X)
# split data into a share of 'train_share' in the training data set and a share of 1-'train_share' in the test data set
X_train, X_test, y_train, y_test = train_test_split(dfX, y, train_size=train_share)
id_train = X_train.index      # obtain index of training set
X_train = X_train.to_numpy()  # convert the DataFrame object into a numpy.ndarray
id_test = X_test.index        # obtain index of test set
X_test = X_test.to_numpy()    # convert the DataFrame object into a numpy.ndarray


# The samples can be visualized as follows.

# In[ ]:

# %% Plot training data
plt.figure(figsize=(14,8))
plt.plot(X_train,y_train,'o',c='blue')
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.legend(["training data"],loc=0,fontsize=8)
ax.grid(True)

# %% Plot training data and test data
plt.figure(figsize=(14,8))
plt.plot(X_train,y_train,'o',c='blue')
plt.plot(X_test,y_test,'o',c='red')
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.legend(["training data","test data"],loc=0,fontsize=8)
ax.grid(True)


# We now train a linear regression model on the samples of the training set, and evaluate the mean squared errors as well as the resulting $R^2$ coefficients. We also train a linear and a ridge regression model (the latter with regularization parameter $\alpha=\text{alphaval}$) on **polynomial features** of degree 'degree_poly'.

# We first fit the linear regression model to the training data with unmodified features. This corresponds to a truly linear model in the standard coordinate system.

# In[ ]:

# %% Fit models to training data
lr = LinearRegression().fit(X_train, y_train) # linear regression


# ### Generation of Polynomial Features
#
# Given the wave-like behavior of the function $f$ underlying the data generation process,
# we would like our models to have greater expressive power.
# For this reason, we use the appropriate method of sklearn's [preprocessing](https://scikit-learn.org/stable/modules/classes.html?highlight=preprocessing#module-sklearn.preprocessing) module to transform the $x$-samples into vectors of monomials, and run linear regression on these "enhanced" features derived from the samples.

# In[ ]:

# %% Run linear regression with polynomial features of degree 'degree_poly':
degree_poly = 10
poly = PolynomialFeatures(degree=degree_poly)          # define polynomial data model
X_trainpoly = poly.fit_transform(X_train)              # obtain coordinates of polynomial features of training set
X_testpoly = poly.transform(X_test)                    # obtain coordinates of polynomial features of test set
polylr = LinearRegression().fit(X_trainpoly, y_train)  # perform linear regression with polynomial features


# Now, we use ridge regression on the polynomial features for a fixed regularization parameter $\alpha=\text{alphaval}$. Please note the available options of scikit-learn's [linear models](https://scikit-learn.org/stable/modules/linear_model.html#linear-model).

# In[ ]:

# %% Run ridge regression with polynomial features
alphaval = 10                                                # ridge regression parameter alpha
polyridge = Ridge(alpha=alphaval).fit(X_trainpoly, y_train)  # perform ridge regression with polynomial features


# We output the $R^2$ for the three considered models, for both the training and the test set, as well as the _mean squared errors_ (MSE). To obtain simple, formatted output, it is convenient to create a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html?highlight=dataframe#pandas.DataFrame) that includes the text as a row description and the quantities as entries.
#
# It is not necessary to write a separate function to compute the $R^2$ coefficients or the MSEs - use the appropriate scikit-learn methods for that!
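# (As a brief illustration - not part of the exercise, and the arrays 'y_true_demo', 'y_pred_demo' are made-up numbers - the helpers in question are mean_squared_error from sklearn.metrics and the estimators' score method, which for regressors returns the $R^2$ coefficient, i.e., the same value as r2_score.)

# In[ ]:

# Illustration only: MSE and R^2 on made-up values
from sklearn.metrics import r2_score
y_true_demo = np.array([1.0, 2.0, 3.0, 4.0])
y_pred_demo = np.array([1.1, 1.9, 3.2, 3.8])
print(mean_squared_error(y_true_demo, y_pred_demo))  # mean of the squared residuals
print(r2_score(y_true_demo, y_pred_demo))            # value a fitted regressor's .score() would report for these predictions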
# In[ ]:

text_trainerrors = ['Training error of linear regression:','Training error of linear regression, polynomial features:','Training error of ridge regression, alpha = '+ str(alphaval)+', polynomial features:']
trainerrors_list = [mean_squared_error(y_train, lr.predict(X_train)),
                    mean_squared_error(y_train, polylr.predict(X_trainpoly)),
                    mean_squared_error(y_train, polyridge.predict(X_trainpoly))]
text_testerrors = ['Test error of linear regression:','Test error of linear regression, polynomial features:','Test error of ridge regression, alpha = '+ str(alphaval)+', polynomial features:']
testerrors_list = [mean_squared_error(y_test, lr.predict(X_test)),
                   mean_squared_error(y_test, polylr.predict(X_testpoly)),
                   mean_squared_error(y_test, polyridge.predict(X_testpoly))]
text_r2 = ['R^2 (training) of linear regression:','R^2 (training) of linear regression, polynomial features:','R^2 (training) of ridge regression, alpha = '+ str(alphaval)+', polynomial features:']
r2_train_list = [lr.score(X_train, y_train), polylr.score(X_trainpoly, y_train), polyridge.score(X_trainpoly, y_train)]
text_r2_test = ['R^2 (test) of linear regression:','R^2 (test) of linear regression, polynomial features:','R^2 (test) of ridge regression, alpha = '+ str(alphaval)+', polynomial features:']
r2_test_list = [lr.score(X_test, y_test), polylr.score(X_testpoly, y_test), polyridge.score(X_testpoly, y_test)]
pd.options.display.max_colwidth = 100
# create and print DataFrame from above
print(pd.DataFrame({'description': text_trainerrors + text_testerrors + text_r2 + text_r2_test,
                    'value': trainerrors_list + testerrors_list + r2_train_list + r2_test_list}))


# In[ ]:


# We summarize the resulting curves in a plot.

# In[ ]:

# %% Plot summary
line = np.linspace(-8, 8, 1000).reshape(-1, 1)
line_poly = poly.transform(line)
plt.figure(figsize=(14,8))
plt.plot(X_train,y_train,'o',c='blue')
plt.plot(X_test,y_test,'o',c='red')
plt.plot(line, lr.predict(line))
plt.plot(line, polylr.predict(line_poly))
plt.plot(line, polyridge.predict(line_poly))
ax = plt.gca()
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.set_ylim(-6, 6)
ax.legend(["training data","test data","linear","lin., poly. features","ridge, alpha="+str(alphaval)], loc=0,fontsize=10)
ax.grid(True)


# How do you interpret the accuracy scores above in view of this plot?

# # Exploring trade-offs of model complexity
#
# ## Linear regression on polynomial features of different degree
#
# We now consider the errors resulting from linear regression with polynomial features of different degrees, in order to explore the behavior of training and test errors for different model complexities.
#
# To this end, we write a snippet of code that creates two [numpy.ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray) objects 'train_errors' and 'test_errors' that contain the mean squared errors on both training and test set that are obtained by running linear regression on polynomial features of degrees between 1 and 30.

# In[ ]:

# %% Explore trade-off of model complexity
degree_range = np.arange(1, 31)
train_errors = np.zeros(len(degree_range))
test_errors = np.zeros(len(degree_range))
for k, degree in enumerate(degree_range):
    poly_k = PolynomialFeatures(degree=degree)                 # define polynomial data model
    X_trainpoly_k = poly_k.fit_transform(X_train)              # obtain coordinates of polynomial features of training set
    X_testpoly_k = poly_k.transform(X_test)                    # obtain coordinates of polynomial features of test set
    polylr_k = LinearRegression().fit(X_trainpoly_k, y_train)  # perform linear regression with polynomial features
    train_errors[k] = mean_squared_error(y_train, polylr_k.predict(X_trainpoly_k))
    test_errors[k] = mean_squared_error(y_test, polylr_k.predict(X_testpoly_k))


# Using this, we create a plot containing the training and test errors on the $y$-axis and the degree of the polynomial model on the $x$-axis.

# In[ ]:

# %% Plot training and test errors for different model complexities
plt.figure()
plt.plot(degree_range,train_errors)
plt.plot(degree_range,test_errors)
ax = plt.gca()
ax.set_yscale('log')
ax.legend(["training data","test data"], loc=0)
ax.set(xlabel='degree of polynomial', ylabel='mean squared error', title='Linear regression with polynomial features of different degree');
print("Degree resulting in smallest error on test data: %d" % (np.argmin(test_errors)+1))


# We observe that for growing model complexity, the training error *decreases*, whereas the test error actually *increases*.
#
# Can you explain this behavior?

# ## Ridge regression on polynomial features with different regularization parameters
#
# Next, we focus on ridge regression on polynomial features of fixed degree, but using different regularization parameters (between $\alpha=10^{-5}$ and $\alpha=10^{9}$).
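# (Aside, as preparation for the next cells: the hyperparameter search below reuses our fixed training/test split via scikit-learn's PredefinedSplit. As a minimal illustration - the array 'demo_fold' is made up for this purpose only - entries equal to -1 mark samples that always stay in the training set, while entries >= 0 give the index of the test fold a sample belongs to.)

# In[ ]:

# Illustration only: how PredefinedSplit interprets a 'test_fold' array
from sklearn.model_selection import PredefinedSplit
demo_fold = np.array([-1, -1, 0, -1, 0])  # -1: always training, 0: member of the single test fold
for train_idx, test_idx in PredefinedSplit(demo_fold).split():
    print("train:", train_idx, "test:", test_idx)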
# In[ ]:

import warnings
from scipy.linalg import LinAlgWarning
warnings.filterwarnings("ignore", category=LinAlgWarning, module='sklearn')


# In the following, we fix the polynomial degree to 11. For such polynomial features, write code that
# - uses the available training-test set split to run a [hyperparameter optimization](https://scikit-learn.org/stable/modules/grid_search.html#tuning-the-hyper-parameters-of-an-estimator) for ridge regression over $60$ different values of $\alpha$, logarithmically spaced between $10^{-5}$ and $10^{9}$. Relevant methods you might want to use are _PredefinedSplit_ and _GridSearchCV_.
# - Eventually, extract the training and test errors into the numpy.ndarrays 'train_errors_ridge' and 'test_errors_ridge'.

# In[ ]:

degree = 11
# %% Perform hyperparameter optimization for ridge regression on polynomial features of degree = 11
## Define input of a "test fold" to be used by the "PredefinedSplit" method of scikit-learn
test_fold = np.zeros((nr_samples, ))  # samples with entry 0 form the (single) test fold
test_fold[id_train] = -1              # samples with entry -1 are always kept in the training set
alphas = np.logspace(-5, 9, 60)       # create vector of logarithmically interpolated values between 10^(-5) and 10^(9)
parameters = {'alpha':alphas}
poly = PolynomialFeatures(degree=degree)  # degree-11 features; reused by the plotting cell below
X_poly = poly.fit_transform(X)            # polynomial features of all samples
gridsearch = GridSearchCV(Ridge(), parameters, cv=PredefinedSplit(test_fold),
                          scoring='neg_mean_squared_error', return_train_score=True)
gridsearch.fit(X_poly, y)
train_errors_ridge = -gridsearch.cv_results_['mean_train_score']
test_errors_ridge = -gridsearch.cv_results_['mean_test_score']


# In[ ]:

# %% Plot training and test errors for different model complexities
plt.figure()
plt.plot(alphas,train_errors_ridge)
plt.plot(alphas,test_errors_ridge)
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('log')
ax.set(xlabel='alpha', ylabel='mean squared error', title='Ridge regression for different model complexities')
ax.legend(["training data","test data"], loc=0)
#ax.set_ylim(0, 20)


# Finally, let's plot the function obtained from the "optimal" ridge regression model (optimized regularization parameter $\alpha$) and compare it to the ridge regression models that we obtain if we choose $\alpha = 10^{-5}$ or $\alpha = 10^{9}$.

# In[ ]:

# %% Plot optimal, under-fitted and over-fitted model
alpha_opt = gridsearch.best_params_['alpha']
alpha_overfit = alphas[0]    # smallest alpha: almost no regularization, over-fitted model
alpha_underfit = alphas[-1]  # largest alpha: very strong regularization, under-fitted model
X_trainpoly0 = poly.fit_transform(X_train) # obtain coordinates of polynomial features of training set
rd = Ridge(alpha=alpha_opt).fit(X_trainpoly0,y_train)
plt.figure(figsize=(14,8))
plt.plot(X_train,y_train,'o',c='blue')
plt.plot(X_test,y_test,'o',c='red')
line_poly = poly.transform(line)
plt.plot(line,rd.predict(line_poly))
rd = Ridge(alpha=alpha_overfit).fit(X_trainpoly0,y_train)
plt.plot(line,rd.predict(line_poly))
rd = Ridge(alpha=alpha_underfit).fit(X_trainpoly0,y_train)
plt.plot(line,rd.predict(line_poly))
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.legend(["training data","test data","optimized alpha","alpha="+str(alpha_overfit),"alpha="+str(alpha_underfit)], loc=0)
print("'Optimal' regularization parameter alpha (i.e., resulting in smallest test error): %f" % alpha_opt)
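# (As an optional closing check - not part of the original exercise, and 'X_testpoly0' and 'rd_check' are helper names introduced only here - we can also compare the test mean squared errors of the three plotted models numerically instead of only visually.)

# In[ ]:

# Optional check: test MSE of the optimal, over-fitted and under-fitted ridge models
X_testpoly0 = poly.transform(X_test)  # polynomial features of the test set, using the transform fitted above
for a in [alpha_opt, alpha_overfit, alpha_underfit]:
    rd_check = Ridge(alpha=a).fit(X_trainpoly0, y_train)
    print("alpha = %g, test MSE = %.3f" % (a, mean_squared_error(y_test, rd_check.predict(X_testpoly0))))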