#!/usr/bin/env python
# coding: utf-8

# # Linear regression, ridge regression and polynomial features
# ## Case study with a simple one-dimensional function
#
# Using the example of a simple sinusoidal one-dimensional function, we explore:
# - the idea of a [training / test (or validation) set split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) of samples, and based on this, we evaluate the predictive power of
# - [linear regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html),
# - [ridge regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html), which is an **L_2-regularized** linear model,
#
# and furthermore, we explore the effect of enriching the model by using [polynomial features](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html).

# First, we load the packages and functions that we will need.

# In[ ]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit  # used later for the hyperparameter search with a fixed training/test split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV


# We define a function to sample $n=\text{'n_samples'}$ points $x_1,\ldots,x_n$ uniformly at random and define
# corresponding samples $y_i=f(x_i) + \epsilon_i$, where
# $f(x)= \sin(4 x) + x$ and the $\epsilon_i$ are i.i.d. standard normal random variables.

# In[ ]:

### function to create random samples from a wave-like signal
def make_wave(n_samples=100):
    rnd = np.random.RandomState(42)
    x = rnd.uniform(-3, 3, size=n_samples)
    x = np.sort(x)
    y_no_noise = (np.sin(4 * x) + x)
    y = y_no_noise + rnd.normal(size=len(x))
    return x.reshape(-1, 1), y


# Visualize the function f:
fn = lambda x: np.sin(4 * x) + x
line_xvalues = np.linspace(-3, 3, 1000).reshape(-1, 1)
plt.figure(figsize=(14,8))
plt.plot(line_xvalues, fn(line_xvalues))
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.grid(True)


# We create a data set (consisting of $x$-coordinates in '$X$' and $y$-coordinates in '$y$') based on the above model, using 'nr_samples' samples.

# In[ ]:

# %% Create data, create training and test data split
nr_samples = 50
X, y = make_wave(n_samples=nr_samples)


# We now use the method [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html?highlight=train_test#sklearn.model_selection.train_test_split) of scikit-learn to split the available samples into two groups, a training dataset and a test dataset. The share of samples in the training data is given by the choice of 'train_share', i.e., if $\text{train_share} = 0.6$, then 60% of the samples are used in the training dataset.
#
# From the samples variable $X$, we create a [pandas](https://pandas.pydata.org/docs/) DataFrame, as its methods are sometimes more convenient to use - for example, we can easily obtain a vector of indices corresponding to the training and test set, respectively. We define the corresponding vectors 'id_train' and 'id_test'.
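# (As a small aside - not part of the exercise, and the names 'toy_df', 'toy_y', 'toy_train', 'toy_test' below are made up purely for illustration - the following toy cell sketches why this works: when train_test_split is given a DataFrame, the returned training and test parts keep the original row index.)

# In[ ]:

# Illustration only: train_test_split keeps the row index of a DataFrame input
toy_df = pd.DataFrame({'x': np.arange(10)})
toy_y = 2.0 * np.arange(10)
toy_train, toy_test, _, _ = train_test_split(toy_df, toy_y, train_size=0.6, random_state=0)
print(toy_train.index)  # row indices that ended up in the training part
print(toy_test.index)   # row indices that ended up in the test part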
# In[ ]:

train_share = 0.6
dfX = pd.DataFrame(X)
# split data into a share of 'train_share' in the training data set and a share of 1-'train_share' in the test data set
X_train, X_test, y_train, y_test = train_test_split(dfX, y, train_size=train_share)
id_train = X_train.index      # obtain index of training set
X_train = X_train.to_numpy()  # convert the DataFrame object into a numpy.ndarray
id_test = X_test.index        # obtain index of test set
X_test = X_test.to_numpy()    # convert the DataFrame object into a numpy.ndarray


# The samples can be visualized as follows.

# In[ ]:

# %% Plot training data
plt.figure(figsize=(14,8))
plt.plot(X_train,y_train,'o',c='blue')
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.legend(["training data"],loc=0,fontsize=8)
ax.grid(True)

# %% Plot training data and test data
plt.figure(figsize=(14,8))
plt.plot(X_train,y_train,'o',c='blue')
plt.plot(X_test,y_test,'o',c='red')
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.set_xlim(-8, 8)
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.legend(["training data","test data"],loc=0,fontsize=8)
ax.grid(True)


# We now train a linear regression model on the samples of the training set, and evaluate the mean squared errors as well as the resulting $R^2$ coefficients. We also train a linear and a ridge regression model (the latter with regularization parameter $\alpha=\text{alphaval}$) on **polynomial features** of degree 'degree_poly'.

# We first fit the linear regression model to the training data with unmodified features. This corresponds to a truly linear model in the standard coordinate system.

# In[ ]:

# %% Fit models to training data
lr = LinearRegression().fit(X_train, y_train) # linear regression


# ### Generation of Polynomial Features
#
# Given the wave-like behavior of the function $f$ underlying the data generation process,
# we would like our models to have greater expressive power.
# For this reason, we use the appropriate method of sklearn's [preprocessing](https://scikit-learn.org/stable/modules/classes.html?highlight=preprocessing#module-sklearn.preprocessing) module to transform the $x$-samples into vectors of monomials, and run linear regression on these "enhanced" features derived from the samples.

# In[ ]:

# %% Run linear regression with polynomial features of degree 'degree_poly':
degree_poly = 10
poly = PolynomialFeatures(degree=degree_poly)          # define polynomial data model
X_trainpoly = poly.fit_transform(X_train)              # obtain coordinates of polynomial features of training set
X_testpoly = poly.transform(X_test)                    # obtain coordinates of polynomial features of test set
polylr = LinearRegression().fit(X_trainpoly, y_train)  # perform linear regression with polynomial features


# Now, we use ridge regression on the polynomial features for a fixed regularization parameter $\alpha=\text{alphaval}$. Please note the available options of scikit-learn's [linear models](https://scikit-learn.org/stable/modules/linear_model.html#linear-model).

# In[ ]:

# %% Run ridge regression with polynomial features
alphaval = 10                                                # ridge regression parameter alpha
polyridge = Ridge(alpha=alphaval).fit(X_trainpoly, y_train)  # perform ridge regression with polynomial features


# We output the $R^2$ for the three considered models, for both the training and the test set, as well as the _mean squared errors_ (MSE). To obtain simple, formatted output, it is convenient to create a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html?highlight=dataframe#pandas.DataFrame) that includes the text as a row description and the quantities as entries.
#
# It is not necessary to write a separate function to compute the $R^2$ coefficients or the MSEs - use the appropriate scikit-learn methods for that!
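# (As a brief illustration - not part of the exercise, and the arrays 'y_true_demo', 'y_pred_demo' are made-up numbers - the helpers in question are mean_squared_error from sklearn.metrics and the estimators' score method, which for regressors returns the $R^2$ coefficient, i.e., the same value as r2_score.)

# In[ ]:

# Illustration only: MSE and R^2 on made-up values
from sklearn.metrics import r2_score
y_true_demo = np.array([1.0, 2.0, 3.0, 4.0])
y_pred_demo = np.array([1.1, 1.9, 3.2, 3.8])
print(mean_squared_error(y_true_demo, y_pred_demo))  # mean of the squared residuals
print(r2_score(y_true_demo, y_pred_demo))            # value a fitted regressor's .score() would report for these predictions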
# In[ ]:

text_trainerrors = ['Training error of linear regression:','Training error of linear regression, polynomial features:','Training error of ridge regression, alpha = '+ str(alphaval)+', polynomial features:']
trainerrors_list = [mean_squared_error(y_train, lr.predict(X_train)),
                    mean_squared_error(y_train, polylr.predict(X_trainpoly)),
                    mean_squared_error(y_train, polyridge.predict(X_trainpoly))]
text_testerrors = ['Test error of linear regression:','Test error of linear regression, polynomial features:','Test error of ridge regression, alpha = '+ str(alphaval)+', polynomial features:']
testerrors_list = [mean_squared_error(y_test, lr.predict(X_test)),
                   mean_squared_error(y_test, polylr.predict(X_testpoly)),
                   mean_squared_error(y_test, polyridge.predict(X_testpoly))]
text_r2 = ['R^2 (training) of linear regression:','R^2 (training) of linear regression, polynomial features:','R^2 (training) of ridge regression, alpha = '+ str(alphaval)+', polynomial features:']
r2_train_list = [lr.score(X_train, y_train), polylr.score(X_trainpoly, y_train), polyridge.score(X_trainpoly, y_train)]
text_r2_test = ['R^2 (test) of linear regression:','R^2 (test) of linear regression, polynomial features:','R^2 (test) of ridge regression, alpha = '+ str(alphaval)+', polynomial features:']
r2_test_list = [lr.score(X_test, y_test), polylr.score(X_testpoly, y_test), polyridge.score(X_testpoly, y_test)]
pd.options.display.max_colwidth = 100
# create and print DataFrame from above
print(pd.DataFrame({'description': text_trainerrors + text_testerrors + text_r2 + text_r2_test,
                    'value': trainerrors_list + testerrors_list + r2_train_list + r2_test_list}))


# In[ ]:


# We summarize the resulting curves in a plot.

# In[ ]:

# %% Plot summary
line = np.linspace(-8, 8, 1000).reshape(-1, 1)
line_poly = poly.transform(line)
plt.figure(figsize=(14,8))
plt.plot(X_train,y_train,'o',c='blue')
plt.plot(X_test,y_test,'o',c='red')
plt.plot(line, lr.predict(line))
plt.plot(line, polylr.predict(line_poly))
plt.plot(line, polyridge.predict(line_poly))
ax = plt.gca()
ax.spines['left'].set_position('center')
ax.spines['right'].set_color('none')
ax.spines['bottom'].set_position('center')
ax.spines['top'].set_color('none')
ax.set_ylim(-6, 6)
ax.legend(["training data","test data","linear","lin., poly. features","ridge, alpha="+str(alphaval)], loc=0,fontsize=10)
ax.grid(True)


# How do you interpret the accuracy scores above in view of this plot?

# # Exploring trade-offs of model complexity
#
# ## Linear regression on polynomial features of different degree
#
# We now consider the errors resulting from linear regression with polynomial features of different degrees, in order to explore the behavior of training and test errors for different model complexities.
#
# To this end, we write a snippet of code that creates two [numpy.ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html#numpy.ndarray) objects 'train_errors' and 'test_errors' that contain the mean squared errors on both training and test set that are obtained by running linear regression on polynomial features of degrees between 1 and 30.

# In[ ]:

# %% Explore trade-off of model complexity
degree_range = np.arange(1, 31)
train_errors = np.zeros(len(degree_range))
test_errors = np.zeros(len(degree_range))
for k, degree in enumerate(degree_range):
    poly_k = PolynomialFeatures(degree=degree)                 # define polynomial data model
    X_trainpoly_k = poly_k.fit_transform(X_train)              # obtain coordinates of polynomial features of training set
    X_testpoly_k = poly_k.transform(X_test)                    # obtain coordinates of polynomial features of test set
    polylr_k = LinearRegression().fit(X_trainpoly_k, y_train)  # perform linear regression with polynomial features
    train_errors[k] = mean_squared_error(y_train, polylr_k.predict(X_trainpoly_k))
    test_errors[k] = mean_squared_error(y_test, polylr_k.predict(X_testpoly_k))


# Using this, we create a plot containing the training and test errors on the $y$-axis and the degree of the polynomial model on the $x$-axis.

# In[ ]:

# %% Plot training and test errors for different model complexities
plt.figure()
plt.plot(degree_range,train_errors)
plt.plot(degree_range,test_errors)
ax = plt.gca()
ax.set_yscale('log')
ax.legend(["training data","test data"], loc=0)
ax.set(xlabel='degree of polynomial', ylabel='mean squared error', title='Linear regression with polynomial features of different degree');
print("Degree resulting in smallest error on test data: %d" % (np.argmin(test_errors)+1))


# We observe that for growing model complexity, the training error *decreases*, whereas the test error actually *increases*.
#
# Can you explain this behavior?

# ## Ridge regression on polynomial features with different regularization parameters
#
# Next, we focus on ridge regression on polynomial features of fixed degree, but using different regularization parameters (between $\alpha=10^{-5}$ and $\alpha=10^{9}$).
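# (Aside, as preparation for the next cells: the hyperparameter search below reuses our fixed training/test split via scikit-learn's PredefinedSplit. As a minimal illustration - the array 'demo_fold' is made up for this purpose only - entries equal to -1 mark samples that always stay in the training set, while entries >= 0 give the index of the test fold a sample belongs to.)

# In[ ]:

# Illustration only: how PredefinedSplit interprets a 'test_fold' array
from sklearn.model_selection import PredefinedSplit
demo_fold = np.array([-1, -1, 0, -1, 0])  # -1: always training, 0: member of the single test fold
for train_idx, test_idx in PredefinedSplit(demo_fold).split():
    print("train:", train_idx, "test:", test_idx)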
# In[ ]:

import warnings
from scipy.linalg import LinAlgWarning
warnings.filterwarnings("ignore", category=LinAlgWarning, module='sklearn')


# In the following, we fix the polynomial degree to 11. For such polynomial features, write code that
# - uses the available training-test set split to run a [hyperparameter optimization](https://scikit-learn.org/stable/modules/grid_search.html#tuning-the-hyper-parameters-of-an-estimator) for ridge regression over $60$ different values of $\alpha$, logarithmically spaced between $10^{-5}$ and $10^{9}$. Relevant methods you might want to use are _PredefinedSplit_ and _GridSearchCV_.
# - Eventually, extract the training and test errors into the numpy.ndarrays 'train_errors_ridge' and 'test_errors_ridge'.

# In[ ]:

degree = 11
# %% Perform hyperparameter optimization for ridge regression on polynomial features of degree = 11
## Define input of a "test fold" to be used by the "PredefinedSplit" method of scikit-learn
test_fold = np.zeros((nr_samples, ))  # samples with entry 0 form the (single) test fold
test_fold[id_train] = -1              # samples with entry -1 are always kept in the training set
alphas = np.logspace(-5, 9, 60)       # create vector of logarithmically interpolated values between 10^(-5) and 10^(9)
parameters = {'alpha':alphas}
poly = PolynomialFeatures(degree=degree)  # degree-11 features; reused by the plotting cell below
X_poly = poly.fit_transform(X)            # polynomial features of all samples
gridsearch = GridSearchCV(Ridge(), parameters, cv=PredefinedSplit(test_fold),
                          scoring='neg_mean_squared_error', return_train_score=True)
gridsearch.fit(X_poly, y)
train_errors_ridge = -gridsearch.cv_results_['mean_train_score']
test_errors_ridge = -gridsearch.cv_results_['mean_test_score']


# In[ ]:

# %% Plot training and test errors for different model complexities
plt.figure()
plt.plot(alphas,train_errors_ridge)
plt.plot(alphas,test_errors_ridge)
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('log')
ax.set(xlabel='alpha', ylabel='mean squared error', title='Ridge regression for different model complexities')
ax.legend(["training data","test data"], loc=0)
#ax.set_ylim(0, 20)


# Finally, let's plot the function obtained from the "optimal" ridge regression model (optimized regularization parameter $\alpha$) and compare it to the ridge regression models that we obtain if we choose $\alpha = 10^{-5}$ or $\alpha = 10^{9}$.

# In[ ]:

# %% Plot optimal, under-fitted and over-fitted model
alpha_opt = gridsearch.best_params_['alpha']
alpha_overfit = alphas[0]    # smallest alpha: almost no regularization, over-fitted model
alpha_underfit = alphas[-1]  # largest alpha: very strong regularization, under-fitted model
X_trainpoly0 = poly.fit_transform(X_train) # obtain coordinates of polynomial features of training set
rd = Ridge(alpha=alpha_opt).fit(X_trainpoly0,y_train)
plt.figure(figsize=(14,8))
plt.plot(X_train,y_train,'o',c='blue')
plt.plot(X_test,y_test,'o',c='red')
line_poly = poly.transform(line)
plt.plot(line,rd.predict(line_poly))
rd = Ridge(alpha=alpha_overfit).fit(X_trainpoly0,y_train)
plt.plot(line,rd.predict(line_poly))
rd = Ridge(alpha=alpha_underfit).fit(X_trainpoly0,y_train)
plt.plot(line,rd.predict(line_poly))
ax = plt.gca()
ax.set_ylim(-5, 5)
ax.legend(["training data","test data","optimized alpha","alpha="+str(alpha_overfit),"alpha="+str(alpha_underfit)], loc=0)
print("'Optimal' regularization parameter alpha (i.e., resulting in smallest test error): %f" % alpha_opt)
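# (As an optional closing check - not part of the original exercise, and 'X_testpoly0' and 'rd_check' are helper names introduced only here - we can also compare the test mean squared errors of the three plotted models numerically instead of only visually.)

# In[ ]:

# Optional check: test MSE of the optimal, over-fitted and under-fitted ridge models
X_testpoly0 = poly.transform(X_test)  # polynomial features of the test set, using the transform fitted above
for a in [alpha_opt, alpha_overfit, alpha_underfit]:
    rd_check = Ridge(alpha=a).fit(X_trainpoly0, y_train)
    print("alpha = %g, test MSE = %.3f" % (a, mean_squared_error(y_test, rd_check.predict(X_testpoly0))))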