#!/usr/bin/env python
# coding: utf-8

# # Principal Component Analysis
#
# We return to some of the previously considered data sets to explore them via principal component analysis (PCA). You can find information about how to use it in scikit-learn [here](https://scikit-learn.org/stable/modules/decomposition.html#pca), and it can be called via [sklearn.decomposition.PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA). For larger datasets, especially if sparse matrix formats are used, it might be more feasible to use [sklearn.decomposition.TruncatedSVD](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD), which is computationally more efficient.

# **Note:** For the 3D plots of this notebook below, we will use an _interactive mode_ of matplotlib. You should be able to run everything smoothly after installing the package _ipympl_. You can install this package (if you are using conda/Anaconda) by the line
#
# `conda install -c conda-forge ipympl`
#
# or by looking for the package `ipympl` in the GUI of Anaconda Navigator and its package search for your environment.

# ## Iris dataset
#
# First we load the [Iris dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html?highlight=iris#sklearn.datasets.load_iris), which we are already familiar with.

# In[1]:


from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
X = iris.data
y = iris.target


# In[2]:


print(iris.DESCR)


# We remember that the samples in this dataset contain **four** features:

# In[3]:


print(iris.feature_names)


# In[4]:


import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# a small loop for creating a list with the category names
iris.target_class = []
for k in range(len(iris.target)):
    iris.target_class.append(iris.target_names[iris.target[k]])

# for the pairplot, first create a DataFrame
df_iris = pd.DataFrame(iris.data, columns=iris['feature_names'])
df_iris = df_iris.assign(iris_class=iris.target_class)

sns.pairplot(df_iris, hue="iris_class", palette="tab10")
fig = plt.gcf()
fig.set_size_inches(20, 20)


# These are the pairwise scatter plots we created previously.
#
# Is there a better way to visualize the data, without the need for all these pairwise plots?
# Our brain is typically able to visualize at most 3 dimensions well...
#
# Let's use PCA with two principal components.

# In[5]:


from sklearn.decomposition import PCA

n_components = 2
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(12, 12))
for i, target_name in zip([0, 1, 2], iris.target_names):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], lw=2, label=target_name)
plt.title("PCA of iris dataset")
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.axis([-4, 4, -1.5, 1.5])
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.show()


# Visually, this does not look very different from some of the scatter plots above, but it **is** conceptually different: in a single 2D plot, we are able to "summarize" all four features.
#
# Let's repeat PCA for **three** principal components.
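# First, though, a quick sanity check (a minimal sketch, assuming the 2-component `pca` and `X_pca` from the cell above are still in scope): the PCA scores are exactly the centered data multiplied by the transposed loading matrix `pca.components_`.

# In[ ]:


# Sketch: the scores returned by fit_transform equal (X - mean) @ V^T,
# where the rows of pca.components_ are the principal directions.
X_centered = X - X.mean(axis=0)
print(np.allclose(X_centered @ pca.components_.T, X_pca))  # expected: True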
# In[6]:


# A first, quick 3D scatter plot (only one of the three classes is shown here)
from mpl_toolkits.mplot3d import Axes3D

n_components = 3
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_pca[y == 1, 0], X_pca[y == 1, 1], X_pca[y == 1, 2])
plt.show()


# In[7]:


### NOTE: In order to run this properly, you need to switch to an "interactive" graphics backend,
# for example in Spyder under Preferences > IPython Console > Graphics
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'widget')

from mpl_toolkits.mplot3d import Axes3D

n_components = 3
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

fig = plt.figure()  # (figsize=(12, 12))
ax = fig.add_subplot(111, projection='3d')
for i, target_name in zip([0, 1, 2], iris.target_names):
    ax.scatter3D(X_pca[y == i, 0], X_pca[y == i, 1], X_pca[y == i, 2],
                 lw=4, label=target_name, marker="o", edgecolors='face')
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_zlabel('Third principal component')
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.show()


# This now gives a more flexible visualization that summarizes basically all the information we have available in one plot.
#
# Remember that the coordinates of the data points in this plot correspond to what we called the _principal components_ -- they are the rows of the matrix $\mathbf{X} \mathbf{V}$.
#
# Note also the different scaling of the axes: the spread of the data along the $i$-th axis is proportional to $\sqrt{\lambda_i}$, where $\lambda_i$ is the $i$-th eigenvalue of the empirical covariance matrix $\mathbf{M} = \frac{1}{n}\mathbf{X}^T \mathbf{X}$. In other words, the scaling is proportional to the square root of the _variance explained by the $i$-th principal component_.

# So, one may ask: how much of the total variance of the data set is explained by the first three principal components?

# In[8]:


vars(pca)


# In[9]:


print("Variance explained by each of the first three PCs:")
pca.explained_variance_


# The ratio of the variance explained by the first three PCs, given by
# $$
# \frac{\sum_{i=1}^3 \lambda_i}{\sum_{i=1}^4 \lambda_i},
# $$
# amounts to:

# In[10]:


np.sum(pca.explained_variance_ratio_)


# This is more than 99%, which means that the information lost by the projection from four to three dimensions is probably negligible.
#
# Let's squeeze out some more information from what we have done. In order to better understand how the four different features relate to each other, we create a so-called [biplot](https://en.wikipedia.org/wiki/Biplot).
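# Before building the biplot, here is a small sketch (assuming `X` and `pca` from the cells above) that recomputes this ratio directly from the eigenvalues of the empirical covariance matrix; the result should match `np.sum(pca.explained_variance_ratio_)`.

# In[ ]:


# Sketch: explained-variance ratio of the first three PCs, computed directly from
# the eigenvalues of the covariance matrix of the centered data (sorted descending).
X_centered = X - X.mean(axis=0)
eigvals = np.linalg.eigvalsh(X_centered.T @ X_centered / X.shape[0])[::-1]
print(eigvals[:3].sum() / eigvals.sum())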
# In[11]:


# Create biplot
V = pca.components_

fig = plt.figure()  # (figsize=(12, 12))
ax = fig.add_subplot(111, projection='3d')
mx = np.max(X_pca)
mn = np.min(X_pca)
for i, target_name in zip([0, 1, 2], iris.target_names):
    ax.scatter3D(X_pca[y == i, 0], X_pca[y == i, 1], X_pca[y == i, 2],
                 lw=4, label=target_name, marker="o", edgecolors='face')

# draw the loading vectors, scaled by the standard deviation along each PC
scaling_fac = 3
for i in range(0, 4):
    ax.quiver(0, 0, 0,
              scaling_fac * np.sqrt(pca.explained_variance_[0]) * V[0, i],
              scaling_fac * np.sqrt(pca.explained_variance_[1]) * V[1, i],
              scaling_fac * np.sqrt(pca.explained_variance_[2]) * V[2, i],
              color='red')
    ax.text(scaling_fac * np.sqrt(pca.explained_variance_[0]) * V[0, i],
            scaling_fac * np.sqrt(pca.explained_variance_[1]) * V[1, i],
            scaling_fac * np.sqrt(pca.explained_variance_[2]) * V[2, i],
            iris.feature_names[i])

ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_zlabel('Third principal component')
ax.set_xlim(mn, mx)
ax.set_ylim(mn, mx)
ax.set_zlim(mn, mx)
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.show()


# Note that we rescaled the axes compared to the 3D scatter plot above in order to better visualize the different magnitudes of the principal components.
#
# From the quiver plot of the loading vectors, we can conclude that the feature 'sepal width' is quite uncorrelated with the other three features. This is due to the approximate orthogonality that we observe between its loading vector and the other three, while those three loading vectors have quite large scalar products / small angles with each other.

# ### Boston housing dataset
#
# We now want to see how unsupervised learning via PCA can be useful in a regression setting, using the [Boston housing dataset](https://scikit-learn.org/stable/datasets/toy_dataset.html#boston-dataset).
#
# In particular, we want to _reduce the dimensionality_ of the 13-dimensional feature space before regressing on the housing prices (variable 'MEDV').
#
# 1.
# a) Load the dataset (sklearn.datasets.load_boston) that we already used previously. Use PCA to reduce the dimensionality of the original feature space to _5_ features, and run ridge regression on this feature space of reduced dimensionality. How does the prediction performance compare to what you saw previously? (Note: unifying the scaling of the different features is important here, for example via [sklearn.preprocessing.StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html?highlight=standardscaler#sklearn.preprocessing.StandardScaler).)
#
# b) Create a principal component plot of the features (after scaling), using a dimensionality of 2 or 3. Can you infer something about the correlations between different features?
#
# ### NLP data: Newsgroups dataset
#
# 2. Load the [Newsgroups dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html?highlight=newsgroups#sklearn.datasets.fetch_20newsgroups). Previously, we extracted features from the text documents using a count vectorizer and a TF-IDF vectorizer.
# a) Repeat the classification experiments we ran before, but reduce the dimensionality by principal component analysis to $r \in \{10, 100, 1000\}$ features. How does this influence the classification accuracy? (A sketch for this exercise is given at the end of the notebook.)

# ###############################################

# 1.a) Load Boston housing data set.
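# The next cell uses `sklearn.datasets.load_boston`, which has been removed from recent scikit-learn versions (1.2 and later). If it fails in your installation, the following minimal sketch fetches the same data from the original source; the variable names `boston_data`, `boston_target` and `boston_feature_names` are placeholders that could then be used in place of `boston.data`, `boston.target` and `boston.feature_names` below.

# In[ ]:


# Fallback sketch in case sklearn.datasets.load_boston is no longer available:
# fetch the raw data from the original CMU StatLib source and rebuild the arrays.
import numpy as np
import pandas as pd

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
boston_data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # 13 features
boston_target = raw_df.values[1::2, 2]                                     # MEDV
boston_feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']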
# In[12]:


import pandas as pd
from sklearn.datasets import load_boston

# %% Load data set
boston = load_boston()
print(boston.data.shape)  # get (number of rows, number of columns or 'features')
print(boston.DESCR)       # get a description of the dataset

df = pd.DataFrame(boston.data, columns=boston['feature_names'])
df = df.assign(MEDV=boston.target)
y = df['MEDV'].values
df.columns
x = df.drop(['MEDV'], axis=1).values


# Standardize the scale of the dataset:

# In[13]:


# Standardizing
# It's an important step to standardize the features before doing PCA
from sklearn.preprocessing import StandardScaler

x = StandardScaler().fit_transform(x)


# Apply PCA:

# In[14]:


# PCA projection
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
pca.explained_variance_ratio_


# In[15]:


principalDf


# In[16]:


from sklearn.model_selection import train_test_split

test_share = 0.2
X_train, X_test, y_train, y_test = train_test_split(principalComponents, y,
                                                    random_state=10, test_size=test_share)


# In[17]:


# %% Set up ridge regression model
import numpy as np
from sklearn.linear_model import Ridge

alpha_ridge = 1
ridge = Ridge(alpha=alpha_ridge, fit_intercept=True)
ridge.fit(X_train, y_train)

# Compute prediction scores for training and test data and create a scatter plot
# of predicted vs. true median housing prices


def plot_correlation_true_predicted(y_train, y_test, model):
    # Scatter plot of true vs. predicted median housing price for the given model
    plt.figure(figsize=(14, 14))
    plt.scatter(y_train, model.predicted_train)
    plt.scatter(y_test, model.predicted_test)
    ax = plt.gca()
    ax.set_ylim(0, 1.02 * max(np.max(model.predicted_train), np.max(model.predicted_test)))
    ax.set_xlim(0, 1.02 * max(np.max(y_train), np.max(y_test)))
    ax.legend(["training data", "test data"], loc=0)
    ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
    ax.set(ylabel='predicted median housing price (in $1000)',
           xlabel='true median housing price (in $1000)')
    ax.set(title="Correlation between true and predicted prices for model " + str(model))


def eval_prediction_train_test(model, X_train, X_test, y_train, y_test, plot=True):
    model.score_train = model.score(X_train, y_train)
    model.score_test = model.score(X_test, y_test)
    model.predicted_test = model.predict(X_test)
    model.predicted_train = model.predict(X_train)
    print("R^2 value of model", str(model), "on the train set: %f" % model.score_train)
    print("R^2 value of model", str(model), "on the test set: %f" % model.score_test)
    if plot:
        plot_correlation_true_predicted(y_train, y_test, model)
    return model


ridge = eval_prediction_train_test(ridge, X_train, X_test, y_train, y_test)


# #### Run cross validation for ridge regression

# In[21]:


# %% Use cross validation to optimize the regularization parameter alpha of ridge regression
from sklearn.model_selection import GridSearchCV

alphas = np.logspace(-5, 9, num=60)  # vector of logarithmically spaced values between 10^(-5) and 10^(9)
parameters = {'alpha': alphas}


def gridsearch_crossvalidation_model(model, X, y, parameters, cv=5, plot=True):
    gridsearch = GridSearchCV(model, param_grid=parameters, scoring='r2',
                              return_train_score=True, cv=cv)
    gridsearch.fit(X, y)
    gridsearch.train_errors = gridsearch.cv_results_['mean_train_score']
    gridsearch.test_errors = gridsearch.cv_results_['mean_test_score']
    if plot:
        # Plot training and test errors for different model complexities
        plt.figure(figsize=(8, 8))
        plt.plot(list(parameters.values())[0], gridsearch.train_errors)
        plt.plot(list(parameters.values())[0], gridsearch.test_errors)
        ax = plt.gca()
        ax.set_xscale('log')
        ax.set(xlabel='alpha', ylabel='R2 value',
               title=str(model) + ' for different model complexities')
        # ax.set_box_aspect(1)
        ax.legend(["training data", "test data"], loc=0)
    return gridsearch


gridsearch_ridge = gridsearch_crossvalidation_model(Ridge(), principalComponents, y, parameters)
ridge_optimized = gridsearch_ridge.best_estimator_
ridge_optimized = eval_prediction_train_test(ridge_optimized, X_train, X_test, y_train, y_test)


# #### Run cross validation for lasso regression

# In[22]:


# %% Repeat previous experiments with Lasso instead of ridge regression
from sklearn.linear_model import Lasso

alpha_lasso = 0.5
# train the Lasso for the fixed regularization parameter alpha_lasso
lasso = Lasso(alpha=alpha_lasso, fit_intercept=True).fit(X_train, y_train)
lasso = eval_prediction_train_test(lasso, X_train, X_test, y_train, y_test)
print("Number of features used for model " + str(lasso) + ": {}".format(np.sum(lasso.coef_ != 0)))

# Run cross validation to optimize the regularization parameter alpha of the Lasso
gridsearch_lasso = gridsearch_crossvalidation_model(Lasso(), X_train, y_train, parameters)
lasso_optimized = gridsearch_lasso.best_estimator_
lasso_optimized = eval_prediction_train_test(lasso_optimized, X_train, X_test, y_train, y_test)
print("Number of features used for model " + str(lasso_optimized) + ": {}".format(np.sum(lasso_optimized.coef_ != 0)))


# While the $R^2$ value on the train set is smaller than when using all features, it is larger on the validation set (compare to the notebook from yesterday!).

# #### Orthogonal matching pursuit

# In[23]:


from sklearn.linear_model import OrthogonalMatchingPursuit

n_omp = 4
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_omp)
omp.fit(X_train, y_train)
omp = eval_prediction_train_test(omp, X_train, X_test, y_train, y_test)


# Why does it in fact not make much sense to use OMP in this setting?

# We summarize the prediction accuracy of the methods:

# In[24]:


# %% Summarize results
from sklearn.metrics import mean_squared_error

results = {'Ridge': [ridge_optimized.score_test,
                     mean_squared_error(ridge_optimized.predict(X_test), y_test)],
           'Lasso': [lasso_optimized.score_test,
                     mean_squared_error(lasso_optimized.predict(X_test), y_test)],
           'OMP': [omp.score_test,
                   mean_squared_error(omp.predict(X_test), y_test)]}
df_results = pd.DataFrame(data=results, index=['R2 on test set', 'Test MSE'])
print(df_results)


# The prediction accuracy is, despite the reduced dimensionality, similar to using the full 13 features; arguably, it is even a little higher.

# In[25]:


# %% Plot coefficients for the different models
plt.figure(figsize=(14, 14))
plt.plot(lasso_optimized.coef_, '^', label=str(lasso_optimized))
plt.plot(ridge_optimized.coef_, 'v', label=str(ridge_optimized))
plt.plot(omp.coef_, 'o', label="OMP")
plt.legend(ncol=2, loc=(0, 1.05))
# plt.ylim(-25, 25)
plt.xlabel("Coefficient index")
plt.ylabel("Coefficient magnitude")
ax = plt.gca()
ax.set_xticks([0, 1, 2, 3, 4])
ax.set_xticklabels(['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
# ax.set_aspect(1)


# 1. b) Create a principal component plot of the features (after scaling), using a dimensionality of 2 or 3. Can you infer something about the correlations between different features?
# In[26]:


from mpl_toolkits.mplot3d import Axes3D

n_components = 2


# In[27]:


pca2 = PCA(n_components=n_components)
X_pca2 = pca2.fit_transform(x)


# In[28]:


# Create biplot (in 2 dimensions)
V = pca2.components_

plt.figure(figsize=(12, 12))
plt.scatter(X_pca2[:, 0], X_pca2[:, 1], c=y, cmap='RdPu', lw=2)
plt.title("PCA of Boston housing dataset")
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
ax = plt.gca()

# draw the loading vectors, scaled by the standard deviation along each PC
scaling_fac = 1
for i in range(0, len(boston.feature_names)):
    plt.arrow(0, 0,
              scaling_fac * np.sqrt(pca2.explained_variance_[0]) * V[0, i],
              scaling_fac * np.sqrt(pca2.explained_variance_[1]) * V[1, i],
              color='blue')
    ax.text(scaling_fac * np.sqrt(pca2.explained_variance_[0]) * V[0, i],
            scaling_fac * np.sqrt(pca2.explained_variance_[1]) * V[1, i],
            boston.feature_names[i])
plt.show()


# We repeat the PCA and the biplot for three dimensions.

# In[29]:


n_components = 3
pca3 = PCA(n_components=n_components)
X_pca3 = pca3.fit_transform(x)


# In[30]:


# Create biplot (in 3 dimensions)
V = pca3.components_

fig = plt.figure()  # (figsize=(12, 12))
ax = fig.add_subplot(111, projection='3d')
mx = np.max(X_pca3)
mn = np.min(X_pca3)
ax.scatter3D(X_pca3[:, 0], X_pca3[:, 1], X_pca3[:, 2], c=y, cmap='RdPu', lw=3)

# draw the loading vectors, scaled by the standard deviation along each PC
scaling_fac = 5
for i in range(0, len(boston.feature_names)):
    ax.quiver(0, 0, 0,
              scaling_fac * np.sqrt(pca3.explained_variance_[0]) * V[0, i],
              scaling_fac * np.sqrt(pca3.explained_variance_[1]) * V[1, i],
              scaling_fac * np.sqrt(pca3.explained_variance_[2]) * V[2, i],
              color='red')
    ax.text(scaling_fac * np.sqrt(pca3.explained_variance_[0]) * V[0, i],
            scaling_fac * np.sqrt(pca3.explained_variance_[1]) * V[1, i],
            scaling_fac * np.sqrt(pca3.explained_variance_[2]) * V[2, i],
            boston.feature_names[i])

ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_zlabel('Third principal component')
ax.set_xlim(mn, mx)
ax.set_ylim(mn, mx)
ax.set_zlim(mn, mx)


# It is possible to draw certain conclusions from the two biplots. For example, since in our coloring scheme darker points correspond to locations with a _higher_ median house price, we can figure out which of the 13 attributes rather correlate with high prices, and which with lower prices.
#
# For example, it is quite evident that the attribute 'RM' correlates strongly with the median house price; this is something that we would have guessed already from the pairwise scatter plots if we had looked at them. However, the biplot contains this information in just one plot.

# On the other hand, as the loading vector of the attribute 'CRIM' (related to the crime rate) points towards a collection of locations with low median house price, one can conclude that 'CRIM' is an attribute that negatively influences the house price.

# In[ ]:
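# Finally, a possible starting point for exercise 2a (a minimal sketch, not necessarily the setup used in the earlier notebook): since the TF-IDF matrix is sparse, we use TruncatedSVD, mentioned at the beginning of this notebook, instead of PCA, and logistic regression as a stand-in for whichever classifier was used previously.

# In[ ]:


# Sketch for exercise 2a: reduce TF-IDF features of the 20 Newsgroups data to
# r in {10, 100, 1000} dimensions and compare the resulting test accuracies.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

news_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
news_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(news_train.data)
X_test_tfidf = vectorizer.transform(news_test.data)

for r in [10, 100, 1000]:
    svd = TruncatedSVD(n_components=r, random_state=0)
    X_train_r = svd.fit_transform(X_train_tfidf)
    X_test_r = svd.transform(X_test_tfidf)
    clf = LogisticRegression(max_iter=1000).fit(X_train_r, news_train.target)
    print(r, "components, test accuracy:", clf.score(X_test_r, news_test.target))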