#!/usr/bin/env python
# coding: utf-8

# # Classification problems
# ## Case study with the [iris dataset](https://archive.ics.uci.edu/ml/datasets/iris) analyzed by R.A. Fisher
#
# We study how to use learning methods to solve _classification_ problems with [scikit-learn](https://scikit-learn.org/stable/), focusing on a classical data set with a 4-dimensional feature space.

# First, we load the dataset.

# In[1]:


from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
X = iris.data
y = iris.target


# ## Understanding the data
#
# Let's understand the dataset a little bit. First, we read the names of the categories ("targets").

# In[2]:


print(iris['target_names'])


# Luckily, there is a detailed description included in the dataset:

# In[3]:


print(iris.DESCR)


# We also find the feature names here, as well as the size of the data:

# In[4]:


print(iris.feature_names)
print(iris.data.shape)  # shape of the data matrix

# have a peek at the feature variables
print(iris.data[:5])

# have a peek at the target variables
print(iris.target[:5])


# We can verify the distribution of the samples over the different categories:

# In[5]:


print("Number of samples in first iris class: %i" % (iris.target == 0).sum())
print("Number of samples in second iris class: %i" % (iris.target == 1).sum())
print("Number of samples in third iris class: %i" % (iris.target == 2).sum())


# The data set is small enough that we can efficiently visualize pairwise scatter plots of the features. We use the labels/categories to color the data points.

# In[6]:


iris.target_names


# In[7]:


import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# a small loop for creating a list with the category names
iris.target_class = []
for k in range(len(iris.target)):
    iris.target_class.append(iris.target_names[iris.target[k]])

# for the plot, first create a DataFrame
df_iris = pd.DataFrame(iris.data, columns=iris['feature_names'])
df_iris = df_iris.assign(iris_class=iris.target_class)

sns.pairplot(df_iris, hue="iris_class", palette="tab10")
fig = plt.gcf()
fig.set_size_inches(20, 20)


# We now split the data set into a training and a test set. We choose a 50-50 split; considering the number of samples, this is a rather small training set (but, on the other hand, this is a rather easy dataset).

# In[8]:


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=0.5)


# ## Classification
#
# ### K-nearest neighbors
#
# The first method we try is the [k-nearest neighbor classifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html?highlight=kneighbors#sklearn.neighbors.KNeighborsClassifier). Its main parameter is the number of neighbors 'n_neighbors' used to determine the class assignment.

# In[9]:


from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

k_parameter = 1
knn = KNeighborsClassifier(n_neighbors=k_parameter)
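# As a small illustrative aside (not part of the original analysis), a fitted k-NN classifier can report which training samples are the nearest neighbors of a given point via its `kneighbors` method. The sketch below uses a separate demo classifier with an arbitrarily chosen `n_neighbors=3` and queries the first test sample.

# In[ ]:


# fit a demo classifier on the full training data and inspect the neighbors of the first test sample
knn_demo = KNeighborsClassifier(n_neighbors=3)
knn_demo.fit(X_train, y_train)

distances, indices = knn_demo.kneighbors(X_test[:1])
print("Distances to the 3 nearest training samples:", distances[0])
print("Classes of these neighbors:", y_train[indices[0]])
print("Predicted class:", knn_demo.predict(X_test[:1])[0])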
# ### Create classification maps
# To get a feeling for what the classifiers do, we create classification maps
# for different algorithmic parameters (the number of neighbors considered by KNeighborsClassifier). Since we will call this piece of code several times, we write a function for it.

# In[10]:


def classification_map(X, y, classifier, i=0, j=1, X_test=None, y_test=None, h=0.1):
    '''
    Fit the classifier on features i and j of X and plot its decision regions.

    h: step size in the mesh
    i: first feature number to be plotted
    j: second feature number to be plotted
    '''
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap

    cmap_light = ListedColormap(['#FFBBBB', '#BBFFBB', '#BBBBFF'])
    cmap_bold = ListedColormap(['#CC0000', '#00AA00', '#0000CC'])

    # points in a mesh of [x_min, x_max] x [y_min, y_max]
    x_min, x_max = X[:, i].min() - 1, X[:, i].max() + 1
    y_min, y_max = X[:, j].min() - 1, X[:, j].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    grid = np.c_[xx.ravel(), yy.ravel()]

    # fit the classifier and predict the class of every grid point
    classifier.fit(X, y)
    Z = classifier.predict(grid)

    # put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(5, 5))
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')

    # plot also the training points
    plt.scatter(X[:, i], X[:, j], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-class classification with classifier " + str(classifier))

    if X_test is not None and y_test is not None:
        plt.scatter(X_test[:, i], X_test[:, j], c=y_test, cmap=cmap_bold, marker="x")
        ax = plt.gca()
        ax.legend(["training data", "test data"], loc=0, fontsize=8)


# The idea is to convey how the classifier works. In order to better visualize what is happening, we only use the **first two** features, ignoring the other two.

# In[11]:


classification_map(X_train[:, :2], y_train, knn, X_test=X_test[:, :2], y_test=y_test)


# In[12]:


k_parameter = 6
knn6 = KNeighborsClassifier(n_neighbors=k_parameter)
classification_map(X_train[:, :2], y_train, knn6, X_test=X_test[:, :2], y_test=y_test)

k_parameter = 30
knn30 = KNeighborsClassifier(n_neighbors=k_parameter)
classification_map(X_train[:, :2], y_train, knn30, X_test=X_test[:, :2], y_test=y_test)


# The different background colors indicate the decision regions. Do you understand why the boundaries look the way they do?

# Let's now see how well we can classify the iris data with these classifiers.

# In[13]:


print("Prediction accuracy for", str(knn), "on the training set: %f" % knn.score(X_train[:, :2], y_train))
print("Prediction accuracy for", str(knn), "on the test set: %f" % knn.score(X_test[:, :2], y_test))

print("Prediction accuracy for", str(knn6), "on the training set: %f" % knn6.score(X_train[:, :2], y_train))
print("Prediction accuracy for", str(knn6), "on the test set: %f" % knn6.score(X_test[:, :2], y_test))

print("Prediction accuracy for", str(knn30), "on the training set: %f" % knn30.score(X_train[:, :2], y_train))
print("Prediction accuracy for", str(knn30), "on the test set: %f" % knn30.score(X_test[:, :2], y_test))


# As we can see in the illustrations above, the classification based on only two features is still far from optimal.
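# To see the errors per class rather than as a single accuracy number, one can also compute precision and recall. This is not part of the original analysis; it is a minimal sketch using `metrics.classification_report`, applied to the 1-nearest-neighbor classifier that was just fitted on the first two features.

# In[ ]:


# per-class precision/recall/F1 for the two-feature 1-NN classifier
y_pred_2d = knn.predict(X_test[:, :2])
print(metrics.classification_report(y_test, y_pred_2d, target_names=iris.target_names))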
# We now retrain the classifiers using **all four features**. The resulting accuracies are:

# In[14]:


knn.fit(X_train, y_train)
print("Prediction accuracy for", str(knn), "on the training set: %f" % knn.score(X_train, y_train))
print("Prediction accuracy for", str(knn), "on the test set: %f" % knn.score(X_test, y_test))

knn6.fit(X_train, y_train)
print("Prediction accuracy for", str(knn6), "on the training set: %f" % knn6.score(X_train, y_train))
print("Prediction accuracy for", str(knn6), "on the test set: %f" % knn6.score(X_test, y_test))

knn30.fit(X_train, y_train)
print("Prediction accuracy for", str(knn30), "on the training set: %f" % knn30.score(X_train, y_train))
print("Prediction accuracy for", str(knn30), "on the test set: %f" % knn30.score(X_test, y_test))


# Now, the prediction is considerably better. We can see where the misclassifications occur by plotting the confusion matrices:

# In[15]:


print(iris['target_names'])
metrics.ConfusionMatrixDisplay.from_estimator(knn, X_test, y_test)
metrics.ConfusionMatrixDisplay.from_estimator(knn6, X_test, y_test)
metrics.ConfusionMatrixDisplay.from_estimator(knn30, X_test, y_test)


# ### Logistic Regression
#
# We repeat the classification with a [logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logisticregression#sklearn.linear_model.LogisticRegression) model. Again, we first illustrate the decision boundary based on the first two features only.

# In[16]:


from sklearn.linear_model import LogisticRegression

# instantiate the model
logreg = LogisticRegression()

# create the classification map for logistic regression
classification_map(X_train[:, :2], y_train, logreg, X_test=X_test[:, :2], y_test=y_test)


# We see that, unlike for k-NN, the decision boundaries are linear.
#
# Now we fit the model on all four features:

# In[17]:


# fit the model on all features
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)

print("Prediction accuracy for", str(logreg), "on the training set: %f" % logreg.score(X_train, y_train))
print("Prediction accuracy for", str(logreg), "on the test set: %f" % logreg.score(X_test, y_test))


# In[18]:


metrics.ConfusionMatrixDisplay.from_estimator(logreg, X_test, y_test)
print(iris['target_names'])


# We observe that, for logistic regression (with default parameters), the accuracy is slightly better than for the k-nearest neighbor classifiers.

# In[ ]:
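# As a closing aside that goes beyond the original notebook: a single 50-50 split gives a somewhat noisy accuracy estimate on a data set of this size. A minimal sketch of a more robust comparison, assuming 5-fold cross-validation on the full data with `cross_val_score` (the increased `max_iter` for logistic regression is only there to avoid convergence warnings):

# In[ ]:


from sklearn.model_selection import cross_val_score

for name, clf in [("1-NN", KNeighborsClassifier(n_neighbors=1)),
                  ("6-NN", KNeighborsClassifier(n_neighbors=6)),
                  ("logistic regression", LogisticRegression(max_iter=1000))]:
    scores = cross_val_score(clf, X, y, cv=5)
    print("%s: mean CV accuracy %.3f (+/- %.3f)" % (name, scores.mean(), scores.std()))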