#!/usr/bin/env python
# coding: utf-8

# # Classification problems
# ## Case study with the [iris dataset](https://archive.ics.uci.edu/ml/datasets/iris) analyzed by R.A. Fisher
#
# We study how to use learning methods to solve _classification_ problems with [scikit-learn](https://scikit-learn.org/stable/), focusing on a classical data set with a 4-dimensional feature space.

# First, we load the dataset.

# In[1]:


from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
X = iris.data
y = iris.target


# ## Understanding the data
#
# Let's understand the dataset a little bit. First, we read the names of the categories ("targets").

# In[2]:


print(iris['target_names'])


# Luckily, there is a detailed description included in the dataset:

# In[3]:


print(iris.DESCR)


# We also find the feature names here, as well as the size of the data:

# In[4]:


print(iris.feature_names)
print(iris.data.shape)  # shape of the data matrix

# have a peek at the feature variables
print(iris.data[:5])

# have a peek at the target variables
print(iris.target[:5])


# We can verify the distribution of the samples over the different categories:

# In[5]:


print("Number of samples in first iris class: %i" % (iris.target == 0).sum())
print("Number of samples in second iris class: %i" % (iris.target == 1).sum())
print("Number of samples in third iris class: %i" % (iris.target == 2).sum())


# The data set is small enough that we can efficiently visualize pairwise scatter plots of the features. We use the labels/categories to color the data points.

# In[6]:


iris.target_names


# In[7]:


import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# a small loop for creating a list with the category names
iris.target_class = []
for k in range(len(iris.target)):
    iris.target_class.append(iris.target_names[iris.target[k]])

# for the plot, first create a DataFrame
df_iris = pd.DataFrame(iris.data, columns=iris['feature_names'])
df_iris = df_iris.assign(iris_class=iris.target_class)

sns.pairplot(df_iris, hue="iris_class", palette="tab10")
fig = plt.gcf()
fig.set_size_inches(20, 20)


# We now split the data set into a training and a test set. We choose a 50-50 split; considering the number of samples, this is a rather small training set (but, on the other hand, this is a rather easy dataset).

# In[8]:


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=0.5)


# ## Classification
#
# ### K-nearest neighbors
#
# The first method we try is the [k-nearest neighbor classifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html?highlight=kneighbors#sklearn.neighbors.KNeighborsClassifier). Its main parameter is the number of neighbors 'n_neighbors' used to determine the class assignment.

# In[9]:


from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

k_parameter = 1
knn = KNeighborsClassifier(n_neighbors=k_parameter)
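# As a small illustrative aside (not part of the original analysis), a fitted k-NN classifier can report which training samples are the nearest neighbors of a given point via its `kneighbors` method. The sketch below uses a separate demo classifier with an arbitrarily chosen `n_neighbors=3` and queries the first test sample.

# In[ ]:


# fit a demo classifier on the full training data and inspect the neighbors of the first test sample
knn_demo = KNeighborsClassifier(n_neighbors=3)
knn_demo.fit(X_train, y_train)

distances, indices = knn_demo.kneighbors(X_test[:1])
print("Distances to the 3 nearest training samples:", distances[0])
print("Classes of these neighbors:", y_train[indices[0]])
print("Predicted class:", knn_demo.predict(X_test[:1])[0])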
# ### Create classification maps
# To get a feeling for what the classifiers do, we create classification maps
# for different algorithmic parameters (the number of neighbors considered by KNeighborsClassifier). Since we will call this piece of code several times, we write a function for it.

# In[10]:


def classification_map(X, y, classifier, i=0, j=1, X_test=None, y_test=None, h=0.1):
    '''
    Fit the classifier on features i and j of X and plot its decision regions.

    h: step size in the mesh
    i: first feature number to be plotted
    j: second feature number to be plotted
    '''
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap

    cmap_light = ListedColormap(['#FFBBBB', '#BBFFBB', '#BBBBFF'])
    cmap_bold = ListedColormap(['#CC0000', '#00AA00', '#0000CC'])

    # points in a mesh of [x_min, x_max] x [y_min, y_max]
    x_min, x_max = X[:, i].min() - 1, X[:, i].max() + 1
    y_min, y_max = X[:, j].min() - 1, X[:, j].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    grid = np.c_[xx.ravel(), yy.ravel()]

    # fit the classifier and predict the class of every grid point
    classifier.fit(X, y)
    Z = classifier.predict(grid)

    # put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(5, 5))
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')

    # plot also the training points
    plt.scatter(X[:, i], X[:, j], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-class classification with classifier " + str(classifier))

    if X_test is not None and y_test is not None:
        plt.scatter(X_test[:, i], X_test[:, j], c=y_test, cmap=cmap_bold, marker="x")
        ax = plt.gca()
        ax.legend(["training data", "test data"], loc=0, fontsize=8)


# The idea is to convey how the classifier works. In order to better visualize what is happening, we only use the **first two** features, ignoring the other two.

# In[11]:


classification_map(X_train[:, :2], y_train, knn, X_test=X_test[:, :2], y_test=y_test)


# In[12]:


k_parameter = 6
knn6 = KNeighborsClassifier(n_neighbors=k_parameter)
classification_map(X_train[:, :2], y_train, knn6, X_test=X_test[:, :2], y_test=y_test)

k_parameter = 30
knn30 = KNeighborsClassifier(n_neighbors=k_parameter)
classification_map(X_train[:, :2], y_train, knn30, X_test=X_test[:, :2], y_test=y_test)


# The different background colors indicate the decision regions. Do you understand why the boundaries look the way they do?

# Let's now see how well we can classify the iris data with these classifiers.

# In[13]:


print("Prediction accuracy for", str(knn), "on the training set: %f" % knn.score(X_train[:, :2], y_train))
print("Prediction accuracy for", str(knn), "on the test set: %f" % knn.score(X_test[:, :2], y_test))

print("Prediction accuracy for", str(knn6), "on the training set: %f" % knn6.score(X_train[:, :2], y_train))
print("Prediction accuracy for", str(knn6), "on the test set: %f" % knn6.score(X_test[:, :2], y_test))

print("Prediction accuracy for", str(knn30), "on the training set: %f" % knn30.score(X_train[:, :2], y_train))
print("Prediction accuracy for", str(knn30), "on the test set: %f" % knn30.score(X_test[:, :2], y_test))


# As we can see in the illustrations above, the classification based on only two features is still far from optimal.
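# To see the errors per class rather than as a single accuracy number, one can also compute precision and recall. This is not part of the original analysis; it is a minimal sketch using `metrics.classification_report`, applied to the 1-nearest-neighbor classifier that was just fitted on the first two features.

# In[ ]:


# per-class precision/recall/F1 for the two-feature 1-NN classifier
y_pred_2d = knn.predict(X_test[:, :2])
print(metrics.classification_report(y_test, y_pred_2d, target_names=iris.target_names))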
# We now retrain the classifiers using **all four features**. The resulting accuracies are:

# In[14]:


knn.fit(X_train, y_train)
print("Prediction accuracy for", str(knn), "on the training set: %f" % knn.score(X_train, y_train))
print("Prediction accuracy for", str(knn), "on the test set: %f" % knn.score(X_test, y_test))

knn6.fit(X_train, y_train)
print("Prediction accuracy for", str(knn6), "on the training set: %f" % knn6.score(X_train, y_train))
print("Prediction accuracy for", str(knn6), "on the test set: %f" % knn6.score(X_test, y_test))

knn30.fit(X_train, y_train)
print("Prediction accuracy for", str(knn30), "on the training set: %f" % knn30.score(X_train, y_train))
print("Prediction accuracy for", str(knn30), "on the test set: %f" % knn30.score(X_test, y_test))


# Now, the prediction is considerably better. We can see where the misclassifications occur by plotting the confusion matrices:

# In[15]:


print(iris['target_names'])
metrics.ConfusionMatrixDisplay.from_estimator(knn, X_test, y_test)
metrics.ConfusionMatrixDisplay.from_estimator(knn6, X_test, y_test)
metrics.ConfusionMatrixDisplay.from_estimator(knn30, X_test, y_test)


# ### Logistic Regression
#
# We repeat the classification with a [logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logisticregression#sklearn.linear_model.LogisticRegression) model. Again, we first illustrate the decision boundary based on the first two features only.

# In[16]:


from sklearn.linear_model import LogisticRegression

# instantiate the model
logreg = LogisticRegression()

# create the classification map for logistic regression
classification_map(X_train[:, :2], y_train, logreg, X_test=X_test[:, :2], y_test=y_test)


# We see that, unlike for k-NN, the decision boundaries are linear.
#
# Now we fit the model on all four features:

# In[17]:


# fit the model on all features
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)

print("Prediction accuracy for", str(logreg), "on the training set: %f" % logreg.score(X_train, y_train))
print("Prediction accuracy for", str(logreg), "on the test set: %f" % logreg.score(X_test, y_test))


# In[18]:


metrics.ConfusionMatrixDisplay.from_estimator(logreg, X_test, y_test)
print(iris['target_names'])


# We observe that, for logistic regression (with default parameters), the accuracy is slightly better than for the k-nearest neighbor classifiers.

# In[ ]:
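# As a closing aside that goes beyond the original notebook: a single 50-50 split gives a somewhat noisy accuracy estimate on a data set of this size. A minimal sketch of a more robust comparison, assuming 5-fold cross-validation on the full data with `cross_val_score` (the increased `max_iter` for logistic regression is only there to avoid convergence warnings):

# In[ ]:


from sklearn.model_selection import cross_val_score

for name, clf in [("1-NN", KNeighborsClassifier(n_neighbors=1)),
                  ("6-NN", KNeighborsClassifier(n_neighbors=6)),
                  ("logistic regression", LogisticRegression(max_iter=1000))]:
    scores = cross_val_score(clf, X, y, cv=5)
    print("%s: mean CV accuracy %.3f (+/- %.3f)" % (name, scores.mean(), scores.std()))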