#!/usr/bin/env python
# coding: utf-8

# # Predictions from Text Data: Natural Language Processing
#
# ## Classification of e-mails from the [Newsgroups dataset](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset)
#
# We now consider a text-based data set consisting of e-mails. The e-mails have been classified into 20 categories.
#
# The task is to predict the categories of unseen e-mails based on the knowledge of a set of already classified e-mails.

# ### Understanding the data set
#
# We first load the data set and inspect its description. In this case, a separation into training and test data is already provided (selected via the argument 'subset').

# In[ ]:


from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

train = fetch_20newsgroups(subset="train")
print(train.DESCR)


# To see what the e-mails look like, we display a few of them together with the corresponding category:

# In[ ]:


print(train.data[0])
print(train.target[0])
print("Category: ", train.target_names[train.target[0]], "\n")
print(train.data[10])
print(train.target[10])
print("Category: ", train.target_names[train.target[10]], "\n")
print(train.data[50])
print(train.target[50])
print("Category: ", train.target_names[train.target[50]])


# The available categories are the following:

# In[ ]:


train.target_names


# We can see above that the data contains full e-mails, including header and (sometimes) footer information. Since we want to assess the behavior of methods based on the text only, avoiding the additional information contained in this metadata, we reimport the data using the corresponding option (remove=('headers', 'footers')).
#
# Furthermore, we select just a subset of the e-mails in 12 categories (instead of all 20) in order to speed up computations.

# In[ ]:


train_allcat = train
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space',
              'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'talk.politics.misc', 'sci.med', 'rec.autos', 'sci.electronics', 'rec.motorcycles']

train = fetch_20newsgroups(subset="train", remove=('headers', 'footers'), categories=categories)
test = fetch_20newsgroups(subset="test", remove=('headers', 'footers'), categories=categories)


# We proceed to obtain some understanding of the data set: We extract the frequency of each category, first for the training set, then for the test set. We also determine the memory size of the data we process.

# In[ ]:


classes, frequency_train = np.unique(train.target, return_counts=True)
print(classes)
print("Class frequencies in training set: ", frequency_train)
_, frequency_test = np.unique(test.target, return_counts=True)
print("Class frequencies in test set: ", frequency_test)

def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

print("Size of text in training set (all categories, w/ headers & footers): %0.3f MB" % size_mb(train_allcat.data))
print("Size of text in training set: %0.3f MB" % size_mb(train.data))
print("Size of text in test set: %0.3f MB" % size_mb(test.data))


# Apparently, the number of samples is fairly balanced across the categories, with only the last category containing a considerably smaller number of samples.
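# As a quick visual check of this balance, we can plot the class frequencies of the training set as a bar chart (a minimal sketch; matplotlib is imported here and is also used again later in this notebook):

# In[ ]:


# Sketch: bar chart of the training-set class frequencies computed above
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 4))
plt.bar(range(len(frequency_train)), frequency_train)
plt.xticks(range(len(frequency_train)), train.target_names, rotation=90)
plt.ylabel('Number of training samples')
plt.title('Class frequencies in the training set')
plt.tight_layout()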
#
# The order of the categories is as follows:

# In[ ]:


train.target_names


# So the categories with the smallest numbers of samples in the training set (among the considered ones) are:

# In[ ]:


train.target_names[0], train.target_names[10], train.target_names[11]


# ### Feature Extraction
#
# #### Count Vectorizer
#
# We now extract **features** which we can use for learning algorithms. We choose the [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=countvectorizer#sklearn.feature_extraction.text.CountVectorizer) first, using a bound of $2^{14}$ features - this is done to make the computations below faster. Ideally, one would increase this bound.

# In[ ]:


from time import time
from sklearn.feature_extraction.text import TfidfVectorizer

print("Extracting features from the training data using a count vectorizer")
t0 = time()
countvec = CountVectorizer(stop_words='english', max_features=2**14)
X_train = countvec.fit_transform(train.data)
X_test = countvec.transform(test.data)  # extract features from the test data using the same vectorizer
duration = time() - t0  # check the computational effort needed to compute the features
print("done in %fs at %0.3fMB/s" % (duration, size_mb(train.data) / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)


# We create a pandas DataFrame in order to get an impression of the created dictionary and feature vectors:

# In[ ]:


import pandas as pd

X_train_countvec_df = pd.DataFrame(X_train.todense())
# These are the different "words" in our vocabulary:
X_train_countvec_df.columns = sorted(countvec.vocabulary_)
print(X_train_countvec_df.columns)


# This shows what the rows of our feature matrix look like:

# In[ ]:


X_train_countvec_df


# #### TF-IDF Vectorizer
#
# We repeat the feature extraction step using the [term frequency-inverse document frequency (TF-IDF)](https://en.wikipedia.org/wiki/Tf–idf) embedding of the documents.

# In[ ]:


print("Extracting features from the training data using a TF-IDF vectorizer")
t0 = time()
vectorizer_tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=2**14)
X_train_tfidf = vectorizer_tfidf.fit_transform(train.data)
X_test_tfidf = vectorizer_tfidf.transform(test.data)  # extract features from the test data using the same vectorizer
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, size_mb(train.data) / duration))
print("n_features: %d" % X_train_tfidf.shape[1])
print()
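# To build some intuition for the TF-IDF weighting, here is a minimal sketch on a small toy corpus (the three example sentences are made up for illustration and use a vectorizer with default settings, unlike the one above): a word that occurs in every document, such as 'the', receives a comparatively low weight, while words that are specific to a single document are weighted more heavily.

# In[ ]:


# Sketch: TF-IDF weights on a hypothetical toy corpus (not taken from the newsgroups data)
toy_corpus = ["the space shuttle launch",
              "the space station orbit",
              "the motorcycle engine"]
toy_vectorizer = TfidfVectorizer()  # default settings for illustration
toy_tfidf = toy_vectorizer.fit_transform(toy_corpus)
# sorted(vocabulary_) gives the terms in column order, as used above
pd.DataFrame(toy_tfidf.todense(), columns=sorted(toy_vectorizer.vocabulary_))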
# ## Applying the learning algorithms
#
# ### Logistic Regression
#
# Based on both sets of extracted features, we now apply logistic regression to build a generalized linear model. Please note the [algorithmic options of logistic regression in scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic%20regression#sklearn.linear_model.LogisticRegression): The choice of the 'solver' becomes relevant here, as the dataset is not that small. We also note that the default choice in the method is **with $\ell_2$-regularization** (with regularization parameter $C=1$). See also [these instructions](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression) for some comments on the different options.
#
# We first use the count vectorizer encoding.

# In[ ]:


y_train = train["target"]
y_test = test['target']

from sklearn.linear_model import LogisticRegression

t0 = time()
lr = LogisticRegression(solver='lbfgs', max_iter=150, multi_class='multinomial').fit(X_train, train.target)
print("Runtime of training of "+str(lr)+" with count vectorizer encoding: ", format(time()-t0, "0.3f"), "s")
t0 = time()
print("Mean accuracy of model "+str(lr)+" on training data with count vectorizer encoding: ", lr.score(X_train, y_train))
print("Mean accuracy of model "+str(lr)+" on test data with count vectorizer encoding: ", lr.score(X_test, y_test))
print("Runtime of evaluating "+str(lr)+" on training and test data with count vectorizer encoding: ", format(time()-t0, "0.3f"), "s")


# We observe that the training time is reasonable, but no longer negligible. On the other hand, evaluating the model (calculating the accuracy) is still very quick.
#
# Now, we repeat this with the TF-IDF features.

# In[ ]:


t0 = time()
lr2 = LogisticRegression(solver='lbfgs', max_iter=150, multi_class='multinomial').fit(X_train_tfidf, train.target)
print("Runtime of training of "+str(lr2)+" with TF-IDF encoding: ", format(time()-t0, "0.3f"), "s")
t0 = time()
print("Mean accuracy of model "+str(lr2)+" on training data with TF-IDF encoding: ", lr2.score(X_train_tfidf, y_train))
print("Mean accuracy of model "+str(lr2)+" on test data with TF-IDF encoding: ", lr2.score(X_test_tfidf, y_test))
print("Runtime of evaluating "+str(lr2)+" on training and test data with TF-IDF encoding: ", format(time()-t0, "0.3f"), "s")


# We note that training the logistic regression model takes considerably longer than evaluating it. The test accuracy for the TF-IDF encoding is better than for the count encoding.
# The very high training accuracy suggests that we are in a situation where overfitting occurs.

# We can also test the predictive quality of these models on custom text documents:

# In[ ]:


text_test = ['Bill Gates and Steve Jobs are computer entrepreneurs.', 'Elon Musk wants to fly to Mars.']
X_custom_counts = countvec.transform(text_test)
X_custom_tfidf = vectorizer_tfidf.transform(text_test)
predicted = lr.predict(X_custom_counts)
predicted2 = lr2.predict(X_custom_tfidf)

print("Predictions of model "+str(lr)+" (count vectorizer encoding)")
for doc, category in zip(text_test, predicted):
    print('%r => %s' % (doc, train.target_names[category]))

print("Predictions of model "+str(lr2)+" (TF-IDF encoding)")
for doc, category in zip(text_test, predicted2):
    print('%r => %s' % (doc, train.target_names[category]))
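# To get a rough idea of which words drive these predictions, we can inspect the largest coefficients of the TF-IDF-based model per category. The following is a minimal sketch (it relies on the fitted model lr2 and the vocabulary of vectorizer_tfidf from above; the choice of 8 words per category is arbitrary):

# In[ ]:


# Sketch: print the highest-weighted vocabulary terms for each category
feature_names = np.array(sorted(vectorizer_tfidf.vocabulary_))  # terms in column order
for class_index, class_name in enumerate(train.target_names):
    top_indices = np.argsort(lr2.coef_[class_index])[-8:][::-1]  # 8 largest coefficients
    print("%s: %s" % (class_name, ", ".join(feature_names[top_indices])))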
# As we have been under the impression that overfitting occurs, we now run a cross-validation on $\ell_2$-regularized logistic regression. We use "LogisticRegressionCV" instead of the generic method "GridSearchCV" (applied to a LogisticRegression), as the former uses some tricks to make the computation more efficient.
#
# However, running this cross-validation for different regularization parameters on a 5-fold split will still take a considerable amount of time. In practice, this could be run efficiently using distributed computing, even for larger data sets.

# In[ ]:


from sklearn.linear_model import LogisticRegressionCV

# We focus on the TF-IDF model as it exhibited better performance above.
t0 = time()
lr_optimal = LogisticRegressionCV(Cs=20, cv=5, random_state=10, penalty='l2',
                                  max_iter=150, multi_class='multinomial', solver='lbfgs',
                                  refit=True).fit(X_train_tfidf, y_train)
print("Runtime of cross-validation:", format(time()-t0, "0.3f"), "s")
print("Mean accuracy of model "+str(lr_optimal)+" on training data with TF-IDF encoding: ", lr_optimal.score(X_train_tfidf, y_train))
print("Mean accuracy of model "+str(lr_optimal)+" on test data with TF-IDF encoding: ", lr_optimal.score(X_test_tfidf, y_test))


# We note that the cross-validation was not successful in improving the accuracy on the hold-out test set.
#
# We plot the validation accuracies for the different regularization parameters $C$:

# In[ ]:


import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
plt.plot(lr_optimal.Cs_, lr_optimal.scores_[1][1])  # validation accuracies of one fold across the grid of C values
ax = plt.gca()
ax.set_xscale('log')
ax.set(xlabel='Parameter C', ylabel='Validation accuracy')
ax.set_box_aspect(1)


# We note that the maximal validation accuracy is as follows:

# In[ ]:


np.max(lr_optimal.scores_[1][1])


# **Can you explain why the maximal validation accuracy is considerably larger than the test accuracy?**

# To get a better idea where the misclassifications take place (i.e., in which categories), we plot the [confusion matrix](https://scikit-learn.org/stable/modules/model_evaluation.html#confusion-matrix):

# In[ ]:


from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

y_test_predicted = lr_optimal.predict(X_test_tfidf)
plt.figure(figsize=(30, 18))
ax1 = plt.gca()
# use train.target_names so that the labels match the class index order
plot_confusion_matrix(lr_optimal, X_test_tfidf, y_test, display_labels=train.target_names, ax=ax1)


# ### K-Nearest Neighbors
#
# Instead of logistic regression, we can also use a k-nearest neighbors classifier.

# In[ ]:


from sklearn.neighbors import KNeighborsClassifier

k_parameter = 5
t0 = time()
knn = KNeighborsClassifier(n_neighbors=k_parameter).fit(X_train_tfidf, y_train)
print("Runtime of training of "+str(knn)+":", format(time() - t0, "0.3f"), "s")
t0 = time()
print("Mean accuracy of model "+str(knn)+" on training data with TF-IDF encoding: ", knn.score(X_train_tfidf, y_train))
print("Runtime of evaluation of "+str(knn)+" on training set:", format(time() - t0, "0.3f"), "s")
t0 = time()
print("Mean accuracy of model "+str(knn)+" on test data with TF-IDF encoding: ", knn.score(X_test_tfidf, y_test))
print("Runtime of evaluation of "+str(knn)+" on test set:", format(time() - t0, "0.3f"), "s")


# We observe that the performance of the nearest neighbors classifier is not very good in this setting (considering $5$ neighbors). Do you have any intuition why this is the case?
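# One way to probe this is to compare, for a few test documents, the distance to their nearest training document with the distance to a typical training document (a minimal sketch; the subsample of 50 test documents is an arbitrary choice). In the high-dimensional, sparse TF-IDF space, most documents share only a few terms, so the nearest neighbors are often not dramatically closer than arbitrary documents, which limits how informative the neighborhood is.

# In[ ]:


# Sketch: nearest vs. typical distances for a small subsample of test documents
from sklearn.metrics.pairwise import euclidean_distances

dists = euclidean_distances(X_test_tfidf[:50], X_train_tfidf)  # shape (50, n_train)
print("Average distance to the nearest training document:", dists.min(axis=1).mean())
print("Average distance to the median training document: ", np.median(dists, axis=1).mean())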
# We now run a cross-validation over the number of neighbors considered:

# In[ ]:


from sklearn.model_selection import GridSearchCV

ks = np.arange(1, 41, 2)  # odd values of k between 1 and 39
parameters = {'n_neighbors': ks}

t0 = time()
gridsearch = GridSearchCV(KNeighborsClassifier(), param_grid=parameters, scoring='accuracy', return_train_score=True, cv=5)
gridsearch.fit(X_train_tfidf, y_train)
print("Runtime of running "+str(gridsearch)+" on training set:", format(time() - t0, "0.3f"), "s")


# Plotting the training and validation accuracies, we observe the following:

# In[ ]:


train_accuracies = gridsearch.cv_results_['mean_train_score']
validation_accuracies = gridsearch.cv_results_['mean_test_score']

plt.figure()
plt.plot(ks, train_accuracies)
plt.plot(ks, validation_accuracies)
ax = plt.gca()
ax.set(xlabel='Number of neighbors k', ylabel='accuracy', title='Cross-validation accuracies for k-Nearest Neighbors')
ax.legend(["training data", "validation data"], loc=0)

print("Best parameter k:", str(gridsearch.best_params_['n_neighbors']))
print("Mean accuracy of model "+str(gridsearch.best_estimator_)+" on test data with TF-IDF encoding: ", gridsearch.best_estimator_.score(X_test_tfidf, y_test))


# We see that the best parameter for kNN is obtained for a fairly large number of neighbors.
# If we hadn't run cross-validation, we might have missed that kNN can also perform relatively well for this dataset!
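# For a per-category view beyond the mean accuracy, we can print a classification report for the tuned models on the test set (a minimal sketch, relying on the fitted lr_optimal and gridsearch objects from above):

# In[ ]:


# Sketch: per-category precision, recall, and F1 score on the test set
from sklearn.metrics import classification_report

print("Tuned logistic regression (TF-IDF):")
print(classification_report(y_test, lr_optimal.predict(X_test_tfidf), target_names=train.target_names))

print("Tuned k-nearest neighbors (TF-IDF):")
print(classification_report(y_test, gridsearch.best_estimator_.predict(X_test_tfidf), target_names=train.target_names))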