# -*- coding: utf-8 -*-
"""
Created on Wed Nov 7 09:11:33 2018

@author: Márton Ispány

Comparing IBM PC and MAC hardware opinion in 20newsgroups
"""

from sklearn.datasets import fetch_20newsgroups;
import sklearn.feature_extraction.text as txt;
import sklearn.naive_bayes as nb;
from sklearn import metrics;
import matplotlib.pyplot as plt;
import numpy as np;
import itertools;

# The two newsgroups whose posts we try to tell apart.
categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']

# Fetch the canonical train/test splits restricted to these categories.
ds_train = fetch_20newsgroups(subset='train', categories=categories)
ds_test = fetch_20newsgroups(subset='test', categories=categories)
n_train = len(ds_train.data)
n_test = len(ds_test.data)

# Pre-allocated result columns for the 25-step min_df sweep below.
n_words = np.zeros((25, 1))
train_accuracy = np.zeros((25, 1))
test_accuracy = np.zeros((25, 1))
alpha = 1  # additive (Laplace) smoothing for multinomial naive Bayes

# Sweep min_df over 0.01..0.25: at each threshold record the surviving
# vocabulary size and the train/test accuracy of a fresh naive Bayes model.
for step in range(25):
    threshold = 0.01 * (step + 1)
    vec = txt.CountVectorizer(stop_words='english', min_df=threshold)
    doc_term_train = vec.fit_transform(ds_train.data)
    n_words[step] = doc_term_train.shape[1]
    model = nb.MultinomialNB(alpha=alpha)
    model.fit(doc_term_train, ds_train.target)
    train_accuracy[step] = model.score(doc_term_train, ds_train.target)
    doc_term_test = vec.transform(ds_test.data)
    test_accuracy[step] = model.score(doc_term_test, ds_test.target)

# Visualize how accuracy evolves as the vocabulary grows with min_df.
fig = plt.figure(1)
plt.plot(n_words, train_accuracy, c='blue', label='training')
plt.plot(n_words, test_accuracy, c='red', label='test')
plt.title('Accuracy plot for Naiv Bayes classifier')
plt.xlabel('Number of words')
plt.ylabel('Accuracy')
plt.legend(loc="lower right")
plt.show()

# Vectorizing using the optimal min_df read off the accuracy plot above.
min_df = 0.13
vectorizer = txt.CountVectorizer(stop_words='english', min_df=min_df)
DT_train = vectorizer.fit_transform(ds_train.data)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement and already returns
# a 1D numpy array of terms.
vocabulary = vectorizer.get_feature_names_out()
vocabulary_list = vocabulary.tolist()  # plain-list view, kept for compatibility

# Fitting the final model on the full training document-term matrix.
clf_MNB = nb.MultinomialNB(alpha=alpha)
clf_MNB.fit(DT_train, ds_train.target)
ds_train_pred = clf_MNB.predict(DT_train)

# Training-set performance metrics (precision/recall/F1 per class).
print(metrics.classification_report(ds_train.target, ds_train_pred,
                                    target_names=ds_train.target_names))

train_conf_mat = metrics.confusion_matrix(ds_train.target, ds_train_pred)

# Visualisation of the confusion matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    Render a confusion matrix as a colored image with per-cell labels.

    When `normalize` is True the counts are first converted to row-wise
    fractions. The matrix (raw or normalized) is also printed to stdout.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)

    # Annotate every cell; flip the text color to white once the cell
    # value passes half of the maximum so it stays readable on dark cells.
    cell_fmt = '.2f' if normalize else 'd'
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, format(cm[row, col], cell_fmt),
                     horizontalalignment="center",
                     color="white" if cm[row, col] > cutoff else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
# Non-normalized confusion matrix for the training split.
plt.figure(2)
plot_confusion_matrix(train_conf_mat, classes=categories,
                      title='Confusion matrix for training set')

# Score the held-out test split with the fitted vectorizer + classifier.
DT_test = vectorizer.transform(ds_test.data)
ds_test_pred = clf_MNB.predict(DT_test)

print(metrics.classification_report(ds_test.target, ds_test_pred,
                                    target_names=ds_test.target_names))

test_conf_mat = metrics.confusion_matrix(ds_test.target, ds_test_pred)
test_proba = clf_MNB.predict_proba(DT_test)

# Non-normalized confusion matrix for the test split.
plt.figure(3)
plot_confusion_matrix(test_conf_mat, classes=categories,
                      title='Confusion matrix for test set')

# Computing false/true positive rate and AUC.
# BUG FIX: roc_curve expects continuous scores, not hard 0/1 predictions —
# feeding predict() output collapses the "curve" to a single operating point
# and distorts the AUC. Use the predicted probability of the positive class
# (column 1 of predict_proba) instead; test_proba was already computed above
# but previously went unused.
train_scores = clf_MNB.predict_proba(DT_train)[:, 1]
test_scores = test_proba[:, 1]
fpr_train, tpr_train, _ = metrics.roc_curve(ds_train.target, train_scores)
roc_auc_train = metrics.auc(fpr_train, tpr_train)
fpr_test, tpr_test, _ = metrics.roc_curve(ds_test.target, test_scores)
roc_auc_test = metrics.auc(fpr_test, tpr_test)

plt.figure(4)
lw = 4
plt.plot(fpr_train, tpr_train, color='blue',
         lw=lw, label='train (AUC = %0.2f)' % roc_auc_train)
plt.plot(fpr_test, tpr_test, color='red',
         lw=lw, label='test (AUC = %0.2f)' % roc_auc_test)
plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom so the TPR = 1 segment stays visible
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()