# -*- coding: utf-8 -*-
"""
Created on Mon Nov 12 23:42:22 2018

@author: Márton
"""

from sklearn.datasets import fetch_20newsgroups;
import sklearn.feature_extraction.text as txt;
import sklearn.naive_bayes as nb;
from sklearn import metrics;
import matplotlib.pyplot as plt;
import numpy as np;
import itertools;

# Newsgroups used for the classification task.
categories = [
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]

# Load the train and test splits, restricted to the categories above.
ds_train = fetch_20newsgroups(subset='train', categories=categories)
ds_test = fetch_20newsgroups(subset='test', categories=categories)
n_train = len(ds_train.data)
n_test = len(ds_test.data)

# Vectorization of the docs: bag-of-words counts with English stop words
# removed; min_df is forwarded to CountVectorizer unchanged.
min_df = 0.01
vectorizer = txt.CountVectorizer(stop_words='english', min_df=min_df)
DT_train = vectorizer.fit_transform(ds_train.data)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_list = list(vectorizer.get_feature_names_out())
else:
    vocabulary_list = vectorizer.get_feature_names()
vocabulary = np.asarray(vocabulary_list)  # vocabulary in 1D array

# Fitting the Naive Bayes model
alpha = 1
clf_MNB = nb.MultinomialNB(alpha=alpha)
clf_MNB.fit(DT_train, ds_train.target)
# Predictions on the same documents the model was fitted on.
ds_train_pred = clf_MNB.predict(DT_train)

# Performance metrics on the training set
print(metrics.classification_report(ds_train.target, ds_train_pred,
                                    target_names=ds_train.target_names))

train_conf_mat = metrics.confusion_matrix(ds_train.target, ds_train_pred)

# Visualisation of the confusion matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Cell [i, j] is drawn at row i (labelled
        'True label') and column j (labelled 'Predicted label').
    classes : sequence
        Tick labels applied to both axes; they must be given in the same
        order as the rows/columns of ``cm``.
    normalize : bool, optional
        When True, each row is divided by its row sum before printing and
        plotting. Rows that sum to zero are left as all zeros instead of
        becoming NaN.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap handed to ``plt.imshow``.

    Returns
    -------
    None
        The function only prints and draws; it does not call ``plt.show()``.
    """
    cm = np.asarray(cm)

    if normalize:
        cm = cm.astype(float)
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero: a NaN cell would also
        # turn the colour threshold below into NaN and blacken every label.
        cm = np.divide(cm, row_totals,
                       out=np.zeros_like(cm), where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # The 'd' format code only accepts integers, so use two decimals for any
    # non-integer matrix, not just for the normalized one.
    is_integer = np.issubdtype(cm.dtype, np.integer)
    fmt = 'd' if is_integer and not normalize else '.2f'

    # Cells above half of the maximum get white text so that the number stays
    # readable against the darker end of the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
# Plot non-normalized confusion matrix for the training set.
# The tick labels come from the dataset's target_names, which the integer
# targets index, rather than from the hand-written `categories` list, so the
# axes stay correct even if that list is reordered.
plt.figure(1)
plot_confusion_matrix(train_conf_mat, classes=ds_train.target_names,
                      title='Confusion matrix for training set')

# Transforming the test dataset with the vocabulary fitted on the training
# set (transform, not fit_transform), then predicting its labels.
DT_test = vectorizer.transform(ds_test.data)
ds_test_pred = clf_MNB.predict(DT_test)

print(metrics.classification_report(ds_test.target, ds_test_pred,
                                    target_names=ds_test.target_names))

test_conf_mat = metrics.confusion_matrix(ds_test.target, ds_test_pred)
# Per-class probability estimates for every test document.
test_proba = clf_MNB.predict_proba(DT_test)

# Plot non-normalized confusion matrix for the test set
plt.figure(2)
plot_confusion_matrix(test_conf_mat, classes=ds_test.target_names,
                      title='Confusion matrix for test set')

# Score of the classifier on the test set, as returned by clf_MNB.score.
test_accuracy = clf_MNB.score(DT_test, ds_test.target)