# -*- coding: utf-8 -*-
"""
Created on Sun Apr 19 09:55:30 2020

Task: Assessing classifiers fitted to the Spambase dataset
Binary (binomial) classification problem
Classifiers: logistic regression, stochastic gradient descent (SGD), naive Bayes
Results: confusion matrix, ROC curve, AUC value
Original data source: https://archive.ics.uci.edu/ml/datasets/spambase

Python tools    
Libraries: numpy, matplotlib, urllib, sklearn
Modules: pyplot, request, linear_model, naive_bayes, model_selection, metrics
Classes: LogisticRegression, GaussianNB
Functions: urlopen, train_test_split, confusion_matrix, roc_curve, auc

@author: Márton Ispány
"""

from urllib.request import urlopen  # importing url handling

import numpy as np  # importing numerical computing package
from matplotlib import pyplot as plt  # importing MATLAB-like plotting framework
from sklearn.linear_model import LogisticRegression  # importing logistic regression classifier
from sklearn.linear_model import SGDClassifier  # importing stochastic gradient descent classifier
from sklearn.naive_bayes import GaussianNB  # importing naive Bayes classifier
from sklearn.model_selection import train_test_split  # importing splitting
from sklearn.metrics import confusion_matrix, roc_curve, auc  # importing performance metrics
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay  # importing performance graphs

# plot_confusion_matrix and plot_roc_curve were removed in scikit-learn 1.2;
# keep them importable on older releases without breaking the script on newer ones.
try:
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
except ImportError:
    pass
    
# Reading the dataset
url = 'https://arato.inf.unideb.hu/ispany.marton/DataMining/Practice/Datasets/spamdata.csv'
# The response is used as a context manager so the HTTP connection is
# closed as soon as the table has been parsed (or if parsing fails).
with urlopen(url) as raw_data:
    # reading numerical data from csv file: first row skipped, ';' separated
    data = np.loadtxt(raw_data, skiprows=1, delimiter=";")
del raw_data

# Reading attribute names
# (the URL literal previously ended with a stray tab character)
url_names = 'https://arato.inf.unideb.hu/ispany.marton/DataMining/Practice/Datasets/spambase.names.txt'
# The response is closed as soon as all lines have been read and decoded.
with urlopen(url_names) as raw_names:
    decoded_lines = [line.decode('utf-8') for line in raw_names]  # transforming bytes to string
# The attribute name is the text before the first colon; lines without a
# colon (e.g. blank lines) carry no attribute name and are skipped instead
# of raising ValueError.
attribute_names = [text.partition(':')[0] for text in decoded_lines if ':' in text]
del raw_names, decoded_lines

# Defining input and target variables:
# columns 0..56 of the table are the inputs, column 57 is the target
X, y = data[:, :57], data[:, 57]
del data
input_names = attribute_names[:57]  # names belonging to the input columns
target_names = ['not spam', 'spam']

# Partitioning into training and test sets
# (30% of the records go to the test set; the shuffle is seeded)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2021,
)

# Fitting logistic regression
logreg_classifier = LogisticRegression(solver='liblinear')
logreg_classifier.fit(X_train, y_train)

# Performance on the training set
accuracy_logreg_train = logreg_classifier.score(X_train, y_train)
cm_logreg_train = confusion_matrix(y_train, logreg_classifier.predict(X_train))  # train confusion matrix

# Performance on the test set; ypred_logreg ends up holding the test predictions
ypred_logreg = logreg_classifier.predict(X_test)  # spam prediction for test
yprobab_logreg = logreg_classifier.predict_proba(X_test)  # prediction probabilities
accuracy_logreg_test = logreg_classifier.score(X_test, y_test)
cm_logreg_test = confusion_matrix(y_test, ypred_logreg)  # test confusion matrix

# Fitting logistic regression with penalty (L1, C=0.05)
# NOTE: every name assigned here was already assigned by the plain logistic
# regression fit, so from this point on they refer to the penalized model.
logreg_classifier = LogisticRegression(penalty='l1', C=0.05, solver='liblinear')
logreg_classifier.fit(X_train, y_train)

# Performance on the training set
accuracy_logreg_train = logreg_classifier.score(X_train, y_train)
cm_logreg_train = confusion_matrix(y_train, logreg_classifier.predict(X_train))  # train confusion matrix

# Performance on the test set; ypred_logreg ends up holding the test predictions
ypred_logreg = logreg_classifier.predict(X_test)  # spam prediction for test
yprobab_logreg = logreg_classifier.predict_proba(X_test)  # prediction probabilities
accuracy_logreg_test = logreg_classifier.score(X_test, y_test)
cm_logreg_test = confusion_matrix(y_test, ypred_logreg)  # test confusion matrix

# Plotting non-normalized confusion matrix
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.

# training set
ConfusionMatrixDisplay.from_estimator(logreg_classifier, X_train, y_train,
                                      display_labels=target_names)

# test set
ConfusionMatrixDisplay.from_estimator(logreg_classifier, X_test, y_test,
                                      display_labels=target_names)

# Fitting SGDClassifier
# 'log_loss' is the current name of the logistic loss; the former spelling
# 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
SGD_classifier = SGDClassifier(loss='log_loss', penalty='l1', alpha=0.01)
SGD_classifier.fit(X_train, y_train)
ypred_SGD = SGD_classifier.predict(X_train)  # spam prediction for train
accuracy_SGD_train = SGD_classifier.score(X_train, y_train)
cm_SGD_train = confusion_matrix(y_train, ypred_SGD)  # train confusion matrix
ypred_SGD = SGD_classifier.predict(X_test)  # spam prediction for test
cm_SGD_test = confusion_matrix(y_test, ypred_SGD)  # test confusion matrix
yprobab_SGD = SGD_classifier.predict_proba(X_test)  # prediction probabilities
accuracy_SGD_test = SGD_classifier.score(X_test, y_test)

# Fitting naive Bayes classifier
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, y_train)
ypred_naive_bayes = naive_bayes_classifier.predict(X_train)  # spam prediction for train
accuracy_NB_train = naive_bayes_classifier.score(X_train, y_train)
cm_naive_bayes_train = confusion_matrix(y_train, ypred_naive_bayes)  # train confusion matrix
ypred_naive_bayes = naive_bayes_classifier.predict(X_test)  # spam prediction for test
cm_naive_bayes_test = confusion_matrix(y_test, ypred_naive_bayes)  # test confusion matrix
yprobab_naive_bayes = naive_bayes_classifier.predict_proba(X_test)  # prediction probabilities
# Test accuracy, computed for the other classifiers but previously missing here
accuracy_NB_test = naive_bayes_classifier.score(X_test, y_test)

# Plotting non-normalized confusion matrix
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.

# training set
ConfusionMatrixDisplay.from_estimator(naive_bayes_classifier, X_train, y_train,
                                      display_labels=target_names)

# test set
ConfusionMatrixDisplay.from_estimator(naive_bayes_classifier, X_test, y_test,
                                      display_labels=target_names)

# Plotting ROC curve
# RocCurveDisplay.from_estimator replaces plot_roc_curve, which was removed
# in scikit-learn 1.2. One figure per classifier, both on the test set.
RocCurveDisplay.from_estimator(logreg_classifier, X_test, y_test)
RocCurveDisplay.from_estimator(naive_bayes_classifier, X_test, y_test)

# ROC points and AUC computed by hand from the second probability column
# (index 1) of each classifier's predict_proba output
fpr_logreg, tpr_logreg, _ = roc_curve(y_test, yprobab_logreg[:, 1])
roc_auc_logreg = auc(fpr_logreg, tpr_logreg)

fpr_naive_bayes, tpr_naive_bayes, _ = roc_curve(y_test, yprobab_naive_bayes[:, 1])
roc_auc_naive_bayes = auc(fpr_naive_bayes, tpr_naive_bayes)

# Both curves in one figure, with the diagonal as a dashed reference line
plt.figure(7)
lw = 2
plt.plot(fpr_logreg, tpr_logreg, color='red',
         lw=lw, label=f'Logistic regression (AUC = {roc_auc_logreg:0.2f})')
plt.plot(fpr_naive_bayes, tpr_naive_bayes, color='blue',
         lw=lw, label=f'Naive Bayes (AUC = {roc_auc_naive_bayes:0.2f})')
plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()

# Another method for visualizing confusion matrix:
# display the matrices computed earlier, one figure each.
# The titles used to exist only as comments, which left the four figures
# indistinguishable; they are now set on the axes of each display.
titled_matrices = [
    (cm_logreg_train, 'Confusion matrix for training dataset (logistic regression)'),
    (cm_logreg_test, 'Confusion matrix for test dataset (logistic regression)'),
    (cm_naive_bayes_train, 'Confusion matrix for training dataset (naive Bayes)'),
    (cm_naive_bayes_test, 'Confusion matrix for test dataset (naive Bayes)'),
]
for matrix, matrix_title in titled_matrices:
    # plot() returns the display object, whose ax_ attribute is the drawn axes
    cm_display = ConfusionMatrixDisplay(matrix, display_labels=target_names).plot()
    cm_display.ax_.set_title(matrix_title)
    plt.show()