# -*- coding: utf-8 -*-
"""
Created on Mon Apr 06 09:36:05 2020

Fitting classifiers for Spambase dataset
Classifiers: logistic regression, naive Bayes, nearest neighbor, neural network
Original data source: https://archive.ics.uci.edu/ml/datasets/spambase

@author: Márton Ispány
"""

import numpy as np;  # importing numerical computing package
from urllib.request import urlopen;  # importing url handling
from sklearn import model_selection as ms; # importing model selection tools
from sklearn import linear_model as lm; #  importing linear models
from sklearn import naive_bayes as nb; #  importing naive Bayes classifiers
from sklearn import neighbors;    # importing nearest neighbor methods
from sklearn import neural_network as nn; # importing neural network models
from matplotlib import pyplot as plt;  # importing MATLAB-like plotting framework
import matplotlib.colors as col;  # importing coloring tools from MatPlotLib

# Reading the dataset
url = 'https://arato.inf.unideb.hu/ispany.marton/DataMining/Practice/Datasets/spamdata.csv';
raw_data = urlopen(url);
data = np.loadtxt(raw_data, skiprows=1, delimiter=";");  # reading numerical data from csv file
del raw_data;

# Reading attribute names 
url_names = 'https://arato.inf.unideb.hu/ispany.marton/DataMining/Practice/Datasets/spambase.names.txt	';
raw_names = urlopen(url_names);
attribute_names = [];   #  list for names
for line in raw_names:
    name = line.decode('utf-8');  # transforming bytes to string
    name = name[0:name.index(':')]; # extracting attribute name from string
    attribute_names.append(name);  # append the name to a list
del raw_names;

# Defining input and target variables
X = data[:,0:56];
y = data[:,57];
del data;
input_names = attribute_names[0:56];
target_names = ['not spam','spam'];


# Partitioning into training and test sets
X_train, X_test, y_train, y_test = ms.train_test_split(X,y, test_size=0.3, 
                                shuffle = True, random_state=2020);

# Fitting logistic regression
logreg_classifier = lm.LogisticRegression();
logreg_classifier.fit(X_train,y_train);
score_logreg = logreg_classifier.score(X_test,y_test);  #  goodness of fit
ypred_logreg = logreg_classifier.predict(X_test);   # spam prediction
yprobab_logreg = logreg_classifier.predict_proba(X_test);  #  prediction probabilities

# Fitting naive Bayes classifier
naive_bayes_classifier = nb.GaussianNB();
naive_bayes_classifier.fit(X_train,y_train);
score_naive_bayes = naive_bayes_classifier.score(X_test,y_test);  #  goodness of fit
ypred_naive_bayes = naive_bayes_classifier.predict(X_test);  # spam prediction
yprobab_naive_bayes = naive_bayes_classifier.predict_proba(X_test);  #  prediction probabilities

# Fitting nearest neighbor classifier
K = 5;  # number of neighbors
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=K);
knn_classifier.fit(X_train,y_train);
score_knn = knn_classifier.score(X_test,y_test);  #  goodness of fit
ypred_knn = knn_classifier.predict(X_test);   # spam prediction
yprobab_knn = knn_classifier.predict_proba(X_test);  #  prediction probabilities

# Fitting neural network classifier
neural_classifier = nn.MLPClassifier(hidden_layer_sizes=(5));  #  number of hidden neurons: 5
neural_classifier.fit(X_train,y_train);
score_neural = neural_classifier.score(X_test,y_test);  #  goodness of fit
ypred_neural = neural_classifier.predict(X_test);   # spam prediction
yprobab_neural = neural_classifier.predict_proba(X_test);  #  prediction probabilities

#  The best model based on test score is MLP (Multilayer perceptron)
#  with 92.9%

# Visualization of spam prediction and probabilities using the best model (MLP)
# Color denotes the class, size denotes the probability
x_axis = 5;  # x axis attribute (0..55)
y_axis = 22;  # y axis attribute (0..55)
colors = ['blue','red'];  #  colors of spam and not spam
fig = plt.figure(1);
plt.title('Scatterplot for training digits dataset');
plt.xlabel(input_names[x_axis]);
plt.ylabel(input_names[y_axis]);
plt.scatter(X_test[:,x_axis],X_test[:,y_axis],s=100*yprobab_neural[0,:],
            c=ypred_neural,cmap=col.ListedColormap(colors));
plt.show();
