# -*- coding: utf-8 -*-
"""
Created on Sun Oct 2 22:47:35 2022

@author: Márton Ispány

Multiclass classification by three categories for 20newsgroups
"""

from sklearn.datasets import fetch_20newsgroups;
from sklearn.feature_extraction.text import CountVectorizer;
from sklearn.naive_bayes import MultinomialNB;
from sklearn.neural_network import MLPClassifier;
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, classification_report;
import matplotlib.pyplot as plt;
import numpy as np;

# Newsgroup categories to load.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'soc.religion.christian',
]

# Fetch the training partition first, then the test partition, both
# restricted to the categories listed above.
ds_train, ds_test = (
    fetch_20newsgroups(subset=part, categories=categories)
    for part in ('train', 'test')
)

# Number of documents in each partition.
n_train, n_test = len(ds_train.data), len(ds_test.data)
        
        
# Bag-of-words features. English stop words are dropped, as are terms that
# occur in more than 80% of the training documents (max_df) or in fewer than
# 1% of them (min_df).
vectorizer = CountVectorizer(stop_words='english', max_df=0.8, min_df=0.01)
DT_train = vectorizer.fit_transform(ds_train.data)  # sparse document-term matrix

vocabulary_dict = vectorizer.vocabulary_  # mapping: term -> column index
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name only on releases that predate
# it; either way vocabulary_list ends up as a plain Python list.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_list = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary_list = vectorizer.get_feature_names()
vocabulary = np.asarray(vocabulary_list)  # vocabulary in 1D array
stopwords = vectorizer.stop_words_  # terms the fitted vectorizer ignored
n_words = DT_train.shape[1]  # number of columns, i.e. vocabulary size

# document-term matrix in dense form
doc_term_train = DT_train.toarray()

# Multinomial Naive Bayes trained on the sparse document-term matrix;
# `alpha` is the smoothing parameter handed to the estimator.
alpha = 1
clf_MNB = MultinomialNB(alpha=alpha).fit(DT_train, ds_train.target)

# Class predictions and class-membership probabilities on the training set.
category_train_pred_MNB = clf_MNB.predict(DT_train)
prob_train_MNB = clf_MNB.predict_proba(DT_train)

# Training accuracy computed two ways: through the estimator's own scorer and
# by hand as the fraction of predictions that match the true labels.
train_accuracy_MNB = clf_MNB.score(DT_train, ds_train.target)
train_accuracy1_MNB = (category_train_pred_MNB == ds_train.target).mean()

# Assessing MNB on the training set.
# The confusion matrix holds raw counts; pass normalize='all' to
# confusion_matrix to obtain proportions instead.
train_conf_mat = confusion_matrix(ds_train.target, category_train_pred_MNB)
# One 2x2 (one-vs-rest) confusion matrix per class.
train_mcm = multilabel_confusion_matrix(ds_train.target,
                                        category_train_pred_MNB)

# Per-class precision / recall / F1, printed as a text table ...
print(classification_report(ds_train.target, category_train_pred_MNB,
                            target_names=ds_train.target_names))

# ... and kept as a nested dict for programmatic access.
cf_report = classification_report(ds_train.target, category_train_pred_MNB,
                                  target_names=ds_train.target_names,
                                  output_dict=True)

# Map the test documents onto the vocabulary learned from the training set.
DT_test = vectorizer.transform(ds_test.data)

# Dense copy of the test document-term matrix.
doc_term_test = DT_test.toarray()

# MNB class predictions and class probabilities for the test documents.
ds_test_pred = clf_MNB.predict(DT_test)
test_proba = clf_MNB.predict_proba(DT_test)

# Fraction of test documents whose predicted label matches the true one.
test_accuracy = (ds_test_pred == ds_test.target).mean()

print(
    classification_report(
        ds_test.target,
        ds_test_pred,
        target_names=ds_test.target_names,
    )
)

# Confusion matrix of raw counts on the test set.
test_conf_mat = confusion_matrix(ds_test.target, ds_test_pred)

import sklearn.neighbors as ng

# k-nearest-neighbours classifier fitted on the same document-term features.
n_ng = 10  # number of neighbours consulted for each prediction
clf_KNN = ng.KNeighborsClassifier(n_neighbors=n_ng)
clf_KNN.fit(DT_train, ds_train.target)
ds_train_pred = clf_KNN.predict(DT_train)

# Keep the KNN result under a model-specific name: the generic
# `train_accuracy` is reassigned by the SGD section that follows, which would
# otherwise discard this value before it is ever used.
train_accuracy_KNN = np.mean(ds_train_pred == ds_train.target)
train_accuracy = train_accuracy_KNN


from sklearn.linear_model import SGDClassifier

# Linear classifier trained by stochastic gradient descent using hinge loss
# with an L2 penalty. The seed is fixed, and training runs for exactly five
# passes because the tolerance-based stopping rule is disabled (tol=None).
clf_SGD = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
).fit(DT_train, ds_train.target)

# Training-set predictions, accuracy and per-class report.
ds_train_pred = clf_SGD.predict(DT_train)
train_accuracy = (ds_train_pred == ds_train.target).mean()
print(
    classification_report(
        ds_train.target,
        ds_train_pred,
        target_names=ds_train.target_names,
    )
)

# Test-set predictions, accuracy and per-class report.
ds_test_pred = clf_SGD.predict(DT_test)
test_accuracy = (ds_test_pred == ds_test.target).mean()
print(
    classification_report(
        ds_test.target,
        ds_test_pred,
        target_names=ds_test.target_names,
    )
)

# Confusion matrix of raw counts for the SGD model on the test set.
test_conf_mat = confusion_matrix(ds_test.target, ds_test_pred)

# Small multilayer perceptron with a single hidden layer of two units.
# hidden_layer_sizes is spelled as a one-element tuple: `(2)` is just the
# integer 2, whereas `(2,)` states the one-layer architecture explicitly.
# No random_state is passed, so the fitted weights (and hence the scores
# below) can differ from run to run.
neural = MLPClassifier(hidden_layer_sizes=(2,), max_iter=400)
neural.fit(DT_train, ds_train.target)
train_accuracy_neural = neural.score(DT_train, ds_train.target)

# DT_test was already produced by the fitted vectorizer earlier in the
# script, so it is reused here instead of transforming the test set again.
test_accuracy_neural = neural.score(DT_test, ds_test.target)