# -*- coding: utf-8 -*-
"""
Created on Mon Nov  5 22:47:35 2018

@author: Márton Ispány

Text classification on three categories of the 20newsgroups dataset
"""

from sklearn.datasets import fetch_20newsgroups;
import sklearn.feature_extraction.text as txt;
import sklearn.naive_bayes as nb;
from sklearn import metrics;
import matplotlib.pyplot as plt;
import numpy as np;

# Newsgroups that take part in the experiment.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'soc.religion.christian',
]

# Training and test subsets restricted to the selected newsgroups.
ds_train = fetch_20newsgroups(subset='train', categories=categories)
ds_test = fetch_20newsgroups(subset='test', categories=categories)

# Number of documents in each subset.
n_train, n_test = len(ds_train.data), len(ds_test.data)
        
        
# Bag-of-words model: English stop words are removed, and terms are filtered
# by document frequency through max_df (upper bound) and min_df (lower bound).
vectorizer = txt.CountVectorizer(stop_words='english', max_df=0.8, min_df=0.0001)
DT_train = vectorizer.fit_transform(ds_train.data)  # sparse document-term matrix
vocabulary_dict = vectorizer.vocabulary_  # mapping: term -> column index

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on old releases, keeping
# vocabulary_list a plain Python list in both cases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_list = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary_list = vectorizer.get_feature_names()
vocabulary = np.asarray(vocabulary_list)  # vocabulary in 1D array
stopwords = vectorizer.stop_words_  # terms ignored by the fitted vectorizer
n_words = DT_train.shape[1]  # number of columns (terms) in the matrix

# document-term matrix in dense form
doc_term_train = DT_train.toarray()

# stopwords as a list
stopwords_list = list(stopwords)

# Multinomial naive Bayes classifier; alpha is its smoothing parameter.
alpha = 1
clf_MNB = nb.MultinomialNB(alpha=alpha)
clf_MNB.fit(DT_train, ds_train.target)

# Predicted labels and class probabilities for the training documents.
category_train_pred_MNB = clf_MNB.predict(DT_train)
prob_train_MNB = clf_MNB.predict_proba(DT_train)

# Training accuracy, once through the estimator's score() and once by hand
# as the fraction of predictions that match the true labels.
train_accuracy_MNB = clf_MNB.score(DT_train, ds_train.target)
train_accuracy1_MNB = np.mean(ds_train.target == category_train_pred_MNB)



# Text report of the per-class metrics on the training set.
print(
    metrics.classification_report(
        ds_train.target,
        category_train_pred_MNB,
        target_names=ds_train.target_names,
    )
)

# The same report as a nested dictionary for programmatic access.
cf_report = metrics.classification_report(
    ds_train.target,
    category_train_pred_MNB,
    target_names=ds_train.target_names,
    output_dict=True,
)

# Confusion matrix normalized over all entries (normalize='all').
train_conf_mat = metrics.confusion_matrix(
    ds_train.target,
    category_train_pred_MNB,
    normalize='all',
)

# Map the test documents onto the vocabulary learned from the training set.
DT_test = vectorizer.transform(ds_test.data)

# Dense copy of the test document-term matrix.
doc_term_test = DT_test.toarray()

# Naive Bayes predictions on the test set and the share of correct labels.
ds_test_pred = clf_MNB.predict(DT_test)
test_accuracy = np.mean(ds_test.target == ds_test_pred)

print(
    metrics.classification_report(
        ds_test.target,
        ds_test_pred,
        target_names=ds_test.target_names,
    )
)

# Confusion matrix normalized over all entries (normalize='all').
test_conf_mat = metrics.confusion_matrix(
    ds_test.target,
    ds_test_pred,
    normalize='all',
)

# Class probabilities assigned to each test document.
test_proba = clf_MNB.predict_proba(DT_test)

import sklearn.neighbors as ng

# k-nearest-neighbours classifier fitted on the training document-term matrix.
n_ng = 10  # number of neighbours
clf_KNN = ng.KNeighborsClassifier(n_neighbors=n_ng)
clf_KNN.fit(DT_train, ds_train.target)

# Keep the KNN results under dedicated names (matching the *_MNB convention
# used for naive Bayes): the generic names below are rebound by the SGD
# section further down, which would otherwise discard these values.
ds_train_pred_KNN = clf_KNN.predict(DT_train)
train_accuracy_KNN = np.mean(ds_train_pred_KNN == ds_train.target)

# Generic aliases retained for backward compatibility.
ds_train_pred = ds_train_pred_KNN
train_accuracy = train_accuracy_KNN


from sklearn.linear_model import SGDClassifier

# Linear model with hinge loss and L2 penalty trained by stochastic gradient
# descent; tol=None together with max_iter=5 fixes the iteration budget.
clf_SGD = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf_SGD.fit(DT_train, ds_train.target)

# Evaluation on the training set (rebinds the generic result names).
ds_train_pred = clf_SGD.predict(DT_train)
train_accuracy = np.mean(ds_train.target == ds_train_pred)
print(
    metrics.classification_report(
        ds_train.target,
        ds_train_pred,
        target_names=ds_train.target_names,
    )
)

# Evaluation on the test set (rebinds the generic result names).
ds_test_pred = clf_SGD.predict(DT_test)
test_accuracy = np.mean(ds_test.target == ds_test_pred)
print(
    metrics.classification_report(
        ds_test.target,
        ds_test_pred,
        target_names=ds_test.target_names,
    )
)

# Confusion matrix of raw counts (no normalization requested here).
test_conf_mat = metrics.confusion_matrix(ds_test.target, ds_test_pred)