# -*- coding: utf-8 -*-
"""
Created on Mon Nov  5 22:47:35 2018

@author: Márton Ispány

Multiclass classification over three religion-related categories of 20newsgroups
"""

from sklearn.datasets import fetch_20newsgroups;
import sklearn.feature_extraction.text as txt;
import sklearn.neighbors as nb;
from sklearn import metrics;
import matplotlib.pyplot as plt;
import numpy as np;

# Three religion-related newsgroups; the task is a 3-way (multiclass) problem.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'soc.religion.christian',
]

# Fetch the predefined train/test splits restricted to these categories.
ds_train = fetch_20newsgroups(subset='train', categories=categories)
ds_test = fetch_20newsgroups(subset='test', categories=categories)
n_train = len(ds_train.data)
n_test = len(ds_test.data)

# Bag-of-words model: drop English stop words plus terms appearing in more
# than 80% (max_df) or fewer than 10% (min_df) of the training documents.
vectorizer = txt.CountVectorizer(stop_words='english', max_df=0.8, min_df=0.1)
DT_train = vectorizer.fit_transform(ds_train.data)
vocabulary_dict = vectorizer.vocabulary_
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# use the replacement API but fall back for older scikit-learn versions.
try:
    vocabulary_list = list(vectorizer.get_feature_names_out())
except AttributeError:
    vocabulary_list = vectorizer.get_feature_names()
vocabulary = np.asarray(vocabulary_list)  # vocabulary in 1D array
stopwords = vectorizer.stop_words_  # terms discarded by stop list / df limits
n_words = DT_train.shape[1]

# Materialize the inferred stop-word set as a list for inspection.
stopwords_list = list(stopwords)

# Project the test documents onto the vocabulary learned from the training
# set (transform only — no refitting on test data).
DT_test = vectorizer.transform(ds_test.data)
# NOTE: DT_train / DT_test are sparse; call .todense().getA() if a dense
# document-term matrix is ever needed.

# Diagnostic sweep: train/test accuracy of k-nearest neighbours for k = 2..51.
train_list = []
test_list = []

for nnb in range(2, 52):
    clf_knn = nb.KNeighborsClassifier(n_neighbors=nnb)
    clf_knn.fit(DT_train, ds_train.target)
    # Keep the predictions around; the training-set ones feed the
    # confusion matrix computed after the loop.
    ds_train_pred = clf_knn.predict(DT_train)
    train_accuracy = clf_knn.score(DT_train, ds_train.target)
    ds_test_pred = clf_knn.predict(DT_test)
    test_accuracy = clf_knn.score(DT_test, ds_test.target)
    train_list.append(train_accuracy)
    test_list.append(test_accuracy)
                                    
# Accuracy-vs-k diagnostic plot for the KNN sweep above.
fig = plt.figure(1)
plt.title('Diagnostic for KNN method')
plt.xlabel('Number of neighbours')
plt.ylabel('Score')
# Plot against the actual neighbour counts (2..51); plotting the bare lists
# would put list indices 0..49 on the x-axis, off by two from the label.
neighbours = range(2, 2 + len(train_list))
plt.plot(neighbours, train_list, c='blue', label='train')
plt.plot(neighbours, test_list, c='red', label='test')
plt.legend()
plt.show()

# Confusion matrix on the training set for the last fitted KNN model.
train_conf_mat = metrics.confusion_matrix(ds_train.target, ds_train_pred)

# Same diagnostic for radius-based neighbours, radius = 200, 300, ..., 5100.
train_list1 = []
test_list1 = []

for nnb in range(200, 5101, 100):
    # NOTE: name clf_knn is reused deliberately — later code calls
    # predict_proba on whichever model this loop fitted last.
    clf_knn = nb.RadiusNeighborsClassifier(radius=nnb)
    clf_knn.fit(DT_train, ds_train.target)
    ds_train_pred = clf_knn.predict(DT_train)
    train_accuracy = clf_knn.score(DT_train, ds_train.target)
    ds_test_pred = clf_knn.predict(DT_test)
    test_accuracy = clf_knn.score(DT_test, ds_test.target)
    train_list1.append(train_accuracy)
    test_list1.append(test_accuracy)

                                    


# Per-class precision/recall/F1 on the test set for the last fitted
# radius-neighbours model.
print(metrics.classification_report(ds_test.target, ds_test_pred,
                                    target_names=ds_test.target_names))

# Test-set confusion matrix for the same model.
test_conf_mat = metrics.confusion_matrix(ds_test.target, ds_test_pred)

# Class-membership probability estimates for the test documents.
test_proba = clf_knn.predict_proba(DT_test)

from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss, L2 penalty) trained by stochastic gradient descent.
# Fixed random_state keeps the run reproducible; max_iter=5 with tol=None
# reproduces the short, fixed-epoch training of the original tutorial setup.
clf_SGD = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                        random_state=42, max_iter=5, tol=None)
clf_SGD.fit(DT_train, ds_train.target)

# Resubstitution (training-set) performance.
ds_train_pred = clf_SGD.predict(DT_train)
train_accuracy = np.mean(ds_train_pred == ds_train.target)
print(metrics.classification_report(ds_train.target, ds_train_pred,
                                    target_names=ds_train.target_names))

# Held-out (test-set) performance.
ds_test_pred = clf_SGD.predict(DT_test)
test_accuracy = np.mean(ds_test_pred == ds_test.target)
print(metrics.classification_report(ds_test.target, ds_test_pred,
                                    target_names=ds_test.target_names))

test_conf_mat = metrics.confusion_matrix(ds_test.target, ds_test_pred)