# -*- coding: utf-8 -*-
"""
Created on Mon Nov  4 16:58:43 2019

@author: Márton
"""

import numpy as np;
from sklearn.datasets import fetch_20newsgroups;  # importing the dataset
from sklearn.feature_extraction.text import CountVectorizer;  # importing text preprocessing
from sklearn.metrics.pairwise import cosine_similarity;  # importing pairwise similarity metrics
from sklearn.metrics import confusion_matrix;

# Fetch the 20 Newsgroups corpus in its canonical train/test split.
# A fixed random_state with shuffle=True makes the record order reproducible.
ds_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=2018)
ds_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=2018)

n_train = len(ds_train.data)    # number of training documents
n_test = len(ds_test.data)      # number of test documents
k = len(ds_train.target_names)  # number of newsgroup classes

# Bag-of-words document-term matrix over the training corpus: English stop
# words are dropped and a term must occur in at least 1% of documents
# (min_df as a proportion) to enter the vocabulary.
min_pr = 0.01
count = CountVectorizer(stop_words='english', min_df=min_pr)
DT_train = count.fit_transform(ds_train.data)  # sparse, shape (n_train, p)
p = DT_train.shape[1]                          # vocabulary size after pruning

# Dense copy of the document-term matrix for numpy-based processing.
DT_train_dense = DT_train.toarray()

# Per-class centroids: row i is the average term-count vector of all training
# documents labelled with class i. The binary weight vector makes np.average
# select exactly the rows belonging to that class.
class_mean = np.stack([
    np.average(DT_train_dense, axis=0,
               weights=(ds_train.target == cls).astype(int))
    for cls in range(k)
])

# Project the test documents into the vocabulary learned from training data.
DT_test = count.transform(ds_test.data)

# Nearest-centroid classification: each document is assigned to the class
# whose mean term-count vector it is most cosine-similar to.
cos_sim_train = cosine_similarity(DT_train_dense, class_mean)
target_predict = np.argmax(cos_sim_train, axis=1)

# Confusion matrix on the training set (resubstitution performance).
cm_train = confusion_matrix(ds_train.target, target_predict)

# BUG FIX: DT_test was transformed but never evaluated — the original script
# measured only training performance. Score the held-out test set as well
# (cosine_similarity accepts the sparse matrix directly).
cos_sim_test = cosine_similarity(DT_test, class_mean)
target_predict_test = np.argmax(cos_sim_test, axis=1)
cm_test = confusion_matrix(ds_test.target, target_predict_test)