# -*- coding: utf-8 -*-
"""
Created on Mon Oct 15 19:13:15 2018

@author: Márton
"""

from sklearn.datasets import fetch_20newsgroups;
import sklearn.feature_extraction.text as txt;
import sklearn.metrics.pairwise as pw;
from sklearn import decomposition as decomp;
import matplotlib.pyplot as plt;
import matplotlib.colors as col;
import pandas as pd;
import numpy as np;

categories = [
    'alt.atheism',
    'talk.religion.misc',
];
ds_train = fetch_20newsgroups(subset='train',categories=categories);
ds_test = fetch_20newsgroups(subset='test',categories=categories);
n_train = len(ds_train.data);
n_test = len(ds_test.data);
        
        
vectorizer = txt.CountVectorizer(stop_words='english',max_df=0.8,min_df=0.2); 
DT_train = vectorizer.fit_transform(ds_train.data); 
vocabulary_dict = vectorizer.vocabulary_;
vocabulary_list = vectorizer.get_feature_names_out();  # get_feature_names() on scikit-learn < 1.0
vocabulary = np.asarray(vocabulary_list);  # vocabulary in 1D array
stopwords = vectorizer.stop_words_;
n_words = DT_train.shape[1];
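
# Quick sanity check (illustrative addition, not part of the original pipeline):
# with min_df=0.2 and max_df=0.8 only terms occurring in 20-80% of the training
# documents are kept, so the vocabulary should be quite small.
print('number of kept terms:', n_words);
print('sample terms:', vocabulary_list[:10]);
print('terms pruned by the document-frequency cut-offs:', len(stopwords));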

# document-term matrix in dense form 
doc_term_train = DT_train.toarray();

# stopword in list
stopwords_list = list(stopwords);

# visualisation of keyword frequencies in the training set
keywords_freq = np.sum(doc_term_train,axis=0);
fig = plt.figure(3);
plt.title('Frequency of keywords in the training dataset');
plt.xlabel('Frequency');
plt.ylabel('Words');
plt.barh(vocabulary,keywords_freq, align='center', color='blue');
plt.show(); 

# histogram of total keyword occurrences per document
docs_freq = np.sum(doc_term_train,axis=1);
fig = plt.figure(31);
plt.title('Histogram of keyword occurrences');
plt.xlabel('Number of keyword occurrences in a document');
plt.ylabel('Frequency');
count, bins, ignored  = plt.hist(docs_freq,50,alpha=0.75);
plt.show();

# the k most frequent keywords in each training document
first_k_words = 3;
doc_ind = np.argsort(doc_term_train,axis=1);   # sort keyword indices by count within each document
doc_ind = np.flip(doc_ind,axis=1);             # descending order
doc_freq_words = vocabulary[doc_ind[:,0:first_k_words]];
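
# Illustrative check (added sketch, not in the original script): print the
# top-3 keywords of the first few training documents.
for i in range(3):
    print('document %d: %s' % (i, ', '.join(doc_freq_words[i])));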

# the first k documents with the most occurrences of a chosen keyword
first_k_docs = 20;
word_ind = np.argsort(doc_term_train,axis=0);  # sort document indices by count of each keyword
word_ind = np.flip(word_ind,axis=0);           # descending order
wind = 8;  # index of the keyword to inspect
word_freq_docs = list();
for i in range(first_k_docs):
    word_freq_docs.append(ds_train.data[word_ind[i,wind]]);
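
# Illustrative check (added sketch): the inspected keyword and the beginning of
# the training document that uses it most often.
print('inspected keyword:', vocabulary[wind]);
print(word_freq_docs[0][:200]);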

# transforming the test dataset            
DT_test = vectorizer.transform(ds_test.data);

# document-term matrix in dense form 
doc_term_test = DT_test.toarray();

# cosine similarity between every training and every test document
cos_sim = pw.cosine_similarity(doc_term_train,doc_term_test);
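
# Illustrative use of the similarity matrix (added sketch, not part of the
# original script): cos_sim has shape (n_train, n_test), so the most similar
# training document for each test document is the argmax along axis 0.
most_similar_train = np.argmax(cos_sim, axis=0);
print('most similar training document for the first 10 test documents:',
      most_similar_train[:10]);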

# 2-dimensional latent semantic analysis (truncated SVD) of the training set
svd = decomp.TruncatedSVD(n_components=2);
TD_svd = svd.fit_transform(DT_train);
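
# Illustrative check (added sketch): variance captured by the two SVD components.
print('explained variance ratio:', svd.explained_variance_ratio_);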

colors = ['blue','red'];     

fig = plt.figure(1);
plt.title('Dimension reduction');
plt.xlabel('Dim1');
plt.ylabel('Dim2');
plt.scatter(TD_svd[:,0],TD_svd[:,1],s=50,c=ds_train.target,
            cmap=col.ListedColormap(colors));
plt.show();            

# document-term matrix of the training set in dense form (element-wise copy of the sparse DT_train)
n_term = DT_train.shape[1];
TD_train_dense = np.zeros((n_train,n_term));
for i in range(n_train):
    for j in range(n_term):
        TD_train_dense[i,j] = DT_train[i,j];

fig = plt.figure(2);
x = 1;  # index of the word on the x axis
y = 5;  # index of the word on the y axis
plt.title('Documents in the space of words');
plt.xlabel(vocabulary[x]);
plt.ylabel(vocabulary[y]);
plt.scatter(TD_train_dense[:,x],TD_train_dense[:,y],s=50, c=ds_train.target,
            cmap=col.ListedColormap(colors));
plt.show();       

TD_test = vectorizer.transform(ds_test.data); 
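
# Possible continuation (illustrative sketch, not part of the original script):
# project the test documents into the 2-D SVD space fitted on the training set
# and plot them with the same colouring.
TD_test_svd = svd.transform(TD_test);
fig = plt.figure(4);
plt.title('Test documents in the training SVD space');
plt.xlabel('Dim1');
plt.ylabel('Dim2');
plt.scatter(TD_test_svd[:,0],TD_test_svd[:,1],s=50,c=ds_test.target,
            cmap=col.ListedColormap(colors));
plt.show();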