# -*- coding: utf-8 -*-
"""
Created on Wed Nov 23 09:02:34 2022

@author: Márton
"""

import numpy as np;  # importing numpy for arrays
import matplotlib.pyplot as plt;  # importing pyplot
import matplotlib.colors as col;  # importing coloring
from sklearn.datasets import fetch_20newsgroups;
from sklearn.feature_extraction.text import CountVectorizer;  # importing text preprocessing
from sklearn.decomposition import NMF;

# Load the 20 Newsgroups corpus: the 'train' split for fitting and the 'test'
# split for evaluation. Both are shuffled with the same fixed seed so the
# document order is reproducible between runs.
ds_train, ds_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=2022)
    for split in ('train', 'test')
)
# Corpus sizes and the number of distinct newsgroup labels.
n_train, n_test = len(ds_train.data), len(ds_test.data)
n_class = len(ds_train.target_names)

# Build the document-term count matrix from the training texts.
# English stop words are removed. min_df is passed as a float, which
# CountVectorizer interprets as a proportion of documents, so only terms
# occurring in at least 20% of the training documents are kept.
min_pr = 0.2
vectorizer = CountVectorizer(stop_words='english', min_df=min_pr)
DT_train = vectorizer.fit_transform(ds_train.data)
vocabulary_dict = vectorizer.vocabulary_
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back to the old method only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_list = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary_list = vectorizer.get_feature_names()
vocabulary = np.asarray(vocabulary_list)  # vocabulary in 1D array
stopwords = vectorizer.stop_words_
n_words = DT_train.shape[1]  # number of columns (terms) in the matrix

# Dense copy of the training document-term matrix
# (DT_train.todense().getA() is an equivalent alternative).
doc_term_train = DT_train.toarray()

# Vectorise the test documents with the vocabulary fitted on the training
# data (transform only, no refitting).
DT_test = vectorizer.transform(ds_test.data)

# Non-negative matrix factorisation of the training document-term matrix
# into a two-component space.
n_components = 2
init = 'nndsvda'
nmf = NMF(
    n_components=n_components,
    init=init,
    beta_loss="frobenius",
    l1_ratio=1,
    random_state=1,
)
# NOTE(review): alpha_W / alpha_H (0.00005 was tried earlier) are left at
# their defaults here; presumably l1_ratio then has no effect -- confirm
# against the scikit-learn NMF documentation.

# fit() returns the estimator itself, so the component matrix can be read
# straight off the call. H holds one row per component and one column per
# vocabulary term.
H = nmf.fit(DT_train).components_

# Scatterplot of the vocabulary terms in the reduced NMF space: each term is
# placed at its weight in the first (x) and second (y) component.
fig = plt.figure(1)
plt.title('Terms in reduced space')
plt.xlabel('First component')
plt.ylabel('Second component')
# Draw all points with a single scatter call instead of creating one
# separate artist per term.
plt.scatter(H[0], H[1], color='blue')
# Label every point with its term, shifted by 0.05 data units so the text
# does not sit on top of the marker.
for x, y, term in zip(H[0], H[1], vocabulary_list):
    plt.text(x + .05, y + .05, term, fontsize=9)
plt.show()

# Test dataset: project the unseen documents onto the components learned
# from the training data. transform() does not modify the fitted model, so
# its return value (one row per test document, one column per component)
# must be kept. Previously the result was discarded and the unchanged term
# matrix was plotted again, which made this figure an exact copy of figure 1.
W_test = nmf.transform(DT_test)

# Scatterplot of the test documents in the reduced space, coloured by their
# newsgroup label.
# NOTE(review): the intended content of this figure is inferred; confirm
# that a per-document plot is what was wanted here.
fig = plt.figure(2)
plt.title('Test documents in reduced space')
plt.xlabel('First component')
plt.ylabel('Second component')
plt.scatter(W_test[:, 0], W_test[:, 1], c=ds_test.target, cmap='tab20', s=5)
plt.colorbar(label='Newsgroup index')
plt.show()