# -*- coding: utf-8 -*-
"""
Created on Wed Nov 21 08:39:55 2018

@author: Márton
"""

from sklearn.datasets import fetch_20newsgroups;
import sklearn.feature_extraction.text as txt;  # importing text preprocessing
import sklearn.cluster as cluster;
import sklearn.mixture as mix;
import sklearn.utils.random as rd;
import matplotlib.pyplot as plt;
import numpy as np;

# Load the train and test splits of the 20 Newsgroups corpus, both shuffled
# with the same fixed seed (train is fetched first, then test).
ds_train, ds_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=2018)
    for split in ('train', 'test')
)
# Number of documents in each split and number of newsgroup categories
n_train, n_test = len(ds_train.data), len(ds_test.data)
n_class = len(ds_train.target_names)

# Vectorization of the docs: TF-IDF over English-stop-word-filtered terms.
# min_df / max_df are floats, i.e. document-frequency proportions: only terms
# appearing in at least 5% and at most 10% of the training docs are kept.
min_df = 0.05
max_df = 0.1
vectorizer = txt.TfidfVectorizer(stop_words='english',
                                 min_df=min_df, max_df=max_df)
DT_train = vectorizer.fit_transform(ds_train.data)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old releases that do not provide it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_list = list(vectorizer.get_feature_names_out())
else:
    vocabulary_list = vectorizer.get_feature_names()
vocabulary = np.asarray(vocabulary_list)  # vocabulary in 1D array
n_words = DT_train.shape[1]
# The test docs are projected onto the vocabulary learned from the train docs
DT_test = vectorizer.transform(ds_test.data)

# Row subsamples (up to 1000 docs each) used by the clustering diagnostics.
# The draws are seeded so that repeated runs use the same documents, and the
# sample size is capped at the population size to avoid a ValueError.
samp_ind1 = rd.sample_without_replacement(n_population=n_train,
                                          n_samples=min(1000, n_train),
                                          random_state=2018)
samp_ind2 = rd.sample_without_replacement(n_population=n_test,
                                          n_samples=min(1000, n_test),
                                          random_state=2018)
DT_train_sample = DT_train[samp_ind1, :]
DT_test_sample = DT_test[samp_ind2, :]

# Inertia sweep: fit KMeans for max_cluster consecutive cluster counts,
# starting at 2, and record the inertia on each document subsample.
max_cluster = 30
sse_train = []
sse_test = []
for n_clus in range(2, max_cluster + 2):
    kmeans = cluster.KMeans(n_clusters=n_clus, n_init=3, max_iter=10,
                            random_state=2019)
    # Inertia of a model fitted on the training subsample
    sse_train.append(kmeans.fit(DT_train_sample).inertia_)
    # NOTE(review): the estimator is re-fitted on the test subsample, so this
    # is the inertia of a separate fit, not of the training model evaluated
    # on test data - confirm this is the intended diagnostic.
    sse_test.append(kmeans.fit(DT_test_sample).inertia_)

# Diagnostic plot: inertia as a function of the number of clusters.
fig = plt.figure(1)
plt.title('Diagnostic for KMeans method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
# The sweep starts at 2 clusters; plot against the real cluster counts
# instead of the list index so the x axis matches its label.
cluster_counts = range(2, 2 + len(sse_train))
plt.plot(cluster_counts, sse_train, c='blue', label='train sample')
plt.plot(cluster_counts, sse_test, c='red', label='test sample')
plt.legend()
plt.show()
    
# Soft clustering of the training documents with a Gaussian mixture model.
n_clus = 3
# GaussianMixture does not accept scipy sparse matrices, so the TF-IDF matrix
# is densified once and reused for fitting and prediction. The min_df/max_df
# filter keeps the vocabulary small, which keeps the dense matrix manageable.
DT_train_dense = DT_train.toarray()
# Seeded so that the component assignment is reproducible across runs
gmm = mix.GaussianMixture(n_components=n_clus, random_state=2019)
gmm.fit(DT_train_dense)
doc_cluster_train = gmm.predict(DT_train_dense)       # hard component labels
doc_prob_train = gmm.predict_proba(DT_train_dense)    # per-component posteriors
