# -*- coding: utf-8 -*-
"""
Created on Thu Oct 28 14:38:47 2021

Task: Simple visualization of 20newsgroups by pandas
Results: descriptive stats and graphs

Python tools    
Libraries: numpy, pandas, sklearn
Modules: datasets
Functions: fetch_20newsgroups

@author: Márton Ispány
"""

import numpy as np;  # importing numpy for arrays
import pandas as pd; # importing pandas
from sklearn.datasets import fetch_20newsgroups; # importing 20newsgroups dataset

# Load the training and test splits of 20newsgroups.
# Both calls shuffle with the same fixed seed so repeated runs see the same order.
ds_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=2021)
ds_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=2021)

# Basic sizes used by the rest of the script
n_train = len(ds_train.data)            # number of training documents
n_test = len(ds_test.data)              # number of test documents
n_class = len(ds_train.target_names)    # number of categories

# Computing the length of emails with their categories
# Each column is built in a single pass over the training set instead of an
# index loop: lengths are character counts of the raw documents, and every
# numeric label is mapped to its category name through target_names.
mail_length = np.array([len(text) for text in ds_train.data], dtype=np.int64)
mail_target = [ds_train.target_names[label] for label in ds_train.target]

# Building dataframe by dictionary (one row per training email)
d = {'Length': mail_length, 'Target': mail_target}
df_train = pd.DataFrame(data=d)

# Basic descriptive statistics of email length per category, computed by pandas.
# The grouping is created once and reused for every aggregate.
_by_target = df_train.groupby('Target')
mean = _by_target.mean()            # sample mean by target
var = _by_target.var()              # sample variance by target
desc_stat = _by_target.describe()   # full descriptive statistics by target

# Visualization by pandas
mean.plot.barh(title='Bar plot of mean email length')  # horizontal bar plot

# Sub-dataframe of the descriptive statistics restricted to the science groups
desc_stat_sci = desc_stat.loc[
    ['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']
]

# Pie of the Length counts for the science groups.
# The pie is requested from the DataFrame (y='count') rather than from the
# Series: pandas draws Series plots onto the current axes when a figure is
# already open, which would paint the pie on top of the bar plot above.
# legend=False keeps the Series-style appearance (labels on the wedges only).
desc_stat_sci['Length'].plot.pie(
    y='count', legend=False, title='Pie plot for categories frequency'
)

# Histogram of email length for every category (one subplot per target)
df_train.hist(column='Length', bins=50, by='Target')

# Box plot of email length grouped by category
df_train.boxplot(by='Target')