# -*- coding: utf-8 -*-
"""
Created on Mon Apr  8 10:38:19 2019

@author: Márton
"""

import numpy as np; 
import matplotlib.pyplot as plt;
from sklearn import datasets as ds;
from sklearn import model_selection as ms;
from sklearn import cluster, metrics;

# Load the handwritten-digits dataset and record its dimensions:
# n = number of rows (samples), p = number of columns (features).
digits = ds.load_digits()
n, p = digits.data.shape

# Partition into training and test sets; t_s is the fraction held out
# for testing, and the fixed random_state makes the split reproducible.
t_s = 0.3
X_train, X_test, y_train, y_test = ms.train_test_split(
    digits.data,
    digits.target,
    test_size=t_s,
    random_state=2019,
)
                                                       
# Agglomerative clustering of the training set with Ward linkage.
n_c, link = 10, 'ward'
AgglClus = cluster.AgglomerativeClustering(linkage=link, n_clusters=n_c).fit(X_train)
digits_labels_train = AgglClus.labels_

# Goodness of fit against the true digit labels: adjusted Rand index and
# the contingency matrix of true labels versus cluster labels.
rand_ward = metrics.adjusted_rand_score(y_train, digits_labels_train)
cm_train_ward = metrics.cluster.contingency_matrix(y_train, digits_labels_train)

# Agglomerative clustering of the training set with single linkage.
link = 'single'
n_c = 10
AgglClus = cluster.AgglomerativeClustering(linkage=link, n_clusters=n_c)
digits_labels_train = AgglClus.fit(X_train).labels_

# Goodness of fit against the true digit labels: adjusted Rand index and
# the contingency matrix of true labels versus cluster labels.
rand_single = metrics.adjusted_rand_score(y_train, digits_labels_train)
cm_train_single = metrics.cluster.contingency_matrix(y_train, digits_labels_train)

# Agglomerative clustering of the training set with average linkage.
n_c, link = 10, 'average'
AgglClus = cluster.AgglomerativeClustering(
    n_clusters=n_c,
    linkage=link,
).fit(X_train)
digits_labels_train = AgglClus.labels_

# Goodness of fit against the true digit labels: adjusted Rand index and
# the contingency matrix of true labels versus cluster labels.
rand_average = metrics.adjusted_rand_score(y_train, digits_labels_train)
cm_train_average = metrics.cluster.contingency_matrix(y_train, digits_labels_train)

# Complete linkage clustering
# NOTE: this previously used n_c = 5, while the Ward, single and average
# runs all use 10 clusters. The same cluster count is used here so that
# rand_complete and cm_train_complete are directly comparable with the
# other linkages.
n_c = 10
link = 'complete'
AgglClus = cluster.AgglomerativeClustering(n_clusters=n_c, linkage=link)
AgglClus.fit(X_train)
digits_labels_train = AgglClus.labels_

# Goodness of fit against the true digit labels: contingency matrix of
# true labels versus cluster labels, and the adjusted Rand index.
cm_train_complete = metrics.cluster.contingency_matrix(y_train, digits_labels_train)
rand_complete = metrics.adjusted_rand_score(y_train, digits_labels_train)

# Keep the merge tree (children of each non-leaf node) of the fitted
# complete-linkage model.
complete_tree = AgglClus.children_

# Goodness of fit for Ward linkage as a function of the number of clusters:
# RW[i] holds the adjusted Rand index obtained with i + 2 clusters, for
# cluster counts 2 .. Max_K - 1.
Max_K = 30
RW = np.zeros(Max_K - 2)
for i, n_c in enumerate(range(2, Max_K)):
    clustering = cluster.AgglomerativeClustering(n_clusters=n_c, linkage='ward')
    clustering.fit(X_train)
    RW[i] = metrics.adjusted_rand_score(y_train, clustering.labels_)

# The plotted values come from adjusted_rand_score, so the figure is
# labelled as the adjusted Rand index rather than the plain Rand index.
plt.figure(1)
plt.title('Adjusted Rand index')
plt.xlabel('Number of clusters')
plt.ylabel('ARI')
plt.plot(np.arange(2, Max_K), RW, color='blue')
plt.show()

