# -*- coding: utf-8 -*-
"""
Created on Mon May 4 09:23:04 2020

Advanced clustering (hierarchical and DBSCAN) of Aggregation dataset from the URL
https://arato.inf.unideb.hu/ispany.marton/DataMining/Practice/Clustering/

@author: Márton
"""

import urllib;  # importing url handling
import urllib.request;  # importing urlopen explicitly (plain "import urllib" does not bind the request submodule)

import numpy as np;  # importing numerical computing package
from matplotlib import pyplot as plt;  # importing the MATLAB-like plotting tool
from scipy.cluster.hierarchy import dendrogram  # importing clustering visualization
from sklearn import cluster;  # importing clustering algorithms
from sklearn import metrics;  # importing cluster metrics

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted AgglomerativeClustering model.

    The model must have been fitted with ``distance_threshold`` set so
    that ``distances_`` is populated.  Extra keyword arguments are
    forwarded to :func:`scipy.cluster.hierarchy.dendrogram`.
    """
    n_samples = len(model.labels_)
    merges = model.children_

    # Number of original samples under each internal node: a child index
    # below n_samples is a leaf (counts as 1); otherwise it points to an
    # earlier merge whose count has already been computed.
    counts = np.zeros(merges.shape[0])
    for node, (left, right) in enumerate(merges):
        total = 0
        for child in (left, right):
            total += 1 if child < n_samples else counts[child - n_samples]
        counts[node] = total

    # Assemble the (n-1) x 4 linkage matrix scipy expects:
    # [child a, child b, merge distance, sample count].
    linkage_matrix = np.column_stack(
        [merges, model.distances_, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

# Download the Aggregation dataset (tab-separated columns: x, y, label)
# and split it into the 2-D input matrix X and the label vector y.
url = 'https://arato.inf.unideb.hu/ispany.marton/DataMining/Practice/Clustering/Aggregation.tsv'
# urlopen lives in the urllib.request submodule; use the response as a
# context manager so the HTTP connection is closed after reading.
with urllib.request.urlopen(url) as raw_data:
    data = np.loadtxt(raw_data, delimiter="\t")  # loading dataset
X = data[:, 0:2]  # first two columns: the input attributes
y = data[:, 2]    # third column: the (true) label attribute

# Visualizing datapoints using label colors
fig = plt.figure(1)
plt.title('Scatterplot of datapoints with labels')
plt.xlabel('X')
plt.ylabel('Y')
plt.scatter(X[:, 0], X[:, 1], s=50, c=y)
plt.show()

# Agglomerative clustering with single linkage method
link = 'single'  # linkage method
# Build the full merge tree: distance_threshold=0 with n_clusters=None
# makes the model keep every merge and populate distances_.
single_cluster = cluster.AgglomerativeClustering(
    distance_threshold=0, n_clusters=None, linkage=link)
single_cluster.fit(X)

# Plot the top p levels of the dendrogram
fig = plt.figure(2)
plt.title('Hierarchical Clustering Dendrogram (single linkage)')
plot_dendrogram(single_cluster, truncate_mode='level', p=4)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

# Generating clusters
K = 7
single_cluster = cluster.AgglomerativeClustering(n_clusters=K, linkage=link)
single_cluster.fit(X)
ypred_single = single_cluster.labels_
# Internal validity index (Davies-Bouldin, lower is better) and the
# contingency matrix of cluster labels against the true labels.
db_single = metrics.davies_bouldin_score(X, ypred_single)
cm_single = metrics.cluster.contingency_matrix(y, ypred_single)


# Visualizing datapoints using cluster label
fig = plt.figure(3)
plt.title('Scatterplot of datapoints with single linkage clustering')
plt.xlabel('X')
plt.ylabel('Y')
plt.scatter(X[:, 0], X[:, 1], s=50, c=ypred_single)
plt.show()

# Agglomerative clustering with complete linkage method
link = 'complete'  # linkage method
# Build the full merge tree (all merge distances retained).
complete_cluster = cluster.AgglomerativeClustering(
    distance_threshold=0, n_clusters=None, linkage=link)
complete_cluster.fit(X)

# Plot the top p levels of the dendrogram
fig = plt.figure(4)
plt.title('Hierarchical Clustering Dendrogram (complete linkage)')
plot_dendrogram(complete_cluster, truncate_mode='level', p=4)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

# Generating clusters
K = 7;
complete_cluster = cluster.AgglomerativeClustering(n_clusters=K,linkage=link);
complete_cluster.fit(X);
ypred_complete = complete_cluster.labels_;
db_complete = metrics.davies_bouldin_score(X,ypred_complete);
cm_complete = metrics.cluster.contingency_matrix(y,ypred_complete);

# Visualizing datapoints using cluster label
fig = plt.figure(5);
plt.title('Scatterplot of datapoints with complete linkage clustering');
plt.xlabel('X');
plt.ylabel('Y');
plt.scatter(X[:,0],X[:,1],s=50,c=ypred_complete);
plt.show();

# Agglomerative clustering with Ward method
link = 'ward'  # linkage method
# Build the full merge tree (all merge distances retained).
ward_cluster = cluster.AgglomerativeClustering(
    distance_threshold=0, n_clusters=None, linkage=link)
ward_cluster.fit(X)

# Plot the top p levels of the dendrogram
fig = plt.figure(6)
# Bug fix: the title previously said "complete linkage" (copy-paste error);
# this figure shows the Ward-linkage dendrogram.
plt.title('Hierarchical Clustering Dendrogram (ward linkage)')
plot_dendrogram(ward_cluster, truncate_mode='level', p=4)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

# Generating clusters
K = 7
ward_cluster = cluster.AgglomerativeClustering(n_clusters=K, linkage=link)
ward_cluster.fit(X)
# Bug fix: the labels were previously taken from complete_cluster, so the
# "Ward" metrics and plot actually showed the complete-linkage result.
ypred_ward = ward_cluster.labels_
# Davies-Bouldin score and contingency matrix against the true labels.
db_ward = metrics.davies_bouldin_score(X, ypred_ward)
cm_ward = metrics.cluster.contingency_matrix(y, ypred_ward)

# Visualizing datapoints using cluster label
fig = plt.figure(7)
# Bug fix: title previously said "complete linkage" (copy-paste error).
plt.title('Scatterplot of datapoints with ward linkage clustering')
plt.xlabel('X')
plt.ylabel('Y')
plt.scatter(X[:, 0], X[:, 1], s=50, c=ypred_ward)
plt.show()

# DBSCAN clustering
radius = 1.06      # radius of the neighborhood (eps)
inner_points = 5   # minimum neighbors for a core (inner) point
dbscan_cluster = cluster.DBSCAN(eps=radius, min_samples=inner_points)
dbscan_cluster.fit(X)
ypred_dbscan = dbscan_cluster.labels_
# Contingency matrix against the true labels (noise points get label -1).
cm_dbscan = metrics.cluster.contingency_matrix(y, ypred_dbscan)

# Visualizing datapoints using cluster label
fig = plt.figure(8)
plt.title('Scatterplot of datapoints with DBSCAN')
plt.xlabel('X')
plt.ylabel('Y')
plt.scatter(X[:, 0], X[:, 1], s=50, c=ypred_dbscan)
plt.show()

# The parameters (eps and min_samples) were chosen by trial and error.
# DBSCAN gives the best result here: it recovers the 7 original clusters
# more or less precisely.
# The clustering results can also be compared via the contingency matrices (cm),
# where the rows are the true labels and the columns are the cluster labels.