# -*- coding: utf-8 -*-
"""
Created on Mon May 4 00:00:55 2020

Clustering of Aggregation dataset

@author: Márton Ispány
"""

import urllib;  # importing url handling
import urllib.request;  # urlopen lives in this submodule; "import urllib" alone does not load it

import numpy as np;  # importing numerical computing package
from matplotlib import pyplot as plt;  # importing the MATLAB-like plotting tool
from sklearn import cluster;  # importing clustering algorithms
from sklearn import metrics;  # importing cluster metrics

# Download the tab-separated Aggregation dataset and split it into the
# two coordinate columns (X) and the label column (y).
url = 'https://arato.inf.unideb.hu/ispany.marton/DataMining/Practice/Clustering/Aggregation.tsv'

# The response is used as a context manager so the HTTP connection is
# closed as soon as the whole table has been parsed, instead of being
# left open for the lifetime of the script.
with urllib.request.urlopen(url) as raw_data:
    data = np.loadtxt(raw_data, delimiter="\t")

X = data[:, 0:2]  # first two columns: point coordinates
y = data[:, 2]    # third column: label of each point

# Figure 1: the raw datapoints, coloured by the labels stored in the file.
fig = plt.figure(1)
first_coord, second_coord = X[:, 0], X[:, 1]
plt.scatter(first_coord, second_coord, s=50, c=y)
# Decorations are applied after drawing; they act on the same current axes.
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatterplot of datapoints with labels')
plt.show()

# K-means clustering with a fixed number of clusters K
K = 6
# fit() returns the fitted estimator, so construction and fitting are chained.
kmeans_cluster = cluster.KMeans(n_clusters=K, random_state=2020).fit(X)
# Cluster index assigned to every datapoint by the fitted model.
ypred = kmeans_cluster.predict(X)

# Figure 2: the same datapoints, coloured by the predicted cluster.
fig = plt.figure(2)
first_coord, second_coord = X[:, 0], X[:, 1]
plt.scatter(first_coord, second_coord, s=50, c=ypred)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatterplot of datapoints clusters')
plt.show()

# Sweep the number of clusters from 2 up to Max_K - 1 and record, for each
# value, the K-means inertia (SSE) and the Davies-Bouldin score.
Max_K = 30
SSE = np.zeros(Max_K - 2)   # SSE[i] belongs to the run with i + 2 clusters
DB = np.zeros_like(SSE)     # DB[i] belongs to the run with i + 2 clusters
for i, n_c in enumerate(range(2, Max_K)):
    # Same seed for every run, so the sweep is reproducible.
    kmeans = cluster.KMeans(n_clusters=n_c, random_state=2020).fit(X)
    ypred = kmeans.labels_
    SSE[i] = kmeans.inertia_
    DB[i] = metrics.davies_bouldin_score(X, ypred)
    
# Figure 3: SSE as a function of the number of clusters.
fig = plt.figure(3)
cluster_counts = np.arange(2, Max_K)  # x axis: the cluster counts 2 .. Max_K - 1
plt.plot(cluster_counts, SSE, color='red')
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.title('Sum of squares of error curve')
plt.show()

# Figure 4: Davies-Bouldin score as a function of the number of clusters.
fig = plt.figure(4)
cluster_counts = np.arange(2, Max_K)  # x axis: the cluster counts 2 .. Max_K - 1
plt.plot(cluster_counts, DB, color='blue')
plt.xlabel('Number of clusters')
plt.ylabel('DB index')
plt.title('Davies-Bouldin score curve')
plt.show()