# -*- coding: utf-8 -*-
"""
Created on Mon Apr  1 10:33:53 2019

K-means clustering of Iris dataset

@author: Márton Ispány
"""

from sklearn.datasets import load_iris;  # importing data loader
from sklearn import cluster, metrics;  # importing clustering algorithms and metrics
import sklearn.decomposition as decomp; # importing dimension reduction library
import matplotlib.pyplot as plt;  # importing the MATLAB-like plotting tool
import numpy as np; # importing numerical computing package

# Load the Iris dataset (150 samples, 4 numeric features)
iris = load_iris()

# K-means clustering with a fixed seed for reproducible centroids
n_c = 2  # number of clusters
kmeans = cluster.KMeans(n_clusters=n_c, random_state=2020)
kmeans.fit(iris.data)

iris_labels = kmeans.labels_            # cluster label assigned to each sample
iris_centers = kmeans.cluster_centers_  # centroid coordinates of each cluster
sse = kmeans.inertia_                   # within-cluster sum of squared errors
score = kmeans.score(iris.data)         # negative error reported by the model

# Both sse and score measure the goodness of the clustering.

# Davies-Bouldin index: lower values mean better separated clusters
db = metrics.davies_bouldin_score(iris.data, iris_labels)

# Reduce to two principal components for 2-D visualization
pca = decomp.PCA(n_components=2)
pca.fit(iris.data)

iris_pc = pca.transform(iris.data)        # sample coordinates in the PC space
centers_pc = pca.transform(iris_centers)  # cluster centroids in the PC space

# Visualize the clustering in the space of the first two principal components.
fig = plt.figure(1)
plt.title('Clustering of the Iris data after PCA')
plt.xlabel('PC1')
plt.ylabel('PC2')
# BUGFIX: plt.legend() without any labeled artists emits a
# "No handles with labels found" warning and draws an empty legend,
# so both scatter calls now carry a label.
plt.scatter(iris_pc[:, 0], iris_pc[:, 1], s=50, c=iris_labels,
            label='samples')  # data points colored by cluster label
plt.scatter(centers_pc[:, 0], centers_pc[:, 1], s=200, c='red', marker='X',
            label='centroids')  # cluster centroids as large red X markers
plt.legend()
plt.show()

# Scan candidate cluster numbers k = 2 .. Max_K-1 and record quality metrics.
Max_K = 31  # exclusive upper bound for the cluster number
SSE = np.zeros(Max_K - 2)  # within-cluster sums of squared errors, one per k
DB = np.zeros(Max_K - 2)   # Davies-Bouldin indices, one per k

for i, n_c in enumerate(range(2, Max_K)):
    kmeans = cluster.KMeans(n_clusters=n_c, random_state=2020)
    kmeans.fit(iris.data)
    iris_labels = kmeans.labels_
    SSE[i] = kmeans.inertia_
    DB[i] = metrics.davies_bouldin_score(iris.data, iris_labels)

# Plot the SSE (elbow) curve against the number of clusters.
fig = plt.figure(2)
plt.title('Sum of squares of error curve')
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.plot(range(2, Max_K), SSE, color='red')
plt.show()

# Plot the Davies-Bouldin index against the number of clusters.
fig = plt.figure(3)
plt.title('Davies-Bouldin score curve')
plt.xlabel('Number of clusters')
plt.ylabel('DB index')
plt.plot(range(2, Max_K), DB, color='blue')
plt.show()

# The local minimum of the Davies-Bouldin curve suggests the optimal cluster number.
