# -*- coding: utf-8 -*-
"""
Created on Mon Oct 8 07:48:11 2017

Principal Component Analysis (PCA) of the Iris data using
the standard numpy libraries and
the scikit-learn machine learning toolkit in Python

@author: Márton Ispány
"""

from sklearn import datasets as ds
from sklearn import decomposition as decomp
from sklearn import model_selection as ms  # cross_validation was removed from scikit-learn; model_selection replaces it
import numpy as np  # importing numpy, the scientific computing package for Python
import matplotlib.pyplot as plt  # importing pyplot, the MATLAB-like plotting framework from the matplotlib library
import matplotlib.colors as col
from numpy import linalg as LA  # importing the linear algebra library from numpy
 
# Load the iris dataset
iris = ds.load_iris()
# Scatterplot for two input attributes
x_axis = 0  # x axis attribute (0,1,2,3)
y_axis = 3  # y axis attribute (0,1,2,3)
colors = ['red', 'green', 'blue']  # colors for the target values: setosa, versicolor, virginica
fig = plt.figure(1)
plt.title('Scatterplot for the iris dataset')
plt.xlabel(iris.feature_names[x_axis])
plt.ylabel(iris.feature_names[y_axis])
plt.scatter(iris.data[:, x_axis], iris.data[:, y_axis], s=50, c=iris.target, cmap=col.ListedColormap(colors))
plt.show()
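
# A brief orientation (not in the original script): load_iris returns a Bunch
# whose fields are used throughout below.
print('data shape:', iris.data.shape)  # 150 samples x 4 attributes
print('classes:', iris.target_names)   # setosa, versicolor, virginica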

# Full PCA using standard numpy libraries
X_centered = iris.data - np.mean(iris.data, axis=0)  # center the data: PCA operates on mean-centered attributes
Cov = np.cov(np.transpose(iris.data))  # covariance matrix of the attributes (np.cov centers internally)
Cov_eig, Cov_vec = LA.eigh(Cov)  # eigenvalues (in ascending order) and orthonormal eigenvectors
PC_iris = np.dot(X_centered, Cov_vec)  # principal components: centered data projected onto the eigenvectors
Cov_pc = np.cov(np.transpose(PC_iris))  # covariance matrix of the PCs, which should be diagonal
dim = Cov.shape[0]  # number of attributes
tot_var = np.trace(Cov)  # total variance: the trace of the covariance matrix
diff = tot_var - np.sum(Cov_eig)  # sanity check: the eigenvalues must sum to the total variance, so diff is ~0
Cov_eig = np.flipud(Cov_eig)  # eigenvalues in decreasing order
var_ratio = Cov_eig/tot_var  # explained variance ratio of each PC
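
# A quick check (not in the original script; the tolerance is an assumption):
# the eigenvectors of the symmetric covariance matrix are orthogonal, so the
# PCs are uncorrelated and Cov_pc carries the eigenvalues on its diagonal.
off_diag = Cov_pc - np.diag(np.diag(Cov_pc))
print('PCs uncorrelated:', np.allclose(off_diag, 0, atol=1e-10))
print('PC variances match the eigenvalues:', np.allclose(np.sort(np.diag(Cov_pc)), np.sort(Cov_eig)))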
fig = plt.figure(2)  # explained variance ratio plot
plt.title('Explained variance ratio plot')
x_pos = np.arange(dim)
plt.xticks(x_pos, x_pos+1)
plt.xlabel('Principal Components')
plt.ylabel('Variance ratio')
plt.bar(x_pos, var_ratio, align='center', alpha=0.5)
plt.show()
cumvar_ratio = np.cumsum(Cov_eig)/tot_var  # cumulative explained variance ratios
fig = plt.figure(3)  # explained cumulative variance ratio plot
plt.title('Explained cumulative variance ratio plot')
x_pos = np.arange(dim)
plt.xticks(x_pos, x_pos+1)
plt.xlabel('Principal Components')
plt.ylabel('Cumulative variance ratio')
plt.plot(x_pos, cumvar_ratio)
plt.show()
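
# A possible follow-up (the 95% threshold is an assumption, not from the
# original script): the cumulative curve is typically used to pick the smallest
# number of components that retains a desired share of the total variance.
n_keep = int(np.searchsorted(cumvar_ratio, 0.95)) + 1
print('Components needed for 95% of the variance:', n_keep)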
PC_iris = np.fliplr(PC_iris)  # reorder the PCs so that PC1 comes first, matching the descending eigenvalues
colors = ['red', 'green', 'blue']  # colors for the target values: setosa, versicolor, virginica
fig = plt.figure(4)  # iris data in PC space
plt.title('Iris data in PC space')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.scatter(PC_iris[:, 0], PC_iris[:, 1], s=50, c=iris.target, cmap=col.ListedColormap(colors))
plt.show()
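
# A hedged check (not in the original script): keeping all components makes PCA
# lossless, so the data can be reconstructed exactly by inverting the projection.
# PC_iris was flipped left-right above, so the eigenvector columns are flipped
# the same way before transposing.
X_rec = np.dot(PC_iris, np.fliplr(Cov_vec).T) + np.mean(iris.data, axis=0)
print('Full reconstruction exact:', np.allclose(X_rec, iris.data))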

# Full PCA using scikit-learn
pca = decomp.PCA()
pca.fit(iris.data)  # scikit-learn centers the data internally
fig = plt.figure(5)
plt.title('Explained variance ratio plot')
var_ratio = pca.explained_variance_ratio_
x_pos = np.arange(len(var_ratio))
plt.xticks(x_pos, x_pos+1)
plt.xlabel('Principal Components')
plt.ylabel('Variance ratio')
plt.bar(x_pos, var_ratio, align='center', alpha=0.5)
plt.show()
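
# A cross-check (not in the original script): scikit-learn's explained variance
# ratios should agree with the eigenvalue-based ratios computed above by hand.
print('numpy and scikit-learn ratios agree:', np.allclose(var_ratio, Cov_eig/tot_var))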

# PCA with limited components
pca = decomp.PCA(n_components=2)
pca.fit(iris.data)
iris_pc = pca.transform(iris.data)
colors = ['green', 'red', 'blue']
fig = plt.figure(6)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.scatter(iris_pc[:, 0], iris_pc[:, 1], s=50, c=iris.target, cmap=col.ListedColormap(colors))
plt.show()
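
# A hedged illustration (not in the original script): with only two components
# the projection loses information; inverse_transform maps the PCs back to the
# attribute space, and the mean squared error measures the loss.
iris_rec = pca.inverse_transform(iris_pc)
print('Reconstruction MSE with 2 PCs:', np.mean((iris.data - iris_rec)**2))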

# Partition the data into training and testing sets
X_train, X_test, y_train, y_test = ms.train_test_split(iris.data, iris.target, test_size=0.1, random_state=1)

colors = ['green', 'red', 'blue']

plt.figure(7)  # training set in the original attribute space
plt.scatter(X_train[:, 1], X_train[:, 2], s=50, c=y_train, cmap=col.ListedColormap(colors))
plt.show()

pca = decomp.PCA(n_components=2)
pca.fit(X_train)  # fit the PCA model on the training set only
X_train_pc = pca.transform(X_train)  # project the training set onto its first two PCs
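
# A sketch of the natural next step (not in the original script): the PCA model
# fitted on the training set projects the held-out test set into the same PC
# space. vmin/vmax pin the color scale in case a class is missing from a split.
X_test_pc = pca.transform(X_test)
plt.figure(8)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.scatter(X_train_pc[:, 0], X_train_pc[:, 1], s=50, c=y_train, cmap=col.ListedColormap(colors), vmin=0, vmax=2)
plt.scatter(X_test_pc[:, 0], X_test_pc[:, 1], s=80, c=y_test, cmap=col.ListedColormap(colors), vmin=0, vmax=2, marker='x')
plt.show()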


