# -*- coding: utf-8 -*-
"""
Created on Wed Oct 20 08:50:27 2021

Task: Dimension reduction of an artificial dataset by PCA and SVD
Results: descriptive stats and graphs

Python tools    
Libraries: numpy, matplotlib, pandas, sklearn
Modules: pyplot, decomposition
Classes: PCA, TruncatedSVD
Functions:

@author: Márton Ispány
"""

import numpy as np;  # importing numpy for arrays
import matplotlib.pyplot as plt;  # importing MATLAB-like plotting framework
from sklearn.decomposition import PCA;  # importing principal component analysis (PCA) class
from sklearn.decomposition import TruncatedSVD; # importing singular valued decomposition (SVD) class
import pandas as pd; # importing pandas

# Artificial dataset: six two-dimensional points, one sample per row.
# The points are symmetric about the origin, so each column sums to zero.
X = np.array([
    [-1, -1],
    [-2, -1],
    [-3, -2],
    [1, 1],
    [2, 1],
    [3, 2],
])

# Principal component analysis of X, keeping both components
pca = PCA(n_components=2).fit(X)  # build the estimator and fit it to the data
Y = pca.transform(X)  # dataset expressed in the principal-component basis
T = pca.components_  # transformation matrix (one principal axis per row)
pca_singular = pca.singular_values_  # singular values of the kept components
var_ratio = pca.explained_variance_ratio_  # share of variance per PC (importance)
variance = pca.explained_variance_  # eigenvalues of the covariance matrix

# Sanity checks on the fitted PCA
I = np.dot(T, T.T)  # orthonormality: should be the 2x2 identity matrix
# PCA.transform centres the data with the fitted per-feature mean before
# projecting onto the components. The manual projection therefore has to
# subtract pca.mean_ too; without it Y1 equals Y only when the columns of X
# happen to have zero mean (as they do for the dataset above).
Y1 = np.dot(X - pca.mean_, T.T)  # manual transformation, should reproduce Y
var_ratio1 = variance / np.sum(variance)  # should reproduce var_ratio

# Checking that the transformed dataset is uncorrelated
df = pd.DataFrame(data=Y)
# Pearson correlation between the principal components: expected to be the
# identity matrix (up to floating-point rounding in the off-diagonal entries).
corr_matrix = df.corr(method='pearson')

# Scatterplot of the original points (red) and their PCA coordinates (blue)
plt.figure(1)
plt.scatter(X[:, 0], X[:, 1], color='red', label='original')
plt.scatter(Y[:, 0], Y[:, 1], color='blue', label='transformed')
# Coordinate axes through the origin, drawn with fixed extents
plt.plot((-4, 4), (0, 0), color='black')
plt.plot((0, 0), (-3, 3), color='black')
# Annotations
plt.title('Scatterplot for original and transformed dataset')
plt.xlabel('First coordinate')
plt.ylabel('Second coordinate')
plt.legend(loc='lower right')
plt.show()

# Truncated SVD of the dataset, keeping only the leading component.
# Unlike PCA, TruncatedSVD works on X as given (no centring step).
svd = TruncatedSVD(n_components=1)
Ysvd = svd.fit(X).transform(X)  # fit to the data, then project the same data
Tsvd = svd.components_  # transformation matrix (single row)
svd_singular = svd.singular_values_  # singular value of the kept component

# Checking: project the original data onto the SVD component by hand
Ysvd1 = Tsvd.dot(X.T).T  # manual transformation, should reproduce Ysvd
# Compare the first column of Y and Ysvd
# NOTE(review): X has zero column means, so the two are expected to agree,
# possibly up to a sign flip -- confirm by inspection.

# SVD computed directly with numpy (reduced form, full_matrices=False)
# NOTE: np.linalg.svd returns the right singular vectors already transposed,
# so V below plays the role of V^T (Vh) and is used without a further transpose.
U, S, V = np.linalg.svd(X, full_matrices=False)
X1 = U.dot(np.diag(S)).dot(V)  # reconstruction U * diag(S) * Vh, should give X