###############
# Authored by Weisheng Jiang
# Book 4  |  From Basic Arithmetic to Machine Learning
# Published and copyrighted by Tsinghua University Press
# Beijing, China, 2022
###############

# Bk4_Ch24_01_A

import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd  
from sklearn.datasets import load_iris

# Load the iris dataset through scikit-learn's load_iris()
iris = load_iris()

# Feature matrix and class labels as exposed by the loader
X, y = iris.data, iris.target

# Column labels, one per feature (x1 .. x4)
feature_names = [
    'Sepal length, x1',
    'Sepal width, x2',
    'Petal length, x3',
    'Petal width, x4',
]

# Wrap the feature array in a DataFrame so the columns carry the labels
X_df = pd.DataFrame(data=X, columns=feature_names)

#%% Original data, X

# Plain NumPy array of the same data, used for the matrix products below
X = X_df.to_numpy()

#%% Gram matrix, G

# G = X^T X: entry (i, j) is the inner product of columns i and j of X
G = X.T @ X

#%% Cosine similarity matrix, C

# from sklearn.metrics.pairwise import cosine_similarity
# C = cosine_similarity(X)
from numpy.linalg import inv

# Scaling matrix: each diagonal element is the Euclidean norm of column x_j
# (the diagonal of G holds the squared column norms)
S_norm = np.diag(np.sqrt(np.diag(G)))

# Invert the scaling matrix once and reuse it on both sides of G,
# instead of running the same inversion twice
S_norm_inv = inv(S_norm)

# C = S^-1 G S^-1: pairwise cosine similarity between the columns of X
C = S_norm_inv @ G @ S_norm_inv

#%% centroid of data matrix, E(X)

# Column means laid out as a single-row DataFrame
E_X = X_df.mean().to_frame().transpose()

#%% Demean, centralize, X_c

# Subtract each column's mean from that column
X_c = X_df - X_df.mean()

#%% covariance matrix, Sigma

# Pairwise covariance of the columns
SIGMA = X_df.cov()

#%% correlation matrix, P

# Pairwise correlation of the columns
RHO = X_df.corr()

#%% Normalize data, Z_X

from scipy.stats import zscore

# Column-wise z-scores of the data.
# NOTE(review): zscore is called with its defaults; SciPy documents ddof=0
# there while DataFrame.cov defaults to ddof=1 -- confirm this is intended.
Z_X = zscore(X_df)

#%%

# Bk4_Ch24_01_B

#%% QR decomposition

from numpy.linalg import qr

# Reduced QR factorization of the data: for an (M, N) input, Q is (M, K)
# and R is (K, K) with K = min(M, N)
Q, R = qr(X_df, mode='reduced')

#%%

# Bk4_Ch24_01_C

#%% Cholesky decomposition

from numpy.linalg import cholesky as chol

# Lower-triangular Cholesky factor of the Gram matrix G
L_G = chol(G)
# Its transpose is the matching upper-triangular factor
R_G = L_G.T

#%% Cholesky decompose covariance matrix, SIGMA

# Same factorization applied to the covariance matrix
L_Sigma = chol(SIGMA)
R_Sigma = L_Sigma.T

#%%

# Bk4_Ch24_01_D

#%% eigen decompose G

from numpy.linalg import eig

# Each cell below follows the same two steps:
#   1. eig(...) returns the eigenvalues (1-D) and the eigenvectors (columns)
#   2. np.diag(...) expands the eigenvalue vector into a diagonal matrix
# NOTE(review): the inputs look symmetric, so numpy.linalg.eigh may be the
# better fit; it orders eigenvalues differently, so check downstream use
# before switching.
Lambs_G, V_G = eig(G)
Lambs_G = np.diag(Lambs_G)

#%% eigen decompose Sigma, covariance matrix

Lambs_sigma, V_sigma = eig(SIGMA)
Lambs_sigma = np.diag(Lambs_sigma)

#%% eigen decompose P, correlation matrix

Lambs_P, V_P = eig(RHO)
Lambs_P = np.diag(Lambs_P)

#%%

# Bk4_Ch24_01_E

#%% SVD, original data X

from numpy.linalg import svd

# Economy-size SVD. svd hands back the right singular vectors transposed,
# so transpose once more to store them as the columns of V_X.
U_X, S_X_, V_X = svd(X_df, full_matrices=False)
V_X = V_X.T

# Alternative for full_matrices=True: place the singular values on the
# diagonal of a zero matrix shaped like the data
# indices_diagonal = np.diag_indices(4)
# S_X = np.zeros_like(X_df)
# S_X[indices_diagonal] = S_X_

# For full_matrices=False a square diagonal matrix of singular values suffices
S_X = np.diag(S_X_)

#%% SVD, centered data Xc

# Same steps on the demeaned data; S_Xc first holds the singular values
# and is then replaced by the diagonal matrix built from them
U_Xc, S_Xc, V_Xc = svd(X_c, full_matrices=False)
V_Xc = V_Xc.T
S_Xc = np.diag(S_Xc)

#%% SVD, z scores

# Same steps on the z-scored data
U_Z, S_Z, V_Z = svd(Z_X, full_matrices=False)
V_Z = V_Z.T
S_Z = np.diag(S_Z)