####-----------------------------------------------------------
#### PCA and Pairwise Correlation
## 3D Scatterplot
## !! Dataset !! (A sample of this data was used in this code)
## Link to Full Iris Dataset from Kaggle:
## https://www.kaggle.com/datasets/uciml/iris?resource=download
##
## Gates, 2024
####------------------------------------------------------------
##
## Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from IPython.display import clear_output
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import seaborn as sns
## This is a small sample of the Iris dataset (link above)
## !!! This is MY path :) YOU need to update this to be YOUR path !!!
path="C:/Users/profa/Desktop/UCB/Classes/ML CSCI 5612/Data/Iris_Subset2.csv"
DF=pd.read_csv(path)
print(DF)
OriginalDF=DF.copy()
print(OriginalDF)
##--------------------------------
## Remove and save the label
## Next, update the label so that
## rather than names like "Iris-setosa"
## we use numbers instead.
## This will be necessary when we "color"
## the data in our plot
##---------------------------------------
DFLabel=DF["Species"] ## Save the Label
print(DFLabel) ## print the labels
print(type(DFLabel)) ## check the datatype you have
DFLabel_string=DFLabel
## Remap the label names from strings to numbers
MyDic={"Iris-setosa":0, "Iris-versicolor":1, "Iris-virginica":2}
DFLabel = DFLabel.map(MyDic) ## Update the label to your number remap values
print(DFLabel) ## Print the labels to confirm
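## Aside: a minimal sketch of an alternative remap using pd.factorize.
## It assigns integer codes automatically (code order follows first appearance,
## not MyDic); codes/uniques are illustrative variable names.
codes, uniques = pd.factorize(DFLabel_string)
print(codes)   ## one integer code per row
print(uniques) ## the distinct species names, in code order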
## Now, remove the label from the original dataframe
DF=DF.drop(["Species"], axis=1)
print(DF) #Print the dataframe to confirm
###-------------------------------------------
### Standardize your dataset
###-------------------------------------------
scaler = StandardScaler() ##Instantiate
DF=scaler.fit_transform(DF) ## Scale data
print(DF)
print(type(DF))
print(DF.shape)
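## Sanity check (a quick sketch): after StandardScaler each column should have
## mean ~0 and standard deviation ~1
print(DF.mean(axis=0).round(3))
print(DF.std(axis=0).round(3))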
###############################################
###--------------PERFORM PCA------------------
###############################################
## Instantiate PCA and choose how many components
MyPCA=PCA(n_components=3)
# Project the original data into the PCA space
Result=MyPCA.fit_transform(DF)
## Print the values of the first component
#print(Result[:,0])
print(Result) ## Print the new (transformed) dataset
print("The eigenvalues:", MyPCA.explained_variance_)
## Proof
MyCov=np.cov(Result.T)
print("Covariance of the PCA-transformed data:\n", MyCov) ## The variances on the diagonal will match the eigenvalues
print("The relative eigenvalues are:",MyPCA.explained_variance_ratio_)
print("The actual eigenvalues are:", MyPCA.explained_variance_)
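## Optional cross-check (a sketch): the eigenvalues of the sample covariance
## matrix of the standardized data should match explained_variance_
## (both np.cov and sklearn use the n-1 denominator); the top three values
## here should equal the three eigenvalues printed above.
evals, evecs = np.linalg.eigh(np.cov(DF.T))
print("Eigenvalues of the covariance matrix (descending):", np.sort(evals)[::-1])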
EVects=MyPCA.components_
print("The eigenvectors are:\n",EVects)
print(type(EVects))
print("The shape of the eigenvector matrix is\n", EVects.shape)
print(DF.shape)
print(DF)
## Proof: transform the original (standardized) data into the eigenbasis
## using the eigenvector matrix.
print(EVects.T)
## (15, 4) @ (4, 3)
Transf=DF@EVects.T
print("Proof that the transformed data is Data @ EVects.T\n", Transf)
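## Programmatic check (a small sketch): the manual projection should match the
## sklearn result up to floating-point error, because StandardScaler already
## centered the data (which is what PCA subtracts before projecting).
print("Manual projection matches fit_transform:", np.allclose(Result, Transf))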
################################################
# Extract loadings
## Rerun PCA with all components (no dimensionality reduction)
############################################################
MyPCA2=PCA(n_components=4)
# Project the original data into the PCA space
Result2=MyPCA2.fit_transform(DF)
print(MyPCA2.components_)
print(MyPCA2.components_.T) ## Makes the eigenvectors columnwise
print(MyPCA2.explained_variance_ratio_)
print(OriginalDF)
print(len(OriginalDF))
print(OriginalDF.columns[0:4])
## Quick index check: one PC name per retained component
for i in range(1, MyPCA2.n_components_ + 1):
    print(i)
## Loadings: eigenvectors as columns, one row per original feature
loadings = pd.DataFrame(MyPCA2.components_.T,
                        columns=[f'PC{i}' for i in range(1, MyPCA2.n_components_ + 1)],
                        index=OriginalDF.columns[0:4])
print(loadings)
## Print the most important variables using a threshold
threshold = 0.4
# Find features with loadings above the threshold for each principal component
important_features = {}
for column in loadings.columns:
    important_features[column] = loadings.index[loadings[column].abs() > threshold].tolist()
# Now 'important_features' dictionary contains the important features for each PC
for pc, features in important_features.items():
    print(f"{pc}: {', '.join(features)}")
# Plot heatmap of loadings
plt.figure(figsize=(10, 8))
sns.heatmap(loadings, annot=True, cmap='coolwarm')
plt.title('Feature Importance in Principal Components')
plt.show()
#################################################
## Visualize the transformed 3D dataset
## we just created using PCA
#################################################
fig2 = plt.figure()
#figsize=(12, 12))
ax2 = fig2.add_subplot(projection='3d')
#Axes3D(fig2, rect=[0, 0, .90, 1], elev=48, azim=134)
x=Result[:,0]
y=Result[:,1]
z=Result[:,2]
print(y)
ax2.scatter(x,y,z, cmap="RdYlGn", edgecolor='k', s=200, c=DFLabel)
#surf2 = ax2.plot_surface(x, y, z, cmap='viridis')
ax2.set_xlabel('X')
ax2.set_ylabel('Y')
ax2.set_zlabel('Z')
ax2.set_title('3D PCA')
#
plt.savefig("MyImage.jpeg")  ## save before plt.show(), otherwise the saved image is blank
plt.show()
############################################
## Create Plot to Show Eigenvalues
############################################
fig3=plt.figure()
ACCUM_eigenvalues = np.cumsum(MyPCA.explained_variance_)
print(ACCUM_eigenvalues)
print(MyPCA.explained_variance_ratio_)
plt.bar(range(0, len(MyPCA.explained_variance_ratio_)), MyPCA.explained_variance_ratio_,
        alpha=0.5, align='center', label='Individual Explained Variances')
#plt.step(range(0,len(ACCUM_eigenvalues)), ACCUM_eigenvalues, where='mid',label='Cumulative Explained Variances')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.title("Eigenvalues: Percentage of Variance/Information")
#plt.legend(loc='best')
plt.tight_layout()
plt.show()
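############################################
## A cumulative explained-variance plot (a sketch using the
## variance ratios rather than the raw eigenvalues, so the curve tops out at 1)
############################################
fig4 = plt.figure()
cum_ratio = np.cumsum(MyPCA.explained_variance_ratio_)
plt.step(range(0, len(cum_ratio)), cum_ratio, where='mid', label='Cumulative Explained Variance')
plt.ylabel('Cumulative explained variance ratio')
plt.xlabel('Principal component index')
plt.title("Cumulative Explained Variance")
plt.legend(loc='best')
plt.tight_layout()
plt.show()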
###############################################
## Create a DF of the most important features
##################################################
shape= MyPCA.components_.shape[0]
#print(shape)
feature_names = list(OriginalDF.columns[0:4])  ## use the dataframe's own column order so indices line up with components_
most_important = [np.abs(MyPCA.components_[i]).argmax() for i in range(shape)]
most_important_names = [feature_names[most_important[i]] for i in range(shape)]
# Build a dictionary of the most important feature for each PC
MyDic = {'PC{}'.format(i + 1): most_important_names[i] for i in range(shape)}
# Build the dataframe
Important_DF = pd.DataFrame(MyDic.items(), columns=['PC', 'Most Important Feature'])
print(Important_DF)
###############################################
## Create a biplot of the most important features
##################################################
PCA_dataset= Result ## recall that we ran PCA with 3 to get this above
EVectors_as_columns=EVects.T
print(EVectors_as_columns)
## PCA dataset
print(PCA_dataset)
def biplot(PCA_dataset, EVectors_as_columns, labels=feature_names):
    xs = PCA_dataset[:, 0]
    ys = PCA_dataset[:, 1]
    n = EVectors_as_columns.shape[0]  ## one arrow per original feature
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    plt.scatter(xs*scalex, ys*scaley, cmap="RdYlGn", edgecolor='k', s=100, c=DFLabel)
    for i in range(n):
        plt.arrow(0, 0, EVectors_as_columns[i, 0], EVectors_as_columns[i, 1], color='r', alpha=0.5)
        if labels is None:
            plt.text(EVectors_as_columns[i, 0] * 1.07, EVectors_as_columns[i, 1] * 1.07,
                     "Var" + str(i + 1), color='g', ha='center', va='center')
        else:
            plt.text(EVectors_as_columns[i, 0] * 1.07,
                     EVectors_as_columns[i, 1] * 1.07, labels[i], color='g',
                     ha='center', va='center')
# Plot biplot
plt.figure(figsize=(10, 20))
#figsize=(10, 16)
biplot(PCA_dataset, EVectors_as_columns, labels=feature_names)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Biplot of Principal Components')
plt.show()
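###############################################
## Pairwise Correlation (named in the header above)
## A minimal sketch, assuming the first four columns of
## OriginalDF are the numeric feature columns
##################################################
corr = OriginalDF.iloc[:, 0:4].corr()
print(corr)
plt.figure(figsize=(6, 5))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Pairwise Correlation of the Original Features')
plt.show()
## Optional: a pairplot colored by species
sns.pairplot(OriginalDF, hue='Species')
plt.show()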