Advanced PCA Code-Tutorial

The following code is a tutorial. The best way to learn from it is to read the code, type it in yourself (line by line), and run it a few lines at a time so you can see what it is doing and what it is showing you. You will only gain value from — and truly understand — this code if you work through it line by line and read all of the comments.


# -*- coding: utf-8 -*-
"""
Created on Fri Jan 30 12:06:12 2026

@author: profa
"""

####-----------------------------------------------------------
#### More on PCA - A CLoser Look at the Mathy bits - by hand
##   and using Sklearn as a comparison. 
##   Link to Full Iris Dataset from Kaggle:
##   https://www.kaggle.com/datasets/uciml/iris?resource=download
##
## Gates, 2026
####------------------------------------------------------------
##
## Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from IPython.display import clear_output
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import seaborn as sns

##--------------------------------------------------------------
## Load the data: a small sample of the Iris dataset (link above).
## !!! This is MY path :)  YOU need to update this to be YOUR path !!!
##--------------------------------------------------------------
path="C:/Users/profa/Desktop/Iris_Subset2.csv"
DF=pd.read_csv(path)
print(DF)
OriginalDF=DF.copy()  ## keep an untouched copy for reference later in the tutorial
print(OriginalDF)
##--------------------------------
## Remove and save the label.
## Next, remap the label so that rather than names like "Iris-setosa"
## we use numbers instead. The numeric labels are needed when we
## "color" the data points in the plots below.
##---------------------------------------
DFLabel=DF["Species"]  ## Save the Label 
print(DFLabel)  ## print the labels
print(type(DFLabel))  ## check the datatype you have (a pandas Series)
DFLabel_string=DFLabel  ## alias to the string labels; .map below REBINDS DFLabel, so this keeps the originals
## Remap the label names from strings to numbers
MyDic={"Iris-setosa":0, "Iris-versicolor":1, "Iris-virginica":2}
DFLabel = DFLabel.map(MyDic)  ## Update the label to your number remap values
print(DFLabel) ## Print the labels to confirm 
## Now, remove the label column from the feature dataframe
DF=DF.drop(["Species"], axis=1)
print(DF) #Print the dataframe to confirm 

###-------------------------------------------
### Reduce the dimension of the data so we can "see" it
### and can use it for this mathy tutorial.
### Let's arbitrarily remove SepalLengthCm.
###--------------------------------------------
DF=DF.drop(["SepalLengthCm"], axis=1)
print(DF) #Print the dataframe to confirm 

## Next - let's look at the data - we can now because it's 3D
fig1 = plt.figure()
ax1 = fig1.add_subplot(projection='3d')
# Fixed axis limits: every later plot reuses these so shifts are visible
ax1.set_xlim(-1, 5)
ax1.set_ylim(-1, 5)
ax1.set_zlim(-5, 5)

x=DF.iloc[:,0]
y=DF.iloc[:,1] 
z=DF.iloc[:,2]

## One point per row, colored by the numeric species label
ax1.scatter(x,y,z, cmap="RdYlGn", edgecolor='k', s=200, c=DFLabel)
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_zlabel('Z')
ax1.set_title('3D Plot of Dataset BEFORE we make changes')

## BUG FIX: savefig() must come BEFORE show(). In non-interactive mode
## plt.show() blocks and the figure is torn down when its window closes,
## so calling savefig() afterwards writes a blank image.
plt.savefig("MyDataset_3DImage.jpeg")
plt.show()

###-----------------------------------------
## Calculate and then subtract the mean (i.e., CENTER the data).
## Then replot and compare the two plots -- the one before this and
## this one. Can you see what happens when you subtract the mean?
## NOTE(review): the original comment called this "whitening"; strictly
## speaking, subtracting the mean is CENTERING -- whitening also
## rescales/decorrelates. The W_ variable prefix is kept as-is.
###-----------------------------------------------------
Dataset_Mean  = DF.mean(axis=0)
print(Dataset_Mean) ## The mean is (3.060000, 3.873333, 1.246667)
## Subtract the mean from all the points (rows)
W_Data = DF - Dataset_Mean
print(W_Data)

## Now - visualize this new W_Data
fig11 = plt.figure()
ax11 = fig11.add_subplot(projection='3d')
## Use the SAME axis limits so you can SEE the shift!
ax11.set_xlim(-1, 5)
ax11.set_ylim(-1, 5)
ax11.set_zlim(-5, 5)

x=W_Data.iloc[:,0]
y=W_Data.iloc[:,1] 
z=W_Data.iloc[:,2]

ax11.scatter(x,y,z, cmap="RdYlGn", edgecolor='k', s=200, c=DFLabel)
ax11.set_xlabel('X')
ax11.set_ylabel('Y')
ax11.set_zlabel('Z')
ax11.set_title('3D Plot of Dataset AFTER we subtract the mean')

## BUG FIX: save BEFORE show -- savefig() after show() writes a blank
## image once the figure has been closed.
plt.savefig("MyDataset_3DImage_White.jpeg")
plt.show()

## IMPORTANT - notice that the SHAPE of the dataset did not change,
## only its location. By subtracting the mean we MOVE all the data so
## that its mean is 0.

## Confirm that the mean is zero for the new W_Data.
W_Dataset_Mean  = np.round(W_Data.mean(axis=0),3) ## round it to 3 decimals
print(W_Dataset_Mean)

###-------------------------------------------------
## Next - divide the centered W_Data by the standard deviation.
## Subtracting the mean and then dividing by the standard deviation
## STANDARDIZES the data (converts values to z-scores).
###-------------------------------------------------
StandDev_W_Data = np.std(W_Data, axis=0)  ## per-column population std (np.std default ddof=0)
print(StandDev_W_Data)

Dataset_Standard  = W_Data/StandDev_W_Data
print(Dataset_Standard) 

## Check it! Each column's std must now be "1"
print(np.std(Dataset_Standard, axis=0))

## Now our dataset is standardized by hand.
## Let's use Python's SKlearn StandardScaler() and compare.

###----------------------------------------
###-------------------------------------------
### Standardize your dataset with sklearn
###-------------------------------------------
print(DF)  ## This was the DF BEFORE you scaled it
## StandardScaler also centers by the mean and divides by the
## population (ddof=0) std, so it should match the by-hand version.
scaler = StandardScaler() ##Instantiate
DF=scaler.fit_transform(DF) ## Scale data -- NOTE: DF is now a numpy array, no longer a DataFrame
print(DF)
print(type(DF))
print(DF.shape)

## Compare
##
## Our by-hand standardized data is called Dataset_Standard
## The dataset we used Sklearn to standardize is called DF
print(Dataset_Standard)
print(DF)
## Make sure!! The rounded element-wise difference should be all 0
print(np.round(Dataset_Standard - DF)) ## The diff is 0


## Now - visualize the standardized Dataset_Standard

fig12 = plt.figure()
ax12 = fig12.add_subplot(projection='3d')
## Use the SAME axis limits so you can SEE the shift!
ax12.set_xlim(-1, 5)
ax12.set_ylim(-1, 5)
ax12.set_zlim(-5, 5)

x=Dataset_Standard.iloc[:,0]
y=Dataset_Standard.iloc[:,1] 
z=Dataset_Standard.iloc[:,2]

ax12.scatter(x,y,z, cmap="RdYlGn", edgecolor='k', s=200, c=DFLabel)
ax12.set_xlabel('X')
ax12.set_ylabel('Y')
ax12.set_zlabel('Z')
ax12.set_title('3D Plot of Dataset AFTER we Standardize')

## BUG FIX: save BEFORE show -- savefig() after show() writes a blank
## image once the figure has been closed.
plt.savefig("MyDataset_3DImage_Stand.jpeg")
plt.show()


## What does dividing by the std dev do?
## It rescales all the data by the same unit --
## it changes the units to "standard deviations":
    ## The data is no longer expressed in its original
    ## units (e.g., inches or pounds). Instead,
    ## it is expressed as z-scores, which represent
    ## how many standard deviations a value is from the mean.


###############################################
###--------------PERFORM PCA BY HAND - by step 
###############################################
## Here, let's take all the steps we need to reduce 
## our data to 2D using PCA -- by hand (not using a package).
##
####################################################
## First, our goal is to determine the directions
## of the data (orthogonal) with the largest variance.
##

## Calculate the covariance matrix to see how all the variables vary
## in comparison to each other and to themselves.

## The default behavior for np.cov is to divide by n-1 to provide an unbiased estimate of the population covariance

My_cov_matrix = np.cov(Dataset_Standard, rowvar=False)
## rowvar=False treats the COLUMNS as the variables (not the rows)
print(My_cov_matrix.shape)
print(My_cov_matrix)

## Let's make sure this makes sense.
## First, our Dataset_Standard has the following shape and look:
print(Dataset_Standard.shape)
print(Dataset_Standard)
## This means that we have a dataset with 15 points
## where each point is a row and is 3D. 
## The first point is  (1.432078      -1.353222     -1.305797)

##  You may need to review variance and covariance

## Our resulting cov matrix must be 3 by 3:
## it compares the variation between each variable and all others.
## NOTE(review): the data was standardized with the population std
## (ddof=0) but np.cov divides by n-1, so the diagonal comes out as
## roughly n/(n-1) (~1.07 here), not exactly 1 -- confirm if exact
## unit variance on the diagonal is expected.


## For example -- reproduce the (0,1) covariance entry by hand as the
## inner product of columns 0 and 1 divided by n-1 (mean is already 0).
print(Dataset_Standard.iloc[:,0])
print(Dataset_Standard.iloc[:,1])
## Inner product
inner_prod = np.inner(Dataset_Standard.iloc[:,0], Dataset_Standard.iloc[:,1])
print((inner_prod/(len(Dataset_Standard)-1)))

###
### ------------- PCA BY HAND ----------------
###

## Now that we have our covariance matrix, we want to calculate the
## new directions: vectors that point along the highest variation AND
## are orthogonal to each other.
## In other words, we are creating a new basis.

## Here, we calculate the eigenvectors and their associated
## eigenvalues. The vectors are the transformation that will "move"
## the current data so that the largest variation is along the new
## "x", the second along the new "y", and so on.
## The eigenvalues measure the amount of variation along each vector.
## NOTE(review): np.linalg.eig does NOT sort its output by eigenvalue,
## and for symmetric matrices np.linalg.eigh is the conventional
## choice -- eig happens to work fine for this small example.

eigenvalues, eigenvectors = np.linalg.eig(My_cov_matrix)
## (each !!column!! corresponds to an eigenvalue when using linalg.eig)
print("The E Vectors are:")
print(eigenvectors, "\n")
print("The E Values are:")
print(eigenvalues, "\n")


## --------------------
## Use the matrix of eigenvectors to transform the original dataframe.
## We will transform the data that we standardized.
## ------------------------------------
print("Here is the matrix of eigenvectors - column-wise: ")
print(eigenvectors)

## Visual of BEFORE and AFTER the transformation.
## Remember -- in this entire example we are using 3D data and leaving
## it as 3D after the transformation; we are NOT yet reducing the
## dimension of the data.

################# BEFORE the transformation
fig13 = plt.figure()
ax13 = fig13.add_subplot(projection='3d')
## Use the SAME axis limits so you can SEE the shift!
ax13.set_xlim(-1, 5)
ax13.set_ylim(-1, 5)
ax13.set_zlim(-5, 5)

x=Dataset_Standard.iloc[:,0]
y=Dataset_Standard.iloc[:,1] 
z=Dataset_Standard.iloc[:,2]

ax13.scatter(x,y,z, cmap="RdYlGn", edgecolor='k', s=200, c=DFLabel)
ax13.set_xlabel('X')
ax13.set_ylabel('Y')
ax13.set_zlabel('Z')
ax13.set_title('3D Plot of Dataset BEFORE we transform')

## BUG FIX: save BEFORE show -- savefig() after show() writes a blank
## image once the figure has been closed.
plt.savefig("BEFORE_MyDataset_3DImage_Stand.jpeg")
plt.show()

##################
##----------
## Now, transform the data using the eigenvectors
##
#########################################
## First - get the shape of the data
print(Dataset_Standard.shape)  ##15 by 3
## We know from above that our eigenvectors are column-wise, so
## RIGHT-multiplying the (15 x 3) data by the (3 x 3) eigenvector
## matrix projects every row onto the new basis.
New_Basis = Dataset_Standard@eigenvectors
print(New_Basis)
## Flip ALL the signs so this matches the sklearn PCA output.
## NOTE(review): sign ambiguity is per-component; a single global *-1
## only matches sklearn when every component happens to differ in sign
## -- true for this dataset, but not guaranteed in general.
New_Basis = New_Basis *-1
print(New_Basis)

####-----------Sign Ambiguity-------------------------
####
#### Important - there is something called "sign ambiguity"
#### in PCA. Read more about this in the following link:
#### https://python-bloggers.com/2024/12/understanding-why-sklearn-pca-differs-from-scratch-implementations/
####
#### While the sign doesn't affect the mathematical properties or the results
#### of the PCA transformation, it can lead to confusion when comparing results
#### across different implementations (or even different runs of the same
#### algorithm in some cases).
#### scikit-learn uses a function called svd_flip internally to enforce a
#### deterministic output. This function flips the signs of the eigenvectors
#### (which are derived from the Singular Value Decomposition in sklearn's
#### implementation) so that the element with the maximum absolute value
#### within each principal component always has a positive sign.

#########
## Plot the NEW and transformed data
#####################################

fig14 = plt.figure()
ax14 = fig14.add_subplot(projection='3d')
## Use the SAME axis limits so you can SEE the shift!
ax14.set_xlim(-1, 5)
ax14.set_ylim(-1, 5)
ax14.set_zlim(-5, 5)

x=New_Basis.iloc[:,0]
y=New_Basis.iloc[:,1] 
z=New_Basis.iloc[:,2]

ax14.scatter(x,y,z, cmap="RdYlGn", edgecolor='k', s=200, c=DFLabel)
ax14.set_xlabel('X')
ax14.set_ylabel('Y')
ax14.set_zlabel('Z')
ax14.set_title('3D Plot of Dataset AFTER we Transform')

## BUG FIX: save BEFORE show -- savefig() after show() writes a blank
## image once the figure has been closed.
plt.savefig("AFTER_Trans_MyDataset_3DImage_Stand.jpeg")
plt.show()



###############################################
###--------------PERFORM PCA Using Sklearn-----
##
## COMPARE the results below with above....
###############################################
print(DF)  ## DF is the standardized numpy array from StandardScaler
## Instantiate PCA and choose how many components.
## n_components=3 on 3-column data means NO dimension reduction --
## we keep all components so we can compare with the by-hand results.
MyPCA=PCA(n_components=3)
# Project the original data into the PCA space
Result=MyPCA.fit_transform(DF)
## Print the values of the first component 
#print(Result[:,0]) 
print(Result) ## Print the new (transformed) dataset

print("The eigenvalues:", MyPCA.explained_variance_)
## NOTICE that the eigenvalues from here are exactly the same as the
## eigenvalues calculated above using np.linalg.eig (both are based on
## the n-1 normalized covariance).
###----------------------------------------
###
### TAKE YOUR TIME
### and view all the code. 
### We are comparing "by hand" to Sklearn PCA
## to better understand PCA and to see that the
## answers are the same in either case. 
###
###----------------------------------------

MyCov=np.cov(Result.T)
print("Covar of the PC PCA Matrix: \n", MyCov) ## The variances (on the diagonal) match the eigenvalues; off-diagonals are ~0 since the PCs are uncorrelated
print("The relative eigenvalues are:",MyPCA.explained_variance_ratio_)
print("The actual eigenvalues are:", MyPCA.explained_variance_)

##---------------------------
## IMPORTANT !!
##------------------------------
## In sklearn (unlike doing this by hand above), the eigenvectors
## (principal components) are provided as !! ROWS !! in the components_
## attribute, not columns. It is CRITICAL to learn how a package or
## library lays this out so you have the right vectors.
## The "components_" attribute is an array with the shape
## (n_components, n_features).
## Each ROW represents one principal component (eigenvector).
EVects=MyPCA.components_
print("The eigenvectors are:\n",EVects)
print(type(EVects))
print("The shape of the eigenvector matrix is\n", EVects.shape)
#print(DF.shape)
print(EVects)
first_row = EVects[0]
print("The first eigenvector is:\n", first_row)

#####--------------------
## Here, we multiply the data by the transposed eigenvector matrix to
## get the transformation -- the same thing we get when we run
##     Result=MyPCA.fit_transform(DF)
## as we did above.
## You can confirm that DF@EVects.T is the same as "Result".
print(DF)
## Proof: transform the (standardized) data to the eigenbasis
## using the eigenvectors matrix.
print(EVects.T)
##  (15, 3) @ (3, 3)  -- DF has 3 columns at this point, not 4
Transf=DF@EVects.T
print("Proof that the transformed data is the EVects @ Data\n",Transf)
print("Result from above is:\n", np.round(Result,5))

#########
## Plot the data transformed via sklearn's eigenvectors
#####################################

fig15 = plt.figure()
ax15 = fig15.add_subplot(projection='3d')
## Use the SAME axis limits so you can SEE the shift!
ax15.set_xlim(-1, 5)
ax15.set_ylim(-1, 5)
ax15.set_zlim(-5, 5)

## Transf is a numpy array here, so index with [:, i] (no .iloc)
x=Transf[:,0]
y=Transf[:,1] 
z=Transf[:,2]

ax15.scatter(x,y,z, cmap="RdYlGn", edgecolor='k', s=200, c=DFLabel)
ax15.set_xlabel('X')
ax15.set_ylabel('Y')
ax15.set_zlabel('Z')
ax15.set_title('3D Plot of Dataset AFTER we Transform')

## BUG FIX: save BEFORE show -- savefig() after show() writes a blank
## image once the figure has been closed.
plt.savefig("AFTER_Trans_Using_SklearnPCA.jpeg")
plt.show()

################################################
# Extract loadings
## Rerun PCA with all columns - no dim reduction
############################################################
MyPCA2=PCA(n_components=3)
# Project the original data into the PCA space (fit is what we need here)
Result2=MyPCA2.fit_transform(DF)
print(MyPCA2.components_)          ## eigenvectors, ROW-wise
print(MyPCA2.components_.T)        ## transposed -> eigenvectors column-wise
print(MyPCA2.explained_variance_)  ## eigenvalues

print(OriginalDF)
print(len(OriginalDF))
print(OriginalDF.columns[0:3])

## (exploratory) print the column indices 1..n-1 of the original frame.
## Note: OriginalDF[0:3] slices ROWS, so its .columns equals
## OriginalDF.columns -- use the frame directly.
for i in range(1, len(OriginalDF.columns)):
    print(i)


print(MyPCA2.components_.T)      ## Now the eigenvectors are column-wise
## BUG FIX: the number of PCs was taken as len(DF[0:3]) -- the ROW
## count of a 3-row slice -- which only coincidentally equals the
## number of components. Read it from the fitted model instead.
n_pcs = MyPCA2.components_.shape[0]
columns=[f'PC{i}' for i in range(n_pcs)]
print(columns)
print(OriginalDF[0:2].columns)
## Loadings: one row per feature, one column per principal component
loadings = pd.DataFrame(MyPCA2.components_.T, columns=columns)
print(loadings)

## Report the most influential variables per principal component,
## using a threshold on the absolute loading values.
threshold = 0.4
# For each PC column, keep the row labels whose |loading| exceeds the
# threshold.
important_features = {
    column: loadings.index[loadings[column].abs() > threshold].tolist()
    for column in loadings.columns
}

print(important_features.items())

# 'important_features' now maps each PC to its important features
for pc, features in important_features.items():
    print(pc, features)


# Heatmap of the full loadings matrix
plt.figure(figsize=(10, 8))
sns.heatmap(loadings, annot=True, cmap='coolwarm')
plt.title('Feature Importance in Principal Components')
plt.show()


############################################
## Create Plot to Show Eigenvalues
############################################
fig3=plt.figure()
## Cumulative sum of the RAW eigenvalues (printed only).
## NOTE(review): the commented-out plt.step below would need the
## cumulative RATIOS (explained_variance_ratio_) to share an axis with
## the bar chart -- confirm before re-enabling it.
ACCUM_eigenvalues = np.cumsum(MyPCA.explained_variance_)
print(ACCUM_eigenvalues)
print(MyPCA.explained_variance_ratio_)
## Bar chart: fraction of the total variance captured by each PC
plt.bar(range(0,len(MyPCA.explained_variance_ratio_)), MyPCA.explained_variance_ratio_, 
        alpha=0.5, align='center', label='Individual Explained Variances')
#plt.step(range(0,len(ACCUM_eigenvalues)), ACCUM_eigenvalues, where='mid',label='Cumulative Explained Variances')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.title("Eigenvalues: Percentage of Variance/Information")
#plt.legend(loc='best')
plt.tight_layout()
plt.show()

###############################################
## Create a DF of the most important features
##################################################
## Number of principal components kept by MyPCA (3 here)
shape= MyPCA.components_.shape[0]
#print(shape)
## NOTE(review): the 4th name (SepalLengthCm) was dropped from DF
## earlier and can never be selected -- argmax over a 3-feature
## component only returns 0, 1, or 2. Verify the first three names
## match the column order of DF when PCA was fit.
feature_names=["SepalWidthCm", "PetalLengthCm", "PetalWidthCm", "SepalLengthCm"]

## For each PC, the index of the feature with the largest |loading|
most_important = [np.abs(MyPCA.components_[i]).argmax() for i in range(shape)]
most_important_names = [feature_names[most_important[i]] for i in range(shape)]

# Build a dictionary of the important features by PC
# (NOTE: this rebinds MyDic, which earlier held the species label map)
MyDic = {'PC{}'.format(i): most_important_names[i] for i in range(shape)}

# build the dataframe
Important_DF = pd.DataFrame(MyDic.items())
print(Important_DF)

###############################################
## Create a biplot of the most important features
##################################################
##
## Important note - check the TYPE. For the biplot code below to run
## we need a numpy array, and New_Basis is still a pandas DataFrame.

PCA_dataset= New_Basis ## recall that we ran the by-hand PCA with all 3 components to get this above
print(type(New_Basis))
PCA_dataset= np.array(PCA_dataset)  ## DataFrame -> ndarray so [:, i] indexing works
print(PCA_dataset)

## sklearn stores eigenvectors row-wise; transpose to column-wise
EVectors_as_columns=EVects.T
print(EVectors_as_columns)
## PCA  dataset
#print(PCA_dataset)



def biplot(PCA_dataset, EVectors_as_columns, labels=feature_names):
    """Draw a 2D biplot: PC1/PC2 scores plus the loading-vector arrows.

    PCA_dataset         : ndarray of PCA scores, one sample per row.
    EVectors_as_columns : eigenvector matrix; row i supplies arrow i
                          from its first two coordinates.
    labels              : names for the arrows; None falls back to
                          "Var1", "Var2", ...
    NOTE: reads the module-level DFLabel for the point colors.
    """
    scores_x = PCA_dataset[:, 0]
    scores_y = PCA_dataset[:, 1]

    # Rescale each axis to unit range so the scores and the loading
    # arrows are drawn on a comparable scale.
    scale_x = 1.0 / (scores_x.max() - scores_x.min())
    scale_y = 1.0 / (scores_y.max() - scores_y.min())
    plt.scatter(scores_x * scale_x, scores_y * scale_y,
                cmap="RdYlGn", edgecolor='k', s=100, c=DFLabel)

    # One arrow per row, labeled just past its tip.
    for i in range(EVectors_as_columns.shape[0]):
        tip_x = EVectors_as_columns[i, 0]
        tip_y = EVectors_as_columns[i, 1]
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.5)
        name = "Var" + str(i + 1) if labels is None else labels[i]
        plt.text(tip_x * 1.07, tip_y * 1.07, name,
                 color='g', ha='center', va='center')

# Plot biplot
plt.figure(figsize=(10, 20))
#figsize=(10, 16)
## Scores come from the by-hand PCA (sign-flipped above to match
## sklearn); arrows come from sklearn's eigenvectors.
biplot(PCA_dataset, EVectors_as_columns, labels=feature_names)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Biplot of Principal Components')
plt.show()