## Simple PCA Using Python/Sklearn

####-----------------------------------------------------------
#### PCA and Pairwise Correlation
##   3D Scatterplot
##   !! Dataset !!  (A sample of this data was used in this code)
##   Link to Full Iris Dataset from Kaggle:
##   https://www.kaggle.com/datasets/uciml/iris?resource=download
##
## Gates, 2024
####------------------------------------------------------------
##
## Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from IPython.display import clear_output
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

## This is a small sample of the Iris dataset (link above)
## !!! This is MY path :)  YOU need to update this to be YOUR path !!!
path = "/Datasets/Iris_Subset2.csv"
DF = pd.read_csv(path)
print(DF)
##--------------------------------
## Pull the label column out of the dataframe, then
## recode the species names ("Iris-setosa", ...) as the
## integers 0/1/2. Integer labels are needed later so the
## scatter plot can color each point by species.
##---------------------------------------
DFLabel = DF["Species"]   ## Save the label column
print(DFLabel)            ## Show the raw labels
print(type(DFLabel))      ## Confirm this is a pandas Series
## Map each species name to an integer code
species_to_int = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2}
DFLabel = DFLabel.map(species_to_int)
print(DFLabel)            ## Confirm the remapped labels
## Drop the label so only the numeric features remain
DF = DF.drop(["Species"], axis=1)
print(DF)                 ## Confirm the label is gone

###-------------------------------------------
### Standardize the features to zero mean and
### unit variance (PCA is scale-sensitive).
### NOTE: fit_transform returns a numpy array,
### so DF is no longer a pandas DataFrame here.
###-------------------------------------------
my_scaler = StandardScaler()        ## Instantiate the scaler
DF = my_scaler.fit_transform(DF)    ## Fit on the data, then transform it
print(DF)

###############################################
###--------------PERFORM PCA------------------
###############################################
## Project the standardized data onto its top 3 principal components.
## (MyPCA and Result are reused by the plotting/feature code below.)
MyPCA = PCA(n_components=3)
Result = MyPCA.fit_transform(DF)
print(Result[:, 0])   ## Scores on the first principal component
print(Result)         ## The full transformed dataset
print("The explained variances are:")
print(MyPCA.explained_variance_ratio_)

#################################################
## Visualize the transformed 3D dataset
## we just created using PCA
#################################################
fig2 = plt.figure(figsize=(12, 12))
## FIX: `Axes3D(fig, ...)` stopped auto-registering the axes with the
## figure in matplotlib 3.4+, which yields a blank window. Create the
## 3D axes through the figure instead, then set the viewpoint.
ax2 = fig2.add_axes([0, 0, .90, 1], projection='3d')
ax2.view_init(elev=48, azim=134)

x = Result[:, 0]
y = Result[:, 1]
z = Result[:, 2]

## Color each point by its integer species label (0/1/2)
ax2.scatter(x, y, z, cmap="RdYlGn", edgecolor='k', s=200, c=DFLabel)
## Hide tick labels -- PCA axis units are not meaningful to a reader.
## FIX: ax.w_xaxis / w_yaxis / w_zaxis were removed in matplotlib 3.8;
## the supported spelling is ax.xaxis / yaxis / zaxis.
ax2.xaxis.set_ticklabels([])
ax2.yaxis.set_ticklabels([])
ax2.zaxis.set_ticklabels([])

plt.show()

###############################################
## Create a DF of the most important features
##################################################
## Number of principal components we kept
n_components = MyPCA.components_.shape[0]
#print(n_components)
## NOTE(review): this list must match the column order of the CSV after
## "Species" is dropped -- verify against the dataset file, since the
## standard Iris column order is SepalLength, SepalWidth, PetalLength,
## PetalWidth, which differs from the order listed here.
feature_names = ["SepalWidthCm", "PetalLengthCm", "PetalWidthCm", "SepalLengthCm"]

## For each component, the "most important" feature is the one with
## the largest absolute loading (weight) in that component.
top_idx = [np.abs(MyPCA.components_[pc]).argmax() for pc in range(n_components)]
top_names = [feature_names[idx] for idx in top_idx]

## Build a dictionary of the important features keyed by PC
MyDic = {'PC{}'.format(pc): top_names[pc] for pc in range(n_components)}

## Build the dataframe from the (PC, feature-name) pairs
Important_DF = pd.DataFrame(MyDic.items())
print(Important_DF)