Bayesian Methods in Python/Sklearn

# -*- coding: utf-8 -*-
"""
@author: profa
"""

############################################################
##
## Naive Bayes Classification Code Example
## Gates
##
## This code is also a tutorial; be sure to
## read the comments and make any updates
## needed so you can run the code on your machine.
##
## !! Please note that when using code written by others
## it is a best practice to use the code as a 
## reference and resource - NOT as a copy/paste/hope.
##
## Different versions of Python and its libraries
## can have small effects on how code behaves.
## Always write your own code.
############################################################
## LINKS to the Datasets
## Dataset1: Is for Gaussian Naive Bayes
## https://drive.google.com/file/d/1Q4_uqxn2mQQWSrW5nCT9V1KfKrqCf7qW/view?usp=sharing
## Dataset2: Is for Categorical Naive Bayes
## https://drive.google.com/file/d/17pCixPmu-kHQpGcz9EXyEQ_-YTPO3H5e/view?usp=sharing
## Dataset3: Is for Bernoulli Naive Bayes
## https://drive.google.com/file/d/1H7i6k4yGiir6PMsurUgI7xbFUDXTj_hQ/view?usp=sharing
## Dataset4: Is for Multinomial Naive Bayes
## https://drive.google.com/file/d/1itzJ2c0HfTdTkz4Is5dYmTlMKfw10uww/view?usp=sharing

## Bring in the Libraries
## This code will largely use Sklearn
## RE: https://scikit-learn.org/stable/modules/naive_bayes.html#naive-bayes

##################
## NOTICE !! Read Me
##########################
##
## Because our datasets here are small,
## it can happen that when you split your data
## into Training and Testing sets, the Testing
## set will not contain all of the labels.
## This will break parts of the code below and
## can raise an error mentioning FixedLocator locations.
## The easiest fix is to rerun the code so that
## a new random split is drawn.
##
## You can also experiment with plotting the
## testing dataset (a good idea!) to ensure
## that it is **balanced**.
##################################################
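
## One way to avoid the missing-label problem entirely is to
## stratify the split on the label column so that every class
## shows up on both sides. A minimal sketch (my own addition;
## MyDF is a stand-in for any labeled DataFrame):
## Train, Test = train_test_split(MyDF, test_size=.3, stratify=MyDF["Decision"])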

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import OrdinalEncoder


##----------------------------------------
## Read in the datasets
## To do this, you will need to download
## the data and save it to a location that you know.
## !! UPDATE THIS PATH to the location where your datasets are.
##---------------------------------------------------------------
path="C:/Users/profa/Desktop/Datasets/"
dataset1 = "StudentSummerProgramData_3D_Labeled_1.csv"
dataset2 = "StudentSummerProgramData_3D_Labeled_2.csv"
dataset3 = "Pretend_Text_Dataset_Labeled_Bernoulli_Binary.csv"
dataset4 = "Pretend_Text_Dataset_Labeled.csv"

Dataset1_Gaussian = pd.read_csv(path+dataset1)
Dataset2_Categorical = pd.read_csv(path+dataset2)
Dataset3_Bernoulli = pd.read_csv(path+dataset3)
Dataset4_MN = pd.read_csv(path+dataset4)

## Have a look to make sure all datasets read in
print(Dataset1_Gaussian)
print(Dataset2_Categorical)
print(Dataset3_Bernoulli)
print(Dataset4_MN)

## ---------------------------------------------------
## !! Important !! ##
## When using Categorical Naive Bayes in Sklearn,
## we must "encode" our data with integers from
## 0 to n-1 (where n is the number of categories).
## For example, if we have a variable, GPA,
## that is represented as "A", "B", "C", "D", "F",
## it will need to be encoded as 0, 1, 2, 3, 4.
## ** Because the data we are using here is "ordinal",
## we will use OrdinalEncoder.
## However, the same idea applies for categorical data
## that is not ordinal.
## Here is how to encode our data so that we can use Categorical NB 
## --------------------------------------------------------------------

## First, create a list of all values for each variable IN ORDER
GPA=["A", "B", "C", "D", "F"]
## Instantiate the OrdinalEncoder and include "categories=[...]
## Here, we place GPA in the [] because that this the variable
## we are encoding first.
MyOrdEncoder=OrdinalEncoder(categories=[GPA])
Dataset2_Categorical["GPA"]=MyOrdEncoder.fit_transform(Dataset2_Categorical[["GPA"]])
print(Dataset2_Categorical)

## Do the same for TestScore
TestScore=["Excellent", "Good", "Medium"]
MyOrdEncoder=OrdinalEncoder(categories=[TestScore])
Dataset2_Categorical["TestScore"]=MyOrdEncoder.fit_transform(Dataset2_Categorical[["TestScore"]])
print(Dataset2_Categorical)

## Do the same for WritingScore
WritingScore=["High", "Medium", "Low"]
MyOrdEncoder=OrdinalEncoder(categories=[WritingScore])
Dataset2_Categorical["WritingScore"]=MyOrdEncoder.fit_transform(Dataset2_Categorical[["WritingScore"]])

## Now, notice when you print this dataset again
## it is all numbers (which are actually ordinal categories)
print(Dataset2_Categorical)
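
## Note (my own addition): the three columns above can also be
## encoded in one pass by handing OrdinalEncoder all three category
## lists at once, in column order. A sketch - do not run it after
## the columns are already numeric:
## Cols = ["GPA", "TestScore", "WritingScore"]
## OneEncoder = OrdinalEncoder(categories=[GPA, TestScore, WritingScore])
## Dataset2_Categorical[Cols] = OneEncoder.fit_transform(Dataset2_Categorical[Cols])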

## For each of our datasets, we will need to
## (1) Create Training and Testing Datasets
## (2) Remove and save the labels
## Note - there are more elegant ways to do this,
## but my goal is clarity for this tutorial.
## Feel free to improve - always!

## Note - to reduce typing, let's use
## G  for Gaussian
## C for Categorical
## B for Bernoulli
## MN  for Multinomial

## The name of the label for this dataset is "Decision"
## Split the dataset into Training and Testing Data
Training_G, Testing_G = train_test_split(Dataset1_Gaussian, test_size=.3)
print("Training G:", Training_G)
print("Testing G:", Testing_G)
##  Save the Labels and then remove them from the Training and Testing data
Training_G_Label = Training_G["Decision"]
Training_G=Training_G.drop(["Decision"], axis=1)
Testing_G_Label = Testing_G["Decision"]
Testing_G=Testing_G.drop(["Decision"], axis=1)
print("Testing G:", Testing_G)
print("Testing G labels:", Testing_G_Label)

##--> for Categorical -----------------------------------------
Training_C, Testing_C = train_test_split(Dataset2_Categorical, test_size=.3)
##  Save the Labels and then remove them from the Training and Testing data
Training_C_Label = Training_C["Decision"]
Training_C=Training_C.drop(["Decision"], axis=1)
Testing_C_Label = Testing_C["Decision"]
Testing_C=Testing_C.drop(["Decision"], axis=1)
print("Testing C:", Testing_C)
print("Testing C labels:", Testing_C_Label)
## -------------------------------------------------------------

##--> for Bernoulli -----------------------------------------
## Note: The name of the label here is "LABEL"
Training_B, Testing_B = train_test_split(Dataset3_Bernoulli, test_size=.4)
##  Save the Labels and then remove them from the Training and Testing data
Training_B_Label = Training_B["LABEL"]
Training_B=Training_B.drop(["LABEL"], axis=1)
Testing_B_Label = Testing_B["LABEL"]
Testing_B=Testing_B.drop(["LABEL"], axis=1)
print("Testing B:", Testing_B)
print("Testing B labels:", Testing_B_Label)
## -------------------------------------------------------------

##--> for Multinomial -----------------------------------------
## Note: The name of the label here is "LABEL"
Training_MN, Testing_MN = train_test_split(Dataset4_MN, test_size=.4)
##  Save the Labels and then remove them from the Training and Testing data
Training_MN_Label = Training_MN["LABEL"]
Training_MN=Training_MN.drop(["LABEL"], axis=1)
Testing_MN_Label = Testing_MN["LABEL"]
Testing_MN=Testing_MN.drop(["LABEL"], axis=1)
print("Testing MN:", Testing_MN)
print("Testing MN labels:", Testing_MN_Label)
## -------------------------------------------------------------
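
## The four split blocks above repeat the same pattern.
## A small helper can do the split and label removal in one call.
## (This function and its name are my own addition, not part of
## any library.)
def SplitAndDropLabel(DF, LabelName, TestSize):
    ## Randomly split, then pull the label column out of each side
    Train, Test = train_test_split(DF, test_size=TestSize)
    TrainLabel = Train[LabelName]
    TestLabel = Test[LabelName]
    Train = Train.drop([LabelName], axis=1)
    Test = Test.drop([LabelName], axis=1)
    return Train, TrainLabel, Test, TestLabel

## Example use (equivalent to the Gaussian split above):
## Training_G, Training_G_Label, Testing_G, Testing_G_Label = \
##     SplitAndDropLabel(Dataset1_Gaussian, "Decision", .3)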

## This will create one plot with all 4 confusion matrices in it
fig, ax = plt.subplots(2, 2) ## 2 by 2 subplot
####################################################
## Run Naive Bayes
####################################################

## For Gaussian Naive Bayes------------------------
## Instantiate first
MyGNB = GaussianNB()

## Training the model
## Notice we are using the specific dataset
## that we read in for the Gaussian NB
My_GNB_Model = MyGNB.fit(Training_G, Training_G_Label)
print(My_GNB_Model)
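
## GaussianNB fits a per-class mean and variance for each feature;
## you can inspect them after fitting. (Per the sklearn docs the
## variance attribute is var_ in recent releases and sigma_ in
## older ones, so this checks for both.)
print("Per-class feature means:\n", My_GNB_Model.theta_)
print("Per-class feature variances:\n",
      getattr(My_GNB_Model, "var_", getattr(My_GNB_Model, "sigma_", None)))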

## Predict the Testing Data using the model
Predictions_G=My_GNB_Model.predict(Testing_G)
print(Predictions_G)

## Print the probabilities
## Recall that Naive Bayes calculates the 
## probability of each label even though it
## only predicts the largest of those probabilities.
## Knowing the probabilities of each label
## will help you to see how "certain" (or not)
## the model was about its prediction.
print("The Gaussian NB Model Prediction Probabilities are:")
print(My_GNB_Model.predict_proba(Testing_G).round(3))
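
## Each row of predict_proba lines up with one test row. A small
## sketch (my own addition) pairing each predicted label with the
## largest of its class probabilities:
Probs_G = My_GNB_Model.predict_proba(Testing_G)
print(pd.DataFrame({"Prediction": Predictions_G,
                    "Max Probability": Probs_G.max(axis=1).round(3)}))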

## Confusion Matrix
CM_G = confusion_matrix(Testing_G_Label, Predictions_G)
print(CM_G)
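## Quick sanity check (my own addition): overall accuracy is the
## diagonal of the confusion matrix divided by the total count.
## sklearn.metrics.accuracy_score would give the same number.
print("Accuracy:", (np.trace(CM_G) / CM_G.sum()).round(3))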
## Pretty confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=CM_G,display_labels=My_GNB_Model.classes_)
ax[0][0].set_title("Gaussian Naive Bayes")
disp.plot(ax=ax[0][0])
#plt.show()
## --------------------------------------------------


## For Categorical Naive Bayes------------------------
## Instantiate first
MyCNB = CategoricalNB()
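## Note (my own addition): with a small dataset, the test split can
## contain a category value the model never saw in training, which
## makes predict() fail with an index error. CategoricalNB's
## min_categories parameter tells it how many categories each
## feature has, e.g. (if the feature columns are, in order,
## GPA, TestScore, WritingScore):
## MyCNB = CategoricalNB(min_categories=[5, 3, 3])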
print(Training_C)
print(Training_C_Label)
## Training the model
My_CNB_Model = MyCNB.fit(Training_C, Training_C_Label)
print(My_CNB_Model)

## Predict the Testing Data using the model
Predictions_C=My_CNB_Model.predict(Testing_C)
print(Predictions_C)

## Print the actual probabilities
print("The Categorical NB Model Prediction Probabilities are:")
print(My_CNB_Model.predict_proba(Testing_C).round(3))

## Confusion Matrix
CM_C = confusion_matrix(Testing_C_Label, Predictions_C)
print(CM_C)
## Pretty confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=CM_C,display_labels=My_CNB_Model.classes_)
ax[0][1].set_title("Categorical Naive Bayes")
disp.plot(ax=ax[0][1])
#plt.show()
## --------------------------------------------------

## For Bernoulli Naive Bayes------------------------
## Instantiate first
MyBNB = BernoulliNB()
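## Note (my own addition): BernoulliNB assumes binary (0/1)
## features, which this dataset already has. If your features
## were counts instead, the binarize parameter thresholds them
## on the way in (values above the threshold become 1), e.g.:
## MyBNB = BernoulliNB(binarize=1.0)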
print(Training_B)
print(Training_B_Label)
## Training the model
My_BNB_Model = MyBNB.fit(Training_B, Training_B_Label)
print(My_BNB_Model)
print(My_BNB_Model.classes_)

## Predict the Testing Data using the model
Predictions_B=My_BNB_Model.predict(Testing_B)
print(Predictions_B)

## Print the actual probabilities
print("The Bernoulli NB Model Prediction Probabilities are:")
print(My_BNB_Model.predict_proba(Testing_B).round(3))

## Confusion Matrix
CM_B = confusion_matrix(Testing_B_Label, Predictions_B)
print(CM_B)
## Pretty confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=CM_B,display_labels=My_BNB_Model.classes_)
ax[1][0].set_title("Bernoulli Naive Bayes")
disp.plot(ax=ax[1][0])
#plt.show()
## --------------------------------------------------

## For Multinomial Naive Bayes------------------------
## Instantiate first
MyMN = MultinomialNB()
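## Note (my own addition): MultinomialNB applies Laplace/Lidstone
## smoothing through the alpha parameter (default alpha=1.0);
## smaller values mean less smoothing, e.g.:
## MyMN = MultinomialNB(alpha=0.5)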
print(Training_MN)
print(Training_MN_Label)
## Training the model
My_MN_Model = MyMN.fit(Training_MN, Training_MN_Label)
print(My_MN_Model)
print(My_MN_Model.classes_)

## Predict the Testing Data using the model
Predictions_MN=My_MN_Model.predict(Testing_MN)
print(Predictions_MN)

## Print the actual probabilities
print("The Multinomial NB Model Prediction Probabilities are:")
print(My_MN_Model.predict_proba(Testing_MN).round(3))

## Confusion Matrix
CM_MN = confusion_matrix(Testing_MN_Label, Predictions_MN)
print(CM_MN)
## Pretty confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=CM_MN,display_labels=My_MN_Model.classes_)
plt.title("Multinomial Naive Bayes")
disp.plot(ax=ax[1][1])
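## Space out the four subplots before showing them
## (tight_layout is a standard matplotlib call):
fig.tight_layout()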
plt.show()
## --------------------------------------------------