Multinomial ANNs with One-Hot Encoding (OHE) and Categorical Cross-Entropy (CCE)

Example 1: This example is intentionally very simple and hard-coded for learning purposes. To see random initialization, options, etc., scroll down to Example 2.




# -*- coding: utf-8 -*-
"""
Created on Sat Oct  4 11:32:23 2025

@author: profa
"""

## NN - FF and BP
## Gates

## Multiple outputs - 3 labels
## 3D data
## One- hot Encoding
## Categorical Cross Entropy
## Softmax

## Special Example for Module 2 and as an 
## example partial "solution" to the assignment

## Some of this code is hard-coded for simplicity

## ###################################################     

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import statistics



## DATAset - hard coded for ease and simplicity
X = np.array(( [[2 ,  4  ,  2],  
               [-1  , 0 ,  -2],
               [3 ,  5 ,   1],
               [.4 , .1 , .03]]  ))

print("The dataset X is:\n", X)
print(type(X))
print(X.shape)

y = np.array(([[1, 0, 1, 2]])).T
print("y is\n", y)
original_y_values=y # save a copy
print(type(y))
print(y.shape)

## Normally - its best to Normalize the data (not the label!)
## However, here I will not so we can compare our
## by-hand math with out code results directly and easily
## or use min/max normalized_df=(df-df.min())/(df.max()-df.min())

## Define the architecture
InputColumns = 3
NumberOfLabels = 3
n = len(X) ## number of rows of entire X
print("The number of rows is:", n)
## Take the label off of X and make it a numpy array
LR=.01
LRB = .01
#................................................

###################### Creating one hot labels for y ------------------
## Keep a copy of the original labels
temp = y
print("The current label categoies are:\n", temp)
## Using Python/Sklearn OneHotEncoder
## https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

## Instantiate
MyOHE = OneHotEncoder()
## Run the encoder
Encoded = MyOHE.fit_transform(y).toarray()
#print(Encoded)
## Set y to the encoded result
y = Encoded
print("OH Encoded y is:\n ", y)
## Notice that [1    0    0]  is 0
## and that    [0    1    0]  is 1
## and that    [0    0    1]  is 2

## Its a good idea to identify and note this for you
## and other readers
##################------------------------------------
  

class NeuralNetwork(object):
    """A tiny 3-2-3 feed-forward network trained with backpropagation.

    Architecture: 3 input features -> 2 ReLU hidden units -> 3 softmax
    outputs, trained against one-hot labels with categorical cross-entropy.
    Weights and biases are hard-coded (not random) so that the printed
    results can be checked against by-hand calculations.

    NOTE(review): reads the module-level globals InputColumns, n, LR, and
    LRB; they must be defined before this class is instantiated/trained.
    """

    def __init__(self):
        """Set the layer sizes and the hard-coded starting weights/biases."""
        self.InputNumColumns = InputColumns  ## columns of X (input features)
        self.OutputSize = 3 ## Categories (output classes)
        self.HiddenUnits = 2   ## one hidden layer with h units
        self.n = n  ## number of training examples, n

        print("Initialize NN\n")
        ##self.W1 = np.random.randn(self.InputNumColumns, self.HiddenUnits) # c by h
        ## Here we DO NOT WANT to randomize W1
        ## Instead, create the W1 to match the numbers you are using
        ## This allows you to compare your by-hand work to the code results
        self.W1 = np.array(( [[1 , 2],
                              [3 , 4],
                              [5 , 6]] ))
        print("INIT W1 is\n", self.W1)

        ## Do the same for W2, b, and c
        ##---------------------------------------------
        #self.W2 = np.random.randn(self.HiddenUnits, self.OutputSize) # h by o
        self.W2 = np.array(( [[0  ,  -.1 , .1],
                              [-.1, -.2 , -.2]]))
        print("W2 is:\n", self.W2)

        #self.b = np.random.randn(1, self.HiddenUnits)
        self.b = np.array(([[0, 0]]))  ## biases for the hidden layer
        print("The b's are:\n", self.b)

        #self.c = np.random.randn(1, self.OutputSize)
        self.c = np.array(([[0, 0, 0]]))  ## bias for the output layer
        print("The c is\n", self.c)

    def FeedForward(self, X):
        """Propagate X through the network and return the softmax outputs Y^.

        X is n by c; returns an n by OutputSize matrix of class
        probabilities (each row sums to 1).
        """
        print("FeedForward\n\n")
        self.z = (np.dot(X, self.W1)) + self.b
        # X is n by c,  W1 is c by h  -->  z is n by h
        print("Z1 is:\n", self.z)

        ## Hidden activation: ReLU (swap in self.Sigmoid(self.z) to use Sigmoid)
        #self.h = self.Sigmoid(self.z)
        self.h = self.ReLU(self.z)
        print("H is:\n", self.h)

        self.z2 = (np.dot(self.h, self.W2)) + self.c # n by h  @  h by o  -->  n by o
        print("Z2 is:\n", self.z2)

        ## Softmax turns the output layer's scores into probabilities
        output = self.Softmax(self.z2)
        print("output Y^ (SM of Z2) is:\n", np.round(output,5))
        return output

    def Sigmoid(self, s, deriv=False):
        """Sigmoid activation.

        With deriv=True, s must ALREADY be sigmoid(x); the return value
        s*(1-s) is then the derivative with respect to x.
        """
        if (deriv == True):
            return s * (1 - s)
        return 1/(1 + np.exp(-s))

    def ReLU(self, s, deriv=False):
        """ReLU activation; with deriv=True, returns the 0/1 gradient mask.

        The deriv path is called with s = ReLU(z) (all values >= 0); the
        mask is 1 where the unit was active and 0 where it was clamped.
        """
        if (deriv == True):
            dx = np.copy(s)  # copy so the stored activations are not clobbered
            print("relu s is:\n", s)
            dx[dx <= 0] = 0
            dx[dx > 0] = 1
            return dx
        print("relu s is:\n", s)
        return np.maximum(0, s)

    def Softmax(self, M):
        """Row-wise softmax of M.

        FIX: subtract each row's max before exponentiating. Softmax is
        shift-invariant, so the result is unchanged, but this prevents
        np.exp from overflowing to inf for large scores.
        """
        print("M is\n", M)
        expM = np.exp(M - np.max(M, axis=1, keepdims=True))
        SM=expM/np.sum(expM, axis=1)[:,None]
        print("SM is\n",SM )
        return SM

    def BackProp(self, X, y, output):
        """One gradient step: update W1, W2, b, and c from the batch error.

        Uses the softmax + categorical-cross-entropy shortcut, so the
        output-layer delta is simply (Y^ - Y) with no extra derivative
        factor. Weight gradients sum over examples (implicitly, via the
        transposed matmuls); the bias updates use the mean over examples.
        """
        print("\n\nBackProp\n")
        self.LR = LR
        self.LRB=LRB  ## LR for biases

        # Y^ - Y : output-layer delta (softmax + CCE shortcut)
        self.output_error = output - y
        self.output_delta = self.output_error

        ##(Y^ - Y)(W2): error pushed back through the output weights
        self.D_Error_W2 = self.output_delta.dot(self.W2.T)
        print("(Y^ - Y) is\n",self.output_delta)
        print("W2.T is\n", self.W2.T)
        print(" (Y^ - Y) @ W2.T\n", self.D_Error_W2)

        ## Hidden-layer delta: gate by the activation derivative
        ## (use self.Sigmoid(self.h, deriv=True) when the hidden layer is Sigmoid)
        #self.H_D_Error_W2 = self.D_Error_W2 * self.Sigmoid(self.h, deriv=True)
        self.H_D_Error_W2 = self.D_Error_W2 * self.ReLU(self.h, deriv=True)

        ################------UPDATE weights and biases ------------------
        print("Old W1: \n", self.W1)
        print("Using sum gradient........\n")
        ## The sum over examples occurs implicitly in the X.T matmul

        ## dW1 ==>  (X.T) @ [hidden-layer delta]
        self.X_H_D_Error_W2 = X.T.dot(self.H_D_Error_W2) ## this is dW1

        ## dW2 ==> (H.T) @ (Y^ - Y)
        self.h_output_delta = self.h.T.dot(self.output_delta) ## this is dW2

        self.W1 = self.W1 - self.LR*(self.X_H_D_Error_W2) # c by h
        self.W2 = self.W2 - self.LR*(self.h_output_delta)

        print("The mean of the b update is\n", np.mean(self.H_D_Error_W2, axis=0))
        print("The b biases before the update are:\n", self.b)
        self.b = self.b  - self.LRB*np.mean(self.H_D_Error_W2, axis=0)
        print("Updated bs are:\n", self.b)

        self.c = self.c - self.LR*np.mean(self.output_delta, axis=0)
        print("Updated c's are:\n", self.c)

        print("The W1 is: \n", self.W1)
        print("The W1 gradient is: \n", self.X_H_D_Error_W2)
        print("The W2 gradient  is: \n", self.h_output_delta)
        print("The mean biases b gradient is:\n",np.mean(self.H_D_Error_W2, axis=0 ))
        print("The mean bias c gradient is: \n", np.mean(self.output_delta, axis=0))

    def TrainNetwork(self, X, y):
        """Run one feed-forward + backprop pass; return the pre-update Y^."""
        output = self.FeedForward(X)
        self.BackProp(X, y, output)
        return output

#-------------------------------------------------------------------        
MyNN = NeuralNetwork()

AvgLossList=[]   # mean categorical cross-entropy per epoch, for plotting
Epochs=30

for i in range(Epochs):
    print("\nRUN:\n ", i)
    ## One full-batch pass: feed-forward, backprop update, return Y^
    output=MyNN.TrainNetwork(X, y)

    ## LOSS: categorical cross-entropy = mean over rows of -sum(y * log(y^))
    print("The value of y here is:\n", y)
    print("The value of output here is:\n", output)

    output_log=np.log(output)
    print("The log of the output is:\n", output_log)
    print("This is the product of -y times log of output:\n",-y * output_log )
    row_sums_list = [sum(row) for row in (-y * output_log)]
    print("The row sums of logs are:\n", row_sums_list)
    print(type(row_sums_list))
    # Calculate the average of the row sums
    AvgLoss = statistics.mean(row_sums_list)
    print("The mean error is:\n", AvgLoss)

    AvgLossList.append(AvgLoss)
    print("The average loss list is\n", AvgLossList)

    ## OUTPUT: predicted class = column index of the largest probability
    numeric_output=np.argmax(output, axis=1)

###################-output and vis----------------------

import matplotlib.pyplot as plt

## Plot the per-epoch average loss
fig1 = plt.figure()
ax = plt.axes()
x = np.arange(Epochs)  # FIX: epoch indices 0..Epochs-1; linspace(0, Epochs, Epochs) mislabels the x-axis
ax.plot(x, AvgLossList)
plt.show()

## FIX THE SHAPES FIRST!!
## confusion_matrix expects 1-D label arrays, so flatten the column vector.
print(numeric_output.shape)
print(original_y_values.shape)

print("The prediction accuracy via confusion matrix is:\n")
## FIX: sklearn's convention is confusion_matrix(y_true, y_pred); the
## original passed the predictions first, which transposes the matrix.
print(confusion_matrix(original_y_values.ravel(), numeric_output))

Example 2: This example has options; you will need to review and understand the code to use them. You can run it with either ReLU or Sigmoid as the hidden-layer activation. Right now it uses ReLU. To switch to Sigmoid, you must comment out the ReLU lines and uncomment the Sigmoid lines.



## NN - FF and BP
## Gates

## Multiple outputs - 3 labels
## 3D data
## One- hot Encoding
## Categorical Cross Entropy
## Softmax

## Special Example for Module 2 and as an 
## example partial "solution" to the assignment

## Some of this code is hard-coded for simplicity

## ###################################################     

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import statistics


datafile="C:/Users/profa/Desktop/UCB/Classes/NN_CSCI_5122/IRIS_copy.csv"

## !! Update this to YOUR path
DF = pd.read_csv(datafile)
print(DF)

## Set y to the label. Check the shape!
y = np.array(DF.iloc[:,4])
y = np.array([y]).T
print("y is\n", y)
print("The shape of y is\n", y.shape) 
## DATAset - hard coded for ease and simplicity
X =np.array(DF.iloc[:,[0,1,2,3]])

print("The dataset X is:\n", X)
print(type(X))
print(X.shape)



## Normally - its best to Normalize the data (not the label!)
## However, here I will not so we can compare our
## by-hand math with out code results directly and easily
## or use min/max normalized_df=(df-df.min())/(df.max()-df.min())

## Define the architecture
InputColumns = 4
NumberOfLabels = 3
n = len(X) ## number of rows of entire X
print("The number of rows is:", n)
## Take the label off of X and make it a numpy array
LR=.01
LRB = .01
#................................................

###################### Creating one hot labels for y ------------------
## Keep a copy of the original labels
temp = y
original_y_values=y
print("The current label categoies are:\n", temp)
## Using Python/Sklearn OneHotEncoder
## https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

## Instantiate
MyOHE = OneHotEncoder()
## Run the encoder
Encoded = MyOHE.fit_transform(y).toarray()
#print(Encoded)
## Set y to the encoded result
y = Encoded
print("OH Encoded y is:\n ", y)
## Notice that [1    0    0]  is 0
## and that    [0    1    0]  is 1
## and that    [0    0    1]  is 2

## Its a good idea to identify and note this for you
## and other readers
##################------------------------------------
  

class NeuralNetwork(object):
    """A 4-2-3 feed-forward network trained with backpropagation.

    Architecture: 4 input features -> 2 ReLU hidden units -> 3 softmax
    outputs, trained against one-hot labels with categorical cross-entropy.
    Weights and biases are randomly initialized.

    NOTE(review): reads the module-level globals InputColumns, n, LR, and
    LRB; they must be defined before this class is instantiated/trained.
    """

    def __init__(self):
        """Set the layer sizes and random starting weights/biases."""
        self.InputNumColumns = InputColumns  ## columns of X (input features)
        self.OutputSize = 3 ## Categories (output classes)
        self.HiddenUnits = 2   ## one hidden layer with h units
        self.n = n  ## number of training examples, n

        print("Initialize NN\n")
        ## Random initialization (use hard-coded arrays instead when
        ## comparing against by-hand math)
        self.W1 = np.random.randn(self.InputNumColumns, self.HiddenUnits) # c by h
        print("INIT W1 is\n", self.W1)

        self.W2 = np.random.randn(self.HiddenUnits, self.OutputSize) # h by o
        print("W2 is:\n", self.W2)

        self.b = np.random.randn(1, self.HiddenUnits)  ## biases for the hidden layer
        print("The b's are:\n", self.b)

        self.c = np.random.randn(1, self.OutputSize)  ## bias for the output layer
        print("The c is\n", self.c)

    def FeedForward(self, X):
        """Propagate X through the network and return the softmax outputs Y^.

        X is n by c; returns an n by OutputSize matrix of class
        probabilities (each row sums to 1).
        """
        print("FeedForward\n\n")
        self.z = (np.dot(X, self.W1)) + self.b
        # X is n by c,  W1 is c by h  -->  z is n by h
        print("Z1 is:\n", self.z)

        ## Hidden activation: ReLU (swap in self.Sigmoid(self.z) to use Sigmoid)
        #self.h = self.Sigmoid(self.z)
        self.h = self.ReLU(self.z)
        print("H is:\n", self.h)

        self.z2 = (np.dot(self.h, self.W2)) + self.c # n by h  @  h by o  -->  n by o
        print("Z2 is:\n", self.z2)

        ## Softmax turns the output layer's scores into probabilities
        output = self.Softmax(self.z2)
        print("output Y^ (SM of Z2) is:\n", np.round(output,5))
        return output

    def Sigmoid(self, s, deriv=False):
        """Sigmoid activation.

        With deriv=True, s must ALREADY be sigmoid(x); the return value
        s*(1-s) is then the derivative with respect to x.
        """
        if (deriv == True):
            return s * (1 - s)
        return 1/(1 + np.exp(-s))

    def ReLU(self, s, deriv=False):
        """ReLU activation; with deriv=True, returns the 0/1 gradient mask.

        The deriv path is called with s = ReLU(z) (all values >= 0); the
        mask is 1 where the unit was active and 0 where it was clamped.
        """
        if (deriv == True):
            dx = np.copy(s)  # copy so the stored activations are not clobbered
            print("relu s is:\n", s)
            dx[dx <= 0] = 0
            dx[dx > 0] = 1
            return dx
        print("relu s is:\n", s)
        return np.maximum(0, s)

    def Softmax(self, M):
        """Row-wise softmax of M.

        FIX: subtract each row's max before exponentiating. Softmax is
        shift-invariant, so the result is unchanged, but this prevents
        np.exp from overflowing to inf for large scores.
        """
        print("M is\n", M)
        expM = np.exp(M - np.max(M, axis=1, keepdims=True))
        SM=expM/np.sum(expM, axis=1)[:,None]
        print("SM is\n",SM )
        return SM

    def BackProp(self, X, y, output):
        """One gradient step: update W1, W2, b, and c from the batch error.

        Uses the softmax + categorical-cross-entropy shortcut, so the
        output-layer delta is simply (Y^ - Y) with no extra derivative
        factor. Weight gradients sum over examples (implicitly, via the
        transposed matmuls); the bias updates use the mean over examples.
        """
        print("\n\nBackProp\n")
        self.LR = LR
        self.LRB=LRB  ## LR for biases

        # Y^ - Y : output-layer delta (softmax + CCE shortcut)
        self.output_error = output - y
        self.output_delta = self.output_error

        ##(Y^ - Y)(W2): error pushed back through the output weights
        self.D_Error_W2 = self.output_delta.dot(self.W2.T)
        print("(Y^ - Y) is\n",self.output_delta)
        print("W2.T is\n", self.W2.T)
        print(" (Y^ - Y) @ W2.T\n", self.D_Error_W2)

        ## Hidden-layer delta: gate by the activation derivative
        ## (use self.Sigmoid(self.h, deriv=True) when the hidden layer is Sigmoid)
        #self.H_D_Error_W2 = self.D_Error_W2 * self.Sigmoid(self.h, deriv=True)
        self.H_D_Error_W2 = self.D_Error_W2 * self.ReLU(self.h, deriv=True)

        ################------UPDATE weights and biases ------------------
        print("Old W1: \n", self.W1)
        print("Using sum gradient........\n")
        ## The sum over examples occurs implicitly in the X.T matmul

        ## dW1 ==>  (X.T) @ [hidden-layer delta]
        self.X_H_D_Error_W2 = X.T.dot(self.H_D_Error_W2) ## this is dW1

        ## dW2 ==> (H.T) @ (Y^ - Y)
        self.h_output_delta = self.h.T.dot(self.output_delta) ## this is dW2

        self.W1 = self.W1 - self.LR*(self.X_H_D_Error_W2) # c by h
        self.W2 = self.W2 - self.LR*(self.h_output_delta)

        print("The mean of the b update is\n", np.mean(self.H_D_Error_W2, axis=0))
        print("The b biases before the update are:\n", self.b)
        self.b = self.b  - self.LRB*np.mean(self.H_D_Error_W2, axis=0)
        print("Updated bs are:\n", self.b)

        self.c = self.c - self.LR*np.mean(self.output_delta, axis=0)
        print("Updated c's are:\n", self.c)

        print("The W1 is: \n", self.W1)
        print("The W1 gradient is: \n", self.X_H_D_Error_W2)
        print("The W2 gradient  is: \n", self.h_output_delta)
        print("The mean biases b gradient is:\n",np.mean(self.H_D_Error_W2, axis=0 ))
        print("The mean bias c gradient is: \n", np.mean(self.output_delta, axis=0))

    def TrainNetwork(self, X, y):
        """Run one feed-forward + backprop pass; return the pre-update Y^."""
        output = self.FeedForward(X)
        self.BackProp(X, y, output)
        return output

#-------------------------------------------------------------------        
MyNN = NeuralNetwork()

AvgLossList=[]   # mean categorical cross-entropy per epoch, for plotting
Epochs=500

for i in range(Epochs):
    print("\nRUN:\n ", i)
    ## One full-batch pass: feed-forward, backprop update, return Y^
    output=MyNN.TrainNetwork(X, y)

    ## LOSS: categorical cross-entropy = mean over rows of -sum(y * log(y^))
    output_log=np.log(output)
    row_sums_list = [sum(row) for row in (-y * output_log)]
    print(type(row_sums_list))
    # Calculate the average of the row sums
    AvgLoss = statistics.mean(row_sums_list)
    print("The mean error is:\n", AvgLoss)

    AvgLossList.append(AvgLoss)
    print("The average loss list is\n", AvgLossList)

    ## OUTPUT: predicted class = column index of the largest probability
    numeric_output=np.argmax(output, axis=1)

###################-output and vis----------------------

import matplotlib.pyplot as plt

## Plot the per-epoch average loss
fig1 = plt.figure()
ax = plt.axes()
x = np.arange(Epochs)  # FIX: epoch indices 0..Epochs-1; linspace(0, Epochs, Epochs) mislabels the x-axis
ax.plot(x, AvgLossList)
plt.show()

## FIX THE SHAPES AND LABEL TYPES FIRST!!
print(numeric_output.shape)
## FIX: numeric_output holds integer column indices (0..2) while
## original_y_values holds the original string species labels; passing both
## to confusion_matrix makes sklearn reject the mixed label types. Map each
## predicted column index back to its label name via the fitted encoder's
## categories_ (columns are ordered by the sorted unique labels).
predicted_labels = MyOHE.categories_[0][numeric_output]
print(predicted_labels.shape)
print(original_y_values.shape)

print("The prediction accuracy via confusion matrix is:\n")
## FIX: sklearn's convention is confusion_matrix(y_true, y_pred); the
## original passed the predictions first, which transposes the matrix.
print(confusion_matrix(original_y_values.ravel(), predicted_labels))