CNN “By Hand” in Python vs. Keras


#####################################################
##
## The following code was written
## by one of my excellent students,
## Ujas Shah.
##
## This code offers two examples:
## the first is a CNN built "by hand",
## meaning that it does not use a
## deep learning package.
##
## The second example uses the same input
## but uses TensorFlow/Keras.
##
## This code is being shared simply
## for interest and educational purposes.
##
#####################################################


import numpy as np
import random
from tensorflow.keras.datasets import mnist
from skimage.util import view_as_windows
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import CategoricalCrossentropy

(trainX, trainY), (testX, testY) = mnist.load_data()
# one hot encode the labels, e.g. 5 -> [0,0,0,0,0,1,0,0,0,0]
trainY = to_categorical(trainY)
testY = to_categorical(testY)
# cast to float32 and scale pixel values to the range 0-1
trainX = trainX.astype('float32') / 255.0
testX = testX.astype('float32') / 255.0

def relu(x):
    return np.maximum(0,x)

def softmax(x):
    # subtract the max before exponentiating for numerical stability
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
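
# Quick sanity check (illustrative only, values assumed here): subtracting the max
# keeps np.exp from overflowing on large logits while leaving the probabilities
# unchanged, e.g. softmax([1000, 1001]) is roughly [0.269, 0.731].
assert np.isclose(softmax(np.array([1000.0, 1001.0])).sum(), 1.0)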

def initialize_weights(seed):
    np.random.seed(seed)
    
    # single 3x3 convolution kernel and its bias
    kernel = np.random.rand(3,3).astype('float32')
    kernel_bias = np.random.randn(1,1).astype('float32')
    
    # a 28x28 image convolved with a 3x3 kernel (stride 1, no padding) gives a
    # 26x26 = 676 element feature map, hence 676 inputs to the dense layer
    dense_weights = np.random.randn(676,10).astype('float32')
    dense_biases = np.random.randn(1,10).astype('float32')
    
    return kernel, kernel_bias, dense_weights, dense_biases

def convolution_layer(images, kernel, kernel_bias):
    k_h, k_w = kernel.shape
    i_h, i_w = images.shape
    
    # `view_as_windows` creates the different blocks of the image that the kernel
    # has to slide over. These blocks are then reshaped into row vectors of a 2-D
    # matrix, so the cross-correlation becomes a simple matrix multiplication.
    windows = view_as_windows(images, (k_h, k_w), step=1).reshape(-1, k_h * k_w)  # 676 x 9
    convolution = windows @ kernel.reshape(k_h * k_w, -1) + kernel_bias           # 676 x 1
    
    # return both the activated and the pre-activation feature maps
    # (the pre-activation values are needed for the relu derivative in backprop)
    return relu(convolution), convolution

def dense_layer(in_matrix, dense_weights, dense_biases):
    # flatten the 676 x 1 feature map (via the transpose) and map it to 10 class scores
    return softmax(np.dot(in_matrix.T, dense_weights) + dense_biases)
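
# To make the im2col trick in convolution_layer concrete (illustrative only; the
# _demo name is introduced just for this check): a 28x28 MNIST image with a 3x3
# kernel and stride 1 yields 26*26 = 676 overlapping windows, so the windows
# matrix is 676 x 9 and the reshaped kernel is 9 x 1.
_demo_windows = view_as_windows(trainX[0], (3, 3), step=1)
assert _demo_windows.shape == (26, 26, 3, 3)
assert _demo_windows.reshape(-1, 9).shape == (676, 9)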

def backprop(output, Y, conv_output, dense_weights, before_relu, X, lr, 
             kernel, kernel_bias, dense_biases):

    """
    l: loss
    z: output before softmax (the dense layer's logits)
    w: dense layer weights
    b: output from conv after relu
    a: output from conv before relu
    k: kernel
    """

    # combined softmax + cross-entropy gradient with respect to the logits
    dl_dz = output - Y                                               # 1 x 10
    # back through the dense layer to the flattened conv output
    dl_db = dense_weights @ dl_dz.T                                  # 676 x 1
    # relu derivative: pass the gradient only where the pre-activation was positive
    dl_da = dl_db * np.heaviside(before_relu, 0)                     # 676 x 1
    # the same im2col windows used in the forward pass
    windows = view_as_windows(X, (3, 3), step=1).reshape(-1, 9)      # 676 x 9

    # gradient descent updates
    kernel = kernel - lr * (windows.T @ dl_da).reshape(3, 3)         # 3 x 3
    kernel_bias = kernel_bias - lr * dl_da.sum()                     # 1 x 1
    dense_biases = dense_biases - lr * dl_dz                         # 1 x 10
    dense_weights = dense_weights - lr * (conv_output @ dl_dz)       # 676 x 10
    
    return kernel, kernel_bias, dense_biases, dense_weights
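
# Optional finite-difference gradient check (illustrative sketch only; the
# leading-underscore names are placeholders introduced here, not part of the
# original code). Once the weights have been initialized below, the analytic
# dense-bias gradient (output - Y) can be compared with a numeric estimate:
#
#   _X, _Y = trainX[0], trainY[0]
#   _b, _a = convolution_layer(_X, kernel, kernel_bias)
#   _out = dense_layer(_b, dense_weights, dense_biases)
#   _loss = lambda p: -np.sum(_Y * np.log(p + 1e-12))
#   _eps = 1e-4
#   _shifted = dense_biases.copy()
#   _shifted[0, 0] += _eps
#   _numeric = (_loss(dense_layer(_b, dense_weights, _shifted)) - _loss(_out)) / _eps
#   _analytic = (_out - _Y)[0, 0]
#   # _numeric and _analytic should agree to a few decimal places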

lr = 0.0005
seed = 42

kernel, kernel_bias, dense_weights, dense_biases = initialize_weights(seed)
acc = 0

# Train on one example at a time (plain stochastic gradient descent, single pass)
for i in range(60000):
    # k = random.randint(0,59999)
    X = trainX[i]
    Y = trainY[i]
    
    # Feed Forward
    conv_output, before_relu = convolution_layer(X,kernel, kernel_bias)
    output = dense_layer(conv_output, dense_weights, dense_biases)

    # Backprop
    kernel, kernel_bias, dense_biases, dense_weights = backprop(output, 
                                                                   Y, 
                                                                   conv_output, 
                                                                   dense_weights, 
                                                                   before_relu, 
                                                                   X, 
                                                                   lr, 
                                                                   kernel, 
                                                                   kernel_bias, 
                                                                   dense_biases)
    
    # Keep a running count of training accuracy as the model trains
    if np.argmax(Y) == np.argmax(output):
        acc += 1
    
    if i % 5000 == 0 and i != 0:
        print('Training accuracy:', round(acc * 100 / i, 2), '%')

test_acc = 0
for j in range(testX.shape[0]):
    conv_output, _ = convolution_layer(testX[j], kernel, kernel_bias)
    output = dense_layer(conv_output, dense_weights, dense_biases)
    
    if np.argmax(testY[j]) == np.argmax(output):
        test_acc += 1
        
print('Test accuracy:', test_acc * 100 / testX.shape[0], '%')

"""# Tensorflow implementation
Have tried to keep everything the same as the 'from scratch' implementation
"""

from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.optimizers import SGD

(trainX, trainY), (testX, testY) = mnist.load_data()
# reshape dataset to have a single channel
trainX = trainX.reshape((trainX.shape[0], 28, 28, 1))
testX = testX.reshape((testX.shape[0], 28, 28, 1))
# one hot encode target values
trainY = to_categorical(trainY)
testY = to_categorical(testY)

trainX = trainX.astype('float32')
testX = testX.astype('float32')
# normalize to range 0-1
trainX = trainX / 255.0
testX = testX / 255.0

# Mirror the hand-built network: one 3x3 conv filter, flatten, dense softmax output
model = Sequential()
model.add(Conv2D(1, (3, 3), activation='relu', kernel_initializer='random_normal', input_shape=(28, 28, 1)))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))
# compile model
opt = SGD(learning_rate=0.005, momentum=0)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(trainX, trainY, epochs=1, batch_size=1, validation_data=(testX, testY), verbose=1, shuffle=True)
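
# For a test-set comparison analogous to the "by hand" evaluation loop above,
# Keras can report loss and accuracy directly (a minimal sketch; the variable
# name below is introduced only for this example).
_, keras_test_acc = model.evaluate(testX, testY, verbose=0)
print('Test accuracy:', round(keras_test_acc * 100, 2), '%')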

"""# Many kernels and many observations at a time (WiP)"""

# (trainX, trainY), (testX, testY) = mnist.load_data()
# trainY = to_categorical(trainY)
# testY = to_categorical(testY)
# trainX = trainX / 255.0
# testX = testX / 255.0
# trainX = trainX.astype('float32')
# testX = testX.astype('float32')

# X = trainX[:1000]
# Y = trainY[:1000]

# def relu(x):
#     return np.maximum(0,x)

# def softmax(x):
#     e_x = np.exp(x - np.max(x, axis=1).reshape(1000,1))
#     return e_x / e_x.sum(axis=1).reshape(1000,1)

# def initialize_weights():
#     kernels = np.random.rand(32,3,3).astype('float32')
#     kernel_biases = np.random.randn(1,32).astype('float32')
    
#     first_dense_weights = np.random.randn(21632,100).astype('float32')
#     first_dense_biases = np.random.randn(1,100).astype('float32')
    
#     second_dense_weights = np.random.randn(100,10).astype('float32')
#     second_dense_biases = np.random.randn(1,10).astype('float32')
    
#     return kernels, kernel_biases, first_dense_weights, first_dense_biases, second_dense_weights, second_dense_biases

# def convolution_layer(images, kernels, kernel_bias):
#     k_n, k_h, k_w = kernels.shape
#     i_n, i_h, i_w = images.shape
    
#     return relu(
#         np.concatenate(
#             np.vsplit(
#                 ((view_as_windows(images,(1,k_h,k_w),step=1).reshape(-1,k_h*k_w)@kernels.reshape(-1,k_h*k_w).T) 
#                  - kernel_bias),
#                 i_n),
#             axis=1).T.reshape(i_n,-1))
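
# Shape note for this work-in-progress version (illustrative only): with 1000
# images and 32 kernels of size 3x3 (stride 1, no padding), each image gives a
# 26x26 feature map per kernel, i.e. 26*26*32 = 21632 features per image, which
# is why first_dense_weights above is sized 21632 x 100.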

# def dense_layer(in_matrix, weights, biases, activation):
#     if activation == 'relu':
#         return relu(np.dot(in_matrix, weights) - biases)
#     if activation == 'softmax':
#         return softmax(np.dot(in_matrix, weights) - biases)

# kernels, kernel_biases, first_dense_weights, first_dense_biases, second_dense_weights, second_dense_biases = initialize_weights()

# output = convolution_layer(X,kernels, kernel_biases)
# output = dense_layer(output, first_dense_weights, first_dense_biases, activation='relu')
# output = dense_layer(output, second_dense_weights, second_dense_biases, activation='softmax')

# loss = CategoricalCrossentropy(axis=-1)
# loss(Y,output).numpy()