# -*- coding: utf-8 -*-
"""
@author: profa
"""
###############################################################
## https://tatoeba.org/en/downloads
## Encoder - Decoder - Spanish to English
##
## Gates
##
## Get the data here - it's called "spa" and you will save it as a .txt file
## http://www.manythings.org/anki/
## References:
# https://towardsdatascience.com/how-to-build-an-encoder-decoder-translation-model-using-lstm-with-python-and-keras-a31e9d864b9b
# https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
# https://keras.io/examples/nlp/lstm_seq2seq/
################################################
## UPDATE THIS TO YOUR PATH..........
filename = "C:/Users/profa/Desktop/UCB/NNCSCI5922/Code/spa.txt"
import string
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
#https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
## GOAL--------------------------------------------------------
# Split the model into two parts:
# an encoder that takes the Spanish sentence as input and produces a hidden vector.
# The encoder is built with an Embedding layer that converts the words
# into vectors and a recurrent neural network (RNN) that calculates the hidden state;
# here we will be using a Long Short-Term Memory (LSTM) layer.
# The output of the encoder is then used as input for the decoder.
# For the decoder, we again use an LSTM layer, followed by a dense layer
# that predicts the English word at each time step.
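## A compact summary (for orientation; the concrete shapes are printed later in
## this script) of the tensor flow the model below implements:
##   Spanish word ids  (batch, max_spanish_len)
##     -> Embedding                    (batch, max_spanish_len, 128)
##     -> LSTM encoder (last state)    (batch, 64)
##     -> RepeatVector                 (batch, max_english_len, 64)
##     -> LSTM decoder (all steps)     (batch, max_english_len, 64)
##     -> TimeDistributed Dense + softmax over the English vocabulary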
#-------------------------------------------------------------------
# Read file
translation_file = open(filename,"r", encoding='utf-8')
raw_data = translation_file.read()
translation_file.close()
# Parse data
raw_data = raw_data.split('\n')
print(raw_data[1000:1010])
pairs = [sentence.split('\t') for sentence in raw_data]
print(pairs[1000:1010])
pairs = pairs[1000:20000]   # keep a 19,000-pair subset of the corpus
print(pairs[1000:1010])
print(pairs[1000])
print(pairs[-1])
## Test access to English and Spanish
for pair in pairs:
    print(pair[0])
    print(pair[1])
###############################
#Pre-process the data and get the maximum length of Spanish and English sentences.
##############################
def clean_sentence(sentence):
    # Lower case the sentence
    lower_case_sent = sentence.lower()
    # Strip punctuation, including the Spanish inverted marks
    string_punctuation = string.punctuation + "¡" + "¿" + "?"
    clean_sentence = lower_case_sent.translate(str.maketrans("", "", string_punctuation))
    return clean_sentence
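## Quick hedged check of clean_sentence on a made-up example (the sentence is
## illustrative, not from the dataset): lowercasing plus the extended
## punctuation table should strip "¡", "!", "¿", and "?", while keeping accents.
print(clean_sentence("¡Hola! ¿Cómo estás?"))   # expected: "hola cómo estás"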
def tokenize(sentences):
    # Create tokenizer
    text_tokenizer = Tokenizer()
    # Fit texts
    text_tokenizer.fit_on_texts(sentences)
    return text_tokenizer.texts_to_sequences(sentences), text_tokenizer
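## Small illustrative run of tokenize() on a toy list (made-up sentences, not
## from the corpus): each word is mapped to an integer id and the fitted
## Tokenizer is returned alongside the sequences.
demo_seqs, demo_tok = tokenize(["el sol brilla", "el gato duerme"])
print(demo_seqs)              # e.g. [[1, 2, 3], [1, 4, 5]] - ids depend on word frequency and order
print(demo_tok.word_index)    # e.g. {'el': 1, 'sol': 2, ...}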
# Clean sentences
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
spanish_sentences = [clean_sentence(pair[1]) for pair in pairs]
## Check the clean version
for es, ss in zip(english_sentences, spanish_sentences):
    print(es)
    print(ss)
# Test 1
print(spanish_sentences[0])
tt, st=tokenize(spanish_sentences)
print(tt[0])
print(st)
# Test 2
print(english_sentences[1000])
tt, st=tokenize(english_sentences)
print(tt[1000])
print(st)
spa_text_tokenized, spa_text_tokenizer = tokenize(spanish_sentences)
print(spa_text_tokenized[0:5], spa_text_tokenizer)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)
## View tokenization
for ets, sts, es, ss in zip(eng_text_tokenized, spa_text_tokenized, english_sentences, spanish_sentences):
    print(ets)
    print(es)
    print(sts)
    print(ss)
print('Maximum Spanish sentence length: {}'.format(len(max(spa_text_tokenized, key=len))))
print('Maximum English sentence length: {}'.format(len(max(eng_text_tokenized, key=len))))
##############################################
# We apply padding so that, within each language, every sentence has that language's maximum length.
################################################################################
# Vocabulary sizes (+1 because index 0 is reserved for padding)
spanish_vocab = len(spa_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("Spanish vocabulary is of {} unique words".format(spanish_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))
max_spanish_len = int(len(max(spa_text_tokenized,key=len)))
max_english_len = int(len(max(eng_text_tokenized,key=len)))
spa_pad_sentence = pad_sequences(spa_text_tokenized, max_spanish_len, padding = "post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")
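## Hedged mini-demo of pad_sequences on made-up sequences: shorter sequences
## are right-padded with zeros (padding="post") up to the requested length.
print(pad_sequences([[5, 3], [7, 1, 4, 2]], maxlen=4, padding="post"))
## Quick sanity check on the real data: every row should now have the
## language's maximum length.
print(spa_pad_sentence.shape)   # (number of pairs, max_spanish_len)
print(eng_pad_sentence.shape)   # (number of pairs, max_english_len)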
# Reshape data: add a trailing dimension of 1.
# The English targets then have shape (num_pairs, max_english_len, 1), the
# per-time-step integer-label layout used with sparse_categorical_crossentropy
# in the original tutorial; the Spanish reshape mirrors the tutorial and may be
# unnecessary depending on your Keras version.
spa_pad_sentence = spa_pad_sentence.reshape(*spa_pad_sentence.shape, 1)
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)
print(eng_pad_sentence)
print(spa_pad_sentence)
#####################################################
##
## Build the model
##
#####################################################
# First layer - the embedding layer.
# First add an Input layer; the only parameter is 'shape',
# which is the maximum length of the Spanish sentences
# (12 in the original tutorial - use the max_spanish_len value computed above).
# Then connect it to the Embedding layer. Here the parameters are
# 'input_dim', which is the size of the Spanish vocabulary, and
# 'output_dim', which is the length of the embedding vector.
# This layer converts each Spanish word into a vector of the output dimension.
# Concept: extract the meaning of the word as a spatial representation
# where each dimension is a characteristic describing the word.
# For example, the word 'sol' will be converted into a vector of shape 128.
# The higher the output dimension, the more semantic information each word
# vector can carry, but also the more computation and processing time required.
# You need to find a balance between speed and performance.
## Layer 1
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
print(embedding.shape)
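## Hedged illustration of the lookup the Embedding layer performs: a throwaway
## model (the demo_* names are made up here) mapping a single token id to its
## 128-dimensional vector; the printed shape should be (1, 1, 128).
demo_in = Input(shape=(1,))
demo_emb = Embedding(input_dim=spanish_vocab, output_dim=128)(demo_in)
demo_model = Model(demo_in, demo_emb)
print(demo_model.predict(np.array([[1]])).shape)   # (1, 1, 128)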
## Layer 2 - LSTM layer of size 64 - ENCODER
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
# The hidden vector is repeated n times,
# so each time step of the decoder LSTM receives the same vector.
# To supply this same vector at every time step we use the RepeatVector layer;
# its role is to repeat the vector it receives, and the only parameter
# we need to define is n, the number of repetitions.
# This number equals the number of time steps in the decoder part,
# in other words the maximum English sentence length
# (6 in the original tutorial; here, max_english_len).
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder)
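## Hedged shape check of RepeatVector: the 64-dimensional encoder state gains a
## time axis of length max_english_len while the feature axis stays 64.
print(encoder.shape)   # (None, 64)
print(r_vec.shape)     # (None, max_english_len, 64)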
## Layer 3 DECODER
## Once we have the input ready, we continue with the decoder.
# This is also built with an LSTM layer; the difference
# is the parameter return_sequences, which in this case is 'True'.
# What is this parameter for? In the encoder we kept only the vector
# from the last time step and discarded all the others; here we want
# an output vector at every time step so the Dense layer can make a
# prediction for each English word.
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder)
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
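## Hedged shape check: with return_sequences=True the decoder keeps the time
## axis, so the Dense layer below can predict one English word per time step.
print(decoder.shape)   # expected: (None, max_english_len, 64)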
## Last Layer
# One last step remains: predicting the translated word.
# For this we use a Dense layer. The parameter we need to define is the
# number of units; this number is the size of the output vector and it
# needs to equal the size of the English vocabulary.
# Why? After the softmax, the vector will have values close to zero
# everywhere except for one unit that will be close to 1. We then map
# the index of that unit to a word with a dictionary that pairs each
# unit with an English word. For example, if the input is the word
# 'sol' and the output vector is all zeros except that unit 472 is 1,
# we look up index 472 in the dictionary of English words and get 'sun'.
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder)
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
logits = TimeDistributed(Dense(english_vocab))(decoder)
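## Hedged shape check of the output head, plus a toy numpy illustration
## (made-up numbers) of the argmax lookup described above: the unit with the
## largest probability gives the word index that is later mapped back to an
## English word via the tokenizer's word_index.
print(logits.shape)                            # expected: (None, max_english_len, english_vocab)
toy_probs = np.array([0.01, 0.02, 0.95, 0.02])
print(np.argmax(toy_probs))                    # 2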
## Stack the layers to create the model and add a loss function.
## Compile
enc_dec_model = Model(input_sequence, Activation('softmax')(logits))
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
                      optimizer=Adam(1e-3),
                      metrics=['accuracy'])
enc_dec_model.summary()
## Train the model
model_results = enc_dec_model.fit(spa_pad_sentence, eng_pad_sentence, batch_size=30, epochs=100)
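## Optional variant (a hedged sketch, not part of the original run): hold out
## part of the data and stop early when validation loss stalls. The 0.2 split
## and patience of 5 are arbitrary illustrative choices.
# from tensorflow.keras.callbacks import EarlyStopping
# model_results = enc_dec_model.fit(
#     spa_pad_sentence, eng_pad_sentence,
#     batch_size=30, epochs=100, validation_split=0.2,
#     callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)])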
# Once the model is trained we can make our first translation.
# Below you will also find the function 'logits_to_sentence',
# which maps the output of the dense layer back to the English vocabulary.
def logits_to_sentence(logits, tokenizer):
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])
index = 1000
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(spanish_sentences[index]))
print('The predicted sentence is :')
print(logits_to_sentence(enc_dec_model.predict(spa_pad_sentence[index:index+1])[0], eng_text_tokenizer))
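## Hedged end-to-end sketch for translating a sentence that is not necessarily
## in the training data: reuse the fitted Spanish tokenizer, pad to the same
## length, mirror the trailing reshape used for the training inputs, and decode
## with logits_to_sentence. The example sentence is made up; words the
## tokenizer never saw are silently dropped by texts_to_sequences.
new_sentence = clean_sentence("Ella corre.")
new_seq = spa_text_tokenizer.texts_to_sequences([new_sentence])
new_pad = pad_sequences(new_seq, maxlen=max_spanish_len, padding="post")
new_pad = new_pad.reshape(*new_pad.shape, 1)
print(logits_to_sentence(enc_dec_model.predict(new_pad)[0], eng_text_tokenizer))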