# -*- coding: utf-8 -*-
"""
@author: profa
"""
###############################################################
## https://tatoeba.org/en/downloads
## Encoder - Decoder - Spanish to English
##
## Gates
##
## Get the data here - it's called "spa" and you will save it as a .txt file
## http://www.manythings.org/anki/
## References:
# https://towardsdatascience.com/how-to-build-an-encoder-decoder-translation-model-using-lstm-with-python-and-keras-a31e9d864b9b
# https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
# https://keras.io/examples/nlp/lstm_seq2seq/
################################################
## UPDATE THIS TO YOUR PATH..........
filename = "C:/Users/profa/Desktop/UCB/NNCSCI5922/Code/spa.txt"
import string
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
#https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
## GOAL--------------------------------------------------------
# Split the model into two parts:
# an encoder that takes the Spanish sentence as input and produces a hidden vector.
# The encoder is built with an Embedding layer that converts the words
# into vectors and a recurrent neural network (RNN) that calculates the hidden state;
# here we will be using a Long Short-Term Memory (LSTM) layer.
# The output of the encoder is then used as input for the decoder.
# For the decoder, we again use an LSTM layer, followed by a dense layer
# that predicts the English word at each time step.
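## A compact summary (for orientation; the concrete shapes are printed later in
## this script) of the tensor flow the model below implements:
##   Spanish word ids  (batch, max_spanish_len)
##     -> Embedding                    (batch, max_spanish_len, 128)
##     -> LSTM encoder (last state)    (batch, 64)
##     -> RepeatVector                 (batch, max_english_len, 64)
##     -> LSTM decoder (all steps)     (batch, max_english_len, 64)
##     -> TimeDistributed Dense + softmax over the English vocabulary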
#-------------------------------------------------------------------
# Read file
translation_file = open(filename,"r", encoding='utf-8')
raw_data = translation_file.read()
translation_file.close()
# Parse data
raw_data = raw_data.split('\n')
print(raw_data[1000:1010])
pairs = [sentence.split('\t') for sentence in raw_data]
print(pairs[1000:1010])
pairs = pairs[1000:20000]   # keep a 19,000-pair subset of the corpus
print(pairs[1000:1010])
print(pairs[1000])
print(pairs[-1])
## Test access to English and Spanish
for pair in pairs:
    print(pair[0])
    print(pair[1])
###############################
#Pre-process the data and get the maximum length of Spanish and English sentences.
##############################
def clean_sentence(sentence):
    # Lower case the sentence
    lower_case_sent = sentence.lower()
    # Strip punctuation, including the Spanish inverted marks
    string_punctuation = string.punctuation + "¡" + "¿" + "?"
    clean_sentence = lower_case_sent.translate(str.maketrans("", "", string_punctuation))
    return clean_sentence
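## Quick hedged check of clean_sentence on a made-up example (the sentence is
## illustrative, not from the dataset): lowercasing plus the extended
## punctuation table should strip "¡", "!", "¿", and "?", while keeping accents.
print(clean_sentence("¡Hola! ¿Cómo estás?"))   # expected: "hola cómo estás"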
def tokenize(sentences):
    # Create tokenizer
    text_tokenizer = Tokenizer()
    # Fit texts
    text_tokenizer.fit_on_texts(sentences)
    return text_tokenizer.texts_to_sequences(sentences), text_tokenizer
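## Small illustrative run of tokenize() on a toy list (made-up sentences, not
## from the corpus): each word is mapped to an integer id and the fitted
## Tokenizer is returned alongside the sequences.
demo_seqs, demo_tok = tokenize(["el sol brilla", "el gato duerme"])
print(demo_seqs)              # e.g. [[1, 2, 3], [1, 4, 5]] - ids depend on word frequency and order
print(demo_tok.word_index)    # e.g. {'el': 1, 'sol': 2, ...}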
# Clean sentences
english_sentences = [clean_sentence(pair[0]) for pair in pairs]
spanish_sentences = [clean_sentence(pair[1]) for pair in pairs]
## Check the clean version
for es, ss in zip(english_sentences, spanish_sentences):
    print(es)
    print(ss)
# Test 1
print(spanish_sentences[0])
tt, st=tokenize(spanish_sentences)
print(tt[0])
print(st)
# Test 2
print(english_sentences[1000])
tt, st=tokenize(english_sentences)
print(tt[1000])
print(st)
spa_text_tokenized, spa_text_tokenizer = tokenize(spanish_sentences)
print(spa_text_tokenized[0:5], spa_text_tokenizer)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)
## View tokenization
for ets, sts, es, ss in zip(eng_text_tokenized, spa_text_tokenized, english_sentences, spanish_sentences):
    print(ets)
    print(es)
    print(sts)
    print(ss)
print('Maximum Spanish sentence length: {}'.format(len(max(spa_text_tokenized, key=len))))
print('Maximum English sentence length: {}'.format(len(max(eng_text_tokenized, key=len))))
##############################################
# We apply padding so that, within each language, every sentence has that language's maximum length.
################################################################################
# Vocabulary sizes (+1 because index 0 is reserved for padding)
spanish_vocab = len(spa_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("Spanish vocabulary is of {} unique words".format(spanish_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))
max_spanish_len = int(len(max(spa_text_tokenized,key=len)))
max_english_len = int(len(max(eng_text_tokenized,key=len)))
spa_pad_sentence = pad_sequences(spa_text_tokenized, max_spanish_len, padding = "post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, max_english_len, padding = "post")
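## Hedged mini-demo of pad_sequences on made-up sequences: shorter sequences
## are right-padded with zeros (padding="post") up to the requested length.
print(pad_sequences([[5, 3], [7, 1, 4, 2]], maxlen=4, padding="post"))
## Quick sanity check on the real data: every row should now have the
## language's maximum length.
print(spa_pad_sentence.shape)   # (number of pairs, max_spanish_len)
print(eng_pad_sentence.shape)   # (number of pairs, max_english_len)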
# Reshape data: add a trailing dimension of 1.
# The English targets then have shape (num_pairs, max_english_len, 1), the
# per-time-step integer-label layout used with sparse_categorical_crossentropy
# in the original tutorial; the Spanish reshape mirrors the tutorial and may be
# unnecessary depending on your Keras version.
spa_pad_sentence = spa_pad_sentence.reshape(*spa_pad_sentence.shape, 1)
eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)
print(eng_pad_sentence)
print(spa_pad_sentence)
#####################################################
##
## Build the model
##
#####################################################
# First layer - the embedding layer.
# First add an Input layer; the only parameter is 'shape',
# which is the maximum length of the Spanish sentences
# (12 in the original tutorial - use the max_spanish_len value computed above).
# Then connect it to the Embedding layer. Here the parameters are
# 'input_dim', which is the size of the Spanish vocabulary, and
# 'output_dim', which is the length of the embedding vector.
# This layer converts each Spanish word into a vector of the output dimension.
# Concept: extract the meaning of the word as a spatial representation
# where each dimension is a characteristic describing the word.
# For example, the word 'sol' will be converted into a vector of shape 128.
# The higher the output dimension, the more semantic information each word
# vector can carry, but also the more computation and processing time required.
# You need to find a balance between speed and performance.
## Layer 1
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
print(embedding.shape)
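## Hedged illustration of the lookup the Embedding layer performs: a throwaway
## model (the demo_* names are made up here) mapping a single token id to its
## 128-dimensional vector; the printed shape should be (1, 1, 128).
demo_in = Input(shape=(1,))
demo_emb = Embedding(input_dim=spanish_vocab, output_dim=128)(demo_in)
demo_model = Model(demo_in, demo_emb)
print(demo_model.predict(np.array([[1]])).shape)   # (1, 1, 128)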
## Layer 2 - LSTM layer of size 64 - ENCODER
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
# The hidden vector is repeated n times,
# so each time step of the decoder LSTM receives the same vector.
# To supply this same vector at every time step we use the RepeatVector layer;
# its role is to repeat the vector it receives, and the only parameter
# we need to define is n, the number of repetitions.
# This number equals the number of time steps in the decoder part,
# in other words the maximum English sentence length
# (6 in the original tutorial; here, max_english_len).
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder)
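## Hedged shape check of RepeatVector: the 64-dimensional encoder state gains a
## time axis of length max_english_len while the feature axis stays 64.
print(encoder.shape)   # (None, 64)
print(r_vec.shape)     # (None, max_english_len, 64)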
## Layer 3 DECODER
## Once we have the input ready, we continue with the decoder.
# This is also built with an LSTM layer; the difference
# is the parameter return_sequences, which in this case is 'True'.
# What is this parameter for? In the encoder we kept only the vector
# from the last time step and discarded all the others; here we want
# an output vector at every time step so the Dense layer can make a
# prediction for each English word.
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder)
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
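## Hedged shape check: with return_sequences=True the decoder keeps the time
## axis, so the Dense layer below can predict one English word per time step.
print(decoder.shape)   # expected: (None, max_english_len, 64)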
## Last Layer
# One last step remains: predicting the translated word.
# For this we use a Dense layer. The parameter we need to define is the
# number of units; this number is the size of the output vector and it
# needs to equal the size of the English vocabulary.
# Why? After the softmax, the vector will have values close to zero
# everywhere except for one unit that will be close to 1. We then map
# the index of that unit to a word with a dictionary that pairs each
# unit with an English word. For example, if the input is the word
# 'sol' and the output vector is all zeros except that unit 472 is 1,
# we look up index 472 in the dictionary of English words and get 'sun'.
input_sequence = Input(shape=(max_spanish_len,))
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)
encoder = LSTM(64, return_sequences=False)(embedding)
r_vec = RepeatVector(max_english_len)(encoder)
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
logits = TimeDistributed(Dense(english_vocab))(decoder)
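## Hedged shape check of the output head, plus a toy numpy illustration
## (made-up numbers) of the argmax lookup described above: the unit with the
## largest probability gives the word index that is later mapped back to an
## English word via the tokenizer's word_index.
print(logits.shape)                            # expected: (None, max_english_len, english_vocab)
toy_probs = np.array([0.01, 0.02, 0.95, 0.02])
print(np.argmax(toy_probs))                    # 2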
## Stack the layers to create the model and add a loss function.
## Compile
enc_dec_model = Model(input_sequence, Activation('softmax')(logits))
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
                      optimizer=Adam(1e-3),
                      metrics=['accuracy'])
enc_dec_model.summary()
## Train the model
model_results = enc_dec_model.fit(spa_pad_sentence, eng_pad_sentence, batch_size=30, epochs=100)
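## Optional variant (a hedged sketch, not part of the original run): hold out
## part of the data and stop early when validation loss stalls. The 0.2 split
## and patience of 5 are arbitrary illustrative choices.
# from tensorflow.keras.callbacks import EarlyStopping
# model_results = enc_dec_model.fit(
#     spa_pad_sentence, eng_pad_sentence,
#     batch_size=30, epochs=100, validation_split=0.2,
#     callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)])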
# Once the model is trained we can make our first translation.
# Below you will also find the function 'logits_to_sentence',
# which maps the output of the dense layer back to the English vocabulary.
def logits_to_sentence(logits, tokenizer):
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])
index = 1000
print("The english sentence is: {}".format(english_sentences[index]))
print("The spanish sentence is: {}".format(spanish_sentences[index]))
print('The predicted sentence is :')
print(logits_to_sentence(enc_dec_model.predict(spa_pad_sentence[index:index+1])[0], eng_text_tokenizer))
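## Hedged end-to-end sketch for translating a sentence that is not necessarily
## in the training data: reuse the fitted Spanish tokenizer, pad to the same
## length, mirror the trailing reshape used for the training inputs, and decode
## with logits_to_sentence. The example sentence is made up; words the
## tokenizer never saw are silently dropped by texts_to_sequences.
new_sentence = clean_sentence("Ella corre.")
new_seq = spa_text_tokenizer.texts_to_sequences([new_sentence])
new_pad = pad_sequences(new_seq, maxlen=max_spanish_len, padding="post")
new_pad = new_pad.reshape(*new_pad.shape, 1)
print(logits_to_sentence(enc_dec_model.predict(new_pad)[0], eng_text_tokenizer))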