import tensorflow as tf
from modeling import Decoder, Encoder
from data_process import load_dataset, max_length, preprocess_sentence_en, preprocess_sentence_zh
from sklearn.model_selection import train_test_split
import os
import time
import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tqdm import tqdm
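# NOTE: Encoder and Decoder come from modeling.py, which is not shown here. Based only on
# how they are called in this script, their assumed interface is roughly (a sketch, not the
# actual implementation):
#
#   enc_output, enc_state = encoder((inp, enc_hidden))                  # inp: [batch, max_len_inp]
#   predictions, dec_state, attn_weights = decoder((dec_input, dec_hidden, enc_output))
#   enc_hidden = encoder.initialize_hidden_state()                      # presumably zeros, [batch_sz, enc_units]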
strategy = tf.distribute.MirroredStrategy()
print(strategy.num_replicas_in_sync)
with strategy.scope():
    # Load and tokenize the parallel corpus, then split off a validation set.
    input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset('cmn.txt')
    max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
        input_tensor, target_tensor, test_size=0.2)

    EPOCHS = 100
    BUFFER_SIZE = len(input_tensor_train)
    BATCH_SIZE = 64                                              # per-replica batch size
    GLOBAL_BATCH_SIZE = BATCH_SIZE * strategy.num_replicas_in_sync
    steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(inp_lang_tokenizer.word_index) + 1
    vocab_tar_size = len(targ_lang_tokenizer.word_index) + 1

    # Batch with the global batch size; the strategy then splits each batch across replicas.
    dataset_train = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
    dataset_train = dataset_train.batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
    dataset_train = strategy.experimental_distribute_dataset(dataset_train)

    # Model, optimizer and loss must be created inside the strategy scope.
    # (`vocab_szie` mirrors the parameter name used by Encoder in modeling.py.)
    encoder = Encoder(vocab_szie=vocab_inp_size, embedding_dim=embedding_dim,
                      enc_units=units, batch_sz=BATCH_SIZE)
    decoder = Decoder(vocab_size=vocab_tar_size, embedding_dim=embedding_dim,
                      dec_units=units, batch_sz=BATCH_SIZE)

    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
def loss_function(real, pred):
    # Mask out padded positions (token id 0) so they do not contribute to the loss.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=tf.float32)
    loss_ *= mask
    return tf.reduce_mean(loss_)
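# Illustration of the masking above: for a batch at step t with real = [5, 3, 0, 0]
# (two sequences already ended in padding), the mask is [1., 1., 0., 0.], so the losses
# for the padded entries are zeroed out. Note that tf.reduce_mean still divides by the
# full batch size, padded entries included.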
checkpoint_dir = './train_checkpoint'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder((inp, enc_hidden))
        dec_hidden = enc_hidden
        # Start every target sequence with the <start> token.
        dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing: feed the ground-truth token from step t-1 as the decoder input at step t.
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder((dec_input, dec_hidden, enc_output))
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss
@tf.function
def distribute_train_loss(dataset_input):
    inp, targ, enc_hidden = dataset_input
    # Run one training step on every replica and sum the per-replica batch losses.
    # (On TF >= 2.2, `strategy.run` replaces `strategy.experimental_run_v2`.)
    per_replica_losses = strategy.experimental_run_v2(train_step, args=(inp, targ, enc_hidden))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    train_loss = 0
    gbatch = 0

    for (inp, tar) in tqdm(dataset_train):
        train_loss += distribute_train_loss((inp, tar, enc_hidden))
        gbatch += 1
        if gbatch % 50 == 0:
            template = "Epoch {} Batch {} loss {:.4f}"
            # Per-replica losses were summed, so average over replicas and batches seen so far.
            tf.print(template.format(epoch + 1, gbatch,
                                     float(train_loss) / (strategy.num_replicas_in_sync * gbatch)))

    # Periodically save the weights so training can be resumed
    # (the Checkpoint created above is otherwise never written).
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    tf.print("Epoch {} loss {:.4f}".format(
        epoch + 1, float(train_loss) / (strategy.num_replicas_in_sync * gbatch)))
    tf.print("Time taken for 1 epoch {} sec\n".format(time.time() - start))