# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
| 15 | +"""Distributed Train. |
| 16 | +""" |

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import app
from absl import flags
import tensorflow as tf  # TF2
from tensorflow_examples.models.nmt_with_attention import nmt
from tensorflow_examples.models.nmt_with_attention import utils
from tensorflow_examples.models.nmt_with_attention.train import Train
assert tf.__version__.startswith('2')

FLAGS = flags.FLAGS

# If additional flags are needed, define them here.
flags.DEFINE_integer('num_gpu', 1, 'Number of GPUs to use')
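# Example invocation (a sketch only: the exact flag names are assumed to be
# the ones registered by utils.nmt_flags(), mirroring the keyword arguments
# of main() below):
#   python distributed_train.py --epochs=10 --batch_size=64 \
#     --buffer_size=70000 --download_path=/tmp/nmt --num_gpu=2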


class DistributedTrain(Train):
  """Distributed Train class.

  Args:
    epochs: Number of epochs.
    enable_function: Decorate function with tf.function.
    encoder: Encoder.
    decoder: Decoder.
    inp_lang: Input language tokenizer.
    targ_lang: Target language tokenizer.
    batch_size: Batch size.
  """

  def __init__(self, epochs, enable_function, encoder, decoder, inp_lang,
               targ_lang, batch_size):
    Train.__init__(
        self, epochs, enable_function, encoder, decoder, inp_lang, targ_lang,
        batch_size)

  def training_loop(self, train_iterator, test_iterator,
                    num_train_steps_per_epoch, num_test_steps_per_epoch,
                    strategy):
    """Custom training and testing loop.

    Args:
      train_iterator: Training iterator created using the strategy.
      test_iterator: Testing iterator created using the strategy.
      num_train_steps_per_epoch: Number of training steps in an epoch.
      num_test_steps_per_epoch: Number of test steps in an epoch.
      strategy: Distribution strategy.

    Returns:
      train_loss, test_loss
    """

    # This code is expected to change.
    def distributed_train():
      return strategy.experimental_run(self.train_step, train_iterator)

    def distributed_test():
      return strategy.experimental_run(self.test_step, test_iterator)

    if self.enable_function:
      distributed_train = tf.function(distributed_train)
      distributed_test = tf.function(distributed_test)

    template = 'Epoch: {}, Train Loss: {}, Test Loss: {}'

    for epoch in range(self.epochs):
      # Reset the aggregated loss metrics at the start of every epoch.
      self.train_loss_metric.reset_states()
      self.test_loss_metric.reset_states()

      train_iterator.initialize()
      for _ in range(num_train_steps_per_epoch):
        distributed_train()

      test_iterator.initialize()
      for _ in range(num_test_steps_per_epoch):
        distributed_test()

      print(template.format(epoch,
                            self.train_loss_metric.result().numpy(),
                            self.test_loss_metric.result().numpy()))

    return (self.train_loss_metric.result().numpy(),
            self.test_loss_metric.result().numpy())


def run_main(argv):
  del argv
  # Combine the shared NMT flags with the distribution-specific num_gpu flag.
  kwargs = utils.flags_dict()
  kwargs.update({'num_gpu': FLAGS.num_gpu})
  main(**kwargs)


def main(epochs, enable_function, buffer_size, batch_size, download_path,
         num_examples=70000, embedding_dim=256, enc_units=1024, dec_units=1024,
         num_gpu=1):

  # Mirror the model across one device per requested GPU.
  devices = ['/device:GPU:{}'.format(i) for i in range(num_gpu)]
  strategy = tf.distribute.MirroredStrategy(devices)

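  # The dataset, encoder, decoder and Train object are all built inside
  # strategy.scope() so that their variables are created in a
  # distribution-aware way and mirrored across the GPUs.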
  with strategy.scope():
    file_path = utils.download(download_path)
    train_ds, test_ds, inp_lang, targ_lang = utils.create_dataset(
        file_path, num_examples, buffer_size, batch_size)
    # +1 leaves room for the padding index 0 used by the Keras tokenizer.
    vocab_inp_size = len(inp_lang.word_index) + 1
    vocab_tar_size = len(targ_lang.word_index) + 1

    num_train_steps_per_epoch = tf.data.experimental.cardinality(train_ds)
    num_test_steps_per_epoch = tf.data.experimental.cardinality(test_ds)

    train_iterator = strategy.make_dataset_iterator(train_ds)
    test_iterator = strategy.make_dataset_iterator(test_ds)

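    # make_dataset_iterator returns distribution-aware iterators; they are
    # re-initialized at the start of each epoch inside training_loop above.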
    encoder = nmt.Encoder(vocab_inp_size, embedding_dim, enc_units, batch_size)
    decoder = nmt.Decoder(vocab_tar_size, embedding_dim, dec_units, batch_size)

    train_obj = DistributedTrain(epochs, enable_function, encoder, decoder,
                                 inp_lang, targ_lang, batch_size)
    print('Training ...')
    return train_obj.training_loop(train_iterator,
                                   test_iterator,
                                   num_train_steps_per_epoch,
                                   num_test_steps_per_epoch,
                                   strategy)

if __name__ == '__main__':
  utils.nmt_flags()
  app.run(run_main)