Change location of TLT saved objects for distributed PYT (#344)

HarshaRamayanam · web-flow · commit fccc229e6470 · 2023-06-09T10:10:50.000-05:00
* Changed behaviour of saving torch distributed objects

* Review changes

* Removed val_data as it is not used anywhere
diff --git a/tlt/distributed/pytorch/run_train_pyt.py b/tlt/distributed/pytorch/run_train_pyt.py
@@ -18,6 +18,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+import os
 import argparse
 
 from tlt.distributed.pytorch.utils.pyt_distributed_utils import (
@@ -28,7 +29,12 @@
 
 if __name__ == "__main__":
 
-    # Program arguments
+    def directory_path(path):
+        if os.path.isdir(path):
+            return path
+        else:
+            raise argparse.ArgumentTypeError("'{}' is not a valid directory path.".format(path))
+
     print("******Distributed Training*****")
 
     description = 'Distributed training with PyTorch.'
@@ -46,23 +52,25 @@
                         help='Global batch size to distribute data (default: 128)')
     parser.add_argument('--disable_ipex', action='store_true', required=False, help="Disables IPEX optimization to "
                         "the model")
+    parser.add_argument('--tlt_saved_objects_dir', type=directory_path, required=False, help='Path to TLT saved '
+                        'distributed objects. The path must be accessible to all the nodes. For example: mounted '
+                        'NFS drive. This arg is helpful when using TLT API/CLI. '
+                        'See DistributedTorch.load_saved_objects() for more information.')
 
     args = parser.parse_args()
 
-    # Load the saved dataset and model objects
-    loaded_objects = DistributedTorch.load_saved_objects(use_case=args.use_case)
+    if args.tlt_saved_objects_dir is not None:
+        # Load the saved dataset and model objects
+        loaded_objects = DistributedTorch.load_saved_objects(args.tlt_saved_objects_dir)
 
-    dataset = loaded_objects['dataset']
-    train_subset = loaded_objects.get('train_subset', dataset)
-    test_subset = loaded_objects.get('test_subset', dataset)
-    validation_subset = loaded_objects.get('validation_subset', dataset)
-    model = loaded_objects['model']
-    loss = loaded_objects['loss']
-    optimizer = loaded_objects['optimizer']
+        train_data = loaded_objects.get('train_data')
+        model = loaded_objects['model']
+        loss = loaded_objects['loss']
+        optimizer = loaded_objects['optimizer']
 
     # Launch distributed job
     training_args = DistributedTrainingArguments(
-        dataset=train_subset,
+        dataset=train_data,
         model=model,
         criterion=loss,
         optimizer=optimizer,
diff --git a/tlt/distributed/pytorch/utils/pyt_distributed_utils.py b/tlt/distributed/pytorch/utils/pyt_distributed_utils.py
@@ -27,7 +27,6 @@
 from random import Random
 from torch.utils.data import DataLoader
 from torch.nn.parallel import DistributedDataParallel as DDP
-from tlt.distributed import TLT_DISTRIBUTED_DIR
 
 import oneccl_bindings_for_pytorch  # noqa # pylint: disable=unused-import
 import intel_extension_for_pytorch as ipex
@@ -245,7 +244,7 @@ def cleanup_ddp(cls):
             dist.destroy_process_group()
 
     @classmethod
-    def load_saved_objects(cls, use_case: str):
+    def load_saved_objects(cls, saved_objects_dir):
         """
         Helper function to load saved dataset and model objects
 
@@ -255,11 +254,6 @@ def load_saved_objects(cls, use_case: str):
         Returns:
             dict with loaded dataset and model objects
         """
-        if use_case == 'text_classification':
-            saved_objects_file = 'hf_saved_objects.obj'
-        elif use_case == 'image_classification':
-            saved_objects_file = 'torch_saved_objects.obj'
-        else:
-            raise ValueError("Distributed PyTorch for {} is not implemented yet".format(use_case))
+        saved_objects_file = 'torch_saved_objects.obj'
 
-        return torch.load(os.path.join(TLT_DISTRIBUTED_DIR, saved_objects_file))
+        return torch.load(os.path.join(saved_objects_dir, saved_objects_file))
diff --git a/tlt/models/image_classification/pytorch_image_classification_model.py b/tlt/models/image_classification/pytorch_image_classification_model.py
@@ -23,6 +23,8 @@
 import time
 import dill
 import subprocess
+import tempfile
+import shutil
 
 from tqdm import tqdm
 
@@ -244,7 +246,7 @@ def _fit(self, output_dir, dataset, epochs, do_eval, early_stopping, lr_decay):
                     'loss': train_epoch_loss,
                 }, os.path.join(checkpoint_dir, 'checkpoint.pt'))
 
-    def _fit_distributed(self, hostfile, nnodes, nproc_per_node, epochs, batch_size, ipex_optimize):
+    def _fit_distributed(self, saved_objects_dir, hostfile, nnodes, nproc_per_node, epochs, batch_size, ipex_optimize):
         distributed_vision_script = os.path.join(TLT_DISTRIBUTED_DIR, "pytorch", "run_train_pyt.py")
 
         default_port = '29500'
@@ -286,6 +288,7 @@ def _fit_distributed(self, hostfile, nnodes, nproc_per_node, epochs, batch_size,
         bash_command += ' --master_addr {}'.format(default_master_addr)
         bash_command += ' --master_port {}'.format(default_port)
         bash_command += ' --backend {}'.format('ccl')
+        bash_command += ' --tlt_saved_objects_dir {}'.format(saved_objects_dir)
         bash_command += ' --use_case {}'.format('image_classification')
         bash_command += ' --epochs {}'.format(epochs)
         bash_command += ' --batch_size {}'.format(batch_size)
@@ -346,9 +349,19 @@ def train(self, dataset: ImageClassificationDataset, output_dir, epochs=1, initi
             self._optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
 
         if distributed:
-            self.export_for_distributed(TLT_DISTRIBUTED_DIR, dataset)
-            batch_size = dataset._preprocessed['batch_size']
-            self._fit_distributed(hostfile, nnodes, nproc_per_node, epochs, batch_size, ipex_optimize)
+            try:
+                saved_objects_dir = self.export_for_distributed(
+                    export_dir=os.path.join(output_dir, 'tlt_saved_objects'),
+                    train_data=dataset.train_subset,
+                    val_data=dataset.validation_subset
+                )
+                batch_size = dataset._preprocessed['batch_size']
+                self._fit_distributed(saved_objects_dir, hostfile, nnodes, nproc_per_node, epochs, batch_size,
+                                      ipex_optimize)
+            except Exception as err:
+                print("Error: \'{}\' occured while distributed training".format(err))
+            finally:
+                self.cleanup_saved_objects_for_distributed()
 
         else:
             # Call ipex.optimize
@@ -467,26 +480,37 @@ def export(self, output_dir):
         else:
             raise ValueError("Unable to export the model, because it hasn't been trained yet")
 
-    def export_for_distributed(self, output_dir, dataset):
+    def export_for_distributed(self, export_dir=None, train_data=None, val_data=None):
         """
-        Helper function to export dataset and model objects to disk for distributed job
+        Exports the model, optimizer, loss, train data and validation data to the export_dir for distributed
+        script to access. Note that the export_dir must be accessible to all the nodes. For example: NFS shared
+        systems. Note that the export_dir is created using mkdtemp which results in a unique dir name. For
+        example: "<export_dir>_Am83Iw". If the export_dir is None, the default prefix is "$HOME/saved_objects_".
 
         Args:
-            output_dir (str): Path to a directory where the dataset and model objects are saved.
-                Default file name for saving the objects is "torch_saved_objects.obj"
-            dataset (ImageClassificationDataset): Dataset object to save. It must be an object of
-                ImageClassificationDataset so that the dataset info, train, test, and validation
-                subsets can be accessed.
+            export_dir (str): Directory name to export the model, optimizer, loss, train data and validation
+                data. export_dir must be accessible to all the nodes. For example: NFS shared systems. export_dir
+                is created using mkdtemp which results in a unique dir name. For example: "<export_dir>_Am83Iw".
+                If the export_dir is None, the default prefix is "$HOME/saved_objects_".
+            train_data (PyTorchDataset): Train dataset
+            val_data (PyTorchDataset): Validation dataset
         """
 
+        temp_dir_prefix = os.path.join(os.environ['HOME'], "saved_objects_") if export_dir is None else export_dir + "_"
+        self._temp_dir = tempfile.mkdtemp(prefix=temp_dir_prefix)
+
         objects_to_save = {
-            "dataset": dataset.dataset,
-            "info": dataset.info,
-            "train_subset": dataset.train_subset,
-            "test_subset": dataset.test_subset,
-            "validation_subset": dataset.validation_subset,
+            "train_data": train_data,
             "model": self._model,
             "optimizer": self._optimizer,
             "loss": self._loss
         }
-        torch.save(objects_to_save, os.path.join(output_dir, "torch_saved_objects.obj"))
+        torch.save(objects_to_save, os.path.join(self._temp_dir, "torch_saved_objects.obj"))
+        return self._temp_dir
+
+    def cleanup_saved_objects_for_distributed(self):
+        try:
+            print('Cleaning saved objects...')
+            shutil.rmtree(self._temp_dir)
+        except OSError as ose:
+            print('Error while cleaning the saved objects: {}'.format(ose))
diff --git a/tlt/models/image_classification/torchvision_image_classification_model.py b/tlt/models/image_classification/torchvision_image_classification_model.py
@@ -26,7 +26,6 @@
 
 from downloader.models import ModelDownloader
 from tlt import TLT_BASE_DIR
-from tlt.distributed import TLT_DISTRIBUTED_DIR
 from tlt.models.image_classification.pytorch_image_classification_model import PyTorchImageClassificationModel
 from tlt.datasets.image_classification.image_classification_dataset import ImageClassificationDataset
 from tlt.utils.file_utils import read_json_file
@@ -186,9 +185,19 @@ def train(self, dataset: ImageClassificationDataset, output_dir, epochs=1, initi
                 self._model, self._optimizer = ipex.optimize(self._model, optimizer=self._optimizer)
 
         if distributed:
-            self.export_for_distributed(TLT_DISTRIBUTED_DIR, dataset)
-            batch_size = dataset._preprocessed['batch_size']
-            self._fit_distributed(hostfile, nnodes, nproc_per_node, epochs, batch_size, ipex_optimize)
+            try:
+                saved_objects_dir = self.export_for_distributed(
+                    export_dir=os.path.join(output_dir, 'tlt_saved_objects'),
+                    train_data=dataset.train_subset,
+                    val_data=dataset.validation_subset
+                )
+                batch_size = dataset._preprocessed['batch_size']
+                self._fit_distributed(saved_objects_dir, hostfile, nnodes, nproc_per_node, epochs, batch_size,
+                                      ipex_optimize)
+            except Exception as err:
+                print("Error: \'{}\' occured while distributed training".format(err))
+            finally:
+                self.cleanup_saved_objects_for_distributed()
         else:
             self._model.train()
             self._fit(output_dir, dataset, epochs, do_eval, early_stopping, lr_decay)
diff --git a/tlt/models/text_classification/pytorch_hf_text_classification_model.py b/tlt/models/text_classification/pytorch_hf_text_classification_model.py
@@ -30,6 +30,8 @@
 from tqdm import tqdm
 from torch.utils.data import DataLoader
 import yaml
+import tempfile
+import shutil
 
 # Hugging Face imports
 from transformers import (
@@ -101,29 +103,39 @@ def __init__(self, model_name: str, model=None, optimizer=None, loss=None, **kwa
         self._trainer = None
         self._history = None
 
-    def export_for_distributed(self, output_dir, dataset):
+    def export_for_distributed(self, export_dir, train_data=None, val_data=None):
         """
-        Helper function to export dataset and model objects to disk for distributed job
+        Exports the model, optimizer, loss, train data and validation data to the export_dir for distributed
+        script to access. Note that the export_dir must be accessible to all the nodes. For example: NFS shared
+        systems. Note that the export_dir is created using mkdtemp which results in a unique dir name. For
+        example: "<export_dir>_Am83Iw". If the export_dir is None, the default prefix is "$HOME/saved_objects_".
 
         Args:
-            output_dir (str): Path to a directory where the dataset and model objects are saved.
-                Default file name for saving the objects is "hf_saved_objects.obj"
-            dataset (HFTextClassificationDataset): Dataset object to save. It must be an object of
-                HFTextClassificationDataset so that the dataset info, train, test, and validation
-                subsets can be accessed.
+            export_dir (str): Directory name to export the model, optimizer, loss, train data and validation
+                data. export_dir must be accessible to all the nodes. For example: NFS shared systems. export_dir
+                is created using mkdtemp which results in a unique dir name. For example: "<export_dir>_Am83Iw".
+                If the export_dir is None, the default prefix is "$HOME/saved_objects_".
+            train_data (PyTorchDataset): Train dataset
+            val_data (PyTorchDataset): Validation dataset
         """
+        temp_dir_prefix = os.path.join(os.environ['HOME'], "saved_objects_") if export_dir is None else export_dir + "_"
+        self._temp_dir = tempfile.mkdtemp(prefix=temp_dir_prefix)
 
         objects_to_save = {
-            "dataset": dataset.dataset,
-            "info": dataset.info,
-            "train_subset": dataset.train_subset,
-            "test_subset": dataset.test_subset,
-            "validation_subset": dataset.validation_subset,
+            "train_data": train_data,
             "model": self._model,
             "optimizer": self._optimizer,
             "loss": self._loss
         }
-        torch.save(objects_to_save, os.path.join(output_dir, "hf_saved_objects.obj"))
+        torch.save(objects_to_save, os.path.join(self._temp_dir, "torch_saved_objects.obj"))
+        return self._temp_dir
+
+    def cleanup_saved_objects_for_distributed(self):
+        try:
+            print('Cleaning saved objects...')
+            shutil.rmtree(self._temp_dir)
+        except OSError as ose:
+            print('Error while cleaning the saved objects: {}'.format(ose))
 
     @property
     def num_classes(self):
@@ -272,7 +284,7 @@ def _fit(self, output_dir, dataset, epochs, do_eval, early_stopping, lr_decay):
                     'loss': train_epoch_loss,
                 }, os.path.join(checkpoint_dir, 'checkpoint.pt'))
 
-    def _fit_distributed(self, hostfile, nnodes, nproc_per_node, epochs, batch_size, ipex_optimize):
+    def _fit_distributed(self, saved_objects_dir, hostfile, nnodes, nproc_per_node, epochs, batch_size, ipex_optimize):
         distributed_text_script = os.path.join(TLT_DISTRIBUTED_DIR, "pytorch", "run_train_pyt.py")
 
         default_port = '29500'
@@ -314,6 +326,7 @@ def _fit_distributed(self, hostfile, nnodes, nproc_per_node, epochs, batch_size,
         bash_command += ' --master_addr {}'.format(default_master_addr)
         bash_command += ' --master_port {}'.format(default_port)
         bash_command += ' --backend {}'.format('ccl')
+        bash_command += ' --tlt_saved_objects_dir {}'.format(saved_objects_dir)
         bash_command += ' --use_case {}'.format('text_classification')
         bash_command += ' --epochs {}'.format(epochs)
         bash_command += ' --batch_size {}'.format(batch_size)
@@ -467,11 +480,18 @@ def compute_metrics(p: EvalPrediction):
                 self._history = self._trainer.evaluate()
                 print("Val Acc: {:.5f}".format(self._history.get("eval_accuracy")))
         elif distributed:
-            self.export_for_distributed(
-                output_dir=TLT_DISTRIBUTED_DIR, dataset=dataset
-            )
-            self._fit_distributed(hostfile, nnodes, nproc_per_node, epochs, dataset._preprocessed["batch_size"],
-                                  ipex_optimize)
+            try:
+                saved_objects_dir = self.export_for_distributed(
+                    export_dir=os.path.join(output_dir, 'tlt_saved_objects'),
+                    train_data=dataset.train_subset,
+                    val_data=dataset.validation_subset
+                )
+                self._fit_distributed(saved_objects_dir, hostfile, nnodes, nproc_per_node, epochs,
+                                      dataset._preprocessed["batch_size"], ipex_optimize)
+            except Exception as err:
+                print("Error: \'{}\' occured while distributed training".format(err))
+            finally:
+                self.cleanup_saved_objects_for_distributed()
         else:
             self._trainer = None
             self._model.train()