Fix for CLI Text Classification Benchmark / Quantize (#404)

okhleif-IL · web-flow · commit f209471e0141 · 2023-08-23T14:50:55.000-07:00
* Added the NC_ENV_CONF environment variable (set to "True" at the start of the benchmark methods) to fix benchmark / quantize

* Added the env var to the HF model as well (tlt/models/hf_model.py)
diff --git a/tlt/datasets/text_classification/tf_custom_text_classification_dataset.py b/tlt/datasets/text_classification/tf_custom_text_classification_dataset.py
@@ -75,7 +75,7 @@ class TFCustomTextClassificationDataset(TextClassificationDataset, TFDataset):
 
     """
 
-    def __init__(self, dataset_dir, dataset_name, csv_file_name, class_names, label_map_func=None,
+    def __init__(self, dataset_dir, dataset_name, csv_file_name, class_names=[], label_map_func=None,
                  defaults=[tf.string, tf.string], delimiter=",", header=False, select_cols=None, exclude_cols=None,
                  shuffle_files=True, seed=None, **kwargs):
         """
@@ -85,11 +85,6 @@ def __init__(self, dataset_dir, dataset_name, csv_file_name, class_names, label_
         if not os.path.exists(dataset_file):
             raise FileNotFoundError("The dataset file ({}) does not exist".format(dataset_file))
 
-        if not isinstance(class_names, list):
-            raise TypeError("The class_names is expected to be a list, but found a {}", type(class_names))
-        if len(class_names) == 0:
-            raise ValueError("The class_names list cannot be empty.")
-
         if label_map_func and not callable(label_map_func):
             raise TypeError("The label_map_func is expected to be a function, but found a {}", type(label_map_func))
 
diff --git a/tlt/models/hf_model.py b/tlt/models/hf_model.py
@@ -176,6 +176,8 @@ def benchmark(self, dataset, saved_model_dir=None, warmup=10, iteration=100, cor
             FileNotFoundError: if a model.pt is not found in the saved_model_dir or if the inc_config_path file
             is not found
         """
+        os.environ["NC_ENV_CONF"] = "True"
+
         # Verify dataset is of the right type
         if not isinstance(dataset, self._inc_compatible_dataset):
             raise NotImplementedError('Quantization has only been implemented for TLT datasets, and type '
diff --git a/tlt/models/pytorch_model.py b/tlt/models/pytorch_model.py
@@ -275,6 +275,8 @@ def benchmark(self, dataset, saved_model_dir=None, warmup=10, iteration=100, cor
             FileNotFoundError: if a model.pt is not found in the saved_model_dir or if the inc_config_path file
             is not found
         """
+        os.environ["NC_ENV_CONF"] = "True"
+
         # Verify dataset is of the right type
         if not isinstance(dataset, self._inc_compatible_dataset):
             raise NotImplementedError('Quantization has only been implemented for TLT datasets, and type '
diff --git a/tlt/models/tf_model.py b/tlt/models/tf_model.py
@@ -409,6 +409,8 @@ def benchmark(self, dataset, saved_model_dir=None, warmup=10, iteration=100, cor
             FileNotFoundError: if a saved_model.pb is not found in the saved_model_dir or if the inc_config_path file
             is not found
         """
+        os.environ["NC_ENV_CONF"] = "True"
+
         # If provided, the saved model directory should exist and contain a saved_model.pb file
         if saved_model_dir is not None:
             if not os.path.isdir(saved_model_dir):
diff --git a/tlt/tools/cli/commands/train.py b/tlt/tools/cli/commands/train.py
@@ -276,6 +276,9 @@ def train(framework, model_name, use_case, output_dir, dataset_dir, dataset_file
                 if not class_names:
                     raise ValueError("Loading a text classification dataset requires --class-names to specify a list "
                                      "of the class labels for the dataset.")
+                elif len(class_names) == 0:
+                    raise ValueError("Loading a text classification dataset requires --class-names to specify a list "
+                                     "of the class labels of which the len > 0")
                 dataset = dataset_factory.load_dataset(dataset_dir, model.use_case, model.framework, dataset_name,
                                                        class_names=class_names, csv_file_name=dataset_file,
                                                        delimiter=delimiter)