|
137 | 137 | "    objects for train and evaluation splits, along with helper functions for preprocessing the dataset.\n",
138 | 138 | " \"\"\"\n", |
139 | 139 | "\n", |
140 | | - " def __init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key):\n", |
| 140 | + " def __init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key, max_length=None):\n", |
141 | 141 | " self.tokenizer = tokenizer\n", |
142 | 142 | " self.dataset_name = dataset_name\n", |
143 | 143 | " self.class_labels = None\n", |
|
150 | 150 | " self.sentence1_key = sentence1_key\n", |
151 | 151 | " self.sentence2_key = sentence2_key\n", |
152 | 152 | " self.label_key = label_key\n", |
| 153 | + "\n", |
| 154 | + " # Max sequence length\n", |
| 155 | + " self.max_length = max_length\n", |
153 | 156 | " \n", |
154 | 157 | " def tokenize_function(self, examples):\n", |
155 | 158 | "        # Define the tokenizer args, depending on whether the data has 2 sentences or just 1\n",
156 | 159 | " args = ((examples[self.sentence1_key],) if self.sentence2_key is None \\\n", |
157 | 160 | " else (examples[self.sentence1_key], examples[self.sentence2_key]))\n", |
158 | | - " return self.tokenizer(*args, padding=\"max_length\", truncation=True)\n", |
| 161 | + " return self.tokenizer(*args, padding=\"max_length\", truncation=True, max_length=self.max_length)\n", |
159 | 162 | " \n", |
160 | 163 | " def tokenize_dataset(self, dataset):\n", |
161 | 164 | " # Apply the tokenize function to the dataset\n", |
|
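For context on the change above: with Hugging Face tokenizers, padding="max_length" combined with max_length=None pads every sequence to the tokenizer's model_max_length, while an explicit max_length pads and truncates to that value instead. A minimal sketch of that behavior (the bert-base-uncased checkpoint is an assumption for illustration):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint

# max_length=None: pads to the tokenizer's model_max_length (512 for this model)
full = tokenizer("a short example", padding="max_length", truncation=True, max_length=None)
print(len(full["input_ids"]))  # 512

# explicit max_length: pads/truncates to 64 tokens instead
short = tokenizer("a short example", padding="max_length", truncation=True, max_length=64)
print(len(short["input_ids"]))  # 64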
232 | 235 | " \"\"\"\n", |
233 | 236 | " \n", |
234 | 237 | " def __init__(self, tokenizer, dataset_dir, dataset_name, train_size, eval_size, train_split_name,\n", |
235 | | - " eval_split_name, sentence1_key, sentence2_key, label_key):\n", |
| 238 | + " eval_split_name, sentence1_key, sentence2_key, label_key, max_length=None):\n", |
236 | 239 | " \"\"\"\n", |
237 | 240 | " Initialize the HFDSTextClassificationData class for a text classification dataset from Hugging Face.\n", |
238 | 241 | " \n", |
|
249 | 252 | " :param sentence1_key: Name of the sentence1 column\n", |
250 | 253 | " :param sentence2_key: Name of the sentence2 column or `None` if there's only one text column\n", |
251 | 254 | " :param label_key: Name of the label column\n", |
| 255 | + "        :param max_length: Optional max sequence length (if None, the tokenizer's maximum sequence length is used)\n",
252 | 256 | " \"\"\"\n", |
253 | 257 | "\n", |
254 | 258 | " # Init base class\n", |
255 | | - " TextClassificationData.__init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key) \n", |
| 259 | + "        TextClassificationData.__init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key,\n",
| 260 | + "                                        max_length)\n",
256 | 261 | " \n", |
257 | 262 | " # Load the dataset from the Hugging Face dataset API\n", |
258 | 263 | " self.dataset = load_dataset(dataset_name, cache_dir=dataset_dir)\n", |
|
279 | 284 | "sentence2_key = None\n", |
280 | 285 | "label_key = \"label\"\n", |
281 | 286 | "\n", |
| 287 | + "# Max sequence length\n", |
| 288 | + "max_length = None\n", |
| 289 | + "\n", |
282 | 290 | "dataset = HFDSTextClassificationData(tokenizer, dataset_dir, dataset_name, train_dataset_size, eval_dataset_size,\n", |
283 | | - " Split.TRAIN, Split.TEST, sentence1_key, sentence2_key, label_key)\n", |
| 291 | + " Split.TRAIN, Split.TEST, sentence1_key, sentence2_key, label_key, max_length)\n", |
284 | 292 | "\n", |
285 | 293 | "# Print a sample of the data\n", |
286 | 294 | "dataset.display_sample(Split.TRAIN, sample_size=5)" |
|
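With max_length = None as above, every example is padded to the tokenizer's full maximum. If memory or throughput matters, a shorter explicit value can be passed instead; a sketch reusing the variables already defined in the notebook cell above (the value 128 is an arbitrary assumption):

max_length = 128  # hypothetical shorter limit; any value up to the model maximum works

dataset = HFDSTextClassificationData(tokenizer, dataset_dir, dataset_name, train_dataset_size, eval_dataset_size,
                                     Split.TRAIN, Split.TEST, sentence1_key, sentence2_key, label_key, max_length)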
326 | 334 | " \"\"\"\n", |
327 | 335 | " \n", |
328 | 336 | " def __init__(self, tokenizer, dataset_name, dataset_dir, data_files, delimiter, label_names, sentence1_key, sentence2_key,\n", |
329 | | - " label_key, train_percent=0.8, eval_percent=0.2, train_size=None, eval_size=None, map_function=None):\n", |
| 337 | + " label_key, train_percent=0.8, eval_percent=0.2, train_size=None, eval_size=None, map_function=None,\n", |
| 338 | + " max_length=None):\n", |
330 | 339 | " \"\"\"\n", |
331 | 340 | "        Initialize the CustomCsvTextClassificationData class for a text classification\n",
332 | 341 | "        dataset. The class uses the Hugging Face datasets API to load the CSV file,\n",
|
352 | 361 | " :param eval_size: Size of the eval dataset. Set to `None` to use all the data.\n", |
353 | 362 | " :param map_function: (Optional) Map function to apply to the dataset. For example, if the csv file has string\n", |
354 | 363 | " labels instead of numerical values, map function can do the conversion.\n", |
| 364 | + "        :param max_length: Optional max sequence length (if None, the tokenizer's maximum sequence length is used)\n",
355 | 365 | " \"\"\"\n", |
356 | 366 | " # Init base class\n", |
357 | | - " TextClassificationData.__init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key)\n", |
| 367 | + " TextClassificationData.__init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key, max_length)\n", |
358 | 368 | " \n", |
359 | 369 | " if (train_percent + eval_percent) > 1:\n", |
360 | 370 | " raise ValueError(\"The combined value of the train percentage and eval percentage \" \\\n", |
|
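The percent-based train/eval split itself falls outside this hunk; one plausible implementation uses the datasets API's train_test_split, sketched below under that assumption (the toy rows are invented for illustration):

from datasets import Dataset

# Toy data standing in for the loaded CSV; train_test_split is the real datasets API.
raw = Dataset.from_dict({"sms": ["a", "b", "c", "d", "e"], "label": [0, 1, 0, 1, 0]})
split = raw.train_test_split(train_size=0.8, test_size=0.2, seed=42)
print(len(split["train"]), len(split["test"]))  # 4 1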
408 | 418 | "sentence2_key = None\n", |
409 | 419 | "label_key = \"label\"\n", |
410 | 420 | "\n", |
| 421 | + "# Max sequence length\n", |
| 422 | + "max_length = None\n", |
| 423 | + "\n", |
411 | 424 | "# Map function to translate labels in the csv file to numerical values when loading the dataset\n", |
412 | 425 | "def map_spam(example):\n", |
413 | 426 | " example[\"label\"] = int(example[\"label\"] == \"spam\")\n", |
414 | 427 | " return example\n", |
415 | 428 | "\n", |
416 | 429 | "dataset = CustomCsvTextClassificationData(tokenizer, \"smsspamcollection\", dataset_dir, [renamed_csv], delimiter,\n", |
417 | 430 | " label_names, sentence1_key, sentence2_key, label_key, train_size=1000,\n", |
418 | | - " eval_size=1000, map_function=map_spam)\n", |
| 431 | + " eval_size=1000, map_function=map_spam, max_length=max_length)\n", |
419 | 432 | "\n", |
420 | 433 | "# Print a sample of the data\n", |
421 | 434 | "dataset.display_sample(Split.TRAIN, 10)" |
|
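For reference, the map_spam function can be checked in isolation with the datasets API; the class is assumed to apply map_function the same way internally. A minimal, self-contained sketch with invented sample rows:

from datasets import Dataset

# Same mapping as the notebook cell above: "spam" -> 1, anything else -> 0
def map_spam(example):
    example["label"] = int(example["label"] == "spam")
    return example

sample = Dataset.from_dict({"sms": ["Win a prize now!", "See you at 5"], "label": ["spam", "ham"]})
mapped = sample.map(map_spam)
print(mapped["label"])  # [1, 0]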