Skip to content

Commit 673661e

Browse files
committed
Merge branch 'main' into main-public
2 parents f77deff + b087eb7 commit 673661e

File tree

9 files changed

+33
-20
lines changed

9 files changed

+33
-20
lines changed

notebooks/requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Pillow==10.1.0
1+
Pillow==10.2.0
22
PyYAML==6.0.1
33
charset-normalizer==3.3.0
44
datasets==2.14.5
@@ -13,7 +13,7 @@ jmespath==1.0.1
1313
matplotlib-inline==0.1.6
1414
matplotlib==3.7.3; python_version=='3.8'
1515
matplotlib==3.8.0; python_version>'3.8'
16-
notebook==7.0.5
16+
notebook==7.0.7
1717
numpy==1.23.5
1818
opencv-python==4.8.1.78
1919
pandas==2.0.3; python_version=='3.8'
@@ -30,6 +30,6 @@ tensorflow-datasets==4.9.3; python_version>'3.8'
3030
tensorflow-hub==0.15.0
3131
torch==1.13.1
3232
torchvision==0.14.1
33-
transformers==4.34.0
33+
transformers==4.36.0
3434
urllib3==2.0.7
3535
evaluate==0.4.0

notebooks/text_classification/pytorch_text_classification/PyTorch_Text_Classifier_fine_tuning.ipynb

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@
137137
" objects for train and evaluations splits, along with helper functions for preprocessing the dataset.\n",
138138
" \"\"\"\n",
139139
"\n",
140-
" def __init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key):\n",
140+
" def __init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key, max_length=None):\n",
141141
" self.tokenizer = tokenizer\n",
142142
" self.dataset_name = dataset_name\n",
143143
" self.class_labels = None\n",
@@ -150,12 +150,15 @@
150150
" self.sentence1_key = sentence1_key\n",
151151
" self.sentence2_key = sentence2_key\n",
152152
" self.label_key = label_key\n",
153+
"\n",
154+
" # Max sequence length\n",
155+
" self.max_length = max_length\n",
153156
" \n",
154157
" def tokenize_function(self, examples):\n",
155158
" # Define the tokenizer args, depending on if the data has 2 sentences or just 1\n",
156159
" args = ((examples[self.sentence1_key],) if self.sentence2_key is None \\\n",
157160
" else (examples[self.sentence1_key], examples[self.sentence2_key]))\n",
158-
" return self.tokenizer(*args, padding=\"max_length\", truncation=True)\n",
161+
" return self.tokenizer(*args, padding=\"max_length\", truncation=True, max_length=self.max_length)\n",
159162
" \n",
160163
" def tokenize_dataset(self, dataset):\n",
161164
" # Apply the tokenize function to the dataset\n",
@@ -232,7 +235,7 @@
232235
" \"\"\"\n",
233236
" \n",
234237
" def __init__(self, tokenizer, dataset_dir, dataset_name, train_size, eval_size, train_split_name,\n",
235-
" eval_split_name, sentence1_key, sentence2_key, label_key):\n",
238+
" eval_split_name, sentence1_key, sentence2_key, label_key, max_length=None):\n",
236239
" \"\"\"\n",
237240
" Initialize the HFDSTextClassificationData class for a text classification dataset from Hugging Face.\n",
238241
" \n",
@@ -249,10 +252,12 @@
249252
" :param sentence1_key: Name of the sentence1 column\n",
250253
" :param sentence2_key: Name of the sentence2 column or `None` if there's only one text column\n",
251254
" :param label_key: Name of the label column\n",
255+
" :param max_length: Optional max sequence length (default None will use the tokenizer's max sequence)\n",
252256
" \"\"\"\n",
253257
"\n",
254258
" # Init base class\n",
255-
" TextClassificationData.__init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key) \n",
259+
" TextClassificationData.__init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key,\n",
260+
" max_length) \n",
256261
" \n",
257262
" # Load the dataset from the Hugging Face dataset API\n",
258263
" self.dataset = load_dataset(dataset_name, cache_dir=dataset_dir)\n",
@@ -279,8 +284,11 @@
279284
"sentence2_key = None\n",
280285
"label_key = \"label\"\n",
281286
"\n",
287+
"# Max sequence length\n",
288+
"max_length = None\n",
289+
"\n",
282290
"dataset = HFDSTextClassificationData(tokenizer, dataset_dir, dataset_name, train_dataset_size, eval_dataset_size,\n",
283-
" Split.TRAIN, Split.TEST, sentence1_key, sentence2_key, label_key)\n",
291+
" Split.TRAIN, Split.TEST, sentence1_key, sentence2_key, label_key, max_length)\n",
284292
"\n",
285293
"# Print a sample of the data\n",
286294
"dataset.display_sample(Split.TRAIN, sample_size=5)"
@@ -326,7 +334,8 @@
326334
" \"\"\"\n",
327335
" \n",
328336
" def __init__(self, tokenizer, dataset_name, dataset_dir, data_files, delimiter, label_names, sentence1_key, sentence2_key,\n",
329-
" label_key, train_percent=0.8, eval_percent=0.2, train_size=None, eval_size=None, map_function=None):\n",
337+
" label_key, train_percent=0.8, eval_percent=0.2, train_size=None, eval_size=None, map_function=None,\n",
338+
" max_length=None):\n",
330339
" \"\"\"\n",
331340
" Intialize the CustomCsvTextClassificationData class for a text classification\n",
332341
" dataset. The classes uses the Hugging Face datasets API to load the CSV file,\n",
@@ -352,9 +361,10 @@
352361
" :param eval_size: Size of the eval dataset. Set to `None` to use all the data.\n",
353362
" :param map_function: (Optional) Map function to apply to the dataset. For example, if the csv file has string\n",
354363
" labels instead of numerical values, map function can do the conversion.\n",
364+
" :param max_length: Optional max sequence length (default None will use the tokenizer's max sequence) \n",
355365
" \"\"\"\n",
356366
" # Init base class\n",
357-
" TextClassificationData.__init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key)\n",
367+
" TextClassificationData.__init__(self, dataset_name, tokenizer, sentence1_key, sentence2_key, label_key, max_length)\n",
358368
" \n",
359369
" if (train_percent + eval_percent) > 1:\n",
360370
" raise ValueError(\"The combined value of the train percentage and eval percentage \" \\\n",
@@ -408,14 +418,17 @@
408418
"sentence2_key = None\n",
409419
"label_key = \"label\"\n",
410420
"\n",
421+
"# Max sequence length\n",
422+
"max_length = None\n",
423+
"\n",
411424
"# Map function to translate labels in the csv file to numerical values when loading the dataset\n",
412425
"def map_spam(example):\n",
413426
" example[\"label\"] = int(example[\"label\"] == \"spam\")\n",
414427
" return example\n",
415428
"\n",
416429
"dataset = CustomCsvTextClassificationData(tokenizer, \"smsspamcollection\", dataset_dir, [renamed_csv], delimiter,\n",
417430
" label_names, sentence1_key, sentence2_key, label_key, train_size=1000,\n",
418-
" eval_size=1000, map_function=map_spam)\n",
431+
" eval_size=1000, map_function=map_spam, max_length=max_length)\n",
419432
"\n",
420433
"# Print a sample of the data\n",
421434
"dataset.display_sample(Split.TRAIN, 10)"

pytorch_requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@ protobuf==4.24.4
77
python-dateutil==2.8.2
88
torch==1.13.1
99
torchvision==0.14.1
10-
transformers[torch]==4.34.0
10+
transformers[torch]==4.36.0

tensorflow_requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ numpy==1.23.5
44
tensorflow-datasets==4.9.2; python_version=='3.8'
55
tensorflow-datasets==4.9.3; python_version>'3.8'
66
tensorflow-hub==0.15.0
7-
transformers[tensorflow]==4.34.0
7+
transformers[tensorflow]==4.36.0

tests/pytorch_tests/test_text_classification.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def test_pyt_text_classification(model_name, dataset_name, extra_layers, correct
5353

5454
# Preprocess the dataset
5555
dataset.preprocess(model_name, batch_size=32)
56-
dataset.shuffle_split(train_pct=0.02, val_pct=0.01, seed=6)
56+
dataset.shuffle_split(train_pct=0.1, val_pct=0.01, seed=6)
5757
assert dataset._validation_type == 'shuffle_split'
5858

5959
# Evaluate before training
@@ -157,8 +157,7 @@ def test_pyt_text_classification_trainer(model_name, dataset_name):
157157
saved_model_dir = model.export(output_dir)
158158
assert os.path.isdir(saved_model_dir)
159159
assert os.path.isfile(os.path.join(saved_model_dir, "config.json"))
160-
assert os.path.isfile(os.path.join(saved_model_dir, "pytorch_model.bin"))
161-
160+
assert os.path.isfile(os.path.join(saved_model_dir, "model.safetensors"))
162161
# Load the saved model using load_model and verify that a prediction matches the original model
163162
loaded_model = model_factory.load_model(model_name, saved_model_dir, framework,
164163
'text_classification', model_hub='huggingface')

tests/requirements-test.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Pillow==10.1.0
1+
Pillow==10.2.0
22
flake8==6.1.0
33
mock==5.1.0
44
pytest-cov==4.1.0

tlt/models/text_classification/pytorch_hf_text_classification_model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,8 @@ def load_from_directory(self, model_dir: str):
820820
# Load torch model
821821
PyTorchModel.load_from_directory(self, model_dir)
822822
elif os.path.exists(os.path.join(model_dir, 'config.json')) and \
823-
os.path.exists(os.path.join(model_dir, 'pytorch_model.bin')):
823+
(os.path.exists(os.path.join(model_dir, 'pytorch_model.bin')) or
824+
os.path.exists(os.path.join(model_dir, 'model.safetensors'))):
824825
# Load model using the transformers method
825826
self._model = AutoModelForSequenceClassification.from_pretrained(model_dir)
826827
self._optimizer = self._optimizer_class(self._model.parameters(), lr=self._learning_rate)

workflows/disease_prediction/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Pillow==10.1.0
1+
Pillow==10.2.0
22
PyYAML==6.0.1
33
intel-tensorflow==2.12.0
44
neural-compressor==2.1.1

workflows/vision_anomaly_detection/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Pillow==10.1.0
1+
Pillow==10.2.0
22
PyYAML==6.0.1
33
dill==0.3.7
44
intel-extension-for-pytorch==1.13.100

0 commit comments

Comments (0)