From 087aa1e5984041199676d513711b953263f163a7 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 24 Mar 2024 17:39:41 +0900
Subject: [PATCH 1/2] Improve handling of missing `vocab_file` attribute in
 HFTokenizerConverter

This commit updates `HFTokenizerConverter` to handle cases where the
`hf_tokenizer` object might not have a `vocab_file` attribute.

Changes:

* Uses `getattr` to retrieve the `vocab_file` attribute, so a missing
  attribute yields `None` instead of raising `AttributeError`
* Stores the retrieved value in a separate variable `vocab_file` for clarity
* Checks that `vocab_file` is not `None` before testing whether the file
  exists on disk

This ensures the converter works correctly even with tokenizers that do not
define a `vocab_file` attribute.
---
 onnxruntime_extensions/_hf_cvt.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/onnxruntime_extensions/_hf_cvt.py b/onnxruntime_extensions/_hf_cvt.py
index e379c9834..3f9bcfeb4 100644
--- a/onnxruntime_extensions/_hf_cvt.py
+++ b/onnxruntime_extensions/_hf_cvt.py
@@ -43,10 +43,11 @@ def convert_json_vocab(hf_tokenizer):
             f"{hf_tokenizer.__name__}: vocab_files_names is not found")
 
         tokenizer_file = filenames["tokenizer_file"]
-        if (hf_tokenizer.vocab_file is None) or (not os.path.exists(hf_tokenizer.vocab_file)):
+        vocab_file = getattr(hf_tokenizer, "vocab_file", None)
+        if (vocab_file is None) or (not os.path.exists(vocab_file)):
             model_dir = hf_tokenizer.name_or_path
         else:
-            model_dir = os.path.dirname(hf_tokenizer.vocab_file)
+            model_dir = os.path.dirname(vocab_file)
         tokenizer_json = json.load(
             open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
         # get vocab object from json file

From 6f5311c995fb5133e14a9c53769ddfd96a519ac4 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 24 Mar 2024 18:18:54 +0900
Subject: [PATCH 2/2] [Experimental] Add GPTNeoXTokenizer support to
 HFTokenizerConverter

This commit adds support for the `GPTNeoXTokenizer` class from Hugging Face
Transformers to the `HFTokenizerConverter` in the `onnxruntime_extensions`
library.

Specifically, it adds a new entry to the `_PROCESSOR_DICT` dictionary for
`GPTNeoXTokenizer`, reusing the existing `HFTokenizerConverter.bpe_tokenizer`
and `HFTokenizerConverter.bpe_decoder` functions for tokenization and
decoding.

This allows GPTNeoX models to be converted for ONNX Runtime through the
`HFTokenizerConverter`.
---
 onnxruntime_extensions/_hf_cvt.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime_extensions/_hf_cvt.py b/onnxruntime_extensions/_hf_cvt.py
index 3f9bcfeb4..6113e86a9 100644
--- a/onnxruntime_extensions/_hf_cvt.py
+++ b/onnxruntime_extensions/_hf_cvt.py
@@ -182,6 +182,8 @@ def spm_decoder(self, **kwargs):
                           'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "CodeGenTokenizer":   TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
                           'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+    "GPTNeoXTokenizer":   TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
+                          'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "CLIPTokenizer":      TokenOpParam('CLIPTokenizer', HFTokenizerConverter.clip_tokenizer,
                           'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "RobertaTokenizer":   TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
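
Illustrative note (not part of the patch): the self-contained Python sketch
below shows the failure mode the first commit avoids. TokenizerWithoutVocabFile
is a hypothetical stand-in, not a real Hugging Face tokenizer class; direct
attribute access raises AttributeError on such an object, while the patched
getattr-based lookup yields None and falls back to name_or_path.

    import os

    class TokenizerWithoutVocabFile:
        # Hypothetical stand-in: defines name_or_path but no vocab_file,
        # mirroring tokenizers that ship only a tokenizer.json.
        name_or_path = "/tmp/model-dir"

    hf_tokenizer = TokenizerWithoutVocabFile()

    # Pre-patch behaviour: raises AttributeError before the None check can run.
    try:
        _ = hf_tokenizer.vocab_file is None
    except AttributeError:
        print("direct access fails on tokenizers without vocab_file")

    # Patched behaviour: a missing attribute yields None, so the converter
    # falls back to the model directory given by name_or_path.
    vocab_file = getattr(hf_tokenizer, "vocab_file", None)
    if (vocab_file is None) or (not os.path.exists(vocab_file)):
        model_dir = hf_tokenizer.name_or_path
    else:
        model_dir = os.path.dirname(vocab_file)
    print("model_dir:", model_dir)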
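
Illustrative note (not part of the patch): a minimal sketch of how the new
GPTNeoXTokenizer entry might be exercised. It assumes the package's
gen_processing_models entry point and the public EleutherAI/gpt-neox-20b
checkpoint; neither is referenced by the patch, so treat both as assumptions.

    from transformers import AutoTokenizer
    from onnxruntime_extensions import gen_processing_models

    # GPT-NeoX checkpoints ship a BPE tokenizer; with the new _PROCESSOR_DICT
    # entry it is handled by the GPT2Tokenizer / BpeDecoder custom ops.
    hf_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    pre_model, post_model = gen_processing_models(hf_tokenizer,
                                                  pre_kwargs={},
                                                  post_kwargs={})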