From 087aa1e5984041199676d513711b953263f163a7 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 24 Mar 2024 17:39:41 +0900
Subject: [PATCH 1/2] Improve handling of missing `vocab_file` attribute in
 HFTokenizerConverter

This commit updates `HFTokenizerConverter` to handle cases where the
`hf_tokenizer` object might not have a `vocab_file` attribute.

Changes:

* Uses `getattr` to retrieve the `vocab_file` attribute, so a missing
  attribute yields `None` instead of raising `AttributeError`
* Stores the retrieved value in a separate variable `vocab_file` for clarity
* Checks that `vocab_file` is not `None` before testing whether the file
  exists on disk

This ensures the converter works correctly even with tokenizers that do not
define a `vocab_file` attribute.
---
 onnxruntime_extensions/_hf_cvt.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/onnxruntime_extensions/_hf_cvt.py b/onnxruntime_extensions/_hf_cvt.py
index e379c9834..3f9bcfeb4 100644
--- a/onnxruntime_extensions/_hf_cvt.py
+++ b/onnxruntime_extensions/_hf_cvt.py
@@ -43,10 +43,11 @@ def convert_json_vocab(hf_tokenizer):
             f"{hf_tokenizer.__name__}: vocab_files_names is not found")
 
         tokenizer_file = filenames["tokenizer_file"]
-        if (hf_tokenizer.vocab_file is None) or (not os.path.exists(hf_tokenizer.vocab_file)):
+        vocab_file = getattr(hf_tokenizer, "vocab_file", None)
+        if (vocab_file is None) or (not os.path.exists(vocab_file)):
             model_dir = hf_tokenizer.name_or_path
         else:
-            model_dir = os.path.dirname(hf_tokenizer.vocab_file)
+            model_dir = os.path.dirname(vocab_file)
         tokenizer_json = json.load(
             open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
         # get vocab object from json file

From 6f5311c995fb5133e14a9c53769ddfd96a519ac4 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 24 Mar 2024 18:18:54 +0900
Subject: [PATCH 2/2] [Experimental] Add GPTNeoXTokenizer support to
 HFTokenizerConverter

This commit adds support for the `GPTNeoXTokenizer` class from Hugging Face
Transformers to the `HFTokenizerConverter` in the `onnxruntime_extensions`
library.

Specifically, it adds a new entry to the `_PROCESSOR_DICT` dictionary for
`GPTNeoXTokenizer`, reusing the existing `HFTokenizerConverter.bpe_tokenizer`
and `HFTokenizerConverter.bpe_decoder` functions for tokenization and
decoding.

This allows GPTNeoX models to be converted for ONNX Runtime through the
`HFTokenizerConverter`.
---
 onnxruntime_extensions/_hf_cvt.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime_extensions/_hf_cvt.py b/onnxruntime_extensions/_hf_cvt.py
index 3f9bcfeb4..6113e86a9 100644
--- a/onnxruntime_extensions/_hf_cvt.py
+++ b/onnxruntime_extensions/_hf_cvt.py
@@ -182,6 +182,8 @@ def spm_decoder(self, **kwargs):
                           'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "CodeGenTokenizer":   TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
                           'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+    "GPTNeoXTokenizer":   TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
+                          'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "CLIPTokenizer":      TokenOpParam('CLIPTokenizer', HFTokenizerConverter.clip_tokenizer,
                           'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
     "RobertaTokenizer":   TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
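
Illustrative note (not part of the patch): the self-contained Python sketch
below shows the failure mode the first commit avoids. TokenizerWithoutVocabFile
is a hypothetical stand-in, not a real Hugging Face tokenizer class; direct
attribute access raises AttributeError on such an object, while the patched
getattr-based lookup yields None and falls back to name_or_path.

    import os

    class TokenizerWithoutVocabFile:
        # Hypothetical stand-in: defines name_or_path but no vocab_file,
        # mirroring tokenizers that ship only a tokenizer.json.
        name_or_path = "/tmp/model-dir"

    hf_tokenizer = TokenizerWithoutVocabFile()

    # Pre-patch behaviour: raises AttributeError before the None check can run.
    try:
        _ = hf_tokenizer.vocab_file is None
    except AttributeError:
        print("direct access fails on tokenizers without vocab_file")

    # Patched behaviour: a missing attribute yields None, so the converter
    # falls back to the model directory given by name_or_path.
    vocab_file = getattr(hf_tokenizer, "vocab_file", None)
    if (vocab_file is None) or (not os.path.exists(vocab_file)):
        model_dir = hf_tokenizer.name_or_path
    else:
        model_dir = os.path.dirname(vocab_file)
    print("model_dir:", model_dir)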
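
Illustrative note (not part of the patch): a minimal sketch of how the new
GPTNeoXTokenizer entry might be exercised. It assumes the package's
gen_processing_models entry point and the public EleutherAI/gpt-neox-20b
checkpoint; neither is referenced by the patch, so treat both as assumptions.

    from transformers import AutoTokenizer
    from onnxruntime_extensions import gen_processing_models

    # GPT-NeoX checkpoints ship a BPE tokenizer; with the new _PROCESSOR_DICT
    # entry it is handled by the GPT2Tokenizer / BpeDecoder custom ops.
    hf_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    pre_model, post_model = gen_processing_models(hf_tokenizer,
                                                  pre_kwargs={},
                                                  post_kwargs={})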