
Commit a3ad375

Merge branch 'main' into os/fix-onig-build-gcc-15
2 parents c2b1b5d + b4d8dfc commit a3ad375

File tree

35 files changed: 84 additions, 89 deletions

.github/workflows/rust.yml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ jobs:
   build:
     runs-on: ${{ matrix.os }}
     env:
-      MACOSX_DEPLOYMENT_TARGET: 10.11
+      MACOSX_DEPLOYMENT_TARGET: 10.12
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest, macOS-latest]

bindings/node/lib/bindings/encoding.test.ts

Lines changed: 1 addition & 1 deletion
@@ -122,7 +122,7 @@ describe('Encoding', () => {
     expect(indexes).toEqual([3, 5])
   })

-  it('returns the corrent indexes with pair sequences', () => {
+  it('returns the correct indexes with pair sequences', () => {
     expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
     expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
   })

bindings/python/Cargo.toml

Lines changed: 4 additions & 4 deletions
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = ["rc", "derive"] }
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
-pyo3 = { version = "0.23", features = ["abi3", "abi3-py39", "py-clone"] }
-numpy = "0.23"
+pyo3 = { version = "0.24.2", features = ["abi3", "abi3-py39", "py-clone"] }
+numpy = "0.24"
 ndarray = "0.16"
 itertools = "0.12"

@@ -24,7 +24,7 @@ path = "../../tokenizers"

 [dev-dependencies]
 tempfile = "3.10"
-pyo3 = { version = "0.23", features = ["auto-initialize"] }
+pyo3 = { version = "0.24.2", features = ["auto-initialize"] }

 [features]
-defaut = ["pyo3/extension-module"]
+default = ["pyo3/extension-module"]

bindings/python/py_src/tokenizers/decoders/__init__.pyi

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ class BPEDecoder(Decoder):

     Args:
         suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-            The suffix that was used to caracterize an end-of-word. This suffix will
+            The suffix that was used to characterize an end-of-word. This suffix will
             be replaced by whitespaces during the decoding
     """
     def __init__(self, suffix="</w>"):
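The docstring corrected here (and its Rust counterpart in bindings/python/src/decoders.rs below) describes how BPEDecoder turns the end-of-word suffix back into spaces. A minimal illustration through the Python bindings, using made-up tokens rather than anything from this diff:

from tokenizers import decoders

# BPEDecoder replaces the end-of-word suffix ("</w>") with a space while decoding.
decoder = decoders.BPEDecoder(suffix="</w>")
print(decoder.decode(["hel", "lo</w>", "wor", "ld</w>"]))  # "hello world"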

bindings/python/py_src/tokenizers/implementations/byte_level_bpe.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
     def __init__(
         self,
         vocab: Optional[Union[str, Dict[str, int]]] = None,
-        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
         add_prefix_space: bool = False,
         lowercase: bool = False,
         dropout: Optional[float] = None,

bindings/python/py_src/tokenizers/implementations/char_level_bpe.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ class CharBPETokenizer(BaseTokenizer):
     def __init__(
         self,
         vocab: Optional[Union[str, Dict[str, int]]] = None,
-        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
         unk_token: Union[str, AddedToken] = "<unk>",
         suffix: str = "</w>",
         dropout: Optional[float] = None,

bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
     def __init__(
         self,
         vocab: Optional[Union[str, Dict[str, int]]] = None,
-        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
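The same annotation fix appears in ByteLevelBPETokenizer, CharBPETokenizer and SentencePieceBPETokenizer: merges is a list of string pairs (or a path to a merges file), not a dict keyed by id pairs. A small sketch of what the corrected signature allows, using a toy vocab and merge list invented purely for illustration:

from tokenizers import ByteLevelBPETokenizer

# Toy in-memory vocab (token -> id) and ordered merge list of string pairs,
# matching the corrected annotations; real models usually load these from files.
vocab = {"h": 0, "e": 1, "l": 2, "o": 3, "he": 4, "ll": 5, "hell": 6, "hello": 7}
merges = [("h", "e"), ("l", "l"), ("he", "ll"), ("hell", "o")]

tokenizer = ByteLevelBPETokenizer(vocab=vocab, merges=merges)
print(tokenizer.encode("hello").tokens)  # ["hello"]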

bindings/python/scripts/convert.py

Lines changed: 1 addition & 1 deletion
@@ -397,7 +397,7 @@ def main():
         "--models",
         type=lambda s: s.split(","),
         default=pretraineds,
-        help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+        help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
     )
     args = parser.parse_args()

bindings/python/src/decoders.rs

Lines changed: 1 addition & 1 deletion
@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
 ///
 /// Args:
 ///     suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-///         The suffix that was used to caracterize an end-of-word. This suffix will
+///         The suffix that was used to characterize an end-of-word. This suffix will
 ///         be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
 pub struct PyBPEDecoder {}

bindings/python/tests/bindings/test_trainers.py

Lines changed: 6 additions & 5 deletions
@@ -14,7 +14,7 @@
     trainers,
 )

-from ..utils import data_dir, train_files
+from ..utils import data_dir, train_files, DATA_PATH


 class TestBpeTrainer:
@@ -287,16 +287,17 @@ def test_can_modify(self):
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]

-    def test_continuing_prefix_trainer_mismatch(self):
+    def test_continuing_prefix_trainer_mismatch(self, train_files):
         UNK = "[UNK]"
         special_tokens = [UNK]
         tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
         trainer = trainers.BpeTrainer(special_tokens=special_tokens)
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
         )
-        tokenizer.train(files=["data/big.txt"], trainer=trainer)
+        tokenizer.train(files=[train_files["big"]], trainer=trainer)

-        tokenizer.save("data/tokenizer.json")
+        tokenizer_json = os.path.join(DATA_PATH, "tokenizer.json")
+        tokenizer.save(tokenizer_json)

-        tokenizer.from_file("data/tokenizer.json")
+        tokenizer.from_file(tokenizer_json)
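The updated test no longer hard-codes relative paths; it resolves them through the train_files fixture and the DATA_PATH constant from the test suite's utils module. Purely as a hedged sketch of the kind of helpers this assumes (the actual definitions in bindings/python/tests/utils.py may differ, for example by downloading big.txt on demand):

# Hypothetical sketch only; the real tests/utils.py may fetch corpora on demand
# and expose more fixtures than shown here.
import os

import pytest

DATA_PATH = os.path.join(os.path.dirname(__file__), "data")


@pytest.fixture(scope="session")
def train_files():
    # Map short names to absolute corpus paths so tests stop hard-coding "data/big.txt".
    return {"big": os.path.join(DATA_PATH, "big.txt")}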
