diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
index 672aebb8d..adad6f53b 100644
--- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
     ByteFallback Decoder
     ByteFallback is a simple trick which converts tokens looking like `<0x61>`
     to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get � instead for each inconvertible byte token
+    cannot be decoded you will get � instead for each inconvertable byte token
     """
     def __init__(self):
         pass
diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
index 1f5555104..8c4e744d1 100644
--- a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
     """
     Precompiled normalizer
-    Don't use manually it is used for compatibility for SentencePiece.
+    Don't use manually it is used for compatiblity for SentencePiece.
     """
     def __init__(self, precompiled_charsmap):
         pass
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index 6f31ff3a2..ea1b4954e 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
     BertPreTokenizer

     This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurrence of a punctuation character will be treated separately.
+    Each occurence of a punctuation character will be treated separately.
     """
     def __init__(self):
         pass
@@ -421,11 +421,11 @@ class Split(PreTokenizer):

     Args:
         pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
-            If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
+            A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`.
+            If you want to use a regex pattern, it has to be wrapped around a `tokenizer.Regex`,
             otherwise we consider is as a string pattern. For example
             `pattern="|"` means you want to split on `|` (imagine a csv file for example), while
-            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
+            `patter=tokenizer.Regex("1|2")` means you split on either '1' or '2'.
         behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
             The behavior to use when splitting.
             Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 4a408ff1d..59ebca5b4 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -603,24 +603,6 @@ impl Decoder for PyDecoderWrapper {
     }
 }

-/// Decoders Module
-#[pymodule]
-pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    Ok(())
-}
-
 /// Class needed for streaming decode
 ///
 #[pyclass(module = "tokenizers.decoders", name = "DecodeStream")]
@@ -661,6 +643,13 @@ impl PyDecodeStream {
         }
     }

+    #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
+    fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
+        self.ids = sequence_ids;
+        self.prefix_index = self.ids.len();
+        self.prefix = "".to_string();
+    }
+
     #[pyo3(signature = (tokenizer, id), text_signature = "(self, tokenizer, id)")]
     fn step(&mut self, tokenizer: &PyTokenizer, id: u32) -> PyResult<Option<String>> {
         ToPyResult(tk::tokenizer::step_decode_stream(
@@ -675,6 +664,24 @@ impl PyDecodeStream {
     }
 }

+/// Decoders Module
+#[pymodule]
+pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod test {
     use std::sync::{Arc, RwLock};
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index d50f283e7..5050f60d3 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -371,6 +371,9 @@ def test_decode(self):
         assert stream.step(tokenizer, 2) == " is"
         assert stream.step(tokenizer, 3) == " john"

+        stream.with_sequence([0, 1, 2, 3])
+        assert stream.step(tokenizer, 4) == "my name is john pair"
+
     def test_decode_stream(self):
         vocab = [
             ("", 0.0),
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 808d120d5..9d1a2de90 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1072,6 +1072,11 @@ where
             &mut self.prefix_index,
         )
     }
+
+    // Allows prefilling the tokenizer. Bit weird because not called in python
+    pub fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
+        self.ids = sequence_ids;
+    }
 }

 /// Internal function exposed only to bypass python limitations
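
The bindings hunk adds `with_sequence` to `PyDecodeStream`, which is exposed in Python as `DecodeStream.with_sequence`, and the test hunk shows the intended use: prefill the stream with ids that were already generated, then keep stepping id by id. Below is a minimal usage sketch, not part of the diff, assuming a tokenizer loaded from a placeholder "tokenizer.json"; the ids and decoded strings simply mirror the updated test and depend on that test's vocabulary.

from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream

# Assumption: any trained tokenizer file works here; "tokenizer.json" is a placeholder path.
tokenizer = Tokenizer.from_file("tokenizer.json")

stream = DecodeStream(skip_special_tokens=False)

# Existing behavior: feed ids one at a time and get back only the newly decoded text.
print(stream.step(tokenizer, 2))  # e.g. " is" with the vocabulary from the test above
print(stream.step(tokenizer, 3))  # e.g. " john"

# New in this diff: prefill the stream with ids produced elsewhere (e.g. a prompt that
# was already decoded). Per the updated test, the internal prefix is reset to "", so the
# next step returns the decoded text of the prefilled ids plus the new id, not just the tail.
stream.with_sequence([0, 1, 2, 3])
print(stream.step(tokenizer, 4))  # e.g. "my name is john pair"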