
Commit a3ad375

Merge branch 'main' into os/fix-onig-build-gcc-15
2 parents c2b1b5d + b4d8dfc commit a3ad375

File tree

35 files changed: 84 additions, 89 deletions

.github/workflows/rust.yml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ jobs:
   build:
     runs-on: ${{ matrix.os }}
     env:
-      MACOSX_DEPLOYMENT_TARGET: 10.11
+      MACOSX_DEPLOYMENT_TARGET: 10.12
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest, macOS-latest]

bindings/node/lib/bindings/encoding.test.ts

Lines changed: 1 addition & 1 deletion
@@ -122,7 +122,7 @@ describe('Encoding', () => {
     expect(indexes).toEqual([3, 5])
   })

-  it('returns the corrent indexes with pair sequences', () => {
+  it('returns the correct indexes with pair sequences', () => {
     expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
     expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
   })

bindings/python/Cargo.toml

Lines changed: 4 additions & 4 deletions
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = ["rc", "derive"] }
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
-pyo3 = { version = "0.23", features = ["abi3", "abi3-py39", "py-clone"] }
-numpy = "0.23"
+pyo3 = { version = "0.24.2", features = ["abi3", "abi3-py39", "py-clone"] }
+numpy = "0.24"
 ndarray = "0.16"
 itertools = "0.12"

@@ -24,7 +24,7 @@ path = "../../tokenizers"

 [dev-dependencies]
 tempfile = "3.10"
-pyo3 = { version = "0.23", features = ["auto-initialize"] }
+pyo3 = { version = "0.24.2", features = ["auto-initialize"] }

 [features]
-defaut = ["pyo3/extension-module"]
+default = ["pyo3/extension-module"]

bindings/python/py_src/tokenizers/decoders/__init__.pyi

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ class BPEDecoder(Decoder):

     Args:
         suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-            The suffix that was used to caracterize an end-of-word. This suffix will
+            The suffix that was used to characterize an end-of-word. This suffix will
             be replaced by whitespaces during the decoding
     """
     def __init__(self, suffix="</w>"):
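The docstring corrected here (and its Rust counterpart in bindings/python/src/decoders.rs below) describes how BPEDecoder turns the end-of-word suffix back into spaces. A minimal illustration through the Python bindings, using made-up tokens rather than anything from this diff:

from tokenizers import decoders

# BPEDecoder replaces the end-of-word suffix ("</w>") with a space while decoding.
decoder = decoders.BPEDecoder(suffix="</w>")
print(decoder.decode(["hel", "lo</w>", "wor", "ld</w>"]))  # "hello world"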

bindings/python/py_src/tokenizers/implementations/byte_level_bpe.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
     def __init__(
         self,
         vocab: Optional[Union[str, Dict[str, int]]] = None,
-        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
         add_prefix_space: bool = False,
         lowercase: bool = False,
         dropout: Optional[float] = None,

bindings/python/py_src/tokenizers/implementations/char_level_bpe.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ class CharBPETokenizer(BaseTokenizer):
     def __init__(
         self,
         vocab: Optional[Union[str, Dict[str, int]]] = None,
-        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
         unk_token: Union[str, AddedToken] = "<unk>",
         suffix: str = "</w>",
         dropout: Optional[float] = None,

bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
     def __init__(
         self,
         vocab: Optional[Union[str, Dict[str, int]]] = None,
-        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
+        merges: Optional[Union[str, List[Tuple[str, str]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
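The same annotation fix appears in ByteLevelBPETokenizer, CharBPETokenizer and SentencePieceBPETokenizer: merges is a list of string pairs (or a path to a merges file), not a dict keyed by id pairs. A small sketch of what the corrected signature allows, using a toy vocab and merge list invented purely for illustration:

from tokenizers import ByteLevelBPETokenizer

# Toy in-memory vocab (token -> id) and ordered merge list of string pairs,
# matching the corrected annotations; real models usually load these from files.
vocab = {"h": 0, "e": 1, "l": 2, "o": 3, "he": 4, "ll": 5, "hell": 6, "hello": 7}
merges = [("h", "e"), ("l", "l"), ("he", "ll"), ("hell", "o")]

tokenizer = ByteLevelBPETokenizer(vocab=vocab, merges=merges)
print(tokenizer.encode("hello").tokens)  # ["hello"]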

bindings/python/scripts/convert.py

Lines changed: 1 addition & 1 deletion
@@ -397,7 +397,7 @@ def main():
         "--models",
         type=lambda s: s.split(","),
         default=pretraineds,
-        help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+        help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
     )
     args = parser.parse_args()

bindings/python/src/decoders.rs

Lines changed: 1 addition & 1 deletion
@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
 ///
 /// Args:
 ///     suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-///         The suffix that was used to caracterize an end-of-word. This suffix will
+///         The suffix that was used to characterize an end-of-word. This suffix will
 ///         be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
 pub struct PyBPEDecoder {}

bindings/python/tests/bindings/test_trainers.py

Lines changed: 6 additions & 5 deletions
@@ -14,7 +14,7 @@
     trainers,
 )

-from ..utils import data_dir, train_files
+from ..utils import data_dir, train_files, DATA_PATH


 class TestBpeTrainer:
@@ -287,16 +287,17 @@ def test_can_modify(self):
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]

-    def test_continuing_prefix_trainer_mismatch(self):
+    def test_continuing_prefix_trainer_mismatch(self, train_files):
         UNK = "[UNK]"
         special_tokens = [UNK]
         tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
         trainer = trainers.BpeTrainer(special_tokens=special_tokens)
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
         )
-        tokenizer.train(files=["data/big.txt"], trainer=trainer)
+        tokenizer.train(files=[train_files["big"]], trainer=trainer)

-        tokenizer.save("data/tokenizer.json")
+        tokenizer_json = os.path.join(DATA_PATH, "tokenizer.json")
+        tokenizer.save(tokenizer_json)

-        tokenizer.from_file("data/tokenizer.json")
+        tokenizer.from_file(tokenizer_json)
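The updated test no longer hard-codes relative paths; it resolves them through the train_files fixture and the DATA_PATH constant from the test suite's utils module. Purely as a hedged sketch of the kind of helpers this assumes (the actual definitions in bindings/python/tests/utils.py may differ, for example by downloading big.txt on demand):

# Hypothetical sketch only; the real tests/utils.py may fetch corpora on demand
# and expose more fixtures than shown here.
import os

import pytest

DATA_PATH = os.path.join(os.path.dirname(__file__), "data")


@pytest.fixture(scope="session")
def train_files():
    # Map short names to absolute corpus paths so tests stop hard-coding "data/big.txt".
    return {"big": os.path.join(DATA_PATH, "big.txt")}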
