# tests/test_token_ids_unique.py
# Checks that token IDs are unique. Token *names* are not checked: they are dict
# keys and therefore unique by definition.

from collections import defaultdict

import pytest
import tiktoken

ENCODING_NAMES = tiktoken.list_encoding_names()


@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
def test_special_token_ids_are_unique(enc_name):
    """
    Special tokens: no two different names should share the same token id.
    """
    enc = tiktoken.get_encoding(enc_name)
    sp = getattr(enc, "_special_tokens", {})
    if not sp:
        pytest.skip(f"{enc_name}: no special tokens")

    # Group names by token id so every collision is reported, not just the first.
    id2names = defaultdict(list)
    for name, tid in sp.items():
        id2names[tid].append(name)

    dups = {tid: names for tid, names in id2names.items() if len(names) > 1}
    assert not dups, f"{enc_name}: duplicated special token ids: {dups}"


@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
def test_mergeable_token_ids_are_unique(enc_name):
    """
    Mergeable (vocab) tokens: token ids should be unique.
    Note: some builds may not expose `_mergeable_ranks` on the Python side;
    the test is skipped in that case.
    """
    enc = tiktoken.get_encoding(enc_name)
    mr = getattr(enc, "_mergeable_ranks", None)
    if not mr:
        pytest.skip(f"{enc_name}: mergeable ranks not exposed")

    ids = list(mr.values())
    assert len(ids) == len(set(ids)), f"{enc_name}: duplicated mergeable token ids"
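

# Optional cross-check, a minimal sketch under the same assumptions as above
# (both `_special_tokens` and `_mergeable_ranks` exposed as dicts): a special
# token id should not reuse an id already assigned to a mergeable (vocab) token.
@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
def test_special_ids_do_not_collide_with_mergeable_ids(enc_name):
    enc = tiktoken.get_encoding(enc_name)
    sp = getattr(enc, "_special_tokens", {})
    mr = getattr(enc, "_mergeable_ranks", None)
    if not sp or not mr:
        pytest.skip(f"{enc_name}: special tokens or mergeable ranks not exposed")

    # The intersection of the two id sets must be empty.
    overlap = set(sp.values()) & set(mr.values())
    assert not overlap, f"{enc_name}: ids shared by special and mergeable tokens: {overlap}"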