@@ -19,20 +19,22 @@ def tokenize(
         """Tokenize the text into tokens.
 
         Args:
-            text: The text to tokenize.
+            text: The utf-8 encoded string to tokenize.
             add_bos: Whether to add a beginning of sequence token.
-            special: Whether to tokenize text literally or as special tokens."""
+            special: Whether to tokenize special tokens.
+        """
         raise NotImplementedError
 
     @abc.abstractmethod
     def detokenize(
-        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
+        self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False
     ) -> bytes:
         """Detokenize the tokens into text.
 
         Args:
-            tokens: The tokens to detokenize.
-            prev_tokens: If tokens is a continuation of a previous sequence, the previous tokens.
+            tokens: The list of tokens to detokenize.
+            prev_tokens: The list of previous tokens. Offset mapping will be performed if provided.
+            special: Whether to detokenize special tokens.
         """
         raise NotImplementedError
 
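To make the new contract concrete, here is a minimal toy implementation of the interface above. The vocabulary, the token ids, and the <s>/</s> renderings are invented for illustration and are not part of the library:

from typing import List, Optional

BOS, EOS = 1, 2                      # pretend special-token ids
VOCAB = {3: b"Hello", 4: b" world"}  # pretend regular tokens

class ToyTokenizer:
    def detokenize(
        self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False
    ) -> bytes:
        # special=False (the default) drops special tokens from the output;
        # special=True renders them literally.
        out = b""
        for t in tokens:
            if t in (BOS, EOS):
                if special:
                    out += b"<s>" if t == BOS else b"</s>"
            else:
                out += VOCAB[t]
        return out

tok = ToyTokenizer()
print(tok.detokenize([BOS, 3, 4]))                # b'Hello world'
print(tok.detokenize([BOS, 3, 4], special=True))  # b'<s>Hello world'
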
@@ -47,9 +49,9 @@ def tokenize(
         return self._model.tokenize(text, add_bos=add_bos, special=special)
 
     def detokenize(
-        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
+        self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False
     ) -> bytes:
-        return self._model.detokenize(tokens)
+        return self._model.detokenize(tokens, special=special)
 
     def encode(
         self, text: str, add_bos: bool = True, special: bool = True
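In this hunk's class (presumably the tokenizer backed by the wrapped model object) the new flag is a straight pass-through. A hedged usage sketch, where tokenizer stands for any instance of that class and the input bytes are invented:

# Hypothetical usage of the patched methods above.
ids = tokenizer.tokenize(b"Hello world", add_bos=True, special=True)
plain = tokenizer.detokenize(ids)                  # special tokens omitted (default)
verbose = tokenizer.detokenize(ids, special=True)  # special tokens rendered
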
@@ -78,18 +80,19 @@ def tokenize(
         )
 
     def detokenize(
-        self, tokens: List[int], prev_tokens: Optional[List[int]] = None
+        self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False
     ) -> bytes:
+        skip_special_tokens = not special
         if prev_tokens is not None:
-            text = self.hf_tokenizer.decode(prev_tokens + tokens).encode(
+            text = self.hf_tokenizer.decode(prev_tokens + tokens, skip_special_tokens=skip_special_tokens).encode(
                 "utf-8", errors="ignore"
             )
-            prev_text = self.hf_tokenizer.decode(prev_tokens).encode(
+            prev_text = self.hf_tokenizer.decode(prev_tokens, skip_special_tokens=skip_special_tokens).encode(
                 "utf-8", errors="ignore"
             )
             return text[len(prev_text) :]
         else:
-            return self.hf_tokenizer.decode(tokens).encode("utf-8", errors="ignore")
+            return self.hf_tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens).encode("utf-8", errors="ignore")
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
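
The HF path decodes prev_tokens + tokens and slices off the decoded prefix rather than decoding tokens alone, because a token's surface form can depend on what precedes it (SentencePiece-style leading spaces, merged pieces). Below is a self-contained sketch of that logic; the stub decoder and its piece table are invented stand-ins for hf_tokenizer.decode:

from typing import List, Optional

PIECES = {0: "<s>", 1: "▁Hello", 2: "▁world"}  # invented vocabulary

def decode(tokens: List[int], skip_special_tokens: bool = False) -> str:
    # Crude stand-in for hf_tokenizer.decode: optionally drop the special
    # token (id 0), turn the SentencePiece '▁' marker into a space, and
    # strip the leading space the way real decoders do.
    text = "".join(PIECES[t] for t in tokens if not (skip_special_tokens and t == 0))
    return text.replace("▁", " ").lstrip()

def detokenize(
    tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False
) -> bytes:
    skip_special_tokens = not special
    if prev_tokens is not None:
        text = decode(prev_tokens + tokens, skip_special_tokens=skip_special_tokens).encode("utf-8")
        prev_text = decode(prev_tokens, skip_special_tokens=skip_special_tokens).encode("utf-8")
        # Return only the bytes contributed by `tokens`; decoding `tokens`
        # on its own can differ, as the second print below shows.
        return text[len(prev_text):]
    return decode(tokens, skip_special_tokens=skip_special_tokens).encode("utf-8")

print(detokenize([2], prev_tokens=[0, 1]))  # b' world'
print(detokenize([2]))                      # b'world' (no leading space)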
0 commit comments