@@ -130,42 +130,25 @@ static std::vector<uint64_t> _byte_pair_merge(
 // ---- Helper utils end -------------------------------------------------------
 // ---- protected start --------------------------------------------------------
 
-std::pair<std::optional<std::string>, re2::StringPiece>
+std::pair<std::optional<std::string>, std::string>
 BPETokenizerBase::split_with_allowed_special_token_(
-    re2::StringPiece& input,
+    const std::string& input,
+    size_t offset,
     const TokenMap& allowed_special) const {
   if (!special_token_regex_) {
-    return std::make_pair(std::nullopt, input);
+    return std::make_pair(std::nullopt, input.substr(offset));
   }
 
-#if __cplusplus >= 202002L
-  auto start = input.begin();
-#else
-  const char* start = input.data();
-#endif
+  auto matches = special_token_regex_->find_all(input.substr(offset));
 
-  std::string special;
-  while (true) {
-    if (!re2::RE2::FindAndConsume(&input, *special_token_regex_, &special)) {
-      // No special token.
-      break;
+  for (const auto& m : matches) {
+    std::string matched_text = input.substr(offset + m.start, m.end - m.start);
+    if (allowed_special.tryGetInteger(matched_text).has_value()) {
+      return {matched_text, input.substr(offset, m.start)};
     }
-
-    if (allowed_special.tryGetInteger(special).has_value()) {
-      // Found an allowed special token, split the text with it.
-#if __cplusplus >= 202002L
-      return std::make_pair(
-          special,
-          re2::StringPiece(start, input.begin() - start - special.size()));
-#else
-      return std::make_pair(
-          special,
-          re2::StringPiece(start, (input.data() - start) - special.size()));
-#endif
-    } // else try to find the next special token
   }
 
-  return std::make_pair(std::nullopt, input);
+  return {std::nullopt, input.substr(offset)};
 }
 
 Result<std::pair<std::vector<uint64_t>, uint64_t>>
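The new overload no longer consumes a `re2::StringPiece` in place: it scans the tail of `input` starting at `offset` and returns the first allowed special token together with the plain text that precedes it. For illustration, here is a minimal standalone sketch of that splitting logic, with `std::regex` and a `std::unordered_set` standing in for the library's `special_token_regex_` and `TokenMap`; the name `split_at_special` and both stand-ins are purely illustrative, not part of the diff:

```cpp
// Illustrative sketch only: std::regex and std::unordered_set stand in for
// the tokenizer's special_token_regex_ and TokenMap.
#include <optional>
#include <regex>
#include <string>
#include <unordered_set>
#include <utility>

std::pair<std::optional<std::string>, std::string> split_at_special(
    const std::string& input,
    size_t offset,
    const std::regex& special_re,
    const std::unordered_set<std::string>& allowed_special) {
  const std::string tail = input.substr(offset);
  for (auto it = std::sregex_iterator(tail.begin(), tail.end(), special_re);
       it != std::sregex_iterator(); ++it) {
    const std::string matched_text = it->str();
    if (allowed_special.count(matched_text) > 0) {
      // Found an allowed special token: return it plus the plain prefix.
      return {matched_text, tail.substr(0, it->position())};
    }
    // A disallowed special token is treated as ordinary text; keep scanning.
  }
  // No allowed special token in the tail: the whole tail is plain text.
  return {std::nullopt, tail};
}
```

For example, with `special_re = std::regex(R"(<\|endoftext\|>)")` and `allowed_special = {"<|endoftext|>"}`, splitting `"hi <|endoftext|> there"` at offset 0 yields `{"<|endoftext|>", "hi "}`; if the token were not in `allowed_special`, the whole string would come back as plain text.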
@@ -174,33 +157,31 @@ BPETokenizerBase::encode_with_special_token_(
     const TokenMap& allowed_special) const {
   std::vector<uint64_t> tokens;
   uint64_t last_piece_token_len = 0;
-  re2::StringPiece input(text);
-  while (true) {
+  size_t offset = 0;
+
+  while (offset < text.size()) {
     auto [special, sub_input] =
-        split_with_allowed_special_token_(input, allowed_special);
+        split_with_allowed_special_token_(text, offset, allowed_special);
 
     TK_CHECK_OK_OR_RETURN_ERROR(
         _encode(sub_input, tokens, last_piece_token_len));
+    offset += sub_input.size();
 
     if (special) {
       const auto result = special_token_map_->tryGetInteger(*special);
       if (!result) {
-        // Should never go here, since special pattern includes all special
-        // chars.
         TK_LOG(Error, "unknown special token: %s\n", special->c_str());
         return Error::EncodeFailure;
       }
 
       tokens.push_back(*result);
       last_piece_token_len = 0;
+      offset += special->size(); // advance past the matched token
     } else {
       break;
     }
   }
 
-  // last_piece_token_len is how many tokens came from the last regex split.
-  // This is used for determining unstable tokens, since you can't merge
-  // across (stable) regex splits
   return std::make_pair(tokens, last_piece_token_len);
 }
 
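The rewritten loop tracks an explicit `offset` into `text` rather than shrinking a `re2::StringPiece`. A hedged sketch of that control flow, with the member calls from the diff (`split_with_allowed_special_token_`, `_encode`, `special_token_map_`) replaced by injected callables so the example stands alone; error propagation and the `last_piece_token_len` bookkeeping are omitted for brevity:

```cpp
// Illustrative control-flow sketch only; the callables are hypothetical
// stand-ins for the tokenizer's own members.
#include <cstdint>
#include <functional>
#include <optional>
#include <string>
#include <utility>
#include <vector>

using SplitFn = std::function<
    std::pair<std::optional<std::string>, std::string>(const std::string&, size_t)>;
using EncodeFn = std::function<void(const std::string&, std::vector<uint64_t>&)>;
using SpecialIdFn = std::function<std::optional<uint64_t>(const std::string&)>;

std::vector<uint64_t> encode_with_specials(
    const std::string& text,
    const SplitFn& split,           // first allowed special token at/after offset
    const EncodeFn& encode,         // BPE-encode a plain piece
    const SpecialIdFn& special_id)  // special token -> id
{
  std::vector<uint64_t> tokens;
  size_t offset = 0;
  while (offset < text.size()) {
    auto [special, sub_input] = split(text, offset);
    encode(sub_input, tokens);   // encode the plain prefix
    offset += sub_input.size();  // consume the prefix
    if (!special) {
      break;                     // no further special token: done
    }
    if (auto id = special_id(*special)) {
      tokens.push_back(*id);     // emit the special token's id
    }                            // (the real code errors out on an unknown id)
    offset += special->size();   // consume the special token itself
  }
  return tokens;
}
```

The invariant to check is that each iteration advances `offset` by `sub_input.size()` plus, when a special token matched, `special->size()`, so the loop terminates either by exhausting `text` or by breaking once no allowed special token remains.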
@@ -273,7 +254,7 @@ Result<std::string> BPETokenizerBase::decode(uint64_t prev, uint64_t cur)
   } else {
     token_bytes = *result;
   }
-  _decode(token_bytes, ret);
+  _decode(std::string(token_bytes), ret);
 
   return ret;
 }