@@ -274,12 +274,28 @@ def _validate_logprobs(
274274 # the logprob token id at this sequence position
275275 decoded_token = pos_logprob_dict [lp_tok ].decoded_token
276276 ref_decoded_token = _ref_convert_id_to_token (dtv .tokenizer , lp_tok )
277- assert decoded_token == ref_decoded_token , (
278- f"Sampled logprob token id { lp_tok } decodes to"
279- f" { ref_decoded_token } but Logprob decoded"
280- f" token is { decoded_token } instead"
281- f" (at position { idx } )"
282- )
277+
278+ # With UTF-8 correction logic, tokens ending with "�"
279+ # (incomplete byte sequences) are corrected to either
280+ # empty string or proper UTF-8 characters
281+ if ref_decoded_token .endswith ("�" ):
282+ # Token needs UTF-8 correction
283+ assert not decoded_token .endswith ("�" ), (
284+ f"Sampled logprob token id { lp_tok } decodes to"
285+ f" '{ ref_decoded_token } ' (ends with replacement char)"
286+ f" but corrected decoded token '{ decoded_token } '"
287+ f" still ends with replacement char"
288+ f" (at position { idx } ). UTF-8 correction should"
289+ f" have removed it."
290+ )
291+ else :
292+ # No correction needed, should match exactly
293+ assert decoded_token == ref_decoded_token , (
294+ f"Sampled logprob token id { lp_tok } decodes to"
295+ f" { ref_decoded_token } but Logprob decoded"
296+ f" token is { decoded_token } instead"
297+ f" (at position { idx } )"
298+ )
283299
284300 ref_cumulative_logprob += pos_logprob_dict [sampled_token ].logprob
285301 # Assert that cumulative logprobs are correct
@@ -420,12 +436,28 @@ def _validate_logprobs(
420436 # the logprob token id at this sequence position
421437 decoded_token = pos_logprob_dict [plp_tok ].decoded_token
422438 ref_decoded_token = _ref_convert_id_to_token (dtv .tokenizer , plp_tok )
423- assert decoded_token == ref_decoded_token , (
424- f"Prompt logprob token id { plp_tok } decodes to"
425- f" { ref_decoded_token } but Logprob decoded"
426- f" token is { decoded_token } instead"
427- f" (at position { idx } )"
428- )
439+
440+ # With UTF-8 correction logic, tokens ending with "�"
441+ # (incomplete byte sequences) are corrected to either
442+ # empty string or proper UTF-8 characters
443+ if ref_decoded_token .endswith ("�" ):
444+ # Token needs UTF-8 correction
445+ assert not decoded_token .endswith ("�" ), (
446+ f"Prompt logprob token id { plp_tok } decodes to"
447+ f" '{ ref_decoded_token } ' (ends with replacement char)"
448+ f" but corrected decoded token '{ decoded_token } '"
449+ f" still ends with replacement char"
450+ f" (at position { idx } ). UTF-8 correction should"
451+ f" have removed it."
452+ )
453+ else :
454+ # No correction needed, should match exactly
455+ assert decoded_token == ref_decoded_token , (
456+ f"Prompt logprob token id { plp_tok } decodes to"
457+ f" { ref_decoded_token } but Logprob decoded"
458+ f" token is { decoded_token } instead"
459+ f" (at position { idx } )"
460+ )
429461 else :
430462 # Prompt logprobs disabled for this request
431463 assert prompt_logprobs is None
0 commit comments