Skip to content

Commit 2f4e654

Browse files
[Bugfix] vLLM produces invalid UTF-8 tokens and “�” (vllm-project#28874)
Signed-off-by: John Calderon <jcalderon@nvidia.com>
Co-authored-by: Benjamin Chislett <bchislett@nvidia.com>
1 parent 3c98c2d commit 2f4e654

File tree

3 files changed

+535
-29
lines changed

3 files changed

+535
-29
lines changed

tests/v1/engine/test_output_processor.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -274,12 +274,28 @@ def _validate_logprobs(
274274
# the logprob token id at this sequence position
275275
decoded_token = pos_logprob_dict[lp_tok].decoded_token
276276
ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, lp_tok)
277-
assert decoded_token == ref_decoded_token, (
278-
f"Sampled logprob token id {lp_tok} decodes to"
279-
f" {ref_decoded_token} but Logprob decoded"
280-
f" token is {decoded_token} instead"
281-
f" (at position {idx})"
282-
)
277+
278+
# With UTF-8 correction logic, tokens ending with "�"
279+
# (incomplete byte sequences) are corrected to either
280+
# empty string or proper UTF-8 characters
281+
if ref_decoded_token.endswith("�"):
282+
# Token needs UTF-8 correction
283+
assert not decoded_token.endswith("�"), (
284+
f"Sampled logprob token id {lp_tok} decodes to"
285+
f" '{ref_decoded_token}' (ends with replacement char)"
286+
f" but corrected decoded token '{decoded_token}'"
287+
f" still ends with replacement char"
288+
f" (at position {idx}). UTF-8 correction should"
289+
f" have removed it."
290+
)
291+
else:
292+
# No correction needed, should match exactly
293+
assert decoded_token == ref_decoded_token, (
294+
f"Sampled logprob token id {lp_tok} decodes to"
295+
f" {ref_decoded_token} but Logprob decoded"
296+
f" token is {decoded_token} instead"
297+
f" (at position {idx})"
298+
)
283299

284300
ref_cumulative_logprob += pos_logprob_dict[sampled_token].logprob
285301
# Assert that cumulative logprobs are correct
@@ -420,12 +436,28 @@ def _validate_logprobs(
420436
# the logprob token id at this sequence position
421437
decoded_token = pos_logprob_dict[plp_tok].decoded_token
422438
ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, plp_tok)
423-
assert decoded_token == ref_decoded_token, (
424-
f"Prompt logprob token id {plp_tok} decodes to"
425-
f" {ref_decoded_token} but Logprob decoded"
426-
f" token is {decoded_token} instead"
427-
f" (at position {idx})"
428-
)
439+
440+
# With UTF-8 correction logic, tokens ending with "�"
441+
# (incomplete byte sequences) are corrected to either
442+
# empty string or proper UTF-8 characters
443+
if ref_decoded_token.endswith("�"):
444+
# Token needs UTF-8 correction
445+
assert not decoded_token.endswith("�"), (
446+
f"Prompt logprob token id {plp_tok} decodes to"
447+
f" '{ref_decoded_token}' (ends with replacement char)"
448+
f" but corrected decoded token '{decoded_token}'"
449+
f" still ends with replacement char"
450+
f" (at position {idx}). UTF-8 correction should"
451+
f" have removed it."
452+
)
453+
else:
454+
# No correction needed, should match exactly
455+
assert decoded_token == ref_decoded_token, (
456+
f"Prompt logprob token id {plp_tok} decodes to"
457+
f" {ref_decoded_token} but Logprob decoded"
458+
f" token is {decoded_token} instead"
459+
f" (at position {idx})"
460+
)
429461
else:
430462
# Prompt logprobs disabled for this request
431463
assert prompt_logprobs is None

0 commit comments

Comments (0)