From 51235afde6888e4ff05e58dd3bfbf9c2b12d9eb4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 4 Apr 2026 15:07:16 +0900 Subject: [PATCH] fix: disable thinking mode for JSON output requests Models with reasoning/thinking capabilities (e.g. Qwen3 with --reasoning-parser) emit <think> tags before generating content. When combined with response_format=json_object and vLLM's guided decoding, this causes an infinite abort-retry loop because the thinking tokens violate the JSON schema constraint. Automatically sets enable_thinking=false via chat_template_kwargs whenever structured JSON output is requested. --- backend/app/services/oasis_profile_generator.py | 3 ++- backend/app/utils/llm_client.py | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py index 2555997..4e18c4c 100644 --- a/backend/app/services/oasis_profile_generator.py +++ b/backend/app/services/oasis_profile_generator.py @@ -478,7 +478,8 @@ def _generate_profile_with_llm( {"role": "user", "content": prompt} ], response_format={"type": "json_object"}, - temperature=0.7 - (attempt * 0.1) # Lower temperature with each retry + temperature=0.7 - (attempt * 0.1), # Lower temperature with each retry + extra_body={"chat_template_kwargs": {"enable_thinking": False}}, # Don't set max_tokens, let LLM generate freely ) diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index 9a2d926..52a8494 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -72,12 +72,16 @@ def chat( if response_format: kwargs["response_format"] = response_format + # Disable thinking mode (e.g. Qwen3 <think> tags) when requesting + # structured JSON output — thinking tokens break JSON parsing and + # cause infinite retry loops with vLLM's guided decoding.
+ kwargs.setdefault("extra_body", {}) + kwargs["extra_body"]["chat_template_kwargs"] = {"enable_thinking": False} # For Ollama: pass num_ctx via extra_body to prevent prompt truncation if self._is_ollama() and self._num_ctx: - kwargs["extra_body"] = { - "options": {"num_ctx": self._num_ctx} - } + kwargs.setdefault("extra_body", {}) + kwargs["extra_body"]["options"] = {"num_ctx": self._num_ctx} response = self.client.chat.completions.create(**kwargs) content = response.choices[0].message.content