diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py
index 2555997..4e18c4c 100644
--- a/backend/app/services/oasis_profile_generator.py
+++ b/backend/app/services/oasis_profile_generator.py
@@ -478,7 +478,8 @@ def _generate_profile_with_llm(
             {"role": "user", "content": prompt}
         ],
         response_format={"type": "json_object"},
-        temperature=0.7 - (attempt * 0.1)  # Lower temperature with each retry
+        temperature=0.7 - (attempt * 0.1),  # Lower temperature with each retry
+        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
         # Don't set max_tokens, let LLM generate freely
     )
 
diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py
index 9a2d926..52a8494 100644
--- a/backend/app/utils/llm_client.py
+++ b/backend/app/utils/llm_client.py
@@ -72,12 +72,16 @@ def chat(
         if response_format:
             kwargs["response_format"] = response_format
 
+        # Disable thinking mode (e.g. Qwen3 <think> tags) on every request —
+        # thinking tokens break JSON parsing when structured output is used
+        # and cause infinite retry loops with vLLM's guided decoding.
+        kwargs.setdefault("extra_body", {})
+        kwargs["extra_body"]["chat_template_kwargs"] = {"enable_thinking": False}
        # For Ollama: pass num_ctx via extra_body to prevent prompt truncation
         if self._is_ollama() and self._num_ctx:
-            kwargs["extra_body"] = {
-                "options": {"num_ctx": self._num_ctx}
-            }
+            kwargs.setdefault("extra_body", {})
+            kwargs["extra_body"]["options"] = {"num_ctx": self._num_ctx}
 
         response = self.client.chat.completions.create(**kwargs)
         content = response.choices[0].message.content
 