Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion backend/app/services/oasis_profile_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,8 @@ def _generate_profile_with_llm(
{"role": "user", "content": prompt}
],
response_format={"type": "json_object"},
temperature=0.7 - (attempt * 0.1) # Lower temperature with each retry
temperature=0.7 - (attempt * 0.1), # Lower temperature with each retry
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
# Don't set max_tokens; let the LLM generate freely
)

Expand Down
10 changes: 7 additions & 3 deletions backend/app/utils/llm_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,16 @@ def chat(

if response_format:
kwargs["response_format"] = response_format
# Disable thinking mode (e.g. Qwen3 <think> tags) when structured
# JSON output is requested — thinking tokens break JSON parsing and
# cause infinite retry loops with vLLM's guided decoding.
kwargs.setdefault("extra_body", {})
kwargs["extra_body"]["chat_template_kwargs"] = {"enable_thinking": False}

# For Ollama: pass num_ctx via extra_body to prevent prompt truncation
if self._is_ollama() and self._num_ctx:
kwargs["extra_body"] = {
"options": {"num_ctx": self._num_ctx}
}
kwargs.setdefault("extra_body", {})
kwargs["extra_body"]["options"] = {"num_ctx": self._num_ctx}

response = self.client.chat.completions.create(**kwargs)
content = response.choices[0].message.content
Expand Down