From 51235afde6888e4ff05e58dd3bfbf9c2b12d9eb4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 4 Apr 2026 15:07:16 +0900 Subject: [PATCH] fix: disable thinking mode for JSON output requests Models with reasoning/thinking capabilities (e.g. Qwen3 with --reasoning-parser) emit <think> tags before generating content. When combined with response_format=json_object and vLLM's guided decoding, this causes an infinite abort-retry loop because the thinking tokens violate the JSON schema constraint. Automatically sets enable_thinking=false via chat_template_kwargs whenever structured JSON output is requested. --- backend/app/services/oasis_profile_generator.py | 3 ++- backend/app/utils/llm_client.py | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py index 2555997..4e18c4c 100644 --- a/backend/app/services/oasis_profile_generator.py +++ b/backend/app/services/oasis_profile_generator.py @@ -478,7 +478,8 @@ def _generate_profile_with_llm( {"role": "user", "content": prompt} ], response_format={"type": "json_object"}, - temperature=0.7 - (attempt * 0.1) # Lower temperature with each retry + temperature=0.7 - (attempt * 0.1), # Lower temperature with each retry + extra_body={"chat_template_kwargs": {"enable_thinking": False}}, # Don't set max_tokens, let LLM generate freely ) diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index 9a2d926..52a8494 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -72,12 +72,16 @@ def chat( if response_format: kwargs["response_format"] = response_format + # Disable thinking mode (e.g. Qwen3 <think> tags) when requesting + # structured JSON output — thinking tokens break JSON parsing and + # cause infinite retry loops with vLLM's guided decoding.
+ kwargs.setdefault("extra_body", {}) + kwargs["extra_body"]["chat_template_kwargs"] = {"enable_thinking": False} # For Ollama: pass num_ctx via extra_body to prevent prompt truncation if self._is_ollama() and self._num_ctx: - kwargs["extra_body"] = { - "options": {"num_ctx": self._num_ctx} - } + kwargs.setdefault("extra_body", {}) + kwargs["extra_body"]["options"] = {"num_ctx": self._num_ctx} response = self.client.chat.completions.create(**kwargs) content = response.choices[0].message.content