Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion backend/app/services/oasis_profile_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,8 @@ def _generate_profile_with_llm(
{"role": "user", "content": prompt}
],
response_format={"type": "json_object"},
temperature=0.7 - (attempt * 0.1) # Lower temperature with each retry
temperature=0.7 - (attempt * 0.1), # Lower temperature with each retry
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
# Don't set max_tokens; let the LLM generate freely
)

Expand Down
10 changes: 7 additions & 3 deletions backend/app/utils/llm_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,16 @@ def chat(

if response_format:
kwargs["response_format"] = response_format
# Disable thinking mode (e.g. Qwen3 <think> tags) when structured
# JSON output is requested — thinking tokens break JSON parsing and
# cause infinite retry loops with vLLM's guided decoding.
kwargs.setdefault("extra_body", {})
kwargs["extra_body"]["chat_template_kwargs"] = {"enable_thinking": False}

# For Ollama: pass num_ctx via extra_body to prevent prompt truncation
if self._is_ollama() and self._num_ctx:
kwargs["extra_body"] = {
"options": {"num_ctx": self._num_ctx}
}
kwargs.setdefault("extra_body", {})
kwargs["extra_body"]["options"] = {"num_ctx": self._num_ctx}

response = self.client.chat.completions.create(**kwargs)
content = response.choices[0].message.content
Expand Down