Spaces:
Sleeping
Sleeping
Commit ·
e7cf650
1
Parent(s): d60da4f
added suppress
Browse files
- config/settings.py +6 -2
- generation/llm_client.py +5 -3
config/settings.py
CHANGED
|
@@ -36,8 +36,12 @@ class Settings(BaseSettings):
|
|
| 36 |
# Active tier: "primary" | "fallback" | "local"
|
| 37 |
active_llm_tier: str = "local"
|
| 38 |
|
| 39 |
-
# Thinking mode: "off" =
|
| 40 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
thinking_mode: str = "off"
|
| 42 |
|
| 43 |
# Extra token budget added on top of max_tokens when thinking is enabled
|
|
|
|
| 36 |
# Active tier: "primary" | "fallback" | "local"
|
| 37 |
active_llm_tier: str = "local"
|
| 38 |
|
| 39 |
+
# Thinking mode: "off" = plain completion, no thinking whatsoever
|
| 40 |
+
# "strip" = let model think, but strip <think> tags from output
|
| 41 |
+
# "full" = return raw response including <think> blocks
|
| 42 |
+
# "suppress" = actively suppress thinking via /no_think (Ollama) or
|
| 43 |
+
# chat_template_kwargs (vLLM). Use for models like Qwen3
|
| 44 |
+
# that think by default and need explicit suppression.
|
| 45 |
thinking_mode: str = "off"
|
| 46 |
|
| 47 |
# Extra token budget added on top of max_tokens when thinking is enabled
|
generation/llm_client.py
CHANGED
|
@@ -107,16 +107,18 @@ def chat_complete(
|
|
| 107 |
patched_messages = messages
|
| 108 |
extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
|
| 109 |
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
if resolved_tier == "local":
|
| 112 |
patched_messages = _apply_no_think(messages)
|
| 113 |
else:
|
| 114 |
extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
|
| 115 |
|
| 116 |
-
# When thinking is enabled, add
|
| 117 |
# has room to reason without truncating the actual answer.
|
| 118 |
effective_max_tokens = max_tokens
|
| 119 |
-
if settings.thinking_mode
|
| 120 |
effective_max_tokens = max_tokens + settings.thinking_token_budget
|
| 121 |
|
| 122 |
resp = client.chat.completions.create(
|
|
|
|
| 107 |
patched_messages = messages
|
| 108 |
extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
|
| 109 |
|
| 110 |
+
# "suppress" = actively inject /no_think or vLLM flag for models
|
| 111 |
+
# like Qwen3 that think by default and need explicit suppression.
|
| 112 |
+
if settings.thinking_mode == "suppress":
|
| 113 |
if resolved_tier == "local":
|
| 114 |
patched_messages = _apply_no_think(messages)
|
| 115 |
else:
|
| 116 |
extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
|
| 117 |
|
| 118 |
+
# When thinking is enabled (strip/full), add budget so the model
|
| 119 |
# has room to reason without truncating the actual answer.
|
| 120 |
effective_max_tokens = max_tokens
|
| 121 |
+
if settings.thinking_mode in ("strip", "full"):
|
| 122 |
effective_max_tokens = max_tokens + settings.thinking_token_budget
|
| 123 |
|
| 124 |
resp = client.chat.completions.create(
|