Spaces:

ub-aac-chatbot
/

aac-chatbot

Sleeping

akashkolte committed on Apr 13

Commit

e7cf650

1 Parent(s): d60da4f

added suppress

Files changed (2) hide show

config/settings.py CHANGED Viewed

@@ -36,8 +36,12 @@ class Settings(BaseSettings):
     # Active tier: "primary" | "fallback" | "local"
     active_llm_tier: str = "local"
-    # Thinking mode: "off" = disable <think> (fastest), "strip" = allow
-    # thinking but strip <think> tags from output, "full" = keep everything
     thinking_mode: str = "off"
     # Extra token budget added on top of max_tokens when thinking is enabled

     # Active tier: "primary" | "fallback" | "local"
     active_llm_tier: str = "local"
+    # Thinking mode: "off" = plain completion; sends no thinking flag, adds no budget
+    # "strip" = let model think, but strip <think> tags from output
+    # "full" = return raw response including <think> blocks
+    # "suppress" = actively suppress thinking via /no_think (Ollama) or
+    #              chat_template_kwargs (vLLM). Use for models like Qwen3
+    #              that think by default and need explicit suppression.
     thinking_mode: str = "off"
     # Extra token budget added on top of max_tokens when thinking is enabled

generation/llm_client.py CHANGED Viewed

@@ -107,16 +107,18 @@ def chat_complete(
     patched_messages = messages
     extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
-    if settings.thinking_mode == "off":
         if resolved_tier == "local":
             patched_messages = _apply_no_think(messages)
         else:
             extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
-    # When thinking is enabled, add the configured budget so the model
     # has room to reason without truncating the actual answer.
     effective_max_tokens = max_tokens
-    if settings.thinking_mode != "off":
         effective_max_tokens = max_tokens + settings.thinking_token_budget
     resp = client.chat.completions.create(

     patched_messages = messages
     extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
+    # "suppress" = actively inject /no_think or vLLM flag for models
+    # like Qwen3 that think by default and need explicit suppression.
+    if settings.thinking_mode == "suppress":
         if resolved_tier == "local":
             patched_messages = _apply_no_think(messages)
         else:
             extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
+    # When thinking is enabled (strip/full), add budget so the model
     # has room to reason without truncating the actual answer.
     effective_max_tokens = max_tokens
+    if settings.thinking_mode in ("strip", "full"):
         effective_max_tokens = max_tokens + settings.thinking_token_budget
     resp = client.chat.completions.create(