akashkolte committed on
Commit
e7cf650
·
1 Parent(s): d60da4f

added suppress

Browse files
Files changed (2) hide show
  1. config/settings.py +6 -2
  2. generation/llm_client.py +5 -3
config/settings.py CHANGED
@@ -36,8 +36,12 @@ class Settings(BaseSettings):
36
  # Active tier: "primary" | "fallback" | "local"
37
  active_llm_tier: str = "local"
38
 
39
- # Thinking mode: "off" = disable <think> (fastest), "strip" = allow
40
- # thinking but strip <think> tags from output, "full" = keep everything
 
 
 
 
41
  thinking_mode: str = "off"
42
 
43
  # Extra token budget added on top of max_tokens when thinking is enabled
 
36
  # Active tier: "primary" | "fallback" | "local"
37
  active_llm_tier: str = "local"
38
 
39
+ # Thinking mode: "off" = plain completion; no suppression flag is sent and no extra thinking budget is added
40
+ # "strip" = let model think, but strip <think> tags from output
41
+ # "full" = return raw response including <think> blocks
42
+ # "suppress" = actively suppress thinking via /no_think (Ollama) or
43
+ # chat_template_kwargs (vLLM). Use for models like Qwen3
44
+ # that think by default and need explicit suppression.
45
  thinking_mode: str = "off"
46
 
47
  # Extra token budget added on top of max_tokens when thinking is enabled
generation/llm_client.py CHANGED
@@ -107,16 +107,18 @@ def chat_complete(
107
  patched_messages = messages
108
  extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
109
 
110
- if settings.thinking_mode == "off":
 
 
111
  if resolved_tier == "local":
112
  patched_messages = _apply_no_think(messages)
113
  else:
114
  extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
115
 
116
- # When thinking is enabled, add the configured budget so the model
117
  # has room to reason without truncating the actual answer.
118
  effective_max_tokens = max_tokens
119
- if settings.thinking_mode != "off":
120
  effective_max_tokens = max_tokens + settings.thinking_token_budget
121
 
122
  resp = client.chat.completions.create(
 
107
  patched_messages = messages
108
  extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
109
 
110
+ # "suppress" = actively inject /no_think or vLLM flag for models
111
+ # like Qwen3 that think by default and need explicit suppression.
112
+ if settings.thinking_mode == "suppress":
113
  if resolved_tier == "local":
114
  patched_messages = _apply_no_think(messages)
115
  else:
116
  extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
117
 
118
+ # When thinking is enabled (strip/full), add budget so the model
119
  # has room to reason without truncating the actual answer.
120
  effective_max_tokens = max_tokens
121
+ if settings.thinking_mode in ("strip", "full"):
122
  effective_max_tokens = max_tokens + settings.thinking_token_budget
123
 
124
  resp = client.chat.completions.create(