akashkolte committed on
Commit
d60da4f
·
1 Parent(s): e06dc15

updated settings

Browse files
.env.example CHANGED
@@ -18,11 +18,19 @@ FALLBACK_BASE_URL=http://<GCP_IP>:8000/v1
18
 
19
  # ── Local Ollama (dev) ────────────────────────────────────────────────────────
20
  LOCAL_BASE_URL=http://localhost:11434/v1
21
- LOCAL_MODEL=qwen3:8b
22
 
23
  # ── MLflow ────────────────────────────────────────────────────────────────────
24
  MLFLOW_TRACKING_URI=mlruns
25
  MLFLOW_EXPERIMENT=aac-chatbot
26
 
 
 
 
 
 
 
 
 
27
  # ── Latency fallback threshold (seconds) ──────────────────────────────────────
28
  FALLBACK_LATENCY_THRESHOLD=3.5
 
18
 
19
  # ── Local Ollama (dev) ────────────────────────────────────────────────────────
20
  LOCAL_BASE_URL=http://localhost:11434/v1
21
+ LOCAL_MODEL=gemma4:31b-cloud # qwen3:8b qwen3.5:397b-cloud
22
 
23
  # ── MLflow ────────────────────────────────────────────────────────────────────
24
  MLFLOW_TRACKING_URI=mlruns
25
  MLFLOW_EXPERIMENT=aac-chatbot
26
 
27
+ # ── Thinking mode ─────────────────────────────────────────────────────────────
28
+ # "off" — suppress thinking (fastest, best for latency-sensitive AAC)
29
+ # "strip" — let model think, but strip <think> tags from output
30
+ # "full" — return raw response including <think> blocks
31
+ THINKING_MODE=off
32
+ # Extra tokens added when thinking is enabled (strip/full). Ignored when off.
33
+ THINKING_TOKEN_BUDGET=4096
34
+
35
  # ── Latency fallback threshold (seconds) ──────────────────────────────────────
36
  FALLBACK_LATENCY_THRESHOLD=3.5
config/settings.py CHANGED
@@ -36,6 +36,14 @@ class Settings(BaseSettings):
36
  # Active tier: "primary" | "fallback" | "local"
37
  active_llm_tier: str = "local"
38
 
 
 
 
 
 
 
 
 
39
  # Wall-clock threshold (seconds) that triggers fallback within a turn
40
  fallback_latency_threshold: float = 3.5
41
 
 
36
  # Active tier: "primary" | "fallback" | "local"
37
  active_llm_tier: str = "local"
38
 
39
+ # Thinking mode: "off" = disable <think> (fastest), "strip" = allow
40
+ # thinking but strip <think> tags from output, "full" = keep everything
41
+ thinking_mode: str = "off"
42
+
43
+ # Extra token budget added on top of max_tokens when thinking is enabled
44
+ # (thinking_mode = "strip" or "full"). Set to 0 if using a non-thinking model.
45
+ thinking_token_budget: int = 4096
46
+
47
  # Wall-clock threshold (seconds) that triggers fallback within a turn
48
  fallback_latency_threshold: float = 3.5
49
 
generation/llm_client.py CHANGED
@@ -11,13 +11,14 @@ Tier 3 — local: Qwen3-8B via Ollama on MacBook M2 (dev / offline)
11
  Active tier is controlled by settings.active_llm_tier or the `tier`
12
  argument passed explicitly by the planner node.
13
 
14
- Qwen3 note: Qwen3 defaults to thinking mode (<think>…</think> tokens).
15
- For AAC we always use non-thinking mode (sub-6 s latency requirement).
16
- We prepend /no_think to the first user message — this is the Ollama-
17
- compatible way. vLLM uses extra_body chat_template_kwargs instead.
18
  """
19
  from __future__ import annotations
20
 
 
21
  from functools import lru_cache
22
  from typing import Any
23
 
@@ -25,9 +26,6 @@ from openai import OpenAI
25
 
26
  from config.settings import settings
27
 
28
- # Models that require non-thinking mode enforcement
29
- _QWEN3_MODELS = {"qwen3", "qwen/qwen3"}
30
-
31
 
32
  @lru_cache(maxsize=3)
33
  def _build_client(base_url: str, api_key: str) -> OpenAI:
@@ -62,15 +60,10 @@ def active_model(tier: str | None = None) -> str:
62
  }[resolved]
63
 
64
 
65
- def _is_qwen3(model: str) -> bool:
66
- return any(model.lower().startswith(prefix) for prefix in _QWEN3_MODELS)
67
-
68
-
69
  def _apply_no_think(messages: list[dict]) -> list[dict]:
70
  """
71
- Prepend /no_think to the first user message to disable Qwen3 thinking mode.
72
- This is the Ollama-compatible approach (works with the OpenAI-compat endpoint).
73
- vLLM uses extra_body instead — handled separately in chat_complete().
74
  """
75
  result = list(messages)
76
  for i, msg in enumerate(result):
@@ -80,6 +73,11 @@ def _apply_no_think(messages: list[dict]) -> list[dict]:
80
  return result
81
 
82
 
 
 
 
 
 
83
  def chat_complete(
84
  messages: list[dict],
85
  max_tokens: int,
@@ -88,11 +86,12 @@ def chat_complete(
88
  **kwargs: Any,
89
  ) -> str:
90
  """
91
- Unified chat completion that always enforces Qwen3 non-thinking mode.
92
- Returns the response text string directly.
93
 
94
- Use this in pipeline nodes instead of calling client.chat.completions.create
95
- directly — it handles the thinking-mode suppression for all tiers.
 
 
96
 
97
  In local dev mode (active_llm_tier="local"), all tier requests are
98
  redirected to Ollama — there is no separate fallback server locally.
@@ -108,23 +107,32 @@ def chat_complete(
108
  patched_messages = messages
109
  extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
110
 
111
- if _is_qwen3(model):
112
  if resolved_tier == "local":
113
- # Ollama: /no_think prefix in the user message
114
  patched_messages = _apply_no_think(messages)
115
  else:
116
- # vLLM: disable via chat template kwargs
117
  extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
118
 
 
 
 
 
 
 
119
  resp = client.chat.completions.create(
120
  model=model,
121
  messages=patched_messages,
122
- max_tokens=max_tokens,
123
  temperature=temperature,
124
  extra_body=extra_body or None,
125
  **kwargs,
126
  )
127
- return (resp.choices[0].message.content or "").strip()
 
 
 
 
 
128
 
129
 
130
  def warmup(tier: str | None = None) -> None:
 
11
  Active tier is controlled by settings.active_llm_tier or the `tier`
12
  argument passed explicitly by the planner node.
13
 
14
+ Thinking mode is controlled by settings.thinking_mode:
15
+ "off" — prepend /no_think (Ollama) or chat_template_kwargs (vLLM)
16
+ "strip" — let the model think, but strip <think>…</think> from output
17
+ "full" — return everything including <think> blocks
18
  """
19
  from __future__ import annotations
20
 
21
+ import re
22
  from functools import lru_cache
23
  from typing import Any
24
 
 
26
 
27
  from config.settings import settings
28
 
 
 
 
29
 
30
  @lru_cache(maxsize=3)
31
  def _build_client(base_url: str, api_key: str) -> OpenAI:
 
60
  }[resolved]
61
 
62
 
 
 
 
 
63
  def _apply_no_think(messages: list[dict]) -> list[dict]:
64
  """
65
+ Prepend /no_think to the first user message.
66
+ This is the Ollama-compatible way to suppress thinking mode.
 
67
  """
68
  result = list(messages)
69
  for i, msg in enumerate(result):
 
73
  return result
74
 
75
 
76
+ def _strip_think_tags(text: str) -> str:
77
+ """Remove <think>…</think> blocks from model output."""
78
+ return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
79
+
80
+
81
  def chat_complete(
82
  messages: list[dict],
83
  max_tokens: int,
 
86
  **kwargs: Any,
87
  ) -> str:
88
  """
89
+ Model-agnostic chat completion. Returns the response text directly.
 
90
 
91
+ Thinking mode behaviour is controlled entirely by settings.thinking_mode:
92
+ "off" — suppress thinking via /no_think (Ollama) or extra_body (vLLM)
93
+ "strip" — allow thinking but remove <think> tags from the response
94
+ "full" — return the raw response including any <think> blocks
95
 
96
  In local dev mode (active_llm_tier="local"), all tier requests are
97
  redirected to Ollama — there is no separate fallback server locally.
 
107
  patched_messages = messages
108
  extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
109
 
110
+ if settings.thinking_mode == "off":
111
  if resolved_tier == "local":
 
112
  patched_messages = _apply_no_think(messages)
113
  else:
 
114
  extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
115
 
116
+ # When thinking is enabled, add the configured budget so the model
117
+ # has room to reason without truncating the actual answer.
118
+ effective_max_tokens = max_tokens
119
+ if settings.thinking_mode != "off":
120
+ effective_max_tokens = max_tokens + settings.thinking_token_budget
121
+
122
  resp = client.chat.completions.create(
123
  model=model,
124
  messages=patched_messages,
125
+ max_tokens=effective_max_tokens,
126
  temperature=temperature,
127
  extra_body=extra_body or None,
128
  **kwargs,
129
  )
130
+ raw = resp.choices[0].message.content or ""
131
+
132
+ if settings.thinking_mode in ("off", "strip"):
133
+ raw = _strip_think_tags(raw)
134
+
135
+ return raw.strip()
136
 
137
 
138
  def warmup(tier: str | None = None) -> None:
pipeline/nodes/intent.py CHANGED
@@ -7,6 +7,7 @@ IntentRoute that drives all downstream routing decisions.
7
  """
8
  from __future__ import annotations
9
 
 
10
  import time
11
  from typing import Literal, Optional
12
 
@@ -134,7 +135,10 @@ def run(state: PipelineState) -> dict:
134
  )
135
 
136
  try:
137
- parsed = IntentRouteSchema.model_validate_json(raw)
 
 
 
138
  route = {
139
  "sub_intents": [si.model_dump() for si in parsed.sub_intents],
140
  "style_constraints": parsed.style_constraints.model_dump(),
 
7
  """
8
  from __future__ import annotations
9
 
10
+ import re
11
  import time
12
  from typing import Literal, Optional
13
 
 
135
  )
136
 
137
  try:
138
+ # Strip markdown fences (```json ... ```) that many models add
139
+ cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
140
+ cleaned = re.sub(r"\s*```$", "", cleaned.strip())
141
+ parsed = IntentRouteSchema.model_validate_json(cleaned)
142
  route = {
143
  "sub_intents": [si.model_dump() for si in parsed.sub_intents],
144
  "style_constraints": parsed.style_constraints.model_dump(),