Spaces:

ub-aac-chatbot
/

aac-chatbot

Sleeping

App Files Files Community

akashkolte committed on Apr 13

Commit

d60da4f

1 Parent(s): e06dc15

updated settings

Browse files

Files changed (4) hide show

.env.example +9 -1
config/settings.py +8 -0
generation/llm_client.py +31 -23
pipeline/nodes/intent.py +5 -1

.env.example CHANGED Viewed

@@ -18,11 +18,19 @@ FALLBACK_BASE_URL=http://<GCP_IP>:8000/v1
 # ── Local Ollama (dev) ────────────────────────────────────────────────────────
 LOCAL_BASE_URL=http://localhost:11434/v1
-LOCAL_MODEL=qwen3:8b
 # ── MLflow ────────────────────────────────────────────────────────────────────
 MLFLOW_TRACKING_URI=mlruns
 MLFLOW_EXPERIMENT=aac-chatbot
 # ── Latency fallback threshold (seconds) ──────────────────────────────────────
 FALLBACK_LATENCY_THRESHOLD=3.5

 # ── Local Ollama (dev) ────────────────────────────────────────────────────────
 LOCAL_BASE_URL=http://localhost:11434/v1
+LOCAL_MODEL=gemma4:31b-cloud     # alternatives: qwen3:8b, qwen3.5:397b-cloud
 # ── MLflow ────────────────────────────────────────────────────────────────────
 MLFLOW_TRACKING_URI=mlruns
 MLFLOW_EXPERIMENT=aac-chatbot
+# ── Thinking mode ─────────────────────────────────────────────────────────────
+# "off"   — suppress thinking (fastest, best for latency-sensitive AAC)
+# "strip" — let model think, but strip <think> tags from output
+# "full"  — return raw response including <think> blocks
+THINKING_MODE=off
+# Extra tokens added on top of max_tokens when thinking is enabled (strip/full). Ignored when off.
+THINKING_TOKEN_BUDGET=4096
 # ── Latency fallback threshold (seconds) ──────────────────────────────────────
 FALLBACK_LATENCY_THRESHOLD=3.5

config/settings.py CHANGED Viewed

@@ -36,6 +36,14 @@ class Settings(BaseSettings):
     # Active tier: "primary" | "fallback" | "local"
     active_llm_tier: str = "local"
     # Wall-clock threshold (seconds) that triggers fallback within a turn
     fallback_latency_threshold: float = 3.5

     # Active tier: "primary" | "fallback" | "local"
     active_llm_tier: str = "local"
+    # Thinking mode: "off" = disable <think> (fastest), "strip" = allow
+    # thinking but strip <think> tags from output, "full" = keep everything
+    thinking_mode: str = "off"
+    # Extra token budget added on top of max_tokens when thinking is enabled
+    # (thinking_mode = "strip" or "full"). Set to 0 if using a non-thinking model.
+    thinking_token_budget: int = 4096
     # Wall-clock threshold (seconds) that triggers fallback within a turn
     fallback_latency_threshold: float = 3.5

generation/llm_client.py CHANGED Viewed

@@ -11,13 +11,14 @@ Tier 3 — local:    Qwen3-8B via Ollama on MacBook M2 (dev / offline)
 Active tier is controlled by settings.active_llm_tier or the `tier`
 argument passed explicitly by the planner node.
-Qwen3 note: Qwen3 defaults to thinking mode (<think>…</think> tokens).
-For AAC we always use non-thinking mode (sub-6 s latency requirement).
-We prepend /no_think to the first user message — this is the Ollama-
-compatible way. vLLM uses extra_body chat_template_kwargs instead.
 """
 from __future__ import annotations
 from functools import lru_cache
 from typing import Any
@@ -25,9 +26,6 @@ from openai import OpenAI
 from config.settings import settings
-# Models that require non-thinking mode enforcement
-_QWEN3_MODELS = {"qwen3", "qwen/qwen3"}
 @lru_cache(maxsize=3)
 def _build_client(base_url: str, api_key: str) -> OpenAI:
@@ -62,15 +60,10 @@ def active_model(tier: str | None = None) -> str:
     }[resolved]
-def _is_qwen3(model: str) -> bool:
-    return any(model.lower().startswith(prefix) for prefix in _QWEN3_MODELS)
 def _apply_no_think(messages: list[dict]) -> list[dict]:
     """
-    Prepend /no_think to the first user message to disable Qwen3 thinking mode.
-    This is the Ollama-compatible approach (works with the OpenAI-compat endpoint).
-    vLLM uses extra_body instead — handled separately in chat_complete().
     """
     result = list(messages)
     for i, msg in enumerate(result):
@@ -80,6 +73,11 @@ def _apply_no_think(messages: list[dict]) -> list[dict]:
     return result
 def chat_complete(
     messages: list[dict],
     max_tokens: int,
@@ -88,11 +86,12 @@ def chat_complete(
     **kwargs: Any,
 ) -> str:
     """
-    Unified chat completion that always enforces Qwen3 non-thinking mode.
-    Returns the response text string directly.
-    Use this in pipeline nodes instead of calling client.chat.completions.create
-    directly — it handles the thinking-mode suppression for all tiers.
     In local dev mode (active_llm_tier="local"), all tier requests are
     redirected to Ollama — there is no separate fallback server locally.
@@ -108,23 +107,32 @@ def chat_complete(
     patched_messages = messages
     extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
-    if _is_qwen3(model):
         if resolved_tier == "local":
-            # Ollama: /no_think prefix in the user message
             patched_messages = _apply_no_think(messages)
         else:
-            # vLLM: disable via chat template kwargs
             extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
     resp = client.chat.completions.create(
         model=model,
         messages=patched_messages,
-        max_tokens=max_tokens,
         temperature=temperature,
         extra_body=extra_body or None,
         **kwargs,
     )
-    return (resp.choices[0].message.content or "").strip()
 def warmup(tier: str | None = None) -> None:

 Active tier is controlled by settings.active_llm_tier or the `tier`
 argument passed explicitly by the planner node.
+Thinking mode is controlled by settings.thinking_mode:
+  "off"   — suppress thinking: prepend /no_think (Ollama) or set chat_template_kwargs (vLLM)
+  "strip" — let the model think, but strip <think>…</think> from output
+  "full"  — return everything including <think> blocks
 """
 from __future__ import annotations
+import re
 from functools import lru_cache
 from typing import Any
 from config.settings import settings
 @lru_cache(maxsize=3)
 def _build_client(base_url: str, api_key: str) -> OpenAI:
     }[resolved]
 def _apply_no_think(messages: list[dict]) -> list[dict]:
     """
+    Prepend /no_think to the first user message.
+    This is the Ollama-compatible way to suppress thinking mode.
     """
     result = list(messages)
     for i, msg in enumerate(result):
     return result
+def _strip_think_tags(text: str) -> str:
+    """Remove <think>…</think> blocks from model output."""
+    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
 def chat_complete(
     messages: list[dict],
     max_tokens: int,
     **kwargs: Any,
 ) -> str:
     """
+    Model-agnostic chat completion. Returns the response text directly.
+    Thinking mode behaviour is controlled entirely by settings.thinking_mode:
+      "off"   — suppress thinking via /no_think (Ollama) or extra_body (vLLM)
+      "strip" — allow thinking but remove <think> tags from the response
+      "full"  — return the raw response including any <think> blocks
     In local dev mode (active_llm_tier="local"), all tier requests are
     redirected to Ollama — there is no separate fallback server locally.
     patched_messages = messages
     extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
+    if settings.thinking_mode == "off":
         if resolved_tier == "local":
             patched_messages = _apply_no_think(messages)
         else:
             extra_body = {**extra_body, "chat_template_kwargs": {"enable_thinking": False}}
+    # When thinking is enabled, add the configured budget so the model
+    # has room to reason without truncating the actual answer.
+    effective_max_tokens = max_tokens
+    if settings.thinking_mode != "off":
+        effective_max_tokens = max_tokens + settings.thinking_token_budget
     resp = client.chat.completions.create(
         model=model,
         messages=patched_messages,
+        max_tokens=effective_max_tokens,
         temperature=temperature,
         extra_body=extra_body or None,
         **kwargs,
     )
+    raw = resp.choices[0].message.content or ""
+    if settings.thinking_mode in ("off", "strip"):
+        raw = _strip_think_tags(raw)
+    return raw.strip()
 def warmup(tier: str | None = None) -> None:

pipeline/nodes/intent.py CHANGED Viewed

@@ -7,6 +7,7 @@ IntentRoute that drives all downstream routing decisions.
 """
 from __future__ import annotations
 import time
 from typing import Literal, Optional
@@ -134,7 +135,10 @@ def run(state: PipelineState) -> dict:
         )
         try:
-            parsed = IntentRouteSchema.model_validate_json(raw)
             route = {
                 "sub_intents": [si.model_dump() for si in parsed.sub_intents],
                 "style_constraints": parsed.style_constraints.model_dump(),

 """
 from __future__ import annotations
+import re
 import time
 from typing import Literal, Optional
         )
         try:
+            # Strip markdown fences (```json ... ```) that many models add
+            cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
+            cleaned = re.sub(r"\s*```$", "", cleaned.strip())
+            parsed = IntentRouteSchema.model_validate_json(cleaned)
             route = {
                 "sub_intents": [si.model_dump() for si in parsed.sub_intents],
                 "style_constraints": parsed.style_constraints.model_dump(),