Spaces:
Sleeping
Sleeping
Commit ·
9ad188a
1
Parent(s): ab9851d
Added scaffolding for evals
Browse files- CLAUDE.md +3 -0
- README.md +22 -1
- backend/api/main.py +55 -2
- backend/config/settings.py +3 -0
- backend/evals/__init__.py +37 -0
- backend/evals/efficiency.py +13 -0
- backend/evals/faithfulness.py +12 -0
- backend/evals/multimodal_alignment.py +18 -0
- backend/generation/llm_client.py +9 -1
- backend/pipeline/nodes/planner.py +7 -2
- backend/pipeline/state.py +1 -0
- frontend/src/App.css +124 -2
- frontend/src/components/ChatPanel.tsx +5 -0
- frontend/src/components/EvalPanel.tsx +137 -0
- frontend/src/components/LatencyMetrics.tsx +2 -2
- frontend/src/types.ts +16 -0
- run.sh +11 -1
CLAUDE.md
CHANGED
|
@@ -103,6 +103,9 @@ Copy `.env.example` → `.env` and set:
|
|
| 103 |
|
| 104 |
## Development Notes
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
- **Adding a persona**: add to `PERSONAS` in `data/generate_users.py`, re-run it,
|
| 107 |
then `python -m backend.retrieval.vector_store` to rebuild indexes
|
| 108 |
- **Changing LLM**: set `ACTIVE_LLM_TIER` in `.env` — no code changes needed
|
|
|
|
| 103 |
|
| 104 |
## Development Notes
|
| 105 |
|
| 106 |
+
- **NEVER use local Ollama models** (e.g. `qwen3:8b`, `gemma3:1b`) — this machine
|
| 107 |
+
is not powerful enough and will break. Always use cloud-backed models like
|
| 108 |
+
`qwen3.5:397b-cloud` or `gpt-oss:20b-cloud` via Ollama, or vLLM tiers.
|
| 109 |
- **Adding a persona**: add to `PERSONAS` in `data/generate_users.py`, re-run it,
|
| 110 |
then `python -m backend.retrieval.vector_store` to rebuild indexes
|
| 111 |
- **Changing LLM**: set `ACTIVE_LLM_TIER` in `.env` — no code changes needed
|
README.md
CHANGED
|
@@ -184,11 +184,32 @@ To add a new persona, edit `data/generate_users.py` and re-run `python -m backen
|
|
| 184 |
|
| 185 |
## TODO
|
| 186 |
|
| 187 |
-
- [ ] Add evals for performance
|
| 188 |
- [ ] Add more datasets
|
| 189 |
- [ ] Reduce latency in intention
|
| 190 |
- [ ] Add more detailed todos
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
---
|
| 193 |
|
| 194 |
## Team
|
|
|
|
| 184 |
|
| 185 |
## TODO
|
| 186 |
|
|
|
|
| 187 |
- [ ] Add more datasets
|
| 188 |
- [ ] Reduce latency in intention
|
| 189 |
- [ ] Add more detailed todos
|
| 190 |
|
| 191 |
+
### Evals (`backend/evals/`)
|
| 192 |
+
|
| 193 |
+
Per-turn metrics returned in `ChatResponse.eval_scores` and rendered in the React debug panel.
|
| 194 |
+
|
| 195 |
+
| Metric | File | Status |
|
| 196 |
+
|--------|------|--------|
|
| 197 |
+
| Communication Efficiency | `efficiency.py` | Done — SLO check on `t_total` |
|
| 198 |
+
| Factual Faithfulness | `faithfulness.py` | Stub |
|
| 199 |
+
| Multimodal Alignment | `multimodal_alignment.py` | Stub |
|
| 200 |
+
| Perceived Authenticity | (frontend) | UI star rating; not persisted yet |
|
| 201 |
+
|
| 202 |
+
- [ ] **Faithfulness** — Load cross-encoder NLI model (e.g. `cross-encoder/nli-deberta-v3-small`),
|
| 203 |
+
split response into sentences, check entailment against evidence chunks. Groundedness =
|
| 204 |
+
fraction with max entailment > 0.5; hallucination rate = fraction with contradiction > 0.5
|
| 205 |
+
and entailment < 0.3. Empty `chunks` → `no_evidence=True`.
|
| 206 |
+
- [ ] **Multimodal Alignment** — Rule-based (no model):
|
| 207 |
+
- Affect → sentiment-word overlap (reuse `affect_positive_map` from planner)
|
| 208 |
+
- Gesture → expected-word overlap (reuse `gesture_word_map` from planner)
|
| 209 |
+
- Gaze → check whether retrieved chunks came from `gaze_bucket` and response references them
|
| 210 |
+
- Overall = mean of non-None sub-scores
|
| 211 |
+
- [ ] **Authenticity** — Persist Likert ratings (currently client-side only). Add `POST /chat/rate`.
|
| 212 |
+
|
| 213 |
---
|
| 214 |
|
| 215 |
## Team
|
backend/api/main.py
CHANGED
|
@@ -8,7 +8,11 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
| 8 |
from pydantic import BaseModel
|
| 9 |
|
| 10 |
from backend.config.settings import settings
|
| 11 |
-
from backend.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from backend.guardrails.checks import check_input
|
| 13 |
from backend.pipeline.graph import aac_graph
|
| 14 |
from backend.pipeline.state import PipelineState
|
|
@@ -63,15 +67,31 @@ class ChatRequest(BaseModel):
|
|
| 63 |
air_written_text: str | None = None
|
| 64 |
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
class ChatResponse(BaseModel):
|
| 67 |
user_id: str
|
| 68 |
query: str
|
| 69 |
response: str
|
| 70 |
affect: str
|
| 71 |
llm_tier: str
|
|
|
|
| 72 |
retrieval_mode: str
|
| 73 |
latency: dict
|
| 74 |
guardrail_passed: bool
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
# ── Helpers ────────────────────────────────────────────────────────────────────
|
|
@@ -123,6 +143,7 @@ def _build_initial_state(req: ChatRequest, session: dict) -> PipelineState:
|
|
| 123 |
candidates=[],
|
| 124 |
selected_response=None,
|
| 125 |
llm_tier_used="",
|
|
|
|
| 126 |
latency_log={
|
| 127 |
"t_sensing": 0.0,
|
| 128 |
"t_intent": 0.0,
|
|
@@ -143,6 +164,23 @@ def health():
|
|
| 143 |
return {"status": "ok", "models_ready": _models_ready}
|
| 144 |
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
@app.get("/users")
|
| 147 |
def list_users():
|
| 148 |
try:
|
|
@@ -170,6 +208,7 @@ def chat(req: ChatRequest):
|
|
| 170 |
response=guard["fallback"],
|
| 171 |
affect="NEUTRAL",
|
| 172 |
llm_tier="none",
|
|
|
|
| 173 |
retrieval_mode="none",
|
| 174 |
latency={},
|
| 175 |
guardrail_passed=False,
|
|
@@ -184,13 +223,27 @@ def chat(req: ChatRequest):
|
|
| 184 |
session["session_history"] = result["session_history"]
|
| 185 |
session["bucket_priors"] = result["bucket_priors"]
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
return ChatResponse(
|
| 188 |
user_id=req.user_id,
|
| 189 |
query=req.query,
|
| 190 |
response=result["selected_response"] or "",
|
| 191 |
-
affect=
|
| 192 |
llm_tier=result.get("llm_tier_used", "unknown"),
|
|
|
|
| 193 |
retrieval_mode=result.get("retrieval_mode_used", "unknown"),
|
| 194 |
latency=result.get("latency_log") or {},
|
| 195 |
guardrail_passed=result.get("guardrail_passed", True),
|
|
|
|
| 196 |
)
|
|
|
|
| 8 |
from pydantic import BaseModel
|
| 9 |
|
| 10 |
from backend.config.settings import settings
|
| 11 |
+
from backend.evals import compute_evals
|
| 12 |
+
from backend.generation.llm_client import ( # active_model used by /debug/config
|
| 13 |
+
active_model,
|
| 14 |
+
get_client,
|
| 15 |
+
)
|
| 16 |
from backend.guardrails.checks import check_input
|
| 17 |
from backend.pipeline.graph import aac_graph
|
| 18 |
from backend.pipeline.state import PipelineState
|
|
|
|
| 67 |
air_written_text: str | None = None
|
| 68 |
|
| 69 |
|
| 70 |
+
class EvalScoresResponse(BaseModel):
|
| 71 |
+
groundedness: float
|
| 72 |
+
hallucination_rate: float
|
| 73 |
+
no_evidence: bool
|
| 74 |
+
t_total_s: float
|
| 75 |
+
slo_target_s: float
|
| 76 |
+
slo_passed: bool
|
| 77 |
+
slo_margin_s: float
|
| 78 |
+
multimodal_alignment: float
|
| 79 |
+
affect_alignment: float
|
| 80 |
+
gesture_alignment: float
|
| 81 |
+
gaze_alignment: float
|
| 82 |
+
|
| 83 |
+
|
| 84 |
class ChatResponse(BaseModel):
|
| 85 |
user_id: str
|
| 86 |
query: str
|
| 87 |
response: str
|
| 88 |
affect: str
|
| 89 |
llm_tier: str
|
| 90 |
+
llm_model: str
|
| 91 |
retrieval_mode: str
|
| 92 |
latency: dict
|
| 93 |
guardrail_passed: bool
|
| 94 |
+
eval_scores: EvalScoresResponse | None = None
|
| 95 |
|
| 96 |
|
| 97 |
# ── Helpers ────────────────────────────────────────────────────────────────────
|
|
|
|
| 143 |
candidates=[],
|
| 144 |
selected_response=None,
|
| 145 |
llm_tier_used="",
|
| 146 |
+
llm_model_used="",
|
| 147 |
latency_log={
|
| 148 |
"t_sensing": 0.0,
|
| 149 |
"t_intent": 0.0,
|
|
|
|
| 164 |
return {"status": "ok", "models_ready": _models_ready}
|
| 165 |
|
| 166 |
|
| 167 |
+
@app.get("/debug/config")
|
| 168 |
+
def debug_config():
|
| 169 |
+
"""Return active model + key settings for the debug panel."""
|
| 170 |
+
return {
|
| 171 |
+
"active_llm_tier": settings.active_llm_tier,
|
| 172 |
+
"active_model": active_model(),
|
| 173 |
+
"thinking_mode": settings.thinking_mode,
|
| 174 |
+
"embed_model": settings.embed_model,
|
| 175 |
+
"rerank_model": settings.rerank_model,
|
| 176 |
+
"retrieval_top_k": settings.retrieval_top_k,
|
| 177 |
+
"retrieval_rerank_k": settings.retrieval_rerank_k,
|
| 178 |
+
"fallback_latency_threshold": settings.fallback_latency_threshold,
|
| 179 |
+
"slo_target_s": settings.slo_target_s,
|
| 180 |
+
"num_candidates": settings.num_candidates,
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
|
| 184 |
@app.get("/users")
|
| 185 |
def list_users():
|
| 186 |
try:
|
|
|
|
| 208 |
response=guard["fallback"],
|
| 209 |
affect="NEUTRAL",
|
| 210 |
llm_tier="none",
|
| 211 |
+
llm_model="none",
|
| 212 |
retrieval_mode="none",
|
| 213 |
latency={},
|
| 214 |
guardrail_passed=False,
|
|
|
|
| 223 |
session["session_history"] = result["session_history"]
|
| 224 |
session["bucket_priors"] = result["bucket_priors"]
|
| 225 |
|
| 226 |
+
# Compute evaluation metrics
|
| 227 |
+
affect_emotion = (result.get("affect") or {}).get("emotion", "NEUTRAL")
|
| 228 |
+
eval_scores = compute_evals(
|
| 229 |
+
response=result["selected_response"] or "",
|
| 230 |
+
chunks=result.get("retrieved_chunks") or [],
|
| 231 |
+
latency_log=result.get("latency_log") or {},
|
| 232 |
+
affect=affect_emotion,
|
| 233 |
+
gesture_tag=req.gesture_tag,
|
| 234 |
+
gaze_bucket=req.gaze_bucket,
|
| 235 |
+
slo_target=settings.slo_target_s,
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
return ChatResponse(
|
| 239 |
user_id=req.user_id,
|
| 240 |
query=req.query,
|
| 241 |
response=result["selected_response"] or "",
|
| 242 |
+
affect=affect_emotion,
|
| 243 |
llm_tier=result.get("llm_tier_used", "unknown"),
|
| 244 |
+
llm_model=result.get("llm_model_used", "unknown"),
|
| 245 |
retrieval_mode=result.get("retrieval_mode_used", "unknown"),
|
| 246 |
latency=result.get("latency_log") or {},
|
| 247 |
guardrail_passed=result.get("guardrail_passed", True),
|
| 248 |
+
eval_scores=eval_scores,
|
| 249 |
)
|
backend/config/settings.py
CHANGED
|
@@ -69,5 +69,8 @@ class Settings(BaseSettings):
|
|
| 69 |
rank_beta: float = 0.3 # style similarity weight
|
| 70 |
rank_gamma: float = 0.3 # affect-match weight
|
| 71 |
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
settings = Settings()
|
|
|
|
| 69 |
rank_beta: float = 0.3 # style similarity weight
|
| 70 |
rank_gamma: float = 0.3 # affect-match weight
|
| 71 |
|
| 72 |
+
# ── Evaluation ────────────────────────────────────────────────────────────
|
| 73 |
+
slo_target_s: float = 6.0 # max acceptable response latency (seconds)
|
| 74 |
+
|
| 75 |
|
| 76 |
settings = Settings()
|
backend/evals/__init__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation metrics — compute after pipeline returns, before API response.
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from backend.evals.efficiency import compute_efficiency
|
| 5 |
+
from backend.evals.faithfulness import compute_faithfulness
|
| 6 |
+
from backend.evals.multimodal_alignment import compute_multimodal_alignment
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def compute_evals(
    response: str,
    chunks: list[dict],
    latency_log: dict,
    affect: str | None,
    gesture_tag: str | None,
    gaze_bucket: str | None,
    slo_target: float = 6.0,
) -> dict:
    """Run all eval scorers for one turn and merge them into one flat dict.

    Args:
        response: Generated reply text to score.
        chunks: Retrieved evidence chunks; passed to the faithfulness and
            multimodal-alignment scorers.
        latency_log: Timing dict for the turn; passed to the efficiency scorer.
        affect: Affect label for the turn, or ``None``.
        gesture_tag: Gesture tag from the request, or ``None``.
        gaze_bucket: Gaze bucket from the request, or ``None``.
        slo_target: Latency budget in seconds used for the SLO check.

    Returns:
        A dict with the keys ``groundedness``, ``hallucination_rate``,
        ``no_evidence``, ``t_total_s``, ``slo_target_s``, ``slo_passed``,
        ``slo_margin_s``, ``multimodal_alignment``, ``affect_alignment``,
        ``gesture_alignment`` and ``gaze_alignment``.
    """
    faith = compute_faithfulness(response, chunks)
    eff = compute_efficiency(latency_log, slo_target)
    align = compute_multimodal_alignment(
        response, affect, gesture_tag, gaze_bucket, chunks
    )

    # Scorer-local key names are renamed here to the flat, unit-suffixed
    # names exposed to API consumers (e.g. "t_total" -> "t_total_s").
    return {
        "groundedness": faith["groundedness"],
        "hallucination_rate": faith["hallucination_rate"],
        "no_evidence": faith["no_evidence"],
        "t_total_s": eff["t_total"],
        "slo_target_s": eff["slo_target"],
        "slo_passed": eff["slo_passed"],
        "slo_margin_s": eff["margin_s"],
        "multimodal_alignment": align["overall_score"],
        "affect_alignment": align["affect_alignment"],
        "gesture_alignment": align["gesture_alignment"],
        "gaze_alignment": align["gaze_alignment"],
    }
|
backend/evals/efficiency.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Communication efficiency — SLO pass/fail on response latency.
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def compute_efficiency(latency_log: dict, slo_target: float = 6.0) -> dict:
    """Check whether the total response time meets the latency SLO.

    Args:
        latency_log: Timing dict for the turn; only the ``"t_total"`` entry
            is read. A missing or ``None`` entry (or a ``None`` log) is
            treated as 0.0.
        slo_target: Maximum acceptable total latency, in the same unit as
            ``t_total``.

    Returns:
        A dict with ``t_total`` (rounded to 3 places), ``slo_target``,
        ``slo_passed`` (``True`` only when strictly under the target) and
        ``margin_s`` (``slo_target - t_total`` rounded to 3 places; negative
        when the target is missed).
    """
    # dict.get's default only covers a *missing* key; an explicit None value
    # would reach round() and raise TypeError, so normalise it here.
    t_total = (latency_log or {}).get("t_total")
    if t_total is None:
        t_total = 0.0

    return {
        "t_total": round(t_total, 3),
        "slo_target": slo_target,
        "slo_passed": t_total < slo_target,
        "margin_s": round(slo_target - t_total, 3),
    }
|
backend/evals/faithfulness.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NLI-based faithfulness scoring.
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def compute_faithfulness(response: str, chunks: list[dict]) -> dict:
    """Report faithfulness scores for a response against its evidence chunks.

    This is a stub: ``groundedness`` and ``hallucination_rate`` are always
    0.0 and ``response`` is not inspected. Only ``no_evidence`` carries
    information.

    Args:
        response: Generated reply text (currently unused).
        chunks: Retrieved evidence chunks; an empty list or ``None`` means
            there was nothing to check the response against.

    Returns:
        A dict with ``groundedness`` (0.0), ``hallucination_rate`` (0.0) and
        ``no_evidence`` (``True`` when ``chunks`` is empty or ``None``).
    """
    # Truthiness covers both [] and None; len(None) would raise TypeError.
    no_evidence = not chunks
    return {
        "groundedness": 0.0,
        "hallucination_rate": 0.0,
        "no_evidence": no_evidence,
    }
|
backend/evals/multimodal_alignment.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multimodal alignment scoring.
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def compute_multimodal_alignment(
    response: str,
    affect: str | None,
    gesture_tag: str | None,
    gaze_bucket: str | None,
    chunks: list[dict],
) -> dict:
    """Score alignment between non-verbal inputs and the generated text.

    This is a stub: none of the arguments are inspected and every score is
    the constant 0.0, so the values must not be read as measured results.

    Args:
        response: Generated reply text (currently unused).
        affect: Affect label for the turn, or ``None`` (currently unused).
        gesture_tag: Gesture tag, or ``None`` (currently unused).
        gaze_bucket: Gaze bucket, or ``None`` (currently unused).
        chunks: Retrieved evidence chunks (currently unused).

    Returns:
        A dict with ``overall_score``, ``affect_alignment``,
        ``gesture_alignment`` and ``gaze_alignment``, all 0.0.
    """
    return {
        "overall_score": 0.0,
        "affect_alignment": 0.0,
        "gesture_alignment": 0.0,
        "gaze_alignment": 0.0,
    }
|
backend/generation/llm_client.py
CHANGED
|
@@ -97,11 +97,19 @@ def chat_complete(
|
|
| 97 |
**kwargs,
|
| 98 |
)
|
| 99 |
raw = (resp.choices[0].message.content if resp.choices else "") or ""
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
if settings.thinking_mode in ("off", "strip"):
|
| 102 |
raw = _strip_think_tags(raw)
|
| 103 |
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
def warmup(tier: str | None = None) -> None:
|
|
|
|
| 97 |
**kwargs,
|
| 98 |
)
|
| 99 |
raw = (resp.choices[0].message.content if resp.choices else "") or ""
|
| 100 |
+
print(
|
| 101 |
+
f"[llm_client] tier={resolved_tier} model={model} raw_len={len(raw)} raw={raw[:200]!r}"
|
| 102 |
+
)
|
| 103 |
|
| 104 |
if settings.thinking_mode in ("off", "strip"):
|
| 105 |
raw = _strip_think_tags(raw)
|
| 106 |
|
| 107 |
+
stripped = raw.strip()
|
| 108 |
+
if not stripped:
|
| 109 |
+
print(
|
| 110 |
+
f"[llm_client] WARNING: empty response after strip. finish_reason={resp.choices[0].finish_reason if resp.choices else 'none'}"
|
| 111 |
+
)
|
| 112 |
+
return stripped
|
| 113 |
|
| 114 |
|
| 115 |
def warmup(tier: str | None = None) -> None:
|
backend/pipeline/nodes/planner.py
CHANGED
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
| 4 |
import time
|
| 5 |
|
| 6 |
from backend.config.settings import settings
|
| 7 |
-
from backend.generation.llm_client import chat_complete
|
| 8 |
from backend.guardrails.checks import check_output
|
| 9 |
from backend.pipeline.state import PipelineState
|
| 10 |
from backend.sensing.gesture import GESTURE_TO_TAG
|
|
@@ -94,11 +94,16 @@ def _run(state: PipelineState, tier: str) -> dict:
|
|
| 94 |
4,
|
| 95 |
)
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
return {
|
| 98 |
"augmented_prompt": prompt,
|
| 99 |
"candidates": candidates,
|
| 100 |
"selected_response": selected,
|
| 101 |
-
"llm_tier_used":
|
|
|
|
| 102 |
"latency_log": latency_log,
|
| 103 |
"guardrail_passed": guard["passed"],
|
| 104 |
}
|
|
|
|
| 4 |
import time
|
| 5 |
|
| 6 |
from backend.config.settings import settings
|
| 7 |
+
from backend.generation.llm_client import active_model, chat_complete
|
| 8 |
from backend.guardrails.checks import check_output
|
| 9 |
from backend.pipeline.state import PipelineState
|
| 10 |
from backend.sensing.gesture import GESTURE_TO_TAG
|
|
|
|
| 94 |
4,
|
| 95 |
)
|
| 96 |
|
| 97 |
+
# Mirror chat_complete's tier collapsing so the reported model matches what ran.
|
| 98 |
+
actual_tier = "local" if settings.active_llm_tier == "local" else tier
|
| 99 |
+
actual_model = active_model(actual_tier)
|
| 100 |
+
|
| 101 |
return {
|
| 102 |
"augmented_prompt": prompt,
|
| 103 |
"candidates": candidates,
|
| 104 |
"selected_response": selected,
|
| 105 |
+
"llm_tier_used": actual_tier,
|
| 106 |
+
"llm_model_used": actual_model,
|
| 107 |
"latency_log": latency_log,
|
| 108 |
"guardrail_passed": guard["passed"],
|
| 109 |
}
|
backend/pipeline/state.py
CHANGED
|
@@ -90,6 +90,7 @@ class PipelineState(TypedDict):
|
|
| 90 |
candidates: list[str] # 2-3 candidate responses
|
| 91 |
selected_response: str | None
|
| 92 |
llm_tier_used: str # "primary" | "fallback" | "local"
|
|
|
|
| 93 |
|
| 94 |
# ── L5: Feedback / tracking ───────────────────────────────────────────────
|
| 95 |
latency_log: LatencyLog | None
|
|
|
|
| 90 |
candidates: list[str] # 2-3 candidate responses
|
| 91 |
selected_response: str | None
|
| 92 |
llm_tier_used: str # "primary" | "fallback" | "local"
|
| 93 |
+
llm_model_used: str # actual model name (e.g. "gemma4:31b-cloud")
|
| 94 |
|
| 95 |
# ── L5: Feedback / tracking ───────────────────────────────────────────────
|
| 96 |
latency_log: LatencyLog | None
|
frontend/src/App.css
CHANGED
|
@@ -124,14 +124,15 @@ select:focus, input[type="text"]:focus {
|
|
| 124 |
font-weight: 500;
|
| 125 |
}
|
| 126 |
|
| 127 |
-
/* ──
|
| 128 |
|
| 129 |
-
.
|
| 130 |
font-size: 12px;
|
| 131 |
color: #888;
|
| 132 |
text-transform: uppercase;
|
| 133 |
letter-spacing: 0.5px;
|
| 134 |
margin-bottom: 6px;
|
|
|
|
| 135 |
}
|
| 136 |
|
| 137 |
.metric-row {
|
|
@@ -139,6 +140,13 @@ select:focus, input[type="text"]:focus {
|
|
| 139 |
justify-content: space-between;
|
| 140 |
font-size: 13px;
|
| 141 |
padding: 2px 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
}
|
| 143 |
|
| 144 |
.metric-label {
|
|
@@ -150,6 +158,14 @@ select:focus, input[type="text"]:focus {
|
|
| 150 |
font-family: monospace;
|
| 151 |
}
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
.no-metrics {
|
| 154 |
color: #555;
|
| 155 |
font-size: 13px;
|
|
@@ -260,3 +276,109 @@ select:focus, input[type="text"]:focus {
|
|
| 260 |
color: #e55;
|
| 261 |
font-size: 13px;
|
| 262 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
font-weight: 500;
|
| 125 |
}
|
| 126 |
|
| 127 |
+
/* ── Shared metric primitives (latency, eval) ─────────────────────── */
|
| 128 |
|
| 129 |
+
.section-title {
|
| 130 |
font-size: 12px;
|
| 131 |
color: #888;
|
| 132 |
text-transform: uppercase;
|
| 133 |
letter-spacing: 0.5px;
|
| 134 |
margin-bottom: 6px;
|
| 135 |
+
font-weight: 600;
|
| 136 |
}
|
| 137 |
|
| 138 |
.metric-row {
|
|
|
|
| 140 |
justify-content: space-between;
|
| 141 |
font-size: 13px;
|
| 142 |
padding: 2px 0;
|
| 143 |
+
color: #aaa;
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
.metric-row.sub {
|
| 147 |
+
padding-left: 12px;
|
| 148 |
+
font-size: 11px;
|
| 149 |
+
color: #777;
|
| 150 |
}
|
| 151 |
|
| 152 |
.metric-label {
|
|
|
|
| 158 |
font-family: monospace;
|
| 159 |
}
|
| 160 |
|
| 161 |
+
.metric-value.pass {
|
| 162 |
+
color: #4caf50;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
.metric-value.fail {
|
| 166 |
+
color: #f44336;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
.no-metrics {
|
| 170 |
color: #555;
|
| 171 |
font-size: 13px;
|
|
|
|
| 276 |
color: #e55;
|
| 277 |
font-size: 13px;
|
| 278 |
}
|
| 279 |
+
|
| 280 |
+
/* ── Eval panel ──────────────────────────────────────────────────── */
|
| 281 |
+
|
| 282 |
+
.eval-panel {
|
| 283 |
+
margin-top: 8px;
|
| 284 |
+
border-top: 1px solid #3a3d47;
|
| 285 |
+
padding-top: 6px;
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
.eval-toggle {
|
| 289 |
+
background: none;
|
| 290 |
+
border: none;
|
| 291 |
+
color: #888;
|
| 292 |
+
font-size: 12px;
|
| 293 |
+
cursor: pointer;
|
| 294 |
+
padding: 2px 0;
|
| 295 |
+
display: flex;
|
| 296 |
+
align-items: center;
|
| 297 |
+
gap: 8px;
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
.eval-toggle:hover {
|
| 301 |
+
color: #bbb;
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
.slo-badge {
|
| 305 |
+
font-size: 10px;
|
| 306 |
+
padding: 1px 6px;
|
| 307 |
+
border-radius: 3px;
|
| 308 |
+
font-weight: 600;
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
.slo-badge.pass {
|
| 312 |
+
background: #1b3a1b;
|
| 313 |
+
color: #4caf50;
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
.slo-badge.fail {
|
| 317 |
+
background: #3a1b1b;
|
| 318 |
+
color: #f44336;
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
.eval-details {
|
| 322 |
+
display: flex;
|
| 323 |
+
flex-direction: column;
|
| 324 |
+
gap: 10px;
|
| 325 |
+
margin-top: 8px;
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
.eval-section {
|
| 329 |
+
display: flex;
|
| 330 |
+
flex-direction: column;
|
| 331 |
+
gap: 4px;
|
| 332 |
+
}
|
| 333 |
+
|
| 334 |
+
.eval-na {
|
| 335 |
+
font-size: 11px;
|
| 336 |
+
color: #666;
|
| 337 |
+
font-style: italic;
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
.score-bar {
|
| 341 |
+
height: 4px;
|
| 342 |
+
background: #2a2d37;
|
| 343 |
+
border-radius: 2px;
|
| 344 |
+
overflow: hidden;
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
.score-bar-fill {
|
| 348 |
+
height: 100%;
|
| 349 |
+
border-radius: 2px;
|
| 350 |
+
transition: width 0.3s ease;
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
/* ── Star rating ─────────────────────────────────────────────────── */
|
| 354 |
+
|
| 355 |
+
.star-rating {
|
| 356 |
+
display: flex;
|
| 357 |
+
align-items: center;
|
| 358 |
+
gap: 2px;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
.star-rating .star {
|
| 362 |
+
background: none;
|
| 363 |
+
border: none;
|
| 364 |
+
font-size: 18px;
|
| 365 |
+
cursor: pointer;
|
| 366 |
+
color: #3a3d47;
|
| 367 |
+
padding: 0;
|
| 368 |
+
line-height: 1;
|
| 369 |
+
transition: color 0.15s;
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
.star-rating .star.active {
|
| 373 |
+
color: #ff9800;
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
.star-rating .star:hover {
|
| 377 |
+
color: #ffb74d;
|
| 378 |
+
}
|
| 379 |
+
|
| 380 |
+
.star-label {
|
| 381 |
+
font-size: 11px;
|
| 382 |
+
color: #888;
|
| 383 |
+
margin-left: 6px;
|
| 384 |
+
}
|
frontend/src/components/ChatPanel.tsx
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import { useState, useRef, useEffect } from "react";
|
| 2 |
import type { ChatMessage, SensingState, Affect, LatencyLog } from "../types";
|
| 3 |
import { sendChat } from "../lib/api";
|
|
|
|
| 4 |
|
| 5 |
interface Props {
|
| 6 |
userId: string | null;
|
|
@@ -59,6 +60,7 @@ export function ChatPanel({
|
|
| 59 |
content: res.response,
|
| 60 |
latency: res.latency,
|
| 61 |
affect: res.affect,
|
|
|
|
| 62 |
},
|
| 63 |
]);
|
| 64 |
onLatency(res.latency);
|
|
@@ -88,6 +90,9 @@ export function ChatPanel({
|
|
| 88 |
{msg.role === "partner" ? "Partner" : "AAC User"}
|
| 89 |
</span>
|
| 90 |
<p>{msg.content}</p>
|
|
|
|
|
|
|
|
|
|
| 91 |
</div>
|
| 92 |
))}
|
| 93 |
{loading && (
|
|
|
|
| 1 |
import { useState, useRef, useEffect } from "react";
|
| 2 |
import type { ChatMessage, SensingState, Affect, LatencyLog } from "../types";
|
| 3 |
import { sendChat } from "../lib/api";
|
| 4 |
+
import { EvalPanel } from "./EvalPanel";
|
| 5 |
|
| 6 |
interface Props {
|
| 7 |
userId: string | null;
|
|
|
|
| 60 |
content: res.response,
|
| 61 |
latency: res.latency,
|
| 62 |
affect: res.affect,
|
| 63 |
+
evalScores: res.eval_scores,
|
| 64 |
},
|
| 65 |
]);
|
| 66 |
onLatency(res.latency);
|
|
|
|
| 90 |
{msg.role === "partner" ? "Partner" : "AAC User"}
|
| 91 |
</span>
|
| 92 |
<p>{msg.content}</p>
|
| 93 |
+
{msg.role === "aac_user" && msg.evalScores && (
|
| 94 |
+
<EvalPanel evalScores={msg.evalScores} />
|
| 95 |
+
)}
|
| 96 |
</div>
|
| 97 |
))}
|
| 98 |
{loading && (
|
frontend/src/components/EvalPanel.tsx
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import type { EvalScores } from "../types";
|
| 3 |
+
|
| 4 |
+
interface Props {
|
| 5 |
+
evalScores: EvalScores;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
/**
 * Thin horizontal bar visualising a 0..1 score.
 *
 * The fill width is the score as a percentage, clamped to [0, 100]; the fill
 * colour is green above 70%, orange above 40%, red otherwise.
 */
function ScoreBar({ value }: { value: number }) {
  // Clamp both ends: a negative or NaN score would otherwise produce an
  // invalid CSS width. NaN is drawn as an empty bar.
  const pct = Number.isNaN(value) ? 0 : Math.min(Math.max(value * 100, 0), 100);
  const color = pct > 70 ? "#4caf50" : pct > 40 ? "#ff9800" : "#f44336";
  return (
    <div className="score-bar">
      <div className="score-bar-fill" style={{ width: `${pct}%`, background: color }} />
    </div>
  );
}
|
| 17 |
+
|
| 18 |
+
/**
 * Five-star rating input.
 *
 * `value` is the committed rating (null when unrated); `onChange` is called
 * with 1..5 when a star is clicked. While the pointer is over a star, the
 * hovered star count is previewed instead of the committed value.
 */
function StarRating({
  value,
  onChange,
}: {
  value: number | null;
  onChange: (v: number) => void;
}) {
  const [hover, setHover] = useState(0);
  // Hover preview wins over the committed rating; 0 means "not hovering".
  const shown = hover || (value ?? 0);
  return (
    <div className="star-rating">
      {[1, 2, 3, 4, 5].map((star) => (
        <button
          key={star}
          // Explicit type: a bare <button> defaults to "submit" inside a form.
          type="button"
          // The glyph alone gives assistive tech nothing meaningful to announce.
          aria-label={`Rate ${star} out of 5`}
          className={`star ${star <= shown ? "active" : ""}`}
          onMouseEnter={() => setHover(star)}
          onMouseLeave={() => setHover(0)}
          onClick={() => onChange(star)}
        >
          ★
        </button>
      ))}
      {value !== null && <span className="star-label">{value}/5</span>}
    </div>
  );
}
|
| 43 |
+
|
| 44 |
+
/**
 * Collapsible per-message panel showing the eval metrics for one response.
 *
 * The collapsed header shows the SLO pass/fail badge and, once given, the
 * star rating. The expanded body lists faithfulness, efficiency and
 * multimodal-alignment scores plus a star-rating input. The rating is held
 * in local component state only.
 */
export function EvalPanel({ evalScores }: Props) {
  const [expanded, setExpanded] = useState(false);
  const [likert, setLikert] = useState<number | null>(null);

  return (
    <div className="eval-panel">
      <button
        type="button"
        className="eval-toggle"
        aria-expanded={expanded}
        onClick={() => setExpanded((prev) => !prev)}
      >
        {expanded ? "▾" : "▸"} Eval Metrics
        {evalScores.slo_passed ? (
          <span className="slo-badge pass">SLO ✓</span>
        ) : (
          <span className="slo-badge fail">SLO ✗</span>
        )}
        {likert !== null && (
          <span className="slo-badge">{likert}/5 ★</span>
        )}
      </button>

      {expanded && (
        <div className="eval-details">
          <div className="eval-section">
            <div className="section-title">Factual Faithfulness</div>
            {evalScores.no_evidence ? (
              <div className="eval-na">N/A — no evidence retrieved</div>
            ) : (
              <>
                <div className="metric-row">
                  <span>Groundedness</span>
                  <span className="metric-value">{(evalScores.groundedness * 100).toFixed(0)}%</span>
                </div>
                <ScoreBar value={evalScores.groundedness} />
                <div className="metric-row">
                  <span>Hallucination Rate</span>
                  <span className={`metric-value ${evalScores.hallucination_rate > 0.2 ? "fail" : "pass"}`}>
                    {(evalScores.hallucination_rate * 100).toFixed(0)}%
                  </span>
                </div>
              </>
            )}
          </div>

          <div className="eval-section">
            <div className="section-title">Communication Efficiency</div>
            <div className="metric-row">
              <span>Response Time</span>
              <span className={`metric-value ${evalScores.slo_passed ? "pass" : "fail"}`}>
                {evalScores.t_total_s.toFixed(2)}s
                {evalScores.slo_passed ? " ✓" : " ✗"}
              </span>
            </div>
            <div className="metric-row sub">
              <span>SLO Target</span>
              <span className="metric-value">
                {/* A literal "<" is not allowed in JSX text; use the entity. */}
                &lt; {evalScores.slo_target_s.toFixed(1)}s (margin: {evalScores.slo_margin_s.toFixed(2)}s)
              </span>
            </div>
          </div>

          <div className="eval-section">
            <div className="section-title">Multimodal Alignment</div>
            <div className="metric-row">
              <span>Overall</span>
              <span className="metric-value">{(evalScores.multimodal_alignment * 100).toFixed(0)}%</span>
            </div>
            <ScoreBar value={evalScores.multimodal_alignment} />
            <div className="metric-row sub">
              <span>Affect</span>
              <span className="metric-value">{(evalScores.affect_alignment * 100).toFixed(0)}%</span>
            </div>
            <div className="metric-row sub">
              <span>Gesture</span>
              <span className="metric-value">{(evalScores.gesture_alignment * 100).toFixed(0)}%</span>
            </div>
            <div className="metric-row sub">
              <span>Gaze</span>
              <span className="metric-value">{(evalScores.gaze_alignment * 100).toFixed(0)}%</span>
            </div>
          </div>

          <div className="eval-section">
            <div className="section-title">Perceived Authenticity</div>
            <div className="metric-row">
              <span>Rate this response</span>
            </div>
            <StarRating value={likert} onChange={setLikert} />
          </div>
        </div>
      )}
    </div>
  );
}
|
frontend/src/components/LatencyMetrics.tsx
CHANGED
|
@@ -16,8 +16,8 @@ export function LatencyMetrics({ latency }: Props) {
|
|
| 16 |
if (!latency) return <p className="no-metrics">No turn yet</p>;
|
| 17 |
|
| 18 |
return (
|
| 19 |
-
<div
|
| 20 |
-
<
|
| 21 |
{FIELDS.map(({ key, label }) => (
|
| 22 |
<div key={key} className="metric-row">
|
| 23 |
<span className="metric-label">{label}</span>
|
|
|
|
| 16 |
if (!latency) return <p className="no-metrics">No turn yet</p>;
|
| 17 |
|
| 18 |
return (
|
| 19 |
+
<div>
|
| 20 |
+
<div className="section-title">Latency</div>
|
| 21 |
{FIELDS.map(({ key, label }) => (
|
| 22 |
<div key={key} className="metric-row">
|
| 23 |
<span className="metric-label">{label}</span>
|
frontend/src/types.ts
CHANGED
|
@@ -33,6 +33,20 @@ export interface LatencyLog {
|
|
| 33 |
t_total: number;
|
| 34 |
}
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
export interface ChatResponse {
|
| 37 |
user_id: string;
|
| 38 |
query: string;
|
|
@@ -42,6 +56,7 @@ export interface ChatResponse {
|
|
| 42 |
retrieval_mode: string;
|
| 43 |
latency: LatencyLog;
|
| 44 |
guardrail_passed: boolean;
|
|
|
|
| 45 |
}
|
| 46 |
|
| 47 |
export interface ChatMessage {
|
|
@@ -49,4 +64,5 @@ export interface ChatMessage {
|
|
| 49 |
content: string;
|
| 50 |
latency?: LatencyLog;
|
| 51 |
affect?: string;
|
|
|
|
| 52 |
}
|
|
|
|
| 33 |
t_total: number;
|
| 34 |
}
|
| 35 |
|
| 36 |
+
export interface EvalScores {
|
| 37 |
+
groundedness: number;
|
| 38 |
+
hallucination_rate: number;
|
| 39 |
+
no_evidence: boolean;
|
| 40 |
+
t_total_s: number;
|
| 41 |
+
slo_target_s: number;
|
| 42 |
+
slo_passed: boolean;
|
| 43 |
+
slo_margin_s: number;
|
| 44 |
+
multimodal_alignment: number;
|
| 45 |
+
affect_alignment: number;
|
| 46 |
+
gesture_alignment: number;
|
| 47 |
+
gaze_alignment: number;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
export interface ChatResponse {
|
| 51 |
user_id: string;
|
| 52 |
query: string;
|
|
|
|
| 56 |
retrieval_mode: string;
|
| 57 |
latency: LatencyLog;
|
| 58 |
guardrail_passed: boolean;
|
| 59 |
+
eval_scores: EvalScores | null;
|
| 60 |
}
|
| 61 |
|
| 62 |
export interface ChatMessage {
|
|
|
|
| 64 |
content: string;
|
| 65 |
latency?: LatencyLog;
|
| 66 |
affect?: string;
|
| 67 |
+
evalScores?: EvalScores | null;
|
| 68 |
}
|
run.sh
CHANGED
|
@@ -3,6 +3,16 @@ set -euo pipefail
|
|
| 3 |
|
| 4 |
export PYTHONWARNINGS="ignore::UserWarning:multiprocessing.resource_tracker"
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
PIDS=()
|
| 7 |
|
| 8 |
cleanup() {
|
|
@@ -22,7 +32,7 @@ trap cleanup INT TERM
|
|
| 22 |
|
| 23 |
# Use Node 22 if available (Vite 8 requires Node 20.19+ or 22.12+)
|
| 24 |
if [ -x /opt/homebrew/opt/node@22/bin/node ]; then
|
| 25 |
-
export PATH="/opt/homebrew/opt/node@22/bin:$PATH"
|
| 26 |
fi
|
| 27 |
|
| 28 |
# Start Ollama if not already running
|
|
|
|
| 3 |
|
| 4 |
export PYTHONWARNINGS="ignore::UserWarning:multiprocessing.resource_tracker"
|
| 5 |
|
| 6 |
+
CONDA_ENV="aac-chatbot"
|
| 7 |
+
|
| 8 |
+
# Activate conda env
|
| 9 |
+
if ! command -v conda >/dev/null 2>&1; then
|
| 10 |
+
echo "ERROR: conda not found. Run setup.sh first." >&2
|
| 11 |
+
exit 1
|
| 12 |
+
fi
|
| 13 |
+
eval "$(conda shell.bash hook)"
|
| 14 |
+
conda activate "$CONDA_ENV"
|
| 15 |
+
|
| 16 |
PIDS=()
|
| 17 |
|
| 18 |
cleanup() {
|
|
|
|
| 32 |
|
| 33 |
# Use Node 22 if available (Vite 8 requires Node 20.19+ or 22.12+)
|
| 34 |
if [ -x /opt/homebrew/opt/node@22/bin/node ]; then
|
| 35 |
+
export PATH="/opt/homebrew/opt/node@22/bin:/opt/homebrew/bin:$PATH"
|
| 36 |
fi
|
| 37 |
|
| 38 |
# Start Ollama if not already running
|