shwetangisingh committed on
Commit
9ad188a
·
1 Parent(s): ab9851d

Added scaffolding for evals

Browse files
CLAUDE.md CHANGED
@@ -103,6 +103,9 @@ Copy `.env.example` → `.env` and set:
103
 
104
  ## Development Notes
105
 
 
 
 
106
  - **Adding a persona**: add to `PERSONAS` in `data/generate_users.py`, re-run it,
107
  then `python -m backend.retrieval.vector_store` to rebuild indexes
108
  - **Changing LLM**: set `ACTIVE_LLM_TIER` in `.env` — no code changes needed
 
103
 
104
  ## Development Notes
105
 
106
+ - **NEVER use local Ollama models** (e.g. `qwen3:8b`, `gemma3:1b`) — this machine
107
+ is not powerful enough and will break. Always use cloud-backed models like
108
+ `qwen3.5:397b-cloud` or `gpt-oss:20b-cloud` via Ollama, or vLLM tiers.
109
  - **Adding a persona**: add to `PERSONAS` in `data/generate_users.py`, re-run it,
110
  then `python -m backend.retrieval.vector_store` to rebuild indexes
111
  - **Changing LLM**: set `ACTIVE_LLM_TIER` in `.env` — no code changes needed
README.md CHANGED
@@ -184,11 +184,32 @@ To add a new persona, edit `data/generate_users.py` and re-run `python -m backen
184
 
185
  ## TODO
186
 
187
- - [ ] Add evals for performance
188
  - [ ] Add more dataset
189
  - [ ] Reduce latency in intention
190
  - [ ] Add more detailed todos
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  ---
193
 
194
  ## Team
 
184
 
185
  ## TODO
186
 
 
187
  - [ ] Add more dataset
188
  - [ ] Reduce latency in intention
189
  - [ ] Add more detailed todos
190
 
191
+ ### Evals (`backend/evals/`)
192
+
193
+ Per-turn metrics returned in `ChatResponse.eval_scores` and rendered in the React debug panel.
194
+
195
+ | Metric | File | Status |
196
+ |--------|------|--------|
197
+ | Communication Efficiency | `efficiency.py` | Done — SLO check on `t_total` |
198
+ | Factual Faithfulness | `faithfulness.py` | Stub |
199
+ | Multimodal Alignment | `multimodal_alignment.py` | Stub |
200
+ | Perceived Authenticity | (frontend) | UI star rating; not persisted yet |
201
+
202
+ - [ ] **Faithfulness** — Load cross-encoder NLI model (e.g. `cross-encoder/nli-deberta-v3-small`),
203
+ split response into sentences, check entailment against evidence chunks. Groundedness =
204
+ fraction with max entailment > 0.5; hallucination rate = fraction with contradiction > 0.5
205
+ and entailment < 0.3. Empty `chunks` → `no_evidence=True`.
206
+ - [ ] **Multimodal Alignment** — Rule-based (no model):
207
+ - Affect → sentiment-word overlap (reuse `affect_positive_map` from planner)
208
+ - Gesture → expected-word overlap (reuse `gesture_word_map` from planner)
209
+ - Gaze → check whether retrieved chunks came from `gaze_bucket` and response references them
210
+ - Overall = mean of non-None sub-scores
211
+ - [ ] **Authenticity** — Persist Likert ratings (currently client-side only). Add `POST /chat/rate`.
212
+
213
  ---
214
 
215
  ## Team
backend/api/main.py CHANGED
@@ -8,7 +8,11 @@ from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel
9
 
10
  from backend.config.settings import settings
11
- from backend.generation.llm_client import get_client
 
 
 
 
12
  from backend.guardrails.checks import check_input
13
  from backend.pipeline.graph import aac_graph
14
  from backend.pipeline.state import PipelineState
@@ -63,15 +67,31 @@ class ChatRequest(BaseModel):
63
  air_written_text: str | None = None
64
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  class ChatResponse(BaseModel):
67
  user_id: str
68
  query: str
69
  response: str
70
  affect: str
71
  llm_tier: str
 
72
  retrieval_mode: str
73
  latency: dict
74
  guardrail_passed: bool
 
75
 
76
 
77
  # ── Helpers ────────────────────────────────────────────────────────────────────
@@ -123,6 +143,7 @@ def _build_initial_state(req: ChatRequest, session: dict) -> PipelineState:
123
  candidates=[],
124
  selected_response=None,
125
  llm_tier_used="",
 
126
  latency_log={
127
  "t_sensing": 0.0,
128
  "t_intent": 0.0,
@@ -143,6 +164,23 @@ def health():
143
  return {"status": "ok", "models_ready": _models_ready}
144
 
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  @app.get("/users")
147
  def list_users():
148
  try:
@@ -170,6 +208,7 @@ def chat(req: ChatRequest):
170
  response=guard["fallback"],
171
  affect="NEUTRAL",
172
  llm_tier="none",
 
173
  retrieval_mode="none",
174
  latency={},
175
  guardrail_passed=False,
@@ -184,13 +223,27 @@ def chat(req: ChatRequest):
184
  session["session_history"] = result["session_history"]
185
  session["bucket_priors"] = result["bucket_priors"]
186
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  return ChatResponse(
188
  user_id=req.user_id,
189
  query=req.query,
190
  response=result["selected_response"] or "",
191
- affect=(result.get("affect") or {}).get("emotion", "NEUTRAL"),
192
  llm_tier=result.get("llm_tier_used", "unknown"),
 
193
  retrieval_mode=result.get("retrieval_mode_used", "unknown"),
194
  latency=result.get("latency_log") or {},
195
  guardrail_passed=result.get("guardrail_passed", True),
 
196
  )
 
8
  from pydantic import BaseModel
9
 
10
  from backend.config.settings import settings
11
+ from backend.evals import compute_evals
12
+ from backend.generation.llm_client import ( # active_model used by /debug/config
13
+ active_model,
14
+ get_client,
15
+ )
16
  from backend.guardrails.checks import check_input
17
  from backend.pipeline.graph import aac_graph
18
  from backend.pipeline.state import PipelineState
 
67
  air_written_text: str | None = None
68
 
69
 
70
+ class EvalScoresResponse(BaseModel):
71
+ groundedness: float
72
+ hallucination_rate: float
73
+ no_evidence: bool
74
+ t_total_s: float
75
+ slo_target_s: float
76
+ slo_passed: bool
77
+ slo_margin_s: float
78
+ multimodal_alignment: float
79
+ affect_alignment: float
80
+ gesture_alignment: float
81
+ gaze_alignment: float
82
+
83
+
84
  class ChatResponse(BaseModel):
85
  user_id: str
86
  query: str
87
  response: str
88
  affect: str
89
  llm_tier: str
90
+ llm_model: str
91
  retrieval_mode: str
92
  latency: dict
93
  guardrail_passed: bool
94
+ eval_scores: EvalScoresResponse | None = None
95
 
96
 
97
  # ── Helpers ────────────────────────────────────────────────────────────────────
 
143
  candidates=[],
144
  selected_response=None,
145
  llm_tier_used="",
146
+ llm_model_used="",
147
  latency_log={
148
  "t_sensing": 0.0,
149
  "t_intent": 0.0,
 
164
  return {"status": "ok", "models_ready": _models_ready}
165
 
166
 
167
+ @app.get("/debug/config")
168
+ def debug_config():
169
+ """Return active model + key settings for the debug panel."""
170
+ return {
171
+ "active_llm_tier": settings.active_llm_tier,
172
+ "active_model": active_model(),
173
+ "thinking_mode": settings.thinking_mode,
174
+ "embed_model": settings.embed_model,
175
+ "rerank_model": settings.rerank_model,
176
+ "retrieval_top_k": settings.retrieval_top_k,
177
+ "retrieval_rerank_k": settings.retrieval_rerank_k,
178
+ "fallback_latency_threshold": settings.fallback_latency_threshold,
179
+ "slo_target_s": settings.slo_target_s,
180
+ "num_candidates": settings.num_candidates,
181
+ }
182
+
183
+
184
  @app.get("/users")
185
  def list_users():
186
  try:
 
208
  response=guard["fallback"],
209
  affect="NEUTRAL",
210
  llm_tier="none",
211
+ llm_model="none",
212
  retrieval_mode="none",
213
  latency={},
214
  guardrail_passed=False,
 
223
  session["session_history"] = result["session_history"]
224
  session["bucket_priors"] = result["bucket_priors"]
225
 
226
+ # Compute evaluation metrics
227
+ affect_emotion = (result.get("affect") or {}).get("emotion", "NEUTRAL")
228
+ eval_scores = compute_evals(
229
+ response=result["selected_response"] or "",
230
+ chunks=result.get("retrieved_chunks") or [],
231
+ latency_log=result.get("latency_log") or {},
232
+ affect=affect_emotion,
233
+ gesture_tag=req.gesture_tag,
234
+ gaze_bucket=req.gaze_bucket,
235
+ slo_target=settings.slo_target_s,
236
+ )
237
+
238
  return ChatResponse(
239
  user_id=req.user_id,
240
  query=req.query,
241
  response=result["selected_response"] or "",
242
+ affect=affect_emotion,
243
  llm_tier=result.get("llm_tier_used", "unknown"),
244
+ llm_model=result.get("llm_model_used", "unknown"),
245
  retrieval_mode=result.get("retrieval_mode_used", "unknown"),
246
  latency=result.get("latency_log") or {},
247
  guardrail_passed=result.get("guardrail_passed", True),
248
+ eval_scores=eval_scores,
249
  )
backend/config/settings.py CHANGED
@@ -69,5 +69,8 @@ class Settings(BaseSettings):
69
  rank_beta: float = 0.3 # style similarity weight
70
  rank_gamma: float = 0.3 # affect-match weight
71
 
 
 
 
72
 
73
  settings = Settings()
 
69
  rank_beta: float = 0.3 # style similarity weight
70
  rank_gamma: float = 0.3 # affect-match weight
71
 
72
+ # ── Evaluation ────────────────────────────────────────────────────────────
73
+ slo_target_s: float = 6.0 # max acceptable response latency (seconds)
74
+
75
 
76
  settings = Settings()
backend/evals/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluation metrics — compute after pipeline returns, before API response.
2
+ from __future__ import annotations
3
+
4
+ from backend.evals.efficiency import compute_efficiency
5
+ from backend.evals.faithfulness import compute_faithfulness
6
+ from backend.evals.multimodal_alignment import compute_multimodal_alignment
7
+
8
+
9
def compute_evals(
    response: str,
    chunks: list[dict],
    latency_log: dict,
    affect: str | None,
    gesture_tag: str | None,
    gaze_bucket: str | None,
    slo_target: float = 6.0,
) -> dict:
    """Run all eval scorers and flatten their results into one dict.

    Args:
        response: Generated response text to score.
        chunks: Evidence chunks passed to the faithfulness and alignment scorers.
        latency_log: Timing dict passed to the efficiency scorer.
        affect: Affect label for the turn, or None.
        gesture_tag: Gesture tag for the turn, or None.
        gaze_bucket: Gaze bucket for the turn, or None.
        slo_target: Latency budget forwarded to the efficiency scorer.

    Returns:
        A flat dict with the keys ``groundedness``, ``hallucination_rate``,
        ``no_evidence``, ``t_total_s``, ``slo_target_s``, ``slo_passed``,
        ``slo_margin_s``, ``multimodal_alignment``, ``affect_alignment``,
        ``gesture_alignment`` and ``gaze_alignment``.
    """
    faith = compute_faithfulness(response, chunks)
    eff = compute_efficiency(latency_log, slo_target)
    align = compute_multimodal_alignment(
        response, affect, gesture_tag, gaze_bucket, chunks
    )

    # Scorer-local key names are renamed here to the flat names the API exposes.
    return {
        "groundedness": faith["groundedness"],
        "hallucination_rate": faith["hallucination_rate"],
        "no_evidence": faith["no_evidence"],
        "t_total_s": eff["t_total"],
        "slo_target_s": eff["slo_target"],
        "slo_passed": eff["slo_passed"],
        "slo_margin_s": eff["margin_s"],
        "multimodal_alignment": align["overall_score"],
        "affect_alignment": align["affect_alignment"],
        "gesture_alignment": align["gesture_alignment"],
        "gaze_alignment": align["gaze_alignment"],
    }
backend/evals/efficiency.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Communication efficiency — SLO pass/fail on response latency.
2
+ from __future__ import annotations
3
+
4
+
5
def compute_efficiency(latency_log: dict, slo_target: float = 6.0) -> dict:
    """Check whether the total response time meets the SLO target.

    Args:
        latency_log: Timing dict; only its ``"t_total"`` entry is read.
        slo_target: Latency budget. A turn passes only when ``t_total`` is
            strictly below this value.

    Returns:
        Dict with ``t_total`` and ``margin_s`` (both rounded to 3 decimals),
        the ``slo_target`` as given, and the boolean ``slo_passed``.
    """
    # `or 0.0` covers a missing key *and* an explicit None value; the latter
    # would otherwise reach round() and raise TypeError.
    t_total = float((latency_log or {}).get("t_total") or 0.0)
    return {
        "t_total": round(t_total, 3),
        "slo_target": slo_target,
        "slo_passed": t_total < slo_target,
        "margin_s": round(slo_target - t_total, 3),
    }
backend/evals/faithfulness.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NLI-based faithfulness scoring.
2
+ from __future__ import annotations
3
+
4
+
5
def compute_faithfulness(response: str, chunks: list[dict]) -> dict:
    """Return faithfulness scores for a response (currently a stub).

    No NLI model is run yet: ``groundedness`` and ``hallucination_rate`` are
    always 0.0 and ``response`` is not inspected. Only ``no_evidence`` carries
    information.

    Args:
        response: Generated response text (unused by the stub).
        chunks: Evidence chunks retrieved for the turn.

    Returns:
        Dict with ``groundedness``, ``hallucination_rate`` and ``no_evidence``
        (True when ``chunks`` is empty or None).
    """
    return {
        "groundedness": 0.0,
        "hallucination_rate": 0.0,
        # Truthiness check instead of len(): also tolerates chunks=None.
        "no_evidence": not chunks,
    }
backend/evals/multimodal_alignment.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multimodal alignment scoring.
2
+ from __future__ import annotations
3
+
4
+
5
def compute_multimodal_alignment(
    response: str,
    affect: str | None,
    gesture_tag: str | None,
    gaze_bucket: str | None,
    chunks: list[dict],
) -> dict:
    """Score alignment between non-verbal inputs and generated text (stub).

    None of the arguments are inspected yet; every score is a fixed 0.0
    placeholder, so callers should not read these values as real measurements.

    Returns:
        Dict with ``overall_score``, ``affect_alignment``,
        ``gesture_alignment`` and ``gaze_alignment``.
    """
    return {
        "overall_score": 0.0,
        "affect_alignment": 0.0,
        "gesture_alignment": 0.0,
        "gaze_alignment": 0.0,
    }
backend/generation/llm_client.py CHANGED
@@ -97,11 +97,19 @@ def chat_complete(
97
  **kwargs,
98
  )
99
  raw = (resp.choices[0].message.content if resp.choices else "") or ""
 
 
 
100
 
101
  if settings.thinking_mode in ("off", "strip"):
102
  raw = _strip_think_tags(raw)
103
 
104
- return raw.strip()
 
 
 
 
 
105
 
106
 
107
  def warmup(tier: str | None = None) -> None:
 
97
  **kwargs,
98
  )
99
  raw = (resp.choices[0].message.content if resp.choices else "") or ""
100
+ print(
101
+ f"[llm_client] tier={resolved_tier} model={model} raw_len={len(raw)} raw={raw[:200]!r}"
102
+ )
103
 
104
  if settings.thinking_mode in ("off", "strip"):
105
  raw = _strip_think_tags(raw)
106
 
107
+ stripped = raw.strip()
108
+ if not stripped:
109
+ print(
110
+ f"[llm_client] WARNING: empty response after strip. finish_reason={resp.choices[0].finish_reason if resp.choices else 'none'}"
111
+ )
112
+ return stripped
113
 
114
 
115
  def warmup(tier: str | None = None) -> None:
backend/pipeline/nodes/planner.py CHANGED
@@ -4,7 +4,7 @@ from __future__ import annotations
4
  import time
5
 
6
  from backend.config.settings import settings
7
- from backend.generation.llm_client import chat_complete
8
  from backend.guardrails.checks import check_output
9
  from backend.pipeline.state import PipelineState
10
  from backend.sensing.gesture import GESTURE_TO_TAG
@@ -94,11 +94,16 @@ def _run(state: PipelineState, tier: str) -> dict:
94
  4,
95
  )
96
 
 
 
 
 
97
  return {
98
  "augmented_prompt": prompt,
99
  "candidates": candidates,
100
  "selected_response": selected,
101
- "llm_tier_used": tier,
 
102
  "latency_log": latency_log,
103
  "guardrail_passed": guard["passed"],
104
  }
 
4
  import time
5
 
6
  from backend.config.settings import settings
7
+ from backend.generation.llm_client import active_model, chat_complete
8
  from backend.guardrails.checks import check_output
9
  from backend.pipeline.state import PipelineState
10
  from backend.sensing.gesture import GESTURE_TO_TAG
 
94
  4,
95
  )
96
 
97
+ # Mirror chat_complete's tier collapsing so the reported model matches what ran.
98
+ actual_tier = "local" if settings.active_llm_tier == "local" else tier
99
+ actual_model = active_model(actual_tier)
100
+
101
  return {
102
  "augmented_prompt": prompt,
103
  "candidates": candidates,
104
  "selected_response": selected,
105
+ "llm_tier_used": actual_tier,
106
+ "llm_model_used": actual_model,
107
  "latency_log": latency_log,
108
  "guardrail_passed": guard["passed"],
109
  }
backend/pipeline/state.py CHANGED
@@ -90,6 +90,7 @@ class PipelineState(TypedDict):
90
  candidates: list[str] # 2-3 candidate responses
91
  selected_response: str | None
92
  llm_tier_used: str # "primary" | "fallback" | "local"
 
93
 
94
  # ── L5: Feedback / tracking ───────────────────────────────────────────────
95
  latency_log: LatencyLog | None
 
90
  candidates: list[str] # 2-3 candidate responses
91
  selected_response: str | None
92
  llm_tier_used: str # "primary" | "fallback" | "local"
93
+ llm_model_used: str # actual model name (e.g. "gemma4:31b-cloud")
94
 
95
  # ── L5: Feedback / tracking ───────────────────────────────────────────────
96
  latency_log: LatencyLog | None
frontend/src/App.css CHANGED
@@ -124,14 +124,15 @@ select:focus, input[type="text"]:focus {
124
  font-weight: 500;
125
  }
126
 
127
- /* ── Latency metrics ──────────────────────────────────────────────── */
128
 
129
- .latency-metrics h3 {
130
  font-size: 12px;
131
  color: #888;
132
  text-transform: uppercase;
133
  letter-spacing: 0.5px;
134
  margin-bottom: 6px;
 
135
  }
136
 
137
  .metric-row {
@@ -139,6 +140,13 @@ select:focus, input[type="text"]:focus {
139
  justify-content: space-between;
140
  font-size: 13px;
141
  padding: 2px 0;
 
 
 
 
 
 
 
142
  }
143
 
144
  .metric-label {
@@ -150,6 +158,14 @@ select:focus, input[type="text"]:focus {
150
  font-family: monospace;
151
  }
152
 
 
 
 
 
 
 
 
 
153
  .no-metrics {
154
  color: #555;
155
  font-size: 13px;
@@ -260,3 +276,109 @@ select:focus, input[type="text"]:focus {
260
  color: #e55;
261
  font-size: 13px;
262
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  font-weight: 500;
125
  }
126
 
127
+ /* ── Shared metric primitives (latency, eval) ─────────────────────── */
128
 
129
+ .section-title {
130
  font-size: 12px;
131
  color: #888;
132
  text-transform: uppercase;
133
  letter-spacing: 0.5px;
134
  margin-bottom: 6px;
135
+ font-weight: 600;
136
  }
137
 
138
  .metric-row {
 
140
  justify-content: space-between;
141
  font-size: 13px;
142
  padding: 2px 0;
143
+ color: #aaa;
144
+ }
145
+
146
+ .metric-row.sub {
147
+ padding-left: 12px;
148
+ font-size: 11px;
149
+ color: #777;
150
  }
151
 
152
  .metric-label {
 
158
  font-family: monospace;
159
  }
160
 
161
+ .metric-value.pass {
162
+ color: #4caf50;
163
+ }
164
+
165
+ .metric-value.fail {
166
+ color: #f44336;
167
+ }
168
+
169
  .no-metrics {
170
  color: #555;
171
  font-size: 13px;
 
276
  color: #e55;
277
  font-size: 13px;
278
  }
279
+
280
+ /* ── Eval panel ──────────────────────────────────────────────────── */
281
+
282
+ .eval-panel {
283
+ margin-top: 8px;
284
+ border-top: 1px solid #3a3d47;
285
+ padding-top: 6px;
286
+ }
287
+
288
+ .eval-toggle {
289
+ background: none;
290
+ border: none;
291
+ color: #888;
292
+ font-size: 12px;
293
+ cursor: pointer;
294
+ padding: 2px 0;
295
+ display: flex;
296
+ align-items: center;
297
+ gap: 8px;
298
+ }
299
+
300
+ .eval-toggle:hover {
301
+ color: #bbb;
302
+ }
303
+
304
+ .slo-badge {
305
+ font-size: 10px;
306
+ padding: 1px 6px;
307
+ border-radius: 3px;
308
+ font-weight: 600;
309
+ }
310
+
311
+ .slo-badge.pass {
312
+ background: #1b3a1b;
313
+ color: #4caf50;
314
+ }
315
+
316
+ .slo-badge.fail {
317
+ background: #3a1b1b;
318
+ color: #f44336;
319
+ }
320
+
321
+ .eval-details {
322
+ display: flex;
323
+ flex-direction: column;
324
+ gap: 10px;
325
+ margin-top: 8px;
326
+ }
327
+
328
+ .eval-section {
329
+ display: flex;
330
+ flex-direction: column;
331
+ gap: 4px;
332
+ }
333
+
334
+ .eval-na {
335
+ font-size: 11px;
336
+ color: #666;
337
+ font-style: italic;
338
+ }
339
+
340
+ .score-bar {
341
+ height: 4px;
342
+ background: #2a2d37;
343
+ border-radius: 2px;
344
+ overflow: hidden;
345
+ }
346
+
347
+ .score-bar-fill {
348
+ height: 100%;
349
+ border-radius: 2px;
350
+ transition: width 0.3s ease;
351
+ }
352
+
353
+ /* ── Star rating ─────────────────────────────────────────────────── */
354
+
355
+ .star-rating {
356
+ display: flex;
357
+ align-items: center;
358
+ gap: 2px;
359
+ }
360
+
361
+ .star-rating .star {
362
+ background: none;
363
+ border: none;
364
+ font-size: 18px;
365
+ cursor: pointer;
366
+ color: #3a3d47;
367
+ padding: 0;
368
+ line-height: 1;
369
+ transition: color 0.15s;
370
+ }
371
+
372
+ .star-rating .star.active {
373
+ color: #ff9800;
374
+ }
375
+
376
+ .star-rating .star:hover {
377
+ color: #ffb74d;
378
+ }
379
+
380
+ .star-label {
381
+ font-size: 11px;
382
+ color: #888;
383
+ margin-left: 6px;
384
+ }
frontend/src/components/ChatPanel.tsx CHANGED
@@ -1,6 +1,7 @@
1
  import { useState, useRef, useEffect } from "react";
2
  import type { ChatMessage, SensingState, Affect, LatencyLog } from "../types";
3
  import { sendChat } from "../lib/api";
 
4
 
5
  interface Props {
6
  userId: string | null;
@@ -59,6 +60,7 @@ export function ChatPanel({
59
  content: res.response,
60
  latency: res.latency,
61
  affect: res.affect,
 
62
  },
63
  ]);
64
  onLatency(res.latency);
@@ -88,6 +90,9 @@ export function ChatPanel({
88
  {msg.role === "partner" ? "Partner" : "AAC User"}
89
  </span>
90
  <p>{msg.content}</p>
 
 
 
91
  </div>
92
  ))}
93
  {loading && (
 
1
  import { useState, useRef, useEffect } from "react";
2
  import type { ChatMessage, SensingState, Affect, LatencyLog } from "../types";
3
  import { sendChat } from "../lib/api";
4
+ import { EvalPanel } from "./EvalPanel";
5
 
6
  interface Props {
7
  userId: string | null;
 
60
  content: res.response,
61
  latency: res.latency,
62
  affect: res.affect,
63
+ evalScores: res.eval_scores,
64
  },
65
  ]);
66
  onLatency(res.latency);
 
90
  {msg.role === "partner" ? "Partner" : "AAC User"}
91
  </span>
92
  <p>{msg.content}</p>
93
+ {msg.role === "aac_user" && msg.evalScores && (
94
+ <EvalPanel evalScores={msg.evalScores} />
95
+ )}
96
  </div>
97
  ))}
98
  {loading && (
frontend/src/components/EvalPanel.tsx ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import type { EvalScores } from "../types";
3
+
4
+ interface Props {
5
+ evalScores: EvalScores;
6
+ }
7
+
8
/** Thin horizontal bar that renders a 0–1 score as a 0–100% fill. */
function ScoreBar({ value }: { value: number }) {
  // Clamp to [0, 100]: a negative or NaN score would otherwise produce an
  // invalid CSS width.
  const pct = Number.isNaN(value) ? 0 : Math.min(Math.max(value * 100, 0), 100);
  // Green above 70%, amber above 40%, red otherwise.
  const color = pct > 70 ? "#4caf50" : pct > 40 ? "#ff9800" : "#f44336";
  return (
    <div className="score-bar">
      <div className="score-bar-fill" style={{ width: `${pct}%`, background: color }} />
    </div>
  );
}
17
+
18
/**
 * Five-star rating input.
 *
 * `value` is the committed rating (null when unrated); `onChange` receives the
 * clicked star (1–5). While the pointer is over a star, the hover position
 * takes precedence over `value` for highlighting.
 */
function StarRating({
  value,
  onChange,
}: {
  value: number | null;
  onChange: (v: number) => void;
}) {
  // 0 means "not hovering", so the committed value decides the highlight.
  const [hover, setHover] = useState(0);
  const highlighted = hover || (value ?? 0);

  return (
    <div className="star-rating">
      {[1, 2, 3, 4, 5].map((star) => (
        <button
          key={star}
          type="button"
          aria-label={`Rate ${star} out of 5`}
          className={`star ${star <= highlighted ? "active" : ""}`}
          onMouseEnter={() => setHover(star)}
          onMouseLeave={() => setHover(0)}
          onClick={() => onChange(star)}
        >
          {/* NOTE(review): the glyph was missing from the button body; ★ assumed — confirm. */}
          ★
        </button>
      ))}
      {value !== null && <span className="star-label">{value}/5</span>}
    </div>
  );
}
43
+
44
/**
 * Collapsible per-message panel showing the eval metrics of one response.
 *
 * The header always shows the SLO pass/fail badge (and the star rating once
 * given); the detail sections are rendered only while expanded. The star
 * rating is held in local component state only.
 */
export function EvalPanel({ evalScores }: Props) {
  const [expanded, setExpanded] = useState(false);
  const [likert, setLikert] = useState<number | null>(null);

  // Formats a 0–1 score as a whole-number percentage, e.g. 0.42 -> "42%".
  const asPercent = (score: number) => `${(score * 100).toFixed(0)}%`;

  return (
    <div className="eval-panel">
      <button
        type="button"
        className="eval-toggle"
        aria-expanded={expanded}
        onClick={() => setExpanded((open) => !open)}
      >
        {expanded ? "▾" : "▸"} Eval Metrics
        {evalScores.slo_passed ? (
          <span className="slo-badge pass">SLO ✓</span>
        ) : (
          <span className="slo-badge fail">SLO ✗</span>
        )}
        {likert !== null && (
          <span className="slo-badge">{likert}/5 ★</span>
        )}
      </button>

      {expanded && (
        <div className="eval-details">
          <div className="eval-section">
            <div className="section-title">Factual Faithfulness</div>
            {evalScores.no_evidence ? (
              <div className="eval-na">N/A — no evidence retrieved</div>
            ) : (
              <>
                <div className="metric-row">
                  <span>Groundedness</span>
                  <span className="metric-value">{asPercent(evalScores.groundedness)}</span>
                </div>
                <ScoreBar value={evalScores.groundedness} />
                <div className="metric-row">
                  <span>Hallucination Rate</span>
                  {/* Above 20% is flagged as failing. */}
                  <span className={`metric-value ${evalScores.hallucination_rate > 0.2 ? "fail" : "pass"}`}>
                    {asPercent(evalScores.hallucination_rate)}
                  </span>
                </div>
              </>
            )}
          </div>

          <div className="eval-section">
            <div className="section-title">Communication Efficiency</div>
            <div className="metric-row">
              <span>Response Time</span>
              <span className={`metric-value ${evalScores.slo_passed ? "pass" : "fail"}`}>
                {evalScores.t_total_s.toFixed(2)}s
                {evalScores.slo_passed ? " ✓" : " ✗"}
              </span>
            </div>
            <div className="metric-row sub">
              <span>SLO Target</span>
              <span className="metric-value">
                &lt; {evalScores.slo_target_s.toFixed(1)}s (margin: {evalScores.slo_margin_s.toFixed(2)}s)
              </span>
            </div>
          </div>

          <div className="eval-section">
            <div className="section-title">Multimodal Alignment</div>
            <div className="metric-row">
              <span>Overall</span>
              <span className="metric-value">{asPercent(evalScores.multimodal_alignment)}</span>
            </div>
            <ScoreBar value={evalScores.multimodal_alignment} />
            <div className="metric-row sub">
              <span>Affect</span>
              <span className="metric-value">{asPercent(evalScores.affect_alignment)}</span>
            </div>
            <div className="metric-row sub">
              <span>Gesture</span>
              <span className="metric-value">{asPercent(evalScores.gesture_alignment)}</span>
            </div>
            <div className="metric-row sub">
              <span>Gaze</span>
              <span className="metric-value">{asPercent(evalScores.gaze_alignment)}</span>
            </div>
          </div>

          <div className="eval-section">
            <div className="section-title">Perceived Authenticity</div>
            <div className="metric-row">
              <span>Rate this response</span>
            </div>
            <StarRating value={likert} onChange={setLikert} />
          </div>
        </div>
      )}
    </div>
  );
}
frontend/src/components/LatencyMetrics.tsx CHANGED
@@ -16,8 +16,8 @@ export function LatencyMetrics({ latency }: Props) {
16
  if (!latency) return <p className="no-metrics">No turn yet</p>;
17
 
18
  return (
19
- <div className="latency-metrics">
20
- <h3>Latency</h3>
21
  {FIELDS.map(({ key, label }) => (
22
  <div key={key} className="metric-row">
23
  <span className="metric-label">{label}</span>
 
16
  if (!latency) return <p className="no-metrics">No turn yet</p>;
17
 
18
  return (
19
+ <div>
20
+ <div className="section-title">Latency</div>
21
  {FIELDS.map(({ key, label }) => (
22
  <div key={key} className="metric-row">
23
  <span className="metric-label">{label}</span>
frontend/src/types.ts CHANGED
@@ -33,6 +33,20 @@ export interface LatencyLog {
33
  t_total: number;
34
  }
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  export interface ChatResponse {
37
  user_id: string;
38
  query: string;
@@ -42,6 +56,7 @@ export interface ChatResponse {
42
  retrieval_mode: string;
43
  latency: LatencyLog;
44
  guardrail_passed: boolean;
 
45
  }
46
 
47
  export interface ChatMessage {
@@ -49,4 +64,5 @@ export interface ChatMessage {
49
  content: string;
50
  latency?: LatencyLog;
51
  affect?: string;
 
52
  }
 
33
  t_total: number;
34
  }
35
 
36
+ export interface EvalScores {
37
+ groundedness: number;
38
+ hallucination_rate: number;
39
+ no_evidence: boolean;
40
+ t_total_s: number;
41
+ slo_target_s: number;
42
+ slo_passed: boolean;
43
+ slo_margin_s: number;
44
+ multimodal_alignment: number;
45
+ affect_alignment: number;
46
+ gesture_alignment: number;
47
+ gaze_alignment: number;
48
+ }
49
+
50
  export interface ChatResponse {
51
  user_id: string;
52
  query: string;
 
56
  retrieval_mode: string;
57
  latency: LatencyLog;
58
  guardrail_passed: boolean;
59
+ eval_scores: EvalScores | null;
60
  }
61
 
62
  export interface ChatMessage {
 
64
  content: string;
65
  latency?: LatencyLog;
66
  affect?: string;
67
+ evalScores?: EvalScores | null;
68
  }
run.sh CHANGED
@@ -3,6 +3,16 @@ set -euo pipefail
3
 
4
  export PYTHONWARNINGS="ignore::UserWarning:multiprocessing.resource_tracker"
5
 
 
 
 
 
 
 
 
 
 
 
6
  PIDS=()
7
 
8
  cleanup() {
@@ -22,7 +32,7 @@ trap cleanup INT TERM
22
 
23
  # Use Node 22 if available (Vite 8 requires Node 20.19+ or 22.12+)
24
  if [ -x /opt/homebrew/opt/node@22/bin/node ]; then
25
- export PATH="/opt/homebrew/opt/node@22/bin:$PATH"
26
  fi
27
 
28
  # Start Ollama if not already running
 
3
 
4
  export PYTHONWARNINGS="ignore::UserWarning:multiprocessing.resource_tracker"
5
 
6
+ CONDA_ENV="aac-chatbot"
7
+
8
+ # Activate conda env
9
+ if ! command -v conda >/dev/null 2>&1; then
10
+ echo "ERROR: conda not found. Run setup.sh first." >&2
11
+ exit 1
12
+ fi
13
+ eval "$(conda shell.bash hook)"
14
+ conda activate "$CONDA_ENV"
15
+
16
  PIDS=()
17
 
18
  cleanup() {
 
32
 
33
  # Use Node 22 if available (Vite 8 requires Node 20.19+ or 22.12+)
34
  if [ -x /opt/homebrew/opt/node@22/bin/node ]; then
35
+ export PATH="/opt/homebrew/opt/node@22/bin:/opt/homebrew/bin:$PATH"
36
  fi
37
 
38
  # Start Ollama if not already running