// aac-chatbot/frontend/src/components/EvalPanel.tsx
// shwetangisingh's picture
// bug fixes
// 0e19ba2
import { memo, useState } from "react";
import { submitRating } from "../lib/api";
import type { EvalScores } from "../types";
/** Props accepted by the evaluation panel component. */
interface Props {
/** Run identifier; submitted as `run_id` together with a star rating. */
runId: string;
/** User identifier; submitted as `user_id` together with a star rating. */
userId: string;
/** Measured total latency (rendered with an "s" suffix); used when `evalScores` carries no `t_total_s`. */
latencyTotal: number;
/** Latency budget compared against `latencyTotal` when `evalScores` is absent; the component defaults it to 6.0. */
sloTarget?: number;
/** Automatic evaluation scores for this response; the metric pills render only when this is present. */
evalScores?: EvalScores | null;
}
/**
 * Picks the CSS modifier that colours a metric pill.
 *
 * @param score - Metric value; both thresholds are inclusive lower bounds.
 * @returns `"good"` for scores ≥ 0.75, `"mid"` for scores ≥ 0.4, `"bad"` otherwise
 *   (including `NaN`, which fails both comparisons).
 */
function gradeClass(score: number): string {
  return score >= 0.75 ? "good" : score >= 0.4 ? "mid" : "bad";
}
/**
 * Renders a fractional score as a whole-number percentage string.
 *
 * @param score - Fraction to display (e.g. `0.5`).
 * @returns The score multiplied by 100, rounded to zero decimals, followed by `%`.
 */
function fmt(score: number): string {
  const percent = (score * 100).toFixed(0);
  return `${percent}%`;
}
/**
 * Assembles tooltip text from a header line plus optional body paragraphs.
 *
 * The header is always `"<title> — <question>"`. A non-empty `fallback`
 * replaces the whole body; otherwise the non-empty `how` and `thisTurn`
 * paragraphs follow the header, each separated by a blank line.
 *
 * @param parts - Pieces of the tooltip.
 * @returns The assembled text; just the header when there is no body.
 */
function buildTip(parts: {
  title: string;
  question: string;
  how?: string;
  thisTurn?: string;
  fallback?: string;
}): string {
  const { title, question, how, thisTurn, fallback } = parts;
  const paragraphGap = "\n\n";
  const header = `${title} — ${question}`;

  if (fallback) {
    return header + paragraphGap + fallback;
  }

  // Drop paragraphs that are missing or empty so no stray blank lines appear.
  const body = [how, thisTurn].filter((paragraph) => Boolean(paragraph));
  if (body.length === 0) {
    return header;
  }
  return [header, ...body].join(paragraphGap);
}
/**
 * Builds the hover text for the "grounded" pill.
 *
 * When `s.no_evidence` is set the tooltip only explains that the metric was
 * not scored. Otherwise it describes the NLI check and reports this turn's
 * grounded/total sentence counts, groundedness and hallucination rate.
 *
 * @param s - Evaluation scores for the current turn.
 * @returns Tooltip text assembled by `buildTip`.
 */
function groundednessTip(s: EvalScores): string {
  const title = "GROUNDED";

  if (!s.no_evidence) {
    // Missing counts are reported as 0; a missing threshold is shown as 0.50.
    const sentenceCount = s.sentences_total ?? 0;
    const supportedCount = s.sentences_grounded ?? 0;
    const threshold = s.nli_threshold ?? 0.5;
    const unsupportedCount = sentenceCount - supportedCount;

    const how = [
      "How: each sentence in the response is checked against each retrieved chunk with an NLI model. ",
      `A sentence counts as grounded if at least one chunk entails it with probability ≥ ${threshold.toFixed(2)}.`,
    ].join("");
    const thisTurn = [
      `This turn: ${supportedCount}/${sentenceCount} sentences grounded → ${fmt(s.groundedness)}. `,
      `Hallucination = ${fmt(s.hallucination_rate)} (${unsupportedCount} unsupported).`,
    ].join("");

    return buildTip({
      title,
      question: "Did the response stick to the retrieved memories, or hallucinate?",
      how,
      thisTurn,
    });
  }

  return buildTip({
    title,
    question: "Did the response stick to the retrieved memories?",
    fallback:
      "Not scored: no memories were retrieved this turn (e.g. a 'how are you feeling?' question that skips retrieval).",
  });
}
/**
 * Builds the hover text for the "relevant" pill.
 *
 * @param s - Evaluation scores for the current turn; a missing `relevance`
 *   is reported as 0.
 * @returns Tooltip text showing the raw score (3 decimals) and its percentage.
 */
function relevanceTip(s: EvalScores): string {
  const score = s.relevance ?? 0;
  const how =
    "How: cosine similarity between the BGE embedding of the query and the embedding of the response. " +
    "Higher = more semantically on-topic.";
  const thisTurn = `This turn: ${score.toFixed(3)} → ${fmt(score)}.`;

  return buildTip({
    title: "RELEVANT",
    question: "Did the response actually address the partner's question?",
    how,
    thisTurn,
  });
}
/**
 * Builds the hover text for the "affect" pill.
 *
 * Without `s.explain.affect` only the header line is produced; with it the
 * tooltip adds the method description and this turn's sentiment breakdown.
 *
 * @param s - Evaluation scores for the current turn.
 * @returns Tooltip text assembled by `buildTip`.
 */
function affectTip(s: EvalScores): string {
  const title = "AFFECT";
  const question = "Does the response tone match the detected facial expression?";
  const detail = s.explain?.affect;

  if (detail) {
    // Only the positive-word count is pluralised in the message.
    const plural = detail.pos_words === 1 ? "" : "s";
    const how =
      "How: response sentiment is computed from positive vs negative word counts, " +
      "then compared to the affect target.";
    const thisTurn = [
      `This turn: detected ${detail.target}, response sentiment = ${detail.sentiment.toFixed(2)} `,
      `(${detail.pos_words} positive word${plural}, `,
      `${detail.neg_words} negative) → ${fmt(s.affect_alignment)}.`,
    ].join("");

    return buildTip({ title, question, how, thisTurn });
  }

  return buildTip({ title, question });
}
/**
 * Builds the hover text for the "gesture" pill.
 *
 * Three cases: no gesture explanation at all, a gesture without an opener
 * pattern, and a gesture whose opener pattern either matched or did not.
 *
 * @param s - Evaluation scores for the current turn.
 * @returns Tooltip text assembled by `buildTip`.
 */
function gestureTip(s: EvalScores): string {
  const base = {
    title: "GESTURE",
    question: "Does the response opener acknowledge the detected hand gesture?",
  };
  const detail = s.explain?.gesture;

  if (!detail) {
    return buildTip({ ...base, fallback: "No gesture detected this turn — defaults to 0." });
  }

  if (!detail.has_pattern) {
    const fallback = `Detected ${detail.tag}, but this gesture has no opener pattern to test — partial credit (50%).`;
    return buildTip({ ...base, fallback });
  }

  // The pattern check is all-or-nothing, so the percentage is fixed per outcome.
  const [outcome, percent] = detail.matched ? ["matched", "100%"] : ["did not match", "0%"];
  return buildTip({
    ...base,
    how: `How: regex check on the first words of the response (e.g. THUMBS_UP expects 'yes/sure/absolutely…').`,
    thisTurn: `This turn: detected ${detail.tag}, opener ${outcome} → ${percent}.`,
  });
}
/**
 * Builds the hover text for the "gaze" pill.
 *
 * Falls back to a short note when there is no gaze explanation or when no
 * chunks were retrieved; otherwise reports matched/total chunks for the
 * bucket the user was looking at.
 *
 * @param s - Evaluation scores for the current turn.
 * @returns Tooltip text assembled by `buildTip`.
 */
function gazeTip(s: EvalScores): string {
  const base = {
    title: "GAZE",
    question: "Did the retrieved memories come from the topic the user was looking at?",
  };
  const detail = s.explain?.gaze;

  if (!detail) {
    return buildTip({ ...base, fallback: "No gaze bucket detected this turn — defaults to 0." });
  }

  if (detail.total_chunks === 0) {
    const fallback = `User was looking at: ${detail.bucket}. No chunks retrieved this turn — defaults to 0.`;
    return buildTip({ ...base, fallback });
  }

  const ratio = `${detail.matched_chunks}/${detail.total_chunks}`;
  return buildTip({
    ...base,
    how: `How: fraction of retrieved chunks whose 'bucket' label matches the gaze target.`,
    thisTurn: `This turn: user looking at ${detail.bucket}, ${ratio} retrieved chunks matched → ${fmt(s.gaze_alignment)}.`,
  });
}
/**
 * Builds the hover text for the "diversity" pill.
 *
 * With fewer than two candidates the tooltip only notes that the metric is
 * not meaningful; otherwise it explains the method and reports the candidate
 * count and mean pairwise distance.
 *
 * @param s - Evaluation scores for the current turn; missing fields count as 0.
 * @returns Tooltip text assembled by `buildTip`.
 */
function diversityTip(s: EvalScores): string {
  const base = {
    title: "DIVERSITY",
    question: "How different are the candidate responses the picker showed?",
  };
  const candidateCount = s.n_candidates ?? 0;
  const distance = s.candidate_diversity ?? 0;

  if (candidateCount >= 2) {
    const how =
      "How: average pairwise cosine distance between BGE embeddings of the candidate texts. " +
      "High = varied alternatives. Low = three paraphrases of the same answer (the 'aloha' problem).";
    const thisTurn = `This turn: ${candidateCount} candidates, mean pairwise distance = ${distance.toFixed(3)} → ${fmt(distance)}.`;
    return buildTip({ ...base, how, thisTurn });
  }

  return buildTip({
    ...base,
    fallback: `Only ${candidateCount} candidate this turn — not meaningful.`,
  });
}
/**
 * Builds the hover text for the latency / SLO badge.
 *
 * Values carried by `s` take precedence; each fallback argument is used when
 * `s` is missing or does not provide the corresponding field. The margin is
 * included only when `s` supplies a numeric one.
 *
 * @param s - Evaluation scores for the turn, if any.
 * @param fallbackLatency - Latency to show when `s.t_total_s` is unavailable.
 * @param fallbackTarget - SLO target to show when `s.slo_target_s` is unavailable.
 * @param fallbackPassed - Pass/fail verdict to show when `s.slo_passed` is unavailable.
 * @returns Tooltip text assembled by `buildTip`.
 */
function sloTip(
  s: EvalScores | null | undefined,
  fallbackLatency: number,
  fallbackTarget: number,
  fallbackPassed: boolean,
): string {
  const latency = s?.t_total_s ?? fallbackLatency;
  const target = s?.slo_target_s ?? fallbackTarget;
  const passed = s?.slo_passed ?? fallbackPassed;
  const margin = s?.slo_margin_s;

  // Format the margin only when it really is a number. A JSON `null` would
  // pass an `!== undefined` check and then throw on `.toFixed` during render.
  let marginText = "";
  if (typeof margin === "number") {
    // Negative values already carry their own "-"; add "+" for the rest.
    const sign = margin >= 0 ? "+" : "";
    marginText = `${sign}${margin.toFixed(2)}s`;
  }
  const marginNote = marginText ? ` (${marginText} margin)` : "";

  return buildTip({
    title: "LATENCY",
    question: "Did the response arrive within the SLO budget?",
    thisTurn:
      `Target: < ${target.toFixed(1)}s. ` +
      `This turn: ${latency.toFixed(2)}s${marginNote} — ${passed ? "passed ✓" : "failed ✗"}.`,
  });
}
/**
 * Per-response evaluation strip.
 *
 * Renders, in one row:
 *  - a latency badge with pass/fail against the SLO (only when latency > 0),
 *  - one pill per automatic metric when `evalScores` is present, each wrapped
 *    in a `.tip` element whose `data-tip` carries the explanation text,
 *  - a 1–5 star rating that can be submitted once via `submitRating`.
 *
 * @param props - See `Props`; `sloTarget` defaults to 6.0.
 */
function EvalPanelImpl({
  runId,
  userId,
  latencyTotal,
  sloTarget = 6.0,
  evalScores,
}: Props) {
  // `value` is the submitted rating (null until a submit succeeds).
  const [value, setValue] = useState<number | null>(null);
  // `hover` is the star currently under the pointer; 0 means none.
  const [hover, setHover] = useState(0);
  const [submitting, setSubmitting] = useState(false);

  // Prefer the verdict reported in the scores; otherwise derive it locally.
  const sloPassed = evalScores
    ? evalScores.slo_passed
    : latencyTotal > 0 && latencyTotal < sloTarget;
  const effectiveLatency = evalScores?.t_total_s ?? latencyTotal;

  const showDiversity =
    evalScores && (evalScores.n_candidates ?? 0) >= 2;
  const showRelevance = evalScores && evalScores.relevance !== undefined;

  /**
   * Submits a star rating for this run. Ignored while a submit is in flight or
   * after one has succeeded. On failure the error is logged and the stars stay
   * enabled so the user can try again.
   */
  async function rate(stars: number) {
    if (submitting || value !== null) return;
    setSubmitting(true);
    try {
      await submitRating({
        run_id: runId,
        user_id: userId,
        authenticity: stars,
      });
      setValue(stars);
    } catch (e) {
      console.error("rating submit failed", e);
    } finally {
      setSubmitting(false);
    }
  }

  return (
    <div className="eval-panel">
      <div className="eval-row">
        {effectiveLatency > 0 && (
          <span
            className="tip"
            data-tip={sloTip(evalScores, effectiveLatency, sloTarget, sloPassed)}
          >
            <span className={`slo-badge ${sloPassed ? "pass" : "fail"}`}>
              {effectiveLatency.toFixed(2)}s {sloPassed ? "✓" : "✗"}
            </span>
          </span>
        )}
        {evalScores && (
          <>
            <span className="tip" data-tip={groundednessTip(evalScores)}>
              <span
                className={`eval-pill ${
                  evalScores.no_evidence ? "muted" : gradeClass(evalScores.groundedness)
                }`}
              >
                grounded {evalScores.no_evidence ? "—" : fmt(evalScores.groundedness)}
              </span>
            </span>
            {showRelevance && (
              <span className="tip" data-tip={relevanceTip(evalScores)}>
                <span className={`eval-pill ${gradeClass(evalScores.relevance ?? 0)}`}>
                  relevant {fmt(evalScores.relevance ?? 0)}
                </span>
              </span>
            )}
            <span className="tip" data-tip={affectTip(evalScores)}>
              <span className={`eval-pill ${gradeClass(evalScores.affect_alignment)}`}>
                affect {fmt(evalScores.affect_alignment)}
              </span>
            </span>
            <span className="tip" data-tip={gestureTip(evalScores)}>
              <span className={`eval-pill ${gradeClass(evalScores.gesture_alignment)}`}>
                gesture {fmt(evalScores.gesture_alignment)}
              </span>
            </span>
            <span className="tip" data-tip={gazeTip(evalScores)}>
              <span className={`eval-pill ${gradeClass(evalScores.gaze_alignment)}`}>
                gaze {fmt(evalScores.gaze_alignment)}
              </span>
            </span>
            {showDiversity && (
              <span className="tip" data-tip={diversityTip(evalScores)}>
                <span className={`eval-pill ${gradeClass(evalScores.candidate_diversity ?? 0)}`}>
                  diversity {fmt(evalScores.candidate_diversity ?? 0)}
                </span>
              </span>
            )}
          </>
        )}
        <div className="tip star-rating" data-tip="Rate how authentic this response felt as the persona (1 = off, 5 = spot on). Logged to ratings.jsonl.">
          {[1, 2, 3, 4, 5].map((star) => (
            <button
              key={star}
              // Explicit type: a bare <button> defaults to "submit" and would
              // submit any enclosing form when a star is clicked.
              type="button"
              // The visible content is only "★", so give each star a spoken name.
              aria-label={`Rate ${star} out of 5`}
              className={`star ${star <= (hover || (value ?? 0)) ? "active" : ""}`}
              onMouseEnter={() => setHover(star)}
              onMouseLeave={() => setHover(0)}
              onClick={() => rate(star)}
              disabled={value !== null || submitting}
            >
              ★
            </button>
          ))}
          {value !== null && <span className="star-label">{value}/5</span>}
        </div>
      </div>
    </div>
  );
}
/** Public component: `EvalPanelImpl` wrapped in React's `memo`. */
export const EvalPanel = memo(EvalPanelImpl);