Spaces:

ub-aac-chatbot
/

aac-chatbot

Sleeping

File size: 10,166 Bytes

import { memo, useState } from "react";
import { submitRating } from "../lib/api";
import type { EvalScores } from "../types";

/** Props accepted by the evaluation panel component in this file. */
interface Props {
  /** Run identifier; forwarded as `run_id` when a star rating is submitted. */
  runId: string;
  /** User identifier; forwarded as `user_id` when a star rating is submitted. */
  userId: string;
  /**
   * Total latency for the turn, used when `evalScores` carries no `t_total_s`.
   * Rendered with an "s" suffix — presumably seconds; confirm against the caller.
   */
  latencyTotal: number;
  /** Latency target the turn is compared against; the component falls back to 6.0 when omitted. */
  sloTarget?: number;
  /** Per-turn evaluation scores; when null or undefined, no score pills are rendered. */
  evalScores?: EvalScores | null;
}

/**
 * Picks the CSS modifier for a score pill.
 *
 * @param score - Score to grade (the thresholds assume a 0..1 scale).
 * @returns `"good"` at 0.75 and above, `"mid"` at 0.4 and above, otherwise `"bad"`.
 */
function gradeClass(score: number): string {
  return score >= 0.75 ? "good" : score >= 0.4 ? "mid" : "bad";
}

/**
 * Formats a fractional score as a whole-number percentage string.
 *
 * @param score - Fraction to format (e.g. 0.5).
 * @returns The score multiplied by 100, rounded to zero decimals, followed by "%".
 */
function fmt(score: number): string {
  const percent = (score * 100).toFixed(0);
  return `${percent}%`;
}

/**
 * Assembles tooltip text from a header line and optional body paragraphs.
 *
 * The header is always `title — question`. A non-empty `fallback` becomes the
 * only body paragraph; otherwise the non-empty `how` and `thisTurn` entries
 * follow, in that order. Paragraphs are separated by a blank line.
 *
 * @param parts - Pieces of the tooltip.
 * @returns The assembled tooltip text.
 */
function buildTip(parts: {
  title: string;
  question: string;
  how?: string;
  thisTurn?: string;
  fallback?: string;
}): string {
  const { title, question, how, thisTurn, fallback } = parts;
  const paragraphs: string[] = [`${title} — ${question}`];
  if (fallback) {
    // The fallback replaces the how/thisTurn explanation entirely.
    paragraphs.push(fallback);
  } else {
    for (const section of [how, thisTurn]) {
      if (section) paragraphs.push(section);
    }
  }
  return paragraphs.join("\n\n");
}

/**
 * Builds the tooltip text for the "grounded" pill.
 *
 * When `no_evidence` is set the tooltip only states that the turn was not
 * scored; otherwise it combines a method description with this turn's
 * sentence counts, groundedness and hallucination rate.
 *
 * @param s - Evaluation scores for the turn.
 * @returns Tooltip text assembled by `buildTip`.
 */
function groundednessTip(s: EvalScores): string {
  if (s.no_evidence) {
    return buildTip({
      title: "GROUNDED",
      question: "Did the response stick to the retrieved memories?",
      fallback:
        "Not scored: no memories were retrieved this turn (e.g. a 'how are you feeling?' question that skips retrieval).",
    });
  }

  // Missing counters fall back to 0, a missing threshold to 0.5.
  const sentenceCount = s.sentences_total ?? 0;
  const groundedCount = s.sentences_grounded ?? 0;
  const threshold = (s.nli_threshold ?? 0.5).toFixed(2);
  const unsupported = sentenceCount - groundedCount;

  const how = [
    "How: each sentence in the response is checked against each retrieved chunk with an NLI model. ",
    `A sentence counts as grounded if at least one chunk entails it with probability ≥ ${threshold}.`,
  ].join("");
  const thisTurn = [
    `This turn: ${groundedCount}/${sentenceCount} sentences grounded → ${fmt(s.groundedness)}. `,
    `Hallucination = ${fmt(s.hallucination_rate)} (${unsupported} unsupported).`,
  ].join("");

  return buildTip({
    title: "GROUNDED",
    question: "Did the response stick to the retrieved memories, or hallucinate?",
    how,
    thisTurn,
  });
}

/**
 * Builds the tooltip text for the "relevant" pill.
 *
 * @param s - Evaluation scores for the turn; a missing `relevance` is shown as 0.
 * @returns Tooltip text assembled by `buildTip`.
 */
function relevanceTip(s: EvalScores): string {
  const similarity = s.relevance ?? 0;
  const raw = similarity.toFixed(3);
  const percent = fmt(similarity);
  return buildTip({
    title: "RELEVANT",
    question: "Did the response actually address the partner's question?",
    how: [
      "How: cosine similarity between the BGE embedding of the query and the embedding of the response. ",
      "Higher = more semantically on-topic.",
    ].join(""),
    thisTurn: `This turn: ${raw} → ${percent}.`,
  });
}

/**
 * Builds the tooltip text for the "affect" pill.
 *
 * Without `explain.affect` details only the header line is produced.
 *
 * @param s - Evaluation scores for the turn.
 * @returns Tooltip text assembled by `buildTip`.
 */
function affectTip(s: EvalScores): string {
  const heading = {
    title: "AFFECT",
    question: "Does the response tone match the detected facial expression?",
  };
  const detail = s.explain?.affect;
  if (detail) {
    // Only the positive-word count is pluralised; the negative count is shown bare.
    const plural = detail.pos_words === 1 ? "" : "s";
    const sentiment = detail.sentiment.toFixed(2);
    return buildTip({
      ...heading,
      how: "How: response sentiment is computed from positive vs negative word counts, then compared to the affect target.",
      thisTurn: [
        `This turn: detected ${detail.target}, response sentiment = ${sentiment} `,
        `(${detail.pos_words} positive word${plural}, `,
        `${detail.neg_words} negative) → ${fmt(s.affect_alignment)}.`,
      ].join(""),
    });
  }
  return buildTip(heading);
}

/**
 * Builds the tooltip text for the "gesture" pill.
 *
 * Three cases: no gesture details, a gesture without an opener pattern, and a
 * gesture whose opener pattern either matched or did not.
 *
 * @param s - Evaluation scores for the turn.
 * @returns Tooltip text assembled by `buildTip`.
 */
function gestureTip(s: EvalScores): string {
  const heading = {
    title: "GESTURE",
    question: "Does the response opener acknowledge the detected hand gesture?",
  };
  const detail = s.explain?.gesture;

  if (!detail) {
    return buildTip({ ...heading, fallback: "No gesture detected this turn — defaults to 0." });
  }
  if (!detail.has_pattern) {
    const fallback = `Detected ${detail.tag}, but this gesture has no opener pattern to test — partial credit (50%).`;
    return buildTip({ ...heading, fallback });
  }

  const outcome = detail.matched
    ? { verb: "matched", percent: "100%" }
    : { verb: "did not match", percent: "0%" };
  return buildTip({
    ...heading,
    how: "How: regex check on the first words of the response (e.g. THUMBS_UP expects 'yes/sure/absolutely…').",
    thisTurn: `This turn: detected ${detail.tag}, opener ${outcome.verb} → ${outcome.percent}.`,
  });
}

/**
 * Builds the tooltip text for the "gaze" pill.
 *
 * Three cases: no gaze details, a gaze bucket with zero retrieved chunks, and
 * the regular case reporting matched versus total chunks.
 *
 * @param s - Evaluation scores for the turn.
 * @returns Tooltip text assembled by `buildTip`.
 */
function gazeTip(s: EvalScores): string {
  const heading = {
    title: "GAZE",
    question: "Did the retrieved memories come from the topic the user was looking at?",
  };
  const detail = s.explain?.gaze;

  if (!detail) {
    return buildTip({ ...heading, fallback: "No gaze bucket detected this turn — defaults to 0." });
  }

  const { bucket, matched_chunks: matched, total_chunks: total } = detail;
  if (total === 0) {
    const fallback = `User was looking at: ${bucket}. No chunks retrieved this turn — defaults to 0.`;
    return buildTip({ ...heading, fallback });
  }

  return buildTip({
    ...heading,
    how: "How: fraction of retrieved chunks whose 'bucket' label matches the gaze target.",
    thisTurn: `This turn: user looking at ${bucket}, ${matched}/${total} retrieved chunks matched → ${fmt(s.gaze_alignment)}.`,
  });
}

/**
 * Builds the tooltip text for the "diversity" pill.
 *
 * With fewer than two candidates the tooltip only says the metric is not
 * meaningful; otherwise it reports the candidate count and the diversity value.
 *
 * @param s - Evaluation scores for the turn; missing values are treated as 0.
 * @returns Tooltip text assembled by `buildTip`.
 */
function diversityTip(s: EvalScores): string {
  const heading = {
    title: "DIVERSITY",
    question: "How different are the candidate responses the picker showed?",
  };
  const candidateCount = s.n_candidates ?? 0;
  const distance = s.candidate_diversity ?? 0;

  if (candidateCount < 2) {
    return buildTip({
      ...heading,
      fallback: `Only ${candidateCount} candidate this turn — not meaningful.`,
    });
  }

  const how = [
    "How: average pairwise cosine distance between BGE embeddings of the candidate texts. ",
    "High = varied alternatives. Low = three paraphrases of the same answer (the 'aloha' problem).",
  ].join("");
  const thisTurn = `This turn: ${candidateCount} candidates, mean pairwise distance = ${distance.toFixed(3)} → ${fmt(distance)}.`;
  return buildTip({ ...heading, how, thisTurn });
}

/**
 * Builds the tooltip text for the latency badge.
 *
 * Each value is read from `s` when present and otherwise taken from the
 * corresponding fallback argument. The margin is optional and is only
 * mentioned when the scores carry one.
 *
 * @param s - Evaluation scores for the turn, if any.
 * @param fallbackLatency - Latency used when `s` has no `t_total_s`.
 * @param fallbackTarget - Target used when `s` has no `slo_target_s`.
 * @param fallbackPassed - Verdict used when `s` has no `slo_passed`.
 * @returns Tooltip text assembled by `buildTip`.
 */
function sloTip(
  s: EvalScores | null | undefined,
  fallbackLatency: number,
  fallbackTarget: number,
  fallbackPassed: boolean,
): string {
  const latency = s?.t_total_s ?? fallbackLatency;
  const target = s?.slo_target_s ?? fallbackTarget;
  const passed = s?.slo_passed ?? fallbackPassed;
  const margin = s?.slo_margin_s;

  // `!= null` (not `!== undefined`): a JSON `null` margin must be treated as
  // "absent" too, otherwise `margin.toFixed` throws while rendering.
  let marginNote = "";
  if (margin != null) {
    const sign = margin >= 0 ? "+" : "";
    marginNote = ` (${sign}${margin.toFixed(2)}s margin)`;
  }

  const verdict = passed ? "passed ✓" : "failed ✗";
  return buildTip({
    title: "LATENCY",
    question: "Did the response arrive within the SLO budget?",
    thisTurn:
      `Target: < ${target.toFixed(1)}s. ` +
      `This turn: ${latency.toFixed(2)}s${marginNote} — ${verdict}.`,
  });
}

/**
 * Renders one row of evaluation feedback for a turn: a latency badge, one pill
 * per evaluation score (only when `evalScores` is provided), and a five-star
 * rating control that submits through `submitRating`.
 *
 * A rating can be submitted once per mounted instance: after a successful
 * submit the stars are disabled and the chosen value is shown. A failed submit
 * is logged to the console and leaves the control usable.
 */
function EvalPanelImpl({
  runId,
  userId,
  latencyTotal,
  sloTarget = 6.0,
  evalScores,
}: Props) {
  // Submitted rating (null until a submit succeeds), hovered star (0 = none),
  // and whether a submit is currently in flight.
  const [value, setValue] = useState<number | null>(null);
  const [hover, setHover] = useState(0);
  const [submitting, setSubmitting] = useState(false);
  // Prefer the verdict carried by the scores; otherwise derive it from the raw latency.
  const sloPassed = evalScores
    ? evalScores.slo_passed
    : latencyTotal > 0 && latencyTotal < sloTarget;
  const effectiveLatency = evalScores?.t_total_s ?? latencyTotal;
  // Diversity needs at least two candidates; relevance is shown only when present.
  const showDiversity =
    evalScores && (evalScores.n_candidates ?? 0) >= 2;
  const showRelevance = evalScores && evalScores.relevance !== undefined;

  /** Submits the star rating once; ignored while submitting or after a rating exists. */
  async function rate(stars: number): Promise<void> {
    if (submitting || value !== null) return;
    setSubmitting(true);
    try {
      await submitRating({
        run_id: runId,
        user_id: userId,
        authenticity: stars,
      });
      setValue(stars);
    } catch (e) {
      // Best-effort: a failed submit is logged and the stars stay clickable.
      console.error("rating submit failed", e);
    } finally {
      setSubmitting(false);
    }
  }

  return (
    <div className="eval-panel">
      <div className="eval-row">
        {effectiveLatency > 0 && (
          <span
            className="tip"
            data-tip={sloTip(evalScores, effectiveLatency, sloTarget, sloPassed)}
          >
            <span className={`slo-badge ${sloPassed ? "pass" : "fail"}`}>
              {effectiveLatency.toFixed(2)}s {sloPassed ? "✓" : "✗"}
            </span>
          </span>
        )}
        {evalScores && (
          <>
            <span className="tip" data-tip={groundednessTip(evalScores)}>
              <span
                className={`eval-pill ${
                  evalScores.no_evidence ? "muted" : gradeClass(evalScores.groundedness)
                }`}
              >
                grounded {evalScores.no_evidence ? "—" : fmt(evalScores.groundedness)}
              </span>
            </span>
            {showRelevance && (
              <span className="tip" data-tip={relevanceTip(evalScores)}>
                <span className={`eval-pill ${gradeClass(evalScores.relevance ?? 0)}`}>
                  relevant {fmt(evalScores.relevance ?? 0)}
                </span>
              </span>
            )}
            <span className="tip" data-tip={affectTip(evalScores)}>
              <span className={`eval-pill ${gradeClass(evalScores.affect_alignment)}`}>
                affect {fmt(evalScores.affect_alignment)}
              </span>
            </span>
            <span className="tip" data-tip={gestureTip(evalScores)}>
              <span className={`eval-pill ${gradeClass(evalScores.gesture_alignment)}`}>
                gesture {fmt(evalScores.gesture_alignment)}
              </span>
            </span>
            <span className="tip" data-tip={gazeTip(evalScores)}>
              <span className={`eval-pill ${gradeClass(evalScores.gaze_alignment)}`}>
                gaze {fmt(evalScores.gaze_alignment)}
              </span>
            </span>
            {showDiversity && (
              <span className="tip" data-tip={diversityTip(evalScores)}>
                <span className={`eval-pill ${gradeClass(evalScores.candidate_diversity ?? 0)}`}>
                  diversity {fmt(evalScores.candidate_diversity ?? 0)}
                </span>
              </span>
            )}
          </>
        )}
        <div className="tip star-rating" data-tip="Rate how authentic this response felt as the persona (1 = off, 5 = spot on). Logged to ratings.jsonl.">
          {[1, 2, 3, 4, 5].map((star) => (
            // type="button": without it a <button> defaults to "submit" and would
            // submit any enclosing form. aria-label: the "★" glyph alone gives
            // assistive technology no indication of which rating a star represents.
            <button
              key={star}
              type="button"
              aria-label={`Rate ${star} out of 5`}
              className={`star ${star <= (hover || (value ?? 0)) ? "active" : ""}`}
              onMouseEnter={() => setHover(star)}
              onMouseLeave={() => setHover(0)}
              onClick={() => {
                // rate() handles its own errors, so the promise is intentionally not awaited.
                void rate(star);
              }}
              disabled={value !== null || submitting}
            >
              ★
            </button>
          ))}
          {value !== null && <span className="star-label">{value}/5</span>}
        </div>
      </div>
    </div>
  );
}

/** Public export of the panel: `EvalPanelImpl` wrapped in React's `memo`. */
export const EvalPanel = memo(EvalPanelImpl);