# Evaluation metrics — compute after pipeline returns, before API response.
import torch
from backend.evals.diversity import compute_candidate_diversity
from backend.evals.efficiency import compute_efficiency
from backend.evals.faithfulness import compute_faithfulness, compute_faithfulness_batch
from backend.evals.multimodal_alignment import compute_multimodal_alignment
from backend.evals.relevance import compute_relevance
def _score_candidates_batched(
    candidates: list[dict],
    chunks: list[dict],
    query: str,
) -> tuple[list[dict], "torch.Tensor | None"]:
    """Score every candidate for faithfulness and relevance in one batch.

    Faithfulness comes from a single ``compute_faithfulness_batch`` call over
    all candidate texts. Relevance is the product of the query embedding with
    each candidate embedding, floored at 0.0 and rounded to four decimals; the
    query and all candidate texts are embedded together in one ``embed_texts``
    call.

    Args:
        candidates: Candidate dicts; the ``"text"`` key is read (default "").
        chunks: Retrieved chunks, forwarded to the faithfulness scorer.
        query: User query used for relevance.

    Returns:
        A ``(scores, cand_vecs)`` pair. ``scores`` has one dict per candidate:
        the faithfulness fields plus a ``"relevance"`` key. ``cand_vecs`` is
        the candidate portion of the embedding output, returned so the caller
        can reuse it for diversity. It is ``None`` when the embedding pass was
        skipped (blank query, or no candidate with non-blank text), in which
        case every relevance is 0.0.
    """
    texts = [cand.get("text", "") for cand in candidates]
    faithfulness_rows = compute_faithfulness_batch(texts, chunks)

    # Defaults describe the "no embedding pass" outcome; overridden below.
    cand_vecs: torch.Tensor | None = None
    relevances = [0.0] * len(texts)

    if query.strip() and any(text.strip() for text in texts):
        # Function-local import: only executed when an embedding pass runs.
        from backend.retrieval.vector_store import embed_texts

        # Row 0 is the query; the remaining rows line up with `texts`.
        embedded = embed_texts([query, *texts])
        query_vec, cand_vecs = embedded[0], embedded[1:]
        relevances = [
            round(max(0.0, float(query_vec @ cand_vecs[row])), 4)
            for row in range(len(texts))
        ]

    # strict=True surfaces a length mismatch between the two scorers loudly.
    scores = [
        {**faith_fields, "relevance": relevance}
        for faith_fields, relevance in zip(
            faithfulness_rows, relevances, strict=True
        )
    ]
    return scores, cand_vecs
def _diversity_from_vecs(cand_vecs: "torch.Tensor") -> dict:
n = cand_vecs.shape[0]
sims = cand_vecs @ cand_vecs.T
iu = torch.triu_indices(n, n, offset=1)
return {
"candidate_diversity": round(float(1.0 - sims[iu[0], iu[1]].mean().item()), 4),
"n_candidates": n,
}
def compute_evals(
    response: str,
    chunks: list[dict],
    latency_log: dict,
    affect: str | None,
    gesture_tag: str | None,
    gaze_bucket: str | None,
    slo_target: float = 6.0,
    query: str = "",
    candidates: list[dict] | None = None,
    selected_idx: int | None = None,
) -> dict:
    """Run every eval scorer and merge the results into one flat dict.

    Efficiency and multimodal alignment are always computed from the served
    ``response``. Faithfulness and relevance come from one of two paths:

    * With ``candidates``: all candidates are scored in a single batched call
      and reported under ``"candidates_eval"``. The selected candidate's
      scores are copied to the top level so consumers of the flat fields see
      the same keys as in the no-candidate path.
    * Without ``candidates``, or when ``selected_idx`` is out of range: the
      ``response`` itself is scored directly.

    Args:
        response: The text that was served.
        chunks: Retrieved chunks passed to the faithfulness/alignment scorers.
        latency_log: Timing data passed to the efficiency scorer.
        affect, gesture_tag, gaze_bucket: Inputs to the alignment scorer.
        slo_target: Latency target forwarded to the efficiency scorer.
        query: User query used for relevance.
        candidates: Optional candidate dicts (``"text"``, ``"strategy"``).
        selected_idx: Index of the served candidate; defaults to 0 when
            candidates are given.

    Returns:
        Dict with the faithfulness/relevance fields, SLO fields, alignment
        fields, ``"candidate_diversity"``, ``"n_candidates"`` and, when
        candidates were scored, ``"candidates_eval"``.
    """
    efficiency = compute_efficiency(latency_log, slo_target)
    alignment = compute_multimodal_alignment(
        response, affect, gesture_tag, gaze_bucket, chunks
    )

    candidate_rows: list[dict] = []
    cand_vecs = None
    if candidates:
        # Per the original note, the planner serves the first unique candidate
        # as the selected response, so a missing index means 0. Matching on
        # text was avoided because duplicate candidate texts would collide.
        if selected_idx is None:
            selected_idx = 0
        scored, cand_vecs = _score_candidates_batched(candidates, chunks, query)
        for position, cand in enumerate(candidates):
            candidate_rows.append(
                {
                    "idx": position,
                    "strategy": cand.get("strategy", "unknown"),
                    "selected": position == selected_idx,
                    **scored[position],
                }
            )

    # A non-empty candidate_rows implies selected_idx was set above, so the
    # range comparison is only evaluated with a concrete index.
    if candidate_rows and 0 <= selected_idx < len(candidate_rows):
        # Bookkeeping keys belong to the per-candidate view only.
        bookkeeping = ("idx", "strategy", "selected")
        top_scores = {
            key: value
            for key, value in candidate_rows[selected_idx].items()
            if key not in bookkeeping
        }
    else:
        top_scores = dict(compute_faithfulness(response, chunks))
        top_scores["relevance"] = compute_relevance(response, query)["relevance"]

    result = dict(top_scores)
    result.update(
        {
            "t_total_s": efficiency["t_total"],
            "slo_target_s": efficiency["slo_target"],
            "slo_passed": efficiency["slo_passed"],
            "slo_margin_s": efficiency["margin_s"],
            "multimodal_alignment": alignment["overall_score"],
            "affect_alignment": alignment["affect_alignment"],
            "gesture_alignment": alignment["gesture_alignment"],
            "gaze_alignment": alignment["gaze_alignment"],
            "explain": alignment.get("explain", {}),
        }
    )

    if not candidate_rows:
        result["candidate_diversity"] = 0.0
        result["n_candidates"] = 1 if response else 0
        return result

    result["candidates_eval"] = candidate_rows
    candidate_count = len(candidates)
    if candidate_count < 2:
        # A single candidate has nothing to be diverse against.
        result["candidate_diversity"] = 0.0
        result["n_candidates"] = candidate_count
    elif cand_vecs is not None:
        # Reuse the vectors already produced by the relevance pass.
        result.update(_diversity_from_vecs(cand_vecs))
    else:
        # The relevance pass was skipped (e.g. empty query), so no vectors
        # exist yet; let the standalone scorer handle the candidates.
        result.update(compute_candidate_diversity(candidates))
    return result
|