Spaces:
Sleeping
Sleeping
| # Evaluation metrics — compute after pipeline returns, before API response. | |
| import torch | |
| from backend.evals.diversity import compute_candidate_diversity | |
| from backend.evals.efficiency import compute_efficiency | |
| from backend.evals.faithfulness import compute_faithfulness, compute_faithfulness_batch | |
| from backend.evals.multimodal_alignment import compute_multimodal_alignment | |
| from backend.evals.relevance import compute_relevance | |
def _score_candidates_batched(
    candidates: list[dict],
    chunks: list[dict],
    query: str,
) -> tuple[list[dict], "torch.Tensor | None"]:
    """Score every candidate in one batched faithfulness pass and one embedding pass.

    Args:
        candidates: Candidate dicts; only the ``"text"`` key is read here. A
            missing or ``None`` text is treated as the empty string.
        chunks: Retrieved chunks, forwarded unchanged to
            ``compute_faithfulness_batch``.
        query: User query. If it is blank, or every candidate text is blank,
            the embedding pass is skipped and every relevance is ``0.0``.

    Returns:
        A pair ``(scores, cand_vecs)``. ``scores`` holds one dict per
        candidate: the faithfulness result for that candidate plus a
        ``"relevance"`` key. ``cand_vecs`` holds the candidate vectors
        returned by ``embed_texts`` (kept so the caller can reuse them for
        diversity), or ``None`` when no embedding pass was needed.

    Raises:
        ValueError: If the number of faithfulness results and the number of
            relevance scores differ (``zip(..., strict=True)``).
    """
    # ``dict.get("text", "")`` only covers a *missing* key; an explicit
    # ``"text": None`` would slip through and crash on ``.strip()`` below.
    texts = [c.get("text") or "" for c in candidates]
    faiths = compute_faithfulness_batch(texts, chunks)

    cand_vecs: torch.Tensor | None = None
    if query.strip() and any(t.strip() for t in texts):
        # Function-scope import kept from the original; presumably avoids
        # loading the embedding backend when it is not needed -- TODO confirm.
        from backend.retrieval.vector_store import embed_texts

        # Single encode call: the query goes first so it can be split off.
        vecs = embed_texts([query] + texts)
        q_vec, cand_vecs = vecs[0], vecs[1:]
        # Dot product of the query with each candidate row, clamped at 0 so
        # negative similarities are reported as "not relevant".
        # NOTE(review): this is cosine similarity only if embed_texts returns
        # normalised vectors -- confirm.
        relevances = [round(max(0.0, float(q_vec @ vec)), 4) for vec in cand_vecs]
    else:
        relevances = [0.0] * len(texts)

    scores = [{**f, "relevance": r} for f, r in zip(faiths, relevances, strict=True)]
    return scores, cand_vecs
| def _diversity_from_vecs(cand_vecs: "torch.Tensor") -> dict: | |
| n = cand_vecs.shape[0] | |
| sims = cand_vecs @ cand_vecs.T | |
| iu = torch.triu_indices(n, n, offset=1) | |
| return { | |
| "candidate_diversity": round(float(1.0 - sims[iu[0], iu[1]].mean().item()), 4), | |
| "n_candidates": n, | |
| } | |
def compute_evals(
    response: str,
    chunks: list[dict],
    latency_log: dict,
    affect: str | None,
    gesture_tag: str | None,
    gaze_bucket: str | None,
    slo_target: float = 6.0,
    query: str = "",
    candidates: list[dict] | None = None,
    selected_idx: int | None = None,
) -> dict:
    """Run every eval scorer and merge the results into one flat dict.

    Efficiency and multimodal alignment are always computed for ``response``.
    Faithfulness and relevance come from one of two paths:

    * With ``candidates``: all candidates are scored in one batched call, and
      the scores of the candidate at ``selected_idx`` (default 0) become the
      top-level fields, so consumers of those fields need no changes.
    * Without candidates, or when ``selected_idx`` is out of range:
      ``response`` is scored on its own.

    When candidates were scored, the result also carries ``candidates_eval``
    (one row per candidate). ``candidate_diversity`` and ``n_candidates`` are
    always present.
    """
    efficiency = compute_efficiency(latency_log, slo_target)
    alignment = compute_multimodal_alignment(
        response, affect, gesture_tag, gaze_bucket, chunks
    )

    candidate_rows: list[dict] = []
    cand_vecs = None
    if candidates:
        # Per the original author's note: the planner serves uniq[0] as the
        # selected response, so index 0 is the default. Matching on text
        # could collide when two candidates share the same text.
        if selected_idx is None:
            selected_idx = 0
        batch_scores, cand_vecs = _score_candidates_batched(candidates, chunks, query)
        for pos, cand in enumerate(candidates):
            candidate_rows.append(
                {
                    "idx": pos,
                    "strategy": cand.get("strategy", "unknown"),
                    "selected": pos == selected_idx,
                    **batch_scores[pos],
                }
            )

    if (
        candidate_rows
        and selected_idx is not None
        and 0 <= selected_idx < len(candidate_rows)
    ):
        # Reuse the selected candidate's scores, minus row-only bookkeeping.
        top = dict(candidate_rows[selected_idx])
        for row_only_key in ("idx", "strategy", "selected"):
            top.pop(row_only_key, None)
    else:
        # No usable candidate row: score the served response directly.
        top = dict(compute_faithfulness(response, chunks))
        top["relevance"] = compute_relevance(response, query)["relevance"]

    out = dict(top)
    out.update(
        t_total_s=efficiency["t_total"],
        slo_target_s=efficiency["slo_target"],
        slo_passed=efficiency["slo_passed"],
        slo_margin_s=efficiency["margin_s"],
        multimodal_alignment=alignment["overall_score"],
        affect_alignment=alignment["affect_alignment"],
        gesture_alignment=alignment["gesture_alignment"],
        gaze_alignment=alignment["gaze_alignment"],
        explain=alignment.get("explain", {}),
    )

    if not candidate_rows:
        # Nothing to compare: the response counts as a single candidate, or
        # as zero when the response is empty.
        out["candidate_diversity"] = 0.0
        out["n_candidates"] = 1 if response else 0
        return out

    out["candidates_eval"] = candidate_rows
    n_cands = len(candidates)
    if n_cands < 2:
        out["candidate_diversity"] = 0.0
        out["n_candidates"] = n_cands
    elif cand_vecs is None:
        # The relevance pass was skipped (e.g. empty query), so there are no
        # vectors to reuse; fall back to the standalone scorer.
        out.update(compute_candidate_diversity(candidates))
    else:
        # Reuse the vectors already produced by the relevance pass.
        out.update(_diversity_from_vecs(cand_vecs))
    return out