File size: 4,873 Bytes
9ad188a
69345ca
 
 
9ad188a
69345ca
9ad188a
69345ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e19ba2
69345ca
 
 
 
 
 
 
 
 
 
 
9ad188a
 
 
 
 
 
 
 
 
 
69345ca
 
 
9ad188a
69345ca
 
 
 
 
 
 
9ad188a
 
 
 
 
69345ca
 
 
0e19ba2
 
 
 
 
69345ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ad188a
 
 
 
 
 
 
 
69345ca
9ad188a
69345ca
 
 
0e19ba2
 
 
 
 
 
69345ca
 
0e19ba2
 
69345ca
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Evaluation metrics — compute after pipeline returns, before API response.
import torch

from backend.evals.diversity import compute_candidate_diversity
from backend.evals.efficiency import compute_efficiency
from backend.evals.faithfulness import compute_faithfulness, compute_faithfulness_batch
from backend.evals.multimodal_alignment import compute_multimodal_alignment
from backend.evals.relevance import compute_relevance


def _score_candidates_batched(
    candidates: list[dict],
    chunks: list[dict],
    query: str,
) -> tuple[list[dict], "torch.Tensor | None"]:
    """Score every candidate with one faithfulness batch and one embedding call.

    Each candidate's ``"text"`` (empty string when the key is missing) is sent
    to ``compute_faithfulness_batch`` together with ``chunks``. If the query
    and at least one candidate text are non-blank, the query and all texts are
    embedded in a single ``embed_texts`` call and each candidate's relevance is
    its dot product with the query vector, clamped at 0.0 and rounded to four
    decimals. Otherwise every relevance is 0.0 and nothing is embedded.

    Returns:
        ``(scores, cand_vecs)`` where ``scores[i]`` is candidate ``i``'s
        faithfulness dict plus a ``"relevance"`` key, and ``cand_vecs`` holds
        the candidate embeddings (for diversity reuse) or ``None`` when the
        embedding pass was skipped.
    """
    cand_texts = [cand.get("text", "") for cand in candidates]
    faithfulness = compute_faithfulness_batch(cand_texts, chunks)

    # Embedding is only worthwhile when both sides have something to compare.
    needs_embedding = bool(query.strip()) and any(t.strip() for t in cand_texts)

    cand_vecs = None
    relevances: list[float] = []
    if needs_embedding:
        # Imported lazily so the vector store is only touched when needed.
        from backend.retrieval.vector_store import embed_texts

        embedded = embed_texts([query] + cand_texts)
        # Row 0 is the query; the remaining rows line up with cand_texts.
        query_vec, cand_vecs = embedded[0], embedded[1:]
        for pos in range(len(cand_texts)):
            similarity = float(query_vec @ cand_vecs[pos])
            relevances.append(round(max(0.0, similarity), 4))
    else:
        relevances = [0.0] * len(cand_texts)

    scores: list[dict] = []
    # strict=True: a length mismatch between the two passes is a bug, not data.
    for faith, rel in zip(faithfulness, relevances, strict=True):
        entry = dict(faith)
        entry["relevance"] = rel
        scores.append(entry)
    return scores, cand_vecs


def _diversity_from_vecs(cand_vecs: "torch.Tensor") -> dict:
    n = cand_vecs.shape[0]
    sims = cand_vecs @ cand_vecs.T
    iu = torch.triu_indices(n, n, offset=1)
    return {
        "candidate_diversity": round(float(1.0 - sims[iu[0], iu[1]].mean().item()), 4),
        "n_candidates": n,
    }


def compute_evals(
    response: str,
    chunks: list[dict],
    latency_log: dict,
    affect: str | None,
    gesture_tag: str | None,
    gaze_bucket: str | None,
    slo_target: float = 6.0,
    query: str = "",
    candidates: list[dict] | None = None,
    selected_idx: int | None = None,
) -> dict:
    """Run every eval scorer and merge the results into one flat dict.

    Efficiency and multimodal alignment are always computed from
    ``latency_log`` / ``response``. Faithfulness and relevance come from one
    of two paths:

    * With ``candidates``: all candidates are scored in one batched pass and
      the selected candidate's scores become the top-level fields, so
      consumers of the top-level keys need no changes. ``selected_idx``
      defaults to 0 when omitted.
    * Without candidates, or when ``selected_idx`` is out of range:
      ``response`` is scored directly with the single-item scorers.

    Returns:
        Dict holding the top-level faithfulness/relevance scores, the SLO
        fields (``t_total_s``, ``slo_target_s``, ``slo_passed``,
        ``slo_margin_s``), the alignment fields, ``candidate_diversity`` and
        ``n_candidates``; plus ``candidates_eval`` (per-candidate rows) when
        candidates were supplied.
    """
    efficiency = compute_efficiency(latency_log, slo_target)
    alignment = compute_multimodal_alignment(
        response, affect, gesture_tag, gaze_bucket, chunks
    )

    candidate_rows: list[dict] = []
    vectors = None
    if candidates:
        # The planner serves uniq[0] as `selected_response`, so an omitted
        # selected_idx means "first candidate". Text-matching is avoided
        # because duplicate candidate texts would collide.
        if selected_idx is None:
            selected_idx = 0
        batch_scores, vectors = _score_candidates_batched(candidates, chunks, query)
        for pos, cand in enumerate(candidates):
            row = {
                "idx": pos,
                "strategy": cand.get("strategy", "unknown"),
                "selected": pos == selected_idx,
            }
            row.update(batch_scores[pos])
            candidate_rows.append(row)

    # selected_idx is never None once candidate_rows is non-empty (it was
    # defaulted above), so only the range check is needed after the first test.
    if not candidate_rows or not (0 <= selected_idx < len(candidate_rows)):
        faith = compute_faithfulness(response, chunks)
        relevance = compute_relevance(response, query)["relevance"]
        headline = {**faith, "relevance": relevance}
    else:
        # Drop the per-candidate bookkeeping keys before promoting the scores.
        row_only_keys = ("idx", "strategy", "selected")
        headline = {
            key: value
            for key, value in candidate_rows[selected_idx].items()
            if key not in row_only_keys
        }

    out = dict(headline)
    out.update(
        {
            "t_total_s": efficiency["t_total"],
            "slo_target_s": efficiency["slo_target"],
            "slo_passed": efficiency["slo_passed"],
            "slo_margin_s": efficiency["margin_s"],
            "multimodal_alignment": alignment["overall_score"],
            "affect_alignment": alignment["affect_alignment"],
            "gesture_alignment": alignment["gesture_alignment"],
            "gaze_alignment": alignment["gaze_alignment"],
            "explain": alignment.get("explain", {}),
        }
    )

    if not candidate_rows:
        out["candidate_diversity"] = 0.0
        out["n_candidates"] = 1 if response else 0
        return out

    out["candidates_eval"] = candidate_rows
    total = len(candidates)
    if total < 2:
        # A single candidate has nothing to be diverse against.
        out["candidate_diversity"] = 0.0
        out["n_candidates"] = total
    elif vectors is not None:
        # Reuse the embeddings already produced by the relevance pass.
        out.update(_diversity_from_vecs(vectors))
    else:
        # The relevance pass was skipped (e.g. empty query), so diversity
        # needs its own embedding pass.
        out.update(compute_candidate_diversity(candidates))
    return out