Spaces:
Sleeping
Sleeping
Commit ·
0e19ba2
1
Parent(s): 69345ca
bug fixes
Browse files
- backend/evals/__init__.py +14 -11
- backend/evals/aggregate.py +5 -2
- backend/evals/diversity.py +3 -1
- backend/evals/relevance.py +2 -5
- frontend/src/components/EvalPanel.tsx +15 -17
backend/evals/__init__.py
CHANGED
|
@@ -32,14 +32,12 @@ def _score_candidates_batched(
|
|
| 32 |
else:
|
| 33 |
relevances = [0.0] * len(texts)
|
| 34 |
|
| 35 |
-
scores = [{**
|
| 36 |
return scores, cand_vecs
|
| 37 |
|
| 38 |
|
| 39 |
def _diversity_from_vecs(cand_vecs: "torch.Tensor") -> dict:
|
| 40 |
n = cand_vecs.shape[0]
|
| 41 |
-
if n < 2:
|
| 42 |
-
return {"candidate_diversity": 0.0, "n_candidates": n}
|
| 43 |
sims = cand_vecs @ cand_vecs.T
|
| 44 |
iu = torch.triu_indices(n, n, offset=1)
|
| 45 |
return {
|
|
@@ -75,11 +73,11 @@ def compute_evals(
|
|
| 75 |
per_cand: list[dict] = []
|
| 76 |
cand_vecs = None
|
| 77 |
if candidates:
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
scored, cand_vecs = _score_candidates_batched(candidates, chunks, query)
|
| 84 |
per_cand = [
|
| 85 |
{
|
|
@@ -117,11 +115,16 @@ def compute_evals(
|
|
| 117 |
|
| 118 |
if per_cand:
|
| 119 |
out["candidates_eval"] = per_cand
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
| 123 |
out.update(_diversity_from_vecs(cand_vecs))
|
| 124 |
else:
|
|
|
|
|
|
|
| 125 |
out.update(compute_candidate_diversity(candidates))
|
| 126 |
else:
|
| 127 |
out["candidate_diversity"] = 0.0
|
|
|
|
| 32 |
else:
|
| 33 |
relevances = [0.0] * len(texts)
|
| 34 |
|
| 35 |
+
scores = [{**f, "relevance": r} for f, r in zip(faiths, relevances, strict=True)]
|
| 36 |
return scores, cand_vecs
|
| 37 |
|
| 38 |
|
| 39 |
def _diversity_from_vecs(cand_vecs: "torch.Tensor") -> dict:
|
| 40 |
n = cand_vecs.shape[0]
|
|
|
|
|
|
|
| 41 |
sims = cand_vecs @ cand_vecs.T
|
| 42 |
iu = torch.triu_indices(n, n, offset=1)
|
| 43 |
return {
|
|
|
|
| 73 |
per_cand: list[dict] = []
|
| 74 |
cand_vecs = None
|
| 75 |
if candidates:
|
| 76 |
+
# The planner serves uniq[0] as `selected_response`, so when caller
|
| 77 |
+
# didn't pass selected_idx explicitly, default to 0 rather than
|
| 78 |
+
# text-matching (which can collide on duplicate candidate texts).
|
| 79 |
+
if selected_idx is None:
|
| 80 |
+
selected_idx = 0
|
| 81 |
scored, cand_vecs = _score_candidates_batched(candidates, chunks, query)
|
| 82 |
per_cand = [
|
| 83 |
{
|
|
|
|
| 115 |
|
| 116 |
if per_cand:
|
| 117 |
out["candidates_eval"] = per_cand
|
| 118 |
+
n = len(candidates)
|
| 119 |
+
if n < 2:
|
| 120 |
+
out["candidate_diversity"] = 0.0
|
| 121 |
+
out["n_candidates"] = n
|
| 122 |
+
elif cand_vecs is not None:
|
| 123 |
+
# Reuse vectors from the relevance pass.
|
| 124 |
out.update(_diversity_from_vecs(cand_vecs))
|
| 125 |
else:
|
| 126 |
+
# Standalone BGE encode (e.g. when query was empty so the relevance
|
| 127 |
+
# pass was skipped).
|
| 128 |
out.update(compute_candidate_diversity(candidates))
|
| 129 |
else:
|
| 130 |
out["candidate_diversity"] = 0.0
|
backend/evals/aggregate.py
CHANGED
|
@@ -11,6 +11,9 @@ from pathlib import Path
|
|
| 11 |
|
| 12 |
from backend.config.settings import settings
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def _load(path: Path) -> list[dict]:
|
| 16 |
if not path.exists():
|
|
@@ -256,11 +259,11 @@ def report_picker(turns: list[dict], picks: list[dict], evals: list[dict]) -> No
|
|
| 256 |
]
|
| 257 |
if div_scored:
|
| 258 |
diversities = [float(e["candidate_diversity"]) for e in div_scored]
|
| 259 |
-
low = sum(1 for d in diversities if d <
|
| 260 |
print(
|
| 261 |
f"\nCandidate diversity (n={len(div_scored)} turns): "
|
| 262 |
f"mean={statistics.mean(diversities):.2f} "
|
| 263 |
-
f"low (<
|
| 264 |
)
|
| 265 |
|
| 266 |
|
|
|
|
| 11 |
|
| 12 |
from backend.config.settings import settings
|
| 13 |
|
| 14 |
+
# Mean pairwise cosine distance below this means the picker showed near-paraphrases.
|
| 15 |
+
_DIVERSITY_FLOOR = 0.10
|
| 16 |
+
|
| 17 |
|
| 18 |
def _load(path: Path) -> list[dict]:
|
| 19 |
if not path.exists():
|
|
|
|
| 259 |
]
|
| 260 |
if div_scored:
|
| 261 |
diversities = [float(e["candidate_diversity"]) for e in div_scored]
|
| 262 |
+
low = sum(1 for d in diversities if d < _DIVERSITY_FLOOR)
|
| 263 |
print(
|
| 264 |
f"\nCandidate diversity (n={len(div_scored)} turns): "
|
| 265 |
f"mean={statistics.mean(diversities):.2f} "
|
| 266 |
+
f"low (<{_DIVERSITY_FLOOR:.2f}): {low}/{len(div_scored)} ({low / len(div_scored):.0%})"
|
| 267 |
)
|
| 268 |
|
| 269 |
|
backend/evals/diversity.py
CHANGED
|
@@ -4,7 +4,9 @@ import torch
|
|
| 4 |
def compute_candidate_diversity(candidates: list[dict]) -> dict:
|
| 5 |
"""Mean pairwise cosine *distance* among candidate texts.
|
| 6 |
|
| 7 |
-
1.0 = maximally different, 0.0 = identical paraphrases.
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
texts = [c.get("text", "").strip() for c in candidates]
|
| 10 |
texts = [t for t in texts if t]
|
|
|
|
| 4 |
def compute_candidate_diversity(candidates: list[dict]) -> dict:
|
| 5 |
"""Mean pairwise cosine *distance* among candidate texts.
|
| 6 |
|
| 7 |
+
1.0 = maximally different, 0.0 = identical paraphrases. Empty candidate
|
| 8 |
+
texts are filtered out before encoding, so `n_candidates` in the result
|
| 9 |
+
is the count of *non-empty* texts (may be < len(candidates)).
|
| 10 |
"""
|
| 11 |
texts = [c.get("text", "").strip() for c in candidates]
|
| 12 |
texts = [t for t in texts if t]
|
backend/evals/relevance.py
CHANGED
|
@@ -5,12 +5,9 @@ def compute_relevance(response: str, query: str) -> dict:
|
|
| 5 |
its answer), so we use the same embedding space the retriever uses.
|
| 6 |
"""
|
| 7 |
if not response.strip() or not query.strip():
|
| 8 |
-
return {"relevance": 0.0, "no_query": True}
|
| 9 |
|
| 10 |
from backend.retrieval.vector_store import embed_texts
|
| 11 |
|
| 12 |
vecs = embed_texts([query, response])
|
| 13 |
-
return {
|
| 14 |
-
"relevance": round(max(0.0, float(vecs[0] @ vecs[1])), 4),
|
| 15 |
-
"no_query": False,
|
| 16 |
-
}
|
|
|
|
| 5 |
its answer), so we use the same embedding space the retriever uses.
|
| 6 |
"""
|
| 7 |
if not response.strip() or not query.strip():
|
| 8 |
+
return {"relevance": 0.0}
|
| 9 |
|
| 10 |
from backend.retrieval.vector_store import embed_texts
|
| 11 |
|
| 12 |
vecs = embed_texts([query, response])
|
| 13 |
+
return {"relevance": round(max(0.0, float(vecs[0] @ vecs[1])), 4)}
|
|
|
|
|
|
|
|
|
frontend/src/components/EvalPanel.tsx
CHANGED
|
@@ -265,23 +265,21 @@ function EvalPanelImpl({
|
|
| 265 |
)}
|
| 266 |
</>
|
| 267 |
)}
|
| 268 |
-
<
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
</div>
|
| 284 |
-
</span>
|
| 285 |
</div>
|
| 286 |
</div>
|
| 287 |
);
|
|
|
|
| 265 |
)}
|
| 266 |
</>
|
| 267 |
)}
|
| 268 |
+
<div className="tip star-rating" data-tip="Rate how authentic this response felt as the persona (1 = off, 5 = spot on). Logged to ratings.jsonl.">
|
| 269 |
+
{[1, 2, 3, 4, 5].map((star) => (
|
| 270 |
+
<button
|
| 271 |
+
key={star}
|
| 272 |
+
className={`star ${star <= (hover || (value ?? 0)) ? "active" : ""}`}
|
| 273 |
+
onMouseEnter={() => setHover(star)}
|
| 274 |
+
onMouseLeave={() => setHover(0)}
|
| 275 |
+
onClick={() => rate(star)}
|
| 276 |
+
disabled={value !== null || submitting}
|
| 277 |
+
>
|
| 278 |
+
★
|
| 279 |
+
</button>
|
| 280 |
+
))}
|
| 281 |
+
{value !== null && <span className="star-label">{value}/5</span>}
|
| 282 |
+
</div>
|
|
|
|
|
|
|
| 283 |
</div>
|
| 284 |
</div>
|
| 285 |
);
|