shwetangisingh committed on
Commit
0e19ba2
·
1 Parent(s): 69345ca

bug fixes

Browse files
backend/evals/__init__.py CHANGED
@@ -32,14 +32,12 @@ def _score_candidates_batched(
32
  else:
33
  relevances = [0.0] * len(texts)
34
 
35
- scores = [{**faiths[i], "relevance": relevances[i]} for i in range(len(candidates))]
36
  return scores, cand_vecs
37
 
38
 
39
  def _diversity_from_vecs(cand_vecs: "torch.Tensor") -> dict:
40
  n = cand_vecs.shape[0]
41
- if n < 2:
42
- return {"candidate_diversity": 0.0, "n_candidates": n}
43
  sims = cand_vecs @ cand_vecs.T
44
  iu = torch.triu_indices(n, n, offset=1)
45
  return {
@@ -75,11 +73,11 @@ def compute_evals(
75
  per_cand: list[dict] = []
76
  cand_vecs = None
77
  if candidates:
78
- if selected_idx is None and response:
79
- for i, c in enumerate(candidates):
80
- if c.get("text", "").strip() == response.strip():
81
- selected_idx = i
82
- break
83
  scored, cand_vecs = _score_candidates_batched(candidates, chunks, query)
84
  per_cand = [
85
  {
@@ -117,11 +115,16 @@ def compute_evals(
117
 
118
  if per_cand:
119
  out["candidates_eval"] = per_cand
120
- # Reuse cand_vecs from the relevance pass when available; falls back to
121
- # standalone BGE encode (e.g. when query was empty).
122
- if cand_vecs is not None:
 
 
 
123
  out.update(_diversity_from_vecs(cand_vecs))
124
  else:
 
 
125
  out.update(compute_candidate_diversity(candidates))
126
  else:
127
  out["candidate_diversity"] = 0.0
 
32
  else:
33
  relevances = [0.0] * len(texts)
34
 
35
+ scores = [{**f, "relevance": r} for f, r in zip(faiths, relevances, strict=True)]
36
  return scores, cand_vecs
37
 
38
 
39
  def _diversity_from_vecs(cand_vecs: "torch.Tensor") -> dict:
40
  n = cand_vecs.shape[0]
 
 
41
  sims = cand_vecs @ cand_vecs.T
42
  iu = torch.triu_indices(n, n, offset=1)
43
  return {
 
73
  per_cand: list[dict] = []
74
  cand_vecs = None
75
  if candidates:
76
+ # The planner serves uniq[0] as `selected_response`, so when caller
77
+ # didn't pass selected_idx explicitly, default to 0 rather than
78
+ # text-matching (which can collide on duplicate candidate texts).
79
+ if selected_idx is None:
80
+ selected_idx = 0
81
  scored, cand_vecs = _score_candidates_batched(candidates, chunks, query)
82
  per_cand = [
83
  {
 
115
 
116
  if per_cand:
117
  out["candidates_eval"] = per_cand
118
+ n = len(candidates)
119
+ if n < 2:
120
+ out["candidate_diversity"] = 0.0
121
+ out["n_candidates"] = n
122
+ elif cand_vecs is not None:
123
+ # Reuse vectors from the relevance pass.
124
  out.update(_diversity_from_vecs(cand_vecs))
125
  else:
126
+ # Standalone BGE encode (e.g. when query was empty so the relevance
127
+ # pass was skipped).
128
  out.update(compute_candidate_diversity(candidates))
129
  else:
130
  out["candidate_diversity"] = 0.0
backend/evals/aggregate.py CHANGED
@@ -11,6 +11,9 @@ from pathlib import Path
11
 
12
  from backend.config.settings import settings
13
 
 
 
 
14
 
15
  def _load(path: Path) -> list[dict]:
16
  if not path.exists():
@@ -256,11 +259,11 @@ def report_picker(turns: list[dict], picks: list[dict], evals: list[dict]) -> No
256
  ]
257
  if div_scored:
258
  diversities = [float(e["candidate_diversity"]) for e in div_scored]
259
- low = sum(1 for d in diversities if d < 0.1)
260
  print(
261
  f"\nCandidate diversity (n={len(div_scored)} turns): "
262
  f"mean={statistics.mean(diversities):.2f} "
263
- f"low (<0.10): {low}/{len(div_scored)} ({low / len(div_scored):.0%})"
264
  )
265
 
266
 
 
11
 
12
  from backend.config.settings import settings
13
 
14
+ # Mean pairwise cosine distance below this means the picker showed near-paraphrases.
15
+ _DIVERSITY_FLOOR = 0.10
16
+
17
 
18
  def _load(path: Path) -> list[dict]:
19
  if not path.exists():
 
259
  ]
260
  if div_scored:
261
  diversities = [float(e["candidate_diversity"]) for e in div_scored]
262
+ low = sum(1 for d in diversities if d < _DIVERSITY_FLOOR)
263
  print(
264
  f"\nCandidate diversity (n={len(div_scored)} turns): "
265
  f"mean={statistics.mean(diversities):.2f} "
266
+ f"low (<{_DIVERSITY_FLOOR:.2f}): {low}/{len(div_scored)} ({low / len(div_scored):.0%})"
267
  )
268
 
269
 
backend/evals/diversity.py CHANGED
@@ -4,7 +4,9 @@ import torch
4
  def compute_candidate_diversity(candidates: list[dict]) -> dict:
5
  """Mean pairwise cosine *distance* among candidate texts.
6
 
7
- 1.0 = maximally different, 0.0 = identical paraphrases.
 
 
8
  """
9
  texts = [c.get("text", "").strip() for c in candidates]
10
  texts = [t for t in texts if t]
 
4
  def compute_candidate_diversity(candidates: list[dict]) -> dict:
5
  """Mean pairwise cosine *distance* among candidate texts.
6
 
7
+ 1.0 = maximally different, 0.0 = identical paraphrases. Empty candidate
8
+ texts are filtered out before encoding, so `n_candidates` in the result
9
+ is the count of *non-empty* texts (may be < len(candidates)).
10
  """
11
  texts = [c.get("text", "").strip() for c in candidates]
12
  texts = [t for t in texts if t]
backend/evals/relevance.py CHANGED
@@ -5,12 +5,9 @@ def compute_relevance(response: str, query: str) -> dict:
5
  its answer), so we use the same embedding space the retriever uses.
6
  """
7
  if not response.strip() or not query.strip():
8
- return {"relevance": 0.0, "no_query": not query.strip()}
9
 
10
  from backend.retrieval.vector_store import embed_texts
11
 
12
  vecs = embed_texts([query, response])
13
- return {
14
- "relevance": round(max(0.0, float(vecs[0] @ vecs[1])), 4),
15
- "no_query": False,
16
- }
 
5
  its answer), so we use the same embedding space the retriever uses.
6
  """
7
  if not response.strip() or not query.strip():
8
+ return {"relevance": 0.0}
9
 
10
  from backend.retrieval.vector_store import embed_texts
11
 
12
  vecs = embed_texts([query, response])
13
+ return {"relevance": round(max(0.0, float(vecs[0] @ vecs[1])), 4)}
 
 
 
frontend/src/components/EvalPanel.tsx CHANGED
@@ -265,23 +265,21 @@ function EvalPanelImpl({
265
  )}
266
  </>
267
  )}
268
- <span className="tip" data-tip="Rate how authentic this response felt as the persona (1 = off, 5 = spot on). Logged to ratings.jsonl.">
269
- <div className="star-rating">
270
- {[1, 2, 3, 4, 5].map((star) => (
271
- <button
272
- key={star}
273
- className={`star ${star <= (hover || (value ?? 0)) ? "active" : ""}`}
274
- onMouseEnter={() => setHover(star)}
275
- onMouseLeave={() => setHover(0)}
276
- onClick={() => rate(star)}
277
- disabled={value !== null || submitting}
278
- >
279
-
280
- </button>
281
- ))}
282
- {value !== null && <span className="star-label">{value}/5</span>}
283
- </div>
284
- </span>
285
  </div>
286
  </div>
287
  );
 
265
  )}
266
  </>
267
  )}
268
+ <div className="tip star-rating" data-tip="Rate how authentic this response felt as the persona (1 = off, 5 = spot on). Logged to ratings.jsonl.">
269
+ {[1, 2, 3, 4, 5].map((star) => (
270
+ <button
271
+ key={star}
272
+ className={`star ${star <= (hover || (value ?? 0)) ? "active" : ""}`}
273
+ onMouseEnter={() => setHover(star)}
274
+ onMouseLeave={() => setHover(0)}
275
+ onClick={() => rate(star)}
276
+ disabled={value !== null || submitting}
277
+ >
278
+
279
+ </button>
280
+ ))}
281
+ {value !== null && <span className="star-label">{value}/5</span>}
282
+ </div>
 
 
283
  </div>
284
  </div>
285
  );