Rohan03 committed on
Commit
bc30484
·
verified ·
1 Parent(s): 4a0cbd0

Sprint 10A: shadow_eval.py — compare candidate vs baseline safely

Browse files
purpose_agent/optimization/shadow_eval.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ shadow_eval.py — Shadow evaluation: compare candidate changes vs production baseline.
3
+
4
+ Before any optimization is promoted (new prompt pack, new skill, new adapter),
5
+ it must pass shadow evaluation under RunMode.EVAL_TEST (no writes).
6
+
7
+ Candidate promoted only if: candidate_score >= baseline_score * threshold
8
+ Rollback if: candidate falls below 80% of baseline (more than 20% degradation).
9
+ """
10
+ from __future__ import annotations
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
+
15
@dataclass
class ShadowResult:
    """Result of comparing a candidate score against a baseline score.

    ``passed`` and ``detail`` are always recomputed in ``__post_init__`` from
    the two scores and the threshold, so any values supplied for them at
    construction time are overwritten.
    """

    # Aggregate score of the current (production) configuration.
    baseline_score: float
    # Aggregate score of the configuration under evaluation.
    candidate_score: float
    # Fraction of the baseline the candidate must reach (0.95 -> >= 95%).
    threshold: float = 0.95
    # True when candidate_score >= baseline_score * threshold.
    passed: bool = False
    # Human-readable PASS/FAIL summary of the comparison.
    detail: str = ""

    def __post_init__(self):
        """Derive ``passed`` and ``detail`` from the scores and threshold."""
        self.passed = self.candidate_score >= self.baseline_score * self.threshold
        delta = self.candidate_score - self.baseline_score
        if self.passed:
            # Show the same "baseline×threshold" bar as the FAIL branch: a
            # candidate may pass while still scoring below the raw baseline,
            # so claiming "candidate >= baseline" would be misleading.
            self.detail = (
                f"PASS: candidate={self.candidate_score:.3f} >= "
                f"baseline={self.baseline_score:.3f}×{self.threshold} (Δ={delta:+.3f})"
            )
        else:
            self.detail = (
                f"FAIL: candidate={self.candidate_score:.3f} < "
                f"baseline={self.baseline_score:.3f}×{self.threshold} (Δ={delta:+.3f})"
            )
31
+
32
+
33
class ShadowEvaluator:
    """
    Compares a candidate configuration against the current baseline.

    Usage:
        evaluator = ShadowEvaluator(threshold=0.95)

        # Run baseline
        baseline_scores = [run_eval(baseline_config, case) for case in eval_cases]

        # Run candidate
        candidate_scores = [run_eval(candidate_config, case) for case in eval_cases]

        # Compare
        result = evaluator.compare(baseline_scores, candidate_scores)
        if result.passed:
            promote(candidate_config)
        else:
            rollback()
    """

    def __init__(self, threshold: float = 0.95, rollback_threshold: float = 0.8):
        """Create an evaluator.

        Args:
            threshold: Fraction of the baseline average the candidate average
                must reach for a comparison to pass.
            rollback_threshold: Fraction of the baseline score below which
                ``should_rollback`` reports True (0.8 -> 20% degradation).
        """
        self.threshold = threshold
        self.rollback_threshold = rollback_threshold
        self._history: list[ShadowResult] = []

    def compare(self, baseline_scores: list[float], candidate_scores: list[float]) -> ShadowResult:
        """Compare the mean of each score list and return a ShadowResult.

        If either list is empty there is nothing to compare: a failed result
        with zero scores is returned and it is not recorded in the history.
        Otherwise the result is appended to the history before being returned.
        """
        if not baseline_scores or not candidate_scores:
            result = ShadowResult(baseline_score=0, candidate_score=0, threshold=self.threshold)
            # ShadowResult derives passed from 0 >= 0 * threshold, which is
            # True; a candidate with no evaluation data must never pass.
            result.passed = False
            result.detail = "FAIL: no scores to compare (empty baseline or candidate scores)"
            return result

        baseline_avg = sum(baseline_scores) / len(baseline_scores)
        candidate_avg = sum(candidate_scores) / len(candidate_scores)

        result = ShadowResult(
            baseline_score=baseline_avg,
            candidate_score=candidate_avg,
            threshold=self.threshold,
        )
        self._history.append(result)
        return result

    def should_promote(self, result: ShadowResult) -> bool:
        """Explicit promotion decision: True when the result passed."""
        return result.passed

    def should_rollback(self, result: ShadowResult) -> bool:
        """Explicit rollback decision (candidate is significantly worse).

        True when the candidate score is below
        ``baseline_score * rollback_threshold``.
        """
        return result.candidate_score < result.baseline_score * self.rollback_threshold

    @property
    def history(self) -> list[ShadowResult]:
        """Results recorded by ``compare`` so far (the internal list itself)."""
        return self._history