Rohan03
/

purpose-agent

+"""
+shadow_eval.py — Shadow evaluation: compare candidate changes vs production baseline.
+Before any optimization is promoted (new prompt pack, new skill, new adapter),
+it must pass shadow evaluation under RunMode.EVAL_TEST (no writes).
+Candidate promoted only if: candidate_score >= baseline_score * threshold
+Rollback if: candidate falls below 80% of baseline (see ShadowEvaluator.should_rollback).
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
@dataclass
class ShadowResult:
    """Result of comparing a candidate score against a baseline score.

    ``passed`` and ``detail`` are derived in ``__post_init__`` from
    ``baseline_score``, ``candidate_score`` and ``threshold``; any values
    supplied for them at construction time are overwritten.
    """

    baseline_score: float
    candidate_score: float
    threshold: float = 0.95  # Candidate must be >= 95% of baseline
    passed: bool = False  # Derived: candidate_score >= baseline_score * threshold
    detail: str = ""  # Derived: human-readable PASS/FAIL summary

    def __post_init__(self) -> None:
        """Compute ``passed`` and the matching ``detail`` message."""
        self.passed = self.candidate_score >= self.baseline_score * self.threshold
        delta = self.candidate_score - self.baseline_score
        # Both messages show the scaled bar (baseline×threshold). Printing the
        # raw baseline in the PASS case would claim e.g. "0.960 >= 1.000",
        # which is false when the candidate passes only thanks to the threshold.
        bar = f"baseline={self.baseline_score:.3f}×{self.threshold}"
        if self.passed:
            self.detail = f"PASS: candidate={self.candidate_score:.3f} >= {bar} (Δ={delta:+.3f})"
        else:
            self.detail = f"FAIL: candidate={self.candidate_score:.3f} < {bar} (Δ={delta:+.3f})"
class ShadowEvaluator:
    """
    Compares a candidate configuration against the current baseline.

    Usage:
        evaluator = ShadowEvaluator(threshold=0.95)
        # Run baseline
        baseline_scores = [run_eval(baseline_config, case) for case in eval_cases]
        # Run candidate
        candidate_scores = [run_eval(candidate_config, case) for case in eval_cases]
        # Compare
        result = evaluator.compare(baseline_scores, candidate_scores)
        if result.passed:
            promote(candidate_config)
        else:
            rollback()
    """

    def __init__(self, threshold: float = 0.95, rollback_threshold: float = 0.8):
        """
        Args:
            threshold: Fraction of the baseline average the candidate average
                must reach for a comparison to pass.
            rollback_threshold: Fraction of the baseline score below which
                ``should_rollback`` reports True (0.8 == 20% degradation).
        """
        self.threshold = threshold
        self.rollback_threshold = rollback_threshold
        self._history: list[ShadowResult] = []

    def compare(self, baseline_scores: list[float], candidate_scores: list[float]) -> ShadowResult:
        """Compare the mean of each score list. Returns ShadowResult with pass/fail.

        If either list is empty there is nothing to compare: the returned
        result has both scores set to 0, ``passed`` forced to False, and is
        not appended to the history.
        """
        if not baseline_scores or not candidate_scores:
            result = ShadowResult(baseline_score=0, candidate_score=0, threshold=self.threshold)
            # ShadowResult derives passed from 0 >= 0 * threshold, which is
            # True; override it so missing data can never promote a candidate.
            result.passed = False
            result.detail = (
                f"FAIL: no scores to compare "
                f"(baseline n={len(baseline_scores)}, candidate n={len(candidate_scores)})"
            )
            return result

        baseline_avg = sum(baseline_scores) / len(baseline_scores)
        candidate_avg = sum(candidate_scores) / len(candidate_scores)
        result = ShadowResult(
            baseline_score=baseline_avg,
            candidate_score=candidate_avg,
            threshold=self.threshold,
        )
        self._history.append(result)
        return result

    def should_promote(self, result: ShadowResult) -> bool:
        """Explicit promotion decision: True when the result passed."""
        return result.passed

    def should_rollback(self, result: ShadowResult) -> bool:
        """Explicit rollback decision (candidate is significantly worse).

        True when the candidate score is below ``rollback_threshold`` times
        the baseline score.
        """
        return result.candidate_score < result.baseline_score * self.rollback_threshold

    @property
    def history(self) -> list[ShadowResult]:
        """Results recorded by ``compare`` so far (the internal list itself, not a copy)."""
        return self._history