| """ |
| shadow_eval.py — Shadow evaluation: compare candidate changes vs production baseline. |
| |
| Before any optimization is promoted (new prompt pack, new skill, new adapter), |
| it must pass shadow evaluation under RunMode.EVAL_TEST (no writes). |
| |
| Candidate promoted only if: candidate_score >= baseline_score * threshold |
| Rollback if: candidate degrades significantly below baseline (candidate_score < baseline_score * 0.8). |
| """ |
| from __future__ import annotations |
| from dataclasses import dataclass, field |
| from typing import Any |
|
|
|
|
@dataclass
class ShadowResult:
    """Result of comparing a candidate score against a baseline score.

    The verdict is derived from the scores: the candidate passes when
    ``candidate_score >= baseline_score * threshold``.

    Attributes:
        baseline_score: Aggregate score of the baseline.
        candidate_score: Aggregate score of the candidate.
        threshold: Fraction of the baseline score the candidate must reach.
        passed: Verdict; recomputed in ``__post_init__``, so any value
            passed to the constructor is overwritten.
        detail: Human-readable summary of the verdict; also recomputed in
            ``__post_init__``.
    """
    baseline_score: float
    candidate_score: float
    threshold: float = 0.95
    passed: bool = False
    detail: str = ""

    def __post_init__(self):
        """Derive ``passed`` and ``detail`` from the scores and threshold."""
        # NOTE(review): the multiplicative margin only loosens the bar when
        # baseline_score is non-negative; confirm scores are never negative.
        self.passed = self.candidate_score >= self.baseline_score * self.threshold
        delta = self.candidate_score - self.baseline_score
        if self.passed:
            # Show the threshold here as well: a candidate may pass while
            # still being below the raw baseline, and the message must state
            # the inequality that was actually checked.
            self.detail = f"PASS: candidate={self.candidate_score:.3f} >= baseline={self.baseline_score:.3f}×{self.threshold} (Δ={delta:+.3f})"
        else:
            self.detail = f"FAIL: candidate={self.candidate_score:.3f} < baseline={self.baseline_score:.3f}×{self.threshold} (Δ={delta:+.3f})"
|
|
|
|
class ShadowEvaluator:
    """
    Compares a candidate configuration against the current baseline.

    Per-case scores are aggregated with an arithmetic mean and the two means
    are compared through ``ShadowResult``.

    Args:
        threshold: Fraction of the baseline mean the candidate mean must
            reach to pass (forwarded to every ``ShadowResult``).
        rollback_threshold: Fraction of the baseline score below which
            ``should_rollback`` reports a significant degradation.

    Usage:
        evaluator = ShadowEvaluator(threshold=0.95)

        # Run baseline
        baseline_scores = [run_eval(baseline_config, case) for case in eval_cases]

        # Run candidate
        candidate_scores = [run_eval(candidate_config, case) for case in eval_cases]

        # Compare
        result = evaluator.compare(baseline_scores, candidate_scores)
        if evaluator.should_promote(result):
            promote(candidate_config)
        elif evaluator.should_rollback(result):
            rollback()
    """

    def __init__(self, threshold: float = 0.95, rollback_threshold: float = 0.8):
        self.threshold = threshold
        self.rollback_threshold = rollback_threshold
        self._history: list[ShadowResult] = []

    def compare(self, baseline_scores: list[float], candidate_scores: list[float]) -> ShadowResult:
        """Compare aggregate (mean) scores and return a pass/fail result.

        Args:
            baseline_scores: Per-case scores of the baseline run.
            candidate_scores: Per-case scores of the candidate run.

        Returns:
            A ``ShadowResult`` built from the two means. When either list is
            empty, a failing result with zero scores is returned and nothing
            is appended to the history.
        """
        if not baseline_scores or not candidate_scores:
            empty = ShadowResult(baseline_score=0.0, candidate_score=0.0, threshold=self.threshold)
            # ShadowResult derives its verdict from the scores, and
            # 0 >= 0 * threshold is True. An evaluation with no data must
            # never clear the promotion gate, so force a failing verdict.
            empty.passed = False
            empty.detail = "FAIL: no scores to compare (baseline or candidate is empty)"
            return empty

        baseline_avg = sum(baseline_scores) / len(baseline_scores)
        candidate_avg = sum(candidate_scores) / len(candidate_scores)

        result = ShadowResult(
            baseline_score=baseline_avg,
            candidate_score=candidate_avg,
            threshold=self.threshold,
        )
        self._history.append(result)
        return result

    def should_promote(self, result: ShadowResult) -> bool:
        """Explicit promotion decision: promote only when the result passed."""
        return result.passed

    def should_rollback(self, result: ShadowResult) -> bool:
        """Explicit rollback decision (candidate is significantly worse).

        Returns True when the candidate score is below
        ``baseline_score * rollback_threshold``.
        """
        return result.candidate_score < result.baseline_score * self.rollback_threshold

    @property
    def history(self) -> list[ShadowResult]:
        """Results recorded by ``compare``, oldest first.

        This is the evaluator's internal list, not a copy.
        """
        return self._history
|
|