"""
shadow_eval.py — Shadow evaluation: compare candidate changes vs production baseline.

Before any optimization is promoted (new prompt pack, new skill, new adapter),
it must pass shadow evaluation under RunMode.EVAL_TEST (no writes).

Candidate promoted only if:
    candidate_score >= baseline_score * threshold

Rollback if:
    candidate_score < baseline_score * rollback_threshold
    (default 0.8, i.e. the candidate degrades by more than 20%).

A comparison with no scores on either side never passes.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


@dataclass
class ShadowResult:
    """Result of comparing candidate vs baseline.

    ``passed`` and ``detail`` are derived from the scores and the threshold in
    ``__post_init__``; any value supplied for them to the constructor is
    overwritten.
    """

    baseline_score: float
    candidate_score: float
    threshold: float = 0.95  # Candidate must be >= 95% of baseline
    passed: bool = False
    detail: str = ""

    def __post_init__(self):
        """Compute the pass/fail verdict and a human-readable summary."""
        self.passed = self.candidate_score >= self.baseline_score * self.threshold
        delta = self.candidate_score - self.baseline_score
        verdict, relation = ("PASS", ">=") if self.passed else ("FAIL", "<")
        # Both verdicts show the bar that was actually applied
        # (baseline×threshold): a passing candidate may still be below the raw
        # baseline, so printing "candidate >= baseline" would be untrue.
        self.detail = (
            f"{verdict}: candidate={self.candidate_score:.3f} {relation} "
            f"baseline={self.baseline_score:.3f}×{self.threshold} (Δ={delta:+.3f})"
        )


class ShadowEvaluator:
    """
    Compares a candidate configuration against the current baseline.

    Usage:
        evaluator = ShadowEvaluator(threshold=0.95)

        # Run baseline
        baseline_scores = [run_eval(baseline_config, case) for case in eval_cases]

        # Run candidate
        candidate_scores = [run_eval(candidate_config, case) for case in eval_cases]

        # Compare
        result = evaluator.compare(baseline_scores, candidate_scores)
        if result.passed:
            promote(candidate_config)
        else:
            rollback()
    """

    def __init__(self, threshold: float = 0.95, rollback_threshold: float = 0.8):
        """
        Args:
            threshold: Fraction of the baseline average the candidate average
                must reach for a comparison to pass.
            rollback_threshold: Fraction of the baseline score below which
                ``should_rollback`` returns True. The default of 0.8 means a
                degradation of more than 20%.
        """
        self.threshold = threshold
        self.rollback_threshold = rollback_threshold
        self._history: list[ShadowResult] = []

    def compare(self, baseline_scores: list[float], candidate_scores: list[float]) -> ShadowResult:
        """Compare aggregate (mean) scores.

        Args:
            baseline_scores: Per-case scores of the baseline configuration.
            candidate_scores: Per-case scores of the candidate configuration.

        Returns:
            A ShadowResult built from the two means and this evaluator's
            threshold. If either list is empty, a failing result with both
            scores set to 0 is returned and nothing is added to the history.
        """
        if not baseline_scores or not candidate_scores:
            result = ShadowResult(baseline_score=0, candidate_score=0, threshold=self.threshold)
            # __post_init__ evaluates 0 >= 0 * threshold, which is True; an
            # evaluation without data must never be reported as a pass.
            result.passed = False
            result.detail = (
                "FAIL: no scores to compare "
                f"(baseline cases={len(baseline_scores or ())}, "
                f"candidate cases={len(candidate_scores or ())})"
            )
            return result

        baseline_avg = sum(baseline_scores) / len(baseline_scores)
        candidate_avg = sum(candidate_scores) / len(candidate_scores)

        result = ShadowResult(
            baseline_score=baseline_avg,
            candidate_score=candidate_avg,
            threshold=self.threshold,
        )
        self._history.append(result)
        return result

    def should_promote(self, result: ShadowResult) -> bool:
        """Explicit promotion decision: True when the result passed."""
        return result.passed

    def should_rollback(self, result: ShadowResult) -> bool:
        """Explicit rollback decision (candidate is significantly worse).

        Returns True when the candidate score is strictly below
        ``baseline_score * rollback_threshold``.
        """
        return result.candidate_score < result.baseline_score * self.rollback_threshold

    @property
    def history(self) -> list[ShadowResult]:
        """Results of all non-empty comparisons, oldest first.

        This is the evaluator's own list, not a copy; mutating it changes the
        recorded history.
        """
        return self._history