# Sprint 10A (Rohan03): shadow_eval.py — compare candidate vs baseline safely
# Commit bc30484 (verified)
"""
shadow_eval.py — Shadow evaluation: compare candidate changes vs production baseline.
Before any optimization is promoted (new prompt pack, new skill, new adapter),
it must pass shadow evaluation under RunMode.EVAL_TEST (no writes).
Candidate promoted only if: candidate_score >= baseline_score * threshold
Rollback if: candidate degrades below baseline.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass
class ShadowResult:
    """Outcome of comparing a candidate's score against the baseline's.

    ``passed`` and ``detail`` are derived in ``__post_init__`` from the
    three score fields; the defaults given here are placeholders only.
    """
    baseline_score: float
    candidate_score: float
    threshold: float = 0.95  # Candidate must be >= 95% of baseline
    passed: bool = False     # Computed: candidate_score >= baseline_score * threshold
    detail: str = ""         # Computed: human-readable PASS/FAIL summary

    def __post_init__(self) -> None:
        # The bar the candidate has to clear.
        required = self.baseline_score * self.threshold
        self.passed = self.candidate_score >= required
        delta = self.candidate_score - self.baseline_score
        if not self.passed:
            self.detail = f"FAIL: candidate={self.candidate_score:.3f} < baseline={self.baseline_score:.3f}×{self.threshold} (Δ={delta:+.3f})"
            return
        self.detail = f"PASS: candidate={self.candidate_score:.3f} >= baseline={self.baseline_score:.3f} (Δ={delta:+.3f})"
class ShadowEvaluator:
    """
    Compares a candidate configuration against the current baseline.

    A candidate passes when its mean score is at least ``threshold``
    (default 95%) of the baseline mean; it should be rolled back when it
    falls below ``rollback_threshold`` (default 80%) of the baseline.

    Usage:
        evaluator = ShadowEvaluator(threshold=0.95)
        # Run baseline
        baseline_scores = [run_eval(baseline_config, case) for case in eval_cases]
        # Run candidate
        candidate_scores = [run_eval(candidate_config, case) for case in eval_cases]
        # Compare
        result = evaluator.compare(baseline_scores, candidate_scores)
        if result.passed:
            promote(candidate_config)
        else:
            rollback()
    """
    def __init__(self, threshold: float = 0.95, rollback_threshold: float = 0.8):
        # threshold: minimum candidate/baseline ratio required for promotion.
        # rollback_threshold: ratio below which the candidate counts as
        # significantly degraded (previously hard-coded as 0.8 in
        # should_rollback; exposed here, default unchanged).
        self.threshold = threshold
        self.rollback_threshold = rollback_threshold
        self._history: list[ShadowResult] = []
    def compare(self, baseline_scores: list[float], candidate_scores: list[float]) -> ShadowResult:
        """Compare aggregate (mean) scores. Returns ShadowResult with pass/fail.

        An empty score list on either side is an explicit FAIL: with no
        evaluation evidence the candidate must not be promoted. (The
        previous behavior returned ShadowResult(0, 0), which *passed*
        because 0 >= 0 * threshold, and was also never recorded in history.)
        """
        if not baseline_scores or not candidate_scores:
            result = ShadowResult(baseline_score=0.0, candidate_score=0.0, threshold=self.threshold)
            # Override the vacuous 0 >= 0 "pass" computed in __post_init__.
            result.passed = False
            result.detail = "FAIL: no scores provided (empty baseline or candidate)"
        else:
            baseline_avg = sum(baseline_scores) / len(baseline_scores)
            candidate_avg = sum(candidate_scores) / len(candidate_scores)
            result = ShadowResult(
                baseline_score=baseline_avg,
                candidate_score=candidate_avg,
                threshold=self.threshold,
            )
        # Every comparison — including the degenerate empty case — is recorded.
        self._history.append(result)
        return result
    def should_promote(self, result: ShadowResult) -> bool:
        """Explicit promotion decision."""
        return result.passed
    def should_rollback(self, result: ShadowResult) -> bool:
        """Explicit rollback decision (candidate is significantly worse)."""
        return result.candidate_score < result.baseline_score * self.rollback_threshold
    @property
    def history(self) -> list[ShadowResult]:
        """All comparison results, in the order they were computed."""
        return self._history