# Sprint 10A (Rohan03): shadow_eval.py — compare candidate vs baseline safely
# Commit bc30484 (verified)
"""
shadow_eval.py — Shadow evaluation: compare candidate changes vs production baseline.
Before any optimization is promoted (new prompt pack, new skill, new adapter),
it must pass shadow evaluation under RunMode.EVAL_TEST (no writes).
Candidate promoted only if: candidate_score >= baseline_score * threshold
Rollback if: candidate degrades below baseline.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
@dataclass
class ShadowResult:
    """Outcome of comparing a candidate's score against the baseline's.

    ``passed`` and ``detail`` are derived in ``__post_init__`` from the
    three score fields; the defaults given here are placeholders only.
    """
    baseline_score: float
    candidate_score: float
    threshold: float = 0.95  # Candidate must be >= 95% of baseline
    passed: bool = False     # Computed: candidate_score >= baseline_score * threshold
    detail: str = ""         # Computed: human-readable PASS/FAIL summary

    def __post_init__(self) -> None:
        # The bar the candidate has to clear.
        required = self.baseline_score * self.threshold
        self.passed = self.candidate_score >= required
        delta = self.candidate_score - self.baseline_score
        if not self.passed:
            self.detail = f"FAIL: candidate={self.candidate_score:.3f} < baseline={self.baseline_score:.3f}×{self.threshold} (Δ={delta:+.3f})"
            return
        self.detail = f"PASS: candidate={self.candidate_score:.3f} >= baseline={self.baseline_score:.3f} (Δ={delta:+.3f})"
class ShadowEvaluator:
    """
    Compares a candidate configuration against the current baseline.

    A candidate passes when its mean score is at least ``threshold``
    (default 95%) of the baseline mean; it should be rolled back when it
    falls below ``rollback_threshold`` (default 80%) of the baseline.

    Usage:
        evaluator = ShadowEvaluator(threshold=0.95)
        # Run baseline
        baseline_scores = [run_eval(baseline_config, case) for case in eval_cases]
        # Run candidate
        candidate_scores = [run_eval(candidate_config, case) for case in eval_cases]
        # Compare
        result = evaluator.compare(baseline_scores, candidate_scores)
        if result.passed:
            promote(candidate_config)
        else:
            rollback()
    """
    def __init__(self, threshold: float = 0.95, rollback_threshold: float = 0.8):
        # threshold: minimum candidate/baseline ratio required for promotion.
        # rollback_threshold: ratio below which the candidate counts as
        # significantly degraded (previously hard-coded as 0.8 in
        # should_rollback; exposed here, default unchanged).
        self.threshold = threshold
        self.rollback_threshold = rollback_threshold
        self._history: list[ShadowResult] = []
    def compare(self, baseline_scores: list[float], candidate_scores: list[float]) -> ShadowResult:
        """Compare aggregate (mean) scores. Returns ShadowResult with pass/fail.

        An empty score list on either side is an explicit FAIL: with no
        evaluation evidence the candidate must not be promoted. (The
        previous behavior returned ShadowResult(0, 0), which *passed*
        because 0 >= 0 * threshold, and was also never recorded in history.)
        """
        if not baseline_scores or not candidate_scores:
            result = ShadowResult(baseline_score=0.0, candidate_score=0.0, threshold=self.threshold)
            # Override the vacuous 0 >= 0 "pass" computed in __post_init__.
            result.passed = False
            result.detail = "FAIL: no scores provided (empty baseline or candidate)"
        else:
            baseline_avg = sum(baseline_scores) / len(baseline_scores)
            candidate_avg = sum(candidate_scores) / len(candidate_scores)
            result = ShadowResult(
                baseline_score=baseline_avg,
                candidate_score=candidate_avg,
                threshold=self.threshold,
            )
        # Every comparison — including the degenerate empty case — is recorded.
        self._history.append(result)
        return result
    def should_promote(self, result: ShadowResult) -> bool:
        """Explicit promotion decision."""
        return result.passed
    def should_rollback(self, result: ShadowResult) -> bool:
        """Explicit rollback decision (candidate is significantly worse)."""
        return result.candidate_score < result.baseline_score * self.rollback_threshold
    @property
    def history(self) -> list[ShadowResult]:
        """All comparison results, in the order they were computed."""
        return self._history