File size: 3,081 Bytes
bc30484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
shadow_eval.py — Shadow evaluation: compare candidate changes vs production baseline.

Before any optimization is promoted (new prompt pack, new skill, new adapter),
it must pass shadow evaluation under RunMode.EVAL_TEST (no writes).

Candidate promoted only if: candidate_score >= baseline_score * threshold
Rollback if: candidate degrades below baseline.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any


@dataclass
class ShadowResult:
    """Outcome of one candidate-vs-baseline shadow comparison.

    `passed` and `detail` are derived fields: `__post_init__` recomputes
    both from the scores and threshold, overwriting anything the caller
    supplied for them.
    """
    baseline_score: float
    candidate_score: float
    threshold: float = 0.95  # Candidate must be >= 95% of baseline
    passed: bool = False     # derived: did the candidate clear the threshold?
    detail: str = ""         # derived: human-readable verdict line

    def __post_init__(self):
        floor = self.baseline_score * self.threshold
        self.passed = self.candidate_score >= floor
        delta = self.candidate_score - self.baseline_score
        cand = f"candidate={self.candidate_score:.3f}"
        base = f"baseline={self.baseline_score:.3f}"
        if self.passed:
            self.detail = f"PASS: {cand} >= {base} (Δ={delta:+.3f})"
        else:
            self.detail = f"FAIL: {cand} < {base}×{self.threshold} (Δ={delta:+.3f})"


class ShadowEvaluator:
    """
    Compares a candidate configuration against the current baseline.
    
    Usage:
        evaluator = ShadowEvaluator(threshold=0.95)
        
        # Run baseline
        baseline_scores = [run_eval(baseline_config, case) for case in eval_cases]
        
        # Run candidate
        candidate_scores = [run_eval(candidate_config, case) for case in eval_cases]
        
        # Compare
        result = evaluator.compare(baseline_scores, candidate_scores)
        if result.passed:
            promote(candidate_config)
        else:
            rollback()
    """

    def __init__(self, threshold: float = 0.95):
        # Minimum fraction of the baseline score the candidate must reach
        # to pass (e.g. 0.95 == candidate must score >= 95% of baseline).
        self.threshold = threshold
        self._history: list[ShadowResult] = []  # every comparison, in order

    def compare(self, baseline_scores: list[float], candidate_scores: list[float]) -> ShadowResult:
        """Compare mean scores. Records and returns a ShadowResult.

        An empty baseline or candidate list is a hard FAIL: with no data
        there is no evidence the candidate matches the baseline, so it
        must never be promoted.
        """
        if not baseline_scores or not candidate_scores:
            result = ShadowResult(
                baseline_score=0.0,
                candidate_score=0.0,
                threshold=self.threshold,
            )
            # BUG FIX: ShadowResult.__post_init__ computes 0 >= 0 * threshold,
            # which is True — an empty evaluation would silently PASS. Force
            # a failure and record it like every other comparison.
            result.passed = False
            result.detail = "FAIL: empty baseline or candidate scores — nothing to compare"
            self._history.append(result)
            return result

        baseline_avg = sum(baseline_scores) / len(baseline_scores)
        candidate_avg = sum(candidate_scores) / len(candidate_scores)

        result = ShadowResult(
            baseline_score=baseline_avg,
            candidate_score=candidate_avg,
            threshold=self.threshold,
        )
        self._history.append(result)
        return result

    def should_promote(self, result: ShadowResult) -> bool:
        """Explicit promotion decision: promote only if the comparison passed."""
        return result.passed

    def should_rollback(self, result: ShadowResult) -> bool:
        """Explicit rollback decision (candidate is significantly worse)."""
        # Rollback threshold is intentionally looser than the promotion
        # threshold: only a >20% degradation triggers an active rollback.
        return result.candidate_score < result.baseline_score * 0.8

    @property
    def history(self) -> list[ShadowResult]:
        """All ShadowResults recorded by this evaluator, oldest first."""
        return self._history