"""
meta_rewarding.py — Self-improving critic via meta-judge loop.

From Meta-Rewarding Language Models (arXiv:2407.19594):
  The Purpose Function judges agent actions. But who judges the judge?
  A meta-judge evaluates the Purpose Function's own judgments, creating
  preference pairs (good judgment vs bad judgment) that improve the critic.

Adaptation for Purpose Agent (no weight updates):
  Instead of DPO fine-tuning, we store high-quality judgment examples
  as critic_calibration memories. The Purpose Function's prompt gets
  augmented with these calibration examples, improving scoring quality
  over time through in-context learning.

  Meta-judge loop:
    1. Purpose Function scores a transition → produces (score, reasoning, evidence)
    2. Meta-judge evaluates the judgment: was the reasoning sound? was evidence cited?
    3. Good judgments → stored as critic_calibration memories (positive examples)
    4. Bad judgments → stored as failure_pattern memories (negative examples)
    5. Next time the Purpose Function runs, calibration memories are in its prompt

  Result: the critic gets better at scoring without any weight updates.
"""
from __future__ import annotations

import logging
from typing import Any

from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.types import PurposeScore
from purpose_agent.memory import MemoryCard, MemoryKind, MemoryStatus
from purpose_agent.v2_types import MemoryScope
from purpose_agent.memory_ci import MemoryCI

logger = logging.getLogger(__name__)

META_JUDGE_PROMPT = """\
You are a META-JUDGE. You evaluate the QUALITY of another AI's evaluation.

You will see:
- A state transition (before → action → after)
- The Purpose Function's judgment (Φ scores, reasoning, evidence)

Rate the judgment quality on these criteria:
1. EVIDENCE GROUNDING: Did the judgment cite specific, verifiable state changes? (0-10)
2. REASONING COHERENCE: Is the chain of reasoning logically sound? (0-10)
3. CALIBRATION: Are the Φ scores proportional to actual progress? (0-10)
4. ANTI-SYCOPHANCY: Did the judgment avoid inflating scores to be encouraging? (0-10)
5. CONSISTENCY: Would an identical transition get the same score? (0-10)

Respond with JSON:
{
  "evidence_grounding": <0-10>,
  "reasoning_coherence": <0-10>,
  "calibration": <0-10>,
  "anti_sycophancy": <0-10>,
  "consistency": <0-10>,
  "overall": <0-10>,
  "feedback": "<specific feedback on what was good or bad about this judgment>"
}
"""

META_JUDGE_SCHEMA = {
    "type": "object",
    "properties": {
        "evidence_grounding": {"type": "number"},
        "reasoning_coherence": {"type": "number"},
        "calibration": {"type": "number"},
        "anti_sycophancy": {"type": "number"},
        "consistency": {"type": "number"},
        "overall": {"type": "number"},
        "feedback": {"type": "string"},
    },
    "required": ["overall", "feedback"],
}


class MetaRewardingLoop:
    """
    Evaluates and improves the Purpose Function through meta-judgment.

    Usage:
        meta = MetaRewardingLoop(meta_llm=strong_model, memory_ci=ci)

        # After each Purpose Function evaluation:
        meta.evaluate_judgment(
            state_before_desc="Position (0,0)",
            action_desc="move_east",
            state_after_desc="Position (1,0)",
            purpose="Reach (4,4)",
            judgment=purpose_score,
        )

        # Good judgments become calibration examples in memory.
        # Bad judgments become failure patterns.
        # Purpose Function improves via in-context learning.
    """

    def __init__(
        self,
        meta_llm: LLMBackend,
        memory_ci: MemoryCI,
        quality_threshold: float = 7.0,
    ):
        self.meta_llm = meta_llm
        self.memory_ci = memory_ci
        self.quality_threshold = quality_threshold
        self._eval_log: list[dict] = []

    def evaluate_judgment(
        self,
        state_before_desc: str,
        action_desc: str,
        state_after_desc: str,
        purpose: str,
        judgment: PurposeScore,
        trace_id: str = "",
    ) -> dict[str, Any]:
        """
        Have the meta-judge evaluate a Purpose Function judgment.

        If the judgment is high quality → create a positive calibration memory.
        If low quality → create a negative calibration memory.
        """
        context = (
            f"Purpose: {purpose}\n"
            f"State before: {state_before_desc}\n"
            f"Action: {action_desc}\n"
            f"State after: {state_after_desc}\n\n"
            f"Purpose Function's judgment:\n"
            f"  Φ_before={judgment.phi_before:.1f}, Φ_after={judgment.phi_after:.1f}, Δ={judgment.delta:+.2f}\n"
            f"  Confidence: {judgment.confidence:.2f}\n"
            f"  Reasoning: {judgment.reasoning}\n"
            f"  Evidence: {judgment.evidence}"
        )

        messages = [
            ChatMessage(role="system", content=META_JUDGE_PROMPT),
            ChatMessage(role="user", content=context),
        ]

        try:
            result = self.meta_llm.generate_structured(messages, schema=META_JUDGE_SCHEMA)
        except Exception as e:
            logger.warning(f"Meta-judge failed: {e}")
            return {"error": str(e)}

        overall = float(result.get("overall", 5.0))
        feedback = str(result.get("feedback", ""))

        log_entry = {
            "trace_id": trace_id,
            "overall_quality": overall,
            "feedback": feedback,
            "components": {k: result.get(k, 0) for k in META_JUDGE_SCHEMA["properties"] if k not in ("overall", "feedback")},
        }
        self._eval_log.append(log_entry)

        # Create calibration memory
        if overall >= self.quality_threshold:
            card = MemoryCard(
                kind=MemoryKind.CRITIC_CALIBRATION,
                status=MemoryStatus.CANDIDATE,
                content=(
                    f"GOOD judgment example (quality={overall:.0f}/10): "
                    f"For Δ={judgment.delta:+.2f}, evidence was: '{judgment.evidence[:200]}'. "
                    f"Meta-judge feedback: {feedback[:200]}"
                ),
                pattern=f"When scoring transitions with delta ~{judgment.delta:+.1f}",
                strategy=f"Follow this example: {judgment.reasoning[:200]}",
                trust_score=min(overall / 10.0, 1.0),
                source_trace_id=trace_id,
                created_by="meta_judge",
            )
            self.memory_ci.submit(card)
            logger.info(f"MetaRewarding: Good judgment (quality={overall:.0f}) → calibration memory")
        elif overall < 4.0:
            card = MemoryCard(
                kind=MemoryKind.FAILURE_PATTERN,
                status=MemoryStatus.CANDIDATE,
                content=(
                    f"BAD judgment example (quality={overall:.0f}/10): "
                    f"Avoid this pattern: {feedback[:300]}"
                ),
                pattern="When scoring state transitions",
                strategy=f"Do NOT: {feedback[:200]}",
                trust_score=0.8,
                source_trace_id=trace_id,
                created_by="meta_judge",
                scope=MemoryScope(agent_roles=["critic"]),
            )
            self.memory_ci.submit(card)
            logger.info(f"MetaRewarding: Bad judgment (quality={overall:.0f}) → failure pattern memory")

        return log_entry

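    # Sketch of step 5 (in-context calibration). Hypothetical helper, not
    # part of the original module: it assumes the caller has already
    # retrieved the critic_calibration cards, since MemoryCI's query API
    # is not shown in this file.
    def calibration_prompt_block(self, cards: list[MemoryCard]) -> str:
        """Render calibration memories into a prompt section for the critic."""
        lines = ["Calibration examples from past judgments:"]
        for card in cards:
            # Each card pairs a trigger pattern with a scoring strategy.
            lines.append(f"- {card.pattern}: {card.strategy}")
        return "\n".join(lines)
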
    @property
    def eval_log(self) -> list[dict]:
        return self._eval_log

    def summary(self) -> dict[str, Any]:
        if not self._eval_log:
            return {"evaluations": 0}
        scores = [e["overall_quality"] for e in self._eval_log]
        return {
            "evaluations": len(scores),
            "avg_quality": round(sum(scores) / len(scores), 2),
            "min_quality": min(scores),
            "max_quality": max(scores),
        }
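

# Minimal smoke-test sketch (illustrative only, not part of the module's
# production surface). It assumes LLMBackend and MemoryCI can be swapped
# for duck-typed stubs, and stands in for PurposeScore with a
# SimpleNamespace carrying the same fields; a real run would pass actual
# backend and memory objects instead.
class _StubMetaLLM:
    """Returns a canned meta-judgment instead of calling a model."""

    def generate_structured(self, messages, schema):
        return {
            "evidence_grounding": 9,
            "reasoning_coherence": 8,
            "calibration": 8,
            "anti_sycophancy": 9,
            "consistency": 8,
            "overall": 8.5,
            "feedback": "Cited the exact position change; scores proportional.",
        }


class _StubMemoryCI:
    """Records submitted cards instead of persisting them."""

    def __init__(self):
        self.cards: list[MemoryCard] = []

    def submit(self, card: MemoryCard) -> None:
        self.cards.append(card)


if __name__ == "__main__":
    from types import SimpleNamespace

    ci = _StubMemoryCI()
    meta = MetaRewardingLoop(meta_llm=_StubMetaLLM(), memory_ci=ci)
    judgment = SimpleNamespace(  # duck-typed stand-in for a real PurposeScore
        phi_before=1.0,
        phi_after=2.0,
        delta=1.0,
        confidence=0.9,
        reasoning="Moving east reduces Manhattan distance to (4,4) by 1.",
        evidence="Position changed from (0,0) to (1,0).",
    )
    entry = meta.evaluate_judgment(
        state_before_desc="Position (0,0)",
        action_desc="move_east",
        state_after_desc="Position (1,0)",
        purpose="Reach (4,4)",
        judgment=judgment,
        trace_id="demo",
    )
    print(entry)  # overall quality 8.5 → one calibration memory submitted
    print(f"cards submitted: {len(ci.cards)}")
    print(meta.summary())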