| """ |
| meta_rewarding.py — Self-improving critic via meta-judge loop. |
| |
From "Meta-Rewarding Language Models" (arXiv:2407.19594):
| The Purpose Function judges agent actions. But who judges the judge? |
| A meta-judge evaluates the Purpose Function's own judgments, creating |
| preference pairs (good judgment vs bad judgment) that improve the critic. |
| |
| Adaptation for Purpose Agent (no weight updates): |
| Instead of DPO fine-tuning, we store high-quality judgment examples |
| as critic_calibration memories. The Purpose Function's prompt gets |
| augmented with these calibration examples, improving scoring quality |
| over time through in-context learning. |
| |
| Meta-judge loop: |
| 1. Purpose Function scores a transition → produces (score, reasoning, evidence) |
| 2. Meta-judge evaluates the judgment: was the reasoning sound? was evidence cited? |
| 3. Good judgments → stored as critic_calibration memories (positive examples) |
| 4. Bad judgments → stored as failure_pattern memories (negative examples) |
| 5. Next time the Purpose Function runs, calibration memories are in its prompt |
| |
| Result: the critic gets better at scoring without any weight updates. |
| """ |
| from __future__ import annotations |
|
|
| import logging |
| from typing import Any |
|
|
| from purpose_agent.llm_backend import LLMBackend, ChatMessage |
| from purpose_agent.types import PurposeScore |
| from purpose_agent.memory import MemoryCard, MemoryKind, MemoryStatus |
| from purpose_agent.v2_types import MemoryScope |
| from purpose_agent.memory_ci import MemoryCI |
|
|
| logger = logging.getLogger(__name__) |
|
|
| META_JUDGE_PROMPT = """\ |
| You are a META-JUDGE. You evaluate the QUALITY of another AI's evaluation. |
| |
| You will see: |
| - A state transition (before → action → after) |
| - The Purpose Function's judgment (Φ scores, reasoning, evidence) |
| |
| Rate the judgment quality on these criteria: |
| 1. EVIDENCE GROUNDING: Did the judgment cite specific, verifiable state changes? (0-10) |
| 2. REASONING COHERENCE: Is the chain of reasoning logically sound? (0-10) |
| 3. CALIBRATION: Are the Φ scores proportional to actual progress? (0-10) |
| 4. ANTI-SYCOPHANCY: Did the judgment avoid inflating scores to be encouraging? (0-10) |
| 5. CONSISTENCY: Would an identical state get the same score? (0-10) |
| |
| Respond with JSON: |
| { |
| "evidence_grounding": <0-10>, |
| "reasoning_coherence": <0-10>, |
| "calibration": <0-10>, |
| "anti_sycophancy": <0-10>, |
| "consistency": <0-10>, |
| "overall": <0-10>, |
| "feedback": "<specific feedback on what was good or bad about this judgment>" |
| } |
| """ |
|
|
| META_JUDGE_SCHEMA = { |
| "type": "object", |
| "properties": { |
| "evidence_grounding": {"type": "number"}, |
| "reasoning_coherence": {"type": "number"}, |
| "calibration": {"type": "number"}, |
| "anti_sycophancy": {"type": "number"}, |
| "consistency": {"type": "number"}, |
| "overall": {"type": "number"}, |
| "feedback": {"type": "string"}, |
| }, |
| "required": ["overall", "feedback"], |
| } |
|
|
|
|
| class MetaRewardingLoop: |
| """ |
| Evaluates and improves the Purpose Function through meta-judgment. |
| |
| Usage: |
| meta = MetaRewardingLoop(meta_llm=strong_model, memory_ci=ci) |
| |
| # After each Purpose Function evaluation: |
| meta.evaluate_judgment( |
| state_before_desc="Position (0,0)", |
| action_desc="move_east", |
| state_after_desc="Position (1,0)", |
| purpose="Reach (4,4)", |
| judgment=purpose_score, |
| ) |
| |
| # Good judgments become calibration examples in memory. |
| # Bad judgments become failure patterns. |
| # Purpose Function improves via in-context learning. |
| """ |
|
|
    def __init__(
        self,
        meta_llm: LLMBackend,
        memory_ci: MemoryCI,
        quality_threshold: float = 7.0,
        failure_threshold: float = 4.0,
    ):
        # quality_threshold: overall score (0-10) at or above which a judgment
        # becomes a positive calibration memory; failure_threshold: score below
        # which it becomes a negative failure-pattern memory.
        self.meta_llm = meta_llm
        self.memory_ci = memory_ci
        self.quality_threshold = quality_threshold
        self.failure_threshold = failure_threshold
        self._eval_log: list[dict] = []
|
|
| def evaluate_judgment( |
| self, |
| state_before_desc: str, |
| action_desc: str, |
| state_after_desc: str, |
| purpose: str, |
| judgment: PurposeScore, |
| trace_id: str = "", |
| ) -> dict[str, Any]: |
| """ |
| Have the meta-judge evaluate a Purpose Function judgment. |
| |
| If the judgment is high quality → create a positive calibration memory. |
| If low quality → create a negative calibration memory. |
| """ |
| context = ( |
| f"Purpose: {purpose}\n" |
| f"State before: {state_before_desc}\n" |
| f"Action: {action_desc}\n" |
| f"State after: {state_after_desc}\n\n" |
| f"Purpose Function's judgment:\n" |
| f" Φ_before={judgment.phi_before:.1f}, Φ_after={judgment.phi_after:.1f}, Δ={judgment.delta:+.2f}\n" |
| f" Confidence: {judgment.confidence:.2f}\n" |
| f" Reasoning: {judgment.reasoning}\n" |
| f" Evidence: {judgment.evidence}" |
| ) |
|
|
| messages = [ |
| ChatMessage(role="system", content=META_JUDGE_PROMPT), |
| ChatMessage(role="user", content=context), |
| ] |
|
|
| try: |
| result = self.meta_llm.generate_structured(messages, schema=META_JUDGE_SCHEMA) |
| except Exception as e: |
| logger.warning(f"Meta-judge failed: {e}") |
| return {"error": str(e)} |
|
|
| overall = float(result.get("overall", 5.0)) |
| feedback = str(result.get("feedback", "")) |
|
|
| log_entry = { |
| "trace_id": trace_id, |
| "overall_quality": overall, |
| "feedback": feedback, |
| "components": {k: result.get(k, 0) for k in META_JUDGE_SCHEMA["properties"] if k not in ("overall", "feedback")}, |
| } |
        self._eval_log.append(log_entry)

        if overall >= self.quality_threshold:
            # High-quality judgment → positive calibration example, scoped to the
            # critic so it reaches the Purpose Function's prompt (matching the
            # failure branch below).
            card = MemoryCard(
                kind=MemoryKind.CRITIC_CALIBRATION,
                status=MemoryStatus.CANDIDATE,
                content=(
                    f"GOOD judgment example (quality={overall:.0f}/10): "
                    f"For Δ={judgment.delta:+.2f}, evidence was: '{judgment.evidence[:200]}'. "
                    f"Meta-judge feedback: {feedback[:200]}"
                ),
                pattern=f"When scoring transitions with delta ~{judgment.delta:+.1f}",
                strategy=f"Follow this example: {judgment.reasoning[:200]}",
                trust_score=min(overall / 10.0, 1.0),
                source_trace_id=trace_id,
                created_by="meta_judge",
                scope=MemoryScope(agent_roles=["critic"]),
            )
            self.memory_ci.submit(card)
            logger.info(f"MetaRewarding: Good judgment (quality={overall:.0f}) → calibration memory")
        elif overall < self.failure_threshold:
            # Low-quality judgment → negative example describing what to avoid.
            card = MemoryCard(
                kind=MemoryKind.FAILURE_PATTERN,
                status=MemoryStatus.CANDIDATE,
                content=(
                    f"BAD judgment example (quality={overall:.0f}/10): "
                    f"Avoid this pattern: {feedback[:300]}"
                ),
                pattern="When scoring state transitions",
                strategy=f"Do NOT: {feedback[:200]}",
                trust_score=0.8,
                source_trace_id=trace_id,
                created_by="meta_judge",
                scope=MemoryScope(agent_roles=["critic"]),
            )
            self.memory_ci.submit(card)
            logger.info(f"MetaRewarding: Bad judgment (quality={overall:.0f}) → failure pattern memory")
|
|
| return log_entry |
|
|
| @property |
| def eval_log(self) -> list[dict]: |
| return self._eval_log |
|
|
| def summary(self) -> dict[str, Any]: |
| if not self._eval_log: |
| return {"evaluations": 0} |
| scores = [e["overall_quality"] for e in self._eval_log] |
| return { |
| "evaluations": len(scores), |
| "avg_quality": round(sum(scores) / len(scores), 2), |
| "min_quality": min(scores), |
| "max_quality": max(scores), |
| } |
|
|