"""
meta_rewarding.py — Self-improving critic via meta-judge loop.

From Meta-Rewarding LLMs (arxiv:2407.19594):
  The Purpose Function judges agent actions. But who judges the judge?
  A meta-judge evaluates the Purpose Function's own judgments,
  creating preference pairs (good judgment vs bad judgment) that improve the critic.

Adaptation for Purpose Agent (no weight updates):
  Instead of DPO fine-tuning, we store high-quality judgment examples as
  critic_calibration memories. The Purpose Function's prompt gets augmented
  with these calibration examples, improving scoring quality over time
  through in-context learning.

Meta-judge loop:
  1. Purpose Function scores a transition → produces (score, reasoning, evidence)
  2. Meta-judge evaluates the judgment: was the reasoning sound? was evidence cited?
  3. Good judgments → stored as critic_calibration memories (positive examples)
  4. Bad judgments → stored as failure_pattern memories (negative examples)
  5. Next time the Purpose Function runs, calibration memories are in its prompt

Result: the critic gets better at scoring without any weight updates.
"""
from __future__ import annotations

import json
import logging
import math
from collections.abc import Mapping
from typing import Any

from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.types import PurposeScore
from purpose_agent.memory import MemoryCard, MemoryKind, MemoryStatus
from purpose_agent.v2_types import MemoryScope
from purpose_agent.memory_ci import MemoryCI

logger = logging.getLogger(__name__)

# Bounds of the rating scale the meta-judge is asked to use.
_MIN_QUALITY = 0.0
_MAX_QUALITY = 10.0
# Neutral score assumed when the meta-judge omits "overall" entirely.
_DEFAULT_QUALITY = 5.0

META_JUDGE_PROMPT = """\
You are a META-JUDGE. You evaluate the QUALITY of another AI's evaluation.

You will see:
- A state transition (before → action → after)
- The Purpose Function's judgment (Φ scores, reasoning, evidence)

Rate the judgment quality on these criteria:
1. EVIDENCE GROUNDING: Did the judgment cite specific, verifiable state changes? (0-10)
2. REASONING COHERENCE: Is the chain of reasoning logically sound? (0-10)
3. CALIBRATION: Are the Φ scores proportional to actual progress? (0-10)
4. ANTI-SYCOPHANCY: Did the judgment avoid inflating scores to be encouraging? (0-10)
5. CONSISTENCY: Would an identical state get the same score? (0-10)

Respond with JSON:
{
  "evidence_grounding": <0-10>,
  "reasoning_coherence": <0-10>,
  "calibration": <0-10>,
  "anti_sycophancy": <0-10>,
  "consistency": <0-10>,
  "overall": <0-10>,
  "feedback": "<brief explanation of what was good or bad about the judgment>"
}
"""

META_JUDGE_SCHEMA = {
    "type": "object",
    "properties": {
        "evidence_grounding": {"type": "number"},
        "reasoning_coherence": {"type": "number"},
        "calibration": {"type": "number"},
        "anti_sycophancy": {"type": "number"},
        "consistency": {"type": "number"},
        "overall": {"type": "number"},
        "feedback": {"type": "string"},
    },
    "required": ["overall", "feedback"],
}

# Schema fields that are per-criterion sub-scores (everything except the
# aggregate score and the free-text feedback).
_COMPONENT_KEYS = tuple(
    key for key in META_JUDGE_SCHEMA["properties"] if key not in ("overall", "feedback")
)


def _parse_quality(raw: Any) -> float | None:
    """Convert a meta-judge score to a float clamped to the 0-10 scale.

    Returns ``None`` when *raw* cannot be interpreted as a finite number
    (e.g. ``None``, ``"8/10"``, ``NaN``), so the caller can treat the
    response as malformed instead of crashing or storing garbage.
    """
    try:
        score = float(raw)
    except (TypeError, ValueError):
        return None
    if not math.isfinite(score):
        return None
    return max(_MIN_QUALITY, min(score, _MAX_QUALITY))


class MetaRewardingLoop:
    """
    Evaluates and improves the Purpose Function through meta-judgment.

    Usage:
        meta = MetaRewardingLoop(meta_llm=strong_model, memory_ci=ci)

        # After each Purpose Function evaluation:
        meta.evaluate_judgment(
            state_before_desc="Position (0,0)",
            action_desc="move_east",
            state_after_desc="Position (1,0)",
            purpose="Reach (4,4)",
            judgment=purpose_score,
        )

        # Good judgments become calibration examples in memory.
        # Bad judgments become failure patterns.
        # Purpose Function improves via in-context learning.
    """

    def __init__(
        self,
        meta_llm: LLMBackend,
        memory_ci: MemoryCI,
        quality_threshold: float = 7.0,
        failure_threshold: float = 4.0,
    ):
        """
        Args:
            meta_llm: Backend used as the meta-judge; must provide
                ``generate_structured(messages, schema=...)``.
            memory_ci: Receives candidate memory cards via ``submit``.
            quality_threshold: Judgments rated at or above this (0-10) are
                stored as positive calibration examples.
            failure_threshold: Judgments rated strictly below this (0-10)
                are stored as failure patterns. If it exceeds
                ``quality_threshold``, the positive branch takes precedence.
        """
        self.meta_llm = meta_llm
        self.memory_ci = memory_ci
        self.quality_threshold = quality_threshold
        self.failure_threshold = failure_threshold
        self._eval_log: list[dict] = []

    def evaluate_judgment(
        self,
        state_before_desc: str,
        action_desc: str,
        state_after_desc: str,
        purpose: str,
        judgment: PurposeScore,
        trace_id: str = "",
    ) -> dict[str, Any]:
        """
        Have the meta-judge evaluate a Purpose Function judgment.

        Outcome by overall quality (0-10):
          - ``>= quality_threshold`` → a CRITIC_CALIBRATION candidate memory
            (positive example) is submitted.
          - ``< failure_threshold`` → a FAILURE_PATTERN candidate memory
            (negative example) is submitted.
          - otherwise → the evaluation is only logged; no memory is created.

        Returns:
            The log entry (``trace_id``, ``overall_quality``, ``feedback``,
            ``components``) that was appended to the evaluation log, or
            ``{"error": <message>}`` if the meta-judge call failed or its
            response was unusable. Error results are not logged.
        """
        context = (
            f"Purpose: {purpose}\n"
            f"State before: {state_before_desc}\n"
            f"Action: {action_desc}\n"
            f"State after: {state_after_desc}\n\n"
            f"Purpose Function's judgment:\n"
            f"  Φ_before={judgment.phi_before:.1f}, Φ_after={judgment.phi_after:.1f}, Δ={judgment.delta:+.2f}\n"
            f"  Confidence: {judgment.confidence:.2f}\n"
            f"  Reasoning: {judgment.reasoning}\n"
            f"  Evidence: {judgment.evidence}"
        )
        messages = [
            ChatMessage(role="system", content=META_JUDGE_PROMPT),
            ChatMessage(role="user", content=context),
        ]

        # Meta-judging is best-effort: a backend failure must never break the
        # agent loop, so any exception is downgraded to an error result.
        try:
            result = self.meta_llm.generate_structured(messages, schema=META_JUDGE_SCHEMA)
        except Exception as e:
            logger.warning("Meta-judge failed: %s", e)
            return {"error": str(e)}

        if not isinstance(result, Mapping):
            message = f"Meta-judge returned {type(result).__name__}, expected a JSON object"
            logger.warning("Meta-judge failed: %s", message)
            return {"error": message}

        overall = _parse_quality(result.get("overall", _DEFAULT_QUALITY))
        if overall is None:
            message = f"Meta-judge returned non-numeric overall score: {result.get('overall')!r}"
            logger.warning("Meta-judge failed: %s", message)
            return {"error": message}

        # `or ""` keeps a null feedback from being stored as the text "None".
        feedback = str(result.get("feedback") or "")

        log_entry = {
            "trace_id": trace_id,
            "overall_quality": overall,
            "feedback": feedback,
            "components": {key: result.get(key, 0) for key in _COMPONENT_KEYS},
        }
        self._eval_log.append(log_entry)

        # Create calibration memory
        if overall >= self.quality_threshold:
            self.memory_ci.submit(self._calibration_card(judgment, overall, feedback, trace_id))
            logger.info(
                "MetaRewarding: Good judgment (quality=%.0f) → calibration memory", overall
            )
        elif overall < self.failure_threshold:
            self.memory_ci.submit(self._failure_card(overall, feedback, trace_id))
            logger.info(
                "MetaRewarding: Bad judgment (quality=%.0f) → failure pattern memory", overall
            )

        return log_entry

    @staticmethod
    def _calibration_card(
        judgment: PurposeScore, overall: float, feedback: str, trace_id: str
    ) -> MemoryCard:
        """Build the positive-example memory card for a well-rated judgment."""
        # NOTE(review): unlike the failure card, no `scope` is set here —
        # confirm whether calibration memories should also be limited to the
        # critic role.
        return MemoryCard(
            kind=MemoryKind.CRITIC_CALIBRATION,
            status=MemoryStatus.CANDIDATE,
            content=(
                f"GOOD judgment example (quality={overall:.0f}/10): "
                f"For Δ={judgment.delta:+.2f}, evidence was: '{judgment.evidence[:200]}'. "
                f"Meta-judge feedback: {feedback[:200]}"
            ),
            pattern=f"When scoring transitions with delta ~{judgment.delta:+.1f}",
            strategy=f"Follow this example: {judgment.reasoning[:200]}",
            # `overall` is already clamped to 0-10, so this lands in [0, 1].
            trust_score=overall / _MAX_QUALITY,
            source_trace_id=trace_id,
            created_by="meta_judge",
        )

    @staticmethod
    def _failure_card(overall: float, feedback: str, trace_id: str) -> MemoryCard:
        """Build the negative-example memory card for a poorly-rated judgment."""
        return MemoryCard(
            kind=MemoryKind.FAILURE_PATTERN,
            status=MemoryStatus.CANDIDATE,
            content=(
                f"BAD judgment example (quality={overall:.0f}/10): "
                f"Avoid this pattern: {feedback[:300]}"
            ),
            pattern="When scoring state transitions",
            strategy=f"Do NOT: {feedback[:200]}",
            trust_score=0.8,
            source_trace_id=trace_id,
            created_by="meta_judge",
            scope=MemoryScope(agent_roles=["critic"]),
        )

    @property
    def eval_log(self) -> list[dict]:
        """The list of successful evaluation entries (the live internal list, not a copy)."""
        return self._eval_log

    def summary(self) -> dict[str, Any]:
        """Return count, mean (rounded to 2 dp), min and max of logged overall quality.

        With no evaluations logged, returns ``{"evaluations": 0}`` only.
        """
        if not self._eval_log:
            return {"evaluations": 0}
        scores = [entry["overall_quality"] for entry in self._eval_log]
        return {
            "evaluations": len(scores),
            "avg_quality": round(sum(scores) / len(scores), 2),
            "min_quality": min(scores),
            "max_quality": max(scores),
        }