File size: 17,087 Bytes

fedfb2e

"""
Purpose Function — The Critic / State Evaluator.

This is the core innovation: a strictly separated LLM call that evaluates
state improvement Φ(s). It rewards the agent ONLY if Φ(s_new) > Φ(s_current).

Design principles (from literature):
  1. Score AFTER environment feedback, never from expected state alone (LATS)
  2. Require specific observable state changes as evidence (SPC anti-hacking)
  3. Use separate LLM call / separate system prompt from the Actor (MUSE)
  4. Normalize scores to prevent inflation over trajectory (novel addition)
  5. V(s) = λ·LM_score + (1-λ)·consistency_score (LATS formulation)

The Purpose Function is intentionally "non-hackable" by design:
  - It sees the ACTUAL new state, not the Actor's prediction
  - It must cite specific evidence for every score
  - Scores are bounded and normalized
  - The system prompt explicitly guards against sycophancy and vague reasoning
"""

from __future__ import annotations

import json
import logging
from typing import Any

from purpose_agent.types import Action, PurposeScore, State
from purpose_agent.llm_backend import ChatMessage, LLMBackend

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Purpose Function System Prompt — The "Non-Hackable Judge"
# ---------------------------------------------------------------------------

PURPOSE_FUNCTION_SYSTEM_PROMPT = """\
You are a STATE EVALUATOR — a strict, impartial judge of progress toward a goal.
You are NOT the agent. You do NOT help the agent. You ONLY measure progress.

## Your Role
Given a state transition (state_before → action → state_after) and an ultimate purpose,
you compute two scores:
  - Φ(state_before): How far the OLD state was from the purpose (0.0 = no progress, 10.0 = goal achieved)
  - Φ(state_after):  How far the NEW state is from the purpose (same scale)

The delta Φ(state_after) - Φ(state_before) is the ONLY signal the agent receives.

## STRICT RULES — Violation of any rule invalidates your evaluation

1. **EVIDENCE REQUIRED**: Every score MUST cite a specific, observable change in the
   state data. "The state improved" is NOT evidence. "Field 'score' changed from 3 to 7"
   IS evidence. If you cannot cite a specific change, the delta MUST be 0.0.

2. **NO CREDIT FOR INTENTIONS**: The agent's "thought" and "expected_delta" are
   provided for context only. You score based on ACTUAL state changes, never on
   what the agent intended or claimed would happen.

3. **NO SYCOPHANCY**: You are not the agent's friend. Do not inflate scores to be
   encouraging. A lateral move (no improvement) gets delta = 0.0. A regression gets
   negative delta. Be precise.

4. **MONOTONIC SCALE**: Φ = 0.0 means the state has zero progress toward the purpose.
   Φ = 10.0 means the purpose is fully achieved. Intermediate values are proportional.
   Justify WHY you chose each specific value.

5. **ANTI-GAMING**: If the action appears to manipulate the state in a way that
   superficially looks like progress but doesn't genuinely advance the purpose
   (e.g., changing a label without doing the work), score it as delta = 0.0 or negative
   and flag it in your evidence field.

6. **CONSISTENCY**: If a state identical to one you scored before appears again,
   it MUST receive the same Φ score. Progress is objective, not relative to your mood.

7. **CONFIDENCE**: Rate your confidence 0.0–1.0. High confidence (>0.8) requires
   clear, unambiguous evidence. If the state change is ambiguous, lower your confidence.

## Scoring Guide
- Φ = 0.0: No meaningful progress toward the purpose
- Φ = 1.0–3.0: Initial setup/preparation steps completed
- Φ = 4.0–6.0: Substantive progress, key sub-goals partially achieved
- Φ = 7.0–8.0: Most of the purpose is achieved, final steps remaining
- Φ = 9.0: Purpose essentially achieved with minor polish needed
- Φ = 10.0: Purpose fully and completely achieved
"""


PURPOSE_FUNCTION_EVAL_PROMPT = """\
## Ultimate Purpose
{purpose}

## State BEFORE Action
{state_before}

## Action Taken
Name: {action_name}
Parameters: {action_params}
Agent's Thought: {action_thought}
Agent's Prediction: {expected_delta}

## State AFTER Action (this is the ACTUAL result — score based on THIS)
{state_after}

Evaluate this state transition. Remember:
- Score Φ(state_before) and Φ(state_after) on the 0.0–10.0 scale
- Cite SPECIFIC evidence from the state data
- Do NOT give credit for intentions — only actual changes
"""


# ---------------------------------------------------------------------------
# Purpose Function Schema (for structured output)
# ---------------------------------------------------------------------------

PURPOSE_SCORE_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "phi_before": {
            "type": "number",
            "minimum": 0.0,
            "maximum": 10.0,
            "description": "Φ(state_before) — distance-to-purpose of the state before the action",
        },
        "phi_after": {
            "type": "number",
            "minimum": 0.0,
            "maximum": 10.0,
            "description": "Φ(state_after) — distance-to-purpose of the state after the action",
        },
        "reasoning": {
            "type": "string",
            "description": "Step-by-step justification for both scores (max 200 words)",
        },
        "evidence": {
            "type": "string",
            "description": "Specific observable state changes that justify the delta (REQUIRED)",
        },
        "confidence": {
            "type": "number",
            "minimum": 0.0,
            "maximum": 1.0,
            "description": "Confidence in this evaluation (0.0 = pure guess, 1.0 = certain)",
        },
    },
    "required": ["phi_before", "phi_after", "reasoning", "evidence", "confidence"],
}


# ---------------------------------------------------------------------------
# Purpose Function Class
# ---------------------------------------------------------------------------

class PurposeFunction:
    """
    The Critic — evaluates state transitions via Φ(s) scoring.
    
    Uses a SEPARATE LLM call from the Actor to prevent self-confirmation bias
    (per MUSE's Reflect Agent design, arxiv:2510.08002).
    
    Can optionally use a different model than the Actor (recommended for
    production — use a stronger model as the critic).
    
    Args:
        llm: LLM backend (can be same or different from Actor's)
        score_cache_size: Max entries in the Φ score cache (for consistency)
        require_evidence: If True, reject scores with empty evidence
        min_confidence: Minimum confidence threshold — below this, score is discarded
    """

    def __init__(
        self,
        llm: LLMBackend,
        score_cache_size: int = 1000,
        require_evidence: bool = True,
        min_confidence: float = 0.3,
    ):
        self.llm = llm
        self.require_evidence = require_evidence
        self.min_confidence = min_confidence
        # Cache: state_hash → Φ score (for consistency rule #6)
        self._phi_cache: dict[str, float] = {}
        self._cache_size = score_cache_size
        # Running stats for normalization
        self._score_history: list[float] = []

    # ------------------------------------------------------------------
    # Core Evaluation
    # ------------------------------------------------------------------

    def evaluate(
        self,
        state_before: State,
        action: Action,
        state_after: State,
        purpose: str,
    ) -> PurposeScore:
        """
        Evaluate a state transition: did the action move closer to the purpose?
        
        Returns a PurposeScore with phi_before, phi_after, delta, reasoning,
        evidence, and confidence.
        
        Anti-hacking measures:
        1. Scores based on ACTUAL state_after (not actor's expected_delta)
        2. Evidence is required — vague scores are rejected
        3. Cached Φ values enforce consistency
        4. Confidence threshold filters uncertain evaluations
        """
        # Check cache for consistency (Rule #6)
        cached_before = self._get_cached_phi(state_before)
        cached_after = self._get_cached_phi(state_after)

        # Build evaluation prompt
        messages = [
            ChatMessage(role="system", content=PURPOSE_FUNCTION_SYSTEM_PROMPT),
            ChatMessage(role="user", content=PURPOSE_FUNCTION_EVAL_PROMPT.format(
                purpose=purpose,
                state_before=state_before.describe(),
                state_after=state_after.describe(),
                action_name=action.name,
                action_params=json.dumps(action.params, default=str),
                action_thought=action.thought,
                expected_delta=action.expected_delta,
            )),
        ]

        # Get structured evaluation from LLM
        try:
            raw_score = self.llm.generate_structured(
                messages, schema=PURPOSE_SCORE_SCHEMA, temperature=0.2
            )
        except Exception as e:
            logger.error(f"Purpose Function structured output failed: {e}")
            # Fall back to text-based evaluation
            raw_score = self._fallback_evaluate(messages)

        # Extract and validate scores
        phi_before = float(raw_score.get("phi_before", 0.0))
        phi_after = float(raw_score.get("phi_after", 0.0))
        reasoning = str(raw_score.get("reasoning", ""))
        evidence = str(raw_score.get("evidence", ""))
        confidence = float(raw_score.get("confidence", 0.5))

        # Clamp to valid range
        phi_before = max(0.0, min(10.0, phi_before))
        phi_after = max(0.0, min(10.0, phi_after))
        confidence = max(0.0, min(1.0, confidence))

        # Apply anti-hacking rules
        phi_before, phi_after, confidence = self._apply_safeguards(
            phi_before, phi_after, evidence, confidence,
            cached_before, cached_after,
        )

        delta = phi_after - phi_before

        # Update caches
        self._cache_phi(state_before, phi_before)
        self._cache_phi(state_after, phi_after)
        self._score_history.append(phi_after)

        score = PurposeScore(
            phi_before=phi_before,
            phi_after=phi_after,
            delta=delta,
            reasoning=reasoning,
            evidence=evidence,
            confidence=confidence,
        )

        logger.info(
            f"Purpose Function: Φ({phi_before:.1f}) → Φ({phi_after:.1f}), "
            f"Δ={delta:+.2f}, conf={confidence:.2f}, improved={score.improved}"
        )
        return score

    # ------------------------------------------------------------------
    # Anti-Hacking Safeguards
    # ------------------------------------------------------------------

    def _apply_safeguards(
        self,
        phi_before: float,
        phi_after: float,
        evidence: str,
        confidence: float,
        cached_before: float | None,
        cached_after: float | None,
    ) -> tuple[float, float, float]:
        """
        Apply anti-reward-hacking safeguards.
        
        1. Evidence requirement: no evidence → delta forced to 0
        2. Cache consistency: if we've scored this state before, use cached value
        3. Confidence threshold: low confidence → reduce delta magnitude
        4. Anomaly detection: suspiciously large jumps get confidence penalty
        """
        # Rule 1: Require evidence
        if self.require_evidence and len(evidence.strip()) < 10:
            logger.warning("Purpose Function: Insufficient evidence, forcing delta=0")
            phi_after = phi_before  # No credit without evidence
            confidence = max(confidence, 0.1)

        # Rule 2: Cache consistency (allow small drift for scoring noise)
        if cached_before is not None:
            drift = abs(phi_before - cached_before)
            if drift > 1.0:
                logger.warning(
                    f"Purpose Function: Inconsistent Φ_before "
                    f"(new={phi_before:.1f}, cached={cached_before:.1f}), "
                    f"using cached value"
                )
                phi_before = cached_before

        if cached_after is not None:
            drift = abs(phi_after - cached_after)
            if drift > 1.0:
                logger.warning(
                    f"Purpose Function: Inconsistent Φ_after "
                    f"(new={phi_after:.1f}, cached={cached_after:.1f}), "
                    f"using cached value"
                )
                phi_after = cached_after

        # Rule 3: Confidence threshold
        if confidence < self.min_confidence:
            logger.warning(
                f"Purpose Function: Low confidence ({confidence:.2f}), "
                f"reducing delta magnitude by 50%"
            )
            midpoint = (phi_before + phi_after) / 2
            phi_after = midpoint + (phi_after - midpoint) * 0.5

        # Rule 4: Anomaly detection — flag suspiciously large single-step jumps
        delta = phi_after - phi_before
        if abs(delta) > 3.0:
            logger.warning(
                f"Purpose Function: Unusually large delta ({delta:+.1f}), "
                f"applying confidence penalty"
            )
            confidence = min(confidence, 0.5)

        return phi_before, phi_after, confidence

    # ------------------------------------------------------------------
    # Caching
    # ------------------------------------------------------------------

    def _state_hash(self, state: State) -> str:
        """Hash a state for cache lookup (based on data content)."""
        return json.dumps(state.data, sort_keys=True, default=str)

    def _get_cached_phi(self, state: State) -> float | None:
        return self._phi_cache.get(self._state_hash(state))

    def _cache_phi(self, state: State, phi: float) -> None:
        key = self._state_hash(state)
        if len(self._phi_cache) >= self._cache_size:
            # Evict oldest (FIFO — good enough for our use case)
            oldest_key = next(iter(self._phi_cache))
            del self._phi_cache[oldest_key]
        self._phi_cache[key] = phi

    # ------------------------------------------------------------------
    # Normalization (prevent score inflation over long trajectories)
    # ------------------------------------------------------------------

    def get_normalized_phi(self, raw_phi: float) -> float:
        """
        Normalize a Φ score relative to the trajectory's score distribution.
        
        Prevents the common failure mode where LLM scores drift upward over
        a trajectory regardless of actual progress.
        """
        if len(self._score_history) < 3:
            return raw_phi

        mean = sum(self._score_history) / len(self._score_history)
        variance = sum((x - mean) ** 2 for x in self._score_history) / len(self._score_history)
        std = max(variance ** 0.5, 0.1)  # Avoid division by zero

        # Z-score normalization mapped back to 0-10
        z = (raw_phi - mean) / std
        normalized = 5.0 + z * 2.0  # Center at 5, spread by 2
        return max(0.0, min(10.0, normalized))

    def reset_trajectory_stats(self) -> None:
        """Reset per-trajectory normalization stats. Call at trajectory start."""
        self._score_history = []

    # ------------------------------------------------------------------
    # Fallback
    # ------------------------------------------------------------------

    def _fallback_evaluate(self, messages: list[ChatMessage]) -> dict[str, Any]:
        """Text-based fallback when structured output is unavailable."""
        raw = self.llm.generate(messages, temperature=0.2)

        import re

        phi_before = 0.0
        phi_after = 0.0

        # Try to extract scores from text
        before_match = re.search(r'[Φφ]\s*\(?state_?before\)?\s*[=:]\s*([\d.]+)', raw, re.IGNORECASE)
        after_match = re.search(r'[Φφ]\s*\(?state_?after\)?\s*[=:]\s*([\d.]+)', raw, re.IGNORECASE)

        if before_match:
            phi_before = float(before_match.group(1))
        if after_match:
            phi_after = float(after_match.group(1))

        # Also try "Score: X/10" patterns
        if not before_match:
            score_matches = re.findall(r'(\d+\.?\d*)\s*/?\s*10', raw)
            if len(score_matches) >= 2:
                phi_before = float(score_matches[0])
                phi_after = float(score_matches[1])
            elif len(score_matches) == 1:
                phi_after = float(score_matches[0])

        confidence_match = re.search(r'confidence\s*[=:]\s*([\d.]+)', raw, re.IGNORECASE)
        confidence = float(confidence_match.group(1)) if confidence_match else 0.4

        return {
            "phi_before": phi_before,
            "phi_after": phi_after,
            "reasoning": raw[:500],
            "evidence": raw[500:800] if len(raw) > 500 else "",
            "confidence": confidence,
        }