File size: 5,567 Bytes
d1da4ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
state_delta.py — Markovian State-Delta Evaluation.

PROBLEM: Passing full conversation history to the Critic costs O(N²) tokens
as the trajectory grows. This makes long tasks unaffordable.

SOLUTION: Recognize this IS a Markov Decision Process. The Critic only needs:
  - The delta between state(t) and state(t+1)
  - The action taken
  - The purpose (constant)

This reduces the Critic's input to ~300 tokens regardless of trajectory length.

Mathematical basis:
  In an MDP, the value of state s' depends only on s' itself, not on the path
  taken to reach it. Therefore Φ(s') can be computed from s' alone.
  
  We compute: diff = state_diff(s_t, s_{t+1})
  Critic sees: (purpose, action, diff) → Φ score
  Token cost: O(1) per step, not O(N)
"""
from __future__ import annotations

import difflib
import json
from dataclasses import dataclass
from typing import Any

from purpose_agent.types import State


@dataclass
class StateDelta:
    """
    Minimal difference between two consecutive states.

    This delta — not the full conversation history — is everything the
    Critic consumes, which keeps the token cost roughly constant
    (~100-300 tokens) no matter how long the trajectory grows.
    """
    added_keys: dict[str, Any]      # New keys in s_{t+1} that weren't in s_t
    removed_keys: list[str]          # Keys in s_t that are gone in s_{t+1}
    changed_keys: dict[str, tuple[Any, Any]]  # key → (old_value, new_value)
    summary_diff: str                # Human-readable diff (what the Critic sees)
    token_estimate: int = 0          # Estimated tokens for this delta

    @property
    def change_magnitude(self) -> int:
        """Total count of touched keys — handy for detecting no-ops."""
        return sum(map(len, (self.added_keys, self.removed_keys, self.changed_keys)))

    @property
    def is_empty(self) -> bool:
        """True when no observable state change occurred."""
        return self.change_magnitude == 0


def compute_state_delta(s_before: State, s_after: State) -> StateDelta:
    """
    Compute the minimal diff between two states.

    This is the core O(1) optimization: instead of passing both full states
    to the Critic (which grows with trajectory), we pass only what changed.

    Keys beginning with "_" are treated as internal bookkeeping and ignored.
    When the structured diff finds nothing but the free-text summaries
    differ, a text diff of the summaries is used instead.

    Args:
        s_before: State at time t.
        s_after: State at time t+1.

    Returns:
        StateDelta describing additions, removals, and value changes, plus
        a human-readable summary and a rough token estimate (~4 chars/token).
    """
    d_before = s_before.data
    d_after = s_after.data

    added: dict[str, Any] = {}
    removed: list[str] = []
    changed: dict[str, tuple[Any, Any]] = {}

    # Additions and value changes.
    for key, val in d_after.items():
        if key.startswith("_"):  # Skip internal keys
            continue
        if key not in d_before:
            added[key] = val
        elif d_before[key] != val:
            changed[key] = (d_before[key], val)

    # Removals.
    for key in d_before:
        if not key.startswith("_") and key not in d_after:
            removed.append(key)

    # Build human-readable summary (what the Critic actually sees).
    lines = []
    for k, v in added.items():
        lines.append(f"+ {k} = {_truncate(v)}")
    for k in removed:
        lines.append(f"- {k} (removed)")
    for k, (old, new) in changed.items():
        # BUG FIX: old and new values previously ran together with no
        # separator; add an explicit arrow so the Critic can tell them apart.
        lines.append(f"~ {k}: {_truncate(old)} → {_truncate(new)}")

    # Fall back to a text diff of the summaries when available.
    if s_before.summary and s_after.summary and s_before.summary != s_after.summary:
        text_diff = _text_diff(s_before.summary, s_after.summary)
        if text_diff and not lines:
            lines.append(text_diff)

    summary = "\n".join(lines) if lines else "(no observable change)"
    token_est = len(summary) // 4  # heuristic: ~4 chars per token

    return StateDelta(
        added_keys=added,
        removed_keys=removed,
        changed_keys=changed,
        summary_diff=summary,
        token_estimate=token_est,
    )


def format_critic_input(
    purpose: str,
    action_name: str,
    action_thought: str,
    delta: StateDelta,
    max_tokens: int = 300,
) -> str:
    """
    Format the minimal input for the Markovian Critic.

    Total budget: ~300 tokens. This is ALL the Critic gets — no history,
    no full state. Just purpose + action + what changed. This works because
    in an MDP, Φ(s') depends only on s', not on the path taken to reach it.

    Args:
        purpose: The agent's goal (clipped to 100 chars).
        action_name: Name of the action just taken.
        action_thought: The agent's reasoning (clipped to 80 chars).
        delta: The state change to evaluate (summary clipped to 400 chars).
        max_tokens: Hard cap on the output, assuming ~4 chars per token.

    Returns:
        The formatted prompt text, truncated to the token budget.
    """
    text = "\n".join(
        [
            f"PURPOSE: {purpose[:100]}",
            f"ACTION: {action_name}",
            f"THOUGHT: {action_thought[:80]}",
            f"STATE CHANGE:\n{delta.summary_diff[:400]}",
        ]
    )

    # Enforce the hard character budget.
    char_budget = max_tokens * 4
    if len(text) > char_budget:
        text = text[:char_budget] + "\n...(truncated)"

    return text


def _truncate(val: Any, max_len: int = 80) -> str:
    """Truncate a value for display."""
    s = str(val)
    return s[:max_len] + "..." if len(s) > max_len else s


def _text_diff(before: str, after: str) -> str:
    """Compute a concise text diff between two summaries."""
    if before == after:
        return ""

    # Use unified diff for short texts
    before_lines = before.split("\n")
    after_lines = after.split("\n")

    diff_lines = []
    for line in difflib.unified_diff(before_lines, after_lines, lineterm="", n=0):
        if line.startswith("---") or line.startswith("+++") or line.startswith("@@"):
            continue
        if line.startswith("+"):
            diff_lines.append(f"+ {line[1:]}")
        elif line.startswith("-"):
            diff_lines.append(f"- {line[1:]}")

    return "\n".join(diff_lines[:5])  # Cap at 5 diff lines