"""
Core data types for the Purpose Agent framework.

All modules exchange these types — this keeps the architecture modular
and lets you swap out any component without touching the others.

NOTE: sre_patches.py monkey-patches the Trajectory properties at import
time with additional None guards. The versions defined here are the
baseline; the SRE-patched versions are authoritative at runtime.
"""

from __future__ import annotations

import json
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from typing import Any


# ---------------------------------------------------------------------------
# State & Action
# ---------------------------------------------------------------------------

@dataclass
class State:
    """
    A snapshot of the environment at a point in time.
    
    `data` is intentionally a free-form dict — the framework doesn't prescribe
    what "state" looks like. Your environment defines it.
    
    `summary` is an optional human-readable string (used in LLM prompts).
    If not set, the Purpose Function will serialize `data` to JSON.
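
    Example (illustrative values):

        >>> s = State(data={"tests_passing": 3}, summary="3 of 5 tests passing")
        >>> s.describe()
        '3 of 5 tests passing'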
    """
    data: dict[str, Any]
    summary: str | None = None
    timestamp: float = field(default_factory=time.time)
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])

    def describe(self) -> str:
        """Return a prompt-friendly description of this state."""
        if self.summary:
            return self.summary
        return json.dumps(self.data, indent=2, default=str)


@dataclass
class Action:
    """
    An action the Actor decided to take.
    
    Attributes:
        name: Action identifier (e.g. "search", "move_north", "edit_file")
        params: Parameters for the action
        thought: The Actor's chain-of-thought reasoning for this action
        expected_delta: What the Actor *expects* to change in the state
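
    Example (illustrative values):

        >>> a = Action(name="search",
        ...            params={"query": "installation docs"},
        ...            thought="Need more context before editing",
        ...            expected_delta="search_results populated in state")
        >>> a.name
        'search'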
    """
    name: str
    params: dict[str, Any] = field(default_factory=dict)
    thought: str = ""
    expected_delta: str = ""


# ---------------------------------------------------------------------------
# Trajectory
# ---------------------------------------------------------------------------

@dataclass
class PurposeScore:
    """
    Output of the Purpose Function for a single state transition.
    
    Inspired by the LATS (arxiv:2310.04406) V(s) formulation:
      score = λ·LM_score + (1-λ)·consistency_score
    
    Anti-reward-hacking: `evidence` must cite specific observable state changes
    (per SPC arxiv:2504.19162). The orchestrator can reject scores with vague evidence.
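
    Worked example of the blend (illustrative numbers; λ = 0.5 assumed):
    LM_score 0.8, consistency_score 0.6 → 0.5·0.8 + 0.5·0.6 = 0.70.

        >>> s = PurposeScore(phi_before=0.40, phi_after=0.55, delta=0.15,
        ...                  reasoning="two more tests pass after the edit",
        ...                  evidence="pytest output went from 3 to 5 passed",
        ...                  confidence=0.8)
        >>> s.improved
        True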
    """
    phi_before: float       # Φ(s_current)  — proximity-to-purpose of old state (higher = closer)
    phi_after: float        # Φ(s_new)      — proximity-to-purpose of new state
    delta: float            # Φ(s_new) - Φ(s_current) — the improvement signal
    reasoning: str          # Step-by-step justification from the LLM
    evidence: str           # Specific observable changes cited as evidence
    confidence: float       # 0.0–1.0 — how confident the critic is

    @property
    def improved(self) -> bool:
        """Did this transition move closer to the purpose?"""
        return self.delta > 0.0


@dataclass
class TrajectoryStep:
    """A single step in a trajectory: state → action → new_state → score."""
    state_before: State
    action: Action
    state_after: State
    score: PurposeScore | None = None
    step_index: int = 0
    wall_time_s: float = 0.0


@dataclass
class Trajectory:
    """
    A complete sequence of steps for one task attempt.
    
    The Experience Replay module stores these and the Optimizer extracts
    heuristics from high-reward trajectories.
    
    NOTE: sre_patches.py replaces these properties at import time (see the
    module docstring). The baselines below already guard against both
    `s.score is None` and `s.score.delta is None`.
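
    Example (illustrative; an unscored trajectory returns safe defaults):

        >>> t = Trajectory(task_description="demo", purpose="demo")
        >>> (t.cumulative_reward, t.success_rate, t.final_phi)
        (0.0, 0.0, None)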
    """
    task_description: str
    purpose: str
    steps: list[TrajectoryStep] = field(default_factory=list)
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    created_at: float = field(default_factory=time.time)

    @property
    def cumulative_reward(self) -> float:
        """Sum of all positive deltas in the trajectory (None-safe)."""
        total = 0.0
        for s in self.steps:
            if s.score is not None and s.score.delta is not None and s.score.delta > 0:
                total += s.score.delta
        return total

    @property
    def total_delta(self) -> float:
        """Net state improvement across the entire trajectory (None-safe)."""
        total = 0.0
        for s in self.steps:
            if s.score is not None and s.score.delta is not None:
                total += s.score.delta
        return total

    @property
    def success_rate(self) -> float:
        """Fraction of steps that improved state (None-safe)."""
        scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]
        if not scored:
            return 0.0
        return sum(1 for s in scored if s.score.improved) / len(scored)

    @property
    def final_phi(self) -> float | None:
        """Final Φ value (state-distance-to-goal) at end of trajectory (None-safe)."""
        scored = [s for s in self.steps if s.score is not None]
        if not scored:
            return None
        return scored[-1].score.phi_after


# ---------------------------------------------------------------------------
# Memory & Heuristics
# ---------------------------------------------------------------------------

class MemoryTier(Enum):
    """
    3-tier memory hierarchy from MUSE (arxiv:2510.08002):
    
    STRATEGIC: High-level <Dilemma, Strategy> pairs loaded into system prompt.
               Updated after each completed task via global distillation.
    
    PROCEDURAL: SOPs (Standard Operating Procedures) indexed by task pattern.
                Loaded lazily — only the index is in context, full SOPs fetched on demand.
    
    TOOL: Per-action "muscle memory" — dynamic instructions returned with
          each observation. Updated when new action patterns prove effective.
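
    Example (value round-trip, e.g. when loading stored heuristics):

        >>> MemoryTier("procedural") is MemoryTier.PROCEDURAL
        True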
    """
    STRATEGIC = "strategic"
    PROCEDURAL = "procedural"
    TOOL = "tool"


@dataclass
class Heuristic:
    """
    A learned heuristic extracted from a high-reward trajectory.
    
    Inspired by the CER (arxiv:2506.06698) distillation format:
    - `pattern`: The abstract situation (with {variable} placeholders)
    - `strategy`: What to do in that situation
    - `steps`: Concrete action sequence (procedural SOP)
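
    Example (illustrative heuristic; the contents are made up):

        >>> h = Heuristic(
        ...     pattern="When facing {locked_door} near {exit}",
        ...     strategy="Locate the matching key before trying the door",
        ...     steps=["search adjacent rooms", "take key", "unlock door"],
        ...     tier=MemoryTier.PROCEDURAL)
        >>> h.empirical_success_rate  # unused so far, so the 0.5 prior applies
        0.5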
    """
    pattern: str            # When to apply (e.g. "When facing {obstacle_type} near {location}")
    strategy: str           # What to do (e.g. "Decompose into sub-tasks, handle {x} first")
    steps: list[str]        # Concrete action sequence
    tier: MemoryTier        # Which memory tier this belongs to
    source_trajectory_id: str = ""
    q_value: float = 0.5    # Learned quality estimate (REMEMBERER-style Q-value)
    times_used: int = 0
    times_succeeded: int = 0
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])

    @property
    def empirical_success_rate(self) -> float:
        if self.times_used == 0:
            return 0.5  # Prior
        return self.times_succeeded / self.times_used

    def update_q_value(self, reward: float, alpha: float = 0.1) -> None:
        """
        Monte Carlo Q-value update (from REMEMBERER arxiv:2306.07929):
        Q_new = Q_old + α * (reward - Q_old)
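
        Example (illustrative; `h` is a throwaway heuristic):

            >>> h = Heuristic(pattern="p", strategy="s", steps=[],
            ...               tier=MemoryTier.TOOL)
            >>> h.update_q_value(reward=1.0, alpha=0.1)
            >>> round(h.q_value, 2)  # 0.5 + 0.1 * (1.0 - 0.5)
            0.55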
        """
        self.q_value = self.q_value + alpha * (reward - self.q_value)


@dataclass
class MemoryRecord:
    """
    A single record in the experience replay buffer.
    
    Combines the trajectory with its distilled heuristics and metadata
    for retrieval ranking (semantic similarity + Q-value, per MemRL arxiv:2601.03192).
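
    Example (illustrative ranking signal; the 50/50 blend is an assumption
    for demonstration, not something this module prescribes):

        >>> r = MemoryRecord(trajectory=Trajectory(task_description="t", purpose="p"))
        >>> similarity = 0.8  # cosine similarity, computed by the retriever
        >>> 0.5 * similarity + 0.5 * r.retrieval_q_value
        0.65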
    """
    trajectory: Trajectory
    heuristics: list[Heuristic] = field(default_factory=list)
    task_embedding: list[float] | None = None  # For semantic retrieval
    retrieval_q_value: float = 0.5  # Learned utility for retrieval ranking
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])