Rohan03 committed
Commit b199fa3 · verified · 1 Parent(s): 12c2cae

Add purpose_agent/types.py

Files changed (1): purpose_agent/types.py (+215 -0)
purpose_agent/types.py ADDED

"""
Core data types for the Purpose Agent framework.

All modules exchange these types — this keeps the architecture modular
and lets you swap out any component without touching the others.
"""

from __future__ import annotations

import json
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from typing import Any


# ---------------------------------------------------------------------------
# State & Action
# ---------------------------------------------------------------------------

@dataclass
class State:
    """
    A snapshot of the environment at a point in time.

    `data` is intentionally a free-form dict — the framework doesn't prescribe
    what "state" looks like. Your environment defines it.

    `summary` is an optional human-readable string (used in LLM prompts).
    If not set, the Purpose Function will serialize `data` to JSON.
    """
    data: dict[str, Any]
    summary: str | None = None
    timestamp: float = field(default_factory=time.time)
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])

    def describe(self) -> str:
        """Return a prompt-friendly description of this state."""
        if self.summary:
            return self.summary
        return json.dumps(self.data, indent=2, default=str)


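# Illustrative usage (a sketch added for this writeup, not part of the
# original commit; the example data is invented):
#
#     s = State(data={"tests_passing": 8, "tests_failing": 2})
#     s.describe()   # no summary set -> returns the JSON dump of `data`
#     s = State(data={}, summary="Repo checked out; 2 tests failing")
#     s.describe()   # -> "Repo checked out; 2 tests failing"

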
@dataclass
class Action:
    """
    An action the Actor decided to take.

    Attributes:
        name: Action identifier (e.g. "search", "move_north", "edit_file")
        params: Parameters for the action
        thought: The Actor's chain-of-thought reasoning for this action
        expected_delta: What the Actor *expects* to change in the state
    """
    name: str
    params: dict[str, Any] = field(default_factory=dict)
    thought: str = ""
    expected_delta: str = ""

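# Illustrative usage (invented example, added for this writeup):
#
#     a = Action(
#         name="edit_file",
#         params={"path": "src/app.py"},
#         thought="The failing test points at a typo in src/app.py.",
#         expected_delta="tests_failing drops from 2 to 1",
#     )
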

# ---------------------------------------------------------------------------
# Trajectory
# ---------------------------------------------------------------------------

@dataclass
class PurposeScore:
    """
    Output of the Purpose Function for a single state transition.

    Inspired by the LATS (arxiv:2310.04406) V(s) formulation:
        score = λ·LM_score + (1-λ)·consistency_score

    Anti-reward-hacking: `evidence` must cite specific observable state
    changes (per SPC, arxiv:2504.19162). The orchestrator can reject scores
    with vague evidence.
    """
    phi_before: float  # Φ(s_current) — purpose-alignment of the old state (higher = closer)
    phi_after: float   # Φ(s_new) — purpose-alignment of the new state
    delta: float       # Φ(s_new) - Φ(s_current) — positive means progress
    reasoning: str     # Step-by-step justification from the LLM
    evidence: str      # Specific observable changes cited as evidence
    confidence: float  # 0.0–1.0 — how confident the critic is

    @property
    def improved(self) -> bool:
        """Did this transition move closer to the purpose?"""
        return self.delta > 0.0


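# Worked example of the V(s)-style formula above (numbers invented;
# `lam`, the LM score, and the consistency score are hypothetical critic
# outputs, not part of this module's API):
#
#     lam = 0.7
#     phi_after = lam * 0.8 + (1 - lam) * 0.6   # = 0.74
#     score = PurposeScore(
#         phi_before=0.50,
#         phi_after=phi_after,
#         delta=phi_after - 0.50,               # +0.24
#         reasoning="Two more tests pass after the edit.",
#         evidence="pytest summary went from 8/10 to 10/10 passing",
#         confidence=0.9,
#     )
#     score.improved   # True, since delta > 0

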
@dataclass
class TrajectoryStep:
    """A single step in a trajectory: state → action → new_state → score."""
    state_before: State
    action: Action
    state_after: State
    score: PurposeScore | None = None
    step_index: int = 0
    wall_time_s: float = 0.0


@dataclass
class Trajectory:
    """
    A complete sequence of steps for one task attempt.

    The Experience Replay module stores these, and the Optimizer extracts
    heuristics from high-reward trajectories.
    """
    task_description: str
    purpose: str
    steps: list[TrajectoryStep] = field(default_factory=list)
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    created_at: float = field(default_factory=time.time)

    @property
    def cumulative_reward(self) -> float:
        """Sum of all positive deltas in the trajectory."""
        return sum(
            s.score.delta for s in self.steps
            if s.score is not None and s.score.delta > 0
        )

    @property
    def total_delta(self) -> float:
        """Net change in Φ across the entire trajectory (negatives included)."""
        return sum(
            s.score.delta for s in self.steps if s.score is not None
        )

    @property
    def success_rate(self) -> float:
        """Fraction of scored steps that improved the state."""
        scored = [s for s in self.steps if s.score is not None]
        if not scored:
            return 0.0
        return sum(1 for s in scored if s.score.improved) / len(scored)

    @property
    def final_phi(self) -> float | None:
        """Final Φ (purpose-alignment) at the end of the trajectory."""
        scored = [s for s in self.steps if s.score is not None]
        if not scored:
            return None
        return scored[-1].score.phi_after


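# Illustrative aggregation (invented numbers): for a trajectory `t` whose
# two scored steps carry delta=+0.3 and delta=-0.1,
#
#     t.cumulative_reward   # 0.3  (positive deltas only)
#     t.total_delta         # 0.2  (net change, negatives included)
#     t.success_rate        # 0.5  (1 improving step out of 2 scored)

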
# ---------------------------------------------------------------------------
# Memory & Heuristics
# ---------------------------------------------------------------------------

class MemoryTier(Enum):
    """
    3-tier memory hierarchy from MUSE (arxiv:2510.08002):

    STRATEGIC: High-level <Dilemma, Strategy> pairs loaded into the system
        prompt. Updated after each completed task via global distillation.

    PROCEDURAL: SOPs (Standard Operating Procedures) indexed by task pattern.
        Loaded lazily — only the index is in context; full SOPs are fetched
        on demand.

    TOOL: Per-action "muscle memory" — dynamic instructions returned with
        each observation. Updated when new action patterns prove effective.
    """
    STRATEGIC = "strategic"
    PROCEDURAL = "procedural"
    TOOL = "tool"


@dataclass
class Heuristic:
    """
    A learned heuristic extracted from a high-reward trajectory.

    Inspired by the CER (arxiv:2506.06698) distillation format:
    - `pattern`: The abstract situation (with {variable} placeholders)
    - `strategy`: What to do in that situation
    - `steps`: Concrete action sequence (a procedural SOP)
    """
    pattern: str           # When to apply (e.g. "When facing {obstacle_type} near {location}")
    strategy: str          # What to do (e.g. "Decompose into sub-tasks, handle {x} first")
    steps: list[str]       # Concrete action sequence
    tier: MemoryTier       # Which memory tier this belongs to
    source_trajectory_id: str = ""
    q_value: float = 0.5   # Learned quality estimate (REMEMBERER-style Q-value)
    times_used: int = 0
    times_succeeded: int = 0
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])

    @property
    def empirical_success_rate(self) -> float:
        if self.times_used == 0:
            return 0.5  # Prior before any usage data
        return self.times_succeeded / self.times_used

    def update_q_value(self, reward: float, alpha: float = 0.1) -> None:
        """
        Monte Carlo Q-value update (from REMEMBERER, arxiv:2306.07929):
            Q_new = Q_old + α * (reward - Q_old)
        """
        self.q_value = self.q_value + alpha * (reward - self.q_value)


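# Worked update (invented numbers): from the default q_value of 0.5, two
# consecutive rewards of 1.0 with the default alpha=0.1 give
#
#     h.update_q_value(1.0)   # 0.5  + 0.1 * (1.0 - 0.5)  -> q_value == 0.55
#     h.update_q_value(1.0)   # 0.55 + 0.1 * (1.0 - 0.55) -> q_value == 0.595
#
# i.e. the estimate moves toward the observed reward at rate alpha.

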
@dataclass
class MemoryRecord:
    """
    A single record in the experience replay buffer.

    Combines the trajectory with its distilled heuristics, plus metadata
    for retrieval ranking (semantic similarity + Q-value, per MemRL
    arxiv:2601.03192).
    """
    trajectory: Trajectory
    heuristics: list[Heuristic] = field(default_factory=list)
    task_embedding: list[float] | None = None  # For semantic retrieval
    retrieval_q_value: float = 0.5             # Learned utility for retrieval ranking
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
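

# How a retriever might combine the two ranking signals above (a sketch under
# assumptions: this file only defines the stored fields, the ranking function
# itself is not part of this commit, and `beta` / `cosine_similarity` are
# hypothetical):
#
#     def retrieval_score(rec: MemoryRecord, query_emb: list[float],
#                         beta: float = 0.5) -> float:
#         sim = cosine_similarity(rec.task_embedding, query_emb)  # hypothetical helper
#         return beta * sim + (1 - beta) * rec.retrieval_q_value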
+ id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])