""" Core data types for the Purpose Agent framework. All modules exchange these types — this keeps the architecture modular and lets you swap out any component without touching the others. NOTE: sre_patches.py monkey-patches Trajectory properties at import time with even more robust None guards. The versions here are the baseline; SRE patches are the authoritative runtime versions. """ from __future__ import annotations import time import uuid from dataclasses import dataclass, field from enum import Enum from typing import Any # --------------------------------------------------------------------------- # State & Action # --------------------------------------------------------------------------- @dataclass class State: """ A snapshot of the environment at a point in time. `data` is intentionally a free-form dict — the framework doesn't prescribe what "state" looks like. Your environment defines it. `summary` is an optional human-readable string (used in LLM prompts). If not set, the Purpose Function will serialize `data` to JSON. """ data: dict[str, Any] summary: str | None = None timestamp: float = field(default_factory=time.time) id: str = field(default_factory=lambda: uuid.uuid4().hex[:12]) def describe(self) -> str: """Return a prompt-friendly description of this state.""" if self.summary: return self.summary import json return json.dumps(self.data, indent=2, default=str) @dataclass class Action: """ An action the Actor decided to take. Attributes: name: Action identifier (e.g. "search", "move_north", "edit_file") params: Parameters for the action thought: The Actor's chain-of-thought reasoning for this action expected_delta: What the Actor *expects* to change in the state """ name: str params: dict[str, Any] = field(default_factory=dict) thought: str = "" expected_delta: str = "" # --------------------------------------------------------------------------- # Trajectory # --------------------------------------------------------------------------- @dataclass class PurposeScore: """ Output of the Purpose Function for a single state transition. Inspired by LATS (arxiv:2310.04406) V(s) formulation: score = λ·LM_score + (1-λ)·consistency_score Anti-reward-hacking: `evidence` must cite specific observable state changes (per SPC arxiv:2504.19162). The orchestrator can reject scores with vague evidence. """ phi_before: float # Φ(s_current) — distance-to-purpose of old state phi_after: float # Φ(s_new) — distance-to-purpose of new state delta: float # Φ(s_new) - Φ(s_current) — the improvement signal reasoning: str # Step-by-step justification from the LLM evidence: str # Specific observable changes cited as evidence confidence: float # 0.0–1.0 — how confident the critic is @property def improved(self) -> bool: """Did this transition move closer to the purpose?""" return self.delta > 0.0 @dataclass class TrajectoryStep: """A single step in a trajectory: state → action → new_state → score.""" state_before: State action: Action state_after: State score: PurposeScore | None = None step_index: int = 0 wall_time_s: float = 0.0 @dataclass class Trajectory: """ A complete sequence of steps for one task attempt. The Experience Replay module stores these and the Optimizer extracts heuristics from high-reward trajectories. NOTE: sre_patches.py replaces these properties with more thorough None guards. The patched versions check both `s.score is not None` AND `s.score.delta is not None`. """ task_description: str purpose: str steps: list[TrajectoryStep] = field(default_factory=list) id: str = field(default_factory=lambda: uuid.uuid4().hex[:12]) created_at: float = field(default_factory=time.time) @property def cumulative_reward(self) -> float: """Sum of all positive deltas in the trajectory (None-safe).""" total = 0.0 for s in self.steps: if s.score is not None and s.score.delta is not None and s.score.delta > 0: total += s.score.delta return total @property def total_delta(self) -> float: """Net state improvement across the entire trajectory (None-safe).""" total = 0.0 for s in self.steps: if s.score is not None and s.score.delta is not None: total += s.score.delta return total @property def success_rate(self) -> float: """Fraction of steps that improved state (None-safe).""" scored = [s for s in self.steps if s.score is not None and s.score.delta is not None] if not scored: return 0.0 return sum(1 for s in scored if s.score.improved) / len(scored) @property def final_phi(self) -> float | None: """Final Φ value (state-distance-to-goal) at end of trajectory (None-safe).""" scored = [s for s in self.steps if s.score is not None] if not scored: return None return scored[-1].score.phi_after # --------------------------------------------------------------------------- # Memory & Heuristics # --------------------------------------------------------------------------- class MemoryTier(Enum): """ 3-tier memory hierarchy from MUSE (arxiv:2510.08002): STRATEGIC: High-level pairs loaded into system prompt. Updated after each completed task via global distillation. PROCEDURAL: SOPs (Standard Operating Procedures) indexed by task pattern. Loaded lazily — only the index is in context, full SOPs fetched on demand. TOOL: Per-action "muscle memory" — dynamic instructions returned with each observation. Updated when new action patterns prove effective. """ STRATEGIC = "strategic" PROCEDURAL = "procedural" TOOL = "tool" @dataclass class Heuristic: """ A learned heuristic extracted from a high-reward trajectory. Inspired by CER (arxiv:2506.06698) distillation format: - `pattern`: The abstract situation (with {variable} placeholders) - `strategy`: What to do in that situation - `steps`: Concrete action sequence (procedural SOP) """ pattern: str # When to apply (e.g. "When facing {obstacle_type} near {location}") strategy: str # What to do (e.g. "Decompose into sub-tasks, handle {x} first") steps: list[str] # Concrete action sequence tier: MemoryTier # Which memory tier this belongs to source_trajectory_id: str = "" q_value: float = 0.5 # Learned quality estimate (REMEMBERER-style Q-value) times_used: int = 0 times_succeeded: int = 0 id: str = field(default_factory=lambda: uuid.uuid4().hex[:12]) @property def empirical_success_rate(self) -> float: if self.times_used == 0: return 0.5 # Prior return self.times_succeeded / self.times_used def update_q_value(self, reward: float, alpha: float = 0.1) -> None: """ Monte Carlo Q-value update (from REMEMBERER arxiv:2306.07929): Q_new = Q_old + α * (reward - Q_old) """ self.q_value = self.q_value + alpha * (reward - self.q_value) @dataclass class MemoryRecord: """ A single record in the experience replay buffer. Combines the trajectory with its distilled heuristics and metadata for retrieval ranking (semantic similarity + Q-value, per MemRL arxiv:2601.03192). """ trajectory: Trajectory heuristics: list[Heuristic] = field(default_factory=list) task_embedding: list[float] | None = None # For semantic retrieval retrieval_q_value: float = 0.5 # Learned utility for retrieval ranking id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])