| """ |
| Core data types for the Purpose Agent framework. |
| |
| All modules exchange these types — this keeps the architecture modular |
| and lets you swap out any component without touching the others. |
| |
| NOTE: sre_patches.py monkey-patches Trajectory properties at import time |
| with even more robust None guards. The versions here are the baseline; |
| SRE patches are the authoritative runtime versions. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import time |
| import uuid |
| from dataclasses import dataclass, field |
| from enum import Enum |
| from typing import Any |
|
|
|
|
| |
| |
| |
|
|


@dataclass
class State:
    """
    A snapshot of the environment at a point in time.

    `data` is intentionally a free-form dict — the framework doesn't prescribe
    what "state" looks like. Your environment defines it.

    `summary` is an optional human-readable string (used in LLM prompts).
    If not set, the Purpose Function will serialize `data` to JSON.
    """
    data: dict[str, Any]
    summary: str | None = None
    timestamp: float = field(default_factory=time.time)
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])

    def describe(self) -> str:
        """Return a prompt-friendly description of this state."""
        if self.summary:
            return self.summary
        return json.dumps(self.data, indent=2, default=str)
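
# Illustrative usage (example values only; not part of the framework API):
#
#     state = State(
#         data={"open_issues": 3, "tests_passing": False},
#         summary="3 issues open, test suite failing",
#     )
#     state.describe()  # returns the summary; without one, `data` is dumped as JSON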


@dataclass
class Action:
    """
    An action the Actor decided to take.

    Attributes:
        name: Action identifier (e.g. "search", "move_north", "edit_file")
        params: Parameters for the action
        thought: The Actor's chain-of-thought reasoning for this action
        expected_delta: What the Actor *expects* to change in the state
    """
    name: str
    params: dict[str, Any] = field(default_factory=dict)
    thought: str = ""
    expected_delta: str = ""


@dataclass
class PurposeScore:
    """
    Output of the Purpose Function for a single state transition.

    Inspired by LATS (arxiv:2310.04406) V(s) formulation:
        score = λ·LM_score + (1-λ)·consistency_score

    Anti-reward-hacking: `evidence` must cite specific observable state changes
    (per SPC arxiv:2504.19162). The orchestrator can reject scores with vague evidence.
    """
    phi_before: float
    phi_after: float
    delta: float
    reasoning: str
    evidence: str
    confidence: float

    @property
    def improved(self) -> bool:
        """Did this transition move closer to the purpose?"""
        return self.delta > 0.0
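
# A minimal sketch of how a Purpose Function might fill in a PurposeScore, assuming
# Φ is a value in [0, 1] (higher = closer to the purpose), delta = phi_after - phi_before,
# and the λ-blending follows the LATS-style formula cited above (all names illustrative):
#
#     lam = 0.7
#     phi_before = lam * lm_score_before + (1 - lam) * consistency_before
#     phi_after = lam * lm_score_after + (1 - lam) * consistency_after
#     score = PurposeScore(
#         phi_before=phi_before,
#         phi_after=phi_after,
#         delta=phi_after - phi_before,
#         reasoning="Patch applied; the previously failing test now passes.",
#         evidence="state_after.data['tests_passing'] changed from False to True",
#         confidence=0.8,
#     )
#     score.improved  # True, since delta > 0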


@dataclass
class TrajectoryStep:
    """A single step in a trajectory: state → action → new_state → score."""
    state_before: State
    action: Action
    state_after: State
    score: PurposeScore | None = None
    step_index: int = 0
    wall_time_s: float = 0.0


@dataclass
class Trajectory:
    """
    A complete sequence of steps for one task attempt.

    The Experience Replay module stores these and the Optimizer extracts
    heuristics from high-reward trajectories.

    NOTE: sre_patches.py replaces these properties with more thorough None guards.
    The patched versions check both `s.score is not None` AND `s.score.delta is not None`.
    """
    task_description: str
    purpose: str
    steps: list[TrajectoryStep] = field(default_factory=list)
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    created_at: float = field(default_factory=time.time)

    @property
    def cumulative_reward(self) -> float:
        """Sum of all positive deltas in the trajectory (None-safe)."""
        total = 0.0
        for s in self.steps:
            if s.score is not None and s.score.delta is not None and s.score.delta > 0:
                total += s.score.delta
        return total

    @property
    def total_delta(self) -> float:
        """Net state improvement across the entire trajectory (None-safe)."""
        total = 0.0
        for s in self.steps:
            if s.score is not None and s.score.delta is not None:
                total += s.score.delta
        return total

    @property
    def success_rate(self) -> float:
        """Fraction of scored steps that improved the state (None-safe)."""
        scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]
        if not scored:
            return 0.0
        return sum(1 for s in scored if s.score.improved) / len(scored)

    @property
    def final_phi(self) -> float | None:
        """Final Φ value from the last scored step (None-safe)."""
        scored = [s for s in self.steps if s.score is not None]
        if not scored:
            return None
        return scored[-1].score.phi_after
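
# Illustrative reading of the trajectory-level metrics (assumes `step` is a TrajectoryStep
# with a PurposeScore attached, built elsewhere):
#
#     traj = Trajectory(task_description="Fix the failing test", purpose="All tests pass")
#     traj.steps.append(step)
#     traj.cumulative_reward   # sum of positive deltas only
#     traj.total_delta         # net Φ change, positive and negative
#     traj.success_rate        # fraction of scored steps with delta > 0
#     traj.final_phi           # Φ after the last scored step, or None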


class MemoryTier(Enum):
    """
    3-tier memory hierarchy from MUSE (arxiv:2510.08002):

    STRATEGIC: High-level <Dilemma, Strategy> pairs loaded into the system prompt.
        Updated after each completed task via global distillation.

    PROCEDURAL: SOPs (Standard Operating Procedures) indexed by task pattern.
        Loaded lazily — only the index is in context, full SOPs fetched on demand.

    TOOL: Per-action "muscle memory" — dynamic instructions returned with
        each observation. Updated when new action patterns prove effective.
    """
    STRATEGIC = "strategic"
    PROCEDURAL = "procedural"
    TOOL = "tool"


@dataclass
class Heuristic:
    """
    A learned heuristic extracted from a high-reward trajectory.

    Inspired by CER (arxiv:2506.06698) distillation format:
    - `pattern`: The abstract situation (with {variable} placeholders)
    - `strategy`: What to do in that situation
    - `steps`: Concrete action sequence (procedural SOP)
    """
    pattern: str
    strategy: str
    steps: list[str]
    tier: MemoryTier
    source_trajectory_id: str = ""
    q_value: float = 0.5
    times_used: int = 0
    times_succeeded: int = 0
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])

    @property
    def empirical_success_rate(self) -> float:
        """Observed success rate; defaults to a neutral 0.5 prior before first use."""
        if self.times_used == 0:
            return 0.5
        return self.times_succeeded / self.times_used

    def update_q_value(self, reward: float, alpha: float = 0.1) -> None:
        """
        Monte Carlo Q-value update (from REMEMBERER arxiv:2306.07929):
            Q_new = Q_old + α * (reward - Q_old)
        """
        self.q_value = self.q_value + alpha * (reward - self.q_value)
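
# Worked example of the update above with the default α = 0.1, starting from the 0.5 prior:
#
#     h = Heuristic(pattern="...", strategy="...", steps=[], tier=MemoryTier.PROCEDURAL)
#     h.update_q_value(1.0)   # 0.5  + 0.1 * (1.0 - 0.5)  -> 0.55
#     h.update_q_value(0.0)   # 0.55 + 0.1 * (0.0 - 0.55) -> 0.495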


@dataclass
class MemoryRecord:
    """
    A single record in the experience replay buffer.

    Combines the trajectory with its distilled heuristics and metadata
    for retrieval ranking (semantic similarity + Q-value, per MemRL arxiv:2601.03192).
    """
    trajectory: Trajectory
    heuristics: list[Heuristic] = field(default_factory=list)
    task_embedding: list[float] | None = None
    retrieval_q_value: float = 0.5
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
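
# A minimal sketch of the retrieval ranking the docstring refers to: cosine similarity
# against `task_embedding` blended with `retrieval_q_value` (weights and the helper name
# `rank_records` are illustrative, not part of the framework):
#
#     def rank_records(records, query_emb, w_sim=0.7, w_q=0.3):
#         def cosine(a, b):
#             dot = sum(x * y for x, y in zip(a, b))
#             norm = (sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5)
#             return dot / norm if norm else 0.0
#         scored = [
#             (w_sim * cosine(r.task_embedding, query_emb) + w_q * r.retrieval_q_value, r)
#             for r in records
#             if r.task_embedding is not None
#         ]
#         return [r for _, r in sorted(scored, key=lambda t: t[0], reverse=True)]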