"""
Core data types for the Purpose Agent framework.
All modules exchange these types — this keeps the architecture modular
and lets you swap out any component without touching the others.
NOTE: sre_patches.py monkey-patches Trajectory properties at import time
with even more robust None guards. The versions here are the baseline;
SRE patches are the authoritative runtime versions.
"""
from __future__ import annotations
import json
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
# ---------------------------------------------------------------------------
# State & Action
# ---------------------------------------------------------------------------
@dataclass
class State:
"""
A snapshot of the environment at a point in time.
`data` is intentionally a free-form dict — the framework doesn't prescribe
what "state" looks like. Your environment defines it.
`summary` is an optional human-readable string (used in LLM prompts).
    If not set, `describe()` falls back to serializing `data` to JSON.
"""
data: dict[str, Any]
summary: str | None = None
timestamp: float = field(default_factory=time.time)
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
def describe(self) -> str:
"""Return a prompt-friendly description of this state."""
if self.summary:
return self.summary
        return json.dumps(self.data, indent=2, default=str)
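# A minimal sketch of the two `describe()` paths (the dict contents below are
# illustrative only; the framework leaves `data` free-form):
#
#     labeled = State(data={"tests_passing": 3}, summary="3/10 tests pass")
#     labeled.describe()   # -> "3/10 tests pass" (summary wins)
#
#     raw = State(data={"tests_passing": 3})
#     raw.describe()       # -> '{\n  "tests_passing": 3\n}' (JSON fallback)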
@dataclass
class Action:
"""
An action the Actor decided to take.
Attributes:
name: Action identifier (e.g. "search", "move_north", "edit_file")
params: Parameters for the action
thought: The Actor's chain-of-thought reasoning for this action
expected_delta: What the Actor *expects* to change in the state
"""
name: str
params: dict[str, Any] = field(default_factory=dict)
thought: str = ""
expected_delta: str = ""
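# Example (a sketch; the action name and params are hypothetical, since the
# Actor defines its own action vocabulary):
#
#     action = Action(
#         name="edit_file",
#         params={"path": "src/app.py", "patch": "..."},
#         thought="The traceback points at app.py.",
#         expected_delta="tests_passing should rise from 3 to at least 4",
#     )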
# ---------------------------------------------------------------------------
# Trajectory
# ---------------------------------------------------------------------------
@dataclass
class PurposeScore:
"""
Output of the Purpose Function for a single state transition.
Inspired by LATS (arxiv:2310.04406) V(s) formulation:
score = λ·LM_score + (1-λ)·consistency_score
Anti-reward-hacking: `evidence` must cite specific observable state changes
(per SPC arxiv:2504.19162). The orchestrator can reject scores with vague evidence.
"""
    phi_before: float  # Φ(s_current) — purpose-proximity of the old state (higher = closer)
    phi_after: float   # Φ(s_new) — purpose-proximity of the new state
    delta: float       # Φ(s_new) - Φ(s_current) — positive when the state moved closer
reasoning: str # Step-by-step justification from the LLM
evidence: str # Specific observable changes cited as evidence
confidence: float # 0.0–1.0 — how confident the critic is
@property
def improved(self) -> bool:
"""Did this transition move closer to the purpose?"""
return self.delta > 0.0
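# Sketch of how a critic might populate a score. The λ blend mirrors the
# LATS-style formula in the docstring; λ = 0.7 and both sub-scores are
# invented illustration values:
#
#     lam, lm_score, consistency = 0.7, 0.8, 0.6
#     phi_new = lam * lm_score + (1 - lam) * consistency   # 0.74
#     score = PurposeScore(
#         phi_before=0.50,
#         phi_after=phi_new,
#         delta=phi_new - 0.50,                            # +0.24
#         reasoning="Two previously failing tests now pass after the patch.",
#         evidence="pytest: 7 passed, 3 failed (was 5 passed, 5 failed)",
#         confidence=0.85,
#     )
#     assert score.improved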
@dataclass
class TrajectoryStep:
"""A single step in a trajectory: state → action → new_state → score."""
state_before: State
action: Action
state_after: State
score: PurposeScore | None = None
step_index: int = 0
wall_time_s: float = 0.0
@dataclass
class Trajectory:
"""
A complete sequence of steps for one task attempt.
The Experience Replay module stores these and the Optimizer extracts
heuristics from high-reward trajectories.
NOTE: sre_patches.py replaces these properties with more thorough None guards.
The patched versions check both `s.score is not None` AND `s.score.delta is not None`.
"""
task_description: str
purpose: str
steps: list[TrajectoryStep] = field(default_factory=list)
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
created_at: float = field(default_factory=time.time)
@property
def cumulative_reward(self) -> float:
"""Sum of all positive deltas in the trajectory (None-safe)."""
total = 0.0
for s in self.steps:
if s.score is not None and s.score.delta is not None and s.score.delta > 0:
total += s.score.delta
return total
@property
def total_delta(self) -> float:
"""Net state improvement across the entire trajectory (None-safe)."""
total = 0.0
for s in self.steps:
if s.score is not None and s.score.delta is not None:
total += s.score.delta
return total
@property
def success_rate(self) -> float:
"""Fraction of steps that improved state (None-safe)."""
scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]
if not scored:
return 0.0
return sum(1 for s in scored if s.score.improved) / len(scored)
@property
def final_phi(self) -> float | None:
"""Final Φ value (state-distance-to-goal) at end of trajectory (None-safe)."""
scored = [s for s in self.steps if s.score is not None]
if not scored:
return None
return scored[-1].score.phi_after
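# Worked example of the three aggregates (step deltas invented for
# illustration): given scored deltas of [+0.2, -0.1, +0.3],
#
#     cumulative_reward  -> 0.5   (sums only the positive deltas)
#     total_delta        -> 0.4   (net movement, regressions included)
#     success_rate       -> 2/3   (two of three scored steps improved)
#
# Steps whose `score` is None are ignored by all three properties.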
# ---------------------------------------------------------------------------
# Memory & Heuristics
# ---------------------------------------------------------------------------
class MemoryTier(Enum):
"""
3-tier memory hierarchy from MUSE (arxiv:2510.08002):
STRATEGIC: High-level <Dilemma, Strategy> pairs loaded into system prompt.
Updated after each completed task via global distillation.
PROCEDURAL: SOPs (Standard Operating Procedures) indexed by task pattern.
Loaded lazily — only the index is in context, full SOPs fetched on demand.
TOOL: Per-action "muscle memory" — dynamic instructions returned with
each observation. Updated when new action patterns prove effective.
"""
STRATEGIC = "strategic"
PROCEDURAL = "procedural"
TOOL = "tool"
@dataclass
class Heuristic:
"""
A learned heuristic extracted from a high-reward trajectory.
Inspired by CER (arxiv:2506.06698) distillation format:
- `pattern`: The abstract situation (with {variable} placeholders)
- `strategy`: What to do in that situation
- `steps`: Concrete action sequence (procedural SOP)
"""
pattern: str # When to apply (e.g. "When facing {obstacle_type} near {location}")
strategy: str # What to do (e.g. "Decompose into sub-tasks, handle {x} first")
steps: list[str] # Concrete action sequence
tier: MemoryTier # Which memory tier this belongs to
source_trajectory_id: str = ""
q_value: float = 0.5 # Learned quality estimate (REMEMBERER-style Q-value)
times_used: int = 0
times_succeeded: int = 0
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
@property
def empirical_success_rate(self) -> float:
if self.times_used == 0:
return 0.5 # Prior
return self.times_succeeded / self.times_used
def update_q_value(self, reward: float, alpha: float = 0.1) -> None:
"""
Monte Carlo Q-value update (from REMEMBERER arxiv:2306.07929):
Q_new = Q_old + α * (reward - Q_old)
"""
self.q_value = self.q_value + alpha * (reward - self.q_value)
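# Worked update with the default α = 0.1 (reward values invented):
#
#     h = Heuristic(pattern="...", strategy="...", steps=[],
#                   tier=MemoryTier.PROCEDURAL)
#     h.q_value              # 0.5 (prior)
#     h.update_q_value(1.0)  # 0.5  + 0.1 * (1.0 - 0.5)  = 0.55
#     h.update_q_value(0.0)  # 0.55 + 0.1 * (0.0 - 0.55) = 0.495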
@dataclass
class MemoryRecord:
"""
A single record in the experience replay buffer.
Combines the trajectory with its distilled heuristics and metadata
for retrieval ranking (semantic similarity + Q-value, per MemRL arxiv:2601.03192).
"""
trajectory: Trajectory
heuristics: list[Heuristic] = field(default_factory=list)
task_embedding: list[float] | None = None # For semantic retrieval
retrieval_q_value: float = 0.5 # Learned utility for retrieval ranking
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
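# Retrieval-ranking sketch for the "semantic similarity + Q-value" blend the
# docstring mentions. The weight `beta`, the caller-supplied `similarity`,
# and the `cosine`/`query` names are assumptions, not defined in this module:
#
#     def rank_score(rec: MemoryRecord, similarity: float, beta: float = 0.5) -> float:
#         return beta * similarity + (1 - beta) * rec.retrieval_q_value
#
#     records.sort(key=lambda r: rank_score(r, cosine(query, r.task_embedding)),
#                  reverse=True)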