"""
Core data types for the Purpose Agent framework.
All modules exchange these types — this keeps the architecture modular
and lets you swap out any component without touching the others.
NOTE: sre_patches.py monkey-patches Trajectory properties at import time
with even more robust None guards. The versions here are the baseline;
SRE patches are the authoritative runtime versions.
"""
from __future__ import annotations
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
# ---------------------------------------------------------------------------
# State & Action
# ---------------------------------------------------------------------------
@dataclass
class State:
"""
A snapshot of the environment at a point in time.
`data` is intentionally a free-form dict — the framework doesn't prescribe
what "state" looks like. Your environment defines it.
`summary` is an optional human-readable string (used in LLM prompts).
If not set, `describe()` falls back to serializing `data` as JSON, which is what the Purpose Function then sees in its prompts.
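Illustrative example (hypothetical values, showing both branches of `describe()`):
    >>> State(data={"hp": 10}, summary="agent at full health").describe()
    'agent at full health'
    >>> print(State(data={"hp": 10}).describe())
    {
      "hp": 10
    }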
"""
data: dict[str, Any]
summary: str | None = None
timestamp: float = field(default_factory=time.time)
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
def describe(self) -> str:
"""Return a prompt-friendly description of this state."""
if self.summary:
return self.summary
import json
return json.dumps(self.data, indent=2, default=str)
@dataclass
class Action:
"""
An action the Actor decided to take.
Attributes:
name: Action identifier (e.g. "search", "move_north", "edit_file")
params: Parameters for the action
thought: The Actor's chain-of-thought reasoning for this action
expected_delta: What the Actor *expects* to change in the state
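Illustrative example (hypothetical action and field values):
    >>> a = Action(name="edit_file",
    ...            params={"path": "README.md"},
    ...            thought="The README is missing install steps.",
    ...            expected_delta="README gains an Install section")
    >>> a.name
    'edit_file'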
"""
name: str
params: dict[str, Any] = field(default_factory=dict)
thought: str = ""
expected_delta: str = ""
# ---------------------------------------------------------------------------
# Trajectory
# ---------------------------------------------------------------------------
@dataclass
class PurposeScore:
"""
Output of the Purpose Function for a single state transition.
Inspired by LATS (arxiv:2310.04406) V(s) formulation:
score = λ·LM_score + (1-λ)·consistency_score
Anti-reward-hacking: `evidence` must cite specific observable state changes
(per SPC arxiv:2504.19162). The orchestrator can reject scores with vague evidence.
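Illustrative example (hypothetical values):
    >>> ps = PurposeScore(phi_before=0.4, phi_after=0.7, delta=0.3,
    ...                   reasoning="tests pass after the edit",
    ...                   evidence="3 previously failing tests now pass",
    ...                   confidence=0.9)
    >>> ps.improved
    True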
"""
phi_before: float # Φ(s_current) — how close the old state is to the purpose (higher = closer)
phi_after: float # Φ(s_new) — how close the new state is to the purpose (higher = closer)
delta: float # Φ(s_new) - Φ(s_current) — the improvement signal
reasoning: str # Step-by-step justification from the LLM
evidence: str # Specific observable changes cited as evidence
confidence: float # 0.0–1.0 — how confident the critic is
@property
def improved(self) -> bool:
"""Did this transition move closer to the purpose?"""
return self.delta > 0.0
@dataclass
class TrajectoryStep:
"""A single step in a trajectory: state → action → new_state → score."""
state_before: State
action: Action
state_after: State
score: PurposeScore | None = None
step_index: int = 0
wall_time_s: float = 0.0
@dataclass
class Trajectory:
"""
A complete sequence of steps for one task attempt.
The Experience Replay module stores these and the Optimizer extracts
heuristics from high-reward trajectories.
NOTE: sre_patches.py replaces these properties with more thorough None guards.
The patched versions check both `s.score is not None` AND `s.score.delta is not None`.
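Illustrative example (hypothetical values; every aggregate below skips the unscored step):
    >>> s0, s1 = State(data={}), State(data={})
    >>> scored = TrajectoryStep(s0, Action("noop"), s1,
    ...                         score=PurposeScore(0.2, 0.5, 0.3, "r", "e", 0.9))
    >>> unscored = TrajectoryStep(s0, Action("noop"), s1)  # score is None
    >>> t = Trajectory("demo task", "demo purpose", steps=[scored, unscored])
    >>> t.cumulative_reward
    0.3
    >>> t.success_rate
    1.0
    >>> t.final_phi
    0.5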
"""
task_description: str
purpose: str
steps: list[TrajectoryStep] = field(default_factory=list)
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
created_at: float = field(default_factory=time.time)
@property
def cumulative_reward(self) -> float:
"""Sum of all positive deltas in the trajectory (None-safe)."""
total = 0.0
for s in self.steps:
if s.score is not None and s.score.delta is not None and s.score.delta > 0:
total += s.score.delta
return total
@property
def total_delta(self) -> float:
"""Net state improvement across the entire trajectory (None-safe)."""
total = 0.0
for s in self.steps:
if s.score is not None and s.score.delta is not None:
total += s.score.delta
return total
@property
def success_rate(self) -> float:
"""Fraction of steps that improved state (None-safe)."""
scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]
if not scored:
return 0.0
return sum(1 for s in scored if s.score.improved) / len(scored)
@property
def final_phi(self) -> float | None:
"""Final Φ value (state-distance-to-goal) at end of trajectory (None-safe)."""
scored = [s for s in self.steps if s.score is not None]
if not scored:
return None
return scored[-1].score.phi_after
# ---------------------------------------------------------------------------
# Memory & Heuristics
# ---------------------------------------------------------------------------
class MemoryTier(Enum):
"""
3-tier memory hierarchy from MUSE (arxiv:2510.08002):
STRATEGIC: High-level <Dilemma, Strategy> pairs loaded into system prompt.
Updated after each completed task via global distillation.
PROCEDURAL: SOPs (Standard Operating Procedures) indexed by task pattern.
Loaded lazily — only the index is in context, full SOPs fetched on demand.
TOOL: Per-action "muscle memory" — dynamic instructions returned with
each observation. Updated when new action patterns prove effective.
"""
STRATEGIC = "strategic"
PROCEDURAL = "procedural"
TOOL = "tool"
@dataclass
class Heuristic:
"""
A learned heuristic extracted from a high-reward trajectory.
Inspired by CER (arxiv:2506.06698) distillation format:
- `pattern`: The abstract situation (with {variable} placeholders)
- `strategy`: What to do in that situation
- `steps`: Concrete action sequence (procedural SOP)
"""
pattern: str # When to apply (e.g. "When facing {obstacle_type} near {location}")
strategy: str # What to do (e.g. "Decompose into sub-tasks, handle {x} first")
steps: list[str] # Concrete action sequence
tier: MemoryTier # Which memory tier this belongs to
source_trajectory_id: str = ""
q_value: float = 0.5 # Learned quality estimate (REMEMBERER-style Q-value)
times_used: int = 0
times_succeeded: int = 0
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
@property
def empirical_success_rate(self) -> float:
if self.times_used == 0:
return 0.5 # Neutral prior before the heuristic has ever been used
return self.times_succeeded / self.times_used
def update_q_value(self, reward: float, alpha: float = 0.1) -> None:
"""
Monte Carlo Q-value update (from REMEMBERER arxiv:2306.07929):
Q_new = Q_old + α * (reward - Q_old)
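Illustrative example (default q_value=0.5, default alpha=0.1):
    >>> h = Heuristic(pattern="p", strategy="s", steps=[], tier=MemoryTier.TOOL)
    >>> h.update_q_value(reward=1.0)
    >>> round(h.q_value, 2)
    0.55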
"""
self.q_value = self.q_value + alpha * (reward - self.q_value)
@dataclass
class MemoryRecord:
"""
A single record in the experience replay buffer.
Combines the trajectory with its distilled heuristics and metadata
for retrieval ranking (semantic similarity + Q-value, per MemRL arxiv:2601.03192).
"""
trajectory: Trajectory
heuristics: list[Heuristic] = field(default_factory=list)
task_embedding: list[float] | None = None # For semantic retrieval
retrieval_q_value: float = 0.5 # Learned utility for retrieval ranking
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
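# ---------------------------------------------------------------------------
# Note on retrieval ranking (illustrative only; the retriever itself lives
# outside this module): a simple way to combine the two signals stored on
# MemoryRecord is a convex blend, e.g.
#
#     rank_score = beta * cosine_similarity(query_embedding, task_embedding)
#                  + (1 - beta) * retrieval_q_value
#
# where `beta` is a hypothetical mixing weight. This mirrors the
# "semantic similarity + Q-value" ranking mentioned in the MemoryRecord docstring.
# ---------------------------------------------------------------------------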