"""
Core data types for the Purpose Agent framework.
All modules exchange these types — this keeps the architecture modular
and lets you swap out any component without touching the others.
NOTE: sre_patches.py monkey-patches Trajectory properties at import time
with even more robust None guards. The versions here are the baseline;
SRE patches are the authoritative runtime versions.
"""
from __future__ import annotations
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
# ---------------------------------------------------------------------------
# State & Action
# ---------------------------------------------------------------------------
@dataclass
class State:
"""
A snapshot of the environment at a point in time.
`data` is intentionally a free-form dict — the framework doesn't prescribe
what "state" looks like. Your environment defines it.
`summary` is an optional human-readable string (used in LLM prompts).
If not set, `describe()` falls back to serializing `data` as JSON, which is what the Purpose Function then sees in its prompts.
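Illustrative example (hypothetical values, showing both branches of `describe()`):
    >>> State(data={"hp": 10}, summary="agent at full health").describe()
    'agent at full health'
    >>> print(State(data={"hp": 10}).describe())
    {
      "hp": 10
    }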
"""
data: dict[str, Any]
summary: str | None = None
timestamp: float = field(default_factory=time.time)
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
def describe(self) -> str:
"""Return a prompt-friendly description of this state."""
if self.summary:
return self.summary
import json
return json.dumps(self.data, indent=2, default=str)
@dataclass
class Action:
"""
An action the Actor decided to take.
Attributes:
name: Action identifier (e.g. "search", "move_north", "edit_file")
params: Parameters for the action
thought: The Actor's chain-of-thought reasoning for this action
expected_delta: What the Actor *expects* to change in the state
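Illustrative example (hypothetical action and field values):
    >>> a = Action(name="edit_file",
    ...            params={"path": "README.md"},
    ...            thought="The README is missing install steps.",
    ...            expected_delta="README gains an Install section")
    >>> a.name
    'edit_file'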
"""
name: str
params: dict[str, Any] = field(default_factory=dict)
thought: str = ""
expected_delta: str = ""
# ---------------------------------------------------------------------------
# Trajectory
# ---------------------------------------------------------------------------
@dataclass
class PurposeScore:
"""
Output of the Purpose Function for a single state transition.
Inspired by LATS (arxiv:2310.04406) V(s) formulation:
score = λ·LM_score + (1-λ)·consistency_score
Anti-reward-hacking: `evidence` must cite specific observable state changes
(per SPC arxiv:2504.19162). The orchestrator can reject scores with vague evidence.
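Illustrative example (hypothetical values):
    >>> ps = PurposeScore(phi_before=0.4, phi_after=0.7, delta=0.3,
    ...                   reasoning="tests pass after the edit",
    ...                   evidence="3 previously failing tests now pass",
    ...                   confidence=0.9)
    >>> ps.improved
    True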
"""
phi_before: float # Φ(s_current) — how close the old state is to the purpose (higher = closer)
phi_after: float # Φ(s_new) — how close the new state is to the purpose (higher = closer)
delta: float # Φ(s_new) - Φ(s_current) — the improvement signal
reasoning: str # Step-by-step justification from the LLM
evidence: str # Specific observable changes cited as evidence
confidence: float # 0.0–1.0 — how confident the critic is
@property
def improved(self) -> bool:
"""Did this transition move closer to the purpose?"""
return self.delta > 0.0
@dataclass
class TrajectoryStep:
"""A single step in a trajectory: state → action → new_state → score."""
state_before: State
action: Action
state_after: State
score: PurposeScore | None = None
step_index: int = 0
wall_time_s: float = 0.0
@dataclass
class Trajectory:
"""
A complete sequence of steps for one task attempt.
The Experience Replay module stores these and the Optimizer extracts
heuristics from high-reward trajectories.
NOTE: sre_patches.py replaces these properties with more thorough None guards.
The patched versions check both `s.score is not None` AND `s.score.delta is not None`.
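Illustrative example (hypothetical values; every aggregate below skips the unscored step):
    >>> s0, s1 = State(data={}), State(data={})
    >>> scored = TrajectoryStep(s0, Action("noop"), s1,
    ...                         score=PurposeScore(0.2, 0.5, 0.3, "r", "e", 0.9))
    >>> unscored = TrajectoryStep(s0, Action("noop"), s1)  # score is None
    >>> t = Trajectory("demo task", "demo purpose", steps=[scored, unscored])
    >>> t.cumulative_reward
    0.3
    >>> t.success_rate
    1.0
    >>> t.final_phi
    0.5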
"""
task_description: str
purpose: str
steps: list[TrajectoryStep] = field(default_factory=list)
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
created_at: float = field(default_factory=time.time)
@property
def cumulative_reward(self) -> float:
"""Sum of all positive deltas in the trajectory (None-safe)."""
total = 0.0
for s in self.steps:
if s.score is not None and s.score.delta is not None and s.score.delta > 0:
total += s.score.delta
return total
@property
def total_delta(self) -> float:
"""Net state improvement across the entire trajectory (None-safe)."""
total = 0.0
for s in self.steps:
if s.score is not None and s.score.delta is not None:
total += s.score.delta
return total
@property
def success_rate(self) -> float:
"""Fraction of steps that improved state (None-safe)."""
scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]
if not scored:
return 0.0
return sum(1 for s in scored if s.score.improved) / len(scored)
@property
def final_phi(self) -> float | None:
"""Final Φ value (state-distance-to-goal) at end of trajectory (None-safe)."""
scored = [s for s in self.steps if s.score is not None]
if not scored:
return None
return scored[-1].score.phi_after
# ---------------------------------------------------------------------------
# Memory & Heuristics
# ---------------------------------------------------------------------------
class MemoryTier(Enum):
"""
3-tier memory hierarchy from MUSE (arxiv:2510.08002):
STRATEGIC: High-level <Dilemma, Strategy> pairs loaded into system prompt.
Updated after each completed task via global distillation.
PROCEDURAL: SOPs (Standard Operating Procedures) indexed by task pattern.
Loaded lazily — only the index is in context, full SOPs fetched on demand.
TOOL: Per-action "muscle memory" — dynamic instructions returned with
each observation. Updated when new action patterns prove effective.
"""
STRATEGIC = "strategic"
PROCEDURAL = "procedural"
TOOL = "tool"
@dataclass
class Heuristic:
"""
A learned heuristic extracted from a high-reward trajectory.
Inspired by CER (arxiv:2506.06698) distillation format:
- `pattern`: The abstract situation (with {variable} placeholders)
- `strategy`: What to do in that situation
- `steps`: Concrete action sequence (procedural SOP)
"""
pattern: str # When to apply (e.g. "When facing {obstacle_type} near {location}")
strategy: str # What to do (e.g. "Decompose into sub-tasks, handle {x} first")
steps: list[str] # Concrete action sequence
tier: MemoryTier # Which memory tier this belongs to
source_trajectory_id: str = ""
q_value: float = 0.5 # Learned quality estimate (REMEMBERER-style Q-value)
times_used: int = 0
times_succeeded: int = 0
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
@property
def empirical_success_rate(self) -> float:
if self.times_used == 0:
return 0.5 # Neutral prior before the heuristic has ever been used
return self.times_succeeded / self.times_used
def update_q_value(self, reward: float, alpha: float = 0.1) -> None:
"""
Monte Carlo Q-value update (from REMEMBERER arxiv:2306.07929):
Q_new = Q_old + α * (reward - Q_old)
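Illustrative example (default q_value=0.5, default alpha=0.1):
    >>> h = Heuristic(pattern="p", strategy="s", steps=[], tier=MemoryTier.TOOL)
    >>> h.update_q_value(reward=1.0)
    >>> round(h.q_value, 2)
    0.55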
"""
self.q_value = self.q_value + alpha * (reward - self.q_value)
@dataclass
class MemoryRecord:
"""
A single record in the experience replay buffer.
Combines the trajectory with its distilled heuristics and metadata
for retrieval ranking (semantic similarity + Q-value, per MemRL arxiv:2601.03192).
"""
trajectory: Trajectory
heuristics: list[Heuristic] = field(default_factory=list)
task_embedding: list[float] | None = None # For semantic retrieval
retrieval_q_value: float = 0.5 # Learned utility for retrieval ranking
id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
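# ---------------------------------------------------------------------------
# Note on retrieval ranking (illustrative only; the retriever itself lives
# outside this module): a simple way to combine the two signals stored on
# MemoryRecord is a convex blend, e.g.
#
#     rank_score = beta * cosine_similarity(query_embedding, task_embedding)
#                  + (1 - beta) * retrieval_q_value
#
# where `beta` is a hypothetical mixing weight. This mirrors the
# "semantic similarity + Q-value" ranking mentioned in the MemoryRecord docstring.
# ---------------------------------------------------------------------------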