"""
Core data types for the Purpose Agent framework.

All modules exchange these types — this keeps the architecture modular
and lets you swap out any component without touching the others.

NOTE: sre_patches.py monkey-patches the Trajectory properties at import
time with additional None guards. The versions defined here are the
baseline; the SRE-patched versions are authoritative at runtime.
"""

from __future__ import annotations

import json
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from typing import Any


# ---------------------------------------------------------------------------
# State & Action
# ---------------------------------------------------------------------------

@dataclass
class State:
    """
    A snapshot of the environment at a point in time.
    
    `data` is intentionally a free-form dict — the framework doesn't prescribe
    what "state" looks like. Your environment defines it.
    
    `summary` is an optional human-readable string (used in LLM prompts).
    If not set, the Purpose Function will serialize `data` to JSON.
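
    Example (illustrative values):

        >>> s = State(data={"tests_passing": 3}, summary="3 of 5 tests passing")
        >>> s.describe()
        '3 of 5 tests passing'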
    """
    data: dict[str, Any]
    summary: str | None = None
    timestamp: float = field(default_factory=time.time)
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])

    def describe(self) -> str:
        """Return a prompt-friendly description of this state."""
        if self.summary:
            return self.summary
        return json.dumps(self.data, indent=2, default=str)


@dataclass
class Action:
    """
    An action the Actor decided to take.
    
    Attributes:
        name: Action identifier (e.g. "search", "move_north", "edit_file")
        params: Parameters for the action
        thought: The Actor's chain-of-thought reasoning for this action
        expected_delta: What the Actor *expects* to change in the state
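
    Example (illustrative values):

        >>> a = Action(name="search",
        ...            params={"query": "installation docs"},
        ...            thought="Need more context before editing",
        ...            expected_delta="search_results populated in state")
        >>> a.name
        'search'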
    """
    name: str
    params: dict[str, Any] = field(default_factory=dict)
    thought: str = ""
    expected_delta: str = ""


# ---------------------------------------------------------------------------
# Trajectory
# ---------------------------------------------------------------------------

@dataclass
class PurposeScore:
    """
    Output of the Purpose Function for a single state transition.
    
    Inspired by the LATS (arxiv:2310.04406) V(s) formulation:
      score = λ·LM_score + (1-λ)·consistency_score
    
    Anti-reward-hacking: `evidence` must cite specific observable state changes
    (per SPC arxiv:2504.19162). The orchestrator can reject scores with vague evidence.
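
    Worked example of the blend (illustrative numbers; λ = 0.5 assumed):
    LM_score 0.8, consistency_score 0.6 → 0.5·0.8 + 0.5·0.6 = 0.70.

        >>> s = PurposeScore(phi_before=0.40, phi_after=0.55, delta=0.15,
        ...                  reasoning="two more tests pass after the edit",
        ...                  evidence="pytest output went from 3 to 5 passed",
        ...                  confidence=0.8)
        >>> s.improved
        True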
    """
    phi_before: float       # Φ(s_current)  — proximity-to-purpose of old state (higher = closer)
    phi_after: float        # Φ(s_new)      — proximity-to-purpose of new state
    delta: float            # Φ(s_new) - Φ(s_current) — the improvement signal
    reasoning: str          # Step-by-step justification from the LLM
    evidence: str           # Specific observable changes cited as evidence
    confidence: float       # 0.0–1.0 — how confident the critic is

    @property
    def improved(self) -> bool:
        """Did this transition move closer to the purpose?"""
        return self.delta > 0.0


@dataclass
class TrajectoryStep:
    """A single step in a trajectory: state → action → new_state → score."""
    state_before: State
    action: Action
    state_after: State
    score: PurposeScore | None = None
    step_index: int = 0
    wall_time_s: float = 0.0


@dataclass
class Trajectory:
    """
    A complete sequence of steps for one task attempt.
    
    The Experience Replay module stores these and the Optimizer extracts
    heuristics from high-reward trajectories.
    
    NOTE: sre_patches.py replaces these properties at import time (see the
    module docstring). The baselines below already guard against both
    `s.score is None` and `s.score.delta is None`.
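
    Example (illustrative; an unscored trajectory returns safe defaults):

        >>> t = Trajectory(task_description="demo", purpose="demo")
        >>> (t.cumulative_reward, t.success_rate, t.final_phi)
        (0.0, 0.0, None)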
    """
    task_description: str
    purpose: str
    steps: list[TrajectoryStep] = field(default_factory=list)
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    created_at: float = field(default_factory=time.time)

    @property
    def cumulative_reward(self) -> float:
        """Sum of all positive deltas in the trajectory (None-safe)."""
        total = 0.0
        for s in self.steps:
            if s.score is not None and s.score.delta is not None and s.score.delta > 0:
                total += s.score.delta
        return total

    @property
    def total_delta(self) -> float:
        """Net state improvement across the entire trajectory (None-safe)."""
        total = 0.0
        for s in self.steps:
            if s.score is not None and s.score.delta is not None:
                total += s.score.delta
        return total

    @property
    def success_rate(self) -> float:
        """Fraction of steps that improved state (None-safe)."""
        scored = [s for s in self.steps if s.score is not None and s.score.delta is not None]
        if not scored:
            return 0.0
        return sum(1 for s in scored if s.score.improved) / len(scored)

    @property
    def final_phi(self) -> float | None:
        """Final Φ value (state-distance-to-goal) at end of trajectory (None-safe)."""
        scored = [s for s in self.steps if s.score is not None]
        if not scored:
            return None
        return scored[-1].score.phi_after


# ---------------------------------------------------------------------------
# Memory & Heuristics
# ---------------------------------------------------------------------------

class MemoryTier(Enum):
    """
    3-tier memory hierarchy from MUSE (arxiv:2510.08002):
    
    STRATEGIC: High-level <Dilemma, Strategy> pairs loaded into system prompt.
               Updated after each completed task via global distillation.
    
    PROCEDURAL: SOPs (Standard Operating Procedures) indexed by task pattern.
                Loaded lazily — only the index is in context, full SOPs fetched on demand.
    
    TOOL: Per-action "muscle memory" — dynamic instructions returned with
          each observation. Updated when new action patterns prove effective.
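
    Example (value round-trip, e.g. when loading stored heuristics):

        >>> MemoryTier("procedural") is MemoryTier.PROCEDURAL
        True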
    """
    STRATEGIC = "strategic"
    PROCEDURAL = "procedural"
    TOOL = "tool"


@dataclass
class Heuristic:
    """
    A learned heuristic extracted from a high-reward trajectory.
    
    Inspired by the CER (arxiv:2506.06698) distillation format:
    - `pattern`: The abstract situation (with {variable} placeholders)
    - `strategy`: What to do in that situation
    - `steps`: Concrete action sequence (procedural SOP)
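
    Example (illustrative heuristic; the contents are made up):

        >>> h = Heuristic(
        ...     pattern="When facing {locked_door} near {exit}",
        ...     strategy="Locate the matching key before trying the door",
        ...     steps=["search adjacent rooms", "take key", "unlock door"],
        ...     tier=MemoryTier.PROCEDURAL)
        >>> h.empirical_success_rate  # unused so far, so the 0.5 prior applies
        0.5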
    """
    pattern: str            # When to apply (e.g. "When facing {obstacle_type} near {location}")
    strategy: str           # What to do (e.g. "Decompose into sub-tasks, handle {x} first")
    steps: list[str]        # Concrete action sequence
    tier: MemoryTier        # Which memory tier this belongs to
    source_trajectory_id: str = ""
    q_value: float = 0.5    # Learned quality estimate (REMEMBERER-style Q-value)
    times_used: int = 0
    times_succeeded: int = 0
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])

    @property
    def empirical_success_rate(self) -> float:
        if self.times_used == 0:
            return 0.5  # Prior
        return self.times_succeeded / self.times_used

    def update_q_value(self, reward: float, alpha: float = 0.1) -> None:
        """
        Monte Carlo Q-value update (from REMEMBERER arxiv:2306.07929):
        Q_new = Q_old + α * (reward - Q_old)
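
        Example (illustrative; `h` is a throwaway heuristic):

            >>> h = Heuristic(pattern="p", strategy="s", steps=[],
            ...               tier=MemoryTier.TOOL)
            >>> h.update_q_value(reward=1.0, alpha=0.1)
            >>> round(h.q_value, 2)  # 0.5 + 0.1 * (1.0 - 0.5)
            0.55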
        """
        self.q_value = self.q_value + alpha * (reward - self.q_value)


@dataclass
class MemoryRecord:
    """
    A single record in the experience replay buffer.
    
    Combines the trajectory with its distilled heuristics and metadata
    for retrieval ranking (semantic similarity + Q-value, per MemRL arxiv:2601.03192).
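
    Example (illustrative ranking signal; the 50/50 blend is an assumption
    for demonstration, not something this module prescribes):

        >>> r = MemoryRecord(trajectory=Trajectory(task_description="t", purpose="p"))
        >>> similarity = 0.8  # cosine similarity, computed by the retriever
        >>> 0.5 * similarity + 0.5 * r.retrieval_q_value
        0.65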
    """
    trajectory: Trajectory
    heuristics: list[Heuristic] = field(default_factory=list)
    task_embedding: list[float] | None = None  # For semantic retrieval
    retrieval_q_value: float = 0.5  # Learned utility for retrieval ranking
    id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])