Rohan03 committed
Commit 3ca0d80 · verified
1 Parent(s): b6f70a1

Add purpose_agent/orchestrator.py

Files changed (1)
  1. purpose_agent/orchestrator.py +554 -0
purpose_agent/orchestrator.py ADDED
@@ -0,0 +1,554 @@
"""
Orchestrator — The main loop tying Actor, Purpose Function, Experience Replay,
and Heuristic Optimizer together.

Implements the self-improvement loop:

┌──────────────────────────────────────────────────────────────────┐
│                        ORCHESTRATOR LOOP                         │
│                                                                  │
│   ┌──────────┐  action    ┌─────────────┐   s_new                │
│   │  ACTOR   │ ─────────► │ ENVIRONMENT │ ──────────┐            │
│   │(+memory) │            │ (your code) │           │            │
│   └────▲─────┘            └─────────────┘           │            │
│        │                                            ▼            │
│        │  heuristics   ┌────────────────┐  (s, a, s')            │
│        │◄──────────────│   OPTIMIZER    │◄──────────┐            │
│        │               │ (distillation) │           │            │
│        │               └────────────────┘           │            │
│        │                                            │            │
│        │               ┌────────────────┐  Φ(s)→Φ(s')            │
│        │               │   PURPOSE FN   │───────────┤            │
│        │               │ (state critic) │           │            │
│        │               └────────────────┘           │            │
│        │                                            │            │
│        │               ┌────────────────┐           │            │
│        └───────────────│   EXPERIENCE   │◄──────────┘            │
│                        │ REPLAY BUFFER  │                        │
│                        └────────────────┘                        │
└──────────────────────────────────────────────────────────────────┘

Usage:
    from purpose_agent import Orchestrator, MockLLMBackend

    # 1. Define your environment
    class MyEnv(Environment):
        def execute(self, action, current_state):
            # ... do something ...
            return new_state

    # 2. Create orchestrator
    orch = Orchestrator(
        llm=MockLLMBackend(),  # or HFInferenceBackend(), OpenAICompatibleBackend()
        environment=MyEnv(),
        available_actions={"search": "Search for items", "move": "Move to location"},
    )

    # 3. Run a task
    result = orch.run_task(
        purpose="Find the hidden treasure in the maze",
        initial_state=State(data={"position": [0, 0], "inventory": []}),
        max_steps=20,
    )

    # 4. The agent self-improves — run more tasks and it gets better
    result2 = orch.run_task(purpose="Find the second treasure", ...)
"""

from __future__ import annotations

import json
import logging
import time
from abc import ABC, abstractmethod
from typing import Any, Callable

from purpose_agent.types import (
    Action,
    Heuristic,
    MemoryTier,
    PurposeScore,
    State,
    Trajectory,
    TrajectoryStep,
)
from purpose_agent.actor import Actor
from purpose_agent.purpose_function import PurposeFunction
from purpose_agent.experience_replay import ExperienceReplay
from purpose_agent.optimizer import HeuristicOptimizer
from purpose_agent.llm_backend import LLMBackend

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Environment Interface
# ---------------------------------------------------------------------------

class Environment(ABC):
    """
    Abstract environment that the Agent acts in.

    Implement this for your specific use case:
    - Web navigation: wrap a browser automation tool
    - Code generation: wrap a code executor
    - Game: wrap a game API
    - Simulated: mock environment for testing

    The Orchestrator calls execute() with the agent's action and current state,
    and expects a new state back.
    """

    @abstractmethod
    def execute(self, action: Action, current_state: State) -> State:
        """
        Execute an action in the environment and return the resulting state.

        Args:
            action: The action to execute
            current_state: The state before the action

        Returns:
            The new state after the action
        """
        ...

    def reset(self) -> State:
        """
        Reset the environment and return the initial state.
        Override if your environment needs resetting between tasks.
        """
        return State(data={})

    def is_terminal(self, state: State) -> bool:
        """
        Check if the state is terminal (task complete or impossible to continue).
        Override for environments with natural termination conditions.
        """
        return False


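# Illustrative sketch: a minimal concrete Environment showing the
# execute()/is_terminal() contract. The "move" action name and its "dx"
# param are assumptions here; real action schemas are whatever you
# advertise via available_actions.
class _ExampleLineWalk(Environment):
    """Toy environment: walk along a line until position 5 is reached."""

    def execute(self, action: Action, current_state: State) -> State:
        pos = current_state.data.get("position", 0)
        if action.name == "move":  # hypothetical action name
            pos += int(action.params.get("dx", 1))  # hypothetical param
        return State(data={"position": pos}, summary=f"At position {pos}")

    def is_terminal(self, state: State) -> bool:
        return state.data.get("position", 0) >= 5

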
class SimpleEnvironment(Environment):
    """
    A simple environment backed by a user-provided execute function.

    Usage:
        env = SimpleEnvironment(
            execute_fn=lambda action, state: new_state,
            initial_state=State(data={"x": 0}),
        )
    """

    def __init__(
        self,
        execute_fn: Callable[[Action, State], State],
        initial_state: State | None = None,
        terminal_fn: Callable[[State], bool] | None = None,
    ):
        self._execute_fn = execute_fn
        self._initial_state = initial_state or State(data={})
        self._terminal_fn = terminal_fn

    def execute(self, action: Action, current_state: State) -> State:
        return self._execute_fn(action, current_state)

    def reset(self) -> State:
        return self._initial_state

    def is_terminal(self, state: State) -> bool:
        if self._terminal_fn:
            return self._terminal_fn(state)
        return False

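# A concrete wiring sketch for SimpleEnvironment (illustrative; the "count"
# state layout below is an assumption, any dict-shaped State works):
#
#     env = SimpleEnvironment(
#         execute_fn=lambda action, state: State(
#             data={"count": state.data.get("count", 0) + 1},
#         ),
#         initial_state=State(data={"count": 0}),
#         terminal_fn=lambda state: state.data["count"] >= 10,
#     )
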
# ---------------------------------------------------------------------------
# Task Result
# ---------------------------------------------------------------------------

class TaskResult:
    """Result of running a task through the Orchestrator."""

    def __init__(self, trajectory: Trajectory, final_state: State):
        self.trajectory = trajectory
        self.final_state = final_state

    @property
    def success(self) -> bool:
        """Was the task successful? (final Φ > 7.0)"""
        phi = self.trajectory.final_phi
        return phi is not None and phi > 7.0

    @property
    def total_steps(self) -> int:
        return len(self.trajectory.steps)

    @property
    def cumulative_reward(self) -> float:
        return self.trajectory.cumulative_reward

    @property
    def final_phi(self) -> float | None:
        return self.trajectory.final_phi

    def summary(self) -> str:
        lines = [
            f"Task: {self.trajectory.task_description}",
            f"Purpose: {self.trajectory.purpose}",
            f"Steps: {self.total_steps}",
            f"Success Rate: {self.trajectory.success_rate:.1%}",
            f"Cumulative Reward: {self.cumulative_reward:.2f}",
            f"Net Delta: {self.trajectory.total_delta:.2f}",
            f"Final Φ: {self.final_phi:.2f}" if self.final_phi is not None else "Final Φ: N/A",
            f"Task Success: {'✓' if self.success else '✗'}",
        ]
        return "\n".join(lines)


# ---------------------------------------------------------------------------
# Orchestrator
# ---------------------------------------------------------------------------

class Orchestrator:
    """
    Main orchestration loop for the self-improving agent.

    Ties together all modules:
    - Actor: Decides actions based on state + memory
    - Purpose Function: Scores state transitions (Φ improvement)
    - Experience Replay: Stores trajectories for future retrieval
    - Heuristic Optimizer: Extracts winning strategies from good trajectories

    Self-improvement happens via the memory feedback loop:
    1. Actor uses heuristics from memory to decide actions
    2. Purpose Function scores each transition
    3. Experience Replay stores the full trajectory
    4. Optimizer distills high-reward trajectories into new heuristics
    5. Actor's memory is updated with new heuristics → better next time

    Args:
        llm: Default LLM backend (used for all modules unless overridden)
        critic_llm: Optional separate LLM for the Purpose Function
        optimizer_llm: Optional separate LLM for the Optimizer
        environment: The environment the agent acts in
        available_actions: Dict of {action_name: description}
        experience_buffer_size: Max trajectories in experience replay
        persistence_dir: Directory for persistent storage (experience replay, heuristics)
        on_step: Optional callback called after each step (for monitoring)
        optimize_every_n_tasks: Run the optimization cycle after this many tasks
    """

    def __init__(
        self,
        llm: LLMBackend,
        environment: Environment,
        available_actions: dict[str, str] | None = None,
        critic_llm: LLMBackend | None = None,
        optimizer_llm: LLMBackend | None = None,
        experience_buffer_size: int = 500,
        persistence_dir: str | None = None,
        on_step: Callable[[TrajectoryStep], None] | None = None,
        optimize_every_n_tasks: int = 1,
    ):
        self.environment = environment
        self.on_step = on_step
        self.optimize_every_n_tasks = optimize_every_n_tasks
        self._tasks_since_optimize = 0

        # Persistence
        replay_path = None
        if persistence_dir:
            import os
            os.makedirs(persistence_dir, exist_ok=True)
            replay_path = f"{persistence_dir}/experience_replay.json"

        # Initialize modules
        self.actor = Actor(
            llm=llm,
            available_actions=available_actions,
        )
        self.purpose_fn = PurposeFunction(
            llm=critic_llm or llm,
        )
        self.experience_replay = ExperienceReplay(
            capacity=experience_buffer_size,
            persistence_path=replay_path,
        )
        self.optimizer = HeuristicOptimizer(
            llm=optimizer_llm or llm,
        )

        # Load existing heuristics into Actor memory
        self._sync_memory()

    # ------------------------------------------------------------------
    # Main Task Loop
    # ------------------------------------------------------------------

    def run_task(
        self,
        purpose: str,
        initial_state: State | None = None,
        max_steps: int = 20,
        early_stop_phi: float = 9.0,
        task_description: str | None = None,
    ) -> TaskResult:
        """
        Run a complete task through the agent loop.

        The loop for each step:
        1. Actor decides an action (with thought + prediction)
        2. Environment executes the action → new state
        3. Purpose Function evaluates: Φ(s_new) vs Φ(s_old)
        4. Trajectory step is recorded
        5. Check termination conditions

        After the task:
        - Trajectory is added to Experience Replay
        - If enough tasks have run, Optimizer extracts new heuristics
        - Actor's memory is updated

        Args:
            purpose: The goal description
            initial_state: Starting state (or environment.reset() if None)
            max_steps: Maximum steps before forced termination
            early_stop_phi: Stop if Φ exceeds this value (goal ~achieved)
            task_description: Optional description (defaults to purpose)
        """
        task_desc = task_description or purpose
        current_state = initial_state or self.environment.reset()

        # Reset Purpose Function per-trajectory stats
        self.purpose_fn.reset_trajectory_stats()

        # Retrieve relevant past experiences for context
        relevant_experiences = self.experience_replay.retrieve(task_desc, top_k=3)
        self._inject_experience_context(relevant_experiences)

        # Create trajectory
        trajectory = Trajectory(
            task_description=task_desc,
            purpose=purpose,
        )

        # History for Actor context
        history: list[dict[str, Any]] = []

        logger.info(f"═══ Starting task: {task_desc} (max {max_steps} steps) ═══")

        for step_idx in range(max_steps):
            step_start = time.time()

            # Step 1: Actor decides
            action = self.actor.decide(
                purpose=purpose,
                current_state=current_state,
                history=history,
            )

            logger.info(
                f"Step {step_idx + 1}: Action={action.name}, "
                f"Thought={action.thought[:100]}..."
            )

            # Check for DONE action
            if action.name.upper() == "DONE":
                logger.info("Agent signaled DONE — ending task")
                # Still score the final state to record final Φ
                final_score = self.purpose_fn.evaluate(
                    state_before=current_state,
                    action=action,
                    state_after=current_state,
                    purpose=purpose,
                )
                trajectory.steps.append(TrajectoryStep(
                    state_before=current_state,
                    action=action,
                    state_after=current_state,
                    score=final_score,
                    step_index=step_idx + 1,
                    wall_time_s=time.time() - step_start,
                ))
                break

            # Step 2: Environment executes
            try:
                new_state = self.environment.execute(action, current_state)
            except Exception as e:
                logger.error(f"Environment execution failed: {e}")
                new_state = State(
                    data={**current_state.data, "_error": str(e)},
                    summary=f"Error: {e}",
                )

            # Step 3: Purpose Function evaluates
            score = self.purpose_fn.evaluate(
                state_before=current_state,
                action=action,
                state_after=new_state,
                purpose=purpose,
            )

            # Step 4: Record step
            step = TrajectoryStep(
                state_before=current_state,
                action=action,
                state_after=new_state,
                score=score,
                step_index=step_idx + 1,
                wall_time_s=time.time() - step_start,
            )
            trajectory.steps.append(step)

            # Update history for Actor context
            history.append({
                "action": f"{action.name}({json.dumps(action.params, default=str)})",
                "result": new_state.describe()[:200],
                "score": f"Δ={score.delta:+.2f}" if score else "N/A",
            })

            # Callback
            if self.on_step:
                self.on_step(step)

            logger.info(
                f"  → Φ: {score.phi_before:.1f} → {score.phi_after:.1f} "
                f"(Δ={score.delta:+.2f}, conf={score.confidence:.2f})"
            )

            # Advance the state before checking termination, so final_state
            # reflects the last executed transition even on an early break
            current_state = new_state

            # Step 5: Check termination
            if score.phi_after >= early_stop_phi:
                logger.info(f"Early stop: Φ={score.phi_after:.1f} ≥ {early_stop_phi}")
                break

            if self.environment.is_terminal(new_state):
                logger.info("Environment signaled terminal state")
                break

        # Post-task processing
        result = TaskResult(trajectory=trajectory, final_state=current_state)
        self._post_task(trajectory, relevant_experiences)

        logger.info(f"═══ Task complete ═══\n{result.summary()}")
        return result

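    # Monitoring sketch (illustrative): any callable taking a TrajectoryStep
    # works as on_step; the print sink and task strings below are assumptions.
    #
    #     def log_step(step: TrajectoryStep) -> None:
    #         print(f"step {step.step_index}: {step.action.name} "
    #               f"Δ={step.score.delta:+.2f}")
    #
    #     orch = Orchestrator(llm=..., environment=env, on_step=log_step)
    #     orch.run_task(purpose="...", max_steps=15)
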
    # ------------------------------------------------------------------
    # Post-Task: Experience Storage + Optimization
    # ------------------------------------------------------------------

    def _post_task(
        self,
        trajectory: Trajectory,
        used_experiences: list[Any],
    ) -> None:
        """Post-task processing: store trajectory, maybe optimize, sync memory."""

        # Store in experience replay
        self.experience_replay.add(trajectory)

        # Update Q-values for retrieved experiences that were used
        task_success = trajectory.success_rate > 0.5
        for exp in used_experiences:
            self.experience_replay.update_q_value(
                exp.id, reward=1.0 if task_success else 0.0
            )

        # Update heuristic usage stats
        for h in self.actor.strategic_memory + self.actor.procedural_memory:
            self.optimizer.update_heuristic_usage(h.id, was_successful=task_success)

        # Periodic optimization
        self._tasks_since_optimize += 1
        if self._tasks_since_optimize >= self.optimize_every_n_tasks:
            self._run_optimization()
            self._tasks_since_optimize = 0

    def _run_optimization(self) -> None:
        """Run the heuristic optimization cycle."""
        logger.info("Running optimization cycle...")

        # Get best trajectories
        top_trajectories = self.experience_replay.get_top_trajectories(
            n=5, min_success_rate=0.3
        )

        if not top_trajectories:
            logger.info("No qualifying trajectories for optimization")
            return

        # Run optimizer
        self.optimizer.optimize(top_trajectories)

        # Sync updated heuristics to Actor memory
        self._sync_memory()

    def _sync_memory(self) -> None:
        """Push current heuristic library to Actor's memory tiers."""
        self.actor.update_strategic_memory(
            self.optimizer.get_heuristics_by_tier(MemoryTier.STRATEGIC)
        )
        self.actor.update_procedural_memory(
            self.optimizer.get_heuristics_by_tier(MemoryTier.PROCEDURAL)
        )

        # Tool memory from heuristics
        tool_heuristics = self.optimizer.get_heuristics_by_tier(MemoryTier.TOOL)
        tool_tips = {h.pattern: h.strategy for h in tool_heuristics}
        if tool_tips:
            self.actor.update_tool_memory(tool_tips)

    def _inject_experience_context(self, experiences: list[Any]) -> None:
        """
        Inject retrieved experience context into Actor's procedural memory.

        This is the CER (arXiv:2506.06698) retrieval-injection pattern:
        relevant past trajectories → distilled into SOPs → added to Actor context.
        """
        injected = []
        for exp in experiences:
            for h in exp.heuristics:
                if h.tier == MemoryTier.PROCEDURAL:
                    injected.append(h)

        if injected:
            current = self.actor.procedural_memory or []
            self.actor.procedural_memory = current + injected
            logger.debug(f"Injected {len(injected)} experience-based SOPs")

    # ------------------------------------------------------------------
    # Inspection / Monitoring
    # ------------------------------------------------------------------

    @property
    def stats(self) -> dict[str, Any]:
        """Get current framework statistics."""
        return {
            "experience_replay": self.experience_replay.stats,
            "heuristic_library_size": len(self.optimizer.heuristic_library),
            "heuristics_by_tier": {
                tier.value: len(self.optimizer.get_heuristics_by_tier(tier))
                for tier in MemoryTier
            },
            "tasks_since_optimize": self._tasks_since_optimize,
        }

    def get_heuristic_report(self) -> str:
        """Human-readable report of all learned heuristics."""
        lines = ["═══ Learned Heuristics Report ═══\n"]

        for tier in MemoryTier:
            heuristics = self.optimizer.get_heuristics_by_tier(tier)
            lines.append(f"\n{'─' * 40}")
            lines.append(f" {tier.value.upper()} ({len(heuristics)} heuristics)")
            lines.append(f"{'─' * 40}")

            for h in heuristics:
                lines.append(f"\n  [{h.id}] Q={h.q_value:.3f} (used {h.times_used}x, "
                             f"{h.times_succeeded} successes)")
                lines.append(f"  Pattern: {h.pattern}")
                lines.append(f"  Strategy: {h.strategy}")
                if h.steps:
                    for i, step in enumerate(h.steps, 1):
                        lines.append(f"    {i}. {step}")

        return "\n".join(lines)
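

# End-to-end sketch: runs the loop against a toy counter environment.
# MockLLMBackend is imported as shown in the module docstring; the counter
# task, "increment" action, and thresholds below are assumptions.
if __name__ == "__main__":
    from purpose_agent import MockLLMBackend

    env = SimpleEnvironment(
        execute_fn=lambda action, state: State(
            data={"count": state.data.get("count", 0) + 1},
        ),
        initial_state=State(data={"count": 0}),
        terminal_fn=lambda state: state.data["count"] >= 5,
    )
    orch = Orchestrator(
        llm=MockLLMBackend(),
        environment=env,
        available_actions={"increment": "Increase the counter by one"},
    )
    result = orch.run_task(purpose="Drive the counter to 5", max_steps=10)
    print(result.summary())
    print(orch.get_heuristic_report())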