"""
End-to-end demo: Self-improving agent solving a simulated maze-search task.

This demo shows:
1. The full Actor → Purpose Function → Experience Replay → Optimizer loop
2. How the agent improves across multiple task attempts
3. The 3-tier memory system in action
4. Anti-reward-hacking safeguards
5. Q-value experience retrieval

No real LLM calls: uses MockLLMBackend with deterministic behavior
so you can see the architecture working end-to-end.
"""
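# Running this file executes the unit tests first and, if they pass, the demo
# (see the __main__ block at the bottom). A minimal invocation sketch, assuming
# this file is saved as demo.py:
#
#   python demo.py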
import json
import logging
import re
import sys
from copy import deepcopy

# Make the purpose_agent package importable (the demo expects it under /app).
sys.path.insert(0, "/app")

from purpose_agent import (
    Action,
    Heuristic,
    MockLLMBackend,
    State,
    PurposeScore,
    MemoryRecord,
)
from purpose_agent.types import MemoryTier
from purpose_agent.orchestrator import (
    Environment,
    Orchestrator,
    SimpleEnvironment,
    TaskResult,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("demo")


class TreasureMaze(Environment):
    """
    A simple grid-based maze where the agent must find a treasure.

    The grid is 5x5. The agent starts at (0, 0); the treasure is at (4, 4).
    Actions: move_north, move_south, move_east, move_west, search, pick_up

    The agent gets closer to the goal by moving toward (4, 4) and then
    picking up the treasure when at the right location.
    """

    TREASURE_POS = (4, 4)
    GRID_SIZE = 5

    def execute(self, action: Action, current_state: State) -> State:
        data = deepcopy(current_state.data)
        pos = data.get("position", [0, 0])
        inventory = data.get("inventory", [])
        moves = data.get("moves", 0)

        x, y = pos

        if action.name == "move_north" and y < self.GRID_SIZE - 1:
            y += 1
        elif action.name == "move_south" and y > 0:
            y -= 1
        elif action.name == "move_east" and x < self.GRID_SIZE - 1:
            x += 1
        elif action.name == "move_west" and x > 0:
            x -= 1
        elif action.name == "search":
            if (x, y) == self.TREASURE_POS and "treasure_found" not in data:
                data["treasure_found"] = True
        elif action.name == "pick_up":
            if data.get("treasure_found") and "treasure" not in inventory:
                inventory.append("treasure")
                data["task_complete"] = True

        data["position"] = [x, y]
        data["inventory"] = inventory
        data["moves"] = moves + 1

        # Manhattan distance to the treasure; exposed in the summary for the critic.
        dist = abs(x - self.TREASURE_POS[0]) + abs(y - self.TREASURE_POS[1])

        summary = (
            f"Position: ({x}, {y}), Distance to treasure: {dist}, "
            f"Inventory: {inventory}, Treasure found: {data.get('treasure_found', False)}, "
            f"Moves: {data['moves']}"
        )

        return State(data=data, summary=summary)

    def reset(self) -> State:
        data = {
            "position": [0, 0],
            "inventory": [],
            "moves": 0,
        }
        return State(
            data=data,
            summary="Position: (0, 0), Distance to treasure: 8, Inventory: [], Moves: 0",
        )

    def is_terminal(self, state: State) -> bool:
        return state.data.get("task_complete", False)
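

# Note: the shortest successful episode is 10 steps – 8 moves (the Manhattan
# distance from (0, 0) to (4, 4)) followed by "search" and "pick_up" – so the
# demo's max_steps=15 leaves comfortable headroom.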


def create_mock_llm() -> "tuple[MockLLMBackend, dict]":
    """
    Create a mock LLM that simulates reasonable agent behavior.

    The mock has three modes:
    1. Actor mode: follows a simple heuristic (move toward the treasure)
    2. Critic mode: scores based on the distance delta
    3. Optimizer mode: returns canned heuristics

    Returns the mock plus its mutable script state, so the demo can reset the
    step counter and task number between tasks.
    """
    mock = MockLLMBackend()

    # Script state shared with run_demo(), which resets it before each task.
    state = {"step": 0, "task_num": 0}

    # Route for task 2 onward: axis-first (all east, then all north).
    OPTIMAL_PATH = [
        "move_east", "move_east", "move_east", "move_east",
        "move_north", "move_north", "move_north", "move_north",
        "search", "pick_up",
    ]

    # Route for the first task: a zigzag toward the treasure.
    NAIVE_PATH = [
        "move_north", "move_east", "move_north", "move_east",
        "move_north", "move_east", "move_north", "move_east",
        "search", "pick_up",
    ]
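    # actor_handler (below) replays NAIVE_PATH on the first task and OPTIMAL_PATH on
    # every later task, simulating an agent whose route changes once heuristics have
    # been learned. Both routes are Manhattan-minimal (8 moves + search + pick_up),
    # so the difference between attempts shows up in route shape and in the learned
    # heuristic report rather than in raw step count.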

    def actor_handler(messages):
        """Simulate the actor deciding actions."""
        step = state["step"]
        task = state["task_num"]

        # First attempt follows the naive route; later attempts follow the optimal one.
        path = NAIVE_PATH if task == 0 else OPTIMAL_PATH

        if step < len(path):
            action_name = path[step]
        else:
            action_name = "DONE"

        state["step"] += 1

        return json.dumps({
            "thought": f"Step {step + 1}: I should {action_name} to get closer to the treasure.",
            "action": {"name": action_name, "params": {}},
            "expected_delta": f"Position will change after {action_name}",
        })

    def critic_handler(messages):
        """Simulate the Purpose Function scoring transitions."""
        full_text = " ".join(m.content for m in messages)
        lowered = full_text.lower()

        # Pull the before/after distances out of the state summaries.
        distances = re.findall(r'Distance to treasure: (\d+)', full_text)

        if len(distances) >= 2:
            dist_before = int(distances[0])
            dist_after = int(distances[1])
        elif len(distances) == 1:
            dist_before = int(distances[0])
            dist_after = dist_before
        else:
            dist_before = 8
            dist_after = 8

        # Normalize distance into a score: phi = 10 * (1 - dist / max_dist).
        max_dist = 8
        phi_before = 10.0 * (1 - dist_before / max_dist)
        phi_after = 10.0 * (1 - dist_after / max_dist)

        # Milestone bonuses: treasure found, treasure in inventory, task complete.
        # (Needles must be lowercase to match the lowered text.)
        if "treasure_found: true" in lowered or "treasure found: true" in lowered:
            phi_after = max(phi_after, 8.5)
        if "'treasure'" in full_text or '"treasure"' in full_text:
            if "inventory" in lowered:
                phi_after = max(phi_after, 10.0)
        if "task_complete" in full_text:
            phi_after = 10.0

        return json.dumps({
            "phi_before": round(phi_before, 1),
            "phi_after": round(phi_after, 1),
            "reasoning": (
                f"Distance changed from {dist_before} to {dist_after}. "
                f"{'Moved closer to treasure.' if dist_after < dist_before else 'No net progress.'}"
            ),
            "evidence": f"Position distance: {dist_before} → {dist_after}",
            "confidence": 0.9,
        })
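    # Worked example of the normalization above: a move that cuts the distance
    # from 8 to 7 gives phi_before = 10 * (1 - 8/8) = 0.0 and
    # phi_after = 10 * (1 - 7/8) = 1.25, i.e. a positive delta of +1.25.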

    def optimizer_handler(messages):
        """Simulate the optimizer extracting heuristics."""
        return json.dumps({
            "heuristics": [
                {
                    "tier": "strategic",
                    "pattern": "When navigating a grid toward a {target}",
                    "strategy": "Move along one axis first (e.g., all east), then the other (all north). This is more efficient than zigzagging diagonally.",
                },
                {
                    "tier": "procedural",
                    "pattern": "To reach position ({target_x}, {target_y}) from ({start_x}, {start_y})",
                    "strategy": "Follow the axis-first approach",
                    "steps": [
                        "Move east/west until x matches target_x",
                        "Move north/south until y matches target_y",
                        "Search at the target location",
                        "Pick up any found items",
                    ],
                },
                {
                    "tier": "tool",
                    "pattern": "When using action search",
                    "strategy": "Only use 'search' when at the exact target coordinates. Searching elsewhere wastes a move.",
                },
            ]
        })
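    # The "tier" values mirror the 3-tier memory system from the module docstring:
    # strategic (high-level plans), procedural (step-by-step recipes), and tool
    # (per-action usage rules), presumably mapping onto the MemoryTier enum.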

    # Route prompts to handlers by keyword match on the prompt text.
    mock.register_handler("STATE EVALUATOR", critic_handler)
    mock.register_handler("HEURISTIC EXTRACTOR", optimizer_handler)
    mock.register_handler("HEURISTIC DEDUPLICATOR", optimizer_handler)
    mock.register_handler("goal-directed agent", actor_handler)

    # Fallback structured output for any prompt that matches no handler.
    mock.set_structured_default({
        "phi_before": 5.0,
        "phi_after": 6.0,
        "reasoning": "Default structured output",
        "evidence": "State data changed",
        "confidence": 0.7,
    })

    return mock, state


def run_demo():
    print("=" * 70)
    print(" PURPOSE AGENT – Self-Improving Framework Demo")
    print(" Simulated: Treasure Hunt in a 5×5 Grid")
    print("=" * 70)
    print()

    mock_llm, llm_state = create_mock_llm()
    env = TreasureMaze()

    orch = Orchestrator(
        llm=mock_llm,
        environment=env,
        available_actions={
            "move_north": "Move one cell north (y+1)",
            "move_south": "Move one cell south (y-1)",
            "move_east": "Move one cell east (x+1)",
            "move_west": "Move one cell west (x-1)",
            "search": "Search current cell for items",
            "pick_up": "Pick up a found item",
            "DONE": "Signal task completion",
        },
        optimize_every_n_tasks=1,
        persistence_dir="/app/demo_data",
    )
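    # optimize_every_n_tasks=1 runs the optimizer after every task, so heuristics
    # extracted from Task 1 are already in memory when Task 2 starts.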

    print("\n" + "─" * 70)
    print(" TASK 1: First attempt (naive – no learned heuristics)")
    print("─" * 70)

    llm_state["step"] = 0
    llm_state["task_num"] = 0

    result1 = orch.run_task(
        purpose="Find and collect the treasure hidden at position (4,4) in the maze",
        initial_state=env.reset(),
        max_steps=15,
    )

    print(f"\nTask 1 Result:\n{result1.summary()}")

    print("\n" + "─" * 70)
    print(" LEARNED HEURISTICS (after Task 1)")
    print("─" * 70)
    print(orch.get_heuristic_report())

    print("\n" + "─" * 70)
    print(" TASK 2: Second attempt (with learned heuristics)")
    print("─" * 70)

    llm_state["step"] = 0
    llm_state["task_num"] = 1

    result2 = orch.run_task(
        purpose="Find and collect the treasure hidden at position (4,4) in the maze",
        initial_state=env.reset(),
        max_steps=15,
    )

    print(f"\nTask 2 Result:\n{result2.summary()}")

    print("\n" + "=" * 70)
    print(" PERFORMANCE COMPARISON")
    print("=" * 70)
    print(f"\n {'Metric':<30} {'Task 1':>10} {'Task 2':>10} {'Δ':>10}")
    print(f" {'─' * 60}")
    print(f" {'Steps taken':<30} {result1.total_steps:>10} {result2.total_steps:>10} "
          f"{result2.total_steps - result1.total_steps:>+10}")
    print(f" {'Cumulative reward':<30} {result1.cumulative_reward:>10.2f} {result2.cumulative_reward:>10.2f} "
          f"{result2.cumulative_reward - result1.cumulative_reward:>+10.2f}")
    print(f" {'Success rate':<30} {result1.trajectory.success_rate:>10.1%} {result2.trajectory.success_rate:>10.1%} "
          f"{result2.trajectory.success_rate - result1.trajectory.success_rate:>+10.1%}")
    phi1 = result1.final_phi or 0
    phi2 = result2.final_phi or 0
    print(f" {'Final Φ':<30} {phi1:>10.1f} {phi2:>10.1f} {phi2 - phi1:>+10.1f}")
    print(f" {'Task success':<30} {'✓' if result1.success else '✗':>10} {'✓' if result2.success else '✗':>10}")

    print(f"\n Framework Stats: {json.dumps(orch.stats, indent=4)}")

    print(f"\n Experience Replay: {json.dumps(orch.experience_replay.stats, indent=4)}")

    print("\n" + "=" * 70)
    print(" Demo complete! The agent improved from Task 1 → Task 2")
    print(" by learning heuristics from its first experience.")
    print("=" * 70)

    return result1, result2


def run_tests():
    """Quick unit tests for each module."""
    print("\n" + "=" * 70)
    print(" UNIT TESTS")
    print("=" * 70)

    tests_passed = 0
    tests_total = 0

    def check(name, condition):
        nonlocal tests_passed, tests_total
        tests_total += 1
        if condition:
            tests_passed += 1
            print(f" ✓ {name}")
        else:
            print(f" ✗ {name}")

    # State
    s = State(data={"x": 1, "y": 2}, summary="Test state")
    check("State.describe() returns summary", "Test state" in s.describe())
    check("State.id has expected 12-char length", len(s.id) == 12)

    # Action
    a = Action(name="move", params={"dir": "north"}, thought="go north", expected_delta="y+1")
    check("Action fields", a.name == "move" and a.thought == "go north")

    # PurposeScore
    ps = PurposeScore(phi_before=3.0, phi_after=5.0, delta=2.0,
                      reasoning="improved", evidence="x changed", confidence=0.9)
    check("PurposeScore.improved", ps.improved)
    check("PurposeScore.delta", ps.delta == 2.0)

    # Heuristic Q-value updates
    h = Heuristic(pattern="test", strategy="test", steps=[], tier=MemoryTier.STRATEGIC, q_value=0.5)
    h.update_q_value(1.0, alpha=0.1)
    check("Heuristic Q-value update (reward=1.0)", 0.54 < h.q_value < 0.66)
    h.update_q_value(0.0, alpha=0.1)
    check("Heuristic Q-value update (reward=0.0)", 0.45 < h.q_value < 0.60)

    # ExperienceReplay: store one trajectory, then retrieve it by task description
    from purpose_agent.experience_replay import ExperienceReplay
    from purpose_agent.types import Trajectory, TrajectoryStep

    er = ExperienceReplay(capacity=10)

    traj = Trajectory(task_description="test task", purpose="test purpose")
    traj.steps.append(TrajectoryStep(
        state_before=State(data={"x": 0}),
        action=Action(name="move"),
        state_after=State(data={"x": 1}),
        score=PurposeScore(phi_before=1.0, phi_after=3.0, delta=2.0,
                           reasoning="good", evidence="x: 0→1", confidence=0.8),
    ))
    record = er.add(traj)
    check("ExperienceReplay.add", er.size == 1)
    check("ExperienceReplay.retrieve", len(er.retrieve("test task")) == 1)

    # A positive reward should raise the record's retrieval Q-value.
    old_q = record.retrieval_q_value
    er.update_q_value(record.id, reward=1.0)
    check("ExperienceReplay Q-value update", record.retrieval_q_value > old_q)

    # MockLLMBackend: keyword handlers and the default response
    from purpose_agent.llm_backend import ChatMessage
    mock = MockLLMBackend()
    mock.register_handler("hello", "world")
    result = mock.generate([ChatMessage(role="user", content="hello")])
    check("MockLLM keyword handler", result == "world")

    result = mock.generate([ChatMessage(role="user", content="unknown")])
    check("MockLLM default response", "MockLLM" in result)

    # PurposeFunction: delta should come out as phi_after - phi_before = 2.0
    from purpose_agent.purpose_function import PurposeFunction
    mock2 = MockLLMBackend()
    mock2.set_structured_default({
        "phi_before": 3.0,
        "phi_after": 5.0,
        "reasoning": "The state improved because of the action",
        "evidence": "Position changed from (0,0) to (1,0), reducing distance by 1",
        "confidence": 0.85,
    })
    pf = PurposeFunction(llm=mock2)
    score = pf.evaluate(
        state_before=State(data={"pos": [0, 0]}),
        action=Action(name="move_east"),
        state_after=State(data={"pos": [1, 0]}),
        purpose="Reach position (4,4)",
    )
    check("PurposeFunction computes delta", score.delta == 2.0)
    check("PurposeFunction evidence check", len(score.evidence) > 0)

    # TreasureMaze environment contract
    maze = TreasureMaze()
    s0 = maze.reset()
    check("Environment.reset", s0.data["position"] == [0, 0])
    s1 = maze.execute(Action(name="move_east"), s0)
    check("Environment.execute move_east", s1.data["position"] == [1, 0])
    check("Environment not terminal after one move", not maze.is_terminal(s1))

    print(f"\n Results: {tests_passed}/{tests_total} tests passed")
    return tests_passed == tests_total


if __name__ == "__main__":
    # Run the unit tests first; only launch the demo if everything passes.
    all_passed = run_tests()

    if not all_passed:
        print("\n✗ Some tests failed – check output above")
        sys.exit(1)

    run_demo()