Rohan03 committed on
Commit 44d1ab6 · verified · 1 Parent(s): 3ca0d80

Add demo.py

Files changed (1)
  1. demo.py +488 -0
demo.py ADDED
#!/usr/bin/env python3
"""
End-to-end demo: Self-improving agent solving a simulated maze-search task.

This demo shows:
1. The full Actor → Purpose Function → Experience Replay → Optimizer loop
2. How the agent improves across multiple task attempts
3. The 3-tier memory system in action
4. Anti-reward-hacking safeguards
5. Q-value experience retrieval

No real LLM calls; MockLLMBackend provides deterministic behavior
so you can see the architecture working end-to-end.
"""

import json
import logging
import re
import sys
from copy import deepcopy

# Make the purpose_agent package importable when running from /app
sys.path.insert(0, "/app")

from purpose_agent import (
    Action,
    Heuristic,
    MockLLMBackend,
    State,
    PurposeScore,
    MemoryRecord,
)
from purpose_agent.types import MemoryTier
from purpose_agent.orchestrator import (
    Environment,
    Orchestrator,
    SimpleEnvironment,
    TaskResult,
)

# ---------------------------------------------------------------------------
# Configure logging
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("demo")


# ---------------------------------------------------------------------------
# Simulated Environment: Treasure Hunt Maze
# ---------------------------------------------------------------------------

class TreasureMaze(Environment):
    """
    A simple grid-based maze where the agent must find a treasure.

    Grid is 5x5. Agent starts at (0,0). Treasure is at (4,4).
    Actions: move_north, move_south, move_east, move_west, search, pick_up

    The agent gets closer to the goal by moving toward (4,4) and then
    picking up the treasure when at the right location.
    """

    TREASURE_POS = (4, 4)
    GRID_SIZE = 5

    def execute(self, action: Action, current_state: State) -> State:
        data = deepcopy(current_state.data)
        pos = data.get("position", [0, 0])
        inventory = data.get("inventory", [])
        moves = data.get("moves", 0)

        x, y = pos

        if action.name == "move_north" and y < self.GRID_SIZE - 1:
            y += 1
        elif action.name == "move_south" and y > 0:
            y -= 1
        elif action.name == "move_east" and x < self.GRID_SIZE - 1:
            x += 1
        elif action.name == "move_west" and x > 0:
            x -= 1
        elif action.name == "search":
            if (x, y) == self.TREASURE_POS and "treasure_found" not in data:
                data["treasure_found"] = True
        elif action.name == "pick_up":
            if data.get("treasure_found") and "treasure" not in inventory:
                inventory.append("treasure")
                data["task_complete"] = True

        data["position"] = [x, y]
        data["inventory"] = inventory
        data["moves"] = moves + 1

        # Compute distance to treasure for summary
        dist = abs(x - self.TREASURE_POS[0]) + abs(y - self.TREASURE_POS[1])

        summary = (
            f"Position: ({x}, {y}), Distance to treasure: {dist}, "
            f"Inventory: {inventory}, Treasure found: {data.get('treasure_found', False)}, "
            f"Moves: {data['moves']}"
        )

        return State(data=data, summary=summary)

    def reset(self) -> State:
        data = {
            "position": [0, 0],
            "inventory": [],
            "moves": 0,
        }
        return State(
            data=data,
            summary="Position: (0, 0), Distance to treasure: 8, Inventory: [], Moves: 0",
        )

    def is_terminal(self, state: State) -> bool:
        return state.data.get("task_complete", False)

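# Quick sanity check for the maze (a sketch, mirroring Test 8 below; kept as
# a comment so nothing executes at import time -- paste into a REPL to try):
#
#     maze = TreasureMaze()
#     s = maze.reset()
#     s = maze.execute(Action(name="move_east"), s)
#     s.data["position"]        # -> [1, 0]
#     maze.is_terminal(s)       # -> False
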
# ---------------------------------------------------------------------------
# Mock LLM: Deterministic Agent Behavior for Testing
# ---------------------------------------------------------------------------

def create_mock_llm() -> tuple[MockLLMBackend, dict]:
    """
    Create a mock LLM that simulates reasonable agent behavior.

    The mock has three modes:
    1. Actor mode: Follows a simple heuristic (move toward treasure)
    2. Critic mode: Scores based on distance delta
    3. Optimizer mode: Returns canned heuristics

    Returns the mock plus a small state dict the demo uses to steer it.
    """
    mock = MockLLMBackend()

    # Track call count for the actor to cycle through actions
    state = {"step": 0, "task_num": 0}

    # Optimal path: east x4, north x4, then search and pick_up
    OPTIMAL_PATH = [
        "move_east", "move_east", "move_east", "move_east",
        "move_north", "move_north", "move_north", "move_north",
        "search", "pick_up",
    ]

    # Sub-optimal path (first attempt; the agent hasn't learned yet):
    # zigzags and wastes a search far from the treasure
    NAIVE_PATH = [
        "move_north", "search",
        "move_east", "move_north", "move_east",
        "move_north", "move_east", "move_north", "move_east",
        "search", "pick_up",
    ]

    def actor_handler(messages):
        """Simulate actor deciding actions."""
        step = state["step"]
        task = state["task_num"]

        # First task: use naive path; later tasks: use optimal path (learned!)
        path = NAIVE_PATH if task == 0 else OPTIMAL_PATH

        if step < len(path):
            action_name = path[step]
        else:
            action_name = "DONE"

        state["step"] += 1

        return json.dumps({
            "thought": f"Step {step + 1}: I should {action_name} to get closer to the treasure.",
            "action": {"name": action_name, "params": {}},
            "expected_delta": f"Position will change after {action_name}",
        })

    def critic_handler(messages):
        """Simulate the Purpose Function scoring transitions."""
        full_text = " ".join(m.content for m in messages)
        lowered = full_text.lower()

        # Extract distances from the state descriptions
        distances = re.findall(r"Distance to treasure: (\d+)", full_text)

        if len(distances) >= 2:
            dist_before = int(distances[0])
            dist_after = int(distances[1])
        elif len(distances) == 1:
            dist_before = int(distances[0])
            dist_after = dist_before
        else:
            dist_before = 8
            dist_after = 8

        # Convert distance to Φ score (0-10 scale, closer = higher)
        max_dist = 8  # Manhattan distance from (0,0) to (4,4)
        phi_before = 10.0 * (1 - dist_before / max_dist)
        phi_after = 10.0 * (1 - dist_after / max_dist)

        # Check for treasure found / picked up; we match against the
        # lowercased text, so the needles must be lowercase as well
        if "treasure_found: true" in lowered or "treasure found: true" in lowered:
            phi_after = max(phi_after, 8.5)
        if "'treasure'" in full_text or '"treasure"' in full_text:
            if "inventory" in lowered:
                phi_after = max(phi_after, 10.0)
        if "task_complete" in full_text:
            phi_after = 10.0

        return json.dumps({
            "phi_before": round(phi_before, 1),
            "phi_after": round(phi_after, 1),
            "reasoning": (
                f"Distance changed from {dist_before} to {dist_after}. "
                f"{'Moved closer to treasure.' if dist_after < dist_before else 'No net progress.'}"
            ),
            "evidence": f"Position distance: {dist_before} → {dist_after}",
            "confidence": 0.9,
        })
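
    # Worked example of the Φ mapping above (illustration only): at the start
    # dist_before = 8, so phi_before = 10 * (1 - 8/8) = 0.0; one step east
    # gives dist_after = 7 and phi_after = 10 * (1 - 7/8) = 1.25, reported as
    # 1.2 after rounding. Each cell of progress toward the treasure is
    # therefore worth a delta of 1.25.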

    def optimizer_handler(messages):
        """Simulate the optimizer extracting heuristics."""
        return json.dumps({
            "heuristics": [
                {
                    "tier": "strategic",
                    "pattern": "When navigating a grid toward a {target}",
                    "strategy": "Move along one axis first (e.g., all east), then the other (all north). This is more efficient than zigzagging diagonally.",
                },
                {
                    "tier": "procedural",
                    "pattern": "To reach position ({target_x}, {target_y}) from ({start_x}, {start_y})",
                    "strategy": "Follow the axis-first approach",
                    "steps": [
                        "Move east/west until x matches target_x",
                        "Move north/south until y matches target_y",
                        "Search at the target location",
                        "Pick up any found items",
                    ],
                },
                {
                    "tier": "tool",
                    "pattern": "When using action search",
                    "strategy": "Only use 'search' when at the exact target coordinates. Searching elsewhere wastes a move.",
                },
            ]
        })

    # Register handlers based on keywords in the prompt
    mock.register_handler("STATE EVALUATOR", critic_handler)  # Purpose Function
    mock.register_handler("HEURISTIC EXTRACTOR", optimizer_handler)  # Optimizer
    mock.register_handler("HEURISTIC DEDUPLICATOR", optimizer_handler)  # Merge
    mock.register_handler("goal-directed agent", actor_handler)  # Actor

    # Structured output default for Purpose Function
    mock.set_structured_default({
        "phi_before": 5.0,
        "phi_after": 6.0,
        "reasoning": "Default structured output",
        "evidence": "State data changed",
        "confidence": 0.7,
    })

    return mock, state
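

# How the keyword routing above plays out (a sketch; this assumes
# MockLLMBackend.register_handler matches substrings of the prompt text,
# which is how the handlers are keyed here):
#
#     mock, llm_state = create_mock_llm()
#     from purpose_agent.llm_backend import ChatMessage
#     reply = mock.generate([ChatMessage(
#         role="user", content="You are a goal-directed agent; pick an action.")])
#     json.loads(reply)["action"]["name"]   # -> "move_north" (NAIVE_PATH[0])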

# ---------------------------------------------------------------------------
# Demo Runner
# ---------------------------------------------------------------------------

def run_demo():
    print("=" * 70)
    print(" PURPOSE AGENT: Self-Improving Framework Demo")
    print(" Simulated: Treasure Hunt in a 5×5 Grid")
    print("=" * 70)
    print()

    # Create mock LLM and environment
    mock_llm, llm_state = create_mock_llm()
    env = TreasureMaze()

    # Create orchestrator
    orch = Orchestrator(
        llm=mock_llm,
        environment=env,
        available_actions={
            "move_north": "Move one cell north (y+1)",
            "move_south": "Move one cell south (y-1)",
            "move_east": "Move one cell east (x+1)",
            "move_west": "Move one cell west (x-1)",
            "search": "Search current cell for items",
            "pick_up": "Pick up a found item",
            "DONE": "Signal task completion",
        },
        optimize_every_n_tasks=1,  # Optimize after every task
        persistence_dir="/app/demo_data",
    )
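
    # A note on the cadence above (explanatory comment, not a behavior
    # change): optimize_every_n_tasks=1 distills heuristics after every task,
    # which is what lets Task 2 benefit from Task 1; larger values would
    # batch several trajectories per optimization pass.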

    # ─── Task 1: Naive attempt (no learned heuristics) ─────────────────
    print("\n" + "─" * 70)
    print(" TASK 1: First attempt (naive, no learned heuristics)")
    print("─" * 70)

    llm_state["step"] = 0
    llm_state["task_num"] = 0

    result1 = orch.run_task(
        purpose="Find and collect the treasure hidden at position (4,4) in the maze",
        initial_state=env.reset(),
        max_steps=15,
    )

    print(f"\n📊 Task 1 Result:\n{result1.summary()}")

    # ─── Check what the agent learned ──────────────────────────────────
    print("\n" + "─" * 70)
    print(" LEARNED HEURISTICS (after Task 1)")
    print("─" * 70)
    print(orch.get_heuristic_report())

    # ─── Task 2: Improved attempt (with learned heuristics) ────────────
    print("\n" + "─" * 70)
    print(" TASK 2: Second attempt (with learned heuristics)")
    print("─" * 70)

    llm_state["step"] = 0
    llm_state["task_num"] = 1  # Switch to the optimal path

    result2 = orch.run_task(
        purpose="Find and collect the treasure hidden at position (4,4) in the maze",
        initial_state=env.reset(),
        max_steps=15,
    )

    print(f"\n📊 Task 2 Result:\n{result2.summary()}")

    # ─── Compare performance ───────────────────────────────────────────
    print("\n" + "=" * 70)
    print(" PERFORMANCE COMPARISON")
    print("=" * 70)
    print(f"\n {'Metric':<30} {'Task 1':>10} {'Task 2':>10} {'Δ':>10}")
    print(f" {'─' * 60}")
    print(f" {'Steps taken':<30} {result1.total_steps:>10} {result2.total_steps:>10} "
          f"{result2.total_steps - result1.total_steps:>+10}")
    print(f" {'Cumulative reward':<30} {result1.cumulative_reward:>10.2f} {result2.cumulative_reward:>10.2f} "
          f"{result2.cumulative_reward - result1.cumulative_reward:>+10.2f}")
    print(f" {'Success rate':<30} {result1.trajectory.success_rate:>10.1%} {result2.trajectory.success_rate:>10.1%} "
          f"{result2.trajectory.success_rate - result1.trajectory.success_rate:>+10.1%}")
    phi1 = result1.final_phi or 0
    phi2 = result2.final_phi or 0
    print(f" {'Final Φ':<30} {phi1:>10.1f} {phi2:>10.1f} {phi2 - phi1:>+10.1f}")
    print(f" {'Task success':<30} {'✓' if result1.success else '✗':>10} {'✓' if result2.success else '✗':>10}")

    # ─── Framework stats ──────────────────────────────────────────────
    print(f"\n Framework Stats: {json.dumps(orch.stats, indent=4)}")

    # ─── Experience Replay stats ──────────────────────────────────────
    print(f"\n Experience Replay: {json.dumps(orch.experience_replay.stats, indent=4)}")

    print("\n" + "=" * 70)
    print(" Demo complete! The agent improved from Task 1 → Task 2")
    print(" by learning heuristics from its first experience.")
    print("=" * 70)

    return result1, result2


# ---------------------------------------------------------------------------
# Unit Tests
# ---------------------------------------------------------------------------

def run_tests():
    """Quick unit tests for each module."""
    print("\n" + "=" * 70)
    print(" UNIT TESTS")
    print("=" * 70)

    tests_passed = 0
    tests_total = 0

    def check(name, condition):
        nonlocal tests_passed, tests_total
        tests_total += 1
        if condition:
            tests_passed += 1
            print(f" ✓ {name}")
        else:
            print(f" ✗ {name}")

    # Test 1: State
    s = State(data={"x": 1, "y": 2}, summary="Test state")
    check("State.describe() returns summary", "Test state" in s.describe())
    check("State.id has expected length", len(s.id) == 12)

    # Test 2: Action
    a = Action(name="move", params={"dir": "north"}, thought="go north", expected_delta="y+1")
    check("Action fields", a.name == "move" and a.thought == "go north")

    # Test 3: PurposeScore
    ps = PurposeScore(phi_before=3.0, phi_after=5.0, delta=2.0,
                      reasoning="improved", evidence="x changed", confidence=0.9)
    check("PurposeScore.improved", ps.improved)
    check("PurposeScore.delta", ps.delta == 2.0)

    # Test 4: Heuristic Q-value update
    h = Heuristic(pattern="test", strategy="test", steps=[], tier=MemoryTier.STRATEGIC, q_value=0.5)
    h.update_q_value(1.0, alpha=0.1)
    check("Heuristic Q-value update (reward=1.0)", 0.54 < h.q_value < 0.66)
    h.update_q_value(0.0, alpha=0.1)
    check("Heuristic Q-value update (reward=0.0)", 0.45 < h.q_value < 0.60)

    # Test 5: Experience Replay
    from purpose_agent.experience_replay import ExperienceReplay
    from purpose_agent.types import Trajectory, TrajectoryStep

    er = ExperienceReplay(capacity=10)

    traj = Trajectory(task_description="test task", purpose="test purpose")
    traj.steps.append(TrajectoryStep(
        state_before=State(data={"x": 0}),
        action=Action(name="move"),
        state_after=State(data={"x": 1}),
        score=PurposeScore(phi_before=1.0, phi_after=3.0, delta=2.0,
                           reasoning="good", evidence="x: 0→1", confidence=0.8),
    ))
    record = er.add(traj)
    check("ExperienceReplay.add", er.size == 1)
    check("ExperienceReplay.retrieve", len(er.retrieve("test task")) == 1)

    # Test Q-value update on the stored record
    old_q = record.retrieval_q_value
    er.update_q_value(record.id, reward=1.0)
    check("ExperienceReplay Q-value update", record.retrieval_q_value > old_q)

    # Test 6: Mock LLM
    from purpose_agent.llm_backend import ChatMessage
    mock = MockLLMBackend()
    mock.register_handler("hello", "world")
    result = mock.generate([ChatMessage(role="user", content="hello")])
    check("MockLLM keyword handler", result == "world")

    result = mock.generate([ChatMessage(role="user", content="unknown")])
    check("MockLLM default response", "MockLLM" in result)

    # Test 7: Purpose Function safeguards
    from purpose_agent.purpose_function import PurposeFunction
    mock2 = MockLLMBackend()
    mock2.set_structured_default({
        "phi_before": 3.0,
        "phi_after": 5.0,
        "reasoning": "The state improved because of the action",
        "evidence": "Position changed from (0,0) to (1,0), reducing distance by 1",
        "confidence": 0.85,
    })
    pf = PurposeFunction(llm=mock2)
    score = pf.evaluate(
        state_before=State(data={"pos": [0, 0]}),
        action=Action(name="move_east"),
        state_after=State(data={"pos": [1, 0]}),
        purpose="Reach position (4,4)",
    )
    check("PurposeFunction computes delta", score.delta == 2.0)
    check("PurposeFunction evidence check", len(score.evidence) > 0)

    # Test 8: Environment
    maze = TreasureMaze()
    s0 = maze.reset()
    check("Environment.reset", s0.data["position"] == [0, 0])
    s1 = maze.execute(Action(name="move_east"), s0)
    check("Environment.execute move_east", s1.data["position"] == [1, 0])
    check("Environment not terminal after one move", not maze.is_terminal(s1))

    print(f"\n Results: {tests_passed}/{tests_total} tests passed")
    return tests_passed == tests_total


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Run tests first
    all_passed = run_tests()

    if not all_passed:
        print("\n⚠ Some tests failed; check the output above")
        sys.exit(1)

    # Run demo
    run_demo()
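
# Usage note (the /app paths above assume the container layout):
#
#     $ python demo.py
#
# The unit tests run first; the two-task demo runs only if they all pass.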