"""
End-to-end demo: Self-improving agent solving a simulated maze-search task.

This demo shows:
1. The full Actor → Purpose Function → Experience Replay → Optimizer loop
2. How the agent improves across multiple task attempts
3. The 3-tier memory system in action
4. Anti-reward-hacking safeguards
5. Q-value experience retrieval

No real LLM calls: uses MockLLMBackend with deterministic behavior
so you can see the architecture working end-to-end.
"""
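# Running this file executes the unit tests first and, if they pass, the demo
# (see the __main__ block at the bottom). A minimal invocation sketch, assuming
# this file is saved as demo.py:
#
#   python demo.py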
import json
import logging
import re
import sys
from copy import deepcopy

# Make the purpose_agent package importable (the demo expects it under /app).
sys.path.insert(0, "/app")

from purpose_agent import (
    Action,
    Heuristic,
    MockLLMBackend,
    State,
    PurposeScore,
    MemoryRecord,
)
from purpose_agent.types import MemoryTier
from purpose_agent.orchestrator import (
    Environment,
    Orchestrator,
    SimpleEnvironment,
    TaskResult,
)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("demo")


class TreasureMaze(Environment):
    """
    A simple grid-based maze where the agent must find a treasure.

    The grid is 5x5. The agent starts at (0, 0); the treasure is at (4, 4).
    Actions: move_north, move_south, move_east, move_west, search, pick_up

    The agent gets closer to the goal by moving toward (4, 4) and then
    picking up the treasure when at the right location.
    """

    TREASURE_POS = (4, 4)
    GRID_SIZE = 5

    def execute(self, action: Action, current_state: State) -> State:
        data = deepcopy(current_state.data)
        pos = data.get("position", [0, 0])
        inventory = data.get("inventory", [])
        moves = data.get("moves", 0)

        x, y = pos

        if action.name == "move_north" and y < self.GRID_SIZE - 1:
            y += 1
        elif action.name == "move_south" and y > 0:
            y -= 1
        elif action.name == "move_east" and x < self.GRID_SIZE - 1:
            x += 1
        elif action.name == "move_west" and x > 0:
            x -= 1
        elif action.name == "search":
            if (x, y) == self.TREASURE_POS and "treasure_found" not in data:
                data["treasure_found"] = True
        elif action.name == "pick_up":
            if data.get("treasure_found") and "treasure" not in inventory:
                inventory.append("treasure")
                data["task_complete"] = True

        data["position"] = [x, y]
        data["inventory"] = inventory
        data["moves"] = moves + 1

        # Manhattan distance to the treasure; exposed in the summary for the critic.
        dist = abs(x - self.TREASURE_POS[0]) + abs(y - self.TREASURE_POS[1])

        summary = (
            f"Position: ({x}, {y}), Distance to treasure: {dist}, "
            f"Inventory: {inventory}, Treasure found: {data.get('treasure_found', False)}, "
            f"Moves: {data['moves']}"
        )

        return State(data=data, summary=summary)

    def reset(self) -> State:
        data = {
            "position": [0, 0],
            "inventory": [],
            "moves": 0,
        }
        return State(
            data=data,
            summary="Position: (0, 0), Distance to treasure: 8, Inventory: [], Moves: 0",
        )

    def is_terminal(self, state: State) -> bool:
        return state.data.get("task_complete", False)
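

# Note: the shortest successful episode is 10 steps – 8 moves (the Manhattan
# distance from (0, 0) to (4, 4)) followed by "search" and "pick_up" – so the
# demo's max_steps=15 leaves comfortable headroom.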


def create_mock_llm() -> "tuple[MockLLMBackend, dict]":
    """
    Create a mock LLM that simulates reasonable agent behavior.

    The mock has three modes:
    1. Actor mode: follows a simple heuristic (move toward the treasure)
    2. Critic mode: scores based on the distance delta
    3. Optimizer mode: returns canned heuristics

    Returns the mock plus its mutable script state, so the demo can reset the
    step counter and task number between tasks.
    """
    mock = MockLLMBackend()

    # Script state shared with run_demo(), which resets it before each task.
    state = {"step": 0, "task_num": 0}

    # Route for task 2 onward: axis-first (all east, then all north).
    OPTIMAL_PATH = [
        "move_east", "move_east", "move_east", "move_east",
        "move_north", "move_north", "move_north", "move_north",
        "search", "pick_up",
    ]

    # Route for the first task: a zigzag toward the treasure.
    NAIVE_PATH = [
        "move_north", "move_east", "move_north", "move_east",
        "move_north", "move_east", "move_north", "move_east",
        "search", "pick_up",
    ]
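    # actor_handler (below) replays NAIVE_PATH on the first task and OPTIMAL_PATH on
    # every later task, simulating an agent whose route changes once heuristics have
    # been learned. Both routes are Manhattan-minimal (8 moves + search + pick_up),
    # so the difference between attempts shows up in route shape and in the learned
    # heuristic report rather than in raw step count.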

    def actor_handler(messages):
        """Simulate the actor deciding actions."""
        step = state["step"]
        task = state["task_num"]

        # First attempt follows the naive route; later attempts follow the optimal one.
        path = NAIVE_PATH if task == 0 else OPTIMAL_PATH

        if step < len(path):
            action_name = path[step]
        else:
            action_name = "DONE"

        state["step"] += 1

        return json.dumps({
            "thought": f"Step {step + 1}: I should {action_name} to get closer to the treasure.",
            "action": {"name": action_name, "params": {}},
            "expected_delta": f"Position will change after {action_name}",
        })

    def critic_handler(messages):
        """Simulate the Purpose Function scoring transitions."""
        full_text = " ".join(m.content for m in messages)
        lowered = full_text.lower()

        # Pull the before/after distances out of the state summaries.
        distances = re.findall(r'Distance to treasure: (\d+)', full_text)

        if len(distances) >= 2:
            dist_before = int(distances[0])
            dist_after = int(distances[1])
        elif len(distances) == 1:
            dist_before = int(distances[0])
            dist_after = dist_before
        else:
            dist_before = 8
            dist_after = 8

        # Normalize distance into a score: phi = 10 * (1 - dist / max_dist).
        max_dist = 8
        phi_before = 10.0 * (1 - dist_before / max_dist)
        phi_after = 10.0 * (1 - dist_after / max_dist)

        # Milestone bonuses: treasure found, treasure in inventory, task complete.
        # (Needles must be lowercase to match the lowered text.)
        if "treasure_found: true" in lowered or "treasure found: true" in lowered:
            phi_after = max(phi_after, 8.5)
        if "'treasure'" in full_text or '"treasure"' in full_text:
            if "inventory" in lowered:
                phi_after = max(phi_after, 10.0)
        if "task_complete" in full_text:
            phi_after = 10.0

        return json.dumps({
            "phi_before": round(phi_before, 1),
            "phi_after": round(phi_after, 1),
            "reasoning": (
                f"Distance changed from {dist_before} to {dist_after}. "
                f"{'Moved closer to treasure.' if dist_after < dist_before else 'No net progress.'}"
            ),
            "evidence": f"Position distance: {dist_before} → {dist_after}",
            "confidence": 0.9,
        })
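    # Worked example of the normalization above: a move that cuts the distance
    # from 8 to 7 gives phi_before = 10 * (1 - 8/8) = 0.0 and
    # phi_after = 10 * (1 - 7/8) = 1.25, i.e. a positive delta of +1.25.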

    def optimizer_handler(messages):
        """Simulate the optimizer extracting heuristics."""
        return json.dumps({
            "heuristics": [
                {
                    "tier": "strategic",
                    "pattern": "When navigating a grid toward a {target}",
                    "strategy": "Move along one axis first (e.g., all east), then the other (all north). This is more efficient than zigzagging diagonally.",
                },
                {
                    "tier": "procedural",
                    "pattern": "To reach position ({target_x}, {target_y}) from ({start_x}, {start_y})",
                    "strategy": "Follow the axis-first approach",
                    "steps": [
                        "Move east/west until x matches target_x",
                        "Move north/south until y matches target_y",
                        "Search at the target location",
                        "Pick up any found items",
                    ],
                },
                {
                    "tier": "tool",
                    "pattern": "When using action search",
                    "strategy": "Only use 'search' when at the exact target coordinates. Searching elsewhere wastes a move.",
                },
            ]
        })
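    # The "tier" values mirror the 3-tier memory system from the module docstring:
    # strategic (high-level plans), procedural (step-by-step recipes), and tool
    # (per-action usage rules), presumably mapping onto the MemoryTier enum.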

    # Route prompts to handlers by keyword match on the prompt text.
    mock.register_handler("STATE EVALUATOR", critic_handler)
    mock.register_handler("HEURISTIC EXTRACTOR", optimizer_handler)
    mock.register_handler("HEURISTIC DEDUPLICATOR", optimizer_handler)
    mock.register_handler("goal-directed agent", actor_handler)

    # Fallback structured output for any prompt that matches no handler.
    mock.set_structured_default({
        "phi_before": 5.0,
        "phi_after": 6.0,
        "reasoning": "Default structured output",
        "evidence": "State data changed",
        "confidence": 0.7,
    })

    return mock, state


def run_demo():
    print("=" * 70)
    print(" PURPOSE AGENT – Self-Improving Framework Demo")
    print(" Simulated: Treasure Hunt in a 5×5 Grid")
    print("=" * 70)
    print()

    mock_llm, llm_state = create_mock_llm()
    env = TreasureMaze()

    orch = Orchestrator(
        llm=mock_llm,
        environment=env,
        available_actions={
            "move_north": "Move one cell north (y+1)",
            "move_south": "Move one cell south (y-1)",
            "move_east": "Move one cell east (x+1)",
            "move_west": "Move one cell west (x-1)",
            "search": "Search current cell for items",
            "pick_up": "Pick up a found item",
            "DONE": "Signal task completion",
        },
        optimize_every_n_tasks=1,
        persistence_dir="/app/demo_data",
    )
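    # optimize_every_n_tasks=1 runs the optimizer after every task, so heuristics
    # extracted from Task 1 are already in memory when Task 2 starts.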

    print("\n" + "─" * 70)
    print(" TASK 1: First attempt (naive – no learned heuristics)")
    print("─" * 70)

    llm_state["step"] = 0
    llm_state["task_num"] = 0

    result1 = orch.run_task(
        purpose="Find and collect the treasure hidden at position (4,4) in the maze",
        initial_state=env.reset(),
        max_steps=15,
    )

    print(f"\nTask 1 Result:\n{result1.summary()}")

    print("\n" + "─" * 70)
    print(" LEARNED HEURISTICS (after Task 1)")
    print("─" * 70)
    print(orch.get_heuristic_report())

    print("\n" + "─" * 70)
    print(" TASK 2: Second attempt (with learned heuristics)")
    print("─" * 70)

    llm_state["step"] = 0
    llm_state["task_num"] = 1

    result2 = orch.run_task(
        purpose="Find and collect the treasure hidden at position (4,4) in the maze",
        initial_state=env.reset(),
        max_steps=15,
    )

    print(f"\nTask 2 Result:\n{result2.summary()}")

    print("\n" + "=" * 70)
    print(" PERFORMANCE COMPARISON")
    print("=" * 70)
    print(f"\n {'Metric':<30} {'Task 1':>10} {'Task 2':>10} {'Δ':>10}")
    print(f" {'─' * 60}")
    print(f" {'Steps taken':<30} {result1.total_steps:>10} {result2.total_steps:>10} "
          f"{result2.total_steps - result1.total_steps:>+10}")
    print(f" {'Cumulative reward':<30} {result1.cumulative_reward:>10.2f} {result2.cumulative_reward:>10.2f} "
          f"{result2.cumulative_reward - result1.cumulative_reward:>+10.2f}")
    print(f" {'Success rate':<30} {result1.trajectory.success_rate:>10.1%} {result2.trajectory.success_rate:>10.1%} "
          f"{result2.trajectory.success_rate - result1.trajectory.success_rate:>+10.1%}")
    phi1 = result1.final_phi or 0
    phi2 = result2.final_phi or 0
    print(f" {'Final Φ':<30} {phi1:>10.1f} {phi2:>10.1f} {phi2 - phi1:>+10.1f}")
    print(f" {'Task success':<30} {'✓' if result1.success else '✗':>10} {'✓' if result2.success else '✗':>10}")

    print(f"\n Framework Stats: {json.dumps(orch.stats, indent=4)}")

    print(f"\n Experience Replay: {json.dumps(orch.experience_replay.stats, indent=4)}")

    print("\n" + "=" * 70)
    print(" Demo complete! The agent improved from Task 1 → Task 2")
    print(" by learning heuristics from its first experience.")
    print("=" * 70)

    return result1, result2


def run_tests():
    """Quick unit tests for each module."""
    print("\n" + "=" * 70)
    print(" UNIT TESTS")
    print("=" * 70)

    tests_passed = 0
    tests_total = 0

    def check(name, condition):
        nonlocal tests_passed, tests_total
        tests_total += 1
        if condition:
            tests_passed += 1
            print(f" ✓ {name}")
        else:
            print(f" ✗ {name}")

    # State
    s = State(data={"x": 1, "y": 2}, summary="Test state")
    check("State.describe() returns summary", "Test state" in s.describe())
    check("State.id has expected 12-char length", len(s.id) == 12)

    # Action
    a = Action(name="move", params={"dir": "north"}, thought="go north", expected_delta="y+1")
    check("Action fields", a.name == "move" and a.thought == "go north")

    # PurposeScore
    ps = PurposeScore(phi_before=3.0, phi_after=5.0, delta=2.0,
                      reasoning="improved", evidence="x changed", confidence=0.9)
    check("PurposeScore.improved", ps.improved)
    check("PurposeScore.delta", ps.delta == 2.0)

    # Heuristic Q-value updates
    h = Heuristic(pattern="test", strategy="test", steps=[], tier=MemoryTier.STRATEGIC, q_value=0.5)
    h.update_q_value(1.0, alpha=0.1)
    check("Heuristic Q-value update (reward=1.0)", 0.54 < h.q_value < 0.66)
    h.update_q_value(0.0, alpha=0.1)
    check("Heuristic Q-value update (reward=0.0)", 0.45 < h.q_value < 0.60)

    # ExperienceReplay: store one trajectory, then retrieve it by task description
    from purpose_agent.experience_replay import ExperienceReplay
    from purpose_agent.types import Trajectory, TrajectoryStep

    er = ExperienceReplay(capacity=10)

    traj = Trajectory(task_description="test task", purpose="test purpose")
    traj.steps.append(TrajectoryStep(
        state_before=State(data={"x": 0}),
        action=Action(name="move"),
        state_after=State(data={"x": 1}),
        score=PurposeScore(phi_before=1.0, phi_after=3.0, delta=2.0,
                           reasoning="good", evidence="x: 0→1", confidence=0.8),
    ))
    record = er.add(traj)
    check("ExperienceReplay.add", er.size == 1)
    check("ExperienceReplay.retrieve", len(er.retrieve("test task")) == 1)

    # A positive reward should raise the record's retrieval Q-value.
    old_q = record.retrieval_q_value
    er.update_q_value(record.id, reward=1.0)
    check("ExperienceReplay Q-value update", record.retrieval_q_value > old_q)

    # MockLLMBackend: keyword handlers and the default response
    from purpose_agent.llm_backend import ChatMessage
    mock = MockLLMBackend()
    mock.register_handler("hello", "world")
    result = mock.generate([ChatMessage(role="user", content="hello")])
    check("MockLLM keyword handler", result == "world")

    result = mock.generate([ChatMessage(role="user", content="unknown")])
    check("MockLLM default response", "MockLLM" in result)

    # PurposeFunction: delta should come out as phi_after - phi_before = 2.0
    from purpose_agent.purpose_function import PurposeFunction
    mock2 = MockLLMBackend()
    mock2.set_structured_default({
        "phi_before": 3.0,
        "phi_after": 5.0,
        "reasoning": "The state improved because of the action",
        "evidence": "Position changed from (0,0) to (1,0), reducing distance by 1",
        "confidence": 0.85,
    })
    pf = PurposeFunction(llm=mock2)
    score = pf.evaluate(
        state_before=State(data={"pos": [0, 0]}),
        action=Action(name="move_east"),
        state_after=State(data={"pos": [1, 0]}),
        purpose="Reach position (4,4)",
    )
    check("PurposeFunction computes delta", score.delta == 2.0)
    check("PurposeFunction evidence check", len(score.evidence) > 0)

    # TreasureMaze environment contract
    maze = TreasureMaze()
    s0 = maze.reset()
    check("Environment.reset", s0.data["position"] == [0, 0])
    s1 = maze.execute(Action(name="move_east"), s0)
    check("Environment.execute move_east", s1.data["position"] == [1, 0])
    check("Environment not terminal after one move", not maze.is_terminal(s1))

    print(f"\n Results: {tests_passed}/{tests_total} tests passed")
    return tests_passed == tests_total


if __name__ == "__main__":
    # Run the unit tests first; only launch the demo if everything passes.
    all_passed = run_tests()

    if not all_passed:
        print("\n✗ Some tests failed – check output above")
        sys.exit(1)

    run_demo()