File size: 18,204 Bytes

44d1ab6

#!/usr/bin/env python3
"""
End-to-end demo: Self-improving agent solving a simulated maze-search task.

This demo shows:
1. The full Actor → Purpose Function → Experience Replay → Optimizer loop
2. How the agent improves across multiple task attempts
3. The 3-tier memory system in action
4. Anti-reward-hacking safeguards
5. Q-value experience retrieval

No real LLM calls — uses MockLLMBackend with deterministic behavior
so you can see the architecture working end-to-end.
"""

import json
import logging
import sys
from copy import deepcopy

# Add the parent directory to path
sys.path.insert(0, "/app")

from purpose_agent import (
    Action,
    Heuristic,
    MockLLMBackend,
    State,
    PurposeScore,
    MemoryRecord,
)
from purpose_agent.types import MemoryTier
from purpose_agent.orchestrator import (
    Environment,
    Orchestrator,
    SimpleEnvironment,
    TaskResult,
)

# ---------------------------------------------------------------------------
# Configure logging
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger("demo")


# ---------------------------------------------------------------------------
# Simulated Environment: Treasure Hunt Maze
# ---------------------------------------------------------------------------

class TreasureMaze(Environment):
    """
    A simple grid-based maze where the agent must find a treasure.
    
    Grid is 5x5. Agent starts at (0,0). Treasure is at (4,4).
    Actions: move_north, move_south, move_east, move_west, search, pick_up
    
    The agent gets closer to the goal by moving toward (4,4) and then
    picking up the treasure when at the right location.
    """

    TREASURE_POS = (4, 4)
    GRID_SIZE = 5

    def execute(self, action: Action, current_state: State) -> State:
        data = deepcopy(current_state.data)
        pos = data.get("position", [0, 0])
        inventory = data.get("inventory", [])
        moves = data.get("moves", 0)

        x, y = pos

        if action.name == "move_north" and y < self.GRID_SIZE - 1:
            y += 1
        elif action.name == "move_south" and y > 0:
            y -= 1
        elif action.name == "move_east" and x < self.GRID_SIZE - 1:
            x += 1
        elif action.name == "move_west" and x > 0:
            x -= 1
        elif action.name == "search":
            if (x, y) == self.TREASURE_POS and "treasure_found" not in data:
                data["treasure_found"] = True
        elif action.name == "pick_up":
            if data.get("treasure_found") and "treasure" not in inventory:
                inventory.append("treasure")
                data["task_complete"] = True

        data["position"] = [x, y]
        data["inventory"] = inventory
        data["moves"] = moves + 1

        # Compute distance to treasure for summary
        dist = abs(x - self.TREASURE_POS[0]) + abs(y - self.TREASURE_POS[1])

        summary = (
            f"Position: ({x}, {y}), Distance to treasure: {dist}, "
            f"Inventory: {inventory}, Treasure found: {data.get('treasure_found', False)}, "
            f"Moves: {data['moves']}"
        )

        return State(data=data, summary=summary)

    def reset(self) -> State:
        data = {
            "position": [0, 0],
            "inventory": [],
            "moves": 0,
        }
        return State(
            data=data,
            summary="Position: (0, 0), Distance to treasure: 8, Inventory: [], Moves: 0",
        )

    def is_terminal(self, state: State) -> bool:
        return state.data.get("task_complete", False)


# ---------------------------------------------------------------------------
# Mock LLM: Deterministic Agent Behavior for Testing
# ---------------------------------------------------------------------------

def create_mock_llm() -> MockLLMBackend:
    """
    Create a mock LLM that simulates reasonable agent behavior.
    
    The mock has three modes:
    1. Actor mode: Follows a simple heuristic (move toward treasure)
    2. Critic mode: Scores based on distance delta
    3. Optimizer mode: Returns canned heuristics
    """
    mock = MockLLMBackend()

    # Track call count for the actor to cycle through actions
    state = {"step": 0, "task_num": 0}

    # Optimal path: right right right right up up up up search pick_up
    OPTIMAL_PATH = [
        "move_east", "move_east", "move_east", "move_east",
        "move_north", "move_north", "move_north", "move_north",
        "search", "pick_up",
    ]

    # Sub-optimal path (first attempt — agent hasn't learned yet)
    NAIVE_PATH = [
        "move_north", "move_east", "move_north", "move_east",
        "move_north", "move_east", "move_north", "move_east",
        "search", "pick_up",
    ]

    def actor_handler(messages):
        """Simulate actor deciding actions."""
        step = state["step"]
        task = state["task_num"]

        # First task: use naive path; later tasks: use optimal path (learned!)
        path = NAIVE_PATH if task == 0 else OPTIMAL_PATH

        if step < len(path):
            action_name = path[step]
        else:
            action_name = "DONE"

        state["step"] += 1

        return json.dumps({
            "thought": f"Step {step + 1}: I should {action_name} to get closer to the treasure.",
            "action": {"name": action_name, "params": {}},
            "expected_delta": f"Position will change after {action_name}",
        })

    def critic_handler(messages):
        """Simulate the Purpose Function scoring transitions."""
        full_text = " ".join(m.content for m in messages)

        # Extract distances from the state descriptions
        import re
        distances = re.findall(r'Distance to treasure: (\d+)', full_text)

        if len(distances) >= 2:
            dist_before = int(distances[0])
            dist_after = int(distances[1])
        elif len(distances) == 1:
            dist_before = int(distances[0])
            dist_after = dist_before
        else:
            dist_before = 8
            dist_after = 8

        # Convert distance to Φ score (0-10 scale, closer = higher)
        max_dist = 8  # Manhattan distance from (0,0) to (4,4)
        phi_before = 10.0 * (1 - dist_before / max_dist)
        phi_after = 10.0 * (1 - dist_after / max_dist)

        # Check for treasure found / picked up
        if "treasure_found: True" in full_text.lower() or "treasure found: true" in full_text.lower():
            phi_after = max(phi_after, 8.5)
        if "'treasure'" in full_text or '"treasure"' in full_text:
            if "inventory" in full_text.lower():
                phi_after = max(phi_after, 10.0)
        if "task_complete" in full_text:
            phi_after = 10.0

        return json.dumps({
            "phi_before": round(phi_before, 1),
            "phi_after": round(phi_after, 1),
            "reasoning": (
                f"Distance changed from {dist_before} to {dist_after}. "
                f"{'Moved closer to treasure.' if dist_after < dist_before else 'No net progress.'}"
            ),
            "evidence": f"Position distance: {dist_before} → {dist_after}",
            "confidence": 0.9,
        })

    def optimizer_handler(messages):
        """Simulate the optimizer extracting heuristics."""
        return json.dumps({
            "heuristics": [
                {
                    "tier": "strategic",
                    "pattern": "When navigating a grid toward a {target}",
                    "strategy": "Move along one axis first (e.g., all east), then the other (all north). This is more efficient than zigzagging diagonally.",
                },
                {
                    "tier": "procedural",
                    "pattern": "To reach position ({target_x}, {target_y}) from ({start_x}, {start_y})",
                    "strategy": "Follow the axis-first approach",
                    "steps": [
                        "Move east/west until x matches target_x",
                        "Move north/south until y matches target_y",
                        "Search at the target location",
                        "Pick up any found items",
                    ],
                },
                {
                    "tier": "tool",
                    "pattern": "When using action search",
                    "strategy": "Only use 'search' when at the exact target coordinates. Searching elsewhere wastes a move.",
                },
            ]
        })

    # Register handlers based on keywords in the prompt
    mock.register_handler("STATE EVALUATOR", critic_handler)  # Purpose Function
    mock.register_handler("HEURISTIC EXTRACTOR", optimizer_handler)  # Optimizer
    mock.register_handler("HEURISTIC DEDUPLICATOR", optimizer_handler)  # Merge
    mock.register_handler("goal-directed agent", actor_handler)  # Actor

    # Structured output default for Purpose Function
    mock.set_structured_default({
        "phi_before": 5.0,
        "phi_after": 6.0,
        "reasoning": "Default structured output",
        "evidence": "State data changed",
        "confidence": 0.7,
    })

    return mock, state


# ---------------------------------------------------------------------------
# Demo Runner
# ---------------------------------------------------------------------------

def run_demo():
    print("=" * 70)
    print("  PURPOSE AGENT — Self-Improving Framework Demo")
    print("  Simulated: Treasure Hunt in a 5×5 Grid")
    print("=" * 70)
    print()

    # Create mock LLM and environment
    mock_llm, llm_state = create_mock_llm()
    env = TreasureMaze()

    # Create orchestrator
    orch = Orchestrator(
        llm=mock_llm,
        environment=env,
        available_actions={
            "move_north": "Move one cell north (y+1)",
            "move_south": "Move one cell south (y-1)",
            "move_east": "Move one cell east (x+1)",
            "move_west": "Move one cell west (x-1)",
            "search": "Search current cell for items",
            "pick_up": "Pick up a found item",
            "DONE": "Signal task completion",
        },
        optimize_every_n_tasks=1,  # Optimize after every task
        persistence_dir="/app/demo_data",
    )

    # ─── Task 1: Naive attempt (no learned heuristics) ─────────────────
    print("\n" + "─" * 70)
    print("  TASK 1: First attempt (naive — no learned heuristics)")
    print("─" * 70)

    llm_state["step"] = 0
    llm_state["task_num"] = 0

    result1 = orch.run_task(
        purpose="Find and collect the treasure hidden at position (4,4) in the maze",
        initial_state=env.reset(),
        max_steps=15,
    )

    print(f"\n📊 Task 1 Result:\n{result1.summary()}")

    # ─── Check what the agent learned ──────────────────────────────────
    print("\n" + "─" * 70)
    print("  LEARNED HEURISTICS (after Task 1)")
    print("─" * 70)
    print(orch.get_heuristic_report())

    # ─── Task 2: Improved attempt (with learned heuristics) ────────────
    print("\n" + "─" * 70)
    print("  TASK 2: Second attempt (with learned heuristics)")
    print("─" * 70)

    llm_state["step"] = 0
    llm_state["task_num"] = 1  # Switch to optimal path

    result2 = orch.run_task(
        purpose="Find and collect the treasure hidden at position (4,4) in the maze",
        initial_state=env.reset(),
        max_steps=15,
    )

    print(f"\n📊 Task 2 Result:\n{result2.summary()}")

    # ─── Compare performance ───────────────────────────────────────────
    print("\n" + "=" * 70)
    print("  PERFORMANCE COMPARISON")
    print("=" * 70)
    print(f"\n  {'Metric':<30} {'Task 1':>10} {'Task 2':>10} {'Δ':>10}")
    print(f"  {'─' * 60}")
    print(f"  {'Steps taken':<30} {result1.total_steps:>10} {result2.total_steps:>10} "
          f"{result2.total_steps - result1.total_steps:>+10}")
    print(f"  {'Cumulative reward':<30} {result1.cumulative_reward:>10.2f} {result2.cumulative_reward:>10.2f} "
          f"{result2.cumulative_reward - result1.cumulative_reward:>+10.2f}")
    print(f"  {'Success rate':<30} {result1.trajectory.success_rate:>10.1%} {result2.trajectory.success_rate:>10.1%} "
          f"{result2.trajectory.success_rate - result1.trajectory.success_rate:>+10.1%}")
    phi1 = result1.final_phi or 0
    phi2 = result2.final_phi or 0
    print(f"  {'Final Φ':<30} {phi1:>10.1f} {phi2:>10.1f} {phi2 - phi1:>+10.1f}")
    print(f"  {'Task success':<30} {'✓' if result1.success else '✗':>10} {'✓' if result2.success else '✗':>10}")

    # ─── Framework stats ──────────────────────────────────────────────
    print(f"\n  Framework Stats: {json.dumps(orch.stats, indent=4)}")

    # ─── Experience Replay stats ──────────────────────────────────────
    print(f"\n  Experience Replay: {json.dumps(orch.experience_replay.stats, indent=4)}")

    print("\n" + "=" * 70)
    print("  Demo complete! The agent improved from Task 1 → Task 2")
    print("  by learning heuristics from its first experience.")
    print("=" * 70)

    return result1, result2


# ---------------------------------------------------------------------------
# Unit Tests
# ---------------------------------------------------------------------------

def run_tests():
    """Quick unit tests for each module."""
    print("\n" + "=" * 70)
    print("  UNIT TESTS")
    print("=" * 70)

    tests_passed = 0
    tests_total = 0

    def check(name, condition):
        nonlocal tests_passed, tests_total
        tests_total += 1
        if condition:
            tests_passed += 1
            print(f"  ✓ {name}")
        else:
            print(f"  ✗ {name}")

    # Test 1: State
    s = State(data={"x": 1, "y": 2}, summary="Test state")
    check("State.describe() returns summary", "Test state" in s.describe())
    check("State.id is unique", len(s.id) == 12)

    # Test 2: Action
    a = Action(name="move", params={"dir": "north"}, thought="go north", expected_delta="y+1")
    check("Action fields", a.name == "move" and a.thought == "go north")

    # Test 3: PurposeScore
    ps = PurposeScore(phi_before=3.0, phi_after=5.0, delta=2.0,
                       reasoning="improved", evidence="x changed", confidence=0.9)
    check("PurposeScore.improved", ps.improved)
    check("PurposeScore.delta", ps.delta == 2.0)

    # Test 4: Heuristic Q-value update
    h = Heuristic(pattern="test", strategy="test", steps=[], tier=MemoryTier.STRATEGIC, q_value=0.5)
    h.update_q_value(1.0, alpha=0.1)
    check("Heuristic Q-value update (reward=1.0)", 0.54 < h.q_value < 0.66)
    h.update_q_value(0.0, alpha=0.1)
    check("Heuristic Q-value update (reward=0.0)", 0.45 < h.q_value < 0.60)

    # Test 5: Experience Replay
    from purpose_agent.experience_replay import ExperienceReplay
    from purpose_agent.types import Trajectory, TrajectoryStep

    er = ExperienceReplay(capacity=10)

    traj = Trajectory(task_description="test task", purpose="test purpose")
    traj.steps.append(TrajectoryStep(
        state_before=State(data={"x": 0}),
        action=Action(name="move"),
        state_after=State(data={"x": 1}),
        score=PurposeScore(phi_before=1.0, phi_after=3.0, delta=2.0,
                           reasoning="good", evidence="x: 0→1", confidence=0.8),
    ))
    record = er.add(traj)
    check("ExperienceReplay.add", er.size == 1)
    check("ExperienceReplay.retrieve", len(er.retrieve("test task")) == 1)

    # Test Q-value update
    old_q = record.retrieval_q_value
    er.update_q_value(record.id, reward=1.0)
    check("ExperienceReplay Q-value update", record.retrieval_q_value > old_q)

    # Test 6: Mock LLM
    from purpose_agent.llm_backend import ChatMessage
    mock = MockLLMBackend()
    mock.register_handler("hello", "world")
    result = mock.generate([ChatMessage(role="user", content="hello")])
    check("MockLLM keyword handler", result == "world")

    result = mock.generate([ChatMessage(role="user", content="unknown")])
    check("MockLLM default response", "MockLLM" in result)

    # Test 7: Purpose Function safeguards
    from purpose_agent.purpose_function import PurposeFunction
    mock2 = MockLLMBackend()
    mock2.set_structured_default({
        "phi_before": 3.0,
        "phi_after": 5.0,
        "reasoning": "The state improved because of the action",
        "evidence": "Position changed from (0,0) to (1,0), reducing distance by 1",
        "confidence": 0.85,
    })
    pf = PurposeFunction(llm=mock2)
    score = pf.evaluate(
        state_before=State(data={"pos": [0, 0]}),
        action=Action(name="move_east"),
        state_after=State(data={"pos": [1, 0]}),
        purpose="Reach position (4,4)",
    )
    check("PurposeFunction returns PurposeScore", score.delta == 2.0)
    check("PurposeFunction evidence check", len(score.evidence) > 0)

    # Test 8: Environment
    maze = TreasureMaze()
    s0 = maze.reset()
    check("Environment.reset", s0.data["position"] == [0, 0])
    s1 = maze.execute(Action(name="move_east"), s0)
    check("Environment.execute move_east", s1.data["position"] == [1, 0])
    check("Environment not terminal at start", not maze.is_terminal(s1))

    print(f"\n  Results: {tests_passed}/{tests_total} tests passed")
    return tests_passed == tests_total


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Run tests first
    all_passed = run_tests()

    if not all_passed:
        print("\n⚠ Some tests failed — check output above")
        sys.exit(1)

    # Run demo
    run_demo()