Add purpose_agent/orchestrator.py
purpose_agent/orchestrator.py (ADDED, +554 -0)
"""
Orchestrator - The main loop tying Actor, Purpose Function, Experience Replay,
and Heuristic Optimizer together.

Implements the self-improvement loop:

┌─────────────────────────────────────────────────────────────────┐
│                        ORCHESTRATOR LOOP                        │
│                                                                 │
│   ┌──────────┐  action   ┌─────────────┐   s_new                │
│   │  ACTOR   │ ────────► │ ENVIRONMENT │ ──────────┐            │
│   │(+memory) │           │ (your code) │           │            │
│   └────▲─────┘           └─────────────┘           │            │
│        │                                           ▼            │
│        │ heuristics  ┌────────────────┐       (s, a, s')        │
│        ├──────────── │   OPTIMIZER    │ ◄──────────┐            │
│        │             │ (distillation) │            │            │
│        │             └────────────────┘            │            │
│        │                                           │            │
│        │             ┌────────────────┐ Φ(s)→Φ(s') │            │
│        │             │   PURPOSE FN   │ ◄──────────┤            │
│        │             │ (state critic) │            │            │
│        │             └────────────────┘            │            │
│        │                                           │            │
│        │             ┌────────────────┐            │            │
│        └──────────── │   EXPERIENCE   │ ◄──────────┘            │
│                      │ REPLAY BUFFER  │                         │
│                      └────────────────┘                         │
└─────────────────────────────────────────────────────────────────┘

Usage:
    from purpose_agent import Orchestrator, MockLLMBackend

    # 1. Define your environment
    class MyEnv(Environment):
        def execute(self, action, current_state):
            # ... do something ...
            return new_state

    # 2. Create orchestrator
    orch = Orchestrator(
        llm=MockLLMBackend(),  # or HFInferenceBackend(), OpenAICompatibleBackend()
        environment=MyEnv(),
        available_actions={"search": "Search for items", "move": "Move to location"},
    )

    # 3. Run a task
    result = orch.run_task(
        purpose="Find the hidden treasure in the maze",
        initial_state=State(data={"position": [0, 0], "inventory": []}),
        max_steps=20,
    )

    # 4. The agent self-improves - run more tasks and it gets better
    result2 = orch.run_task(purpose="Find the second treasure", ...)
"""

from __future__ import annotations

import json
import logging
import time
from abc import ABC, abstractmethod
from typing import Any, Callable

from purpose_agent.types import (
    Action,
    Heuristic,
    MemoryTier,
    PurposeScore,
    State,
    Trajectory,
    TrajectoryStep,
)
from purpose_agent.actor import Actor
from purpose_agent.purpose_function import PurposeFunction
from purpose_agent.experience_replay import ExperienceReplay
from purpose_agent.optimizer import HeuristicOptimizer
from purpose_agent.llm_backend import LLMBackend

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Environment Interface
# ---------------------------------------------------------------------------

class Environment(ABC):
    """
    Abstract environment that the Agent acts in.

    Implement this for your specific use case:
    - Web navigation: wrap a browser automation tool
    - Code generation: wrap a code executor
    - Game: wrap a game API
    - Simulated: mock environment for testing

    The Orchestrator calls execute() with the agent's action and current state,
    and expects a new state back.
    """

    @abstractmethod
    def execute(self, action: Action, current_state: State) -> State:
        """
        Execute an action in the environment and return the resulting state.

        Args:
            action: The action to execute
            current_state: The state before the action

        Returns:
            The new state after the action
        """
        ...

    def reset(self) -> State:
        """
        Reset the environment and return the initial state.
        Override if your environment needs resetting between tasks.
        """
        return State(data={})

    def is_terminal(self, state: State) -> bool:
        """
        Check if the state is terminal (task complete or impossible to continue).
        Override for environments with natural termination conditions.
        """
        return False
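

# Illustrative sketch (not part of the diff): a minimal concrete Environment,
# assuming Action exposes .name/.params and State wraps a plain dict, exactly
# as the code in this module uses them.
class CounterEnv(Environment):
    """Toy environment: the agent increments a counter toward a target."""

    def execute(self, action: Action, current_state: State) -> State:
        count = current_state.data.get("count", 0)
        if action.name == "increment":
            # params is a dict; "by" is a hypothetical parameter for this toy.
            count += int(action.params.get("by", 1))
        return State(data={**current_state.data, "count": count})

    def reset(self) -> State:
        return State(data={"count": 0})

    def is_terminal(self, state: State) -> bool:
        return state.data.get("count", 0) >= 10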


class SimpleEnvironment(Environment):
    """
    A simple environment backed by a user-provided execute function.

    Usage:
        env = SimpleEnvironment(
            execute_fn=lambda action, state: new_state,
            initial_state=State(data={"x": 0}),
        )
    """

    def __init__(
        self,
        execute_fn: Callable[[Action, State], State],
        initial_state: State | None = None,
        terminal_fn: Callable[[State], bool] | None = None,
    ):
        self._execute_fn = execute_fn
        self._initial_state = initial_state or State(data={})
        self._terminal_fn = terminal_fn

    def execute(self, action: Action, current_state: State) -> State:
        return self._execute_fn(action, current_state)

    def reset(self) -> State:
        return self._initial_state

    def is_terminal(self, state: State) -> bool:
        if self._terminal_fn:
            return self._terminal_fn(state)
        return False
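

# Illustrative sketch (not part of the diff): completing the docstring's
# partial lambda with a concrete execute_fn and terminal_fn.
def _echo_step(action: Action, state: State) -> State:
    # Record the last action name; a real execute_fn would apply the
    # action's semantics to the state.
    return State(data={**state.data, "last_action": action.name})

demo_env = SimpleEnvironment(
    execute_fn=_echo_step,
    initial_state=State(data={"x": 0}),
    terminal_fn=lambda s: s.data.get("last_action") == "stop",
)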


# ---------------------------------------------------------------------------
# Task Result
# ---------------------------------------------------------------------------

class TaskResult:
    """Result of running a task through the Orchestrator."""

    def __init__(self, trajectory: Trajectory, final_state: State):
        self.trajectory = trajectory
        self.final_state = final_state

    @property
    def success(self) -> bool:
        """Was the task successful? (final Φ > 7.0)"""
        phi = self.trajectory.final_phi
        return phi is not None and phi > 7.0

    @property
    def total_steps(self) -> int:
        return len(self.trajectory.steps)

    @property
    def cumulative_reward(self) -> float:
        return self.trajectory.cumulative_reward

    @property
    def final_phi(self) -> float | None:
        return self.trajectory.final_phi

    def summary(self) -> str:
        lines = [
            f"Task: {self.trajectory.task_description}",
            f"Purpose: {self.trajectory.purpose}",
            f"Steps: {self.total_steps}",
            f"Success Rate: {self.trajectory.success_rate:.1%}",
            f"Cumulative Reward: {self.cumulative_reward:.2f}",
            f"Net Delta: {self.trajectory.total_delta:.2f}",
            f"Final Φ: {self.final_phi:.2f}" if self.final_phi is not None else "Final Φ: N/A",
            f"Task Success: {'✓' if self.success else '✗'}",
        ]
        return "\n".join(lines)


# ---------------------------------------------------------------------------
# Orchestrator
# ---------------------------------------------------------------------------

class Orchestrator:
    """
    Main orchestration loop for the self-improving agent.

    Ties together all modules:
    - Actor: Decides actions based on state + memory
    - Purpose Function: Scores state transitions (Φ improvement)
    - Experience Replay: Stores trajectories for future retrieval
    - Heuristic Optimizer: Extracts winning strategies from good trajectories

    Self-improvement happens via the memory feedback loop:
    1. Actor uses heuristics from memory to decide actions
    2. Purpose Function scores each transition
    3. Experience Replay stores the full trajectory
    4. Optimizer distills high-reward trajectories into new heuristics
    5. Actor's memory is updated with new heuristics → better next time

    Args:
        llm: Default LLM backend (used for all modules unless overridden)
        critic_llm: Optional separate LLM for the Purpose Function
        optimizer_llm: Optional separate LLM for the Optimizer
        environment: The environment the agent acts in
        available_actions: Dict of {action_name: description}
        experience_buffer_size: Max trajectories in experience replay
        persistence_dir: Directory for persistent storage (experience replay, heuristics)
        on_step: Optional callback called after each step (for monitoring)
    """

    def __init__(
        self,
        llm: LLMBackend,
        environment: Environment,
        available_actions: dict[str, str] | None = None,
        critic_llm: LLMBackend | None = None,
        optimizer_llm: LLMBackend | None = None,
        experience_buffer_size: int = 500,
        persistence_dir: str | None = None,
        on_step: Callable[[TrajectoryStep], None] | None = None,
        optimize_every_n_tasks: int = 1,
    ):
        self.environment = environment
        self.on_step = on_step
        self.optimize_every_n_tasks = optimize_every_n_tasks
        self._tasks_since_optimize = 0

        # Persistence
        replay_path = None
        if persistence_dir:
            import os
            os.makedirs(persistence_dir, exist_ok=True)
            replay_path = f"{persistence_dir}/experience_replay.json"

        # Initialize modules
        self.actor = Actor(
            llm=llm,
            available_actions=available_actions,
        )
        self.purpose_fn = PurposeFunction(
            llm=critic_llm or llm,
        )
        self.experience_replay = ExperienceReplay(
            capacity=experience_buffer_size,
            persistence_path=replay_path,
        )
        self.optimizer = HeuristicOptimizer(
            llm=optimizer_llm or llm,
        )

        # Load existing heuristics into Actor memory
        self._sync_memory()

    # ------------------------------------------------------------------
    # Main Task Loop
    # ------------------------------------------------------------------

    def run_task(
        self,
        purpose: str,
        initial_state: State | None = None,
        max_steps: int = 20,
        early_stop_phi: float = 9.0,
        task_description: str | None = None,
    ) -> TaskResult:
        """
        Run a complete task through the agent loop.

        The loop for each step:
        1. Actor decides an action (with thought + prediction)
        2. Environment executes the action → new state
        3. Purpose Function evaluates: Φ(s_new) vs Φ(s_old)
        4. Trajectory step is recorded
        5. Check termination conditions

        After the task:
        - Trajectory is added to Experience Replay
        - If enough tasks have run, Optimizer extracts new heuristics
        - Actor's memory is updated

        Args:
            purpose: The goal description
            initial_state: Starting state (or environment.reset() if None)
            max_steps: Maximum steps before forced termination
            early_stop_phi: Stop if Φ exceeds this value (goal ~achieved)
            task_description: Optional description (defaults to purpose)
        """
        task_desc = task_description or purpose
        current_state = initial_state or self.environment.reset()

        # Reset Purpose Function per-trajectory stats
        self.purpose_fn.reset_trajectory_stats()

        # Retrieve relevant past experiences for context
        relevant_experiences = self.experience_replay.retrieve(task_desc, top_k=3)
        self._inject_experience_context(relevant_experiences)

        # Create trajectory
        trajectory = Trajectory(
            task_description=task_desc,
            purpose=purpose,
        )

        # History for Actor context
        history: list[dict[str, Any]] = []

        logger.info(f"═══ Starting task: {task_desc} (max {max_steps} steps) ═══")

        for step_idx in range(max_steps):
            step_start = time.time()

            # Step 1: Actor decides
            action = self.actor.decide(
                purpose=purpose,
                current_state=current_state,
                history=history,
            )

            logger.info(
                f"Step {step_idx + 1}: Action={action.name}, "
                f"Thought={action.thought[:100]}..."
            )

            # Check for DONE action
            if action.name.upper() == "DONE":
                logger.info("Agent signaled DONE → ending task")
                # Still score the final state to record final Φ
                final_score = self.purpose_fn.evaluate(
                    state_before=current_state,
                    action=action,
                    state_after=current_state,
                    purpose=purpose,
                )
                trajectory.steps.append(TrajectoryStep(
                    state_before=current_state,
                    action=action,
                    state_after=current_state,
                    score=final_score,
                    step_index=step_idx + 1,
                    wall_time_s=time.time() - step_start,
                ))
                break

            # Step 2: Environment executes
            try:
                new_state = self.environment.execute(action, current_state)
            except Exception as e:
                logger.error(f"Environment execution failed: {e}")
                new_state = State(
                    data={**current_state.data, "_error": str(e)},
                    summary=f"Error: {e}",
                )

            # Step 3: Purpose Function evaluates
            score = self.purpose_fn.evaluate(
                state_before=current_state,
                action=action,
                state_after=new_state,
                purpose=purpose,
            )

            # Step 4: Record step
            step = TrajectoryStep(
                state_before=current_state,
                action=action,
                state_after=new_state,
                score=score,
                step_index=step_idx + 1,
                wall_time_s=time.time() - step_start,
            )
            trajectory.steps.append(step)

            # Update history for Actor context
            history.append({
                "action": f"{action.name}({json.dumps(action.params, default=str)})",
                "result": new_state.describe()[:200],
                "score": f"Δ={score.delta:+.2f}" if score else "N/A",
            })

            # Callback
            if self.on_step:
                self.on_step(step)

            logger.info(
                f"  → Φ: {score.phi_before:.1f} → {score.phi_after:.1f} "
                f"(Δ={score.delta:+.2f}, conf={score.confidence:.2f})"
            )

            # Step 5: Check termination
            if score.phi_after >= early_stop_phi:
                logger.info(f"Early stop: Φ={score.phi_after:.1f} ≥ {early_stop_phi}")
                break

            if self.environment.is_terminal(new_state):
                logger.info("Environment signaled terminal state")
                break

            current_state = new_state

        # Post-task processing
        result = TaskResult(trajectory=trajectory, final_state=current_state)
        self._post_task(trajectory, relevant_experiences)

        logger.info(f"═══ Task complete ═══\n{result.summary()}")
        return result
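
    # Illustrative usage (not part of the diff): a monitoring callback
    # suitable for the on_step hook, e.g.
    #
    #     def log_step(step: TrajectoryStep) -> None:
    #         print(f"step {step.step_index}: {step.action.name} "
    #               f"(Δ={step.score.delta:+.2f})")
    #
    #     orch = Orchestrator(llm=..., environment=..., on_step=log_step)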

    # ------------------------------------------------------------------
    # Post-Task: Experience Storage + Optimization
    # ------------------------------------------------------------------

    def _post_task(
        self,
        trajectory: Trajectory,
        used_experiences: list[Any],
    ) -> None:
        """Post-task processing: store trajectory, maybe optimize, sync memory."""

        # Store in experience replay
        record = self.experience_replay.add(trajectory)

        # Update Q-values for retrieved experiences that were used
        task_success = trajectory.success_rate > 0.5
        for exp in used_experiences:
            self.experience_replay.update_q_value(
                exp.id, reward=1.0 if task_success else 0.0
            )

        # Update heuristic usage stats
        for h in self.actor.strategic_memory + self.actor.procedural_memory:
            self.optimizer.update_heuristic_usage(h.id, was_successful=task_success)

        # Periodic optimization
        self._tasks_since_optimize += 1
        if self._tasks_since_optimize >= self.optimize_every_n_tasks:
            self._run_optimization()
            self._tasks_since_optimize = 0

    def _run_optimization(self) -> None:
        """Run the heuristic optimization cycle."""
        logger.info("Running optimization cycle...")

        # Get best trajectories
        top_trajectories = self.experience_replay.get_top_trajectories(
            n=5, min_success_rate=0.3
        )

        if not top_trajectories:
            logger.info("No qualifying trajectories for optimization")
            return

        # Run optimizer
        self.optimizer.optimize(top_trajectories)

        # Sync updated heuristics to Actor memory
        self._sync_memory()

    def _sync_memory(self) -> None:
        """Push current heuristic library to Actor's memory tiers."""
        self.actor.update_strategic_memory(
            self.optimizer.get_heuristics_by_tier(MemoryTier.STRATEGIC)
        )
        self.actor.update_procedural_memory(
            self.optimizer.get_heuristics_by_tier(MemoryTier.PROCEDURAL)
        )

        # Tool memory from heuristics
        tool_heuristics = self.optimizer.get_heuristics_by_tier(MemoryTier.TOOL)
        tool_tips = {h.pattern: h.strategy for h in tool_heuristics}
        if tool_tips:
            self.actor.update_tool_memory(tool_tips)

    def _inject_experience_context(self, experiences: list[Any]) -> None:
        """
        Inject retrieved experience context into Actor's procedural memory.

        This is the CER (arxiv:2506.06698) retrieval injection pattern:
        relevant past trajectories → distilled into SOPs → added to Actor context.
        """
        injected = []
        for exp in experiences:
            for h in exp.heuristics:
                if h.tier == MemoryTier.PROCEDURAL:
                    injected.append(h)

        if injected:
            current = self.actor.procedural_memory or []
            self.actor.procedural_memory = current + injected
            logger.debug(f"Injected {len(injected)} experience-based SOPs")

    # ------------------------------------------------------------------
    # Inspection / Monitoring
    # ------------------------------------------------------------------

    @property
    def stats(self) -> dict[str, Any]:
        """Get current framework statistics."""
        return {
            "experience_replay": self.experience_replay.stats,
            "heuristic_library_size": len(self.optimizer.heuristic_library),
            "heuristics_by_tier": {
                tier.value: len(self.optimizer.get_heuristics_by_tier(tier))
                for tier in MemoryTier
            },
            "tasks_since_optimize": self._tasks_since_optimize,
        }

    def get_heuristic_report(self) -> str:
        """Human-readable report of all learned heuristics."""
        lines = ["═══ Learned Heuristics Report ═══\n"]

        for tier in MemoryTier:
            heuristics = self.optimizer.get_heuristics_by_tier(tier)
            lines.append(f"\n{'─' * 40}")
            lines.append(f"  {tier.value.upper()} ({len(heuristics)} heuristics)")
            lines.append(f"{'─' * 40}")

            for h in heuristics:
                lines.append(f"\n  [{h.id}] Q={h.q_value:.3f} (used {h.times_used}x, "
                             f"{h.times_succeeded} successes)")
                lines.append(f"    Pattern: {h.pattern}")
                lines.append(f"    Strategy: {h.strategy}")
                if h.steps:
                    for i, step in enumerate(h.steps, 1):
                        lines.append(f"      {i}. {step}")

        return "\n".join(lines)
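
A minimal end-to-end sketch of the loop above (not part of the diff), assuming
the MockLLMBackend named in the module docstring takes no required arguments
and that State and Action behave as this module itself uses them:

    from purpose_agent import Orchestrator, MockLLMBackend
    from purpose_agent.orchestrator import SimpleEnvironment
    from purpose_agent.types import Action, State

    def step_fn(action: Action, state: State) -> State:
        # Toy dynamics: "move" nudges x toward the goal at x == 5.
        x = state.data.get("x", 0)
        if action.name == "move":
            x += 1
        return State(data={"x": x})

    orch = Orchestrator(
        llm=MockLLMBackend(),
        environment=SimpleEnvironment(
            execute_fn=step_fn,
            initial_state=State(data={"x": 0}),
            terminal_fn=lambda s: s.data.get("x", 0) >= 5,
        ),
        available_actions={"move": "Move one step toward the goal"},
    )

    result = orch.run_task(purpose="Reach x == 5", max_steps=10)
    print(result.summary())
    print(orch.stats)                  # buffer size, heuristics per tier, ...
    print(orch.get_heuristic_report())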