"""
Orchestrator β€” Main loop with first-principles upgrades.

v3 additions (backward compatible):
  - State-delta Markovian critic (O(1) token cost) β€” auto-enabled
  - Falsification critic mode for coding tasks β€” opt-in via critic_mode="falsification"
  - PEP 578 sandbox auto-install for PythonExecTool β€” opt-in via sandbox=True
  
All existing behavior preserved. New modes are additive.
"""

from __future__ import annotations

import json
import logging
import time
from abc import ABC, abstractmethod
from typing import Any, Callable

from purpose_agent.types import (
    Action, Heuristic, MemoryTier, PurposeScore, State, Trajectory, TrajectoryStep,
)
from purpose_agent.actor import Actor
from purpose_agent.purpose_function import PurposeFunction
from purpose_agent.experience_replay import ExperienceReplay
from purpose_agent.optimizer import HeuristicOptimizer
from purpose_agent.llm_backend import LLMBackend

logger = logging.getLogger(__name__)


class Environment(ABC):
    @abstractmethod
    def execute(self, action: Action, current_state: State) -> State: ...
    def reset(self) -> State: return State(data={})
    def is_terminal(self, state: State) -> bool: return False


class SimpleEnvironment(Environment):
    def __init__(self, execute_fn, initial_state=None, terminal_fn=None):
        self._execute_fn = execute_fn
        self._initial_state = initial_state or State(data={})
        self._terminal_fn = terminal_fn
    def execute(self, action, current_state): return self._execute_fn(action, current_state)
    def reset(self): return self._initial_state
    def is_terminal(self, state): return self._terminal_fn(state) if self._terminal_fn else False
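
    # Usage sketch (illustrative; the "n" counter key and the threshold of 3
    # are arbitrary stand-ins, not part of this module):
    #
    #   env = SimpleEnvironment(
    #       execute_fn=lambda action, state: State(
    #           data={**state.data, "n": state.data.get("n", 0) + 1}),
    #       terminal_fn=lambda state: state.data.get("n", 0) >= 3,
    #   )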


class TaskResult:
    def __init__(self, trajectory: Trajectory, final_state: State):
        self.trajectory = trajectory
        self.final_state = final_state

    @property
    def success(self) -> bool:
        phi = self.trajectory.final_phi
        return phi is not None and phi > 7.0

    @property
    def total_steps(self) -> int: return len(self.trajectory.steps)

    @property
    def cumulative_reward(self) -> float: return self.trajectory.cumulative_reward

    @property
    def final_phi(self) -> float | None: return self.trajectory.final_phi

    def summary(self) -> str:
        lines = [
            f"Task: {self.trajectory.task_description}",
            f"Purpose: {self.trajectory.purpose}",
            f"Steps: {self.total_steps}",
            f"Success Rate: {self.trajectory.success_rate:.1%}",
            f"Cumulative Reward: {self.cumulative_reward:.2f}",
            f"Net Delta: {self.trajectory.total_delta:.2f}",
            f"Final Ξ¦: {self.final_phi:.2f}" if self.final_phi is not None else "Final Ξ¦: N/A",
            f"Task Success: {'βœ“' if self.success else 'βœ—'}",
        ]
        return "\n".join(lines)


class Orchestrator:
    """
    Main orchestration loop with first-principles upgrades.
    
    New in v3:
      critic_mode: "standard" (default) | "delta" | "falsification"
        - "standard": full state to critic (original behavior)
        - "delta": O(1) Markovian state-delta (recommended for long tasks)
        - "falsification": Popperian scoring for coding tasks (zero hallucination)
      
      sandbox: bool = False
        - If True, installs PEP 578 audit hooks before execution
    """

    def __init__(
        self,
        llm: LLMBackend,
        environment: Environment,
        available_actions: dict[str, str] | None = None,
        critic_llm: LLMBackend | None = None,
        optimizer_llm: LLMBackend | None = None,
        experience_buffer_size: int = 500,
        persistence_dir: str | None = None,
        on_step: Callable[[TrajectoryStep], None] | None = None,
        optimize_every_n_tasks: int = 1,
        critic_mode: str = "delta",  # NEW: "standard" | "delta" | "falsification"
        sandbox: bool = False,       # NEW: PEP 578 kernel sandbox
    ):
        self.environment = environment
        self.on_step = on_step
        self.optimize_every_n_tasks = optimize_every_n_tasks
        self.critic_mode = critic_mode
        self._tasks_since_optimize = 0

        # Persistence
        replay_path = None
        if persistence_dir:
            import os
            os.makedirs(persistence_dir, exist_ok=True)
            replay_path = f"{persistence_dir}/experience_replay.json"

        # Initialize modules
        self.actor = Actor(llm=llm, available_actions=available_actions)
        self.purpose_fn = PurposeFunction(llm=critic_llm or llm)
        self.experience_replay = ExperienceReplay(capacity=experience_buffer_size, persistence_path=replay_path)
        self.optimizer = HeuristicOptimizer(llm=optimizer_llm or llm)

        # Falsification critic (import deferred; constructed only in falsification mode)
        self._falsification_critic = None
        if critic_mode == "falsification":
            from purpose_agent.falsification_critic import FalsificationCritic
            self._falsification_critic = FalsificationCritic(llm=critic_llm or llm)

        # PEP 578 sandbox
        if sandbox:
            from purpose_agent.sandbox_hooks import install_sandbox, SandboxPolicy
            install_sandbox(SandboxPolicy(
                allowed_paths=[persistence_dir or "/tmp", "/tmp"],
                block_network=True,
                block_subprocess=False,  # PythonExecTool needs subprocess
            ))
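
        # PEP 578 background (illustrative sketch, not the actual
        # sandbox_hooks implementation): install_sandbox is expected to
        # register an audit hook via sys.addaudithook(hook); the hook
        # receives an event name and an args tuple for events such as
        # "open" or "socket.connect", and raises to block them. Note that
        # CPython audit hooks cannot be uninstalled, so the sandbox
        # persists for the life of the process.
        #
        #   import sys
        #   def _audit(event, args):
        #       if event == "socket.connect":
        #           raise RuntimeError("network blocked by sandbox policy")
        #   sys.addaudithook(_audit)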

        self.sync_memory()

    def run_task(
        self,
        purpose: str,
        initial_state: State | None = None,
        max_steps: int = 20,
        early_stop_phi: float = 9.0,
        task_description: str | None = None,
    ) -> TaskResult:
        task_desc = task_description or purpose
        current_state = initial_state or self.environment.reset()
        self.purpose_fn.reset_trajectory_stats()

        relevant_experiences = self.experience_replay.retrieve(task_desc, top_k=3)
        self._inject_experience_context(relevant_experiences)

        trajectory = Trajectory(task_description=task_desc, purpose=purpose)
        history: list[dict[str, Any]] = []

        logger.info(f"═══ Starting task: {task_desc} (max {max_steps} steps, critic={self.critic_mode}) ═══")

        for step_idx in range(max_steps):
            step_start = time.time()

            action = self.actor.decide(purpose=purpose, current_state=current_state, history=history)
            logger.info(f"Step {step_idx + 1}: Action={action.name}, Thought={action.thought[:100]}...")

            if action.name.upper() == "DONE":
                logger.info("Agent signaled DONE")
                final_score = self._evaluate(current_state, action, current_state, purpose)
                trajectory.steps.append(TrajectoryStep(
                    state_before=current_state, action=action, state_after=current_state,
                    score=final_score, step_index=step_idx + 1, wall_time_s=time.time() - step_start,
                ))
                break

            try:
                new_state = self.environment.execute(action, current_state)
            except Exception as e:
                logger.error(f"Environment execution failed: {e}")
                new_state = State(data={**current_state.data, "_error": str(e)}, summary=f"Error: {e}")

            # ── FIRST-PRINCIPLES: Evaluate using selected critic mode ──
            score = self._evaluate(current_state, action, new_state, purpose)

            step = TrajectoryStep(
                state_before=current_state, action=action, state_after=new_state,
                score=score, step_index=step_idx + 1, wall_time_s=time.time() - step_start,
            )
            trajectory.steps.append(step)

            history.append({
                "action": f"{action.name}({json.dumps(action.params, default=str)})",
                "result": new_state.describe()[:200],
                "score": f"Ξ”={score.delta:+.2f}" if score else "N/A",
            })

            if self.on_step:
                self.on_step(step)

            logger.info(f"  β†’ Ξ¦: {score.phi_before:.1f} β†’ {score.phi_after:.1f} (Ξ”={score.delta:+.2f}, conf={score.confidence:.2f})")

            current_state = new_state

            if score.phi_after >= early_stop_phi:
                logger.info(f"Early stop: Ξ¦={score.phi_after:.1f} β‰₯ {early_stop_phi}")
                break
            if self.environment.is_terminal(new_state):
                logger.info("Environment signaled terminal state")
                break

        result = TaskResult(trajectory=trajectory, final_state=current_state)
        self.post_task(trajectory, relevant_experiences)
        logger.info(f"═══ Task complete ═══\n{result.summary()}")
        return result

    def _evaluate(self, state_before: State, action: Action, state_after: State, purpose: str) -> PurposeScore:
        """
        Evaluate a state transition using the configured critic mode.
        
        Modes:
          "standard"     β€” original full-state Purpose Function
          "delta"        β€” O(1) Markovian state-delta (default, saves tokens)
          "falsification" β€” Popperian: generate assertions, execute, score = math
        """
        if self.critic_mode == "falsification":
            return self._evaluate_falsification(action, state_after)
        elif self.critic_mode == "delta":
            return self._evaluate_delta(state_before, action, state_after, purpose)
        else:
            # Standard: full state evaluation (original behavior)
            return self.purpose_fn.evaluate(state_before, action, state_after, purpose)

    def _evaluate_delta(self, state_before: State, action: Action, state_after: State, purpose: str) -> PurposeScore:
        """O(1) Markovian evaluation β€” passes only the delta to the critic."""
        from purpose_agent.state_delta import compute_state_delta, format_critic_input
        from purpose_agent.llm_backend import ChatMessage
        from purpose_agent.robust_parser import parse_critic_response
        from purpose_agent.purpose_function import PURPOSE_FUNCTION_SYSTEM_PROMPT

        delta = compute_state_delta(state_before, state_after)

        if delta.is_empty:
            return PurposeScore(phi_before=0, phi_after=0, delta=0, reasoning="No state change", evidence="(empty delta)", confidence=0.5)

        # Format minimal critic input (~300 tokens)
        critic_input = format_critic_input(purpose, action.name, action.thought, delta)

        # Call critic with just the delta (not full states)
        prompt = f"{critic_input}\n\nScore phi_before and phi_after (0-10). Respond in TOML:\nphi_before = 0.0\nphi_after = 0.0\nreasoning = \"...\"\nevidence = \"...\"\nconfidence = 0.5"

        try:
            raw = self.purpose_fn.llm.generate(
                [ChatMessage(role="system", content=PURPOSE_FUNCTION_SYSTEM_PROMPT[:500]),
                 ChatMessage(role="user", content=prompt)],
                temperature=0.2, max_tokens=500,
            )
            parsed = parse_critic_response(raw)
        except Exception:
            parsed = {"phi_before": 0, "phi_after": 0, "reasoning": "eval failed", "evidence": "", "confidence": 0.3}

        phi_b = max(0, min(10, float(parsed.get("phi_before", 0))))
        phi_a = max(0, min(10, float(parsed.get("phi_after", 0))))
        return PurposeScore(
            phi_before=phi_b, phi_after=phi_a, delta=phi_a - phi_b,
            reasoning=str(parsed.get("reasoning", "")),
            evidence=str(parsed.get("evidence", delta.summary_diff[:200])),
            confidence=max(0, min(1, float(parsed.get("confidence", 0.5)))),
        )

    def _evaluate_falsification(self, action: Action, state_after: State) -> PurposeScore:
        """Popperian evaluation: generate adversarial assertions, execute, score = math."""
        code = action.params.get("code", "")
        if not code:
            from purpose_agent.robust_parser import extract_code
            code = extract_code(action.thought or "") or extract_code(action.expected_delta or "")

        if not code or "def " not in code:
            return PurposeScore(phi_before=0, phi_after=0, delta=0, reasoning="No code to falsify", evidence="", confidence=0.5)

        result = self._falsification_critic.evaluate(code)
        return PurposeScore(
            phi_before=0,
            phi_after=result.score,
            delta=result.score,
            reasoning=f"Falsification: {result.assertions_passed}/{result.assertions_total} assertions survived",
            evidence="; ".join(result.failed_details[:3]) if result.failed_details else "All assertions passed",
            confidence=0.95,  # High confidence: score is computed, not hallucinated
        )
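
    # FalsificationCritic contract as consumed above (inferred from usage,
    # not a documented API): evaluate(code) returns a result exposing
    # .score (taken as phi_after, presumably on the 0-10 Φ scale),
    # .assertions_passed, .assertions_total, and .failed_details (a list of
    # failure strings). The score is derived from executed assertions
    # rather than from an LLM judgment.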

    # ── Post-task + optimization (unchanged) ──

    def post_task(self, trajectory: Trajectory, used_experiences: list[Any] | None = None) -> None:
        used_experiences = used_experiences or []
        self.experience_replay.add(trajectory)
        task_success = trajectory.success_rate > 0.5
        for exp in used_experiences:
            self.experience_replay.update_q_value(exp.id, reward=1.0 if task_success else 0.0)
        for h in self.actor.strategic_memory + self.actor.procedural_memory:
            self.optimizer.update_heuristic_usage(h.id, was_successful=task_success)
        self._tasks_since_optimize += 1
        if self._tasks_since_optimize >= self.optimize_every_n_tasks:
            self._run_optimization()
            self._tasks_since_optimize = 0

    def _run_optimization(self) -> None:
        logger.info("Running optimization cycle...")
        top = self.experience_replay.get_top_trajectories(n=5, min_success_rate=0.3)
        if not top:
            logger.info("No qualifying trajectories for optimization")
            return
        self.optimizer.optimize(top)
        self.sync_memory()

    def sync_memory(self) -> None:
        self.actor.update_strategic_memory(self.optimizer.get_heuristics_by_tier(MemoryTier.STRATEGIC))
        self.actor.update_procedural_memory(self.optimizer.get_heuristics_by_tier(MemoryTier.PROCEDURAL))
        tool_heuristics = self.optimizer.get_heuristics_by_tier(MemoryTier.TOOL)
        tool_tips = {h.pattern: h.strategy for h in tool_heuristics}
        if tool_tips:
            self.actor.update_tool_memory(tool_tips)

    def _inject_experience_context(self, experiences: list[Any]) -> None:
        injected = []
        for exp in experiences:
            for h in exp.heuristics:
                if h.tier == MemoryTier.PROCEDURAL:
                    injected.append(h)
        if injected:
            current = self.actor.procedural_memory or []
            self.actor.procedural_memory = current + injected

    @property
    def stats(self) -> dict[str, Any]:
        return {
            "experience_replay": self.experience_replay.stats,
            "heuristic_library_size": len(self.optimizer.heuristic_library),
            "heuristics_by_tier": {t.value: len(self.optimizer.get_heuristics_by_tier(t)) for t in MemoryTier},
            "tasks_since_optimize": self._tasks_since_optimize,
            "critic_mode": self.critic_mode,
        }

    def get_heuristic_report(self) -> str:
        lines = ["═══ Learned Heuristics Report ═══\n"]
        for tier in MemoryTier:
            heuristics = self.optimizer.get_heuristics_by_tier(tier)
            lines.append(f"\n{'─' * 40}")
            lines.append(f"  {tier.value.upper()} ({len(heuristics)} heuristics)")
            lines.append(f"{'─' * 40}")
            for h in heuristics:
                lines.append(f"\n  [{h.id}] Q={h.q_value:.3f} (used {h.times_used}x)")
                lines.append(f"  Pattern:  {h.pattern}")
                lines.append(f"  Strategy: {h.strategy}")
        return "\n".join(lines)