Chirag0123 committed on
Commit
dfbd16e
·
1 Parent(s): f7185e1

v3.0 — Intelligence layer: failure classification, strategy detection, advanced metrics, self-improvement, multi-agent comparison, 3D visualizer

Browse files
app.py CHANGED
@@ -1,80 +1,92 @@
1
  #!/usr/bin/env python3
2
  """
3
- app.py — Gradio UI + FastAPI endpoints for the OpenEnv environment.
4
- This is the HF Space entry point.
 
 
 
 
 
 
 
 
 
5
  """
6
  import os
7
  import json
8
  import gradio as gr
9
  from server.environment import CodebaseNavEnvironment
10
  from server.models import RepoAction
 
 
 
 
 
11
 
12
- # ── Global environment instance ──────────────────────────────────────────────
13
  env = CodebaseNavEnvironment()
 
 
 
 
 
14
 
15
 
16
- # ── Gradio callback functions ────────────────────────────────────────────────
17
 
18
  def reset_environment(task: str):
19
- """Reset environment and return initial state."""
20
  try:
21
  result = env.reset(task=task)
22
  obs = result.observation
23
  tree = "\n".join(f" 📄 {f}" for f in obs.repo_tree)
24
  failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None listed"
25
- info_data = result.info
26
-
27
- status_text = (
28
- f" Episode started\n"
29
- f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
30
- f"Task: {task}\n"
31
- f"Variant: {info_data.get('variant_id', 'unknown')}\n"
32
- f"Steps remaining: {obs.steps_remaining}\n"
33
- f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
34
- f"📁 Repository Files:\n{tree}\n\n"
 
35
  f"🔴 Failing Tests: {failing}\n\n"
36
- f"📋 Task: {obs.task_description}"
37
  )
38
- return status_text, "", "0", "0.000"
39
  except Exception as e:
40
  return f"❌ Error: {e}", "", "0", "0.000"
41
 
42
 
43
  def take_step(action_type: str, path: str, query: str, content: str):
44
- """Execute one agent step."""
45
  if env.done:
46
- return "❌ Episode is done. Reset first.", "", "", ""
47
-
48
  try:
49
  action = RepoAction(
50
  action_type=action_type,
51
- path=path if path.strip() else None,
52
- query=query if query.strip() else None,
53
- content=content if content.strip() else None,
54
  )
55
  result = env.step(action)
56
  obs = result.observation
57
-
58
- action_result = obs.last_action_result or "No output"
59
- error = obs.last_action_error or ""
60
- if error:
61
- error = f"⚠️ {error}"
62
 
63
  status = (
64
  f"Step {result.info['steps_taken']} | "
65
  f"Reward: {result.reward:+.3f} | "
66
- f"Steps left: {obs.steps_remaining}"
67
  )
68
  if result.done:
69
- status += f"\n\n🏁 EPISODE DONE — Final Score: {result.info['final_score']:.3f}"
70
-
71
- flags = result.info.get("security_flags", [])
72
- if flags:
73
- status += f"\n🔒 Security: {flags}"
74
 
75
  return (
76
  status,
77
- action_result[:3000],
78
  str(result.info["steps_taken"]),
79
  f"{result.info.get('cumulative_reward', 0):.3f}",
80
  )
@@ -82,261 +94,567 @@ def take_step(action_type: str, path: str, query: str, content: str):
82
  return f"❌ Error: {e}", "", "", ""
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def get_evaluation():
86
- """Get multi-dimensional evaluation report."""
87
  try:
88
  ev = env.get_evaluation()
89
  if "error" in ev:
90
  return "No evaluation available. Run an episode first."
91
-
92
  lines = [
93
  f"🎯 Composite Score: {ev['composite_score']:.3f}",
94
- "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
95
  ]
96
  for name, dim in ev.get("dimensions", {}).items():
97
  bar = "█" * int(dim["score"] * 20) + "░" * (20 - int(dim["score"] * 20))
98
  lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
99
- for e in dim.get("evidence", []):
100
  lines.append(f" → {e}")
101
-
102
  if ev.get("strengths"):
103
- lines.append("\n💪 Strengths:")
104
- for s in ev["strengths"]:
105
- lines.append(f" ✅ {s}")
106
-
107
  if ev.get("failure_analysis"):
108
- lines.append("\n⚠️ Failures:")
109
- for f in ev["failure_analysis"]:
110
- lines.append(f" ❌ {f}")
111
-
112
  if ev.get("recommendations"):
113
- lines.append("\n💡 Recommendations:")
114
- for r in ev["recommendations"]:
115
- lines.append(f" → {r}")
116
-
117
  return "\n".join(lines)
118
  except Exception as e:
119
  return f"Error: {e}"
120
 
121
 
122
  def get_metrics():
123
- """Get comprehensive metrics."""
124
  try:
125
- m = env.get_metrics()
126
- return json.dumps(m, indent=2, default=str)
127
  except Exception as e:
128
  return f"Error: {e}"
129
 
130
 
131
  def get_trajectory():
132
- """Get full trajectory."""
133
  try:
134
  t = env.get_trajectory()
135
  if not t:
136
- return "No trajectory available."
137
-
138
  lines = [
139
- f"Episode: {t.get('episode_id', 'N/A')}",
140
- f"Task: {t.get('task', 'N/A')} | Variant: {t.get('variant_id', 'N/A')}",
141
- f"Duration: {t.get('duration_seconds', 'N/A')}s | Score: {t.get('final_score', 0):.3f}",
142
- "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
143
  ]
 
 
144
  for step in t.get("steps", []):
145
- emoji = "📖" if step["action_type"] == "read_file" else \
146
- "✏️" if step["action_type"] == "write_file" else \
147
- "🧪" if step["action_type"] == "run_tests" else \
148
- "🔍" if step["action_type"] == "search_code" else "🏁"
149
- path = step.get("action_path") or step.get("action_query") or ""
150
- err = f" ❌ {step['error']}" if step.get("error") else ""
151
  lines.append(
152
- f" {emoji} Step {step['step_number']:2d}: "
153
- f"{step['action_type']:12s} {path:30s} "
154
- f"reward={step['reward']:+.3f} "
155
- f"({step['duration_ms']:.0f}ms){err}"
156
  )
157
  return "\n".join(lines)
158
  except Exception as e:
159
  return f"Error: {e}"
160
 
161
 
162
- def run_builtin_agent(task: str):
163
- """Run the built-in deterministic agent for a quick demo."""
 
164
  try:
165
- # Reset
166
- result = env.reset(task=task)
167
- obs = result.observation
168
- log_lines = [f"🚀 Starting {task} (variant: {result.info.get('variant_id')})"]
169
- log_lines.append(f" Files: {obs.repo_tree}")
170
- log_lines.append(f" Failing: {obs.failing_tests}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- # Strategy: read test file → read source → fix → run tests ��� submit
173
- test_files = [f for f in obs.repo_tree if f.startswith("tests/")]
174
- src_files = [f for f in obs.repo_tree if f.startswith("src/") and f.endswith(".py")]
175
- spec_files = [f for f in obs.repo_tree if f.endswith(".md")]
176
 
177
- steps_done = 0
178
- max_demo_steps = 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
- # Step 1: read spec or test
181
- if task == "task3" and spec_files:
182
- target = spec_files[0]
183
- elif test_files:
184
- target = test_files[0]
185
- else:
186
- target = obs.repo_tree[0]
187
 
188
- step_result = env.step(RepoAction(action_type="read_file", path=target))
189
- steps_done += 1
190
- log_lines.append(f" Step {steps_done}: read_file {target} → reward={step_result.reward:+.3f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- # Step 2+: read all source files
193
- for sf in src_files:
194
- if env.done or steps_done >= max_demo_steps - 2:
195
- break
196
- step_result = env.step(RepoAction(action_type="read_file", path=sf))
197
- steps_done += 1
198
- log_lines.append(f" Step {steps_done}: read_file {sf} → reward={step_result.reward:+.3f}")
199
-
200
- # Step N-1: run tests
201
- if not env.done and steps_done < max_demo_steps - 1:
202
- step_result = env.step(RepoAction(action_type="run_tests"))
203
- steps_done += 1
204
- log_lines.append(f" Step {steps_done}: run_tests → reward={step_result.reward:+.3f}")
205
-
206
- # Step N: submit
207
- if not env.done:
208
- step_result = env.step(RepoAction(action_type="submit"))
209
- steps_done += 1
210
- log_lines.append(f" Step {steps_done}: submit reward={step_result.reward:+.3f}")
 
 
 
 
 
211
 
212
- log_lines.append(f"\n🏁 Final Score: {env.final_score:.3f}")
213
- log_lines.append(f" Total Steps: {steps_done}")
214
- log_lines.append(f" Cumulative Reward: {env.cumulative_reward:.3f}")
215
 
216
- return "\n".join(log_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  except Exception as e:
218
  return f"❌ Error: {e}"
219
 
220
 
221
- # ── Build the Gradio UI ─────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- with gr.Blocks(
224
- title="Codebase Navigation & Repair OpenEnv",
225
- ) as demo:
 
226
  gr.Markdown(
227
- "# 🔍 Codebase Navigation & Repair — OpenEnv\n"
228
- "**RL environment for testing AI coding agents.** "
229
- "Agents navigate repos, find bugs, and fix them graded by actual pytest execution."
230
  )
231
 
232
  with gr.Tabs():
233
- # ── Tab 1: Interactive Environment ────────────────────────────────
 
234
  with gr.TabItem("🎮 Interactive"):
235
  with gr.Row():
236
  with gr.Column(scale=1):
237
  task_select = gr.Dropdown(
238
- choices=["task1", "task2", "task3"],
239
- value="task1",
240
  label="Task",
241
- info="task1=single-file bugs, task2=cross-module, task3=feature impl"
242
  )
243
  reset_btn = gr.Button("🔄 Reset Environment", variant="primary")
244
-
245
- gr.Markdown("### Take an Action")
246
- action_type = gr.Dropdown(
247
- choices=["read_file", "write_file", "run_tests", "search_code", "submit"],
248
- value="read_file",
249
- label="Action Type",
250
  )
251
- action_path = gr.Textbox(label="Path (for read/write/run_tests)", placeholder="src/auth.py")
252
- action_query = gr.Textbox(label="Query (for search_code)", placeholder="validate_token")
253
- action_content = gr.Textbox(label="Content (for write_file)", lines=5, placeholder="# new file content...")
254
  step_btn = gr.Button("▶️ Execute Step", variant="secondary")
255
-
256
  with gr.Column(scale=2):
257
- status_box = gr.Textbox(label="Status", lines=15, interactive=False)
258
- result_box = gr.Textbox(label="Last Action Result", lines=10, interactive=False)
259
  with gr.Row():
260
- steps_box = gr.Textbox(label="Steps Taken", value="0", interactive=False)
261
  reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
 
 
 
 
 
 
 
 
 
 
262
 
263
- reset_btn.click(
264
- reset_environment, inputs=[task_select],
265
- outputs=[status_box, result_box, steps_box, reward_box],
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  )
267
- step_btn.click(
268
- take_step,
269
- inputs=[action_type, action_path, action_query, action_content],
270
- outputs=[status_box, result_box, steps_box, reward_box],
 
 
 
 
 
 
 
 
 
 
 
271
  )
 
 
 
272
 
273
- # ── Tab 2: Run Agent ─────────────────────────────────────────────
274
- with gr.TabItem("🤖 Run Agent"):
275
  gr.Markdown(
276
- "### Built-in Demonstration Agent\n"
277
- "Runs a deterministic read-all-then-submit agent. "
278
- "For LLM-based agent, use `run_agent.py` or `inference.py`."
279
  )
280
- agent_task = gr.Dropdown(
281
- choices=["task1", "task2", "task3"], value="task1", label="Task"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  )
283
- run_btn = gr.Button("🚀 Run Agent", variant="primary")
284
- agent_output = gr.Textbox(label="Agent Log", lines=20, interactive=False)
285
- run_btn.click(run_builtin_agent, inputs=[agent_task], outputs=[agent_output])
286
 
287
- # ── Tab 3: Evaluation Dashboard ──────────────────────────────────
288
- with gr.TabItem("📊 Evaluation"):
289
- with gr.Row():
290
- eval_btn = gr.Button("🎯 Get Evaluation", variant="primary")
291
- metrics_btn = gr.Button("📈 Get Metrics", variant="secondary")
292
- traj_btn = gr.Button("🗺️ Get Trajectory", variant="secondary")
293
- eval_output = gr.Textbox(label="Evaluation Report", lines=25, interactive=False)
294
- eval_btn.click(get_evaluation, outputs=[eval_output])
295
- metrics_btn.click(get_metrics, outputs=[eval_output])
296
- traj_btn.click(get_trajectory, outputs=[eval_output])
297
-
298
- # ── Tab 4: API Docs ──────────────────────────────────────────────
299
  with gr.TabItem("📖 API"):
300
  gr.Markdown("""
301
- ### REST API Endpoints
302
-
303
- The FastAPI endpoints are mounted alongside this UI at `/api/`.
304
 
 
305
  | Endpoint | Method | Description |
306
  |----------|--------|-------------|
307
- | `/api/reset?task=task1` | POST | Start new episode |
308
- | `/api/step` | POST | Take action (JSON body) |
309
- | `/api/state` | GET | Get current state |
310
- | `/api/health` | GET | Health check |
311
- | `/api/trajectory` | GET | Full action log |
312
- | `/api/evaluate` | GET | Multi-dimensional scores |
313
- | `/api/metrics` | GET | Comprehensive stats |
314
- | `/api/fault-config` | POST | Enable fault injection |
315
-
316
- ### Example: Reset + Read + Submit
317
- ```bash
318
- BASE="https://YOUR-SPACE.hf.space/api"
319
-
320
- # Reset
321
- curl -X POST "$BASE/reset?task=task1"
322
 
323
- # Read a file
324
- curl -X POST "$BASE/step" -H "Content-Type: application/json" \\
325
- -d '{"action_type":"read_file","path":"src/auth.py"}'
 
 
 
 
326
 
327
- # Submit
328
- curl -X POST "$BASE/step" -H "Content-Type: application/json" \\
329
- -d '{"action_type":"submit"}'
 
 
 
 
 
 
330
 
331
- # Get evaluation
332
- curl "$BASE/evaluate"
 
 
 
 
 
 
 
 
333
  ```
334
  """)
335
 
336
 
337
- # ── Mount FastAPI under /api ─────────────────────────────────────────────────
338
  from server.app import app as fastapi_app
339
-
340
  gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
341
 
342
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  """
3
+ app.py — Gradio UI v3.0 Full Platform Entry Point
4
+
5
+ Tabs:
6
+ 🎮 Interactive — manual step-by-step control
7
+ 🤖 Run Agent — built-in deterministic agent demo
8
+ 📊 Evaluation — 6-dimension evaluation report
9
+ 🧠 Intelligence — failure classification, strategy, advanced metrics
10
+ 🔁 Self-Improve — improvement plan after failure
11
+ ⚖️ Compare Agents — side-by-side multi-agent comparison
12
+ 🌐 3D Visualizer — Three.js trajectory visualization
13
+ 📖 API — REST API reference
14
  """
15
  import os
16
  import json
17
  import gradio as gr
18
  from server.environment import CodebaseNavEnvironment
19
  from server.models import RepoAction
20
+ from server.failure_classifier import FailureClassifier
21
+ from server.strategy_detector import StrategyDetector
22
+ from server.advanced_metrics import AdvancedMetricsEngine
23
+ from server.self_improvement import SelfImprovementEngine
24
+ from server.multi_agent import MultiAgentComparison
25
 
26
+ # ── Global instances ──────────────────────────────────────────────────────────
27
  env = CodebaseNavEnvironment()
28
+ failure_clf = FailureClassifier()
29
+ strategy_det = StrategyDetector()
30
+ adv_metrics_engine = AdvancedMetricsEngine()
31
+ improvement_engine = SelfImprovementEngine()
32
+ multi_agent_engine = MultiAgentComparison()
33
 
34
 
35
# ── Tab 1: Interactive ────────────────────────────────────────────────────────

def reset_environment(task: str):
    """Start a fresh episode for *task* and format the initial UI state.

    Returns a 4-tuple matching the Gradio outputs:
    (status text, last-result text, steps counter, cumulative reward).
    """
    try:
        reset_result = env.reset(task=task)
        observation = reset_result.observation
        file_listing = "\n".join(f" 📄 {f}" for f in observation.repo_tree)
        if observation.failing_tests:
            failing = ", ".join(observation.failing_tests)
        else:
            failing = "None listed"
        # Fault-injection details are optional; only rendered when present.
        fault_info = reset_result.info.get("fault_injection", {})
        faults = ""
        injected = fault_info.get("faults_injected")
        if injected:
            faults = f"\n\n⚠️ Fault Injection ({fault_info.get('difficulty_multiplier', 1.0):.1f}x):\n"
            faults += "\n".join(f" • {f}" for f in injected[:5])
        status = (
            f" Episode Started — {task} (variant: {reset_result.info.get('variant_id', '?')})\n"
            f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
            f"Steps: {observation.steps_remaining} remaining\n\n"
            f"📁 Files:\n{file_listing}\n\n"
            f"🔴 Failing Tests: {failing}\n\n"
            f"📋 Task: {observation.task_description}{faults}"
        )
        return status, "", "0", "0.000"
    except Exception as e:
        return f"❌ Error: {e}", "", "0", "0.000"
60
 
61
 
62
def take_step(action_type: str, path: str, query: str, content: str):
    """Execute one agent step and format the outcome for the UI.

    Blank text inputs are treated as "not provided" (None) on the action.
    Returns (status, truncated result text, steps taken, cumulative reward).
    """
    if env.done:
        return "❌ Episode done. Reset first.", "", "", ""
    try:
        step_result = env.step(
            RepoAction(
                action_type=action_type,
                path=path.strip() or None,
                query=query.strip() or None,
                content=content.strip() or None,
            )
        )
        observation = step_result.observation
        output_text = observation.last_action_result or "No output"
        err_note = ""
        if observation.last_action_error:
            err_note = f"\n⚠️ {observation.last_action_error}"
        violations = step_result.info.get("security_flags", [])
        sec_note = f"\n🔒 Security: {violations}" if violations else ""
        status = (
            f"Step {step_result.info['steps_taken']} | "
            f"Reward: {step_result.reward:+.3f} | "
            f"Steps left: {observation.steps_remaining}{err_note}{sec_note}"
        )
        if step_result.done:
            status += f"\n\n🏁 DONE — Score: {step_result.info['final_score']:.3f}"
        return (
            status,
            output_text[:3000],  # keep the result box bounded
            str(step_result.info["steps_taken"]),
            f"{step_result.info.get('cumulative_reward', 0):.3f}",
        )
    except Exception as e:
        return f"❌ Error: {e}", "", "", ""
95
 
96
 
97
# ── Tab 2: Run Agent ──────────────────────────────────────────────────────────

def run_builtin_agent(task: str):
    """Run the deterministic demo agent: read spec/tests/source, run tests, submit.

    Returns the full run log as a single newline-joined string.
    """
    try:
        reset_result = env.reset(task=task)
        obs = reset_result.observation
        log = [
            f"🚀 {task} (variant: {reset_result.info.get('variant_id')})",
            f" Files: {obs.repo_tree}",
            f" Failing: {obs.failing_tests}",
        ]
        steps = 0

        def read_and_log(target):
            # One read_file step plus its matching log line.
            nonlocal steps
            r = env.step(RepoAction(action_type="read_file", path=target))
            steps += 1
            log.append(f" Step {steps}: read_file {target} → {r.reward:+.3f}")

        tree = obs.repo_tree
        test_files = sorted(f for f in tree if f.startswith("tests/"))
        src_files = sorted(f for f in tree if f.startswith("src/") and f.endswith(".py"))
        spec_files = sorted(f for f in tree if f.endswith(".md"))

        # task3 is feature implementation: read the spec first.
        if task == "task3" and spec_files:
            for target in spec_files:
                if env.done:
                    break
                read_and_log(target)

        for target in test_files:
            if env.done:
                break
            read_and_log(target)

        # Cap reads at 12 total steps to leave budget for run_tests + submit.
        for target in src_files:
            if env.done or steps >= 12:
                break
            read_and_log(target)

        if not env.done and test_files:
            r = env.step(RepoAction(action_type="run_tests", path=test_files[0]))
            steps += 1
            log.append(f" Step {steps}: run_tests → {r.reward:+.3f}")

        if not env.done:
            r = env.step(RepoAction(action_type="submit"))
            steps += 1
            log.append(f" Step {steps}: submit → {r.reward:+.3f}")

        log += [
            f"\n🏁 Score: {env.final_score:.3f}",
            f" Steps: {steps}",
            f" Reward: {env.cumulative_reward:.3f}",
        ]
        return "\n".join(log)
    except Exception as e:
        return f"❌ Error: {e}"
151
+
152
+
153
# ── Tab 3: Evaluation ─────────────────────────────────────────────────────────

def get_evaluation():
    """Render the multi-dimensional evaluation report as plain text."""
    try:
        report = env.get_evaluation()
        if "error" in report:
            return "No evaluation available. Run an episode first."
        out = [
            f"🎯 Composite Score: {report['composite_score']:.3f}",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
        ]
        for name, dim in report.get("dimensions", {}).items():
            filled = int(dim["score"] * 20)
            bar = "█" * filled + "░" * (20 - filled)
            out.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
            # Show at most two supporting evidence lines per dimension.
            out.extend(f" → {e}" for e in dim.get("evidence", [])[:2])
        if report.get("strengths"):
            out.append("\n💪 Strengths:")
            out.extend(f" ✅ {s}" for s in report["strengths"])
        if report.get("failure_analysis"):
            out.append("\n⚠️ Failures:")
            out.extend(f" ❌ {f}" for f in report["failure_analysis"])
        if report.get("recommendations"):
            out.append("\n💡 Recommendations:")
            out.extend(f" → {r}" for r in report["recommendations"])
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
178
 
179
 
180
def get_metrics():
    """Return the environment's metrics dict as pretty-printed JSON."""
    try:
        metrics = env.get_metrics()
        # default=str stringifies non-JSON types (e.g. datetimes) — deliberate.
        return json.dumps(metrics, indent=2, default=str)
    except Exception as e:
        return f"Error: {e}"
185
 
186
 
187
def get_trajectory():
    """Render the recorded trajectory, one annotated line per step.

    Each line shows an action emoji, step number, action type, target
    path/query, reward, duration, and — when present — the step's error.
    """
    try:
        t = env.get_trajectory()
        if not t:
            return "No trajectory. Run an episode first."
        lines = [
            f"Episode: {t.get('episode_id')}",
            f"Task: {t.get('task')} | Variant: {t.get('variant_id')}",
            f"Score: {t.get('final_score', 0):.3f} | Duration: {t.get('duration_seconds', '?')}s",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
        ]
        emojis = {"read_file": "📖", "write_file": "✏️", "run_tests": "🧪",
                  "search_code": "🔍", "submit": "🏁"}
        for step in t.get("steps", []):
            em = emojis.get(step["action_type"], "")
            p = step.get("action_path") or step.get("action_query") or ""
            # BUG FIX: both branches previously yielded "" so step errors were
            # silently dropped from the report; surface them again.
            err = f" ❌ {step['error']}" if step.get("error") else ""
            lines.append(
                f" {em} {step['step_number']:2d}: {step['action_type']:12s} {p:30s} "
                f"reward={step['reward']:+.3f} ({step['duration_ms']:.0f}ms){err}"
            )
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
211
 
212
 
213
# ── Tab 4: Intelligence ───────────────────────────────────────────────────────

def get_failure_classification():
    """Classify the last episode's failures and render the report as text."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return "No trajectory. Run an episode first."
        variant_meta = env.variant.meta if env.variant else {}
        data = failure_clf.classify(
            episode_id=traj.get("episode_id", ""),
            task=env.current_task or "unknown",
            trajectory_steps=traj.get("steps", []),
            variant_meta=variant_meta,
            files_read=list(env.files_read),
            files_written=list(env.files_written),
            final_score=env.final_score,
            security_violations=env.security_violations,
        ).to_dict()
        verdict = "✅ SUCCESS" if data["success"] else "❌ FAILURE"
        out = [
            verdict,
            f"Primary Failure Type: {data['primary_failure']}",
            f"Failures Detected: {data['failure_count']}",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
        ]
        for failure in data.get("failures", []):
            out.append(f"\n[{failure['severity'].upper()}] {failure['type']} @ Step {failure['step']}")
            out.append(f" Evidence: {failure['evidence']}")
            out.append(f" Root Cause: {failure['root_cause']}")
            out.append(f" Fix: {failure['remediation']}")
        if data.get("failure_summary"):
            out.append("\n📋 Summary:")
            out.append(f" {data['failure_summary']}")
        if data.get("retry_hint"):
            out.append("\n🔁 Retry Hint:")
            out.append(f" {data['retry_hint']}")
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
252
 
 
 
 
 
253
 
254
def get_strategy_detection():
    """Detect the agent's navigation strategy and render the report as text."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return "No trajectory. Run an episode first."
        variant_meta = env.variant.meta if env.variant else {}
        data = strategy_det.detect(
            trajectory_steps=traj.get("steps", []),
            task=env.current_task or "unknown",
            variant_meta=variant_meta,
            files_read=list(env.files_read),
            final_score=env.final_score,
        ).to_dict()
        filled = int(data["score"] * 20)
        score_bar = "█" * filled + "░" * (20 - filled)
        # Classify the explore/exploit balance for the summary line.
        ratio = data["exploration_ratio"]
        if ratio > 0.6:
            ratio_label = "explore-heavy"
        elif ratio < 0.4:
            ratio_label = "exploit-heavy"
        else:
            ratio_label = "balanced"
        out = [
            f"🧭 Strategy: {data['strategy']}",
            f" Score: [{score_bar}] {data['score']:.3f}",
            f" Confidence: {data['confidence']:.0%}",
            f"\n📖 {data['strategy_description']}",
            f"\n📊 Exploration Ratio: {ratio:.2f} ({ratio_label})",
            f" Strategy Pivots: {data['pivot_count']}",
        ]
        if data.get("sub_patterns"):
            out.append("\n🔖 Sub-patterns:")
            out.extend(f" • {p}" for p in data["sub_patterns"])
        if data.get("evidence"):
            out.append("\n🔍 Evidence:")
            out.extend(f" → {e}" for e in data["evidence"])
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
285
 
 
 
 
 
 
 
 
286
 
287
def get_advanced_metrics():
    """Compute and render the advanced process-quality metrics report."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return "No trajectory. Run an episode first."
        variant_meta = env.variant.meta if env.variant else {}
        data = adv_metrics_engine.compute(
            trajectory_steps=traj.get("steps", []),
            variant_meta=variant_meta,
            final_score=env.final_score,
            files_read=list(env.files_read),
            files_written=list(env.files_written),
        ).to_dict()

        def meter(value):
            # 20-char unicode progress bar for a score in [0, 1].
            filled = int(value * 20)
            return "█" * filled + "░" * (20 - filled)

        out = ["⚡ ADVANCED METRICS", "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"]
        # The five bar-style metrics share one rendering shape.
        for label, key in (
            ("Reasoning Efficiency", "reasoning_efficiency"),
            ("Reliability Index", "reliability_index"),
            ("Exploration Ratio", "exploration_ratio"),
            ("Decision Entropy", "decision_entropy"),
            ("Wasteful Ratio", "wasteful_ratio"),
        ):
            out.append(f" {label} [{meter(data[key])}] {data[key]:.3f}")
        out.append(f" Pivot Rate {data['pivot_rate']:.2f} per 10 steps")
        out.append(
            f" Consistency [{meter(data['consistency_score'])}] "
            f"{data['consistency_score']:.3f} ({data['runs_analyzed']} runs)"
        )
        out.append("\n📊 Action Distribution:")
        for action, count in data.get("action_distribution", {}).items():
            out.append(f" {action:15s}: {count}")
        if data.get("useful_actions"):
            out.append("\n✅ Useful Actions:")
            out.extend(f" • {a}" for a in data["useful_actions"])
        if data.get("wasteful_actions"):
            out.append("\n⚠️ Wasteful Actions:")
            out.extend(f" • {a}" for a in data["wasteful_actions"])
        out.append("\n🔒 Reliability Breakdown:")
        for name, value in data.get("reliability_breakdown", {}).items():
            out.append(f" {name:15s}: {value:.3f}")
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
329
 
 
 
 
330
 
331
# ── Tab 5: Self-Improve ───────────────────────────────────────────────────────

def get_improvement_plan():
    """Classify the last episode's failure, then render a self-improvement plan."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return "No trajectory. Run an episode first."
        variant_meta = env.variant.meta if env.variant else {}
        trajectory_steps = traj.get("steps", [])

        # First classify what went wrong, then feed that into the planner.
        fail_report = failure_clf.classify(
            episode_id=traj.get("episode_id", ""),
            task=env.current_task or "unknown",
            trajectory_steps=trajectory_steps,
            variant_meta=variant_meta,
            files_read=list(env.files_read),
            files_written=list(env.files_written),
            final_score=env.final_score,
            security_violations=env.security_violations,
        )
        plan = improvement_engine.generate_improvement_plan(
            episode_id=traj.get("episode_id", ""),
            task=env.current_task or "unknown",
            failure_type=fail_report.primary_failure,
            failure_evidence=[f.evidence for f in fail_report.failures],
            original_score=env.final_score,
            trajectory_steps=trajectory_steps,
            files_read=list(env.files_read),
            files_written=list(env.files_written),
        ).to_dict()
        out = [
            "🔁 SELF-IMPROVEMENT PLAN",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
            f"Original Score: {plan['original_score']:.3f}",
            f"Failure Type: {plan['failure_type']}",
            f"\n❌ What Went Wrong:\n {plan['what_went_wrong']}",
            f"\n🎯 Improved Strategy:\n {plan['improved_strategy']}",
            "\n📋 Step-by-Step Plan:",
        ]
        out.extend(f" {step}" for step in plan.get("step_by_step_plan", []))
        if plan.get("specific_errors"):
            out.append("\n🔎 Specific Errors:")
            out.extend(f" • {e}" for e in plan["specific_errors"][:5])
        out.append("\n💉 System Prompt Injection (for next LLM run):")
        out.append("─────────────────────────────────────")
        out.append(plan.get("system_prompt_addon", "No injection needed."))
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
383
+
384
+
385
# ── Tab 6: Compare Agents ─────────────────────────────────────────────────────

def run_comparison(task: str, selected_agents: list):
    """Run several built-in agents on *task* and render a comparison table."""
    try:
        # An empty selection means "use the engine's default agent roster".
        roster = selected_agents or None
        data = multi_agent_engine.compare(env, task=task, agents=roster).to_dict()

        out = [
            f"⚖️ MULTI-AGENT COMPARISON — {task} (variant: {data.get('variant_id')})",
            f"🏆 Winner: {data.get('winner')} (score: {data.get('winner_score', 0):.3f})",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
            f"{'Rank':<6} {'Agent':<16} {'Score':<8} {'Steps':<8} {'Strategy':<22} {'Failure':<22} {'Reliability':<12}",
            "─" * 100,
        ]
        for row in data.get("summary_table", []):
            out.append(
                f"#{row['rank']:<5} {row['agent']:<16} {row['score']:<8.3f} "
                f"{row['steps']:<8} {row['strategy']:<22} {row['failure']:<22} {row['reliability']:<12.3f}"
            )
        out.append("━" * 100)

        if data.get("insights"):
            out.append("\n💡 Insights:")
            out.extend(f" → {i}" for i in data["insights"])

        out.append("\n📊 Per-Agent Action Sequences:")
        for run in data.get("detailed_runs", []):
            sequence = " → ".join(run.get("action_sequence", []))
            out.append(f" {run['agent_name']:16s}: {sequence}")

        return "\n".join(out)
    except Exception as e:
        return f"❌ Error: {e}"
418
 
419
 
420
# ── Tab 7: 3D Visualizer ─────────────────────────────────────────────────────

def get_viz_html():
    """Generate the 3D visualizer HTML with current trajectory data injected.

    Loads ``static/viz3d.html`` and replaces the empty ``#viz-data``
    placeholder div with a JSON payload describing repo files, test→src
    dependency edges, and the steps taken this episode. Returns an error
    paragraph if the template file is missing.
    """
    static_path = os.path.join(os.path.dirname(__file__), "static", "viz3d.html")
    if not os.path.exists(static_path):
        return "<p style='color:red'>viz3d.html not found in static/</p>"

    # FIX: explicit encoding so the template's unicode (emoji, box-drawing
    # characters) doesn't depend on the host locale's default codec.
    with open(static_path, "r", encoding="utf-8") as f:
        html = f.read()

    # Gather viz data from the current environment (empty before any episode).
    traj = env.get_trajectory()
    if traj:
        meta = env.variant.meta if env.variant else {}
        bug_files = set(meta.get("bug_files", []))
        files = []
        if env.variant:
            for fname in env.variant.get_tree():
                ftype = "test" if fname.startswith("tests/") else \
                        "spec" if fname.endswith(".md") else "src"
                files.append({
                    "name": fname,
                    "type": ftype,
                    "is_bug_file": fname in bug_files,
                    "visited": fname in env.files_read,
                    "modified": fname in env.files_written,
                })

        # Naive all-pairs test→src edges; the client renders them as links.
        test_files = [f["name"] for f in files if f["type"] == "test"]
        src_files = [f["name"] for f in files if f["type"] == "src"]
        deps = [{"from": tf, "to": sf} for tf in test_files for sf in src_files]

        steps_data = [
            {
                "step": step.get("step_number", 0),
                "action": step.get("action_type", ""),
                "path": step.get("action_path"),
                "reward": step.get("reward", 0.0),
                "error": step.get("error"),
                "pass_rate": step.get("test_pass_rate"),
            }
            for step in traj.get("steps", [])
        ]

        # Keyword arguments for consistency with get_strategy_detection().
        strategy_report = strategy_det.detect(
            trajectory_steps=traj.get("steps", []),
            task=env.current_task or "unknown",
            variant_meta=meta,
            files_read=list(env.files_read),
            final_score=env.final_score,
        ) if traj.get("steps") else None

        viz_data = {
            "task": env.current_task or "unknown",
            "variant_id": traj.get("variant_id", "unknown"),
            "final_score": env.final_score,
            "strategy": strategy_report.strategy if strategy_report else "UNKNOWN",
            "failure_type": "—",
            "files": files,
            "dependencies": deps,
            "steps": steps_data,
        }
        # FIX: escape "<" inside the JSON (a valid JSON string escape) so the
        # payload can never close the container div or open a new tag.
        data_json = json.dumps(viz_data).replace("<", "\\u003c")
    else:
        data_json = ""

    # Inject the payload into the template's placeholder div.
    html = html.replace(
        '<div id="viz-data" style="display:none"></div>',
        f'<div id="viz-data" style="display:none">{data_json}</div>'
    )
    return html
496
 
497
+
498
+ # ── Build Gradio UI ───────────────────────────────────────────────────────────
499
+
500
+ with gr.Blocks(title="Codebase Navigation & Repair — OpenEnv v3") as demo:
501
  gr.Markdown(
502
+ "# 🔍 Codebase Navigation & Repair — OpenEnv v3\n"
503
+ "**The most advanced debugging + evaluation platform for AI coding agents.** "
504
+ "Navigate codebases · Fix bugs · Evaluate process · Visualize in 3D."
505
  )
506
 
507
  with gr.Tabs():
508
+
509
+ # ── Tab 1: Interactive ────────────────────────────────────────────────
510
  with gr.TabItem("🎮 Interactive"):
511
  with gr.Row():
512
  with gr.Column(scale=1):
513
  task_select = gr.Dropdown(
514
+ ["task1", "task2", "task3"], value="task1",
 
515
  label="Task",
516
+ info="task1=bugs, task2=cross-module, task3=feature impl"
517
  )
518
  reset_btn = gr.Button("🔄 Reset Environment", variant="primary")
519
+ gr.Markdown("### Action")
520
+ act_type = gr.Dropdown(
521
+ ["read_file", "write_file", "run_tests", "search_code", "submit"],
522
+ value="read_file", label="Action Type",
 
 
523
  )
524
+ act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
525
+ act_query = gr.Textbox(label="Query (search_code)", placeholder="validate_token")
526
+ act_content = gr.Textbox(label="Content (write_file)", lines=4)
527
  step_btn = gr.Button("▶️ Execute Step", variant="secondary")
 
528
  with gr.Column(scale=2):
529
+ status_box = gr.Textbox(label="Status", lines=14, interactive=False)
530
+ result_box = gr.Textbox(label="Last Result", lines=8, interactive=False)
531
  with gr.Row():
532
+ steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
533
  reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
534
+ reset_btn.click(reset_environment, [task_select], [status_box, result_box, steps_box, reward_box])
535
+ step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
536
+
537
+ # ── Tab 2: Run Agent ──────────────────────────────────────────────────
538
+ with gr.TabItem("🤖 Run Agent"):
539
+ gr.Markdown("### Built-in Demonstration Agent\nRuns deterministic read→submit strategy.")
540
+ agent_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
541
+ run_btn = gr.Button("🚀 Run Agent", variant="primary")
542
+ agent_output = gr.Textbox(label="Agent Log", lines=20, interactive=False)
543
+ run_btn.click(run_builtin_agent, [agent_task], [agent_output])
544
 
545
+ # ── Tab 3: Evaluation ─────────────────────────────────────────────────
546
+ with gr.TabItem("📊 Evaluation"):
547
+ with gr.Row():
548
+ eval_btn = gr.Button("🎯 Evaluation Report", variant="primary")
549
+ metrics_btn = gr.Button("📈 Metrics JSON", variant="secondary")
550
+ traj_btn = gr.Button("🗺️ Trajectory", variant="secondary")
551
+ eval_out = gr.Textbox(label="Output", lines=28, interactive=False)
552
+ eval_btn.click(get_evaluation, outputs=[eval_out])
553
+ metrics_btn.click(get_metrics, outputs=[eval_out])
554
+ traj_btn.click(get_trajectory, outputs=[eval_out])
555
+
556
+ # ── Tab 4: 🧠 Intelligence ─────────────────────────────────────────────
557
+ with gr.TabItem("🧠 Intelligence"):
558
+ gr.Markdown(
559
+ "### Deep Agent Intelligence Analysis\n"
560
+ "Failure classification, strategy detection, and advanced behavioral metrics."
561
  )
562
+ with gr.Row():
563
+ classify_btn = gr.Button("🔬 Classify Failure", variant="primary")
564
+ strategy_btn = gr.Button("🧭 Detect Strategy", variant="secondary")
565
+ adv_btn = gr.Button("⚡ Advanced Metrics", variant="secondary")
566
+ intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
567
+ classify_btn.click(get_failure_classification, outputs=[intel_out])
568
+ strategy_btn.click(get_strategy_detection, outputs=[intel_out])
569
+ adv_btn.click(get_advanced_metrics, outputs=[intel_out])
570
+
571
+ # ── Tab 5: 🔁 Self-Improve ─────────────────────────────────────────────
572
+ with gr.TabItem("🔁 Self-Improve"):
573
+ gr.Markdown(
574
+ "### Self-Improvement Loop\n"
575
+ "After a failure, this generates an actionable improvement plan and a "
576
+ "system prompt injection for the agent's next attempt."
577
  )
578
+ improve_btn = gr.Button("🔁 Generate Improvement Plan", variant="primary")
579
+ improve_out = gr.Textbox(label="Improvement Plan", lines=32, interactive=False)
580
+ improve_btn.click(get_improvement_plan, outputs=[improve_out])
581
 
582
+ # ── Tab 6: ⚖️ Compare ──────────────────────────────────────────────────
583
+ with gr.TabItem("⚖️ Compare Agents"):
584
  gr.Markdown(
585
+ "### Multi-Agent Strategy Comparison\n"
586
+ "Runs 4 built-in agent strategies on the same task to compare "
587
+ "efficiency, strategy, and reliability side-by-side."
588
  )
589
+ with gr.Row():
590
+ comp_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
591
+ comp_agents = gr.CheckboxGroup(
592
+ ["test-first", "search-first", "minimal", "exhaustive"],
593
+ value=["test-first", "search-first", "minimal", "exhaustive"],
594
+ label="Agents to Compare",
595
+ )
596
+ comp_btn = gr.Button("⚖️ Run Comparison", variant="primary")
597
+ comp_out = gr.Textbox(label="Comparison Report", lines=30, interactive=False)
598
+ comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
599
+
600
+ # ── Tab 7: 🌐 3D Visualizer ────────────────────────────────────────────
601
+ with gr.TabItem("🌐 3D Visualizer"):
602
+ gr.Markdown(
603
+ "### Agent Trajectory 3D Visualization\n"
604
+ "Files = 3D nodes · Dependencies = edges · Agent path = animated beam · "
605
+ "Timeline = scrubbable replay. **Run an episode first, then refresh.**"
606
  )
607
+ refresh_viz_btn = gr.Button("🔄 Load Trajectory into Visualizer", variant="primary")
608
+ viz_html = gr.HTML(value="<p style='color:#64748b;text-align:center;padding:40px'>Click 'Load Trajectory' after running an episode.</p>")
609
+ refresh_viz_btn.click(get_viz_html, outputs=[viz_html])
610
 
611
+ # ── Tab 8: API ────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
612
  with gr.TabItem("📖 API"):
613
  gr.Markdown("""
614
+ ### REST API — v3.0 Endpoints
 
 
615
 
616
+ #### Core (OpenEnv-compliant)
617
  | Endpoint | Method | Description |
618
  |----------|--------|-------------|
619
+ | `/reset?task=task1` | POST | Start new episode |
620
+ | `/step` | POST | Take action |
621
+ | `/state` | GET | Current state |
622
+ | `/health` | GET | Health check |
 
 
 
 
 
 
 
 
 
 
 
623
 
624
+ #### Evaluation
625
+ | Endpoint | Method | Description |
626
+ |----------|--------|-------------|
627
+ | `/trajectory` | GET | Full action log |
628
+ | `/evaluate` | GET | 6-dimension scores |
629
+ | `/metrics` | GET | Memory + security stats |
630
+ | `/fault-config` | POST | Enable fault injection |
631
 
632
+ #### Intelligence (NEW in v3)
633
+ | Endpoint | Method | Description |
634
+ |----------|--------|-------------|
635
+ | `/classify` | GET | Typed failure classification |
636
+ | `/strategy` | GET | Behavioral strategy detection |
637
+ | `/advanced-metrics` | GET | Entropy, reliability, consistency |
638
+ | `/improvement-plan` | GET | Self-improvement feedback |
639
+ | `/compare-agents` | POST | Multi-agent comparison |
640
+ | `/viz-data` | GET | 3D visualization data |
641
 
642
+ ```bash
643
+ BASE="http://localhost:7860"
644
+ curl -X POST "$BASE/reset?task=task1"
645
+ curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"src/auth.py"}'
646
+ curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
647
+ curl "$BASE/classify"
648
+ curl "$BASE/strategy"
649
+ curl "$BASE/advanced-metrics"
650
+ curl "$BASE/improvement-plan"
651
+ curl -X POST "$BASE/compare-agents?task=task1"
652
  ```
653
  """)
654
 
655
 
656
+ # ── Mount FastAPI under same process ──────────────────────────────────────────
657
  from server.app import app as fastapi_app
 
658
  gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
659
 
660
  if __name__ == "__main__":
server/advanced_metrics.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/advanced_metrics.py
2
+ """
3
+ Advanced Metrics Engine.
4
+
5
+ Computes metrics that existing benchmarks (SWE-bench, etc.) completely ignore:
6
+ - Exploration vs Exploitation ratio across episode
7
+ - Consistency score across multiple runs of same task
8
+ - Reliability index (weighted aggregate)
9
+ - Reasoning efficiency (useful actions / total actions)
10
+ - Decision entropy (how predictable/focused the agent is)
11
+ """
12
+ import math
13
+ from typing import List, Dict, Any, Optional
14
+ from dataclasses import dataclass, field
15
+
16
+
17
+ @dataclass
18
+ class AdvancedMetricsReport:
19
+ """All advanced metrics for one episode or cross-episode comparison."""
20
+
21
+ # Per-episode
22
+ reasoning_efficiency: float # Useful steps / total steps
23
+ exploration_ratio: float # Read+search vs write+test ratio
24
+ decision_entropy: float # Shannon entropy of action distribution
25
+ reliability_index: float # Composite reliability score
26
+ pivot_rate: float # Strategy changes per 10 steps
27
+ wasteful_ratio: float # Redundant actions / total actions
28
+
29
+ # Cross-episode (populated when history provided)
30
+ consistency_score: float = 0.0 # Variance across runs (lower variance = higher consistency)
31
+ runs_analyzed: int = 0
32
+
33
+ # Breakdowns
34
+ action_distribution: Dict[str, int] = field(default_factory=dict)
35
+ useful_actions: List[str] = field(default_factory=list)
36
+ wasteful_actions: List[str] = field(default_factory=list)
37
+ reliability_breakdown: Dict[str, float] = field(default_factory=dict)
38
+
39
+ def to_dict(self) -> dict:
40
+ return {
41
+ "reasoning_efficiency": round(self.reasoning_efficiency, 3),
42
+ "exploration_ratio": round(self.exploration_ratio, 3),
43
+ "decision_entropy": round(self.decision_entropy, 3),
44
+ "reliability_index": round(self.reliability_index, 3),
45
+ "pivot_rate": round(self.pivot_rate, 3),
46
+ "wasteful_ratio": round(self.wasteful_ratio, 3),
47
+ "consistency_score": round(self.consistency_score, 3),
48
+ "runs_analyzed": self.runs_analyzed,
49
+ "action_distribution": self.action_distribution,
50
+ "useful_actions": self.useful_actions,
51
+ "wasteful_actions": self.wasteful_actions,
52
+ "reliability_breakdown": {
53
+ k: round(v, 3) for k, v in self.reliability_breakdown.items()
54
+ },
55
+ }
56
+
57
+
58
+ class AdvancedMetricsEngine:
59
+ """
60
+ Computes advanced behavioral and reliability metrics from trajectory data.
61
+
62
+ Usage:
63
+ engine = AdvancedMetricsEngine()
64
+ report = engine.compute(
65
+ trajectory_steps=[...],
66
+ variant_meta={...},
67
+ final_score=0.7,
68
+ files_read=[...],
69
+ files_written=[...],
70
+ history=[], # Pass previous episode scores for consistency
71
+ )
72
+ """
73
+
74
+ def __init__(self):
75
+ self._score_history: List[float] = [] # Tracks scores across episodes
76
+
77
+ def compute(
78
+ self,
79
+ trajectory_steps: List[dict],
80
+ variant_meta: Dict[str, Any],
81
+ final_score: float,
82
+ files_read: List[str],
83
+ files_written: List[str],
84
+ history: Optional[List[float]] = None,
85
+ ) -> AdvancedMetricsReport:
86
+ """Compute all advanced metrics for one episode."""
87
+ # Record this score in history
88
+ self._score_history.append(final_score)
89
+
90
+ if not trajectory_steps:
91
+ return AdvancedMetricsReport(
92
+ reasoning_efficiency=0.0,
93
+ exploration_ratio=0.5,
94
+ decision_entropy=0.0,
95
+ reliability_index=0.0,
96
+ pivot_rate=0.0,
97
+ wasteful_ratio=1.0,
98
+ )
99
+
100
+ action_seq = [s.get("action_type", "unknown") for s in trajectory_steps]
101
+ total = len(action_seq)
102
+
103
+ # ── Action distribution ───────────────────────────────────────────────
104
+ from collections import Counter
105
+ dist = Counter(action_seq)
106
+ action_distribution = dict(dist)
107
+
108
+ # ── Decision entropy (Shannon entropy of action types) ────────────────
109
+ entropy = 0.0
110
+ for count in dist.values():
111
+ p = count / total
112
+ if p > 0:
113
+ entropy -= p * math.log2(p)
114
+ # Normalize by max possible entropy (log2 of unique action types)
115
+ max_entropy = math.log2(len(dist)) if len(dist) > 1 else 1.0
116
+ normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0.0
117
+
118
+ # ── Exploration vs exploitation ratio ─────────────────────────────────
119
+ explore = dist.get("read_file", 0) + dist.get("search_code", 0)
120
+ exploit = dist.get("write_file", 0) + dist.get("run_tests", 0)
121
+ exploration_ratio = explore / (explore + exploit) if (explore + exploit) > 0 else 0.5
122
+
123
+ # ���─ Redundancy / wasteful actions ─────────────────────────────────────
124
+ read_paths = [
125
+ s.get("action_path")
126
+ for s in trajectory_steps
127
+ if s.get("action_type") == "read_file" and s.get("action_path")
128
+ ]
129
+ seen = set()
130
+ redundant_reads = 0
131
+ for p in read_paths:
132
+ if p in seen:
133
+ redundant_reads += 1
134
+ seen.add(p)
135
+
136
+ error_actions = sum(1 for s in trajectory_steps if s.get("error"))
137
+ total_wasteful = redundant_reads + error_actions
138
+ wasteful_ratio = total_wasteful / total if total > 0 else 0.0
139
+
140
+ wasteful_actions = []
141
+ if redundant_reads > 0:
142
+ wasteful_actions.append(f"{redundant_reads}x redundant file reads")
143
+ if error_actions > 0:
144
+ wasteful_actions.append(f"{error_actions}x actions that produced errors")
145
+
146
+ # ── Useful action detection ───────────────────────────────────────────
147
+ useful_actions = []
148
+ relevant = set(
149
+ variant_meta.get("bug_files", []) +
150
+ variant_meta.get("interface_files", []) +
151
+ variant_meta.get("read_first_files", []) +
152
+ variant_meta.get("files_to_implement", [])
153
+ )
154
+ relevant_reads = [f for f in files_read if f in relevant]
155
+ if relevant_reads:
156
+ useful_actions.append(f"Read {len(relevant_reads)} key files: {relevant_reads[:3]}")
157
+
158
+ test_rates = [
159
+ s.get("test_pass_rate")
160
+ for s in trajectory_steps
161
+ if s.get("test_pass_rate") is not None
162
+ ]
163
+ if len(test_rates) >= 2 and test_rates[-1] > test_rates[0]:
164
+ useful_actions.append(
165
+ f"Test pass rate improved from {test_rates[0]:.2f} to {test_rates[-1]:.2f}"
166
+ )
167
+
168
+ if files_written:
169
+ useful_actions.append(f"Wrote {len(files_written)} file(s): {files_written[:3]}")
170
+
171
+ # ── Reasoning efficiency ──────────────────────────────────────────────
172
+ useful_count = len(relevant_reads) + (1 if files_written else 0) + (1 if test_rates else 0)
173
+ reasoning_efficiency = min(1.0, useful_count / max(total, 1))
174
+
175
+ # ── Pivot rate (strategy switches per 10 steps) ───────────────────────
176
+ pivots = 0
177
+ for i in range(1, len(action_seq)):
178
+ prev_explore = action_seq[i-1] in ("read_file", "search_code")
179
+ curr_exploit = action_seq[i] in ("write_file", "run_tests")
180
+ prev_exploit = action_seq[i-1] in ("write_file", "run_tests")
181
+ curr_explore = action_seq[i] in ("read_file", "search_code")
182
+ if (prev_explore and curr_exploit) or (prev_exploit and curr_explore):
183
+ pivots += 1
184
+ pivot_rate = (pivots / total) * 10 if total > 0 else 0.0 # per 10 steps
185
+
186
+ # ── Reliability index ─────────────────────────────────────────────────
187
+ # Weighted aggregate: correctness matters most
188
+ reliability_breakdown = {
189
+ "correctness": final_score,
190
+ "efficiency": max(0.0, 1.0 - wasteful_ratio),
191
+ "focus": 1.0 - normalized_entropy, # Low entropy = focused behavior
192
+ "verification": 1.0 if test_rates else 0.0,
193
+ "safety": 1.0, # Will be reduced by security violations
194
+ }
195
+
196
+ # Check for security flags
197
+ sec_flags = sum(len(s.get("security_flags", [])) for s in trajectory_steps)
198
+ if sec_flags > 0:
199
+ reliability_breakdown["safety"] = max(0.0, 1.0 - sec_flags * 0.2)
200
+
201
+ # Weighted reliability index
202
+ weights = {
203
+ "correctness": 0.40,
204
+ "efficiency": 0.20,
205
+ "focus": 0.15,
206
+ "verification": 0.15,
207
+ "safety": 0.10,
208
+ }
209
+ reliability_index = sum(
210
+ reliability_breakdown[k] * weights[k]
211
+ for k in weights
212
+ )
213
+
214
+ # ── Consistency score (cross-episode) ────────────────────────────────
215
+ scores_to_use = list(history) if history else self._score_history
216
+ consistency_score = 0.0
217
+ runs_analyzed = len(scores_to_use)
218
+
219
+ if runs_analyzed >= 2:
220
+ mean = sum(scores_to_use) / runs_analyzed
221
+ variance = sum((s - mean) ** 2 for s in scores_to_use) / runs_analyzed
222
+ std_dev = math.sqrt(variance)
223
+ # Consistency = 1 - normalized_std_dev (higher = more consistent)
224
+ consistency_score = max(0.0, 1.0 - (std_dev / max(mean, 0.01)))
225
+
226
+ return AdvancedMetricsReport(
227
+ reasoning_efficiency=reasoning_efficiency,
228
+ exploration_ratio=exploration_ratio,
229
+ decision_entropy=normalized_entropy,
230
+ reliability_index=reliability_index,
231
+ pivot_rate=pivot_rate,
232
+ wasteful_ratio=wasteful_ratio,
233
+ consistency_score=consistency_score,
234
+ runs_analyzed=runs_analyzed,
235
+ action_distribution=action_distribution,
236
+ useful_actions=useful_actions,
237
+ wasteful_actions=wasteful_actions,
238
+ reliability_breakdown=reliability_breakdown,
239
+ )
240
+
241
+ def get_score_history(self) -> List[float]:
242
+ return list(self._score_history)
243
+
244
+ def reset_history(self):
245
+ self._score_history = []
server/app.py CHANGED
@@ -1,13 +1,17 @@
1
  # server/app.py
2
  """
3
- FastAPI server exposing the OpenEnv-compliant API + reliability layer endpoints.
4
 
5
- Core endpoints: POST /reset, POST /step, GET /state, GET /health
6
- Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
7
- Control endpoints: POST /fault-config
 
 
8
  """
9
  from fastapi import FastAPI, HTTPException
 
10
  from contextlib import asynccontextmanager
 
11
 
12
  from .environment import CodebaseNavEnvironment
13
  from .models import (
@@ -15,9 +19,19 @@ from .models import (
15
  TrajectoryResponse, EvaluationResponse, MetricsResponse,
16
  FaultConfigRequest,
17
  )
 
 
 
 
 
18
 
19
- # Global environment instance (one session per container)
20
  env = CodebaseNavEnvironment()
 
 
 
 
 
21
 
22
 
23
  @asynccontextmanager
@@ -27,45 +41,41 @@ async def lifespan(app: FastAPI):
27
 
28
 
29
  app = FastAPI(
30
- title="Codebase Navigation & Repair — OpenEnv",
31
  description=(
32
- "RL environment where agents navigate and repair Python codebases. "
33
- "Extended with process-based evaluation, trajectory replay, "
34
- "fault injection, security scanning, and memory tracking."
35
  ),
36
- version="2.0.0",
37
  lifespan=lifespan,
38
  )
39
 
 
 
 
 
40
 
41
- # ── Core OpenEnv Endpoints ───────────────────────────────────────────────────
 
42
 
43
  @app.post("/reset", response_model=ResetResult)
44
  async def reset(task: str = "task1"):
45
- """
46
- Start a new episode.
47
- task: "task1" | "task2" | "task3"
48
- """
49
  valid_tasks = ["task1", "task2", "task3"]
50
  if task not in valid_tasks:
51
  raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
52
  try:
53
- result = env.reset(task=task)
54
- return result
55
  except Exception as e:
56
  raise HTTPException(status_code=500, detail=str(e))
57
 
58
 
59
  @app.post("/step", response_model=StepResult)
60
  async def step(action: RepoAction):
61
- """
62
- Take one action in the current episode.
63
- """
64
  if env.done:
65
- raise HTTPException(status_code=400, detail="Episode is done. POST /reset to start a new one.")
66
  try:
67
- result = env.step(action)
68
- return result
69
  except RuntimeError as e:
70
  raise HTTPException(status_code=400, detail=str(e))
71
  except Exception as e:
@@ -74,12 +84,8 @@ async def step(action: RepoAction):
74
 
75
  @app.get("/state", response_model=StateResult)
76
  async def state():
77
- """
78
- Get current state without advancing the episode.
79
- """
80
- obs = env.get_state()
81
  return StateResult(
82
- observation=obs,
83
  current_score=env.final_score,
84
  total_steps_taken=env.steps_taken,
85
  )
@@ -87,17 +93,13 @@ async def state():
87
 
88
  @app.get("/health")
89
  async def health():
90
- return {"status": "ok", "environment": "codebase-nav-env", "version": "2.0.0"}
91
 
92
 
93
- # ── Evaluation & Reliability Endpoints ───────────────────────────────────────
94
 
95
  @app.get("/trajectory", response_model=TrajectoryResponse)
96
  async def get_trajectory():
97
- """
98
- Get the full trajectory of the current or most recent episode.
99
- Returns every action, observation snapshot, reward, timing, and security flags.
100
- """
101
  traj = env.get_trajectory()
102
  if not traj:
103
  return TrajectoryResponse()
@@ -106,11 +108,6 @@ async def get_trajectory():
106
 
107
  @app.get("/evaluate", response_model=EvaluationResponse)
108
  async def get_evaluation():
109
- """
110
- Get multi-dimensional evaluation of the current/latest episode.
111
- Scores across 6 dimensions: efficiency, navigation, correctness,
112
- reasoning, robustness, security.
113
- """
114
  evaluation = env.get_evaluation()
115
  if "error" in evaluation:
116
  return EvaluationResponse()
@@ -119,23 +116,224 @@ async def get_evaluation():
119
 
120
  @app.get("/metrics", response_model=MetricsResponse)
121
  async def get_metrics():
122
- """
123
- Get comprehensive metrics including memory usage, security stats,
124
- fault injection report, wasteful patterns, and action timeline.
125
- """
126
- metrics = env.get_metrics()
127
- return MetricsResponse(**metrics)
128
 
129
 
130
  @app.post("/fault-config")
131
  async def set_fault_config(config: FaultConfigRequest):
132
- """
133
- Configure fault injection for the NEXT episode (takes effect on next /reset).
134
- Levels: "none" (default), "light" (misleading comments), "heavy" (all faults)
135
- """
136
  env.set_fault_config(config.level)
137
  return {
138
  "status": "ok",
139
  "fault_level": config.level,
140
  "message": f"Fault injection set to '{config.level}'. Takes effect on next /reset.",
141
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # server/app.py
2
  """
3
+ FastAPI server v3.0
4
 
5
+ Core endpoints: POST /reset, POST /step, GET /state, GET /health
6
+ Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
7
+ Control endpoints: POST /fault-config
8
+ Intelligence endpoints: GET /classify, GET /strategy, GET /advanced-metrics,
9
+ POST /compare-agents, GET /improvement-plan, GET /viz-data
10
  """
11
  from fastapi import FastAPI, HTTPException
12
+ from fastapi.staticfiles import StaticFiles
13
  from contextlib import asynccontextmanager
14
+ import os
15
 
16
  from .environment import CodebaseNavEnvironment
17
  from .models import (
 
19
  TrajectoryResponse, EvaluationResponse, MetricsResponse,
20
  FaultConfigRequest,
21
  )
22
+ from .failure_classifier import FailureClassifier
23
+ from .strategy_detector import StrategyDetector
24
+ from .advanced_metrics import AdvancedMetricsEngine
25
+ from .self_improvement import SelfImprovementEngine
26
+ from .multi_agent import MultiAgentComparison
27
 
28
+ # Global instances
29
  env = CodebaseNavEnvironment()
30
+ failure_clf = FailureClassifier()
31
+ strategy_det = StrategyDetector()
32
+ adv_metrics = AdvancedMetricsEngine()
33
+ improvement = SelfImprovementEngine()
34
+ multi_agent = MultiAgentComparison()
35
 
36
 
37
  @asynccontextmanager
 
41
 
42
 
43
  app = FastAPI(
44
+ title="Codebase Navigation & Repair — OpenEnv v3",
45
  description=(
46
+ "RL environment for AI coding agents extended with process-based evaluation, "
47
+ "failure classification, strategy detection, self-improvement loops, "
48
+ "multi-agent comparison, 3D visualization, and advanced metrics."
49
  ),
50
+ version="3.0.0",
51
  lifespan=lifespan,
52
  )
53
 
54
+ # Serve static files (3D visualizer HTML)
55
+ _static_dir = os.path.join(os.path.dirname(__file__), "..", "static")
56
+ if os.path.exists(_static_dir):
57
+ app.mount("/static", StaticFiles(directory=_static_dir), name="static")
58
 
59
+
60
+ # ── Core OpenEnv Endpoints ────────────────────────────────────────────────────
61
 
62
  @app.post("/reset", response_model=ResetResult)
63
  async def reset(task: str = "task1"):
 
 
 
 
64
  valid_tasks = ["task1", "task2", "task3"]
65
  if task not in valid_tasks:
66
  raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
67
  try:
68
+ return env.reset(task=task)
 
69
  except Exception as e:
70
  raise HTTPException(status_code=500, detail=str(e))
71
 
72
 
73
  @app.post("/step", response_model=StepResult)
74
  async def step(action: RepoAction):
 
 
 
75
  if env.done:
76
+ raise HTTPException(status_code=400, detail="Episode is done. POST /reset to start.")
77
  try:
78
+ return env.step(action)
 
79
  except RuntimeError as e:
80
  raise HTTPException(status_code=400, detail=str(e))
81
  except Exception as e:
 
84
 
85
  @app.get("/state", response_model=StateResult)
86
  async def state():
 
 
 
 
87
  return StateResult(
88
+ observation=env.get_state(),
89
  current_score=env.final_score,
90
  total_steps_taken=env.steps_taken,
91
  )
 
93
 
94
  @app.get("/health")
95
  async def health():
96
+ return {"status": "ok", "environment": "codebase-nav-env", "version": "3.0.0"}
97
 
98
 
99
+ # ── Evaluation Endpoints ──────────────────────────────────────────────────────
100
 
101
  @app.get("/trajectory", response_model=TrajectoryResponse)
102
  async def get_trajectory():
 
 
 
 
103
  traj = env.get_trajectory()
104
  if not traj:
105
  return TrajectoryResponse()
 
108
 
109
  @app.get("/evaluate", response_model=EvaluationResponse)
110
  async def get_evaluation():
 
 
 
 
 
111
  evaluation = env.get_evaluation()
112
  if "error" in evaluation:
113
  return EvaluationResponse()
 
116
 
117
  @app.get("/metrics", response_model=MetricsResponse)
118
  async def get_metrics():
119
+ return MetricsResponse(**env.get_metrics())
 
 
 
 
 
120
 
121
 
122
  @app.post("/fault-config")
123
  async def set_fault_config(config: FaultConfigRequest):
 
 
 
 
124
  env.set_fault_config(config.level)
125
  return {
126
  "status": "ok",
127
  "fault_level": config.level,
128
  "message": f"Fault injection set to '{config.level}'. Takes effect on next /reset.",
129
  }
130
+
131
+
132
+ # ── Intelligence Endpoints (NEW in v3) ────────────────────────────────────────
133
+
134
+ @app.get("/classify")
135
+ async def classify_failure():
136
+ """
137
+ Classify the failure type of the current/latest episode.
138
+ Returns typed failure taxonomy with root cause and remediation.
139
+ """
140
+ traj = env.get_trajectory()
141
+ if not traj:
142
+ return {"error": "No trajectory available. Run an episode first."}
143
+
144
+ steps = traj.get("steps", [])
145
+ meta = env.variant.meta if env.variant else {}
146
+
147
+ report = failure_clf.classify(
148
+ episode_id=traj.get("episode_id", ""),
149
+ task=env.current_task or "unknown",
150
+ trajectory_steps=steps,
151
+ variant_meta=meta,
152
+ files_read=list(env.files_read),
153
+ files_written=list(env.files_written),
154
+ final_score=env.final_score,
155
+ security_violations=env.security_violations,
156
+ )
157
+ return report.to_dict()
158
+
159
+
160
+ @app.get("/strategy")
161
+ async def detect_strategy():
162
+ """
163
+ Detect the behavioral strategy pattern used by the agent.
164
+ Returns: TARGETED_DEBUGGING | SYSTEMATIC_SEARCH | BRUTE_FORCE |
165
+ RANDOM_EXPLORATION | SPEC_DRIVEN | MINIMAL_EFFORT
166
+ """
167
+ traj = env.get_trajectory()
168
+ if not traj:
169
+ return {"error": "No trajectory available."}
170
+
171
+ steps = traj.get("steps", [])
172
+ meta = env.variant.meta if env.variant else {}
173
+
174
+ report = strategy_det.detect(
175
+ trajectory_steps=steps,
176
+ task=env.current_task or "unknown",
177
+ variant_meta=meta,
178
+ files_read=list(env.files_read),
179
+ final_score=env.final_score,
180
+ )
181
+ return report.to_dict()
182
+
183
+
184
+ @app.get("/advanced-metrics")
185
+ async def get_advanced_metrics():
186
+ """
187
+ Compute advanced metrics: reasoning efficiency, decision entropy,
188
+ exploration ratio, reliability index, consistency, pivot rate.
189
+ """
190
+ traj = env.get_trajectory()
191
+ if not traj:
192
+ return {"error": "No trajectory available."}
193
+
194
+ steps = traj.get("steps", [])
195
+ meta = env.variant.meta if env.variant else {}
196
+
197
+ report = adv_metrics.compute(
198
+ trajectory_steps=steps,
199
+ variant_meta=meta,
200
+ final_score=env.final_score,
201
+ files_read=list(env.files_read),
202
+ files_written=list(env.files_written),
203
+ )
204
+ return report.to_dict()
205
+
206
+
207
+ @app.get("/improvement-plan")
208
+ async def get_improvement_plan():
209
+ """
210
+ Generate a self-improvement plan based on failure classification.
211
+ Returns: what_went_wrong, improved_strategy, step-by-step plan,
212
+ system_prompt_addon (for injecting into next agent run).
213
+ """
214
+ traj = env.get_trajectory()
215
+ if not traj:
216
+ return {"error": "No trajectory available."}
217
+
218
+ steps = traj.get("steps", [])
219
+ meta = env.variant.meta if env.variant else {}
220
+
221
+ # Classify first
222
+ fail_report = failure_clf.classify(
223
+ episode_id=traj.get("episode_id", ""),
224
+ task=env.current_task or "unknown",
225
+ trajectory_steps=steps,
226
+ variant_meta=meta,
227
+ files_read=list(env.files_read),
228
+ files_written=list(env.files_written),
229
+ final_score=env.final_score,
230
+ security_violations=env.security_violations,
231
+ )
232
+
233
+ plan = improvement.generate_improvement_plan(
234
+ episode_id=traj.get("episode_id", ""),
235
+ task=env.current_task or "unknown",
236
+ failure_type=fail_report.primary_failure,
237
+ failure_evidence=[f.evidence for f in fail_report.failures],
238
+ original_score=env.final_score,
239
+ trajectory_steps=steps,
240
+ files_read=list(env.files_read),
241
+ files_written=list(env.files_written),
242
+ )
243
+ return plan.to_dict()
244
+
245
+
246
+ @app.post("/compare-agents")
247
+ async def compare_agents(task: str = "task1", agents: str = "all"):
248
+ """
249
+ Run multiple agent strategies on the same task and compare side-by-side.
250
+ agents: "all" | comma-separated list of: test-first,search-first,minimal,exhaustive
251
+ """
252
+ valid_tasks = ["task1", "task2", "task3"]
253
+ if task not in valid_tasks:
254
+ raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
255
+
256
+ if agents == "all":
257
+ agent_list = None
258
+ else:
259
+ agent_list = [a.strip() for a in agents.split(",")]
260
+
261
+ try:
262
+ report = multi_agent.compare(env, task=task, agents=agent_list)
263
+ return report.to_dict()
264
+ except Exception as e:
265
+ raise HTTPException(status_code=500, detail=str(e))
266
+
267
+
268
+ @app.get("/viz-data")
269
+ async def get_viz_data():
270
+ """
271
+ Get structured 3D visualization data for the current/latest episode.
272
+ Returns nodes (files), edges (dependencies), and step trajectory
273
+ in the format expected by the Three.js visualizer.
274
+ """
275
+ traj = env.get_trajectory()
276
+ if not traj:
277
+ return {"error": "No trajectory available."}
278
+
279
+ # Build file nodes
280
+ files = []
281
+ visited = set(env.files_read)
282
+ modified = set(env.files_written)
283
+ meta = env.variant.meta if env.variant else {}
284
+ bug_files = set(meta.get("bug_files", []))
285
+
286
+ if env.variant:
287
+ tree = env.variant.get_tree()
288
+ for f in tree:
289
+ ftype = "test" if f.startswith("tests/") else \
290
+ "spec" if f.endswith(".md") else "src"
291
+ files.append({
292
+ "name": f,
293
+ "type": ftype,
294
+ "is_bug_file": f in bug_files,
295
+ "visited": f in visited,
296
+ "modified": f in modified,
297
+ })
298
+
299
+ # Build dependency edges from known patterns
300
+ deps = []
301
+ test_files = [f["name"] for f in files if f["type"] == "test"]
302
+ src_files = [f["name"] for f in files if f["type"] == "src"]
303
+
304
+ # Simple heuristic: connect tests to src files
305
+ for tf in test_files:
306
+ for sf in src_files:
307
+ deps.append({"from": tf, "to": sf})
308
+
309
+ # Build step data for trajectory
310
+ steps_data = []
311
+ for step in traj.get("steps", []):
312
+ steps_data.append({
313
+ "step": step.get("step_number", 0),
314
+ "action": step.get("action_type", ""),
315
+ "path": step.get("action_path"),
316
+ "reward": step.get("reward", 0.0),
317
+ "error": step.get("error"),
318
+ "pass_rate": step.get("test_pass_rate"),
319
+ })
320
+
321
+ # Get strategy
322
+ strategy_info = strategy_det.detect(
323
+ traj.get("steps", []),
324
+ env.current_task or "unknown",
325
+ meta,
326
+ list(env.files_read),
327
+ env.final_score,
328
+ ) if traj.get("steps") else None
329
+
330
+ return {
331
+ "task": env.current_task or "unknown",
332
+ "variant_id": traj.get("variant_id", "unknown"),
333
+ "final_score": env.final_score,
334
+ "strategy": strategy_info.strategy if strategy_info else "UNKNOWN",
335
+ "failure_type": "—",
336
+ "files": files,
337
+ "dependencies": deps,
338
+ "steps": steps_data,
339
+ }
server/failure_classifier.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/failure_classifier.py
2
+ """
3
+ Typed Failure Classification Engine.
4
+
5
+ Classifies agent failures into precise, actionable categories rather than
6
+ vague scores. Each failure type has a root cause, evidence, and remediation.
7
+
8
+ Failure taxonomy:
9
+ WRONG_FILE_NAVIGATION — agent read irrelevant files, missed key files
10
+ BLIND_WRITE — agent wrote code without reading first
11
+ HALLUCINATED_CODE — agent wrote syntactically/logically wrong code
12
+ NEVER_TESTED — agent submitted without running any tests
13
+ LOOPING_BEHAVIOR — agent repeated same action 3+ times
14
+ CONTEXT_OVERFLOW — agent read enormous amounts of irrelevant data
15
+ SECURITY_VIOLATION — agent wrote dangerous code
16
+ CORRECT — no failure detected
17
+ """
18
+ from typing import List, Dict, Any, Optional
19
+ from dataclasses import dataclass, field
20
+
21
+
22
@dataclass
class FailureInstance:
    """One classified failure event within a single episode.

    Produced by FailureClassifier; severity is one of "critical", "major",
    or "minor" (ranking used to pick the primary failure is SEVERITY_RANK).
    """
    failure_type: str   # e.g. "WRONG_FILE_NAVIGATION"
    severity: str       # "critical" | "major" | "minor"
    step_number: int    # Which step triggered it (1-based)
    evidence: str       # Specific observation
    root_cause: str     # Why this happens
    remediation: str    # How to fix in next run
31
+
32
+
33
@dataclass
class FailureReport:
    """Full failure analysis for one episode.

    Aggregates every classified FailureInstance plus a human-readable
    summary and an actionable retry hint for the next attempt.
    """
    episode_id: str
    task: str
    primary_failure: str  # Most severe failure type
    failures: List[FailureInstance] = field(default_factory=list)
    success: bool = False
    failure_summary: str = ""
    retry_hint: str = ""  # Actionable hint for the next attempt

    def to_dict(self) -> dict:
        """Serialize the report (and each nested failure) to plain dicts."""
        serialized_failures = []
        for fail in self.failures:
            serialized_failures.append({
                "type": fail.failure_type,
                "severity": fail.severity,
                "step": fail.step_number,
                "evidence": fail.evidence,
                "root_cause": fail.root_cause,
                "remediation": fail.remediation,
            })
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "success": self.success,
            "primary_failure": self.primary_failure,
            "failure_count": len(serialized_failures),
            "failures": serialized_failures,
            "failure_summary": self.failure_summary,
            "retry_hint": self.retry_hint,
        }
65
+
66
+
67
# ── Severity ordering for picking primary failure ─────────────────────────────
SEVERITY_RANK = {"critical": 3, "major": 2, "minor": 1}

# Canonical remediation text per failure type. The text of the primary
# failure is surfaced verbatim as the episode's retry hint.
FAILURE_REMEDIATION = {
    "WRONG_FILE_NAVIGATION": (
        "Read the failing test file first to understand the module under test, "
        "then navigate directly to the imported source files."
    ),
    "BLIND_WRITE": (
        "Always read the target file before writing. Use read_file → write_file → run_tests."
    ),
    "HALLUCINATED_CODE": (
        "Re-read the source file, understand the function signature, "
        "then write a minimal targeted fix. Run tests to verify."
    ),
    "NEVER_TESTED": (
        "Always call run_tests after writing a fix. "
        "Submit only when test pass rate has demonstrably improved."
    ),
    "LOOPING_BEHAVIOR": (
        "Stop repeating the same action. Use search_code to find the bug location, "
        "then navigate directly to it."
    ),
    "CONTEXT_OVERFLOW": (
        "Focus on files explicitly referenced in the failing test's imports. "
        "Avoid reading utility files unless the test error specifically mentions them."
    ),
    "SECURITY_VIOLATION": (
        "Do not use os.system, eval, exec, or subprocess in fixes. "
        "Write pure Python logic without shell calls."
    ),
    "CORRECT": "No remediation needed.",
}
100
+
101
+
102
class FailureClassifier:
    """
    Classifies agent failures from trajectory data.

    Each internal classifier inspects the recorded action sequence and
    episode metadata, appending zero or more FailureInstance records; the
    most severe one becomes the report's primary failure.

    Usage:
        clf = FailureClassifier()
        report = clf.classify(
            episode_id="abc123",
            task="task1",
            trajectory_steps=[...],
            variant_meta={...},
            files_read=[...],
            files_written=[...],
            final_score=0.0,
        )
    """

    def classify(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: Dict[str, Any],
        files_read: List[str],
        files_written: List[str],
        final_score: float,
        security_violations: int = 0,
    ) -> FailureReport:
        """Run all classifiers and build a structured failure report.

        Returns a FailureReport; a score >= 0.5 with no security
        violations short-circuits to a "CORRECT" report.
        """
        failures: List[FailureInstance] = []
        success = final_score >= 0.5

        if success and security_violations == 0:
            return FailureReport(
                episode_id=episode_id,
                task=task,
                primary_failure="CORRECT",
                failures=[],
                success=True,
                failure_summary="Agent succeeded without errors.",
                retry_hint="",
            )

        action_sequence = [s.get("action_type", "") for s in trajectory_steps]

        # ── Classifier 1: Wrong File Navigation ───────────────────────────────
        # "Relevant" files come from variant metadata; the agent should have
        # read at least one of them.
        relevant = set(
            variant_meta.get("bug_files", []) +
            variant_meta.get("interface_files", []) +
            variant_meta.get("read_first_files", []) +
            variant_meta.get("files_to_implement", [])
        )
        if relevant and files_read:
            irrelevant_reads = [f for f in files_read if f not in relevant
                                and not f.startswith("tests/")]
            if len(irrelevant_reads) > 1 and not any(f in files_read for f in relevant):
                failures.append(FailureInstance(
                    failure_type="WRONG_FILE_NAVIGATION",
                    severity="critical",
                    step_number=1,
                    evidence=f"Read {len(irrelevant_reads)} irrelevant files: {irrelevant_reads[:3]}. "
                             f"Never read key files: {list(relevant)[:3]}",
                    root_cause="Agent navigated to wrong part of the codebase entirely.",
                    remediation=FAILURE_REMEDIATION["WRONG_FILE_NAVIGATION"],
                ))

        # ── Classifier 2: Blind Write ─────────────────────────────────────────
        # A write_file with no read_file anywhere before it in the episode.
        write_indices = [i for i, a in enumerate(action_sequence) if a == "write_file"]
        for wi in write_indices:
            if "read_file" not in action_sequence[:wi]:
                failures.append(FailureInstance(
                    failure_type="BLIND_WRITE",
                    severity="critical",
                    step_number=wi + 1,
                    evidence=f"write_file at step {wi+1} with zero prior read_file actions.",
                    root_cause="Agent attempted to fix code without reading it first — likely hallucinating.",
                    remediation=FAILURE_REMEDIATION["BLIND_WRITE"],
                ))

        # ── Classifier 3: Hallucinated Code ───────────────────────────────────
        # Detect a write followed by a test run with a very low pass rate.
        for i, step in enumerate(trajectory_steps):
            if step.get("action_type") == "run_tests":
                prev_write = None
                for j in range(i - 1, -1, -1):
                    if trajectory_steps[j].get("action_type") == "write_file":
                        prev_write = j
                        break
                if prev_write is not None:
                    pass_rate = step.get("test_pass_rate", None)
                    if pass_rate is not None and pass_rate < 0.3:
                        failures.append(FailureInstance(
                            failure_type="HALLUCINATED_CODE",
                            severity="major",
                            step_number=i + 1,
                            evidence=f"Test pass rate {pass_rate:.2f} after write at step {prev_write+1}. "
                                     f"Code change made things worse.",
                            root_cause="Agent wrote syntactically correct but semantically wrong code.",
                            remediation=FAILURE_REMEDIATION["HALLUCINATED_CODE"],
                        ))

        # ── Classifier 4: Never Tested ────────────────────────────────────────
        has_tests = "run_tests" in action_sequence
        has_writes = "write_file" in action_sequence
        has_submit = "submit" in action_sequence
        if has_submit and has_writes and not has_tests:
            failures.append(FailureInstance(
                failure_type="NEVER_TESTED",
                severity="major",
                step_number=len(action_sequence),
                evidence="Agent wrote code changes but submitted without running any tests.",
                root_cause="No feedback loop — agent cannot know if its fix worked.",
                remediation=FAILURE_REMEDIATION["NEVER_TESTED"],
            ))

        # ── Classifier 5: Looping Behavior ────────────────────────────────────
        # Any file read 3+ times indicates the agent is stuck.
        read_paths = [
            (i, s.get("action_path"))
            for i, s in enumerate(trajectory_steps)
            if s.get("action_type") == "read_file" and s.get("action_path")
        ]
        path_counts: Dict[str, List[int]] = {}
        for idx, path in read_paths:
            path_counts.setdefault(path, []).append(idx)

        for path, indices in path_counts.items():
            if len(indices) >= 3:
                failures.append(FailureInstance(
                    failure_type="LOOPING_BEHAVIOR",
                    severity="major",
                    step_number=indices[2] + 1,  # step of the third read
                    evidence=f"Read '{path}' {len(indices)} times (steps {[i+1 for i in indices]}). "
                             f"Agent is stuck in a read loop.",
                    root_cause="Agent cannot extract the needed information and keeps retrying.",
                    remediation=FAILURE_REMEDIATION["LOOPING_BEHAVIOR"],
                ))

        # ── Classifier 6: Context Overflow ────────────────────────────────────
        total_content = sum(
            s.get("action_content_length") or 0
            for s in trajectory_steps
            if s.get("action_type") == "read_file"
        )
        if total_content > 50_000 and final_score < 0.5:
            failures.append(FailureInstance(
                failure_type="CONTEXT_OVERFLOW",
                severity="minor",
                step_number=len(trajectory_steps),
                evidence=f"Agent read {total_content:,} chars total. "
                         f"Most of this was likely irrelevant context.",
                root_cause="Agent wasted token budget reading unnecessary files.",
                remediation=FAILURE_REMEDIATION["CONTEXT_OVERFLOW"],
            ))

        # ── Classifier 7: Security Violation ─────────────────────────────────
        if security_violations > 0:
            sec_steps = [
                s for s in trajectory_steps if s.get("security_flags")
            ]
            for ss in sec_steps:
                failures.append(FailureInstance(
                    failure_type="SECURITY_VIOLATION",
                    severity="critical",
                    step_number=ss.get("step_number", 0),
                    evidence=f"Flags: {ss.get('security_flags', [])}",
                    root_cause="Agent wrote unsafe code patterns that would be dangerous in production.",
                    remediation=FAILURE_REMEDIATION["SECURITY_VIOLATION"],
                ))

        # ── Build report ──────────────────────────────────────────────────────
        if not failures:
            # Failed but no specific classifier triggered — generic low score
            primary = "HALLUCINATED_CODE"
            summary = f"Score {final_score:.2f} — fix was written but insufficient. Re-read the source files more carefully."
            hint = "Read test file → read all src files → write targeted fix → run tests → submit."
        else:
            # Pick most severe failure as primary (stable sort keeps order
            # within the same severity).
            failures.sort(key=lambda f: SEVERITY_RANK.get(f.severity, 0), reverse=True)
            primary = failures[0].failure_type
            summary = "; ".join(f"{f.failure_type} (step {f.step_number})" for f in failures[:3])
            hint = failures[0].remediation

        return FailureReport(
            episode_id=episode_id,
            task=task,
            primary_failure=primary,
            failures=failures,
            success=success,
            failure_summary=summary,
            retry_hint=hint,
        )
server/multi_agent.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/multi_agent.py
2
+ """
3
+ Multi-Agent Comparison Engine.
4
+
5
+ Runs multiple agent configurations against the SAME task variant
6
+ and produces a side-by-side comparison report.
7
+
8
+ Agent configurations:
9
+ - Deterministic (rule-based, no LLM) — baseline
10
+ - Test-first (forces reading tests before anything)
11
+ - Search-first (forces search_code before reads)
12
+ - LLM-based (if HF_TOKEN provided)
13
+
14
+ This is the key feature that answers: "Which agent strategy wins?"
15
+ """
16
+ import time
17
+ import copy
18
+ from typing import List, Dict, Any, Optional, Callable
19
+ from dataclasses import dataclass, field
20
+
21
+
22
@dataclass
class AgentRunResult:
    """Result of one agent configuration running one episode."""
    agent_name: str
    task: str
    variant_id: str
    final_score: float
    total_steps: int
    cumulative_reward: float
    duration_seconds: float
    action_sequence: List[str]
    files_read: List[str]
    files_written: List[str]
    strategy: str  # Detected strategy label
    strategy_score: float
    failure_type: str
    reliability_index: float
    step_timeline: List[dict]

    def to_dict(self) -> dict:
        """Serialize for JSON output, rounding float metrics for display."""
        payload = {
            "agent_name": self.agent_name,
            "task": self.task,
            "variant_id": self.variant_id,
            "final_score": self.final_score,
            "total_steps": self.total_steps,
            "cumulative_reward": self.cumulative_reward,
            "duration_seconds": self.duration_seconds,
            "action_sequence": self.action_sequence,
            "files_read": self.files_read,
            "files_written": self.files_written,
            "strategy": self.strategy,
            "strategy_score": self.strategy_score,
            "failure_type": self.failure_type,
            "reliability_index": self.reliability_index,
            "step_timeline": self.step_timeline,
        }
        # Round the float metrics to the same precision as before.
        for key, digits in (
            ("final_score", 3),
            ("cumulative_reward", 3),
            ("duration_seconds", 2),
            ("strategy_score", 3),
            ("reliability_index", 3),
        ):
            payload[key] = round(payload[key], digits)
        return payload
59
+
60
+
61
@dataclass
class ComparisonReport:
    """Side-by-side comparison of multiple agent configurations.

    Annotations use string forward references so the class is importable
    even when AgentRunResult is defined later / elsewhere.
    """
    task: str
    variant_id: str
    runs: List["AgentRunResult"] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize the comparison: winner, ranked summary table, details, insights."""
        if not self.runs:
            return {"error": "No runs to compare"}

        # Rank by score (descending), breaking ties on fewer steps.
        ranked = sorted(self.runs, key=lambda r: (-r.final_score, r.total_steps))
        winner = ranked[0]

        return {
            "task": self.task,
            "variant_id": self.variant_id,
            "winner": winner.agent_name,
            "winner_score": winner.final_score,
            "summary_table": [
                {
                    "rank": i + 1,
                    "agent": r.agent_name,
                    "score": round(r.final_score, 3),
                    "steps": r.total_steps,
                    "reward": round(r.cumulative_reward, 3),
                    "strategy": r.strategy,
                    "failure": r.failure_type,
                    "reliability": round(r.reliability_index, 3),
                }
                for i, r in enumerate(ranked)
            ],
            "detailed_runs": [r.to_dict() for r in self.runs],
            "insights": self._generate_insights(ranked),
        }

    def _generate_insights(self, ranked: List["AgentRunResult"]) -> List[str]:
        """Derive human-readable insights from the ranked runs.

        Expects `ranked` sorted best-first (as produced by to_dict).
        Returns an empty list when there is nothing to compare.
        """
        insights = []
        if len(ranked) < 2:
            return insights

        best = ranked[0]
        worst = ranked[-1]

        if best.final_score > worst.final_score + 0.2:
            insights.append(
                f"'{best.agent_name}' significantly outperformed '{worst.agent_name}' "
                f"({best.final_score:.2f} vs {worst.final_score:.2f})"
            )

        # Among successful runs (score >= 0.5), find the fewest-steps one;
        # unsuccessful runs are pushed to +inf so they can never win.
        most_efficient = min(ranked, key=lambda r: r.total_steps if r.final_score >= 0.5 else float('inf'))
        if most_efficient.final_score >= 0.5:
            insights.append(
                f"Most step-efficient successful agent: '{most_efficient.agent_name}' "
                f"({most_efficient.total_steps} steps)"
            )

        strategies = [r.strategy for r in ranked]
        if len(set(strategies)) > 1:
            insights.append(
                f"Strategy variance observed: {set(strategies)} — "
                f"'{best.agent_name}' used {best.strategy} which proved most effective."
            )

        return insights
128
+
129
+
130
class MultiAgentComparison:
    """
    Runs multiple deterministic agent strategies against the same environment.

    Each built-in strategy is a pure function ``(obs_dict, step, context) -> action_dict``
    so strategies can be unit-tested without an environment.

    Usage (in-process, no LLM required):
        from server.environment import CodebaseNavEnvironment
        from server.models import RepoAction

        env = CodebaseNavEnvironment()
        engine = MultiAgentComparison()
        report = engine.compare(env, task="task1")
    """

    # ── Built-in agent strategies ─────────────────────────────────────────────

    @staticmethod
    def _agent_test_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read tests before any source file."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))

        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])

        # Phase 1: Tests first
        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        # Phase 2: Source files
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        # Phase 3: Run tests once, then submit
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_search_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Use search_code to locate the bug before reading."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        failing = obs.get("failing_tests", [])

        # Step 1: search for the failing test function name
        if step == 1 and failing:
            fn_name = failing[0].split(".")[-1] if failing else "bug"
            context["searched"] = True
            return {"action_type": "search_code", "query": fn_name}

        # Step 2+: read tests, then sources, then run tests once
        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])

        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_minimal(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Minimal effort — read one file, submit immediately."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        src_files = [f for f in tree if f.startswith("src/") and f.endswith(".py")]
        if src_files and not files_read:
            return {"action_type": "read_file", "path": src_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_exhaustive(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read everything, run tests, then submit."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))

        all_readable = [f for f in tree if f.endswith(".py") or f.endswith(".md")]
        for f in all_readable:
            if f not in files_read:
                return {"action_type": "read_file", "path": f}

        test_files = [f for f in tree if f.startswith("tests/")]
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        if test_files and context.get("tests_run2", 0) == 0:
            context["tests_run2"] = 1
            return {"action_type": "run_tests"}
        return {"action_type": "submit"}

    # Plain functions (via __func__) so compare() can call them uniformly.
    AGENT_CONFIGS = {
        "test-first": _agent_test_first.__func__,
        "search-first": _agent_search_first.__func__,
        "minimal": _agent_minimal.__func__,
        "exhaustive": _agent_exhaustive.__func__,
    }

    def compare(
        self,
        env,  # CodebaseNavEnvironment instance
        task: str = "task1",
        agents: Optional[List[str]] = None,
        shared_variant: Optional[str] = None,  # NOTE(review): currently unused — confirm intent
    ) -> "ComparisonReport":
        """
        Run all (or selected) agents against the same task and compare.
        The environment is reset to the same variant for each agent.
        """
        # Project-local imports resolved at call time.
        from server.models import RepoAction
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine

        agent_names = agents or list(self.AGENT_CONFIGS.keys())
        strategy_detector = StrategyDetector()
        failure_classifier = FailureClassifier()
        metrics_engine = AdvancedMetricsEngine()

        runs: List[AgentRunResult] = []
        variant_id = None

        for agent_name in agent_names:
            agent_fn = self.AGENT_CONFIGS.get(agent_name)
            if not agent_fn:
                # Unknown agent name — skip silently.
                continue

            # Reset environment for each agent
            reset_result = env.reset(task=task)
            obs = reset_result.observation
            variant_id = reset_result.info.get("variant_id", "unknown")

            context = {}
            start = time.time()
            max_steps = 15
            files_read = []
            files_written = []
            cumulative_reward = 0.0
            action_sequence = []
            step_timeline = []

            obs_dict = obs.model_dump()

            for step_num in range(1, max_steps + 1):
                if env.done:
                    break

                action_dict = agent_fn(obs_dict, step_num, context)
                action = RepoAction(
                    action_type=action_dict.get("action_type", "submit"),
                    path=action_dict.get("path"),
                    query=action_dict.get("query"),
                    content=action_dict.get("content"),
                )

                result = env.step(action)
                obs = result.observation
                obs_dict = obs.model_dump()
                cumulative_reward += result.reward
                action_sequence.append(action.action_type)

                if action.path and action.action_type == "read_file":
                    files_read.append(action.path)
                if action.path and action.action_type == "write_file":
                    files_written.append(action.path)

                step_timeline.append({
                    "step": step_num,
                    "action": action.action_type,
                    "path": action.path,
                    "reward": round(result.reward, 3),
                })

                if result.done:
                    break

            # Force submit if the agent ran out of steps without finishing
            if not env.done:
                result = env.step(RepoAction(action_type="submit"))
                cumulative_reward += result.reward
                action_sequence.append("submit")

            duration = time.time() - start
            final_score = env.final_score

            # Get trajectory for analysis
            trajectory = env.get_trajectory()
            traj_steps = trajectory.get("steps", []) if trajectory else []
            variant_meta = {}
            if env.variant:
                variant_meta = env.variant.meta

            # Detect strategy
            strategy_report = strategy_detector.detect(
                traj_steps, task, variant_meta, files_read, final_score
            )

            # Classify failure
            failure_report = failure_classifier.classify(
                episode_id=trajectory.get("episode_id", "") if trajectory else "",
                task=task,
                trajectory_steps=traj_steps,
                variant_meta=variant_meta,
                files_read=files_read,
                files_written=files_written,
                final_score=final_score,
            )

            # Advanced metrics
            adv_metrics = metrics_engine.compute(
                traj_steps, variant_meta, final_score, files_read, files_written
            )

            runs.append(AgentRunResult(
                agent_name=agent_name,
                task=task,
                variant_id=variant_id or "unknown",
                final_score=final_score,
                total_steps=len(action_sequence),
                cumulative_reward=cumulative_reward,
                duration_seconds=duration,
                action_sequence=action_sequence,
                files_read=files_read,
                files_written=files_written,
                strategy=strategy_report.strategy,
                strategy_score=strategy_report.score,
                failure_type=failure_report.primary_failure,
                reliability_index=adv_metrics.reliability_index,
                step_timeline=step_timeline,
            ))

        return ComparisonReport(
            task=task,
            variant_id=variant_id or "unknown",
            runs=runs,
        )
server/self_improvement.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/self_improvement.py
2
+ """
3
+ Self-Improvement Loop.
4
+
5
+ After a failure, generates structured feedback and an improved strategy prompt
6
+ that can be injected into the agent's next attempt. This closes the loop
7
+ between evaluation and agent behavior.
8
+
9
+ The retry loop:
10
+ 1. Run episode → evaluate → classify failures
11
+ 2. Generate improvement prompt based on failure type
12
+ 3. Re-run episode with improvement prompt injected into agent context
13
+ 4. Compare before/after performance
14
+ """
15
+ from typing import List, Dict, Any, Optional
16
+ from dataclasses import dataclass, field
17
+
18
+
19
@dataclass
class ImprovementPlan:
    """Structured feedback for the agent's next attempt."""
    episode_id: str
    task: str
    failure_type: str
    original_score: float

    # Actionable feedback
    what_went_wrong: str
    specific_errors: List[str]
    improved_strategy: str
    step_by_step_plan: List[str]

    # For injection into agent prompt
    system_prompt_addon: str  # Extra instructions for the system prompt
    user_context_addon: str   # Extra context for the user prompt

    def to_dict(self) -> dict:
        """Serialize the plan; the score is rounded to 3 decimals."""
        payload = {
            "episode_id": self.episode_id,
            "task": self.task,
            "failure_type": self.failure_type,
            "original_score": round(self.original_score, 3),
        }
        # Remaining fields pass through unmodified, in the original key order.
        for name in (
            "what_went_wrong",
            "specific_errors",
            "improved_strategy",
            "step_by_step_plan",
            "system_prompt_addon",
            "user_context_addon",
        ):
            payload[name] = getattr(self, name)
        return payload
50
+
51
+
52
@dataclass
class RetryResult:
    """Result of a retry attempt with improvement feedback."""
    original_episode_id: str
    retry_episode_id: str
    original_score: float
    retry_score: float
    improvement: float  # retry_score - original_score
    failure_fixed: bool
    steps_comparison: Dict[str, int]  # {"original": N, "retry": M}

    def to_dict(self) -> dict:
        """Serialize; scores and their delta are rounded to 3 decimals."""
        rounded = {
            name: round(getattr(self, name), 3)
            for name in ("original_score", "retry_score", "improvement")
        }
        return {
            "original_episode_id": self.original_episode_id,
            "retry_episode_id": self.retry_episode_id,
            "original_score": rounded["original_score"],
            "retry_score": rounded["retry_score"],
            "improvement": rounded["improvement"],
            "failure_fixed": self.failure_fixed,
            "steps_comparison": self.steps_comparison,
        }
73
+
74
+
75
# ── Strategy templates per failure type ──────────────────────────────────────
# Each template supplies the fields an ImprovementPlan is built from: a
# diagnosis ("what_went_wrong"), a one-line strategy, an ordered step plan,
# and text to inject into the agent's system prompt on retry.
STRATEGY_TEMPLATES = {
    "WRONG_FILE_NAVIGATION": {
        "what_went_wrong": "Agent navigated to the wrong files and missed the bug location entirely.",
        "strategy": "START with the failing test file. Read its imports. Navigate exclusively to those imported modules.",
        "plan": [
            "1. Read the failing test file FIRST (in tests/ directory)",
            "2. Find the import statements — these point to the buggy module",
            "3. Read ONLY those imported source files",
            "4. Look for the function/method the test is calling",
            "5. Fix the specific function — do not touch other code",
            "6. Run the failing test to verify",
            "7. Submit",
        ],
        "system_addon": (
            "CRITICAL: You previously failed by reading the wrong files. "
            "This time: read the failing test first, identify its imports, "
            "go directly to those source files. Do NOT read any file not referenced by the test."
        ),
    },
    "BLIND_WRITE": {
        "what_went_wrong": "Agent wrote code without reading the existing implementation first.",
        "strategy": "NEVER write before reading. Read the target file. Understand the existing logic. Then write a minimal fix.",
        "plan": [
            "1. Read the failing test to understand expected behavior",
            "2. Read the source file you plan to modify",
            "3. Identify the exact line(s) causing failure",
            "4. Write a FIX (not a rewrite) targeting only those lines",
            "5. Run tests to verify improvement",
            "6. Submit",
        ],
        "system_addon": (
            "CRITICAL: You previously wrote code without reading the file first. "
            "This time: ALWAYS call read_file on any file BEFORE using write_file. "
            "No exceptions. Read → Understand → Write minimal fix."
        ),
    },
    "HALLUCINATED_CODE": {
        "what_went_wrong": "Agent wrote syntactically correct but logically wrong code that made tests worse.",
        "strategy": "Write a targeted, minimal fix. Do not rewrite entire functions. Change only what the test requires.",
        "plan": [
            "1. Read the failing test and note EXACTLY what assertion fails",
            "2. Read the source function — understand its current behavior",
            "3. Identify the gap between current and expected behavior",
            "4. Write the SMALLEST possible change that bridges that gap",
            "5. Run tests BEFORE submitting to verify the fix works",
            "6. If tests still fail, re-read and refine — don't guess",
        ],
        "system_addon": (
            "CRITICAL: Your previous fix made things worse. This indicates hallucination. "
            "This time: make the SMALLEST possible change. "
            "Run run_tests after EVERY write to check if you're improving or degrading. "
            "If tests get worse after a write, immediately read the file again and try a different approach."
        ),
    },
    "NEVER_TESTED": {
        "what_went_wrong": "Agent submitted code changes without running any tests to verify they work.",
        "strategy": "ALWAYS run run_tests after every write_file. Never submit without test verification.",
        "plan": [
            "1. Read test → Read source → Write fix",
            "2. IMMEDIATELY run run_tests pointing to the failing test file",
            "3. If tests pass: submit",
            "4. If tests still fail: re-read, refine, run tests again",
            "5. ONLY submit when you have seen test improvement",
        ],
        "system_addon": (
            "CRITICAL: You submitted without testing. This is invalid. "
            "This time: after EVERY write_file action, you MUST call run_tests. "
            "Only call submit when run_tests shows improvement. "
            "The pattern is: read → write → run_tests → submit. Non-negotiable."
        ),
    },
    "LOOPING_BEHAVIOR": {
        "what_went_wrong": "Agent got stuck reading the same file repeatedly without making progress.",
        "strategy": "Use search_code to find the exact bug location. Read each file at most once.",
        "plan": [
            "1. Use search_code with the function name from the failing test",
            "2. Read the file that contains the matching code — ONCE",
            "3. If you need more context, use search_code again with a different query",
            "4. Once you have read a file, do NOT read it again",
            "5. Write your fix, run tests, submit",
        ],
        "system_addon": (
            "CRITICAL: You read the same files 3+ times without progress. "
            "This time: you may read each file AT MOST ONCE. "
            "Use search_code to pinpoint bug location. "
            "If you are confused, use search_code — do not re-read files."
        ),
    },
    "SECURITY_VIOLATION": {
        "what_went_wrong": "Agent wrote dangerous code patterns that would be harmful in production.",
        "strategy": "Write pure Python logic only. Never use os, subprocess, eval, or exec.",
        "plan": [
            "1. Read the test to understand what pure Python behavior is needed",
            "2. Implement the fix using ONLY standard library functions",
            "3. No os.system(), subprocess, eval(), exec(), or __import__()",
            "4. Run tests and submit",
        ],
        "system_addon": (
            "CRITICAL: Your previous code contained dangerous patterns (os.system, eval, exec, subprocess). "
            "This is automatically penalized. "
            "This time: write ONLY pure Python logic. No shell commands. No dynamic execution. "
            "Use only stdlib modules that do not involve system access."
        ),
    },
    "CORRECT": {
        "what_went_wrong": "No failure — agent succeeded.",
        "strategy": "Continue with same strategy.",
        "plan": ["Maintain current approach."],
        "system_addon": "",
    },
}
187
+
188
# Default template for unknown failures.
# Fallback used by SelfImprovementEngine.generate_improvement_plan when the
# classified failure_type has no dedicated entry in STRATEGY_TEMPLATES.
# Same shape as the STRATEGY_TEMPLATES values: diagnosis, high-level strategy,
# numbered step plan, and a system-prompt addon injected on retry.
DEFAULT_TEMPLATE = {
    # One-line diagnosis surfaced to the agent in its retry context.
    "what_went_wrong": "Agent failed to fix the bug sufficiently — score too low.",
    # High-level retry strategy sentence.
    "strategy": "Read all relevant files carefully, make a targeted fix, run tests, submit.",
    # Ordered, numbered steps shown verbatim in the improvement plan.
    "plan": [
        "1. Read failing test to understand expected behavior",
        "2. Read each source file referenced by the test",
        "3. Identify the bug: wrong return value, missing case, logic error",
        "4. Write minimal fix",
        "5. Run tests",
        "6. Submit only when tests improve",
    ],
    # Appended to the agent's system prompt on the retry attempt.
    "system_addon": (
        "IMPORTANT: Your previous attempt scored below 0.5. "
        "This time: focus on understanding what the failing test EXPECTS. "
        "Make a targeted fix. Verify with run_tests before submitting."
    ),
}
206
+
207
+
208
class SelfImprovementEngine:
    """
    Generates structured improvement plans from failure analysis.

    Usage:
        engine = SelfImprovementEngine()
        plan = engine.generate_improvement_plan(
            episode_id="abc123",
            task="task1",
            failure_type="NEVER_TESTED",
            failure_evidence=["submitted with 0 run_tests calls"],
            original_score=0.2,
            trajectory_steps=[...],
            files_read=[...],
            files_written=[...],
        )
        # Then inject plan.system_prompt_addon into the agent's next run
    """

    def generate_improvement_plan(
        self,
        episode_id: str,
        task: str,
        failure_type: str,
        failure_evidence: List[str],
        original_score: float,
        trajectory_steps: List[dict],
        files_read: List[str],
        files_written: List[str],
    ) -> ImprovementPlan:
        """Generate an actionable improvement plan from failure data.

        Args:
            episode_id: Identifier of the failed episode being analyzed.
            task: Task name the agent attempted.
            failure_type: Classifier label; used as a key into
                STRATEGY_TEMPLATES (falls back to DEFAULT_TEMPLATE).
            failure_evidence: Evidence strings from the failure classifier.
            original_score: Final score of the failed attempt (0.0-1.0).
            trajectory_steps: Per-step dicts (action_type, action_path,
                action_query, reward, error, step_number, ...).
            files_read: Files the agent read. Currently unused here; kept
                for API compatibility with callers that pass it.
            files_written: Files the agent wrote. Currently unused here;
                kept for API compatibility with callers that pass it.

        Returns:
            ImprovementPlan carrying the diagnosis, retry strategy, and
            prompt addons to inject into the next attempt.
        """
        # Unknown failure types get the generic "read carefully, test, submit" plan.
        template = STRATEGY_TEMPLATES.get(failure_type, DEFAULT_TEMPLATE)

        # Collect per-step errors from the trajectory; truncate each message
        # so the retry prompt stays compact.
        specific_errors = []
        for step in trajectory_steps:
            if step.get("error"):
                specific_errors.append(
                    f"Step {step.get('step_number', '?')}: {step['error'][:100]}"
                )
        specific_errors.extend(failure_evidence[:3])

        # Summarize the start of the trajectory for the retry user context.
        action_summary = []
        for step in trajectory_steps[:8]:  # First 8 steps for context
            a = step.get("action_type", "?")
            p = step.get("action_path") or step.get("action_query") or ""
            r = step.get("reward", 0)
            err = " ❌" if step.get("error") else ""
            action_summary.append(f" Step {step.get('step_number', '?')}: {a} {p} → reward={r:+.2f}{err}")

        user_context_addon = (
            f"[PREVIOUS ATTEMPT REVIEW]\n"
            f"Score: {original_score:.2f}/1.0\n"
            f"Primary failure: {failure_type}\n"
            f"What went wrong: {template['what_went_wrong']}\n"
            f"\nYour previous actions:\n" + "\n".join(action_summary) +
            f"\n\n[IMPROVED STRATEGY FOR THIS ATTEMPT]\n{template['strategy']}"
        )

        return ImprovementPlan(
            episode_id=episode_id,
            task=task,
            failure_type=failure_type,
            original_score=original_score,
            what_went_wrong=template["what_went_wrong"],
            specific_errors=specific_errors,
            improved_strategy=template["strategy"],
            step_by_step_plan=template["plan"],
            system_prompt_addon=template["system_addon"],
            user_context_addon=user_context_addon,
        )

    def build_retry_system_prompt(self, base_prompt: str, improvement_plan: ImprovementPlan) -> str:
        """Inject improvement guidance into the agent system prompt.

        Returns base_prompt unchanged when the plan has no addon
        (e.g. the CORRECT template), otherwise appends a clearly
        delimited feedback section.
        """
        if not improvement_plan.system_prompt_addon:
            return base_prompt
        return (
            f"{base_prompt}\n\n"
            f"{'='*60}\n"
            f"PREVIOUS ATTEMPT FEEDBACK (VERY IMPORTANT):\n"
            f"{'='*60}\n"
            f"{improvement_plan.system_prompt_addon}\n"
            f"{'='*60}"
        )

    def build_retry_user_context(self, improvement_plan: ImprovementPlan) -> str:
        """Build the user context string to prepend to the first prompt in a retry."""
        return improvement_plan.user_context_addon
server/strategy_detector.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/strategy_detector.py
2
+ """
3
+ Strategy Pattern Detector.
4
+
5
+ Classifies what high-level search/navigation strategy the agent used.
6
+ This goes beyond step counting — it classifies the cognitive approach.
7
+
8
+ Strategies:
9
+ TARGETED_DEBUGGING — reads test → reads relevant src → fixes → tests
10
+ SYSTEMATIC_SEARCH — reads all files methodically before writing
11
+ BRUTE_FORCE — writes and runs tests repeatedly until something passes
12
+ RANDOM_EXPLORATION — no coherent pattern, reads random files
13
+ SPEC_DRIVEN — reads spec/docs first, then implements
14
+ MINIMAL_EFFORT — does the bare minimum (often fails)
15
+
16
+ Each strategy gets a score (1.0 = ideal for the task), a label, and evidence.
17
+ """
18
+ from typing import List, Dict, Any, Optional
19
+ from dataclasses import dataclass, field
20
+ from collections import Counter
21
+
22
+
23
@dataclass
class StrategyReport:
    """Summary of the navigation strategy detected for one episode."""
    strategy: str              # Primary strategy label
    score: float               # 0.0–1.0 (task-appropriate quality)
    confidence: float          # Confidence in the label (0–1)
    sub_patterns: List[str]    # Additional behavioral sub-patterns
    evidence: List[str]        # Supporting observations
    strategy_description: str  # Human-readable explanation
    exploration_ratio: float   # 0 = pure exploit, 1 = pure explore
    pivot_count: int           # Times the agent changed strategy mid-episode

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict; float fields rounded to 3 places."""
        return dict(
            strategy=self.strategy,
            score=round(self.score, 3),
            confidence=round(self.confidence, 3),
            sub_patterns=self.sub_patterns,
            evidence=self.evidence,
            strategy_description=self.strategy_description,
            exploration_ratio=round(self.exploration_ratio, 3),
            pivot_count=self.pivot_count,
        )
46
+
47
+
48
# Human-readable explanation for each strategy label produced by
# StrategyDetector.detect; copied into StrategyReport.strategy_description.
STRATEGY_DESCRIPTIONS = {
    "TARGETED_DEBUGGING": (
        "Agent reads the failing test to understand expected behavior, "
        "then navigates directly to the relevant source file and makes a targeted fix."
    ),
    "SYSTEMATIC_SEARCH": (
        "Agent reads all available files before writing any code. "
        "Methodical but can waste steps on irrelevant files."
    ),
    "BRUTE_FORCE": (
        "Agent repeatedly writes and runs tests hoping something sticks. "
        "No clear hypothesis about the bug — trial and error approach."
    ),
    "RANDOM_EXPLORATION": (
        "Agent reads files in an incoherent order with no visible strategy. "
        "High entropy — possibly confused by misleading information."
    ),
    "SPEC_DRIVEN": (
        "Agent reads the specification/feature doc first, "
        "then systematically implements what is described. Ideal for task3."
    ),
    "MINIMAL_EFFORT": (
        "Agent took very few steps and submitted early. "
        "May indicate overconfidence or giving up."
    ),
}
74
+
75
+
76
class StrategyDetector:
    """
    Detects the behavioral strategy pattern used by an agent.

    Usage:
        detector = StrategyDetector()
        report = detector.detect(
            trajectory_steps=[...],
            task="task1",
            variant_meta={...},
            files_read=[...],
            final_score=0.7,
        )
    """

    def detect(
        self,
        trajectory_steps: List[dict],
        task: str,
        variant_meta: Dict[str, Any],
        files_read: List[str],
        final_score: float,
    ) -> StrategyReport:
        """Detect strategy from trajectory data.

        Args:
            trajectory_steps: Per-step dicts with at least ``action_type``;
                optionally ``action_path`` and ``test_pass_rate``.
            task: Task identifier ("task3" enables SPEC_DRIVEN detection).
            variant_meta: Variant metadata; ``bug_files``, ``interface_files``
                and ``read_first_files`` mark the relevant files.
            files_read: All file paths the agent read during the episode.
            final_score: Final episode score in [0, 1].

        Returns:
            StrategyReport with the primary strategy, quality score,
            confidence, sub-patterns, and supporting evidence.
        """
        # Empty trajectory: nothing to classify — treat as giving up.
        if not trajectory_steps:
            return StrategyReport(
                strategy="MINIMAL_EFFORT",
                score=0.0,
                confidence=1.0,
                sub_patterns=[],
                evidence=["No steps taken"],
                strategy_description=STRATEGY_DESCRIPTIONS["MINIMAL_EFFORT"],
                exploration_ratio=0.0,
                pivot_count=0,
            )

        action_seq = [s.get("action_type", "") for s in trajectory_steps]
        read_paths = [
            s.get("action_path", "")
            for s in trajectory_steps
            if s.get("action_type") == "read_file"
        ]
        write_count = action_seq.count("write_file")
        test_count = action_seq.count("run_tests")
        read_count = action_seq.count("read_file")
        search_count = action_seq.count("search_code")
        total = len(action_seq)

        # Files the environment considers relevant to the bug/fix.
        relevant = set(
            variant_meta.get("bug_files", []) +
            variant_meta.get("interface_files", []) +
            variant_meta.get("read_first_files", [])
        )
        test_files = [f for f in read_paths if f and f.startswith("tests/")]
        spec_files = [f for f in read_paths if f and f.endswith(".md")]

        sub_patterns = []
        evidence = []

        # ── Exploration ratio: reads/searches vs writes/tests ─────────────────
        explore_actions = read_count + search_count
        exploit_actions = write_count + test_count
        exploration_ratio = (
            explore_actions / (explore_actions + exploit_actions)
            if (explore_actions + exploit_actions) > 0
            else 0.5  # no classifiable actions → neutral
        )

        # ── Pivot detection: explore↔exploit mode switches ────────────────────
        # Neutral actions (submit, list_files, ...) carry no mode and do not
        # reset it, so a pivot across them is still counted. This also avoids
        # the degenerate case where a neutral FIRST action would suppress all
        # pivot counting for the rest of the episode.
        pivots = 0
        mode = None  # "explore" | "exploit" | None (no mode seen yet)
        for a in action_seq:
            if a in ("read_file", "search_code"):
                new_mode = "explore"
            elif a in ("write_file", "run_tests"):
                new_mode = "exploit"
            else:
                continue
            if mode is not None and new_mode != mode:
                pivots += 1
            mode = new_mode

        # Index of the first write (== total when the agent never wrote);
        # used by the SYSTEMATIC_SEARCH rule below.
        first_write_idx = next((i for i, a in enumerate(action_seq) if a == "write_file"), total)
        reads_before_write = sum(
            1 for i, a in enumerate(action_seq) if a == "read_file" and i < first_write_idx
        )

        # ── Strategy classification (first matching rule wins) ────────────────
        strategy = "RANDOM_EXPLORATION"  # default when no rule matches
        score = 0.4
        confidence = 0.5

        # 1. SPEC_DRIVEN (reads spec/md first, task3)
        if task == "task3" and spec_files and action_seq[0] == "read_file":
            strategy = "SPEC_DRIVEN"
            score = 0.85 if final_score > 0.5 else 0.55
            confidence = 0.9
            evidence.append(f"Read spec file(s) first: {spec_files[:2]}")
            sub_patterns.append("SPEC_FIRST")

        # 2. TARGETED_DEBUGGING (test first → relevant src → write → verify)
        elif (test_files and read_paths and read_paths[0].startswith("tests/")
              and write_count >= 1 and test_count >= 1):
            strategy = "TARGETED_DEBUGGING"
            score = 0.85 + (0.15 * final_score)
            confidence = 0.85
            evidence.append(f"First read was test file: {read_paths[0]}")
            evidence.append("Followed by write + test verification")
            sub_patterns.append("TEST_FIRST")
            if relevant and any(f in files_read for f in relevant):
                sub_patterns.append("TARGETED_READ")
                score = min(1.0, score + 0.05)

        # 3. SYSTEMATIC_SEARCH (every read happened before the first write).
        # NOTE: this rule is now a single elif condition rather than a nested
        # `elif write_count > 0: if ...` — the nested form swallowed every
        # episode with a write and made the BRUTE_FORCE rule unreachable.
        elif write_count > 0 and read_count > 0 and reads_before_write == read_count:
            strategy = "SYSTEMATIC_SEARCH"
            score = 0.65
            confidence = 0.75
            evidence.append(f"Read {reads_before_write} files before first write")
            sub_patterns.append("READ_ALL_FIRST")

        # 4. BRUTE_FORCE (multiple write-test cycles)
        elif write_count >= 2 and test_count >= 2:
            strategy = "BRUTE_FORCE"
            score = 0.35
            confidence = 0.8
            evidence.append(f"{write_count} writes + {test_count} test runs = trial and error")
            sub_patterns.append("TRIAL_AND_ERROR")

        # 5. MINIMAL_EFFORT (tiny episode, or no fix attempt at all)
        elif total <= 3 or (write_count == 0 and test_count == 0):
            strategy = "MINIMAL_EFFORT"
            score = 0.1
            confidence = 0.95
            evidence.append(f"Only {total} total steps with no fix attempt")
            sub_patterns.append("GAVE_UP")

        # ── Additional sub-pattern detection ──────────────────────────────────
        # Search-before-read: agent used search_code to locate the bug first.
        if search_count > 0:
            first_search = next((i for i, a in enumerate(action_seq) if a == "search_code"), total)
            first_read = next((i for i, a in enumerate(action_seq) if a == "read_file"), total)
            if first_search < first_read:
                sub_patterns.append("SEARCH_GUIDED")
                evidence.append("Used search_code to locate bug before reading")

        # Excessive looping: re-reading the same file signals confusion.
        path_counts = Counter(p for p in read_paths if p)
        max_rereads = max(path_counts.values()) if path_counts else 0
        if max_rereads >= 3:
            sub_patterns.append("READ_LOOP")
            evidence.append(f"Re-read same file {max_rereads}x — likely confused")
            score = max(0.0, score - 0.2)

        # Verified fix: observed test pass rate improved over the episode.
        test_rates = [s.get("test_pass_rate") for s in trajectory_steps if s.get("test_pass_rate") is not None]
        if len(test_rates) >= 2 and test_rates[-1] > test_rates[0]:
            sub_patterns.append("VERIFIED_FIX")
            evidence.append(f"Test pass rate improved: {test_rates[0]:.2f} → {test_rates[-1]:.2f}")
            score = min(1.0, score + 0.1)

        return StrategyReport(
            strategy=strategy,
            score=max(0.0, min(1.0, score)),
            confidence=confidence,
            sub_patterns=sub_patterns,
            evidence=evidence,
            strategy_description=STRATEGY_DESCRIPTIONS.get(strategy, ""),
            exploration_ratio=exploration_ratio,
            pivot_count=pivots,
        )
static/viz3d.html ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Agent Trajectory 3D Visualizer</title>
7
+ <style>
8
+ * { margin: 0; padding: 0; box-sizing: border-box; }
9
+ body {
10
+ background: #0a0e1a;
11
+ color: #e0e6f0;
12
+ font-family: 'Segoe UI', system-ui, sans-serif;
13
+ overflow: hidden;
14
+ height: 100vh;
15
+ }
16
+ #canvas-container {
17
+ position: absolute;
18
+ top: 0; left: 0;
19
+ width: 100%; height: 100%;
20
+ }
21
+ #ui-overlay {
22
+ position: absolute;
23
+ top: 0; left: 0;
24
+ width: 100%; height: 100%;
25
+ pointer-events: none;
26
+ z-index: 10;
27
+ }
28
+ /* Header */
29
+ #header {
30
+ position: absolute;
31
+ top: 12px; left: 50%;
32
+ transform: translateX(-50%);
33
+ text-align: center;
34
+ pointer-events: none;
35
+ }
36
+ #header h1 {
37
+ font-size: 16px;
38
+ font-weight: 700;
39
+ color: #7dd3fc;
40
+ letter-spacing: 0.05em;
41
+ text-shadow: 0 0 20px rgba(125,211,252,0.5);
42
+ }
43
+ #header p {
44
+ font-size: 11px;
45
+ color: #64748b;
46
+ margin-top: 2px;
47
+ }
48
+ /* Legend */
49
+ #legend {
50
+ position: absolute;
51
+ top: 12px; right: 16px;
52
+ background: rgba(10,14,26,0.85);
53
+ border: 1px solid rgba(125,211,252,0.2);
54
+ border-radius: 8px;
55
+ padding: 10px 14px;
56
+ font-size: 11px;
57
+ pointer-events: none;
58
+ }
59
+ #legend h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
60
+ .legend-item {
61
+ display: flex; align-items: center; gap: 8px;
62
+ margin-bottom: 5px;
63
+ }
64
+ .legend-dot {
65
+ width: 10px; height: 10px;
66
+ border-radius: 50%;
67
+ flex-shrink: 0;
68
+ }
69
+ /* Info panel */
70
+ #info-panel {
71
+ position: absolute;
72
+ top: 12px; left: 16px;
73
+ background: rgba(10,14,26,0.85);
74
+ border: 1px solid rgba(125,211,252,0.2);
75
+ border-radius: 8px;
76
+ padding: 12px 16px;
77
+ min-width: 220px;
78
+ pointer-events: none;
79
+ }
80
+ #info-panel h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; letter-spacing: 0.1em; }
81
+ .info-row {
82
+ display: flex; justify-content: space-between; gap: 12px;
83
+ font-size: 11px;
84
+ margin-bottom: 4px;
85
+ color: #94a3b8;
86
+ }
87
+ .info-value { color: #e0e6f0; font-weight: 600; }
88
+ /* Timeline */
89
+ #timeline-panel {
90
+ position: absolute;
91
+ bottom: 20px; left: 50%;
92
+ transform: translateX(-50%);
93
+ background: rgba(10,14,26,0.9);
94
+ border: 1px solid rgba(125,211,252,0.2);
95
+ border-radius: 10px;
96
+ padding: 14px 20px;
97
+ width: min(700px, 90vw);
98
+ pointer-events: all;
99
+ }
100
+ #timeline-panel .tl-header {
101
+ display: flex;
102
+ justify-content: space-between;
103
+ align-items: center;
104
+ margin-bottom: 10px;
105
+ }
106
+ #timeline-panel h3 {
107
+ font-size: 11px;
108
+ color: #7dd3fc;
109
+ letter-spacing: 0.1em;
110
+ }
111
+ #step-label {
112
+ font-size: 12px;
113
+ color: #f0abfc;
114
+ font-weight: 700;
115
+ }
116
+ #timeline-slider {
117
+ width: 100%;
118
+ -webkit-appearance: none;
119
+ height: 4px;
120
+ background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
121
+ border-radius: 4px;
122
+ outline: none;
123
+ cursor: pointer;
124
+ }
125
+ #timeline-slider::-webkit-slider-thumb {
126
+ -webkit-appearance: none;
127
+ width: 16px; height: 16px;
128
+ border-radius: 50%;
129
+ background: #7dd3fc;
130
+ cursor: pointer;
131
+ box-shadow: 0 0 10px rgba(125,211,252,0.7);
132
+ }
133
+ #step-actions {
134
+ display: flex;
135
+ gap: 8px;
136
+ margin-top: 10px;
137
+ justify-content: center;
138
+ }
139
+ .tl-btn {
140
+ background: rgba(125,211,252,0.1);
141
+ border: 1px solid rgba(125,211,252,0.3);
142
+ color: #7dd3fc;
143
+ padding: 5px 14px;
144
+ border-radius: 6px;
145
+ cursor: pointer;
146
+ font-size: 11px;
147
+ transition: all 0.2s;
148
+ }
149
+ .tl-btn:hover { background: rgba(125,211,252,0.25); }
150
+ .tl-btn.active { background: rgba(125,211,252,0.3); }
151
+ /* Step log */
152
+ #step-log {
153
+ position: absolute;
154
+ bottom: 130px; right: 16px;
155
+ background: rgba(10,14,26,0.85);
156
+ border: 1px solid rgba(125,211,252,0.2);
157
+ border-radius: 8px;
158
+ padding: 10px 14px;
159
+ width: 260px;
160
+ max-height: 240px;
161
+ overflow-y: auto;
162
+ pointer-events: none;
163
+ font-size: 10px;
164
+ }
165
+ #step-log h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
166
+ .log-entry {
167
+ display: flex;
168
+ align-items: flex-start;
169
+ gap: 6px;
170
+ margin-bottom: 6px;
171
+ padding-bottom: 6px;
172
+ border-bottom: 1px solid rgba(255,255,255,0.05);
173
+ }
174
+ .log-entry:last-child { border-bottom: none; }
175
+ .log-step { color: #475569; min-width: 28px; }
176
+ .log-action { font-weight: 600; }
177
+ .log-reward { margin-left: auto; font-weight: 700; }
178
+ .reward-pos { color: #4ade80; }
179
+ .reward-neg { color: #f87171; }
180
+ .reward-zero { color: #94a3b8; }
181
+ /* Tooltip */
182
+ #tooltip {
183
+ position: absolute;
184
+ background: rgba(10,14,26,0.95);
185
+ border: 1px solid rgba(125,211,252,0.4);
186
+ border-radius: 6px;
187
+ padding: 8px 12px;
188
+ font-size: 11px;
189
+ pointer-events: none;
190
+ opacity: 0;
191
+ transition: opacity 0.15s;
192
+ max-width: 200px;
193
+ z-index: 20;
194
+ }
195
+ #tooltip h4 { color: #7dd3fc; margin-bottom: 4px; }
196
+ /* Score ring */
197
+ #score-ring {
198
+ position: absolute;
199
+ bottom: 130px; left: 16px;
200
+ pointer-events: none;
201
+ }
202
+ #score-ring svg text { font-family: 'Segoe UI', sans-serif; }
203
+ /* Loader */
204
+ #loader {
205
+ position: absolute;
206
+ top: 50%; left: 50%;
207
+ transform: translate(-50%, -50%);
208
+ color: #7dd3fc;
209
+ font-size: 14px;
210
+ text-align: center;
211
+ }
212
+ .loader-spinner {
213
+ width: 40px; height: 40px;
214
+ border: 3px solid rgba(125,211,252,0.2);
215
+ border-top-color: #7dd3fc;
216
+ border-radius: 50%;
217
+ animation: spin 0.8s linear infinite;
218
+ margin: 0 auto 12px;
219
+ }
220
+ @keyframes spin { to { transform: rotate(360deg); } }
221
+ </style>
222
+ </head>
223
+ <body>
224
+
225
+ <!-- Hidden data injection point -->
226
+ <div id="viz-data" style="display:none"></div>
227
+
228
+ <div id="canvas-container">
229
+ <canvas id="three-canvas"></canvas>
230
+ </div>
231
+
232
+ <div id="loader">
233
+ <div class="loader-spinner"></div>
234
+ <p>Initializing 3D Visualizer...</p>
235
+ </div>
236
+
237
+ <div id="ui-overlay">
238
+ <!-- Header -->
239
+ <div id="header">
240
+ <h1>🔍 Agent Trajectory Visualizer — 3D</h1>
241
+ <p>Files = nodes · Dependencies = edges · Agent path = animated beam</p>
242
+ </div>
243
+
244
+ <!-- Info panel -->
245
+ <div id="info-panel">
246
+ <h3>EPISODE STATS</h3>
247
+ <div class="info-row"><span>Task</span><span class="info-value" id="stat-task">—</span></div>
248
+ <div class="info-row"><span>Variant</span><span class="info-value" id="stat-variant">—</span></div>
249
+ <div class="info-row"><span>Steps</span><span class="info-value" id="stat-steps">—</span></div>
250
+ <div class="info-row"><span>Score</span><span class="info-value" id="stat-score">—</span></div>
251
+ <div class="info-row"><span>Strategy</span><span class="info-value" id="stat-strategy">—</span></div>
252
+ <div class="info-row"><span>Failure</span><span class="info-value" id="stat-failure">—</span></div>
253
+ </div>
254
+
255
+ <!-- Legend -->
256
+ <div id="legend">
257
+ <h3>LEGEND</h3>
258
+ <div class="legend-item">
259
+ <div class="legend-dot" style="background:#f97316"></div><span>Source file</span>
260
+ </div>
261
+ <div class="legend-item">
262
+ <div class="legend-dot" style="background:#3b82f6"></div><span>Test file</span>
263
+ </div>
264
+ <div class="legend-item">
265
+ <div class="legend-dot" style="background:#a855f7"></div><span>Spec / Docs</span>
266
+ </div>
267
+ <div class="legend-item">
268
+ <div class="legend-dot" style="background:#22c55e"></div><span>Visited</span>
269
+ </div>
270
+ <div class="legend-item">
271
+ <div class="legend-dot" style="background:#ef4444"></div><span>Modified / Bug</span>
272
+ </div>
273
+ <div class="legend-item">
274
+ <div class="legend-dot" style="background:#facc15; width:20px; height:4px; border-radius:2px;"></div><span>Agent path</span>
275
+ </div>
276
+ </div>
277
+
278
+ <!-- Score ring -->
279
+ <div id="score-ring">
280
+ <svg width="80" height="80" viewBox="0 0 80 80">
281
+ <circle cx="40" cy="40" r="34" fill="none"
282
+ stroke="rgba(125,211,252,0.15)" stroke-width="6"/>
283
+ <circle id="score-arc" cx="40" cy="40" r="34" fill="none"
284
+ stroke="#7dd3fc" stroke-width="6"
285
+ stroke-dasharray="0 214"
286
+ stroke-linecap="round"
287
+ transform="rotate(-90 40 40)"
288
+ style="transition: stroke-dasharray 1s ease;"/>
289
+ <text id="score-text" x="40" y="45" text-anchor="middle"
290
+ fill="#e0e6f0" font-size="14" font-weight="700">0.0</text>
291
+ </svg>
292
+ </div>
293
+
294
+ <!-- Step log -->
295
+ <div id="step-log">
296
+ <h3>STEP LOG</h3>
297
+ <div id="log-entries"></div>
298
+ </div>
299
+
300
+ <!-- Tooltip -->
301
+ <div id="tooltip">
302
+ <h4 id="tooltip-title">File</h4>
303
+ <div id="tooltip-body"></div>
304
+ </div>
305
+
306
+ <!-- Timeline -->
307
+ <div id="timeline-panel">
308
+ <div class="tl-header">
309
+ <h3>TIMELINE REPLAY</h3>
310
+ <span id="step-label">Step 0 / 0</span>
311
+ </div>
312
+ <input type="range" id="timeline-slider" min="0" max="0" value="0"
313
+ oninput="onSliderChange(this.value)">
314
+ <div id="step-actions">
315
+ <button class="tl-btn" onclick="stepBack()">◀ Back</button>
316
+ <button class="tl-btn" id="play-btn" onclick="togglePlay()">▶ Play</button>
317
+ <button class="tl-btn" onclick="stepForward()">Forward ▶</button>
318
+ <button class="tl-btn" onclick="resetView()">↺ Reset</button>
319
+ <button class="tl-btn" id="orbit-btn" onclick="toggleOrbit()">🔄 Orbit</button>
320
+ </div>
321
+ </div>
322
+ </div>
323
+
324
+ <!-- Three.js from CDN -->
325
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
326
+ <script>
327
+ // ── Sample data (replaced by real data from backend) ───────────────────────
328
+ const DEFAULT_DATA = {
329
+ task: "task1",
330
+ variant_id: "variant_1",
331
+ final_score: 0.714,
332
+ strategy: "TARGETED_DEBUGGING",
333
+ failure_type: "CORRECT",
334
+ files: [
335
+ { name: "tests/test_formatter.py", type: "test" },
336
+ { name: "src/formatter.py", type: "src", is_bug_file: true },
337
+ { name: "src/utils.py", type: "src" }
338
+ ],
339
+ dependencies: [
340
+ { from: "tests/test_formatter.py", to: "src/formatter.py" },
341
+ { from: "src/formatter.py", to: "src/utils.py" }
342
+ ],
343
+ steps: [
344
+ { step: 1, action: "read_file", path: "tests/test_formatter.py", reward: 0.0 },
345
+ { step: 2, action: "read_file", path: "src/formatter.py", reward: 0.05 },
346
+ { step: 3, action: "search_code", path: null, reward: 0.0 },
347
+ { step: 4, action: "run_tests", path: "tests/test_formatter.py", reward: 0.0 },
348
+ { step: 5, action: "submit", path: null, reward: 0.694 }
349
+ ]
350
+ };
351
+
352
// ── Load data from injection point or use default ────────────────────────────
// Reads episode JSON injected by the backend into the hidden #viz-data div.
// Falls back to the bundled DEFAULT_DATA sample when the div is missing,
// empty, or contains malformed JSON (the parse error is deliberately ignored).
function loadVizData() {
  const el = document.getElementById('viz-data');
  if (el && el.textContent.trim()) {
    try { return JSON.parse(el.textContent); } catch(e) {} // malformed → fall through to default
  }
  return DEFAULT_DATA;
}
360
+
361
+ // ── Three.js setup ───────────────────────────────────────────────────────────
362
+ const canvas = document.getElementById('three-canvas');
363
+ const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: true });
364
+ renderer.setSize(window.innerWidth, window.innerHeight);
365
+ renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
366
+ renderer.setClearColor(0x0a0e1a, 1);
367
+
368
+ const scene = new THREE.Scene();
369
+ const fov = 60;
370
+ const camera = new THREE.PerspectiveCamera(fov, window.innerWidth / window.innerHeight, 0.1, 1000);
371
+ camera.position.set(0, 8, 22);
372
+ camera.lookAt(0, 0, 0);
373
+
374
+ // Ambient + directional light
375
+ scene.add(new THREE.AmbientLight(0x1a2040, 1));
376
+ const dirLight = new THREE.DirectionalLight(0x7dd3fc, 0.6);
377
+ dirLight.position.set(5, 10, 5);
378
+ scene.add(dirLight);
379
+
380
+ // Grid
381
+ const grid = new THREE.GridHelper(40, 20, 0x1e293b, 0x1e293b);
382
+ grid.position.y = -3;
383
+ scene.add(grid);
384
+
385
+ // Stars
386
+ const starGeo = new THREE.BufferGeometry();
387
+ const starCount = 800;
388
+ const starPositions = new Float32Array(starCount * 3);
389
+ for (let i = 0; i < starCount * 3; i++) starPositions[i] = (Math.random() - 0.5) * 200;
390
+ starGeo.setAttribute('position', new THREE.BufferAttribute(starPositions, 3));
391
+ const starMat = new THREE.PointsMaterial({ color: 0x334155, size: 0.3 });
392
+ scene.add(new THREE.Points(starGeo, starMat));
393
+
394
+ // ── Color palette ─────────────────────────────────────────────────────────────
395
+ const COLORS = {
396
+ src: 0xf97316,
397
+ test: 0x3b82f6,
398
+ spec: 0xa855f7,
399
+ visited: 0x22c55e,
400
+ modified: 0xef4444,
401
+ bug: 0xef4444,
402
+ edge: 0x334155,
403
+ path: 0xfacc15,
404
+ agent: 0xfbbf24,
405
+ };
406
+
407
+ // ── Orbit control (manual implementation) ────────────────────────────────────
408
+ let isOrbiting = false;
409
+ let orbitActive = false;
410
+ let mouse = { x: 0, y: 0, down: false, lastX: 0, lastY: 0 };
411
+ let spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
412
+
413
+ canvas.addEventListener('mousedown', e => { mouse.down = true; mouse.lastX = e.clientX; mouse.lastY = e.clientY; });
414
+ canvas.addEventListener('mouseup', () => { mouse.down = false; });
415
+ canvas.addEventListener('mousemove', e => {
416
+ if (!mouse.down) {
417
+ // Hover for tooltip
418
+ checkHover(e.clientX, e.clientY);
419
+ return;
420
+ }
421
+ const dx = e.clientX - mouse.lastX;
422
+ const dy = e.clientY - mouse.lastY;
423
+ spherical.theta -= dx * 0.005;
424
+ spherical.phi = Math.max(0.1, Math.min(Math.PI / 2, spherical.phi - dy * 0.005));
425
+ mouse.lastX = e.clientX;
426
+ mouse.lastY = e.clientY;
427
+ });
428
+ canvas.addEventListener('wheel', e => {
429
+ spherical.r = Math.max(8, Math.min(50, spherical.r + e.deltaY * 0.02));
430
+ });
431
+
432
// Reposition the camera each frame from the shared `spherical` coordinates
// (theta = azimuth, phi = polar angle, r = distance), always looking at the
// scene origin. Mutates the global `camera` and `spherical` objects.
function updateCamera() {
  // Auto-orbit mode: slowly advance the azimuth every frame.
  if (orbitActive) spherical.theta += 0.003;
  // Spherical → Cartesian conversion around (0, 0, 0).
  camera.position.x = spherical.r * Math.sin(spherical.phi) * Math.sin(spherical.theta);
  camera.position.y = spherical.r * Math.cos(spherical.phi);
  camera.position.z = spherical.r * Math.sin(spherical.phi) * Math.cos(spherical.theta);
  camera.lookAt(0, 0, 0);
}
439
+
440
// ── Scene objects / playback state ───────────────────────────────────────────
const nodeObjects = {};   // file name → { mesh, position, file } (see buildScene)
const edgeObjects = [];   // dependency-edge Line objects currently in the scene
const pathObjects = [];   // trajectory-beam Line objects for the current step
let agentSphere = null;   // glowing marker that follows the agent
let agentTrail = null;    // NOTE(review): never assigned in this visible script — confirm
let currentStep = 0;      // timeline position (0 = before the first action)
let maxStep = 0;          // total number of recorded steps
let playing = false;      // auto-play toggle state
let playInterval = null;  // setInterval handle used by togglePlay
let vizData = null;       // last trajectory payload handed to buildScene
let nodePositions = {};   // file name → THREE.Vector3 layout position
452
+
453
// ── Build scene from data ─────────────────────────────────────────────────────
// Rebuilds all meshes from a trajectory payload: one sphere per file laid out
// on a circle in the XZ plane, dependency edges between them, and the agent
// marker. Also refreshes the stats panel and resets the timeline to step 0.
//
// data: { files, dependencies, steps, task, variant_id, strategy,
//         failure_type, final_score } — all fields optional.
function buildScene(data) {
  vizData = data;

  // Clear previous objects so repeated loads don't leak meshes or state.
  Object.values(nodeObjects).forEach(o => scene.remove(o.mesh));
  edgeObjects.forEach(e => scene.remove(e));
  pathObjects.forEach(p => scene.remove(p));
  if (agentSphere) scene.remove(agentSphere);
  Object.keys(nodeObjects).forEach(k => delete nodeObjects[k]);
  // FIX: these three were never reset between builds — edge/path arrays grew
  // unbounded (re-"removing" long-gone lines each rebuild) and stale
  // nodePositions entries could draw dangling edges for files no longer shown.
  edgeObjects.length = 0;
  pathObjects.length = 0;
  nodePositions = {};

  const files = data.files || [];
  const n = files.length;
  if (n === 0) return;

  // Arrange files in a circular layout on the XZ plane.
  files.forEach((file, i) => {
    const angle = (i / n) * Math.PI * 2;
    const radius = Math.max(4, n * 0.9); // spread the circle for larger repos
    const x = Math.cos(angle) * radius;
    const z = Math.sin(angle) * radius;
    const y = 0;

    nodePositions[file.name] = new THREE.Vector3(x, y, z);

    // Node sphere, colored by file role.
    const geo = new THREE.SphereGeometry(0.6, 16, 16);
    const color = new THREE.Color(
      file.is_bug_file ? COLORS.bug :
      file.type === 'test' ? COLORS.test :
      file.type === 'spec' ? COLORS.spec : COLORS.src
    );
    const mat = new THREE.MeshPhongMaterial({
      color,
      emissive: color.clone().multiplyScalar(0.3),
      shininess: 60,
      transparent: true,
      opacity: 0.9,
    });
    const mesh = new THREE.Mesh(geo, mat);
    mesh.position.set(x, y, z);
    mesh.userData = { file }; // picked up by the hover raycast
    scene.add(mesh);

    // Flat glow ring attached to each node sphere.
    const ringGeo = new THREE.RingGeometry(0.75, 0.85, 32);
    const ringMat = new THREE.MeshBasicMaterial({
      color,
      transparent: true,
      opacity: 0.25,
      side: THREE.DoubleSide,
    });
    const ring = new THREE.Mesh(ringGeo, ringMat);
    ring.rotation.x = Math.PI / 2;
    mesh.add(ring);

    nodeObjects[file.name] = { mesh, position: nodePositions[file.name], file };
  });

  // Dependency edges; skipped when either endpoint is unknown.
  (data.dependencies || []).forEach(dep => {
    const fromPos = nodePositions[dep.from];
    const toPos = nodePositions[dep.to];
    if (!fromPos || !toPos) return;

    const points = [fromPos.clone(), toPos.clone()];
    const geo = new THREE.BufferGeometry().setFromPoints(points);
    const mat = new THREE.LineBasicMaterial({
      color: COLORS.edge,
      transparent: true,
      opacity: 0.4,
    });
    const line = new THREE.Line(geo, mat);
    scene.add(line);
    edgeObjects.push(line);
  });

  // Agent marker sphere, starting above the origin.
  const agentGeo = new THREE.SphereGeometry(0.35, 16, 16);
  const agentMat = new THREE.MeshPhongMaterial({
    color: COLORS.agent,
    emissive: 0xfbbf24,
    emissiveIntensity: 0.8,
    shininess: 100,
  });
  agentSphere = new THREE.Mesh(agentGeo, agentMat);
  agentSphere.position.set(0, 3, 0);
  scene.add(agentSphere);

  // Stats panel.
  document.getElementById('stat-task').textContent = data.task || '—';
  document.getElementById('stat-variant').textContent = data.variant_id || '—';
  document.getElementById('stat-steps').textContent = (data.steps || []).length;
  document.getElementById('stat-strategy').textContent = data.strategy || '—';
  document.getElementById('stat-failure').textContent = data.failure_type || '—';
  updateScore(data.final_score || 0);
  updateStepLog(data.steps || [], -1);

  // Timeline reset.
  maxStep = (data.steps || []).length;
  const slider = document.getElementById('timeline-slider');
  slider.max = maxStep;
  slider.value = 0;
  currentStep = 0;
  updateStepLabel(0, maxStep);

  applyStep(0);
}
561
+
562
// ── Animation: go to a specific step ─────────────────────────────────────────
// Repaints the scene for timeline position `stepIndex` (0 = before the first
// action): dims every node, re-highlights files read (green) and written
// (red, wins over green) so far, redraws the trajectory beam, and parks the
// agent sphere over the file touched by the most recent step.
function applyStep(stepIndex) {
  if (!vizData) return;
  const steps = vizData.steps || [];
  const readSet = new Set();
  const writeSet = new Set();

  // Reset every node to its dimmed base appearance.
  for (const obj of Object.values(nodeObjects)) {
    const f = obj.file;
    const base = new THREE.Color(
      f.is_bug_file ? COLORS.bug :
      f.type === 'test' ? COLORS.test :
      f.type === 'spec' ? COLORS.spec : COLORS.src
    );
    obj.mesh.material.color.set(base);
    obj.mesh.material.emissive.set(base.clone().multiplyScalar(0.2));
    obj.mesh.material.opacity = 0.5;
    obj.mesh.scale.set(1, 1, 1);
  }

  // Drop the previous trajectory beam.
  for (const p of pathObjects) scene.remove(p);
  pathObjects.length = 0;

  // Walk steps [0, stepIndex), collecting beam waypoints and touched files.
  const waypoints = [];
  for (let i = 0; i < stepIndex; i++) {
    const st = steps[i];
    if (!st || !st.path || !nodeObjects[st.path]) continue;
    const pos = nodeObjects[st.path].position.clone();
    waypoints.push(pos.clone().add(new THREE.Vector3(0, 0.1, 0)));
    if (st.action === 'read_file') readSet.add(st.path);
    if (st.action === 'write_file') writeSet.add(st.path);
  }

  // Shared highlighter: recolor, brighten, and enlarge one node.
  const highlight = (name, colorHex, emissiveScale, scale) => {
    const obj = nodeObjects[name];
    if (!obj) return;
    obj.mesh.material.color.set(colorHex);
    obj.mesh.material.emissive.set(new THREE.Color(colorHex).multiplyScalar(emissiveScale));
    obj.mesh.material.opacity = 1.0;
    obj.mesh.scale.set(scale, scale, scale);
  };
  readSet.forEach(name => highlight(name, COLORS.visited, 0.4, 1.2));
  writeSet.forEach(name => highlight(name, COLORS.modified, 0.5, 1.4));

  // Trajectory beam through all waypoints accumulated so far.
  if (waypoints.length >= 2) {
    const beamGeo = new THREE.BufferGeometry().setFromPoints(waypoints);
    const beamMat = new THREE.LineBasicMaterial({
      color: COLORS.path,
      transparent: true,
      opacity: 0.85,
      linewidth: 2, // NOTE: most WebGL backends render lines 1px regardless
    });
    const beam = new THREE.Line(beamGeo, beamMat);
    scene.add(beam);
    pathObjects.push(beam);
  }

  // Agent sphere: hover above the latest step's file, or float centrally for
  // file-less actions (search/submit); rest position at step 0.
  if (stepIndex > 0 && stepIndex <= steps.length) {
    const latest = steps[stepIndex - 1];
    if (latest && latest.path && nodeObjects[latest.path]) {
      const target = nodeObjects[latest.path].position;
      agentSphere.position.set(target.x, target.y + 1.2, target.z);
    } else {
      agentSphere.position.set(0, 2.5, 0);
    }
  } else {
    agentSphere.position.set(0, 3.5, 0);
  }

  // Extra emphasis on the node touched by the current step.
  if (stepIndex > 0) {
    const latest = steps[stepIndex - 1];
    if (latest && latest.path && nodeObjects[latest.path]) {
      nodeObjects[latest.path].mesh.scale.set(1.6, 1.6, 1.6);
    }
  }

  updateStepLog(steps, stepIndex - 1);
  updateStepLabel(stepIndex, maxStep);

  // Timeline slider fill percentage (consumed by CSS via --pct).
  const slider = document.getElementById('timeline-slider');
  const pct = maxStep > 0 ? (stepIndex / maxStep * 100) : 0;
  slider.style.setProperty('--pct', pct + '%');
}
669
+
670
// ── Score ring ────────────────────────────────────────────────────────────────
// Fill the SVG score arc proportionally to `score` (clamped to [0, 1]) and
// color it green / amber / red by threshold. Text readouts are NOT clamped.
function updateScore(score) {
  const circumference = 2 * Math.PI * 34; // the SVG ring's radius is 34
  const clamped = Math.min(1, Math.max(0, score));
  const arcEl = document.getElementById('score-arc');
  arcEl.setAttribute('stroke-dasharray', `${circumference * clamped} ${circumference}`);
  document.getElementById('score-text').textContent = score.toFixed(2);
  document.getElementById('stat-score').textContent = score.toFixed(3);

  const tone = score >= 0.7 ? '#4ade80' : score >= 0.4 ? '#fbbf24' : '#f87171';
  arcEl.setAttribute('stroke', tone);
}
684
+
685
// ── Step log ──────────────────────────────────────────────────────────────────
// Render one row per step into #log-entries; the current step is emphasized,
// past steps dimmed, future steps faded. `currentIdx` is the 0-based index of
// the active step (-1 for "none").
//
// FIX: rows are now built with DOM APIs + textContent instead of interpolating
// into innerHTML, so file paths / action names coming from the trajectory JSON
// cannot inject markup into the page.
function updateStepLog(steps, currentIdx) {
  const container = document.getElementById('log-entries');
  container.innerHTML = '';

  const ACTION_EMOJI = {
    read_file: '📖',
    write_file: '✏️',
    run_tests: '🧪',
    search_code: '🔍',
    submit: '🏁',
  };

  steps.forEach((step, i) => {
    const active = i === currentIdx;
    const past = i < currentIdx;
    const entry = document.createElement('div');
    entry.className = 'log-entry';
    entry.style.opacity = past ? '0.6' : active ? '1' : '0.35';
    if (active) entry.style.background = 'rgba(125,211,252,0.08)';

    const reward = step.reward || 0;
    const rewardClass = reward > 0 ? 'reward-pos' : reward < 0 ? 'reward-neg' : 'reward-zero';
    const emoji = ACTION_EMOJI[step.action] || '•';
    // Show only the basename for file actions; the action name otherwise.
    const path = step.path ? step.path.split('/').pop() : step.action;

    const stepSpan = document.createElement('span');
    stepSpan.className = 'log-step';
    stepSpan.textContent = `S${step.step}`;

    const actionSpan = document.createElement('span');
    actionSpan.className = 'log-action';
    actionSpan.style.color = active ? '#7dd3fc' : '#94a3b8';
    actionSpan.textContent = `${emoji} ${path}`;

    const rewardSpan = document.createElement('span');
    rewardSpan.className = `log-reward ${rewardClass}`;
    rewardSpan.textContent = `${reward > 0 ? '+' : ''}${reward.toFixed(2)}`;

    entry.append(stepSpan, actionSpan, rewardSpan);
    container.appendChild(entry);
  });

  // Keep the active row visible.
  if (currentIdx >= 0) {
    const entries = container.children;
    if (entries[currentIdx]) {
      entries[currentIdx].scrollIntoView({ block: 'nearest' });
    }
  }
}
727
+
728
// ── Hover tooltip ─────────────────────────────────────────────────────────────
const raycaster = new THREE.Raycaster();
const mouseVec = new THREE.Vector2();
const tooltip = document.getElementById('tooltip');

// Raycast from the cursor into the node spheres; show and position the
// tooltip over the first hit, hide it when nothing is under the cursor.
// NOTE(review): the NDC conversion assumes the canvas fills the window — confirm.
function checkHover(mx, my) {
  mouseVec.set(
    (mx / window.innerWidth) * 2 - 1,
    -(my / window.innerHeight) * 2 + 1
  );
  raycaster.setFromCamera(mouseVec, camera);

  const meshes = Object.values(nodeObjects).map(o => o.mesh);
  const hits = raycaster.intersectObjects(meshes);

  if (!hits.length) {
    tooltip.style.opacity = '0';
    return;
  }
  const file = hits[0].object.userData.file;
  if (file) {
    tooltip.style.opacity = '1';
    tooltip.style.left = (mx + 14) + 'px';
    tooltip.style.top = (my - 14) + 'px';
    document.getElementById('tooltip-title').textContent = file.name;
    document.getElementById('tooltip-body').innerHTML = `
      Type: ${file.type}<br>
      ${file.is_bug_file ? '⚠️ Bug location' : ''}
    `;
  }
}
757
+
758
// ── Timeline controls ─────────────────────────────────────────────────────────
// Slider handler: jump the visualization straight to the chosen step.
function onSliderChange(val) {
  currentStep = parseInt(val);
  applyStep(currentStep);
}
763
+
764
// Advance the timeline one step; no-op when already at the end.
function stepForward() {
  if (currentStep >= maxStep) return;
  currentStep += 1;
  document.getElementById('timeline-slider').value = currentStep;
  applyStep(currentStep);
}
771
+
772
// Rewind the timeline one step; no-op when already at the start.
function stepBack() {
  if (currentStep <= 0) return;
  currentStep -= 1;
  document.getElementById('timeline-slider').value = currentStep;
  applyStep(currentStep);
}
779
+
780
// Toggle auto-play: advances one step every 900 ms until the last step,
// then flips back to the paused state.
function togglePlay() {
  playing = !playing;
  const btn = document.getElementById('play-btn');
  btn.textContent = playing ? '⏸ Pause' : '▶ Play';

  // Defensive: never allow two intervals to run at once.
  if (playInterval) {
    clearInterval(playInterval);
    playInterval = null;
  }

  if (!playing) return;

  if (currentStep >= maxStep) {
    // FIX: restarting from the end previously reset only the counter — the
    // slider and scene stayed on the last step until the first 900 ms tick.
    currentStep = 0;
    document.getElementById('timeline-slider').value = 0;
    applyStep(0);
  }
  playInterval = setInterval(() => {
    if (currentStep >= maxStep) {
      playing = false;
      btn.textContent = '▶ Play';
      clearInterval(playInterval);
      playInterval = null;
      return;
    }
    stepForward();
  }, 900);
}
799
+
800
// Toggle the slow automatic camera orbit; keeps the button label and its
// `active` CSS class in sync with the new state.
function toggleOrbit() {
  orbitActive = !orbitActive;
  const btn = document.getElementById('orbit-btn');
  btn.textContent = orbitActive ? '⏹ Stop' : '🔄 Orbit';
  btn.classList.toggle('active', orbitActive);
}
806
+
807
// Restore the default camera orbit and rewind the timeline to step 0.
// (Does not stop auto-orbit or playback if they are running.)
function resetView() {
  spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
  currentStep = 0;
  document.getElementById('timeline-slider').value = 0;
  applyStep(0);
}
813
+
814
// Refresh the "Step X / Y" caption under the timeline.
function updateStepLabel(step, max) {
  document.getElementById('step-label').textContent = 'Step ' + step + ' / ' + max;
}
817
+
818
// ── Animation loop ────────────────────────────────────────────────────────────
let frame = 0; // monotonically increasing frame counter driving the idle motion

// Per-frame render: orbit the camera, pulse/spin the agent marker, and give
// every file node a gentle phase-offset vertical bob.
function animate() {
  requestAnimationFrame(animate);
  frame += 1;

  updateCamera();

  if (agentSphere) {
    const pulse = 1 + Math.sin(frame * 0.08) * 0.15; // breathing effect
    agentSphere.scale.set(pulse, pulse, pulse);
    agentSphere.rotation.y += 0.03;
  }

  Object.values(nodeObjects).forEach((obj, i) => {
    obj.mesh.position.y = obj.position.y + Math.sin(frame * 0.02 + i) * 0.05;
  });

  renderer.render(scene, camera);
}
840
+
841
// ── Window resize ─────────────────────────────────────────────────────────────
// Keep the projection matrix and the drawing buffer in sync with the viewport.
window.addEventListener('resize', () => {
  camera.aspect = window.innerWidth / window.innerHeight;
  camera.updateProjectionMatrix();
  renderer.setSize(window.innerWidth, window.innerHeight);
});
847
+
848
// ── Public API for Gradio integration ────────────────────────────────────────
// Accepts a trajectory payload as an object or a JSON string and rebuilds the
// scene from it. Malformed JSON is logged to the console, never thrown.
window.loadTrajectoryData = function (jsonData) {
  try {
    const payload = typeof jsonData === 'string' ? JSON.parse(jsonData) : jsonData;
    buildScene(payload);
  } catch (err) {
    console.error('Failed to load trajectory data:', err);
  }
};
857
+
858
// ── Init ─────────────────────────────────────────────────────────────────────
// On DOM ready: load the embedded trajectory payload, build the scene, hide
// the loader overlay, and start the render loop.
document.addEventListener('DOMContentLoaded', () => {
  buildScene(loadVizData());
  document.getElementById('loader').style.display = 'none';
  animate();
});
865
+ </script>
866
+ </body>
867
+ </html>