Chirag0123 commited on
Commit
0b0338d
ยท
1 Parent(s): dfbd16e

v4 Research Modules & Pre-submission tweaks

Browse files
app.py CHANGED
@@ -1,16 +1,21 @@
1
  #!/usr/bin/env python3
2
  """
3
- app.py โ€” Gradio UI v3.0 โ€” Full Platform Entry Point
4
-
5
- Tabs:
6
- ๐ŸŽฎ Interactive โ€” manual step-by-step control
7
- ๐Ÿค– Run Agent โ€” built-in deterministic agent demo
8
- ๐Ÿ“Š Evaluation โ€” 6-dimension evaluation report
9
- ๐Ÿง  Intelligence โ€” failure classification, strategy, advanced metrics
10
- ๐Ÿ” Self-Improve โ€” improvement plan after failure
11
- โš–๏ธ Compare Agents โ€” side-by-side multi-agent comparison
12
- ๐ŸŒ 3D Visualizer โ€” Three.js trajectory visualization
13
- ๐Ÿ“– API โ€” REST API reference
 
 
 
 
 
14
  """
15
  import os
16
  import json
@@ -22,6 +27,12 @@ from server.strategy_detector import StrategyDetector
22
  from server.advanced_metrics import AdvancedMetricsEngine
23
  from server.self_improvement import SelfImprovementEngine
24
  from server.multi_agent import MultiAgentComparison
 
 
 
 
 
 
25
 
26
  # โ”€โ”€ Global instances โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
27
  env = CodebaseNavEnvironment()
@@ -30,36 +41,56 @@ strategy_det = StrategyDetector()
30
  adv_metrics_engine = AdvancedMetricsEngine()
31
  improvement_engine = SelfImprovementEngine()
32
  multi_agent_engine = MultiAgentComparison()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  # โ”€โ”€ Tab 1: Interactive โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
36
 
37
- def reset_environment(task: str):
38
  try:
39
  result = env.reset(task=task)
40
  obs = result.observation
41
  tree = "\n".join(f" ๐Ÿ“„ {f}" for f in obs.repo_tree)
42
- failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None listed"
43
  fi = result.info.get("fault_injection", {})
44
  faults = ""
45
  if fi.get("faults_injected"):
46
- faults = f"\n\nโš ๏ธ Fault Injection ({fi.get('difficulty_multiplier', 1.0):.1f}x):\n"
47
  faults += "\n".join(f" โ€ข {f}" for f in fi["faults_injected"][:5])
48
-
49
  status = (
50
- f"โœ… Episode Started โ€” {task} (variant: {result.info.get('variant_id', '?')})\n"
51
- f"โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\n"
52
- f"Steps: {obs.steps_remaining} remaining\n\n"
53
  f"๐Ÿ“ Files:\n{tree}\n\n"
54
  f"๐Ÿ”ด Failing Tests: {failing}\n\n"
55
- f"๐Ÿ“‹ Task: {obs.task_description}{faults}"
56
  )
57
  return status, "", "0", "0.000"
58
  except Exception as e:
59
  return f"โŒ Error: {e}", "", "0", "0.000"
60
 
61
 
62
- def take_step(action_type: str, path: str, query: str, content: str):
63
  if env.done:
64
  return "โŒ Episode done. Reset first.", "", "", ""
65
  try:
@@ -71,83 +102,88 @@ def take_step(action_type: str, path: str, query: str, content: str):
71
  )
72
  result = env.step(action)
73
  obs = result.observation
74
- result_text = obs.last_action_result or "No output"
75
- error = f"\nโš ๏ธ {obs.last_action_error}" if obs.last_action_error else ""
76
  flags = result.info.get("security_flags", [])
77
- sec = f"\n๐Ÿ”’ Security: {flags}" if flags else ""
78
-
79
  status = (
80
- f"Step {result.info['steps_taken']} | "
81
- f"Reward: {result.reward:+.3f} | "
82
- f"Steps left: {obs.steps_remaining}{error}{sec}"
83
  )
84
  if result.done:
85
  status += f"\n\n๐Ÿ DONE โ€” Score: {result.info['final_score']:.3f}"
86
-
87
- return (
88
- status,
89
- result_text[:3000],
90
- str(result.info["steps_taken"]),
91
- f"{result.info.get('cumulative_reward', 0):.3f}",
92
- )
93
  except Exception as e:
94
- return f"โŒ Error: {e}", "", "", ""
95
 
96
 
97
  # โ”€โ”€ Tab 2: Run Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
98
 
99
- def run_builtin_agent(task: str):
100
  try:
101
  result = env.reset(task=task)
102
  obs = result.observation
103
- log = [
104
- f"๐Ÿš€ {task} (variant: {result.info.get('variant_id')})",
105
- f" Files: {obs.repo_tree}",
106
- f" Failing: {obs.failing_tests}",
107
- ]
108
  tree = obs.repo_tree
 
109
  test_files = sorted([f for f in tree if f.startswith("tests/")])
110
  src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
111
  spec_files = sorted([f for f in tree if f.endswith(".md")])
112
  steps = 0
113
 
114
  if task == "task3" and spec_files:
115
- for sf in spec_files:
116
  if env.done: break
117
  r = env.step(RepoAction(action_type="read_file", path=sf))
118
- steps += 1
119
- log.append(f" Step {steps}: read_file {sf} โ†’ {r.reward:+.3f}")
120
 
121
  for tf in test_files:
122
  if env.done: break
123
  r = env.step(RepoAction(action_type="read_file", path=tf))
124
- steps += 1
125
- log.append(f" Step {steps}: read_file {tf} โ†’ {r.reward:+.3f}")
 
 
 
126
 
127
  for sf in src_files:
128
- if env.done or steps >= 12: break
129
  r = env.step(RepoAction(action_type="read_file", path=sf))
130
- steps += 1
131
- log.append(f" Step {steps}: read_file {sf} โ†’ {r.reward:+.3f}")
132
 
133
  if not env.done and test_files:
134
  r = env.step(RepoAction(action_type="run_tests", path=test_files[0]))
135
- steps += 1
136
- log.append(f" Step {steps}: run_tests โ†’ {r.reward:+.3f}")
137
 
138
  if not env.done:
139
  r = env.step(RepoAction(action_type="submit"))
140
- steps += 1
141
- log.append(f" Step {steps}: submit โ†’ {r.reward:+.3f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
- log += [
144
- f"\n๐Ÿ Score: {env.final_score:.3f}",
145
- f" Steps: {steps}",
146
- f" Reward: {env.cumulative_reward:.3f}",
147
- ]
148
  return "\n".join(log)
149
  except Exception as e:
150
- return f"โŒ Error: {e}"
151
 
152
 
153
  # โ”€โ”€ Tab 3: Evaluation โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
@@ -156,55 +192,42 @@ def get_evaluation():
156
  try:
157
  ev = env.get_evaluation()
158
  if "error" in ev:
159
- return "No evaluation available. Run an episode first."
160
- lines = [
161
- f"๐ŸŽฏ Composite Score: {ev['composite_score']:.3f}",
162
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
163
- ]
164
  for name, dim in ev.get("dimensions", {}).items():
165
- bar = "โ–ˆ" * int(dim["score"] * 20) + "โ–‘" * (20 - int(dim["score"] * 20))
166
  lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
167
- for e in dim.get("evidence", [])[:2]:
168
  lines.append(f" โ†’ {e}")
169
  if ev.get("strengths"):
170
  lines += ["\n๐Ÿ’ช Strengths:"] + [f" โœ… {s}" for s in ev["strengths"]]
171
  if ev.get("failure_analysis"):
172
  lines += ["\nโš ๏ธ Failures:"] + [f" โŒ {f}" for f in ev["failure_analysis"]]
173
  if ev.get("recommendations"):
174
- lines += ["\n๐Ÿ’ก Recommendations:"] + [f" โ†’ {r}" for r in ev["recommendations"]]
175
  return "\n".join(lines)
176
  except Exception as e:
177
  return f"Error: {e}"
178
 
179
-
180
  def get_metrics():
181
  try:
182
  return json.dumps(env.get_metrics(), indent=2, default=str)
183
  except Exception as e:
184
  return f"Error: {e}"
185
 
186
-
187
  def get_trajectory():
188
  try:
189
  t = env.get_trajectory()
190
- if not t:
191
- return "No trajectory. Run an episode first."
192
  lines = [
193
- f"Episode: {t.get('episode_id')}",
194
- f"Task: {t.get('task')} | Variant: {t.get('variant_id')}",
195
- f"Score: {t.get('final_score', 0):.3f} | Duration: {t.get('duration_seconds', '?')}s",
196
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
197
  ]
198
- emojis = {"read_file": "๐Ÿ“–", "write_file": "โœ๏ธ", "run_tests": "๐Ÿงช",
199
- "search_code": "๐Ÿ”", "submit": "๐Ÿ"}
200
- for step in t.get("steps", []):
201
- em = emojis.get(step["action_type"], "โ€ข")
202
  p = step.get("action_path") or step.get("action_query") or ""
203
  err = " โŒ" if step.get("error") else ""
204
- lines.append(
205
- f" {em} {step['step_number']:2d}: {step['action_type']:12s} {p:30s} "
206
- f"reward={step['reward']:+.3f} ({step['duration_ms']:.0f}ms){err}"
207
- )
208
  return "\n".join(lines)
209
  except Exception as e:
210
  return f"Error: {e}"
@@ -214,294 +237,310 @@ def get_trajectory():
214
 
215
  def get_failure_classification():
216
  try:
217
- traj = env.get_trajectory()
218
- if not traj:
219
- return "No trajectory. Run an episode first."
220
- meta = env.variant.meta if env.variant else {}
221
- report = failure_clf.classify(
222
- episode_id=traj.get("episode_id", ""),
223
- task=env.current_task or "unknown",
224
- trajectory_steps=traj.get("steps", []),
225
- variant_meta=meta,
226
- files_read=list(env.files_read),
227
- files_written=list(env.files_written),
228
- final_score=env.final_score,
229
- security_violations=env.security_violations,
230
- )
231
- d = report.to_dict()
232
  lines = [
233
  f"{'โœ… SUCCESS' if d['success'] else 'โŒ FAILURE'}",
234
- f"Primary Failure Type: {d['primary_failure']}",
235
- f"Failures Detected: {d['failure_count']}",
236
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
237
  ]
238
- for f in d.get("failures", []):
239
- lines += [
240
- f"\n[{f['severity'].upper()}] {f['type']} @ Step {f['step']}",
241
- f" Evidence: {f['evidence']}",
242
- f" Root Cause: {f['root_cause']}",
243
- f" Fix: {f['remediation']}",
244
- ]
245
  if d.get("failure_summary"):
246
  lines += ["\n๐Ÿ“‹ Summary:", f" {d['failure_summary']}"]
247
  if d.get("retry_hint"):
248
- lines += ["\n๐Ÿ” Retry Hint:", f" {d['retry_hint']}"]
249
  return "\n".join(lines)
250
- except Exception as e:
251
- return f"Error: {e}"
252
 
253
 
254
  def get_strategy_detection():
255
  try:
256
- traj = env.get_trajectory()
257
- if not traj:
258
- return "No trajectory. Run an episode first."
259
- meta = env.variant.meta if env.variant else {}
260
- report = strategy_det.detect(
261
- trajectory_steps=traj.get("steps", []),
262
- task=env.current_task or "unknown",
263
- variant_meta=meta,
264
- files_read=list(env.files_read),
265
- final_score=env.final_score,
266
- )
267
- d = report.to_dict()
268
- score_bar = "โ–ˆ" * int(d["score"] * 20) + "โ–‘" * (20 - int(d["score"] * 20))
269
  lines = [
270
- f"๐Ÿงญ Strategy: {d['strategy']}",
271
- f" Score: [{score_bar}] {d['score']:.3f}",
272
- f" Confidence: {d['confidence']:.0%}",
273
- f"\n๐Ÿ“– {d['strategy_description']}",
274
- f"\n๐Ÿ“Š Exploration Ratio: {d['exploration_ratio']:.2f} "
275
- f"({'explore-heavy' if d['exploration_ratio'] > 0.6 else 'exploit-heavy' if d['exploration_ratio'] < 0.4 else 'balanced'})",
276
- f" Strategy Pivots: {d['pivot_count']}",
277
  ]
278
- if d.get("sub_patterns"):
279
- lines += ["\n๐Ÿ”– Sub-patterns:"] + [f" โ€ข {p}" for p in d["sub_patterns"]]
280
- if d.get("evidence"):
281
- lines += ["\n๐Ÿ” Evidence:"] + [f" โ†’ {e}" for e in d["evidence"]]
282
  return "\n".join(lines)
283
- except Exception as e:
284
- return f"Error: {e}"
285
 
286
 
287
  def get_advanced_metrics():
288
  try:
289
- traj = env.get_trajectory()
290
- if not traj:
291
- return "No trajectory. Run an episode first."
292
- meta = env.variant.meta if env.variant else {}
293
- report = adv_metrics_engine.compute(
294
- trajectory_steps=traj.get("steps", []),
295
- variant_meta=meta,
296
- final_score=env.final_score,
297
- files_read=list(env.files_read),
298
- files_written=list(env.files_written),
299
- )
300
- d = report.to_dict()
301
-
302
- def bar(v):
303
- return "โ–ˆ" * int(v * 20) + "โ–‘" * (20 - int(v * 20))
304
-
305
- lines = [
306
- "โšก ADVANCED METRICS",
307
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
308
  f" Reasoning Efficiency [{bar(d['reasoning_efficiency'])}] {d['reasoning_efficiency']:.3f}",
309
  f" Reliability Index [{bar(d['reliability_index'])}] {d['reliability_index']:.3f}",
310
  f" Exploration Ratio [{bar(d['exploration_ratio'])}] {d['exploration_ratio']:.3f}",
311
  f" Decision Entropy [{bar(d['decision_entropy'])}] {d['decision_entropy']:.3f}",
312
  f" Wasteful Ratio [{bar(d['wasteful_ratio'])}] {d['wasteful_ratio']:.3f}",
313
- f" Pivot Rate {d['pivot_rate']:.2f} per 10 steps",
314
- f" Consistency [{bar(d['consistency_score'])}] {d['consistency_score']:.3f} ({d['runs_analyzed']} runs)",
315
- "\n๐Ÿ“Š Action Distribution:",
316
  ]
317
- for action, count in d.get("action_distribution", {}).items():
318
- lines.append(f" {action:15s}: {count}")
319
- if d.get("useful_actions"):
320
- lines += ["\nโœ… Useful Actions:"] + [f" โ€ข {a}" for a in d["useful_actions"]]
321
- if d.get("wasteful_actions"):
322
- lines += ["\nโš ๏ธ Wasteful Actions:"] + [f" โ€ข {a}" for a in d["wasteful_actions"]]
323
- lines += ["\n๐Ÿ”’ Reliability Breakdown:"]
324
- for k, v in d.get("reliability_breakdown", {}).items():
325
- lines.append(f" {k:15s}: {v:.3f}")
326
  return "\n".join(lines)
327
- except Exception as e:
328
- return f"Error: {e}"
329
 
330
 
331
  # โ”€โ”€ Tab 5: Self-Improve โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
332
 
333
  def get_improvement_plan():
334
  try:
335
- traj = env.get_trajectory()
336
- if not traj:
337
- return "No trajectory. Run an episode first."
338
- meta = env.variant.meta if env.variant else {}
339
- steps = traj.get("steps", [])
340
-
341
- fail_report = failure_clf.classify(
342
- episode_id=traj.get("episode_id", ""),
343
- task=env.current_task or "unknown",
344
- trajectory_steps=steps,
345
- variant_meta=meta,
346
- files_read=list(env.files_read),
347
- files_written=list(env.files_written),
348
- final_score=env.final_score,
349
- security_violations=env.security_violations,
350
- )
351
  plan = improvement_engine.generate_improvement_plan(
352
- episode_id=traj.get("episode_id", ""),
353
- task=env.current_task or "unknown",
354
- failure_type=fail_report.primary_failure,
355
- failure_evidence=[f.evidence for f in fail_report.failures],
356
- original_score=env.final_score,
357
- trajectory_steps=steps,
358
- files_read=list(env.files_read),
359
- files_written=list(env.files_written),
360
  )
361
  d = plan.to_dict()
362
  lines = [
363
- f"๐Ÿ” SELF-IMPROVEMENT PLAN",
364
- f"โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
365
- f"Original Score: {d['original_score']:.3f}",
366
- f"Failure Type: {d['failure_type']}",
367
- f"\nโŒ What Went Wrong:\n {d['what_went_wrong']}",
368
- f"\n๐ŸŽฏ Improved Strategy:\n {d['improved_strategy']}",
369
- f"\n๐Ÿ“‹ Step-by-Step Plan:",
370
- ]
371
- for step in d.get("step_by_step_plan", []):
372
- lines.append(f" {step}")
373
- if d.get("specific_errors"):
374
- lines += ["\n๐Ÿ”Ž Specific Errors:"] + [f" โ€ข {e}" for e in d["specific_errors"][:5]]
375
- lines += [
376
- "\n๐Ÿ’‰ System Prompt Injection (for next LLM run):",
377
- "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€",
378
- d.get("system_prompt_addon", "No injection needed."),
379
- ]
380
  return "\n".join(lines)
381
- except Exception as e:
382
- return f"Error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
 
385
  # โ”€โ”€ Tab 6: Compare Agents โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
386
 
387
- def run_comparison(task: str, selected_agents: list):
388
  try:
389
- agents = selected_agents if selected_agents else None
390
  report = multi_agent_engine.compare(env, task=task, agents=agents)
391
  d = report.to_dict()
392
-
393
  lines = [
394
  f"โš–๏ธ MULTI-AGENT COMPARISON โ€” {task} (variant: {d.get('variant_id')})",
395
- f"๐Ÿ† Winner: {d.get('winner')} (score: {d.get('winner_score', 0):.3f})",
396
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
397
- f"{'Rank':<6} {'Agent':<16} {'Score':<8} {'Steps':<8} {'Strategy':<22} {'Failure':<22} {'Reliability':<12}",
398
- "โ”€" * 100,
399
  ]
400
- for row in d.get("summary_table", []):
401
- lines.append(
402
- f"#{row['rank']:<5} {row['agent']:<16} {row['score']:<8.3f} "
403
- f"{row['steps']:<8} {row['strategy']:<22} {row['failure']:<22} {row['reliability']:<12.3f}"
404
- )
405
- lines.append("โ”" * 100)
406
-
407
  if d.get("insights"):
408
  lines += ["\n๐Ÿ’ก Insights:"] + [f" โ†’ {i}" for i in d["insights"]]
409
-
410
- lines.append("\n๐Ÿ“Š Per-Agent Action Sequences:")
411
- for run in d.get("detailed_runs", []):
412
- seq = " โ†’ ".join(run.get("action_sequence", []))
413
  lines.append(f" {run['agent_name']:16s}: {seq}")
414
-
415
  return "\n".join(lines)
416
- except Exception as e:
417
- return f"โŒ Error: {e}"
418
 
419
 
420
  # โ”€โ”€ Tab 7: 3D Visualizer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
421
 
422
- def get_viz_html():
423
- """Generate the 3D visualizer HTML with current trajectory data injected."""
424
- # Load the static HTML template
425
- static_path = os.path.join(os.path.dirname(__file__), "static", "viz3d.html")
426
- if not os.path.exists(static_path):
427
- return "<p style='color:red'>viz3d.html not found in static/</p>"
 
 
 
 
 
 
 
428
 
429
- with open(static_path, "r") as f:
430
- html = f.read()
431
 
432
- # Get viz data from current environment
433
- traj = env.get_trajectory()
434
- if traj:
435
- meta = env.variant.meta if env.variant else {}
436
- bug_files = set(meta.get("bug_files", []))
437
- files = []
438
- if env.variant:
439
- for fname in env.variant.get_tree():
440
- ftype = "test" if fname.startswith("tests/") else \
441
- "spec" if fname.endswith(".md") else "src"
442
- files.append({
443
- "name": fname,
444
- "type": ftype,
445
- "is_bug_file": fname in bug_files,
446
- "visited": fname in env.files_read,
447
- "modified": fname in env.files_written,
448
- })
449
-
450
- test_files = [f["name"] for f in files if f["type"] == "test"]
451
- src_files = [f["name"] for f in files if f["type"] == "src"]
452
- deps = []
453
- for tf in test_files:
454
- for sf in src_files:
455
- deps.append({"from": tf, "to": sf})
456
-
457
- steps_data = []
458
- for step in traj.get("steps", []):
459
- steps_data.append({
460
- "step": step.get("step_number", 0),
461
- "action": step.get("action_type", ""),
462
- "path": step.get("action_path"),
463
- "reward": step.get("reward", 0.0),
464
- "error": step.get("error"),
465
- "pass_rate": step.get("test_pass_rate"),
466
- })
467
-
468
- strategy_report = strategy_det.detect(
469
- traj.get("steps", []),
470
- env.current_task or "unknown",
471
- meta,
472
- list(env.files_read),
473
- env.final_score,
474
- ) if traj.get("steps") else None
475
-
476
- viz_data = {
477
- "task": env.current_task or "unknown",
478
- "variant_id": traj.get("variant_id", "unknown"),
479
- "final_score": env.final_score,
480
- "strategy": strategy_report.strategy if strategy_report else "UNKNOWN",
481
- "failure_type": "โ€”",
482
- "files": files,
483
- "dependencies": deps,
484
- "steps": steps_data,
485
- }
486
- data_json = json.dumps(viz_data)
487
- else:
488
- data_json = ""
489
-
490
- # Inject data into HTML
491
- html = html.replace(
492
- '<div id="viz-data" style="display:none"></div>',
493
- f'<div id="viz-data" style="display:none">{data_json}</div>'
494
- )
495
- return html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
 
498
- # โ”€โ”€ Build Gradio UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
499
 
500
- with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v3") as demo:
501
  gr.Markdown(
502
- "# ๐Ÿ” Codebase Navigation & Repair โ€” OpenEnv v3\n"
503
- "**The most advanced debugging + evaluation platform for AI coding agents.** "
504
- "Navigate codebases ยท Fix bugs ยท Evaluate process ยท Visualize in 3D."
505
  )
506
 
507
  with gr.Tabs():
@@ -510,19 +549,12 @@ with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v3") as demo:
510
  with gr.TabItem("๐ŸŽฎ Interactive"):
511
  with gr.Row():
512
  with gr.Column(scale=1):
513
- task_select = gr.Dropdown(
514
- ["task1", "task2", "task3"], value="task1",
515
- label="Task",
516
- info="task1=bugs, task2=cross-module, task3=feature impl"
517
- )
518
  reset_btn = gr.Button("๐Ÿ”„ Reset Environment", variant="primary")
519
  gr.Markdown("### Action")
520
- act_type = gr.Dropdown(
521
- ["read_file", "write_file", "run_tests", "search_code", "submit"],
522
- value="read_file", label="Action Type",
523
- )
524
  act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
525
- act_query = gr.Textbox(label="Query (search_code)", placeholder="validate_token")
526
  act_content = gr.Textbox(label="Content (write_file)", lines=4)
527
  step_btn = gr.Button("โ–ถ๏ธ Execute Step", variant="secondary")
528
  with gr.Column(scale=2):
@@ -531,16 +563,16 @@ with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v3") as demo:
531
  with gr.Row():
532
  steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
533
  reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
534
- reset_btn.click(reset_environment, [task_select], [status_box, result_box, steps_box, reward_box])
535
  step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
536
 
537
  # โ”€โ”€ Tab 2: Run Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
538
  with gr.TabItem("๐Ÿค– Run Agent"):
539
- gr.Markdown("### Built-in Demonstration Agent\nRuns deterministic readโ†’submit strategy.")
540
- agent_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
541
  run_btn = gr.Button("๐Ÿš€ Run Agent", variant="primary")
542
- agent_output = gr.Textbox(label="Agent Log", lines=20, interactive=False)
543
- run_btn.click(run_builtin_agent, [agent_task], [agent_output])
544
 
545
  # โ”€โ”€ Tab 3: Evaluation โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
546
  with gr.TabItem("๐Ÿ“Š Evaluation"):
@@ -553,107 +585,164 @@ with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v3") as demo:
553
  metrics_btn.click(get_metrics, outputs=[eval_out])
554
  traj_btn.click(get_trajectory, outputs=[eval_out])
555
 
556
- # โ”€โ”€ Tab 4: ๐Ÿง  Intelligence โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
557
  with gr.TabItem("๐Ÿง  Intelligence"):
558
- gr.Markdown(
559
- "### Deep Agent Intelligence Analysis\n"
560
- "Failure classification, strategy detection, and advanced behavioral metrics."
561
- )
562
  with gr.Row():
563
- classify_btn = gr.Button("๐Ÿ”ฌ Classify Failure", variant="primary")
564
- strategy_btn = gr.Button("๐Ÿงญ Detect Strategy", variant="secondary")
565
  adv_btn = gr.Button("โšก Advanced Metrics", variant="secondary")
566
  intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
567
- classify_btn.click(get_failure_classification, outputs=[intel_out])
568
- strategy_btn.click(get_strategy_detection, outputs=[intel_out])
569
  adv_btn.click(get_advanced_metrics, outputs=[intel_out])
570
 
571
- # โ”€โ”€ Tab 5: ๐Ÿ” Self-Improve โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
572
  with gr.TabItem("๐Ÿ” Self-Improve"):
573
- gr.Markdown(
574
- "### Self-Improvement Loop\n"
575
- "After a failure, this generates an actionable improvement plan and a "
576
- "system prompt injection for the agent's next attempt."
577
- )
578
- improve_btn = gr.Button("๐Ÿ” Generate Improvement Plan", variant="primary")
579
- improve_out = gr.Textbox(label="Improvement Plan", lines=32, interactive=False)
580
  improve_btn.click(get_improvement_plan, outputs=[improve_out])
 
581
 
582
- # โ”€โ”€ Tab 6: โš–๏ธ Compare โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
583
  with gr.TabItem("โš–๏ธ Compare Agents"):
584
- gr.Markdown(
585
- "### Multi-Agent Strategy Comparison\n"
586
- "Runs 4 built-in agent strategies on the same task to compare "
587
- "efficiency, strategy, and reliability side-by-side."
588
- )
589
  with gr.Row():
590
- comp_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
591
  comp_agents = gr.CheckboxGroup(
592
- ["test-first", "search-first", "minimal", "exhaustive"],
593
- value=["test-first", "search-first", "minimal", "exhaustive"],
594
- label="Agents to Compare",
595
  )
596
  comp_btn = gr.Button("โš–๏ธ Run Comparison", variant="primary")
597
- comp_out = gr.Textbox(label="Comparison Report", lines=30, interactive=False)
598
  comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
599
 
600
- # โ”€โ”€ Tab 7: ๐ŸŒ 3D Visualizer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€๏ฟฝ๏ฟฝ๏ฟฝโ”€โ”€โ”€
601
  with gr.TabItem("๐ŸŒ 3D Visualizer"):
602
  gr.Markdown(
603
  "### Agent Trajectory 3D Visualization\n"
604
- "Files = 3D nodes ยท Dependencies = edges ยท Agent path = animated beam ยท "
605
- "Timeline = scrubbable replay. **Run an episode first, then refresh.**"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  )
607
- refresh_viz_btn = gr.Button("๐Ÿ”„ Load Trajectory into Visualizer", variant="primary")
608
- viz_html = gr.HTML(value="<p style='color:#64748b;text-align:center;padding:40px'>Click 'Load Trajectory' after running an episode.</p>")
609
- refresh_viz_btn.click(get_viz_html, outputs=[viz_html])
610
 
611
- # โ”€โ”€ Tab 8: API โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
  with gr.TabItem("๐Ÿ“– API"):
613
  gr.Markdown("""
614
- ### REST API โ€” v3.0 Endpoints
615
 
616
- #### Core (OpenEnv-compliant)
617
- | Endpoint | Method | Description |
618
- |----------|--------|-------------|
619
- | `/reset?task=task1` | POST | Start new episode |
620
- | `/step` | POST | Take action |
621
- | `/state` | GET | Current state |
622
- | `/health` | GET | Health check |
623
 
624
  #### Evaluation
625
- | Endpoint | Method | Description |
626
- |----------|--------|-------------|
627
- | `/trajectory` | GET | Full action log |
628
- | `/evaluate` | GET | 6-dimension scores |
629
- | `/metrics` | GET | Memory + security stats |
630
- | `/fault-config` | POST | Enable fault injection |
631
-
632
- #### Intelligence (NEW in v3)
633
- | Endpoint | Method | Description |
634
- |----------|--------|-------------|
635
- | `/classify` | GET | Typed failure classification |
636
- | `/strategy` | GET | Behavioral strategy detection |
637
- | `/advanced-metrics` | GET | Entropy, reliability, consistency |
638
- | `/improvement-plan` | GET | Self-improvement feedback |
639
- | `/compare-agents` | POST | Multi-agent comparison |
640
- | `/viz-data` | GET | 3D visualization data |
641
 
642
  ```bash
643
  BASE="http://localhost:7860"
 
644
  curl -X POST "$BASE/reset?task=task1"
645
- curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"src/auth.py"}'
646
  curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
 
 
647
  curl "$BASE/classify"
648
- curl "$BASE/strategy"
649
- curl "$BASE/advanced-metrics"
650
- curl "$BASE/improvement-plan"
651
- curl -X POST "$BASE/compare-agents?task=task1"
 
 
 
652
  ```
653
  """)
654
 
655
 
656
- # โ”€โ”€ Mount FastAPI under same process โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
657
  from server.app import app as fastapi_app
658
  gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
659
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ app.py โ€” Gradio UI v4.0 โ€” Full Research Platform
4
+
5
+ 13 tabs:
6
+ ๐ŸŽฎ Interactive โ€” manual control
7
+ ๐Ÿค– Run Agent โ€” deterministic demo agent
8
+ ๐Ÿ“Š Evaluation โ€” 6-dimension process evaluation
9
+ ๐Ÿง  Intelligence โ€” failure, strategy, advanced metrics
10
+ ๐Ÿ” Self-Improve โ€” improvement plan with prompt injection
11
+ โš–๏ธ Compare Agents โ€” multi-agent strategy comparison
12
+ ๐ŸŒ 3D Visualizer โ€” Three.js trajectory viz (FIXED: iframe)
13
+ ๐Ÿงช Causal Probe โ€” causal reasoning vs guessing
14
+ ๐ŸŽญ Counterfactual โ€” brittleness / robustness testing
15
+ ๐Ÿ“ Confidence โ€” calibration: overconfident vs underconfident
16
+ ๐Ÿ† Benchmark โ€” automated leaderboard
17
+ ๐Ÿ“ˆ Analytics โ€” unified research-grade report
18
+ ๐Ÿ“– API โ€” REST reference
19
  """
20
  import os
21
  import json
 
27
  from server.advanced_metrics import AdvancedMetricsEngine
28
  from server.self_improvement import SelfImprovementEngine
29
  from server.multi_agent import MultiAgentComparison
30
+ from server.causal_probe import CausalProbe
31
+ from server.counterfactual_engine import CounterfactualEngine
32
+ from server.confidence_calibrator import ConfidenceCalibrator
33
+ from server.benchmark_runner import BenchmarkRunner
34
+ from server.analytics_engine import AnalyticsEngine
35
+ from server.memory_bank import get_global_memory
36
 
37
  # โ”€โ”€ Global instances โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
38
  env = CodebaseNavEnvironment()
 
41
  adv_metrics_engine = AdvancedMetricsEngine()
42
  improvement_engine = SelfImprovementEngine()
43
  multi_agent_engine = MultiAgentComparison()
44
+ causal_probe = CausalProbe()
45
+ counterfactual_engine = CounterfactualEngine()
46
+ confidence_calibrator = ConfidenceCalibrator()
47
+ benchmark_runner = BenchmarkRunner()
48
+ analytics_engine = AnalyticsEngine()
49
+ memory_bank = get_global_memory()
50
+
51
+
52
+ # โ”€โ”€ Helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
53
+
54
+ def _get_traj_and_meta():
55
+ traj = env.get_trajectory()
56
+ if not traj:
57
+ return None, None, None, None
58
+ meta = env.variant.meta if env.variant else {}
59
+ steps = traj.get("steps", [])
60
+ return traj, meta, steps, traj.get("episode_id", "")
61
+
62
+
63
+ def _no_traj():
64
+ return "โš ๏ธ No trajectory. Run an episode first (Interactive or Run Agent tab)."
65
 
66
 
67
  # โ”€โ”€ Tab 1: Interactive โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
68
 
69
def reset_environment(task):
    """Start a fresh episode for *task* and render the opening status panel.

    Returns (status_text, result_text, steps, cumulative_reward) matching the
    four Gradio output boxes of the Interactive tab. Errors are rendered into
    the status box instead of propagating into the UI.
    """
    try:
        result = env.reset(task=task)
        obs = result.observation
        file_tree = "\n".join(f" ๐Ÿ“„ {name}" for name in obs.repo_tree)
        if obs.failing_tests:
            failing = ", ".join(obs.failing_tests)
        else:
            failing = "None"
        fault_info = result.info.get("fault_injection", {})
        faults = ""
        injected = fault_info.get("faults_injected")
        if injected:
            # Show at most 5 injected faults plus the difficulty multiplier.
            faults = f"\n\nโš ๏ธ Fault Injection ({fault_info.get('difficulty_multiplier',1):.1f}ร—):\n"
            faults += "\n".join(f" โ€ข {fault}" for fault in injected[:5])
        status = (
            f"โœ… Episode started โ€” {task} (variant: {result.info.get('variant_id','?')})\n"
            f"โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\n"
            f"Steps remaining: {obs.steps_remaining}\n\n"
            f"๐Ÿ“ Files:\n{file_tree}\n\n"
            f"๐Ÿ”ด Failing Tests: {failing}\n\n"
            f"๐Ÿ“‹ {obs.task_description}{faults}"
        )
        return status, "", "0", "0.000"
    except Exception as e:
        return f"โŒ Error: {e}", "", "0", "0.000"
91
 
92
 
93
+ def take_step(action_type, path, query, content):
94
  if env.done:
95
  return "โŒ Episode done. Reset first.", "", "", ""
96
  try:
 
102
  )
103
  result = env.step(action)
104
  obs = result.observation
105
+ result_text = obs.last_action_result or ""
106
+ err = f"\nโš ๏ธ {obs.last_action_error}" if obs.last_action_error else ""
107
  flags = result.info.get("security_flags", [])
108
+ sec = f"\n๐Ÿ”’ {flags}" if flags else ""
 
109
  status = (
110
+ f"Step {result.info['steps_taken']} | Reward: {result.reward:+.3f} | "
111
+ f"Left: {obs.steps_remaining}{err}{sec}"
 
112
  )
113
  if result.done:
114
  status += f"\n\n๐Ÿ DONE โ€” Score: {result.info['final_score']:.3f}"
115
+ return status, result_text[:3000], str(result.info["steps_taken"]), f"{result.info.get('cumulative_reward',0):.3f}"
 
 
 
 
 
 
116
  except Exception as e:
117
+ return f"โŒ {e}", "", "", ""
118
 
119
 
120
  # โ”€โ”€ Tab 2: Run Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
121
 
122
def run_builtin_agent(task):
    """Run the deterministic built-in demo agent on *task* and return its log.

    Strategy: read spec files (task3 only), then tests, search for definitions,
    read source up to a step budget, run the first test file, and submit.
    Afterwards the episode is classified and stored in the memory bank.
    """
    try:
        result = env.reset(task=task)
        obs = result.observation
        tree = obs.repo_tree
        log = [f"๐Ÿš€ {task} (variant: {result.info.get('variant_id')})", f" Files: {tree}"]
        steps = 0

        def _act(action, label):
            # Execute one environment step and append a numbered log line.
            nonlocal steps
            r = env.step(action)
            steps += 1
            log.append(f" Step {steps}: {label} โ†’ {r.reward:+.3f}")

        test_files = sorted(f for f in tree if f.startswith("tests/"))
        src_files = sorted(f for f in tree if f.startswith("src/") and f.endswith(".py"))
        spec_files = sorted(f for f in tree if f.endswith(".md"))

        if task == "task3" and spec_files:
            for spec in spec_files[:2]:
                if env.done:
                    break
                _act(RepoAction(action_type="read_file", path=spec), f"read_file {spec}")

        for test_path in test_files:
            if env.done:
                break
            _act(RepoAction(action_type="read_file", path=test_path), f"read_file {test_path}")

        if not env.done:
            _act(RepoAction(action_type="search_code", query="def "), "search_code")

        for src_path in src_files:
            # Cap exploration so there is always budget left to test and submit.
            if env.done or steps >= 14:
                break
            _act(RepoAction(action_type="read_file", path=src_path), f"read_file {src_path}")

        if not env.done and test_files:
            _act(RepoAction(action_type="run_tests", path=test_files[0]), "run_tests")

        if not env.done:
            _act(RepoAction(action_type="submit"), "submit")

        log += ["", f"๐Ÿ Score: {env.final_score:.3f} | Steps: {steps} | Reward: {env.cumulative_reward:.3f}"]

        # Persist this run as a lesson so later episodes can retrieve it.
        traj = env.get_trajectory()
        if traj:
            meta = env.variant.meta if env.variant else {}
            fail_r = failure_clf.classify(
                traj.get("episode_id",""), task, traj.get("steps",[]), meta,
                list(env.files_read), list(env.files_written), env.final_score
            )
            strat_r = strategy_det.detect(traj.get("steps",[]), task, meta, list(env.files_read), env.final_score)
            imp_plan = improvement_engine.generate_improvement_plan(
                traj.get("episode_id",""), task, fail_r.primary_failure,
                [], env.final_score, traj.get("steps",[]),
                list(env.files_read), list(env.files_written)
            )
            memory_bank.store(
                traj.get("episode_id",""), task, fail_r.primary_failure,
                fail_r.failure_summary or "", env.final_score,
                strat_r.strategy, traj.get("steps",[]), imp_plan.to_dict()
            )
            log.append(f"๐Ÿ’พ Stored lesson in memory bank ({memory_bank.get_stats()['total_entries']} total)")

        return "\n".join(log)
    except Exception as e:
        return f"โŒ {e}"
187
 
188
 
189
  # โ”€โ”€ Tab 3: Evaluation โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
192
  try:
193
  ev = env.get_evaluation()
194
  if "error" in ev:
195
+ return _no_traj()
196
+ lines = [f"๐ŸŽฏ Composite Score: {ev['composite_score']:.3f}", "โ”"*50]
 
 
 
197
  for name, dim in ev.get("dimensions", {}).items():
198
+ bar = "โ–ˆ" * int(dim["score"]*20) + "โ–‘" * (20-int(dim["score"]*20))
199
  lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
200
+ for e in dim.get("evidence",[])[:2]:
201
  lines.append(f" โ†’ {e}")
202
  if ev.get("strengths"):
203
  lines += ["\n๐Ÿ’ช Strengths:"] + [f" โœ… {s}" for s in ev["strengths"]]
204
  if ev.get("failure_analysis"):
205
  lines += ["\nโš ๏ธ Failures:"] + [f" โŒ {f}" for f in ev["failure_analysis"]]
206
  if ev.get("recommendations"):
207
+ lines += ["\n๐Ÿ’ก Recs:"] + [f" โ†’ {r}" for r in ev["recommendations"]]
208
  return "\n".join(lines)
209
  except Exception as e:
210
  return f"Error: {e}"
211
 
 
212
def get_metrics():
    """Return the environment's current metrics as pretty-printed JSON text."""
    try:
        metrics = env.get_metrics()
        return json.dumps(metrics, indent=2, default=str)
    except Exception as e:
        return f"Error: {e}"
217
 
 
218
def get_trajectory():
    """Render the recorded episode trajectory as a step-by-step text log."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return _no_traj()
        lines = [
            f"Episode: {traj.get('episode_id')}",
            f"Task: {traj.get('task')} | Variant: {traj.get('variant_id')}",
            f"Score: {traj.get('final_score',0):.3f} | Duration: {traj.get('duration_seconds','?')}s",
            "โ”"*60,
        ]
        icons = {"read_file":"๐Ÿ“–","write_file":"โœ๏ธ","run_tests":"๐Ÿงช","search_code":"๐Ÿ”","submit":"๐Ÿ"}
        for step in traj.get("steps",[]):
            target = step.get("action_path") or step.get("action_query") or ""
            err_mark = " โŒ" if step.get("error") else ""
            icon = icons.get(step['action_type'],'โ€ข')
            lines.append(f" {icon} {step['step_number']:2d}: {step['action_type']:12s} {target:25s} reward={step['reward']:+.3f}{err_mark}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 
237
 
238
def get_failure_classification():
    """Classify the last episode's failures and render a readable report."""
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = failure_clf.classify(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = result.to_dict()
        header = "โœ… SUCCESS" if data["success"] else "โŒ FAILURE"
        lines = [
            f"{header}",
            f"Primary: {data['primary_failure']} | Count: {data['failure_count']}",
            "โ”"*50,
        ]
        for failure in data.get("failures", []):
            lines.append(f"\n[{failure['severity'].upper()}] {failure['type']} @ step {failure['step']}")
            lines.append(f" Evidence: {failure['evidence']}")
            lines.append(f" Fix: {failure['remediation']}")
        if data.get("failure_summary"):
            lines.extend(["\n๐Ÿ“‹ Summary:", f" {data['failure_summary']}"])
        if data.get("retry_hint"):
            lines.append(f"\n๐Ÿ” Retry hint: {data['retry_hint']}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 
258
 
259
 
260
def get_strategy_detection():
    """Detect and render the navigation strategy used in the last episode."""
    try:
        traj, meta, steps, _ = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = strategy_det.detect(steps, env.current_task or "?", meta, list(env.files_read), env.final_score)
        data = result.to_dict()
        filled = int(data["score"] * 20)
        gauge = "โ–ˆ" * filled + "โ–‘" * (20 - filled)
        lines = [
            f"๐Ÿงญ Strategy: {data['strategy']}",
            f" [{gauge}] {data['score']:.3f} (confidence: {data['confidence']:.0%})",
            f"\n{data['strategy_description']}",
            f"\nExploration: {data['exploration_ratio']:.2f} | Pivots: {data['pivot_count']}",
        ]
        if data.get("sub_patterns"):
            lines.append("\nSub-patterns:")
            lines.extend(f" โ€ข {p}" for p in data["sub_patterns"])
        if data.get("evidence"):
            lines.append("\nEvidence:")
            lines.extend(f" โ†’ {item}" for item in data["evidence"])
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 
276
 
277
 
278
def get_advanced_metrics():
    """Compute and render the advanced reasoning metrics for the last episode."""
    try:
        traj, meta, steps, _ = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = adv_metrics_engine.compute(steps, meta, env.final_score, list(env.files_read), list(env.files_written))
        data = result.to_dict()

        def gauge(value):
            # 20-cell text progress bar for a 0..1 metric value.
            filled = int(value * 20)
            return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        lines = [
            "โšก ADVANCED METRICS",
            "โ”"*50,
            f" Reasoning Efficiency [{gauge(data['reasoning_efficiency'])}] {data['reasoning_efficiency']:.3f}",
            f" Reliability Index [{gauge(data['reliability_index'])}] {data['reliability_index']:.3f}",
            f" Exploration Ratio [{gauge(data['exploration_ratio'])}] {data['exploration_ratio']:.3f}",
            f" Decision Entropy [{gauge(data['decision_entropy'])}] {data['decision_entropy']:.3f}",
            f" Wasteful Ratio [{gauge(data['wasteful_ratio'])}] {data['wasteful_ratio']:.3f}",
            f" Pivot Rate {data['pivot_rate']:.2f}/10 steps | Consistency {data['consistency_score']:.3f} ({data['runs_analyzed']} runs)",
        ]
        if data.get("action_distribution"):
            lines.append("\nAction Distribution:")
            lines.extend(f" {action:14s}: {count}" for action, count in data["action_distribution"].items())
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 
297
 
298
 
299
  # โ”€โ”€ Tab 5: Self-Improve โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
300
 
301
def get_improvement_plan():
    """Generate and render a self-improvement plan from the last episode.

    Re-classifies the trajectory to obtain the primary failure, then asks the
    improvement engine for a retry strategy and a system-prompt injection.
    """
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        classification = failure_clf.classify(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        plan = improvement_engine.generate_improvement_plan(
            ep_id, env.current_task or "?", classification.primary_failure,
            [f.evidence for f in classification.failures], env.final_score,
            steps, list(env.files_read), list(env.files_written)
        )
        data = plan.to_dict()
        lines = [
            "๐Ÿ” SELF-IMPROVEMENT PLAN",
            "โ”"*50,
            f"Original Score: {data['original_score']:.3f} | Failure: {data['failure_type']}",
            f"\nโŒ What went wrong:\n {data['what_went_wrong']}",
            f"\n๐ŸŽฏ Improved strategy:\n {data['improved_strategy']}",
            "\n๐Ÿ“‹ Step-by-step plan:",
        ]
        lines.extend(f" {step_text}" for step_text in data.get("step_by_step_plan", []))
        lines.extend(["\n๐Ÿ’‰ System Prompt Injection:", "โ”€"*40, data.get("system_prompt_addon", "None")])
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
323
+
324
+
325
def get_memory_context_for_task(task):
    """Render stored lessons from the memory bank relevant to *task*.

    Shows up to three prior lessons plus the system-prompt injection the
    memory bank would hand to an agent attempting this task.
    """
    try:
        ctx = memory_bank.retrieve(task=task, max_lessons=3)
        stats = memory_bank.get_stats()
        lines = [
            f"๐Ÿง  MEMORY BANK โ€” {stats['total_entries']} total lessons",
            f"Retrieving for: {task}", "โ”"*50,
        ]
        if not ctx.relevant_lessons:
            lines.append("No lessons stored yet. Run episodes to build memory.")
        else:
            lines.append(f"\n๐Ÿ“š {ctx.lessons_count} relevant lesson(s):\n")
            for i, e in enumerate(ctx.relevant_lessons, 1):
                lines += [
                    f"[Lesson {i}] Task: {e.task} | Failure: {e.failure_type} | Score: {e.score:.2f}",
                    f" Title: {e.lesson_title}",
                    f" Lesson: {e.lesson_body[:120]}",
                    # Hint row collapses to an empty string (blank line) when absent.
                    f" Hint: {e.lesson_hint[:120]}" if e.lesson_hint else "",
                    "",
                ]
        lines += ["\n๐Ÿ’‰ System Prompt Injection:", "โ”€"*40, ctx.system_prompt_injection]
        # Fix: `"\n".join(l for l in lines)` was a no-op generator wrapper
        # (and shadowed `l`); join the list directly โ€” identical output.
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
348
 
349
 
350
  # โ”€โ”€ Tab 6: Compare Agents โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
351
 
352
def run_comparison(task, selected_agents):
    """Run the multi-agent comparison on *task* and render a ranked report.

    An empty agent selection falls back to the engine's default roster.
    """
    try:
        agents = selected_agents if selected_agents else None
        report = multi_agent_engine.compare(env, task=task, agents=agents)
        data = report.to_dict()
        lines = [
            f"โš–๏ธ MULTI-AGENT COMPARISON โ€” {task} (variant: {data.get('variant_id')})",
            f"๐Ÿ† Winner: {data.get('winner')} (score: {data.get('winner_score',0):.3f})",
            "โ”"*80,
            f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Steps':<7} {'Strategy':<22} {'Failure':<20} {'Reliability'}",
            "โ”€"*80,
        ]
        for row in data.get("summary_table", []):
            lines.append(
                f"#{row['rank']:<4} {row['agent']:<16} {row['score']:<8.3f} {row['steps']:<7} "
                f"{row['strategy']:<22} {row['failure']:<20} {row['reliability']:.3f}"
            )
        lines.append("โ”"*80)
        if data.get("insights"):
            lines.append("\n๐Ÿ’ก Insights:")
            lines.extend(f" โ†’ {insight}" for insight in data["insights"])
        lines.append("\n๐Ÿ“Š Action Sequences:")
        for run in data.get("detailed_runs", []):
            sequence = " โ†’ ".join(run.get("action_sequence", []))
            lines.append(f" {run['agent_name']:16s}: {sequence}")
        return "\n".join(lines)
    except Exception as e:
        return f"โŒ {e}"
 
374
 
375
 
376
  # โ”€โ”€ Tab 7: 3D Visualizer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
377
 
378
def get_viz_iframe():
    """Return an <iframe> embedding /static/viz3d.html โ€” fixes Three.js canvas rendering."""
    import time
    # Cache-busting timestamp so Gradio re-renders the frame on refresh.
    stamp = int(time.time())
    attrs = (
        'width="100%" height="640" frameborder="0" '
        'style="border-radius:10px;border:1px solid rgba(125,211,252,0.2);'
        'background:#0a0e1a;" '
        'allow="accelerometer; autoplay" loading="lazy"'
    )
    return f'<iframe src="/static/viz3d.html?t={stamp}" {attrs}></iframe>'
391
 
 
 
392
 
393
+ # โ”€โ”€ Tab 8: Causal Probe โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
394
+
395
def get_causal_probe():
    """Probe whether the agent understood the bug's causal chain or guessed."""
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = causal_probe.probe(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = result.to_dict()

        def gauge(value):
            # 20-cell text progress bar for a 0..1 score.
            filled = int(value * 20)
            return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        lines = [
            "๐Ÿงช CAUSAL REASONING PROBE",
            "โ”"*55,
            f"Understanding Level: {data['understanding_level']}",
            f"Causal Score: [{gauge(data['causal_score'])}] {data['causal_score']:.3f}",
            f"Chain Coverage: [{gauge(data['chain_coverage'])}] {data['chain_coverage']:.3f}",
            f"Chain Order Score: [{gauge(data['chain_order_score'])}] {data['chain_order_score']:.3f}",
            "\n๐Ÿ“ก Behavioral Signals:",
        ]
        for key, flag in data.get("behavioral_signals", {}).items():
            mark = "โœ…" if flag else "โŒ"
            lines.append(f" {mark} {key.replace('_',' ').title()}")
        if data.get("understanding_indicators"):
            lines.append("\nโœ… Understanding Indicators:")
            lines.extend(f" โ€ข {item}" for item in data["understanding_indicators"])
        if data.get("guessing_indicators"):
            lines.append("\nโŒ Guessing Indicators:")
            lines.extend(f" โ€ข {item}" for item in data["guessing_indicators"])
        diagnostics = data.get("diagnostics", {})
        if diagnostics.get("false_confidence_detected"):
            lines.append("\nโš ๏ธ FALSE CONFIDENCE DETECTED โ€” submitted without adequate exploration")
        if diagnostics.get("shortcut_learning_detected"):
            lines.append("โš ๏ธ SHORTCUT LEARNING DETECTED โ€” wrote without reading source")
        lines.append(f"\n๐Ÿ“ {data['explanation']}")
        if data.get("recommendations"):
            lines.append("\n๐Ÿ’ก Recommendations:")
            lines.extend(f" โ†’ {rec}" for rec in data["recommendations"])
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
429
+
430
+
431
+ # โ”€โ”€ Tab 9: Counterfactual โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
432
+
433
def get_counterfactual():
    """Render the counterfactual robustness report for the last episode."""
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = counterfactual_engine.analyze(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = result.to_dict()

        def gauge(value):
            # 20-cell text progress bar for a 0..1 score.
            filled = int(value * 20)
            return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        lines = [
            "๐ŸŽญ COUNTERFACTUAL ROBUSTNESS TEST",
            "โ”"*55,
            f"Brittleness Level: {data['brittleness_level']}",
            f"Robustness Score: [{gauge(data['robustness_score'])}] {data['robustness_score']:.3f}",
            f"Mutations Tested: {data['mutations_tested']}",
            f"Mutations Survived: {data['mutations_survived']} โœ… | Failed: {data['mutations_failed']} โŒ",
            "\n๐Ÿงฌ Mutation Results:",
        ]
        for mutation in data.get("mutations", []):
            mark = "โŒ" if mutation["would_break_agent"] else "โœ…"
            lines.append(f" {mark} [{mutation['type']}] {mutation['description'][:55]}")
            lines.append(f" {mutation['why'][:80]}")
        if data.get("surface_dependencies"):
            lines.append("\nโš ๏ธ Surface Dependencies:")
            lines.extend(f" โ€ข {dep}" for dep in data["surface_dependencies"])
        if data.get("deep_dependencies"):
            lines.append("\nโœ… Deep Dependencies:")
            lines.extend(f" โ€ข {dep}" for dep in data["deep_dependencies"])
        lines.append(f"\n๐Ÿ“ {data['explanation']}")
        if data.get("recommendations"):
            lines.append("\n๐Ÿ’ก Recommendations:")
            lines.extend(f" โ†’ {rec}" for rec in data["recommendations"])
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
463
+
464
+
465
+ # โ”€โ”€ Tab 10: Confidence Calibration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
466
+
467
def get_calibration():
    """Render the confidence-calibration report for the last episode."""
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = confidence_calibrator.calibrate(ep_id, env.current_task or "?", steps, env.final_score)
        data = result.to_dict()

        def gauge(value):
            # 20-cell text progress bar for a 0..1 score.
            filled = int(value * 20)
            return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        lines = [
            "๐Ÿ“ CONFIDENCE CALIBRATION REPORT",
            "โ”"*55,
            f"Calibration Profile: {data['profile']}",
            f"Calibration Score: [{gauge(data['calibration_score'])}] {data['calibration_score']:.3f}",
            f"Inferred Confidence: [{gauge(data['inferred_confidence'])}] {data['inferred_confidence']:.3f}",
            f"Actual Performance: [{gauge(data['actual_performance'])}] {data['actual_performance']:.3f}",
            f"Calibration Error: {data['expected_calibration_error']:.3f} (lower=better)",
            f"Conf-Acc Correlation: {data['confidence_accuracy_correlation']:.3f}",
            "\n๐Ÿ“Š Behavioral Signals:",
        ]
        signals = data.get("signals", {})
        lines.append(f" Commitment Speed: {signals.get('commitment_speed',0):.3f} (high=fast commit)")
        lines.append(f" Re-Exploration Rate: {signals.get('re_exploration_rate',0):.3f} (high=uncertain)")
        lines.append(f" Verification Rate: {signals.get('verification_rate',0):.3f} tests/write")
        lines.append(f" Submit Speed: {signals.get('submit_speed',0):.3f} (high=early submit)")
        lines.append(f"\n๐Ÿ“ {data['diagnosis']}")
        if data.get("recommendations"):
            lines.append("\n๐Ÿ’ก Recommendations:")
            lines.extend(f" โ†’ {rec}" for rec in data["recommendations"])
        if data.get("confidence_trajectory"):
            lines.append("\n๐Ÿ“ˆ Confidence Trajectory:")
            for snap in data["confidence_trajectory"][:8]:
                acc_str = f" | acc={snap['accuracy']:.2f}" if snap['accuracy'] is not None else ""
                lines.append(f" S{snap['step']}: {snap['action']:12s} conf={snap['confidence']:.2f}{acc_str}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
500
+
501
+
502
+ # โ”€โ”€ Tab 11: Benchmark โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
503
+
504
def run_benchmark(tasks_selected, agents_selected):
    """Run the automated benchmark over the selected tasks and agent strategies.

    Falls back to all three tasks when no task is ticked, and to the runner's
    default agent roster when no agent is ticked.
    """
    try:
        # `or` covers both None and the empty list an unticked CheckboxGroup
        # yields โ€” matches the idiom used by run_comparison.
        tasks = tasks_selected or ["task1", "task2", "task3"]
        agents = agents_selected or None
        report = benchmark_runner.run(env, tasks=tasks, agents=agents)
        return report.render_table()
    except Exception as e:
        return f"โŒ Benchmark error: {e}"
512
+
513
+
514
+ # โ”€โ”€ Tab 12: Analytics โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
515
+
516
def get_analytics():
    """Render the unified analytics report for the last episode as plain text."""
    try:
        if not env.get_trajectory():
            return _no_traj()
        return analytics_engine.analyze(env).render_text()
    except Exception as e:
        return f"Error: {e}"
524
+
525
def get_analytics_json():
    """Export the unified analytics report as pretty-printed JSON text."""
    try:
        if not env.get_trajectory():
            return _no_traj()
        report_dict = analytics_engine.analyze(env).to_dict()
        return json.dumps(report_dict, indent=2, default=str)
    except Exception as e:
        return f"Error: {e}"
533
 
534
 
535
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
536
+ # Gradio UI
537
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
538
 
539
+ with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v4") as demo:
540
  gr.Markdown(
541
+ "# ๐Ÿ” Codebase Navigation & Repair โ€” OpenEnv v4\n"
542
+ "**The first platform that scientifically measures, explains, and improves AI agent reasoning.** "
543
+ "Navigate ยท Fix ยท Evaluate Process ยท Probe Causality ยท Test Counterfactuals ยท Calibrate Confidence ยท Benchmark."
544
  )
545
 
546
  with gr.Tabs():
 
549
  with gr.TabItem("๐ŸŽฎ Interactive"):
550
  with gr.Row():
551
  with gr.Column(scale=1):
552
+ task_sel = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
 
 
 
 
553
  reset_btn = gr.Button("๐Ÿ”„ Reset Environment", variant="primary")
554
  gr.Markdown("### Action")
555
+ act_type = gr.Dropdown(["read_file","write_file","run_tests","search_code","submit"], value="read_file", label="Action Type")
 
 
 
556
  act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
557
+ act_query = gr.Textbox(label="Query", placeholder="validate_token")
558
  act_content = gr.Textbox(label="Content (write_file)", lines=4)
559
  step_btn = gr.Button("โ–ถ๏ธ Execute Step", variant="secondary")
560
  with gr.Column(scale=2):
 
563
  with gr.Row():
564
  steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
565
  reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
566
+ reset_btn.click(reset_environment, [task_sel], [status_box, result_box, steps_box, reward_box])
567
  step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
568
 
569
  # โ”€โ”€ Tab 2: Run Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
570
  with gr.TabItem("๐Ÿค– Run Agent"):
571
+ gr.Markdown("### Built-in Demonstration Agent\nRuns test-first deterministic strategy + stores lesson in memory bank.")
572
+ agent_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
573
  run_btn = gr.Button("๐Ÿš€ Run Agent", variant="primary")
574
+ agent_out = gr.Textbox(label="Agent Log", lines=22, interactive=False)
575
+ run_btn.click(run_builtin_agent, [agent_task], [agent_out])
576
 
577
  # โ”€โ”€ Tab 3: Evaluation โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
578
  with gr.TabItem("๐Ÿ“Š Evaluation"):
 
585
  metrics_btn.click(get_metrics, outputs=[eval_out])
586
  traj_btn.click(get_trajectory, outputs=[eval_out])
587
 
588
+ # โ”€โ”€ Tab 4: Intelligence โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
589
  with gr.TabItem("๐Ÿง  Intelligence"):
590
+ gr.Markdown("### Deep Agent Intelligence Analysis")
 
 
 
591
  with gr.Row():
592
+ clf_btn = gr.Button("๐Ÿ”ฌ Classify Failure", variant="primary")
593
+ strat_btn = gr.Button("๐Ÿงญ Detect Strategy", variant="secondary")
594
  adv_btn = gr.Button("โšก Advanced Metrics", variant="secondary")
595
  intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
596
+ clf_btn.click(get_failure_classification, outputs=[intel_out])
597
+ strat_btn.click(get_strategy_detection, outputs=[intel_out])
598
  adv_btn.click(get_advanced_metrics, outputs=[intel_out])
599
 
600
+ # โ”€โ”€ Tab 5: Self-Improve โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
601
  with gr.TabItem("๐Ÿ” Self-Improve"):
602
+ gr.Markdown("### Self-Improvement Loop + Episodic Memory")
603
+ with gr.Row():
604
+ improve_btn = gr.Button("๐Ÿ” Improvement Plan", variant="primary")
605
+ mem_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task for Memory")
606
+ mem_btn = gr.Button("๐Ÿง  Retrieve Memory", variant="secondary")
607
+ improve_out = gr.Textbox(label="Output", lines=32, interactive=False)
 
608
  improve_btn.click(get_improvement_plan, outputs=[improve_out])
609
+ mem_btn.click(get_memory_context_for_task, [mem_task], [improve_out])
610
 
611
+ # โ”€โ”€ Tab 6: Compare Agents โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
612
  with gr.TabItem("โš–๏ธ Compare Agents"):
613
+ gr.Markdown("### Multi-Agent Strategy Comparison")
 
 
 
 
614
  with gr.Row():
615
+ comp_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
616
  comp_agents = gr.CheckboxGroup(
617
+ ["test-first","search-first","minimal","exhaustive"],
618
+ value=["test-first","search-first","minimal","exhaustive"],
619
+ label="Agents",
620
  )
621
  comp_btn = gr.Button("โš–๏ธ Run Comparison", variant="primary")
622
+ comp_out = gr.Textbox(label="Report", lines=30, interactive=False)
623
  comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
624
 
625
+ # โ”€โ”€ Tab 7: 3D Visualizer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
626
  with gr.TabItem("๐ŸŒ 3D Visualizer"):
627
  gr.Markdown(
628
  "### Agent Trajectory 3D Visualization\n"
629
+ "Files = glowing 3D spheres ยท Dependencies = edges ยท Agent = animated beam ยท **Run an episode first.**"
630
+ )
631
+ refresh_btn = gr.Button("๐Ÿ”„ Load / Refresh Visualizer", variant="primary")
632
+ viz_html = gr.HTML(
633
+ value='<div style="text-align:center;padding:60px;color:#475569;background:#0a0e1a;border-radius:10px">'
634
+ '<p style="font-size:24px">๐ŸŒ</p>'
635
+ '<p style="color:#7dd3fc;font-weight:700">Run an episode then click Load</p></div>'
636
+ )
637
+ refresh_btn.click(get_viz_iframe, outputs=[viz_html])
638
+
639
+ # โ”€โ”€ Tab 8: Causal Probe โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
640
+ with gr.TabItem("๐Ÿงช Causal Probe"):
641
+ gr.Markdown(
642
+ "### Causal Reasoning Evaluation\n"
643
+ "Did the agent truly understand WHY the bug exists, "
644
+ "or did it pattern-match and guess? "
645
+ "Measures chain coverage, order, and shortcut learning."
646
+ )
647
+ causal_btn = gr.Button("๐Ÿงช Run Causal Probe", variant="primary")
648
+ causal_out = gr.Textbox(label="Causal Reasoning Report", lines=32, interactive=False)
649
+ causal_btn.click(get_causal_probe, outputs=[causal_out])
650
+
651
+ # โ”€โ”€ Tab 9: Counterfactual โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
652
+ with gr.TabItem("๐ŸŽญ Counterfactual"):
653
+ gr.Markdown(
654
+ "### Counterfactual Robustness Testing\n"
655
+ "Applies 6 semantic-neutral mutations (filename rename, constant change, "
656
+ "dummy function, directory shift, docstring noise, import reorder) "
657
+ "and measures whether the agent's strategy survives."
658
  )
659
+ cf_btn = gr.Button("๐ŸŽญ Run Counterfactual Analysis", variant="primary")
660
+ cf_out = gr.Textbox(label="Robustness Report", lines=32, interactive=False)
661
+ cf_btn.click(get_counterfactual, outputs=[cf_out])
662
 
663
+ # โ”€โ”€ Tab 10: Confidence โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
664
+ with gr.TabItem("๐Ÿ“ Confidence"):
665
+ gr.Markdown(
666
+ "### Confidence Calibration Analysis\n"
667
+ "Infers agent confidence from behavioral proxies (commitment speed, "
668
+ "re-exploration rate, verification rate, submit timing) "
669
+ "and compares to actual performance. Detects overconfident and underconfident agents."
670
+ )
671
+ calib_btn = gr.Button("๐Ÿ“ Analyze Calibration", variant="primary")
672
+ calib_out = gr.Textbox(label="Calibration Report", lines=32, interactive=False)
673
+ calib_btn.click(get_calibration, outputs=[calib_out])
674
+
675
+ # โ”€โ”€ Tab 11: Benchmark โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
676
+ with gr.TabItem("๐Ÿ† Benchmark"):
677
+ gr.Markdown(
678
+ "### Automated Benchmark Leaderboard\n"
679
+ "Runs all selected agent strategies ร— all selected tasks automatically. "
680
+ "Ranks by composite score: correctness + causal reasoning + robustness + calibration + generalization."
681
+ )
682
+ with gr.Row():
683
+ bench_tasks = gr.CheckboxGroup(["task1","task2","task3"], value=["task1","task2"], label="Tasks to Benchmark")
684
+ bench_agents = gr.CheckboxGroup(
685
+ ["test-first","search-first","minimal","exhaustive"],
686
+ value=["test-first","minimal"],
687
+ label="Agent Strategies",
688
+ )
689
+ bench_btn = gr.Button("๐Ÿ† Run Benchmark (2โ€“4 min)", variant="primary")
690
+ bench_out = gr.Textbox(label="Leaderboard", lines=35, interactive=False)
691
+ bench_btn.click(run_benchmark, [bench_tasks, bench_agents], [bench_out])
692
+
693
+ # โ”€โ”€ Tab 12: Analytics โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
694
+ with gr.TabItem("๐Ÿ“ˆ Analytics"):
695
+ gr.Markdown(
696
+ "### Unified Research-Grade Analytics\n"
697
+ "Synthesizes ALL evaluation dimensions into one report: "
698
+ "reasoning graph, root cause tree, alternative paths, profile tags, "
699
+ "decision efficiency, composite score. Paper-ready JSON available."
700
+ )
701
+ with gr.Row():
702
+ analytics_btn = gr.Button("๐Ÿ“ˆ Full Analytics Report", variant="primary")
703
+ analytics_json_btn = gr.Button("๐Ÿ“‹ Export JSON", variant="secondary")
704
+ analytics_out = gr.Textbox(label="Analytics Report", lines=40, interactive=False)
705
+ analytics_btn.click(get_analytics, outputs=[analytics_out])
706
+ analytics_json_btn.click(get_analytics_json, outputs=[analytics_out])
707
+
708
+ # โ”€โ”€ Tab 13: API โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
709
  with gr.TabItem("๐Ÿ“– API"):
710
  gr.Markdown("""
711
+ ### REST API โ€” v4.0 Endpoints
712
 
713
+ #### Core
714
+ | `/reset` POST | `/step` POST | `/state` GET | `/health` GET |
 
 
 
 
 
715
 
716
  #### Evaluation
717
+ | `/trajectory` GET | `/evaluate` GET | `/metrics` GET | `/fault-config` POST |
718
+
719
+ #### Intelligence (v3)
720
+ | `/classify` GET | `/strategy` GET | `/advanced-metrics` GET | `/improvement-plan` GET | `/compare-agents` POST | `/viz-data` GET |
721
+
722
+ #### Research (v4 NEW)
723
+ | `/causal-probe` GET | `/counterfactual` GET | `/confidence` GET | `/benchmark` POST | `/analytics` GET |
 
 
 
 
 
 
 
 
 
724
 
725
  ```bash
726
  BASE="http://localhost:7860"
727
+ # Run a full episode
728
  curl -X POST "$BASE/reset?task=task1"
729
+ curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"tests/test_formatter.py"}'
730
  curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
731
+
732
+ # All intelligence endpoints
733
  curl "$BASE/classify"
734
+ curl "$BASE/causal-probe"
735
+ curl "$BASE/counterfactual"
736
+ curl "$BASE/confidence"
737
+ curl "$BASE/analytics"
738
+
739
+ # Benchmark
740
+ curl -X POST "$BASE/benchmark?tasks=task1,task2"
741
  ```
742
  """)
743
 
744
 
745
+ # โ”€โ”€ Mount FastAPI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
746
  from server.app import app as fastapi_app
747
  gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
748
 
e2e_test_v3.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ e2e_test_v3.py โ€” Full End-to-End test suite for v3.0
4
+
5
+ Tests every endpoint, all 3 tasks, all new intelligence modules,
6
+ multi-agent comparison, and the 3D viz-data endpoint.
7
+ """
8
+ import sys
9
+ import json
10
+ import time
11
+ import requests
12
+
13
+ BASE = "http://localhost:7860"
14
+ PASS = 0
15
+ FAIL = 0
16
+ RESULTS = []
17
+
18
+
19
+ def check(name, condition, detail=""):
20
+ global PASS, FAIL
21
+ status = "โœ… PASS" if condition else "โŒ FAIL"
22
+ if condition:
23
+ PASS += 1
24
+ else:
25
+ FAIL += 1
26
+ msg = f" {status} {name}"
27
+ if detail:
28
+ msg += f" โ†’ {detail}"
29
+ print(msg)
30
+ RESULTS.append({"name": name, "passed": condition, "detail": detail})
31
+
32
+
33
+ def section(title):
34
+ print(f"\n{'โ”'*60}")
35
+ print(f" {title}")
36
+ print(f"{'โ”'*60}")
37
+
38
+
39
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
40
+ section("1. HEALTH & BASIC CONNECTIVITY")
41
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
42
+
43
+ r = requests.get(f"{BASE}/health")
44
+ check("GET /health returns 200", r.status_code == 200)
45
+ data = r.json()
46
+ check("Health version is 3.0.0", data.get("version") == "3.0.0", data.get("version"))
47
+ check("Health status is ok", data.get("status") == "ok")
48
+
49
+
50
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
51
+ section("2. CORE OPENENV โ€” ALL 3 TASKS")
52
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
53
+
54
+ for task in ["task1", "task2", "task3"]:
55
+ r = requests.post(f"{BASE}/reset?task={task}")
56
+ check(f"POST /reset?task={task} โ†’ 200", r.status_code == 200, f"status={r.status_code}")
57
+ if r.status_code == 200:
58
+ d = r.json()
59
+ obs = d.get("observation", {})
60
+ check(f" {task}: has repo_tree", bool(obs.get("repo_tree")), str(obs.get("repo_tree", [])[:2]))
61
+ check(f" {task}: has variant_id", bool(d.get("info", {}).get("variant_id")))
62
+ check(f" {task}: steps_remaining > 0", obs.get("steps_remaining", 0) > 0)
63
+
64
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
65
+ section("3. STEP ACTIONS โ€” FULL EPISODE (task1)")
66
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
67
+
68
+ r = requests.post(f"{BASE}/reset?task=task1")
69
+ obs = r.json()["observation"]
70
+ tree = obs["repo_tree"]
71
+ test_files = [f for f in tree if f.startswith("tests/")]
72
+ src_files = [f for f in tree if f.startswith("src/")]
73
+
74
+ # read_file
75
+ r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
76
+ check("POST /step read_file test file โ†’ 200", r.status_code == 200)
77
+ check("read_file reward >= 0", r.json().get("reward", -1) >= 0, str(r.json().get("reward")))
78
+
79
+ r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": src_files[0]})
80
+ check("POST /step read_file src file โ†’ 200", r.status_code == 200)
81
+
82
+ # search_code
83
+ r = requests.post(f"{BASE}/step", json={"action_type": "search_code", "query": "def "})
84
+ check("POST /step search_code โ†’ 200", r.status_code == 200)
85
+
86
+ # run_tests
87
+ r = requests.post(f"{BASE}/step", json={"action_type": "run_tests"})
88
+ check("POST /step run_tests โ†’ 200", r.status_code == 200, f"reward={r.json().get('reward')}")
89
+
90
+ # submit
91
+ r = requests.post(f"{BASE}/step", json={"action_type": "submit"})
92
+ check("POST /step submit โ†’ 200", r.status_code == 200)
93
+ final_score = r.json()["info"].get("final_score", 0)
94
+ check("Episode done after submit", r.json().get("done") == True)
95
+
96
+ # Try stepping after done โ†’ should get 400
97
+ r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": "x.py"})
98
+ check("POST /step after done โ†’ 400", r.status_code == 400)
99
+
100
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
101
+ section("4. STATE ENDPOINT")
102
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
103
+
104
+ requests.post(f"{BASE}/reset?task=task1")
105
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
106
+ r = requests.get(f"{BASE}/state")
107
+ check("GET /state โ†’ 200", r.status_code == 200)
108
+ d = r.json()
109
+ check("State has observation", "observation" in d)
110
+ check("State total_steps_taken >= 1", d.get("total_steps_taken", 0) >= 1)
111
+
112
+
113
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
114
+ section("5. TRAJECTORY & EVALUATION")
115
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
116
+
117
+ requests.post(f"{BASE}/step", json={"action_type": "submit"})
118
+
119
+ r = requests.get(f"{BASE}/trajectory")
120
+ check("GET /trajectory โ†’ 200", r.status_code == 200)
121
+ traj = r.json()
122
+ check("Trajectory has episode_id", bool(traj.get("episode_id")))
123
+ check("Trajectory steps > 0", len(traj.get("steps", [])) > 0, f"steps={len(traj.get('steps',[]))}")
124
+
125
+ r = requests.get(f"{BASE}/evaluate")
126
+ check("GET /evaluate โ†’ 200", r.status_code == 200)
127
+ ev = r.json()
128
+ check("Evaluation has composite_score", "composite_score" in ev, str(ev.get("composite_score")))
129
+ check("Evaluation has 6 dimensions", len(ev.get("dimensions", {})) == 6, str(list(ev.get("dimensions", {}).keys())))
130
+
131
+ r = requests.get(f"{BASE}/metrics")
132
+ check("GET /metrics โ†’ 200", r.status_code == 200)
133
+ m = r.json()
134
+ check("Metrics has timeline", "timeline" in m, str(list(m.keys())[:5]))
135
+
136
+
137
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
138
+ section("6. FAULT INJECTION")
139
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
140
+
141
+ r = requests.post(f"{BASE}/fault-config", json={"level": "light"})
142
+ check("POST /fault-config light โ†’ 200", r.status_code == 200)
143
+ r = requests.post(f"{BASE}/reset?task=task1")
144
+ check("Reset with fault injection โ†’ 200", r.status_code == 200)
145
+ fi = r.json().get("info", {}).get("fault_injection", {})
146
+ check("Fault injection info present", "difficulty_multiplier" in fi or "faults_injected" in fi, str(fi))
147
+
148
+ # Reset back
149
+ requests.post(f"{BASE}/fault-config", json={"level": "none"})
150
+
151
+
152
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
153
+ section("7. INTELLIGENCE โ€” FAILURE CLASSIFIER")
154
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
155
+
156
+ # Run a fresh episode with minimal effort to get a known failure
157
+ requests.post(f"{BASE}/reset?task=task1")
158
+ requests.post(f"{BASE}/step", json={"action_type": "submit"}) # Submit without doing anything
159
+
160
+ r = requests.get(f"{BASE}/classify")
161
+ check("GET /classify โ†’ 200", r.status_code == 200)
162
+ d = r.json()
163
+ check("Classify has episode_id", "episode_id" in d, d.get("episode_id"))
164
+ check("Classify has primary_failure", "primary_failure" in d, d.get("primary_failure"))
165
+ check("Classify has success field", "success" in d)
166
+ check("Classify success=False for minimal effort", d.get("success") == False)
167
+ check("Classify has retry_hint", bool(d.get("retry_hint")), d.get("retry_hint", "")[:60])
168
+
169
+
170
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
171
+ section("8. INTELLIGENCE โ€” STRATEGY DETECTOR")
172
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
173
+
174
+ r = requests.get(f"{BASE}/strategy")
175
+ check("GET /strategy โ†’ 200", r.status_code == 200)
176
+ d = r.json()
177
+ check("Strategy has strategy field", "strategy" in d, d.get("strategy"))
178
+ VALID_STRATEGIES = ["TARGETED_DEBUGGING", "SYSTEMATIC_SEARCH", "BRUTE_FORCE",
179
+ "RANDOM_EXPLORATION", "SPEC_DRIVEN", "MINIMAL_EFFORT"]
180
+ check("Strategy is a known label", d.get("strategy") in VALID_STRATEGIES, d.get("strategy"))
181
+ check("Strategy has score 0-1", 0 <= d.get("score", -1) <= 1, str(d.get("score")))
182
+ check("Strategy has exploration_ratio", "exploration_ratio" in d)
183
+ check("Strategy has sub_patterns list", isinstance(d.get("sub_patterns"), list))
184
+
185
+
186
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
187
+ section("9. INTELLIGENCE โ€” ADVANCED METRICS")
188
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
189
+
190
+ r = requests.get(f"{BASE}/advanced-metrics")
191
+ check("GET /advanced-metrics โ†’ 200", r.status_code == 200)
192
+ d = r.json()
193
+ expected_keys = ["reasoning_efficiency", "exploration_ratio", "decision_entropy",
194
+ "reliability_index", "pivot_rate", "wasteful_ratio", "consistency_score"]
195
+ for key in expected_keys:
196
+ check(f" advanced-metrics has '{key}'", key in d, str(d.get(key, "MISSING")))
197
+ check("reliability_index in [0,1]", 0 <= d.get("reliability_index", -1) <= 1)
198
+ check("action_distribution is dict", isinstance(d.get("action_distribution"), dict))
199
+
200
+
201
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
202
+ section("10. INTELLIGENCE โ€” IMPROVEMENT PLAN")
203
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
204
+
205
+ r = requests.get(f"{BASE}/improvement-plan")
206
+ check("GET /improvement-plan โ†’ 200", r.status_code == 200)
207
+ d = r.json()
208
+ check("Plan has failure_type", "failure_type" in d, d.get("failure_type"))
209
+ check("Plan has what_went_wrong", bool(d.get("what_went_wrong")))
210
+ check("Plan has improved_strategy", bool(d.get("improved_strategy")))
211
+ check("Plan has step_by_step_plan list", isinstance(d.get("step_by_step_plan"), list))
212
+ check("Plan step_by_step_plan not empty", len(d.get("step_by_step_plan", [])) > 0)
213
+ check("Plan has system_prompt_addon", "system_prompt_addon" in d)
214
+
215
+
216
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
217
+ section("11. MULTI-AGENT COMPARISON")
218
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
219
+
220
+ r = requests.post(f"{BASE}/compare-agents?task=task1&agents=test-first,minimal")
221
+ check("POST /compare-agents (2 agents) โ†’ 200", r.status_code == 200, f"status={r.status_code}")
222
+ if r.status_code == 200:
223
+ d = r.json()
224
+ check("Comparison has winner", "winner" in d, d.get("winner"))
225
+ check("Comparison has summary_table", "summary_table" in d)
226
+ check("Summary table has 2 rows", len(d.get("summary_table", [])) == 2,
227
+ str(len(d.get("summary_table", []))))
228
+ check("Each row has score/steps/strategy", all(
229
+ "score" in row and "steps" in row and "strategy" in row
230
+ for row in d.get("summary_table", [])
231
+ ))
232
+ check("Comparison has insights", "insights" in d)
233
+ check("Comparison has detailed_runs", len(d.get("detailed_runs", [])) == 2)
234
+
235
+ # Test all 4 agents
236
+ r = requests.post(f"{BASE}/compare-agents?task=task1")
237
+ check("POST /compare-agents (all agents) โ†’ 200", r.status_code == 200)
238
+ if r.status_code == 200:
239
+ d = r.json()
240
+ check("All 4 agents ran", len(d.get("summary_table", [])) == 4,
241
+ f"rows={len(d.get('summary_table',[]))}")
242
+
243
+
244
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
245
+ section("12. 3D VISUALIZATION DATA")
246
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
247
+
248
+ # Run a full episode first for viz data
249
+ requests.post(f"{BASE}/reset?task=task1")
250
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
251
+ requests.post(f"{BASE}/step", json={"action_type": "submit"})
252
+
253
+ r = requests.get(f"{BASE}/viz-data")
254
+ check("GET /viz-data โ†’ 200", r.status_code == 200)
255
+ d = r.json()
256
+ check("Viz-data has files array", isinstance(d.get("files"), list), f"len={len(d.get('files',[]))}")
257
+ check("Viz-data files > 0", len(d.get("files", [])) > 0)
258
+ check("Viz-data has dependencies", isinstance(d.get("dependencies"), list))
259
+ check("Viz-data has steps", isinstance(d.get("steps"), list))
260
+ check("Viz-data has strategy", "strategy" in d, d.get("strategy"))
261
+ check("Viz-data has final_score", "final_score" in d)
262
+ if d.get("files"):
263
+ f = d["files"][0]
264
+ check("File node has name/type/is_bug_file", all(k in f for k in ["name","type","is_bug_file"]))
265
+
266
+
267
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
268
+ section("13. INVALID ACTION HANDLING")
269
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
270
+
271
+ requests.post(f"{BASE}/reset?task=task1")
272
+
273
+ # Invalid task
274
+ r = requests.post(f"{BASE}/reset?task=task99")
275
+ check("Invalid task โ†’ 400", r.status_code == 400)
276
+
277
+ # Invalid action type
278
+ r = requests.post(f"{BASE}/step", json={"action_type": "hack_system"})
279
+ check("Invalid action_type โ†’ 400 or 422", r.status_code in (400, 422))
280
+
281
+ # Non-existent file
282
+ r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": "non_existent.py"})
283
+ check("Read non-existent file โ†’ 200 with error", r.status_code == 200)
284
+ obs = r.json().get("observation", {})
285
+ check("Non-existent file has error in obs", bool(obs.get("last_action_error")), obs.get("last_action_error","")[:60])
286
+
287
+
288
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
289
+ section("14. SECURITY SCANNING")
290
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
291
+
292
+ requests.post(f"{BASE}/reset?task=task1")
293
+ # Try to write a file with dangerous code
294
+ r = requests.post(f"{BASE}/step", json={
295
+ "action_type": "write_file",
296
+ "path": src_files[0] if src_files else "src/hack.py",
297
+ "content": "import os\nos.system('rm -rf /')\n"
298
+ })
299
+ check("Write dangerous code โ†’ 200", r.status_code == 200)
300
+ if r.status_code == 200:
301
+ info = r.json().get("info", {})
302
+ flags = info.get("security_flags", [])
303
+ check("Security flags populated for os.system", len(flags) > 0, str(flags[:2]))
304
+
305
+
306
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
307
+ section("15. GRADIO UI ENDPOINTS")
308
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
309
+
310
+ r = requests.get(f"{BASE}/")
311
+ check("GET / (Gradio UI) โ†’ 200", r.status_code == 200)
312
+ check("Response is HTML", "text/html" in r.headers.get("content-type", ""))
313
+
314
+ r = requests.get(f"{BASE}/static/viz3d.html")
315
+ check("GET /static/viz3d.html โ†’ 200", r.status_code == 200)
316
+ check("viz3d.html is HTML", "html" in r.text.lower()[:200])
317
+ check("viz3d.html has Three.js", "three" in r.text.lower())
318
+ check("viz3d.html has timeline-slider", "timeline-slider" in r.text)
319
+
320
+
321
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
322
+ section("16. TASK2 & TASK3 FULL EPISODE")
323
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
324
+
325
+ for task in ["task2", "task3"]:
326
+ r = requests.post(f"{BASE}/reset?task={task}")
327
+ check(f"{task} reset โ†’ 200", r.status_code == 200)
328
+ obs = r.json()["observation"]
329
+ tree = obs["repo_tree"]
330
+ tf = [f for f in tree if f.startswith("tests/")]
331
+ sf = [f for f in tree if f.startswith("src/")]
332
+ md = [f for f in tree if f.endswith(".md")]
333
+
334
+ if task == "task3" and md:
335
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": md[0]})
336
+ if tf:
337
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": tf[0]})
338
+ if sf:
339
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": sf[0]})
340
+
341
+ r = requests.post(f"{BASE}/step", json={"action_type": "submit"})
342
+ check(f"{task} submit โ†’ done", r.json().get("done") == True)
343
+
344
+ # Verify all intelligence endpoints work post-episode
345
+ r = requests.get(f"{BASE}/classify")
346
+ check(f"{task} /classify works", r.status_code == 200 and "primary_failure" in r.json())
347
+ r = requests.get(f"{BASE}/strategy")
348
+ check(f"{task} /strategy works", r.status_code == 200 and "strategy" in r.json())
349
+
350
+
351
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€๏ฟฝ๏ฟฝโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
352
+ section("17. CONSISTENCY โ€” 3 RUNS SAME TASK")
353
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
354
+
355
+ scores = []
356
+ for i in range(3):
357
+ requests.post(f"{BASE}/reset?task=task1")
358
+ r = requests.get(f"{BASE}/state")
359
+ tree = r.json()["observation"]["repo_tree"]
360
+ tf = [f for f in tree if f.startswith("tests/")]
361
+ if tf:
362
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": tf[0]})
363
+ requests.post(f"{BASE}/step", json={"action_type": "submit"})
364
+ metrics = requests.get(f"{BASE}/advanced-metrics").json()
365
+ scores.append(requests.get(f"{BASE}/evaluate").json().get("composite_score", 0))
366
+
367
+ check("3 runs completed", len(scores) == 3, str(scores))
368
+ check("All runs have valid scores", all(0 <= s <= 1 for s in scores), str(scores))
369
+
370
+ # Consistency metric
371
+ r = requests.get(f"{BASE}/advanced-metrics")
372
+ d = r.json()
373
+ check("Consistency score populated after multiple runs", d.get("runs_analyzed", 0) >= 1,
374
+ f"runs={d.get('runs_analyzed')}, consistency={d.get('consistency_score'):.3f}")
375
+
376
+
377
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
378
+ print(f"\n{'โ•'*60}")
379
+ print(f" E2E RESULTS: {PASS} passed | {FAIL} failed | {PASS+FAIL} total")
380
+ print(f" Score: {PASS/(PASS+FAIL)*100:.1f}%")
381
+ print(f"{'โ•'*60}")
382
+
383
+ if FAIL > 0:
384
+ print("\nFailed tests:")
385
+ for r in RESULTS:
386
+ if not r["passed"]:
387
+ print(f" โŒ {r['name']}: {r['detail']}")
388
+
389
+ sys.exit(0 if FAIL == 0 else 1)
inference.py CHANGED
@@ -17,9 +17,13 @@ from openai import OpenAI
17
  import httpx
18
 
19
  # โ”€โ”€ Configuration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
20
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
21
- API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
22
- MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
 
 
 
 
23
  ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
24
 
25
  MAX_STEPS_PER_TASK = {"task1": 12, "task2": 18, "task3": 22}
@@ -229,7 +233,7 @@ def run_task(env_client: EnvClient, llm_client: OpenAI, task: str) -> tuple:
229
 
230
  def main():
231
  env_client = EnvClient(ENV_BASE_URL)
232
- llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
233
 
234
  all_scores = []
235
  for task in TASKS:
 
17
  import httpx
18
 
19
  # โ”€โ”€ Configuration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
20
+ API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
21
+ MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
22
+ HF_TOKEN = os.getenv("HF_TOKEN")
23
+
24
+ # Optional โ€” if you use from_docker_image():
25
+ LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
26
+
27
  ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
28
 
29
  MAX_STEPS_PER_TASK = {"task1": 12, "task2": 18, "task3": 22}
 
233
 
234
  def main():
235
  env_client = EnvClient(ENV_BASE_URL)
236
+ llm_client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
237
 
238
  all_scores = []
239
  for task in TASKS:
server/analytics_engine.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/analytics_engine.py
2
+ """
3
+ Unified Analytics Engine โ€” v4.0
4
+
5
+ Aggregates ALL scoring dimensions into a single research-grade report.
6
+ Produces:
7
+ - Reasoning graph (structured DAG of the agent's decision process)
8
+ - Root cause analysis (why the agent failed at every level)
9
+ - Decision efficiency score
10
+ - Overall AI reliability profile (radar chart data)
11
+ - Paper-ready JSON suitable for arXiv submission
12
+
13
+ This module is the "top of the stack" โ€” it calls all other engines
14
+ and synthesizes their outputs into one authoritative report.
15
+ """
16
+ from __future__ import annotations
17
+ import time
18
+ import json
19
+ from typing import List, Dict, Any, Optional, Tuple
20
+ from dataclasses import dataclass, field
21
+
22
+
23
+ @dataclass
24
+ class ReasoningNode:
25
+ """One node in the agent's reconstructed reasoning graph."""
26
+ node_id: str
27
+ step_number: int
28
+ action_type: str
29
+ target: Optional[str] # file path or search query
30
+ reward: float
31
+ was_useful: bool
32
+ connected_to: List[str] # IDs of subsequent nodes that built on this
33
+ label: str # Human-readable description
34
+
35
+
36
+ @dataclass
37
+ class ReasoningGraph:
38
+ """
39
+ A directed graph reconstruction of the agent's thought process.
40
+
41
+ Nodes = actions taken.
42
+ Edges = "built on" relationships (e.g., write followed a read = used info from read).
43
+ Clusters = logical reasoning phases (Exploration, Hypothesis, Verification, Commit)
44
+ """
45
+ nodes: List[ReasoningNode]
46
+ phases: Dict[str, List[str]] # phase_name โ†’ [node_ids]
47
+ critical_path: List[str] # node_ids on the most impactful path
48
+ wasted_nodes: List[str] # node_ids that contributed nothing
49
+ optimal_path_comparison: Optional[str] # What should the agent have done
50
+
51
+ def to_dict(self) -> dict:
52
+ return {
53
+ "nodes": [
54
+ {
55
+ "id": n.node_id, "step": n.step_number,
56
+ "action": n.action_type, "target": n.target,
57
+ "reward": round(n.reward, 3), "useful": n.was_useful,
58
+ "connects_to": n.connected_to, "label": n.label,
59
+ }
60
+ for n in self.nodes
61
+ ],
62
+ "phases": self.phases,
63
+ "critical_path": self.critical_path,
64
+ "wasted_nodes": self.wasted_nodes,
65
+ "optimal_path": self.optimal_path_comparison,
66
+ }
67
+
68
+
69
+ @dataclass
70
+ class AnalyticsReport:
71
+ """
72
+ The master analytics report โ€” synthesizes all evaluation dimensions.
73
+ Paper-ready, structured for research publication or leaderboard submission.
74
+ """
75
+ report_id: str
76
+ episode_id: str
77
+ task: str
78
+ variant_id: str
79
+ generated_at: float
80
+
81
+ # Dimension scores (0.0โ€“1.0 each)
82
+ correctness_score: float # Did it fix the bug?
83
+ causal_score: float # Did it understand WHY?
84
+ robustness_score: float # Is the strategy resilient?
85
+ calibration_score: float # Was it appropriately confident?
86
+ reliability_index: float # Weighted multi-dim score
87
+ generalization_hint: float # Based on strategy (robust strategies generalize better)
88
+ decision_efficiency: float # Score / Steps ratio (normalized)
89
+ process_quality: float # How structured was the reasoning process?
90
+
91
+ # Composite
92
+ composite_score: float # Weighted aggregate of all dimensions
93
+
94
+ # Graph
95
+ reasoning_graph: ReasoningGraph
96
+
97
+ # Root cause trees
98
+ failure_root_causes: List[Dict] # Each: {cause, effect, evidence, depth}
99
+
100
+ # Alternative path analysis
101
+ what_agent_did: List[str]
102
+ what_agent_should_have_done: List[str]
103
+ steps_wasted: int
104
+ steps_optimal: int
105
+
106
+ # Profile tags
107
+ profile_tags: List[str] # e.g., ["OVERCONFIDENT", "SHORTCUT_LEARNER", "WELL_CALIBRATED"]
108
+
109
+ # Executive summary
110
+ executive_summary: str
111
+ researcher_notes: str # More technical deep dive
112
+
113
+ def to_dict(self) -> dict:
114
+ return {
115
+ "report_id": self.report_id,
116
+ "episode_id": self.episode_id,
117
+ "task": self.task,
118
+ "variant_id": self.variant_id,
119
+ "generated_at": self.generated_at,
120
+ "dimension_scores": {
121
+ "correctness": round(self.correctness_score, 3),
122
+ "causal_reasoning": round(self.causal_score, 3),
123
+ "robustness": round(self.robustness_score, 3),
124
+ "calibration": round(self.calibration_score, 3),
125
+ "reliability_index": round(self.reliability_index, 3),
126
+ "generalization": round(self.generalization_hint, 3),
127
+ "decision_efficiency": round(self.decision_efficiency, 3),
128
+ "process_quality": round(self.process_quality, 3),
129
+ "composite": round(self.composite_score, 3),
130
+ },
131
+ "reasoning_graph": self.reasoning_graph.to_dict(),
132
+ "failure_root_causes": self.failure_root_causes,
133
+ "alternative_paths": {
134
+ "what_agent_did": self.what_agent_did,
135
+ "optimal_path": self.what_agent_should_have_done,
136
+ "steps_wasted": self.steps_wasted,
137
+ "steps_optimal": self.steps_optimal,
138
+ },
139
+ "profile_tags": self.profile_tags,
140
+ "executive_summary": self.executive_summary,
141
+ "researcher_notes": self.researcher_notes,
142
+ }
143
+
144
+ def render_text(self) -> str:
145
+ """Render a human-readable analytics report."""
146
+ def bar(v: float, width: int = 20) -> str:
147
+ filled = int(v * width)
148
+ return "โ–ˆ" * filled + "โ–‘" * (width - filled)
149
+
150
+ lines = [
151
+ f"{'โ•'*70}",
152
+ f" ๐Ÿ“ˆ ANALYTICS ENGINE REPORT โ€” {self.task} | {self.variant_id}",
153
+ f" Episode: {self.episode_id}",
154
+ f"{'โ•'*70}",
155
+ "",
156
+ "โ”Œโ”€ DIMENSION SCORES โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€",
157
+ f"โ”‚ Correctness [{bar(self.correctness_score)}] {self.correctness_score:.3f}",
158
+ f"โ”‚ Causal Reasoning [{bar(self.causal_score)}] {self.causal_score:.3f}",
159
+ f"โ”‚ Robustness [{bar(self.robustness_score)}] {self.robustness_score:.3f}",
160
+ f"โ”‚ Calibration [{bar(self.calibration_score)}] {self.calibration_score:.3f}",
161
+ f"โ”‚ Reliability [{bar(self.reliability_index)}] {self.reliability_index:.3f}",
162
+ f"โ”‚ Decision Effic. [{bar(self.decision_efficiency)}] {self.decision_efficiency:.3f}",
163
+ f"โ”‚ Process Quality [{bar(self.process_quality)}] {self.process_quality:.3f}",
164
+ f"โ”‚ {'โ”€'*60}",
165
+ f"โ”‚ COMPOSITE [{bar(self.composite_score)}] {self.composite_score:.3f}",
166
+ "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€",
167
+ "",
168
+ ]
169
+
170
+ if self.profile_tags:
171
+ lines.append(f"๐Ÿท๏ธ Profile: {' | '.join(self.profile_tags)}")
172
+ lines.append("")
173
+
174
+ lines += [
175
+ "๐Ÿ“ Executive Summary",
176
+ f" {self.executive_summary}",
177
+ "",
178
+ ]
179
+
180
+ if self.failure_root_causes:
181
+ lines.append("๐Ÿ”ฅ Failure Root Cause Analysis")
182
+ for rc in self.failure_root_causes[:3]:
183
+ lines.append(f" Cause: {rc.get('cause')}")
184
+ lines.append(f" Effect: {rc.get('effect')}")
185
+ lines.append(f" Fix: {rc.get('remediation')}")
186
+ lines.append("")
187
+
188
+ lines += [
189
+ "๐Ÿ—บ๏ธ What Agent Did vs Optimal",
190
+ f" Steps taken: {len(self.what_agent_did)} | Steps optimal: {self.steps_optimal} | Wasted: {self.steps_wasted}",
191
+ ]
192
+ for a, o in zip(
193
+ self.what_agent_did[:5],
194
+ self.what_agent_should_have_done[:5],
195
+ ):
196
+ prefix_a = " โœ“" if a == o else " โœ—"
197
+ lines.append(f" Agent: {a}")
198
+ lines.append(f" Optimal: {o}")
199
+ lines.append("")
200
+
201
+ if self.researcher_notes:
202
+ lines += ["๐Ÿ”ฌ Researcher Notes", f" {self.researcher_notes}", ""]
203
+
204
+ lines.append(f"{'โ•'*70}")
205
+ return "\n".join(lines)
206
+
207
+
208
+ class AnalyticsEngine:
209
+ """
210
+ Master analytics engine โ€” integrates all evaluation modules.
211
+
212
+ Call .analyze() after an episode to get the full AnalyticsReport.
213
+ """
214
+
215
+ def analyze(
216
+ self,
217
+ env,
218
+ causal_report=None,
219
+ counterfactual_report=None,
220
+ calibration_report=None,
221
+ advanced_metrics=None,
222
+ failure_report=None,
223
+ strategy_report=None,
224
+ ) -> AnalyticsReport:
225
+ """
226
+ Synthesize all evaluation outputs into one AnalyticsReport.
227
+ Each sub-report is optional โ€” we gracefully handle None.
228
+ """
229
+ import uuid
230
+
231
+ traj = env.get_trajectory()
232
+ steps = traj.get("steps", []) if traj else []
233
+ meta = env.variant.meta if env.variant else {}
234
+ episode_id = traj.get("episode_id", "unknown") if traj else "unknown"
235
+ variant_id = traj.get("variant_id", "unknown") if traj else "unknown"
236
+ task = env.current_task or "unknown"
237
+ final_score = env.final_score
238
+ files_read = list(env.files_read)
239
+ files_written = list(env.files_written)
240
+
241
+ # โ”€โ”€ Run sub-engines if reports not provided โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
242
+ if causal_report is None:
243
+ from server.causal_probe import CausalProbe
244
+ causal_report = CausalProbe().probe(
245
+ episode_id, task, steps, meta, files_read, files_written, final_score
246
+ )
247
+ if counterfactual_report is None:
248
+ from server.counterfactual_engine import CounterfactualEngine
249
+ counterfactual_report = CounterfactualEngine().analyze(
250
+ episode_id, task, steps, meta, files_read, files_written, final_score
251
+ )
252
+ if calibration_report is None:
253
+ from server.confidence_calibrator import ConfidenceCalibrator
254
+ calibration_report = ConfidenceCalibrator().calibrate(
255
+ episode_id, task, steps, final_score
256
+ )
257
+ if advanced_metrics is None:
258
+ from server.advanced_metrics import AdvancedMetricsEngine
259
+ advanced_metrics = AdvancedMetricsEngine().compute(
260
+ steps, meta, final_score, files_read, files_written
261
+ )
262
+ if failure_report is None:
263
+ from server.failure_classifier import FailureClassifier
264
+ failure_report = FailureClassifier().classify(
265
+ episode_id, task, steps, meta, files_read, files_written, final_score
266
+ )
267
+ if strategy_report is None:
268
+ from server.strategy_detector import StrategyDetector
269
+ strategy_report = StrategyDetector().detect(
270
+ steps, task, meta, files_read, final_score
271
+ )
272
+
273
+ # โ”€โ”€ Compute derived scores โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
274
+ causal_score = causal_report.causal_score
275
+ robustness_score = counterfactual_report.robustness_score
276
+ calibration_score = calibration_report.calibration_score
277
+ reliability_index = advanced_metrics.reliability_index
278
+ correctness_score = final_score
279
+
280
+ # Decision efficiency: correctness per step, normalized
281
+ total_steps = max(len(steps), 1)
282
+ max_steps_possible = meta.get("max_steps", 20)
283
+ decision_efficiency = (
284
+ final_score /
285
+ max(1.0, total_steps / max(1, max_steps_possible / 3))
286
+ )
287
+ decision_efficiency = min(1.0, decision_efficiency)
288
+
289
+ # Process quality: measures structural quality of reasoning process
290
+ read_before_write = causal_report.read_before_write
291
+ tested_before_submit = causal_report.submit_after_test
292
+ used_search = causal_report.search_before_navigate
293
+ full_chain = causal_report.actual_chain_coverage
294
+ process_quality = (
295
+ (0.25 if read_before_write else 0.0) +
296
+ (0.25 if tested_before_submit else 0.0) +
297
+ (0.20 if used_search else 0.0) +
298
+ full_chain * 0.30
299
+ )
300
+
301
+ # Generalization hint from strategy robustness
302
+ strategy_generalization_map = {
303
+ "TARGETED_DEBUGGING": 0.75,
304
+ "SYSTEMATIC_SEARCH": 0.70,
305
+ "SPEC_DRIVEN": 0.80,
306
+ "BRUTE_FORCE": 0.40,
307
+ "RANDOM_EXPLORATION": 0.30,
308
+ "MINIMAL_EFFORT": 0.20,
309
+ }
310
+ generalization_hint = strategy_generalization_map.get(strategy_report.strategy, 0.5)
311
+ generalization_hint = (generalization_hint + robustness_score) / 2
312
+
313
+ # Composite (research-grade weighted aggregate)
314
+ composite_score = (
315
+ correctness_score * 0.30 +
316
+ causal_score * 0.20 +
317
+ robustness_score * 0.15 +
318
+ calibration_score * 0.12 +
319
+ reliability_index * 0.10 +
320
+ process_quality * 0.08 +
321
+ decision_efficiency * 0.05
322
+ )
323
+
324
+ # โ”€โ”€ Build reasoning graph โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
325
+ reasoning_graph = self._build_reasoning_graph(steps, meta, files_read, files_written)
326
+
327
+ # โ”€โ”€ Root cause analysis โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
328
+ root_causes = self._build_root_cause_tree(
329
+ failure_report, causal_report, calibration_report, final_score
330
+ )
331
+
332
+ # โ”€โ”€ Alternative path analysis โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
333
+ what_did = [
334
+ f"{s.get('action_type')} {s.get('action_path') or s.get('action_query') or ''}".strip()
335
+ for s in steps
336
+ ]
337
+ optimal = self._compute_optimal_path(meta, files_read, files_written, final_score)
338
+ steps_wasted = max(0, total_steps - len(optimal))
339
+
340
+ # โ”€โ”€ Profile tags โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
341
+ tags = []
342
+ if calibration_report.profile.value != "WELL_CALIBRATED":
343
+ tags.append(calibration_report.profile.value)
344
+ if causal_report.shortcut_learning_detected:
345
+ tags.append("SHORTCUT_LEARNER")
346
+ if causal_report.false_confidence_detected:
347
+ tags.append("FALSE_CONFIDENCE")
348
+ if counterfactual_report.brittleness_level.value in ("BRITTLE", "FRAGILE"):
349
+ tags.append(f"BRITTLE_STRATEGY_{counterfactual_report.brittleness_level.value}")
350
+ if causal_report.understanding_level.value == "DEEP":
351
+ tags.append("DEEP_REASONER")
352
+ if strategy_report.strategy == "TARGETED_DEBUGGING":
353
+ tags.append("TARGETED_DEBUGGER")
354
+ if not tags:
355
+ tags.append("TYPICAL")
356
+
357
+ # โ”€โ”€ Executive summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
358
+ summary_parts = [
359
+ f"Agent scored {final_score:.2f} on {task}.",
360
+ f"Causal understanding: {causal_report.understanding_level.value} ({causal_score:.2f}).",
361
+ f"Strategy: {strategy_report.strategy} (robustness: {robustness_score:.2f}).",
362
+ f"Confidence calibration: {calibration_report.profile.value} (error: {calibration_report.expected_calibration_error:.2f}).",
363
+ f"Composite reliability: {composite_score:.2f}.",
364
+ ]
365
+ executive_summary = " ".join(summary_parts)
366
+
367
+ # โ”€โ”€ Researcher notes โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
368
+ researcher_notes = (
369
+ f"Observed {total_steps} steps ({steps_wasted} wasted vs estimated {len(optimal)} optimal). "
370
+ f"Chain coverage: {causal_report.actual_chain_coverage:.0%}. "
371
+ f"Chain order score: {causal_report.chain_order_score:.2f}. "
372
+ f"Counterfactual mutations survived: {counterfactual_report.mutations_survived}/{len(counterfactual_report.mutations_tested)}. "
373
+ f"Expected calibration error: {calibration_report.expected_calibration_error:.3f}. "
374
+ f"Decision efficiency: {decision_efficiency:.3f}. "
375
+ f"Process quality: {process_quality:.3f}."
376
+ )
377
+
378
+ return AnalyticsReport(
379
+ report_id=f"ar_{uuid.uuid4().hex[:10]}",
380
+ episode_id=episode_id,
381
+ task=task,
382
+ variant_id=variant_id,
383
+ generated_at=time.time(),
384
+ correctness_score=correctness_score,
385
+ causal_score=causal_score,
386
+ robustness_score=robustness_score,
387
+ calibration_score=calibration_score,
388
+ reliability_index=reliability_index,
389
+ generalization_hint=generalization_hint,
390
+ decision_efficiency=decision_efficiency,
391
+ process_quality=process_quality,
392
+ composite_score=composite_score,
393
+ reasoning_graph=reasoning_graph,
394
+ failure_root_causes=root_causes,
395
+ what_agent_did=what_did,
396
+ what_agent_should_have_done=optimal,
397
+ steps_wasted=steps_wasted,
398
+ steps_optimal=len(optimal),
399
+ profile_tags=tags,
400
+ executive_summary=executive_summary,
401
+ researcher_notes=researcher_notes,
402
+ )
403
+
404
+ def _build_reasoning_graph(
405
+ self,
406
+ steps: List[dict],
407
+ meta: dict,
408
+ files_read: List[str],
409
+ files_written: List[str],
410
+ ) -> ReasoningGraph:
411
+ """Build a DAG from the trajectory steps."""
412
+ bug_files = set(meta.get("bug_files", []) + meta.get("files_to_implement", []))
413
+
414
+ nodes: List[ReasoningNode] = []
415
+ phases: Dict[str, List[str]] = {
416
+ "Exploration": [], "Hypothesis": [], "Verification": [], "Commit": []
417
+ }
418
+ files_read_set = set()
419
+ last_useful_node_id: Optional[str] = None
420
+ all_node_ids: List[str] = []
421
+
422
+ for s in steps:
423
+ node_id = f"n{s.get('step_number', len(nodes)+1)}"
424
+ atype = s.get("action_type", "unknown")
425
+ target = s.get("action_path") or s.get("action_query")
426
+ reward = s.get("reward", 0.0)
427
+
428
+ # Determine usefulness
429
+ was_useful = (
430
+ reward > 0 or
431
+ (atype == "read_file" and target in bug_files) or
432
+ (atype == "search_code") or
433
+ (atype == "run_tests") or
434
+ (atype == "submit" and reward > 0)
435
+ )
436
+
437
+ # Determine phase
438
+ if atype in ("read_file", "search_code"):
439
+ phase = "Exploration"
440
+ elif atype == "write_file":
441
+ phase = "Hypothesis"
442
+ elif atype == "run_tests":
443
+ phase = "Verification"
444
+ else:
445
+ phase = "Commit"
446
+
447
+ # Build label
448
+ short_target = (target.split("/")[-1] if target else "")[:20] if target else ""
449
+ label = f"{atype}({short_target})" if short_target else atype
450
+
451
+ # Connections: link to previous useful node
452
+ connects_to = [last_useful_node_id] if last_useful_node_id and was_useful else []
453
+ connects_to = [c for c in connects_to if c]
454
+
455
+ node = ReasoningNode(
456
+ node_id=node_id,
457
+ step_number=s.get("step_number", len(nodes) + 1),
458
+ action_type=atype,
459
+ target=target,
460
+ reward=reward,
461
+ was_useful=was_useful,
462
+ connected_to=connects_to,
463
+ label=label,
464
+ )
465
+ nodes.append(node)
466
+ phases[phase].append(node_id)
467
+ all_node_ids.append(node_id)
468
+ if was_useful:
469
+ last_useful_node_id = node_id
470
+
471
+ # Critical path: nodes with positive reward or that led to the final submit
472
+ critical_path = [n.node_id for n in nodes if n.reward > 0 or n.action_type == "submit"]
473
+ wasted_nodes = [n.node_id for n in nodes if not n.was_useful and n.action_type != "submit"]
474
+
475
+ # Optimal path comparison
476
+ optimal_actions = []
477
+ test_files = [f for f in (list(files_read) + list(bug_files)) if "test" in f.lower()]
478
+ src_files = [f for f in (list(files_read) + list(bug_files)) if f not in test_files]
479
+ for tf in test_files[:1]:
480
+ optimal_actions.append(f"read_file({tf.split('/')[-1]})")
481
+ for sf in src_files[:2]:
482
+ optimal_actions.append(f"read_file({sf.split('/')[-1]})")
483
+ optimal_actions += ["write_file(src)", "run_tests", "submit"]
484
+ optimal_path = " โ†’ ".join(optimal_actions)
485
+
486
+ return ReasoningGraph(
487
+ nodes=nodes,
488
+ phases={k: v for k, v in phases.items() if v},
489
+ critical_path=critical_path,
490
+ wasted_nodes=wasted_nodes,
491
+ optimal_path_comparison=optimal_path,
492
+ )
493
+
494
+ def _build_root_cause_tree(
495
+ self, failure_report, causal_report, calibration_report, final_score: float
496
+ ) -> List[Dict]:
497
+ """Build a structured root cause tree."""
498
+ causes = []
499
+
500
+ if failure_report and failure_report.failures:
501
+ for f in failure_report.failures[:3]:
502
+ causes.append({
503
+ "depth": "primary",
504
+ "cause": f.failure_type if hasattr(f, "failure_type") else str(f),
505
+ "effect": f.evidence if hasattr(f, "evidence") else "unknown",
506
+ "remediation": f.remediation if hasattr(f, "remediation") else "See improvement plan",
507
+ })
508
+ elif final_score < 0.5:
509
+ causes.append({
510
+ "depth": "primary",
511
+ "cause": failure_report.primary_failure if failure_report else "LOW_SCORE",
512
+ "effect": f"Final score only {final_score:.2f} โ€” bug not adequately fixed",
513
+ "remediation": "Use test-first navigation and verify with run_tests",
514
+ })
515
+
516
+ if causal_report and causal_report.guessing_indicators:
517
+ for ind in causal_report.guessing_indicators[:2]:
518
+ causes.append({
519
+ "depth": "secondary",
520
+ "cause": "CAUSAL_GAP",
521
+ "effect": ind,
522
+ "remediation": causal_report.recommendations[0] if causal_report.recommendations else "",
523
+ })
524
+
525
+ if calibration_report and calibration_report.profile.value == "OVERCONFIDENT":
526
+ causes.append({
527
+ "depth": "secondary",
528
+ "cause": "OVERCONFIDENCE",
529
+ "effect": f"Inferred confidence {calibration_report.inferred_confidence:.2f} vs actual {calibration_report.actual_performance:.2f}",
530
+ "remediation": "Read more before committing. Verify with tests.",
531
+ })
532
+
533
+ return causes
534
+
535
+ def _compute_optimal_path(
536
+ self, meta: dict, files_read: List[str], files_written: List[str], score: float
537
+ ) -> List[str]:
538
+ """Suggest what the optimal action sequence would have been."""
539
+ test_files = [f for f in files_read if "test" in f.lower()]
540
+ bug_files = meta.get("bug_files", []) or meta.get("files_to_implement", [])
541
+
542
+ path = []
543
+ for tf in (test_files or ["tests/test_main.py"])[:1]:
544
+ path.append(f"read_file {tf}")
545
+ for bf in (bug_files or ["src/main.py"])[:2]:
546
+ path.append(f"read_file {bf}")
547
+ path.append("search_code <function_name>")
548
+ path.append("write_file <targeted_fix>")
549
+ path.append("run_tests")
550
+ path.append("submit")
551
+ return path
server/app.py CHANGED
@@ -1,12 +1,14 @@
1
  # server/app.py
2
  """
3
- FastAPI server โ€” v3.0
4
 
5
  Core endpoints: POST /reset, POST /step, GET /state, GET /health
6
  Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
7
  Control endpoints: POST /fault-config
8
- Intelligence endpoints: GET /classify, GET /strategy, GET /advanced-metrics,
9
- POST /compare-agents, GET /improvement-plan, GET /viz-data
 
 
10
  """
11
  from fastapi import FastAPI, HTTPException
12
  from fastapi.staticfiles import StaticFiles
@@ -337,3 +339,146 @@ async def get_viz_data():
337
  "dependencies": deps,
338
  "steps": steps_data,
339
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # server/app.py
2
  """
3
+ FastAPI server โ€” v4.0
4
 
5
  Core endpoints: POST /reset, POST /step, GET /state, GET /health
6
  Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
7
  Control endpoints: POST /fault-config
8
+ Intelligence (v3): GET /classify, GET /strategy, GET /advanced-metrics,
9
+ POST /compare-agents, GET /improvement-plan, GET /viz-data
10
+ Research (v4 NEW): GET /causal-probe, GET /counterfactual, GET /confidence,
11
+ POST /benchmark, GET /analytics
12
  """
13
  from fastapi import FastAPI, HTTPException
14
  from fastapi.staticfiles import StaticFiles
 
339
  "dependencies": deps,
340
  "steps": steps_data,
341
  }
342
+
343
+
344
+ # โ”€โ”€ Research Endpoints (NEW in v4) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
345
+
346
+ from .causal_probe import CausalProbe
347
+ from .counterfactual_engine import CounterfactualEngine
348
+ from .confidence_calibrator import ConfidenceCalibrator
349
+ from .benchmark_runner import BenchmarkRunner
350
+ from .analytics_engine import AnalyticsEngine
351
+
352
+ _causal = CausalProbe()
353
+ _counter = CounterfactualEngine()
354
+ _calibrator = ConfidenceCalibrator()
355
+ _benchmark = BenchmarkRunner()
356
+ _analytics = AnalyticsEngine()
357
+
358
+
359
+ @app.get("/causal-probe")
360
+ async def causal_probe():
361
+ """
362
+ Causal reasoning probe โ€” did the agent understand WHY the bug exists?
363
+ Returns: causal_score, understanding_level, chain_coverage, shortcut_detection.
364
+ """
365
+ traj = env.get_trajectory()
366
+ if not traj:
367
+ return {"error": "No trajectory available."}
368
+ steps = traj.get("steps", [])
369
+ meta = env.variant.meta if env.variant else {}
370
+ report = _causal.probe(
371
+ episode_id=traj.get("episode_id", ""),
372
+ task=env.current_task or "unknown",
373
+ trajectory_steps=steps,
374
+ variant_meta=meta,
375
+ files_read=list(env.files_read),
376
+ files_written=list(env.files_written),
377
+ final_score=env.final_score,
378
+ )
379
+ return report.to_dict()
380
+
381
+
382
+ @app.get("/counterfactual")
383
+ async def counterfactual():
384
+ """
385
+ Counterfactual robustness test โ€” is the agent's strategy brittle?
386
+ Simulates 6 mutations and measures how many the strategy survives.
387
+ Returns: robustness_score, brittleness_level, mutations analysis.
388
+ """
389
+ traj = env.get_trajectory()
390
+ if not traj:
391
+ return {"error": "No trajectory available."}
392
+ steps = traj.get("steps", [])
393
+ meta = env.variant.meta if env.variant else {}
394
+ report = _counter.analyze(
395
+ episode_id=traj.get("episode_id", ""),
396
+ task=env.current_task or "unknown",
397
+ trajectory_steps=steps,
398
+ variant_meta=meta,
399
+ files_read=list(env.files_read),
400
+ files_written=list(env.files_written),
401
+ final_score=env.final_score,
402
+ )
403
+ return report.to_dict()
404
+
405
+
406
+ @app.get("/confidence")
407
+ async def confidence_calibration():
408
+ """
409
+ Confidence calibration โ€” is the agent appropriately confident?
410
+ Infers confidence from behavioral proxies and compares to actual performance.
411
+ Returns: profile (WELL_CALIBRATED|OVERCONFIDENT|UNDERCONFIDENT), calibration_score.
412
+ """
413
+ traj = env.get_trajectory()
414
+ if not traj:
415
+ return {"error": "No trajectory available."}
416
+ steps = traj.get("steps", [])
417
+ report = _calibrator.calibrate(
418
+ episode_id=traj.get("episode_id", ""),
419
+ task=env.current_task or "unknown",
420
+ trajectory_steps=steps,
421
+ final_score=env.final_score,
422
+ )
423
+ return report.to_dict()
424
+
425
+
426
+ @app.post("/benchmark")
427
+ async def run_benchmark(
428
+ tasks: str = "task1,task2",
429
+ agents: str = "all",
430
+ benchmark_id: str = None,
431
+ ):
432
+ """
433
+ Automated benchmark leaderboard.
434
+ Runs all selected agents ร— tasks. Returns ranked leaderboard.
435
+ tasks: comma-separated task IDs. agents: "all" or comma-separated strategy names.
436
+ """
437
+ task_list = [t.strip() for t in tasks.split(",") if t.strip()]
438
+ valid_tasks = ["task1", "task2", "task3"]
439
+ task_list = [t for t in task_list if t in valid_tasks]
440
+ if not task_list:
441
+ raise HTTPException(status_code=400, detail=f"tasks must be one of {valid_tasks}")
442
+
443
+ agent_list = None if agents == "all" else [a.strip() for a in agents.split(",")]
444
+
445
+ try:
446
+ report = _benchmark.run(env, tasks=task_list, agents=agent_list, benchmark_id=benchmark_id)
447
+ return report.to_dict()
448
+ except Exception as e:
449
+ raise HTTPException(status_code=500, detail=str(e))
450
+
451
+
452
+ @app.get("/analytics")
453
+ async def get_analytics():
454
+ """
455
+ Unified research-grade analytics report.
456
+ Synthesizes all v3+v4 evaluation dimensions into one report with:
457
+ reasoning graph, root cause tree, alternative paths, profile tags,
458
+ composite score, executive summary, researcher notes.
459
+ """
460
+ traj = env.get_trajectory()
461
+ if not traj:
462
+ return {"error": "No trajectory available."}
463
+ try:
464
+ report = _analytics.analyze(env)
465
+ return report.to_dict()
466
+ except Exception as e:
467
+ raise HTTPException(status_code=500, detail=str(e))
468
+
469
+
470
+ @app.get("/health")
471
+ async def health_v4():
472
+ return {
473
+ "status": "ok",
474
+ "environment": "codebase-nav-env",
475
+ "version": "4.0.0",
476
+ "endpoints": [
477
+ "/reset", "/step", "/state", "/health",
478
+ "/trajectory", "/evaluate", "/metrics", "/fault-config",
479
+ "/classify", "/strategy", "/advanced-metrics",
480
+ "/improvement-plan", "/compare-agents", "/viz-data",
481
+ "/causal-probe", "/counterfactual", "/confidence",
482
+ "/benchmark", "/analytics",
483
+ ],
484
+ }
server/benchmark_runner.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/benchmark_runner.py
2
+ """
3
+ Benchmark Runner + Leaderboard โ€” v4.0
4
+
5
+ Automatically runs ALL tasks ร— selected agent configurations and generates
6
+ a research-grade leaderboard output with per-task, per-strategy breakdowns.
7
+
8
+ Unlike existing benchmarks (SWE-bench, HumanEval) which require manual setup,
9
+ this runs end-to-end in-process with deterministic strategies.
10
+
11
+ Output format:
12
+ - Leaderboard table (ranked by composite score)
13
+ - Per-task breakdown
14
+ - Per-failure-type breakdown
15
+ - Generalization score (variance across tasks)
16
+ - Robustness score (from counterfactual engine)
17
+ - A "benchmark JSON" suitable for publishing or comparing systems
18
+ """
19
+ from __future__ import annotations
20
+ import time
21
+ import json
22
+ from typing import List, Dict, Any, Optional
23
+ from dataclasses import dataclass, field
24
+
25
+
26
+ @dataclass
27
+ class BenchmarkResult:
28
+ """Result of running one agent on one task variant."""
29
+ agent_name: str
30
+ task: str
31
+ variant_id: str
32
+ final_score: float
33
+ total_steps: int
34
+ cumulative_reward: float
35
+ duration_seconds: float
36
+ strategy: str
37
+ failure_type: str
38
+ reliability_index: float
39
+ causal_score: float
40
+ robustness_score: float
41
+ calibration_score: float
42
+ action_sequence: List[str]
43
+
44
+
45
+ @dataclass
46
+ class AgentBenchmarkSummary:
47
+ """Aggregated results for one agent across all tasks."""
48
+ agent_name: str
49
+ tasks_run: int
50
+ mean_score: float
51
+ std_score: float
52
+ generalization_score: float # 1 - std (lower variance = more generalizable)
53
+ mean_steps: float
54
+ best_task: str
55
+ worst_task: str
56
+ mean_reliability: float
57
+ mean_causal_score: float
58
+ mean_robustness_score: float
59
+ mean_calibration_score: float
60
+ dominant_strategy: str
61
+ dominant_failure: str
62
+ composite_rank_score: float # Weighted final score for leaderboard
63
+ per_task_scores: Dict[str, float]
64
+
65
+ def to_dict(self) -> dict:
66
+ return {
67
+ "agent_name": self.agent_name,
68
+ "tasks_run": self.tasks_run,
69
+ "scores": {
70
+ "mean": round(self.mean_score, 3),
71
+ "std": round(self.std_score, 3),
72
+ "generalization": round(self.generalization_score, 3),
73
+ "reliability": round(self.mean_reliability, 3),
74
+ "causal_reasoning": round(self.mean_causal_score, 3),
75
+ "robustness": round(self.mean_robustness_score, 3),
76
+ "calibration": round(self.mean_calibration_score, 3),
77
+ "composite": round(self.composite_rank_score, 3),
78
+ },
79
+ "efficiency": {
80
+ "mean_steps": round(self.mean_steps, 1),
81
+ },
82
+ "behavior": {
83
+ "dominant_strategy": self.dominant_strategy,
84
+ "dominant_failure": self.dominant_failure,
85
+ },
86
+ "per_task_scores": {k: round(v, 3) for k, v in self.per_task_scores.items()},
87
+ "best_task": self.best_task,
88
+ "worst_task": self.worst_task,
89
+ }
90
+
91
+
92
+ @dataclass
93
+ class LeaderboardReport:
94
+ """Full benchmark leaderboard."""
95
+ benchmark_id: str
96
+ tasks_evaluated: List[str]
97
+ agents_evaluated: List[str]
98
+ total_episodes: int
99
+ run_duration_seconds: float
100
+ rankings: List[AgentBenchmarkSummary]
101
+ raw_results: List[BenchmarkResult]
102
+
103
+ def to_dict(self) -> dict:
104
+ return {
105
+ "benchmark_id": self.benchmark_id,
106
+ "tasks_evaluated": self.tasks_evaluated,
107
+ "agents_evaluated": self.agents_evaluated,
108
+ "total_episodes": self.total_episodes,
109
+ "run_duration_seconds": round(self.run_duration_seconds, 2),
110
+ "leaderboard": [r.to_dict() for r in self.rankings],
111
+ "winner": self.rankings[0].agent_name if self.rankings else "none",
112
+ "insights": self._generate_insights(),
113
+ }
114
+
115
+ def _generate_insights(self) -> List[str]:
116
+ if not self.rankings:
117
+ return []
118
+ insights = []
119
+ top = self.rankings[0]
120
+ bottom = self.rankings[-1]
121
+
122
+ if top.composite_rank_score - bottom.composite_rank_score > 0.2:
123
+ insights.append(
124
+ f"Large performance gap: '{top.agent_name}' ({top.composite_rank_score:.2f}) "
125
+ f"vs '{bottom.agent_name}' ({bottom.composite_rank_score:.2f})"
126
+ )
127
+ if top.generalization_score > 0.7:
128
+ insights.append(
129
+ f"'{top.agent_name}' shows strong generalization "
130
+ f"(std={top.std_score:.3f} across {top.tasks_run} tasks)"
131
+ )
132
+ for r in self.rankings:
133
+ if r.mean_causal_score > 0.6:
134
+ insights.append(
135
+ f"'{r.agent_name}' demonstrated genuine causal reasoning "
136
+ f"(causal_score={r.mean_causal_score:.2f})"
137
+ )
138
+ strategies = [r.dominant_strategy for r in self.rankings]
139
+ if len(set(strategies)) > 1:
140
+ best_strategy = self.rankings[0].dominant_strategy
141
+ insights.append(
142
+ f"Strategy '{best_strategy}' produced the highest composite score."
143
+ )
144
+ return insights
145
+
146
+ def render_table(self) -> str:
147
+ """Render ASCII leaderboard table."""
148
+ if not self.rankings:
149
+ return "No results."
150
+
151
+ lines = [
152
+ f"{'โ•'*90}",
153
+ f" ๐Ÿ† BENCHMARK LEADERBOARD โ€” {self.benchmark_id}",
154
+ f" Tasks: {', '.join(self.tasks_evaluated)} | Agents: {len(self.agents_evaluated)} | Episodes: {self.total_episodes}",
155
+ f"{'โ•'*90}",
156
+ f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Causal':<8} {'Robust':<8} {'Calibr':<8} {'Genrz':<8} {'Steps':<7} {'Strategy'}",
157
+ f"{'โ”€'*90}",
158
+ ]
159
+ for i, r in enumerate(self.rankings):
160
+ medal = "๐Ÿฅ‡" if i == 0 else "๐Ÿฅˆ" if i == 1 else "๐Ÿฅ‰" if i == 2 else f" #{i+1}"
161
+ lines.append(
162
+ f"{medal:<5} {r.agent_name:<16} {r.mean_score:<8.3f} "
163
+ f"{r.mean_causal_score:<8.3f} {r.mean_robustness_score:<8.3f} "
164
+ f"{r.mean_calibration_score:<8.3f} {r.generalization_score:<8.3f} "
165
+ f"{r.mean_steps:<7.1f} {r.dominant_strategy}"
166
+ )
167
+ lines.append(f"{'โ•'*90}")
168
+
169
+ lines.append("\n๐Ÿ“Š Per-Task Breakdown:")
170
+ for r in self.rankings:
171
+ task_str = " | ".join(f"{t}: {s:.2f}" for t, s in sorted(r.per_task_scores.items()))
172
+ lines.append(f" {r.agent_name:<16} {task_str}")
173
+
174
+ if self._generate_insights():
175
+ lines.append("\n๐Ÿ’ก Insights:")
176
+ lines.extend(f" โ†’ {i}" for i in self._generate_insights())
177
+
178
+ return "\n".join(lines)
179
+
180
+
181
class BenchmarkRunner:
    """
    Automated benchmark runner.

    Runs each agent in AGENT_CONFIGS across each task, collecting:
    - Final score
    - All intelligence metrics (causal, counterfactual, confidence)
    - Strategy and failure classification
    - Reliability index

    Then generates a ranked leaderboard.
    """

    def run(
        self,
        env,
        tasks: Optional[List[str]] = None,
        agents: Optional[List[str]] = None,
        benchmark_id: Optional[str] = None,
        max_steps: int = 15,
    ) -> LeaderboardReport:
        """Run the full benchmark across all tasks and agents.

        Args:
            env: Environment exposing reset(task=...), step(action), done,
                final_score, get_trajectory() and variant (same contract as
                used by the rest of this module).
            tasks: Task names to evaluate; defaults to task1..task3.
            agents: Optional subset of configured agent names to run.
            benchmark_id: Identifier for this run; random when omitted.
            max_steps: Per-episode step budget before a forced submit
                (new keyword; default preserves the old hard-coded 15).

        Returns:
            LeaderboardReport with ranked summaries and raw episode results.
        """
        import uuid
        # Analysis engines are imported lazily, matching the module's
        # existing style (avoids import cycles at load time).
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine
        from server.causal_probe import CausalProbe
        from server.counterfactual_engine import CounterfactualEngine
        from server.confidence_calibrator import ConfidenceCalibrator
        # Fix: dropped the unused `from server.models import RepoAction`
        # (only _run_episode needs it, and it imports it itself).

        benchmark_id = benchmark_id or f"bench_{uuid.uuid4().hex[:8]}"
        tasks = tasks or ["task1", "task2", "task3"]
        agent_configs = self._get_agent_configs()
        if agents:
            agent_configs = {k: v for k, v in agent_configs.items() if k in agents}

        clf = FailureClassifier()
        det = StrategyDetector()
        adv = AdvancedMetricsEngine()
        causal = CausalProbe()
        counter = CounterfactualEngine()
        calibrator = ConfidenceCalibrator()

        start_time = time.time()
        all_results: List[BenchmarkResult] = []

        for task in tasks:
            for agent_name, agent_fn in agent_configs.items():
                try:
                    result = self._run_episode(
                        env, task, agent_name, agent_fn,
                        clf, det, adv, causal, counter, calibrator,
                        max_steps=max_steps,
                    )
                    all_results.append(result)
                except Exception:
                    # Don't crash the whole benchmark on one failure —
                    # record a zeroed placeholder so this agent/task pair
                    # still appears in the rankings.
                    all_results.append(BenchmarkResult(
                        agent_name=agent_name, task=task, variant_id="error",
                        final_score=0.0, total_steps=0, cumulative_reward=0.0,
                        duration_seconds=0.0, strategy="ERROR", failure_type="BENCHMARK_ERROR",
                        reliability_index=0.0, causal_score=0.0, robustness_score=0.0,
                        calibration_score=0.0, action_sequence=[],
                    ))

        total_duration = time.time() - start_time
        rankings = self._compute_rankings(all_results, tasks)

        return LeaderboardReport(
            benchmark_id=benchmark_id,
            tasks_evaluated=tasks,
            agents_evaluated=list(agent_configs.keys()),
            total_episodes=len(all_results),
            run_duration_seconds=total_duration,
            rankings=rankings,
            raw_results=all_results,
        )

    def _run_episode(
        self, env, task, agent_name, agent_fn,
        clf, det, adv, causal, counter, calibrator,
        max_steps: int = 15,
    ) -> BenchmarkResult:
        """Run one agent/task episode and collect every metric for it."""
        from server.models import RepoAction

        reset_result = env.reset(task=task)
        obs = reset_result.observation
        variant_id = reset_result.info.get("variant_id", "unknown")
        context = {}

        obs_dict = obs.model_dump()
        start = time.time()
        cumulative_reward = 0.0
        files_read, files_written, action_sequence = [], [], []

        for step_num in range(1, max_steps + 1):
            if env.done:
                break
            action_dict = agent_fn(obs_dict, step_num, context)
            action = RepoAction(
                action_type=action_dict.get("action_type", "submit"),
                path=action_dict.get("path"),
                query=action_dict.get("query"),
            )
            result = env.step(action)
            obs = result.observation
            obs_dict = obs.model_dump()
            cumulative_reward += result.reward
            action_sequence.append(action.action_type)
            if action.path and action.action_type == "read_file":
                files_read.append(action.path)
            if action.path and action.action_type == "write_file":
                files_written.append(action.path)
            if result.done:
                break

        # Agent exhausted its budget without finishing — force a submit so
        # the episode always terminates with a final score.
        if not env.done:
            r = env.step(RepoAction(action_type="submit"))
            cumulative_reward += r.reward
            action_sequence.append("submit")

        duration = time.time() - start
        final_score = env.final_score
        traj = env.get_trajectory()
        steps = traj.get("steps", []) if traj else []
        meta = env.variant.meta if env.variant else {}

        # Intelligence metrics
        fail_r = clf.classify(
            traj.get("episode_id", "") if traj else "", task,
            steps, meta, files_read, files_written, final_score
        )
        strat_r = det.detect(steps, task, meta, files_read, final_score)
        adv_r = adv.compute(steps, meta, final_score, files_read, files_written)
        causal_r = causal.probe(
            traj.get("episode_id", "") if traj else "", task,
            steps, meta, files_read, files_written, final_score
        )
        counter_r = counter.analyze(
            traj.get("episode_id", "") if traj else "", task,
            steps, meta, files_read, files_written, final_score
        )
        calib_r = calibrator.calibrate(
            traj.get("episode_id", "") if traj else "", task,
            steps, final_score,
        )

        return BenchmarkResult(
            agent_name=agent_name,
            task=task,
            variant_id=variant_id,
            final_score=final_score,
            total_steps=len(action_sequence),
            cumulative_reward=cumulative_reward,
            duration_seconds=duration,
            strategy=strat_r.strategy,
            failure_type=fail_r.primary_failure,
            reliability_index=adv_r.reliability_index,
            causal_score=causal_r.causal_score,
            robustness_score=counter_r.robustness_score,
            calibration_score=calib_r.calibration_score,
            action_sequence=action_sequence,
        )

    def _compute_rankings(
        self, results: List[BenchmarkResult], tasks: List[str]
    ) -> List[AgentBenchmarkSummary]:
        """Aggregate per-episode results into ranked per-agent summaries."""
        import math
        from collections import Counter

        # Group by agent
        agent_results: Dict[str, List[BenchmarkResult]] = {}
        for r in results:
            agent_results.setdefault(r.agent_name, []).append(r)

        summaries = []
        for agent_name, agent_res in agent_results.items():
            scores = [r.final_score for r in agent_res]
            mean_score = sum(scores) / len(scores)
            # Population std-dev across tasks; a single episode has no spread.
            if len(scores) > 1:
                variance = sum((s - mean_score) ** 2 for s in scores) / len(scores)
                std_score = math.sqrt(variance)
            else:
                std_score = 0.0
            # Lower variance across tasks = better generalization.
            generalization_score = max(0.0, 1.0 - std_score)

            per_task = {r.task: r.final_score for r in agent_res}
            strategies = Counter(r.strategy for r in agent_res)
            failures = Counter(r.failure_type for r in agent_res)

            mean_steps = sum(r.total_steps for r in agent_res) / len(agent_res)
            mean_reliability = sum(r.reliability_index for r in agent_res) / len(agent_res)
            mean_causal = sum(r.causal_score for r in agent_res) / len(agent_res)
            mean_robustness = sum(r.robustness_score for r in agent_res) / len(agent_res)
            mean_calibration = sum(r.calibration_score for r in agent_res) / len(agent_res)

            # Composite leaderboard score — weighted across all dimensions
            composite = (
                mean_score * 0.35 +
                mean_causal * 0.20 +
                mean_robustness * 0.15 +
                mean_calibration * 0.15 +
                generalization_score * 0.15
            )

            best_task = max(per_task, key=per_task.get)
            worst_task = min(per_task, key=per_task.get)

            summaries.append(AgentBenchmarkSummary(
                agent_name=agent_name,
                tasks_run=len(agent_res),
                mean_score=mean_score,
                std_score=std_score,
                generalization_score=generalization_score,
                mean_steps=mean_steps,
                best_task=best_task,
                worst_task=worst_task,
                mean_reliability=mean_reliability,
                mean_causal_score=mean_causal,
                mean_robustness_score=mean_robustness,
                mean_calibration_score=mean_calibration,
                dominant_strategy=strategies.most_common(1)[0][0],
                dominant_failure=failures.most_common(1)[0][0],
                composite_rank_score=composite,
                per_task_scores=per_task,
            ))

        summaries.sort(key=lambda s: -s.composite_rank_score)
        return summaries

    def _get_agent_configs(self) -> Dict:
        """Reuse built-in strategies from multi_agent.py."""
        from server.multi_agent import MultiAgentComparison
        return MultiAgentComparison.AGENT_CONFIGS
server/causal_probe.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/causal_probe.py
2
+ """
3
+ Causal Reasoning Probe โ€” v4.0
4
+
5
+ The key scientific question: Did the agent understand WHY the bug exists,
6
+ or did it accidentally fix it by pattern matching?
7
+
8
+ We measure causal understanding by checking if the agent traversed the
9
+ COMPLETE causal chain: Failing test โ†’ tested function โ†’ return path โ†’ root cause.
10
+
11
+ An agent that reads only the test and immediately rewrites the function
12
+ is guessing. An agent that reads test โ†’ traces the call stack โ†’ finds the
13
+ actual cause first is reasoning causally.
14
+
15
+ This is NOT in any current benchmark. SWE-bench only checks if the test passes.
16
+ We check HOW the agent got there.
17
+ """
18
+ from __future__ import annotations
19
+ from typing import List, Dict, Any, Optional
20
+ from dataclasses import dataclass, field
21
+ from enum import Enum
22
+
23
+
24
class CausalUnderstandingLevel(str, Enum):
    """How deeply the agent appears to understand the bug's cause.

    Ordered strongest to weakest:
      DEEP        — full causal chain traversal
      PARTIAL     — partial chain (some steps missing)
      SUPERFICIAL — direct test→rewrite with no chain
      RANDOM      — no discernible causal pattern
    """

    DEEP = "DEEP"
    PARTIAL = "PARTIAL"
    SUPERFICIAL = "SUPERFICIAL"
    RANDOM = "RANDOM"
30
+
31
@dataclass
class CausalChainNode:
    """A single file in the reconstructed causal chain.

    Captures both what the chain expects (file, role) and what the agent
    actually did with it (was_read, read_order).
    """

    file: str
    # One of: "test", "caller", "called", "root_cause", "missed".
    role: str
    was_read: bool
    # Step number at which the agent first read this file, or None if never.
    read_order: Optional[int]
38
+
39
+
40
@dataclass
class CausalProbeReport:
    """
    Full causal reasoning analysis for one episode.
    This is the primary output of the CausalProbe.
    """

    episode_id: str
    task: str

    # Core verdict
    understanding_level: CausalUnderstandingLevel
    causal_score: float  # 0.0 – 1.0

    # Chain analysis
    expected_chain: List[CausalChainNode]  # What SHOULD have been read
    actual_chain_coverage: float  # Fraction of the chain actually traversed
    chain_order_score: float  # Was the chain traversed in the correct order?

    # Behavioral signals
    read_before_write: bool  # Read all relevant files before writing?
    test_informed_navigation: bool  # Did tests steer subsequent reads?
    search_before_navigate: bool  # Searched for names before reading?
    submit_after_test: bool  # Verified the fix before submitting?

    # Signal: understanding vs guessing
    guessing_indicators: List[str]
    understanding_indicators: List[str]

    # Calibration
    false_confidence_detected: bool  # Submitted without reading the root cause
    shortcut_learning_detected: bool  # test → write → submit, source skipped

    explanation: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; scores rounded to 3 decimals."""
        signals = {
            "read_before_write": self.read_before_write,
            "test_informed_navigation": self.test_informed_navigation,
            "search_before_navigate": self.search_before_navigate,
            "submit_after_test": self.submit_after_test,
        }
        diagnostics = {
            "false_confidence_detected": self.false_confidence_detected,
            "shortcut_learning_detected": self.shortcut_learning_detected,
        }
        chain = [
            {"file": node.file, "role": node.role, "read": node.was_read, "order": node.read_order}
            for node in self.expected_chain
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "understanding_level": self.understanding_level.value,
            "causal_score": round(self.causal_score, 3),
            "chain_coverage": round(self.actual_chain_coverage, 3),
            "chain_order_score": round(self.chain_order_score, 3),
            "behavioral_signals": signals,
            "guessing_indicators": self.guessing_indicators,
            "understanding_indicators": self.understanding_indicators,
            "diagnostics": diagnostics,
            "expected_chain": chain,
            "explanation": self.explanation,
            "recommendations": self.recommendations,
        }
102
+
103
+
104
class CausalProbe:
    """
    Analyzes whether an agent engaged in true causal reasoning.

    The core insight: for a bug in src/X.py called from tests/test_X.py,
    the causal chain is:
        tests/test_X.py → (calls) → src/X.py → (calls) → src/utils.py (maybe)

    A causally-aware agent reads in this order.
    A shortcut agent reads the test, guesses the bug, rewrites without reading source.

    We score order, coverage, and behavioral signals.
    """

    def probe(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: dict,
        files_read: List[str],
        files_written: List[str],
        final_score: float,
    ) -> CausalProbeReport:
        """Run the causal probe on an episode's trajectory.

        Args:
            episode_id: Identifier of the episode being analyzed.
            task: Task name the episode was run on.
            trajectory_steps: Step dicts with at least "action_type",
                "action_path" and "step_number" keys.
            variant_meta: Task-variant metadata (test_files, bug_files, ...).
            files_read: Paths the agent read during the episode.
            files_written: Paths the agent wrote during the episode.
            final_score: Episode's final score in [0, 1].

        Returns:
            CausalProbeReport with the full causal-reasoning verdict.
        """

        # ── Build expected causal chain from variant metadata ─────────────────
        test_files = variant_meta.get("test_files", []) or [
            f for f in variant_meta.get("read_first_files", []) if "test" in f
        ]
        bug_files = variant_meta.get("bug_files", []) or variant_meta.get("files_to_implement", [])
        dep_files = variant_meta.get("dependencies", []) or []

        # If metadata sparse, infer from trajectory
        all_files_in_traj = list({
            s.get("action_path") for s in trajectory_steps
            if s.get("action_path") and s.get("action_type") in ("read_file", "write_file")
        })

        if not test_files:
            test_files = [f for f in all_files_in_traj if "test" in f.lower()]
        if not bug_files:
            bug_files = [f for f in all_files_in_traj
                         if "test" not in f.lower() and f.endswith(".py")]

        # Build expected chain
        expected_chain: List[CausalChainNode] = []
        read_set = set(files_read)
        read_order: Dict[str, int] = {}
        for step in trajectory_steps:
            if step.get("action_type") == "read_file" and step.get("action_path"):
                path = step["action_path"]
                if path not in read_order:
                    read_order[path] = step.get("step_number", len(read_order) + 1)

        for tf in test_files:
            expected_chain.append(CausalChainNode(
                file=tf, role="test",
                was_read=tf in read_set,
                read_order=read_order.get(tf),
            ))
        for bf in bug_files:
            expected_chain.append(CausalChainNode(
                file=bf, role="root_cause",
                was_read=bf in read_set,
                read_order=read_order.get(bf),
            ))
        for df in dep_files:
            expected_chain.append(CausalChainNode(
                file=df, role="caller",
                was_read=df in read_set,
                read_order=read_order.get(df),
            ))

        if not expected_chain:
            # Fallback: any file is better than none.
            # Fix: was_read was hard-coded True here, but all_files_in_traj
            # also contains write-only files — report what actually happened.
            for f in all_files_in_traj[:3]:
                expected_chain.append(CausalChainNode(
                    file=f, role="unknown",
                    was_read=f in read_set,
                    read_order=read_order.get(f),
                ))

        # ── Chain coverage ────────────────────────────────────────────────────
        chain_files_read = [n for n in expected_chain if n.was_read and n.role != "missed"]
        actual_chain_coverage = (
            len(chain_files_read) / len(expected_chain) if expected_chain else 0.0
        )

        # ── Chain order score (tests before src = good causal order) ──────────
        chain_order_score = 0.0
        test_orders = [n.read_order for n in expected_chain if n.role == "test" and n.read_order]
        src_orders = [n.read_order for n in expected_chain
                      if n.role in ("root_cause", "caller") and n.read_order]

        if test_orders and src_orders:
            # Good: all tests read before source files
            correct_order_pairs = sum(
                1 for to in test_orders for so in src_orders if to < so
            )
            total_pairs = len(test_orders) * len(src_orders)
            chain_order_score = correct_order_pairs / total_pairs if total_pairs > 0 else 0.0
        elif test_orders and not src_orders:
            chain_order_score = 0.3  # Partial — read tests but not source
        elif src_orders and not test_orders:
            chain_order_score = 0.2  # Read source without reading tests = weaker

        # ── Behavioral signals ────────────────────────────────────────────────
        action_types = [s.get("action_type", "") for s in trajectory_steps]
        # (Removed an unused `action_paths` list that was computed here.)

        # read_before_write: all written files were read at least once before write
        read_before_write = True
        for step in trajectory_steps:
            if step.get("action_type") == "write_file" and step.get("action_path"):
                p = step["action_path"]
                step_n = step.get("step_number", 0)
                was_read_before = any(
                    s2.get("action_type") == "read_file"
                    and s2.get("action_path") == p
                    and s2.get("step_number", 99) < step_n
                    for s2 in trajectory_steps
                )
                if not was_read_before:
                    read_before_write = False
                    break

        # test_informed_navigation: did agent read source files AFTER reading tests?
        test_read_step = min(
            (s.get("step_number", 99) for s in trajectory_steps
             if s.get("action_type") == "read_file"
             and any(tf in (s.get("action_path") or "") for tf in test_files)),
            default=None
        )
        src_read_after_test = test_read_step is not None and any(
            s.get("action_type") == "read_file"
            and s.get("step_number", 0) > test_read_step
            and any(bf in (s.get("action_path") or "") for bf in bug_files)
            for s in trajectory_steps
        )
        test_informed_navigation = src_read_after_test

        # search_before_navigate: used search_code before reading source files
        search_steps = [s for s in trajectory_steps if s.get("action_type") == "search_code"]
        first_src_read = min(
            (s.get("step_number", 99) for s in trajectory_steps
             if s.get("action_type") == "read_file"
             and any(bf in (s.get("action_path") or "") for bf in bug_files)),
            default=None
        )
        search_before_navigate = bool(search_steps) and (
            first_src_read is None or
            any(s.get("step_number", 99) < first_src_read for s in search_steps)
        )

        # submit_after_test: ran tests before submitting
        test_runs = [s for s in trajectory_steps if s.get("action_type") == "run_tests"]
        submit_step = next(
            (s.get("step_number", 99) for s in trajectory_steps
             if s.get("action_type") == "submit"), None
        )
        submit_after_test = bool(test_runs) and submit_step is not None and any(
            s.get("step_number", 0) < submit_step for s in test_runs
        )

        # ── Guessing vs understanding indicators ──────────────────────────────
        guessing_indicators = []
        understanding_indicators = []

        total = len(trajectory_steps)

        # Guessing: short episode with low score
        if total <= 3 and final_score < 0.5:
            guessing_indicators.append(f"Submitted in only {total} steps with score {final_score:.2f}")

        # Guessing: wrote without reading
        if not read_before_write:
            guessing_indicators.append("Wrote to file(s) without first reading them")

        # Guessing: skipped test files
        if not any(n.was_read for n in expected_chain if n.role == "test"):
            guessing_indicators.append("Never read any test files")

        # Guessing: skipped source files
        if not any(n.was_read for n in expected_chain if n.role == "root_cause"):
            guessing_indicators.append("Never read the bug/source file")

        # Understanding: search used
        if search_steps:
            understanding_indicators.append(
                f"Used search_code {len(search_steps)}× to locate bug"
            )

        # Understanding: read tests first
        if chain_order_score > 0.7:
            understanding_indicators.append("Read tests before source files (correct causal order)")

        # Understanding: tested before submitting
        if submit_after_test:
            understanding_indicators.append("Verified fix with run_tests before submitting")

        # Understanding: explored full chain
        if actual_chain_coverage > 0.7:
            understanding_indicators.append(
                f"Covered {actual_chain_coverage:.0%} of expected causal chain"
            )

        # ── Diagnostics ───────────────────────────────────────────────────────
        # False confidence: submitted very early without testing
        false_confidence_detected = (
            submit_step is not None and submit_step <= 3 and not test_runs
        )

        # Shortcut learning: read test → immediate write → submit (skipped source)
        has_write = "write_file" in action_types
        has_src_read = any(
            bf in (s.get("action_path") or "")
            for s in trajectory_steps
            if s.get("action_type") == "read_file"
            for bf in bug_files
        )
        shortcut_sequence = has_write and not has_src_read
        shortcut_learning_detected = shortcut_sequence

        # ── Composite causal score ────────────────────────────────────────────
        scores = {
            "chain_coverage": actual_chain_coverage * 0.30,
            "chain_order": chain_order_score * 0.25,
            "read_before_write": (0.15 if read_before_write else 0.0),
            "test_informed": (0.15 if test_informed_navigation else 0.0),
            "verified": (0.10 if submit_after_test else 0.0),
            "searched": (0.05 if search_before_navigate else 0.0),
        }
        causal_score = sum(scores.values())
        causal_score = max(0.0, min(1.0, causal_score))

        # ── Understanding level classification ────────────────────────────────
        if causal_score >= 0.75:
            level = CausalUnderstandingLevel.DEEP
        elif causal_score >= 0.45:
            level = CausalUnderstandingLevel.PARTIAL
        elif causal_score >= 0.20:
            level = CausalUnderstandingLevel.SUPERFICIAL
        else:
            level = CausalUnderstandingLevel.RANDOM

        # ── Explanation ───────────────────────────────────────────────────────
        level_explanations = {
            CausalUnderstandingLevel.DEEP: (
                "Agent demonstrated genuine causal reasoning: read tests to understand expected "
                "behavior, traced the call chain to the root cause, made a targeted fix, and "
                "verified with tests before submitting."
            ),
            CausalUnderstandingLevel.PARTIAL: (
                "Agent showed partial causal understanding. Some chain links were traversed "
                "but the reasoning was incomplete — likely missed tracing deeper dependencies "
                "or skipped test verification."
            ),
            CausalUnderstandingLevel.SUPERFICIAL: (
                "Agent showed superficial reasoning. Actions did not follow a clear causal "
                "path from test → failure → root cause. Likely pattern-matched on filenames "
                "or guessed the fix location."
            ),
            CausalUnderstandingLevel.RANDOM: (
                "Agent showed no discernible causal reasoning. Actions appear random relative "
                "to the causal structure of the bug. This is the profile of pure trial-and-error."
            ),
        }
        explanation = level_explanations[level]

        # ── Recommendations ───────────────────────────────────────────────────
        recs = []
        if not any(n.was_read for n in expected_chain if n.role == "test"):
            recs.append("Always read the failing test first — it defines the expected behavior.")
        if not read_before_write:
            recs.append("Never write to a file before reading it — blind writes cause more bugs.")
        if not submit_after_test:
            recs.append("Run tests after every write to verify your fix is correct.")
        if not search_steps:
            recs.append("Use search_code to find function definitions before navigating blindly.")
        if actual_chain_coverage < 0.5:
            recs.append(
                "Explore more of the causal chain. The bug's root cause may be deeper than the first file."
            )
        if not recs:
            recs.append("Excellent reasoning! Maintain this systematic approach.")

        return CausalProbeReport(
            episode_id=episode_id,
            task=task,
            understanding_level=level,
            causal_score=causal_score,
            expected_chain=expected_chain,
            actual_chain_coverage=actual_chain_coverage,
            chain_order_score=chain_order_score,
            read_before_write=read_before_write,
            test_informed_navigation=test_informed_navigation,
            search_before_navigate=search_before_navigate,
            submit_after_test=submit_after_test,
            guessing_indicators=guessing_indicators,
            understanding_indicators=understanding_indicators,
            false_confidence_detected=false_confidence_detected,
            shortcut_learning_detected=shortcut_learning_detected,
            explanation=explanation,
            recommendations=recs,
        )
+ )
server/confidence_calibrator.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/confidence_calibrator.py
2
+ """
3
+ Confidence Calibration Engine โ€” v4.0
4
+
5
+ The key scientific question: Is the agent calibrated?
6
+ An agent is calibrated when its certainty level (inferred from behavior)
7
+ matches its likelihood of being correct.
8
+
9
+ Since agents don't expose probability distributions directly, we infer
10
+ confidence from behavioral proxies:
11
+ - How quickly did it commit to a hypothesis (read โ†’ write speed)?
12
+ - How much did it re-explore after writing (re-reads after write)?
13
+ - Did it verify (run_tests) before submitting?
14
+ - How many steps did it spend before the first write?
15
+
16
+ We then compare inferred confidence to actual accuracy (final_score).
17
+ Overconfident agents submit fast but score poorly.
18
+ Underconfident agents explore extensively but still score well.
19
+ Well-calibrated agents: confidence โˆ accuracy.
20
+
21
+ This is NOT measured by any existing benchmark or tracing tool.
22
+ """
23
+ from __future__ import annotations
24
+ import math
25
+ from typing import List, Dict, Any, Optional
26
+ from dataclasses import dataclass, field
27
+ from enum import Enum
28
+
29
+
30
class CalibrationProfile(str, Enum):
    """Overall calibration verdict for an episode.

    WELL_CALIBRATED — confidence ≈ accuracy
    OVERCONFIDENT   — high confidence, low accuracy
    UNDERCONFIDENT  — low confidence, high accuracy
    ERRATIC         — confidence changes randomly
    """

    WELL_CALIBRATED = "WELL_CALIBRATED"
    OVERCONFIDENT = "OVERCONFIDENT"
    UNDERCONFIDENT = "UNDERCONFIDENT"
    ERRATIC = "ERRATIC"
35
+
36
+
37
@dataclass
class ConfidenceSample:
    """Inferred confidence at one point in the trajectory."""

    step: int
    action_type: str
    # Behavioral-proxy confidence in [0.0, 1.0].
    inferred_confidence: float
    # test_pass_rate at this step, when known.
    actual_accuracy: Optional[float]
    # |confidence - accuracy| when both are known.
    calibration_error: Optional[float]
45
+
46
+
47
@dataclass
class CalibrationReport:
    """Full confidence calibration analysis."""

    episode_id: str
    task: str

    profile: CalibrationProfile
    calibration_score: float  # 1.0 = perfectly calibrated

    # Inferred overall confidence level (behavioral proxy)
    inferred_confidence: float  # 0.0–1.0
    actual_performance: float  # final_score

    # Decomposed signals
    commitment_speed: float  # 0 = slow/careful, 1 = fast commitment
    re_exploration_rate: float  # reads after first write / total reads
    verification_rate: float  # run_tests per write_file
    submit_speed: float  # submit step / max_steps (early = overconfident)

    # Trajectory of inferred confidence
    confidence_trajectory: List[ConfidenceSample]

    # Calibration error
    expected_calibration_error: float  # mean(|conf - acc|) where acc is known
    confidence_accuracy_correlation: float  # should be high for good agents

    diagnosis: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; float scores rounded to 3 dp."""

        def _opt(value):
            # Round optional floats, passing None through unchanged.
            return round(value, 3) if value is not None else None

        trajectory = [
            {
                "step": sample.step,
                "action": sample.action_type,
                "confidence": round(sample.inferred_confidence, 3),
                "accuracy": _opt(sample.actual_accuracy),
                "error": _opt(sample.calibration_error),
            }
            for sample in self.confidence_trajectory
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "profile": self.profile.value,
            "calibration_score": round(self.calibration_score, 3),
            "inferred_confidence": round(self.inferred_confidence, 3),
            "actual_performance": round(self.actual_performance, 3),
            "signals": {
                "commitment_speed": round(self.commitment_speed, 3),
                "re_exploration_rate": round(self.re_exploration_rate, 3),
                "verification_rate": round(self.verification_rate, 3),
                "submit_speed": round(self.submit_speed, 3),
            },
            "expected_calibration_error": round(self.expected_calibration_error, 3),
            "confidence_accuracy_correlation": round(self.confidence_accuracy_correlation, 3),
            "confidence_trajectory": trajectory,
            "diagnosis": self.diagnosis,
            "recommendations": self.recommendations,
        }
+ }
105
+
106
+
107
class ConfidenceCalibrator:
    """
    Infers behavioral confidence and compares it to actual performance.

    Confidence proxy model:
        - Reading files  = low confidence (still exploring)
        - Writing files  = medium-high confidence (committed to hypothesis)
        - Running tests  = verification (moderate, checking own hypothesis)
        - Submitting     = maximum commitment (fully confident)

    Each action type has a confidence weight (see ACTION_CONFIDENCE); the
    calibrator tracks how this evolves over the trajectory and scores how
    well inferred confidence matches observed accuracy.
    """

    # Behavioral confidence weight per action type, in [0, 1].
    ACTION_CONFIDENCE = {
        "read_file": 0.2,     # exploring, uncertain
        "search_code": 0.3,   # slightly more directed
        "run_tests": 0.6,     # confident enough to test
        "write_file": 0.75,   # committed to hypothesis
        "submit": 1.0,        # maximum confidence
    }

    def calibrate(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        final_score: float,
        max_steps: int = 20,
    ) -> CalibrationReport:
        """Compute the full calibration report for one episode.

        Args:
            episode_id: Identifier of the analyzed episode.
            task: Task name.
            trajectory_steps: Recorded steps; each dict may carry
                "action_type", "step_number", "action_path", "test_pass_rate".
            final_score: Actual end-of-episode performance in [0, 1].
            max_steps: Episode step budget, used for the submit-speed signal.

        Returns:
            CalibrationReport with profile, scores, signals and diagnosis.
        """
        if not trajectory_steps:
            return self._empty_report(episode_id, task, final_score)

        action_types = [s.get("action_type", "read_file") for s in trajectory_steps]
        total_steps = len(trajectory_steps)

        # ── Build confidence trajectory ──────────────────────────────────────
        # (BUGFIX: dropped an unused `running_conf` local from the original.)
        confidence_traj: List[ConfidenceSample] = []

        for s in trajectory_steps:
            atype = s.get("action_type", "read_file")
            base_conf = self.ACTION_CONFIDENCE.get(atype, 0.3)

            # Confidence grows as the episode progresses.
            step_n = s.get("step_number", 1)
            progress_bonus = (step_n / max(total_steps, 1)) * 0.1

            # Writes completed before this step raise confidence.
            step_write_count = sum(
                1 for s2 in trajectory_steps
                if s2.get("action_type") == "write_file"
                and s2.get("step_number", 99) < step_n
            )

            # Re-reads slightly lower confidence.
            # BUGFIX: only an earlier *read* of the same non-None path counts
            # as a re-read. The original matched any earlier step with an equal
            # action_path, so a prior write to the file — or two steps whose
            # paths were both None — incorrectly triggered the penalty.
            path = s.get("action_path")
            step_reread = (
                atype == "read_file"
                and path is not None
                and any(
                    s2.get("action_type") == "read_file"
                    and s2.get("action_path") == path
                    and s2.get("step_number", 0) < step_n
                    for s2 in trajectory_steps
                )
            )
            reread_penalty = -0.1 if step_reread else 0.0

            # After a write, confidence should be higher (capped at +0.15).
            post_write_bonus = min(0.15, step_write_count * 0.05)

            inferred = min(1.0, max(0.0,
                base_conf + progress_bonus + post_write_bonus + reread_penalty
            ))

            # Actual accuracy at this step if test_pass_rate is known.
            actual_acc = s.get("test_pass_rate")
            calib_err = abs(inferred - actual_acc) if actual_acc is not None else None

            confidence_traj.append(ConfidenceSample(
                step=step_n,
                action_type=atype,
                inferred_confidence=inferred,
                actual_accuracy=actual_acc,
                calibration_error=calib_err,
            ))

        # ── Behavioral signal computation ────────────────────────────────────
        total = max(total_steps, 1)

        # Commitment speed: how many reads before the first write?
        read_steps = [i for i, a in enumerate(action_types) if a == "read_file"]
        write_steps = [i for i, a in enumerate(action_types) if a == "write_file"]
        submit_step = next(
            (s.get("step_number", total) for s in trajectory_steps if s.get("action_type") == "submit"),
            total,
        )

        if write_steps:
            reads_before_first_write = len([r for r in read_steps if r < write_steps[0]])
            # Few reads before writing = high commitment speed = overconfident.
            commitment_speed = max(0.0, 1.0 - reads_before_first_write / max(total, 1))
        else:
            commitment_speed = 0.0  # Never wrote = very cautious

        # Re-exploration rate: reads after the first write / total reads.
        if write_steps and read_steps:
            reads_after_write = len([r for r in read_steps if r > write_steps[0]])
            re_exploration_rate = reads_after_write / len(read_steps)
        else:
            re_exploration_rate = 0.0

        # Verification rate: run_tests per write.
        test_count = action_types.count("run_tests")
        write_count = action_types.count("write_file")
        verification_rate = test_count / max(write_count, 1)

        # Submit speed: earlier submission = more overconfident.
        submit_speed = 1.0 - (submit_step / max(max_steps, 1))
        submit_speed = max(0.0, min(1.0, submit_speed))

        # ── Inferred overall confidence (weighted behavioral proxy) ──────────
        # confidence_traj is non-empty here (early return above), so the last
        # sample is always available.
        inferred_confidence = (
            commitment_speed * 0.30 +
            (1.0 - re_exploration_rate) * 0.15 +
            verification_rate * 0.15 +
            submit_speed * 0.20 +
            confidence_traj[-1].inferred_confidence * 0.20
        )
        inferred_confidence = min(1.0, max(0.0, inferred_confidence))

        # ── Calibration error (where we have both conf + acc) ────────────────
        calib_errors = [
            s.calibration_error for s in confidence_traj
            if s.calibration_error is not None
        ]
        ece = sum(calib_errors) / len(calib_errors) if calib_errors else abs(inferred_confidence - final_score)

        # ── Confidence-accuracy correlation ──────────────────────────────────
        paired = [
            (s.inferred_confidence, s.actual_accuracy)
            for s in confidence_traj
            if s.actual_accuracy is not None
        ]
        if len(paired) >= 2:
            corr = self._pearson_r([p[0] for p in paired], [p[1] for p in paired])
        else:
            # Fallback: use the final point only.
            conf_err = abs(inferred_confidence - final_score)
            corr = 1.0 - conf_err * 2

        corr = max(-1.0, min(1.0, corr))

        # ── Calibration score: half confidence accuracy, half correlation ────
        calibration_score = max(0.0, 1.0 - ece) * 0.5 + max(0.0, corr) * 0.5
        calibration_score = max(0.0, min(1.0, calibration_score))

        # ── Profile classification ───────────────────────────────────────────
        # BUGFIX: the original ended with an unreachable `else: ERRATIC` arm —
        # the first three conditions exhaust all real values of conf_diff.
        # ERRATIC is reserved for empty trajectories (see _empty_report).
        conf_diff = inferred_confidence - final_score
        if abs(conf_diff) <= 0.2:
            profile = CalibrationProfile.WELL_CALIBRATED
        elif conf_diff > 0.2:
            profile = CalibrationProfile.OVERCONFIDENT
        else:
            profile = CalibrationProfile.UNDERCONFIDENT

        # ── Diagnosis ────────────────────────────────────────────────────────
        # ERRATIC entry kept defensively even though calibrate() no longer
        # produces that profile.
        diagnoses = {
            CalibrationProfile.WELL_CALIBRATED: (
                f"Agent is well-calibrated: inferred confidence ({inferred_confidence:.2f}) "
                f"closely matches actual performance ({final_score:.2f}). "
                "This indicates genuine self-awareness — the agent commits when ready and "
                "explores when uncertain."
            ),
            CalibrationProfile.OVERCONFIDENT: (
                f"Agent is overconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"significantly exceeds actual performance ({final_score:.2f}). "
                "Agent committed to a hypothesis too early, skipped verification, "
                "or submitted without adequate exploration. This is the profile of agents "
                "that 'feel certain but are wrong'."
            ),
            CalibrationProfile.UNDERCONFIDENT: (
                f"Agent is underconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"is well below actual performance ({final_score:.2f}). "
                "Agent explored far more than necessary, re-read files unnecessarily, "
                "or hesitated to commit despite having the right information. "
                "This wastes compute and steps without improving accuracy."
            ),
            CalibrationProfile.ERRATIC: (
                "Agent calibration is erratic — confidence signals are inconsistent "
                "with behavior. The agent may be applying a rigid strategy regardless "
                "of the task difficulty."
            ),
        }

        recs = []
        if profile == CalibrationProfile.OVERCONFIDENT:
            recs.append("Read more files before writing — commit only when you've seen the full causal chain.")
            recs.append("Always run_tests after writing — don't trust your fix without verification.")
        elif profile == CalibrationProfile.UNDERCONFIDENT:
            recs.append("Commit to hypotheses earlier — excessive re-reading wastes steps.")
            recs.append("After reading tests + source files, write your fix. Stop re-reading.")
        if verification_rate < 0.5:
            recs.append("Increase test verification rate: run_tests after each write.")
        if re_exploration_rate > 0.5:
            recs.append("High re-exploration after writing suggests uncalibrated hypothesis formation.")

        return CalibrationReport(
            episode_id=episode_id,
            task=task,
            profile=profile,
            calibration_score=calibration_score,
            inferred_confidence=inferred_confidence,
            actual_performance=final_score,
            commitment_speed=commitment_speed,
            re_exploration_rate=re_exploration_rate,
            verification_rate=verification_rate,
            submit_speed=submit_speed,
            confidence_trajectory=confidence_traj,
            expected_calibration_error=ece,
            confidence_accuracy_correlation=corr,
            diagnosis=diagnoses[profile],
            recommendations=recs,
        )

    def _pearson_r(self, xs: List[float], ys: List[float]) -> float:
        """Pearson correlation of two equal-length samples; 0.0 if degenerate."""
        n = len(xs)
        if n < 2:
            return 0.0
        mx, my = sum(xs) / n, sum(ys) / n
        num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
        dx = math.sqrt(sum((x - mx) ** 2 for x in xs))
        dy = math.sqrt(sum((y - my) ** 2 for y in ys))
        if dx * dy == 0:
            # Zero variance in either sample — correlation undefined.
            return 0.0
        return num / (dx * dy)

    def _empty_report(self, episode_id: str, task: str, final_score: float) -> CalibrationReport:
        """Degenerate ERRATIC report for episodes with no trajectory data."""
        return CalibrationReport(
            episode_id=episode_id, task=task,
            profile=CalibrationProfile.ERRATIC,
            calibration_score=0.0,
            inferred_confidence=0.0, actual_performance=final_score,
            commitment_speed=0.0, re_exploration_rate=0.0,
            verification_rate=0.0, submit_speed=0.0,
            confidence_trajectory=[],
            expected_calibration_error=1.0,
            confidence_accuracy_correlation=0.0,
            diagnosis="No trajectory data.", recommendations=[],
        )
server/counterfactual_engine.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/counterfactual_engine.py
2
+ """
3
+ Counterfactual Robustness Engine โ€” v4.0
4
+
5
+ The key scientific question: Is the agent's strategy robust, or is it brittle?
6
+
7
+ We test this by:
8
+ 1. Running an episode โ†’ recording strategy
9
+ 2. Applying small, semantically-neutral mutations to the repo
10
+ (rename variable, change a constant, add a dummy function)
11
+ 3. Measuring whether the agent's recorded strategy would fail on the mutated repo
12
+
13
+ IMPORTANT: This does NOT re-run the agent. It analyzes whether the
14
+ already-recorded navigation pattern was based on deep structure (robust)
15
+ or surface signals like filenames/constants (brittle).
16
+
17
+ This is completely novel โ€” no benchmark or tool does this.
18
+ """
19
+ from __future__ import annotations
20
+ import random
21
+ import hashlib
22
+ from typing import List, Dict, Any, Tuple
23
+ from dataclasses import dataclass, field
24
+ from enum import Enum
25
+
26
+
27
+ class BrittlenessLevel(str, Enum):
28
+ ROBUST = "ROBUST" # Strategy survives all mutations
29
+ MILDLY_BRITTLE = "MILDLY_BRITTLE" # Survives 60-80% of mutations
30
+ BRITTLE = "BRITTLE" # Survives < 60%
31
+ FRAGILE = "FRAGILE" # Survives < 30%
32
+
33
+
34
@dataclass
class Mutation:
    """A single counterfactual mutation applied to the repo."""
    mutation_type: str        # One of the MUTATION_TEMPLATES "type" labels
    target_file: str          # File the mutation notionally targets
    description: str          # Human-readable description of the change
    would_break_agent: bool   # Would this mutation cause agent's strategy to fail?
    why: str                  # Explanation
+
43
+
44
@dataclass
class CounterfactualReport:
    """Results of counterfactual robustness testing for one episode."""
    episode_id: str
    task: str
    brittleness_level: BrittlenessLevel
    robustness_score: float  # 0.0 – 1.0

    mutations_tested: List[Mutation]
    mutations_survived: int
    mutations_failed: int

    surface_dependencies: List[str]  # Surface signals the agent relied on
    deep_dependencies: List[str]     # Structural signals it used correctly

    explanation: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Flatten the report into a JSON-friendly dict."""
        mutation_rows = []
        for mut in self.mutations_tested:
            mutation_rows.append({
                "type": mut.mutation_type,
                "file": mut.target_file,
                "description": mut.description,
                "would_break_agent": mut.would_break_agent,
                "why": mut.why,
            })

        payload = {
            "episode_id": self.episode_id,
            "task": self.task,
            "brittleness_level": self.brittleness_level.value,
            "robustness_score": round(self.robustness_score, 3),
            "mutations_tested": len(self.mutations_tested),
            "mutations_survived": self.mutations_survived,
            "mutations_failed": self.mutations_failed,
            "mutations": mutation_rows,
            "surface_dependencies": self.surface_dependencies,
            "deep_dependencies": self.deep_dependencies,
            "explanation": self.explanation,
            "recommendations": self.recommendations,
        }
        return payload
86
+
87
+
88
class CounterfactualEngine:
    """
    Analyzes brittleness by reasoning about what mutations would break the agent.

    We don't need to actually re-run the agent — we analyze the recorded
    trajectory and ask: "If file X was named differently / had a different
    constant, would this agent's navigation pattern still work?"

    Brittle signals:
        - Agent found bug file by pattern-matching on filename (not content search)
        - Agent submitted after reading the same file every run
        - Agent ignored test content and relied on positional heuristics

    Robust signals:
        - Agent used search_code to find function by name
        - Agent read test -> traced import -> found source
        - Agent ran tests and verified result before submitting
    """

    # Catalog of semantically-neutral repo mutations reasoned about below.
    MUTATION_TEMPLATES = [
        {
            "type": "FILENAME_RENAME",
            "description": "Rename src/X.py to src/X_v2.py (same content)",
            "breaks_if": "agent found file by name pattern, not by search or import tracing",
            "surface_signal": "filename",
            "robust_signal": "import tracing or search_code",
        },
        {
            "type": "CONSTANT_CHANGE",
            "description": "Change a numeric constant by ±1 (semantically neutral for navigation)",
            "breaks_if": "agent hardcoded expected value rather than reading actual code",
            "surface_signal": "constant value pattern matching",
            "robust_signal": "dynamic code reading",
        },
        {
            "type": "DUMMY_FUNCTION",
            "description": "Add a dummy function with a similar name near the bug",
            "breaks_if": "agent used first-match navigation without reading full context",
            "surface_signal": "first result of search or first match in file",
            "robust_signal": "reading complete function signatures before deciding",
        },
        {
            "type": "DIRECTORY_SHUFFLE",
            "description": "Move test file from tests/ to test/ (same content)",
            "breaks_if": "agent hardcoded path prefix tests/ instead of searching",
            "surface_signal": "hardcoded directory prefix",
            "robust_signal": "search or dynamic discovery",
        },
        {
            "type": "DOCSTRING_NOISE",
            "description": "Add misleading docstring claiming a different function causes the bug",
            "breaks_if": "agent read docs instead of tests to understand expected behavior",
            "surface_signal": "docstring content",
            "robust_signal": "test assertions as ground truth",
        },
        {
            "type": "IMPORT_REORDER",
            "description": "Reorder imports in the source file",
            "breaks_if": "agent relied on line numbers instead of function names",
            "surface_signal": "absolute line numbers",
            "robust_signal": "function name search",
        },
    ]

    def analyze(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: dict,
        files_read: List[str],
        files_written: List[str],
        final_score: float,
    ) -> CounterfactualReport:
        """
        Analyze robustness by simulating mutations and reasoning about
        whether the agent's recorded pattern would survive them.

        Args:
            episode_id: Episode identifier.
            task: Task name.
            trajectory_steps: Recorded steps ("action_type", "step_number",
                "action_path").
            variant_meta: Variant metadata ("bug_files" / "files_to_implement",
                "test_files").
            files_read: Paths the agent read during the episode.
            files_written: Paths the agent wrote (currently unused; kept for
                interface stability).
            final_score: Final episode score (not used by the heuristics).
        """
        # (BUGFIX: dropped unused locals `action_paths` and `used_run_tests`
        # that the original computed but never read.)
        action_types = [s.get("action_type", "") for s in trajectory_steps]

        bug_files = set(variant_meta.get("bug_files", []) or
                        variant_meta.get("files_to_implement", []) or [])
        test_files_meta = set(variant_meta.get("test_files", []) or [])

        # Infer which signals the agent used.
        used_search = "search_code" in action_types
        used_tests_first = self._tests_read_before_src(trajectory_steps, test_files_meta, bug_files)
        blind_navigation = not used_search and not used_tests_first
        read_count = action_types.count("read_file")
        write_count = action_types.count("write_file")
        immediate_write = write_count > 0 and action_types.index("write_file") <= 2
        verified_before_submit = self._verified_before_submit(trajectory_steps)

        # ── Evaluate each mutation ───────────────────────────────────────────
        mutations: List[Mutation] = []

        for tmpl in self.MUTATION_TEMPLATES:
            target_file = self._pick_target_file(tmpl["type"], files_read, bug_files)
            would_break, why = self._would_break_agent(
                mutation_type=tmpl["type"],
                used_search=used_search,
                used_tests_first=used_tests_first,
                blind_navigation=blind_navigation,
                read_count=read_count,
            )
            mutations.append(Mutation(
                mutation_type=tmpl["type"],
                target_file=target_file or "unknown",
                description=tmpl["description"],
                would_break_agent=would_break,
                why=why,
            ))

        survived = sum(1 for m in mutations if not m.would_break_agent)
        failed = len(mutations) - survived
        robustness_score = survived / len(mutations) if mutations else 0.0

        # ── Surface vs deep dependency analysis ──────────────────────────────
        surface_deps = []
        deep_deps = []

        if not used_search:
            surface_deps.append("Filename-based navigation (no search_code used)")
        if not used_tests_first:
            surface_deps.append("Skipped test-informed navigation")
        if immediate_write:
            surface_deps.append("Immediate write after minimal reading (blind fix)")
        if not verified_before_submit:
            surface_deps.append("Submitted without running tests (no verification)")

        if used_search:
            deep_deps.append("Used search_code to find functions by name (content-based)")
        if used_tests_first:
            deep_deps.append("Read tests first — used expected behavior as compass")
        if read_count >= 3:
            deep_deps.append(f"Read {read_count} files — explored structure before committing")
        if verified_before_submit:
            deep_deps.append("Verified fix with run_tests before submitting")

        # ── Brittleness classification ───────────────────────────────────────
        if robustness_score >= 0.80:
            level = BrittlenessLevel.ROBUST
        elif robustness_score >= 0.60:
            level = BrittlenessLevel.MILDLY_BRITTLE
        elif robustness_score >= 0.30:
            level = BrittlenessLevel.BRITTLE
        else:
            level = BrittlenessLevel.FRAGILE

        explanations = {
            BrittlenessLevel.ROBUST: (
                "Agent strategy is robust. It relies on deep structural signals (function names, "
                "test assertions, causal chain traversal) rather than surface patterns. "
                "Minor repo mutations would not break its navigation."
            ),
            BrittlenessLevel.MILDLY_BRITTLE: (
                "Agent strategy is mildly brittle. Some mutations would break its navigation, "
                "particularly those that change surface signals it relied on. "
                "Using search_code and test-first navigation consistently would improve robustness."
            ),
            BrittlenessLevel.BRITTLE: (
                "Agent strategy is brittle. Most mutations would break its navigation. "
                "The agent appears to rely on stable surface patterns (filenames, positions) "
                "rather than understanding the semantic structure of the codebase."
            ),
            BrittlenessLevel.FRAGILE: (
                "Agent strategy is fragile. Almost any perturbation to the repo structure "
                "would cause this agent to fail. This indicates pure pattern-matching on "
                "the specific repo layout rather than generalizable code understanding."
            ),
        }

        recs = []
        if not used_search:
            recs.append("Use search_code to find functions by name — survives filename renames.")
        if not used_tests_first:
            recs.append("Read tests first to anchor your navigation in expected behavior, not filenames.")
        if immediate_write:
            recs.append("Read source files before writing to them — avoid blind writes.")
        if not verified_before_submit:
            recs.append("Run tests after writing — verify your fix holds on the actual behavior.")

        return CounterfactualReport(
            episode_id=episode_id,
            task=task,
            brittleness_level=level,
            robustness_score=robustness_score,
            mutations_tested=mutations,
            mutations_survived=survived,
            mutations_failed=failed,
            surface_dependencies=surface_deps,
            deep_dependencies=deep_deps,
            explanation=explanations[level],
            recommendations=recs,
        )

    # ── Helpers ──────────────────────────────────────────────────────────────

    def _tests_read_before_src(
        self, steps: List[dict], test_files: set, bug_files: set
    ) -> bool:
        """True when the earliest test-file read precedes the earliest bug-file read."""
        test_steps = [
            s.get("step_number", 99) for s in steps
            if s.get("action_type") == "read_file"
            and any(tf in (s.get("action_path") or "") for tf in test_files)
        ]
        src_steps = [
            s.get("step_number", 99) for s in steps
            if s.get("action_type") == "read_file"
            and any(bf in (s.get("action_path") or "") for bf in bug_files)
        ]
        if test_steps and src_steps:
            return min(test_steps) < min(src_steps)
        return False

    def _verified_before_submit(self, steps: List[dict]) -> bool:
        """True when at least one run_tests step precedes the submit step."""
        submit_step = next(
            (s.get("step_number", 9999) for s in steps if s.get("action_type") == "submit"),
            None,
        )
        if submit_step is None:
            return False
        return any(
            s.get("action_type") == "run_tests"
            and s.get("step_number", 0) < submit_step
            for s in steps
        )

    def _pick_target_file(
        self, mutation_type: str, files_read: List[str], bug_files: set
    ) -> str:
        """Choose the file a mutation notionally targets. Always returns a str."""
        if mutation_type in ("FILENAME_RENAME", "DUMMY_FUNCTION", "IMPORT_REORDER"):
            # Prefer a known bug file; fall back to the first file read.
            bug_file = next(iter(bug_files), None)
            if bug_file is not None:
                return bug_file
            return files_read[0] if files_read else "src/main.py"
        if mutation_type == "DIRECTORY_SHUFFLE":
            for f in files_read:
                if "test" in f.lower():
                    return f
        # BUGFIX: the original fell off the end (implicitly returning None,
        # contradicting the `-> str` annotation) for CONSTANT_CHANGE and
        # DOCSTRING_NOISE; now every mutation type gets a string target.
        return files_read[0] if files_read else "unknown"

    def _would_break_agent(
        self,
        mutation_type: str,
        used_search: bool,
        used_tests_first: bool,
        blind_navigation: bool,
        read_count: int,
    ) -> Tuple[bool, str]:
        """
        Return (would_break, explanation) by reasoning about the agent's signals.

        (Cleanup: the original also accepted `verified_before_submit`,
        `immediate_write` and `tmpl`, none of which were used.)
        """
        if mutation_type == "FILENAME_RENAME":
            if used_search:
                return False, "Agent used search_code — finds function by name, not filename"
            if blind_navigation:
                return True, "Agent navigated by filename without search — rename breaks it"
            return True, "Agent likely relied on filename pattern without search fallback"

        if mutation_type == "CONSTANT_CHANGE":
            # Almost never breaks well-behaved agents.
            if read_count >= 2:
                return False, "Agent read files dynamically — adapts to any constant value"
            return True, "Agent may have hardcoded expected value in navigation heuristic"

        if mutation_type == "DUMMY_FUNCTION":
            if used_search and read_count >= 3:
                return False, "Agent searched and read thoroughly — would disambiguate"
            return True, "Agent took first match without thorough reading"

        if mutation_type == "DIRECTORY_SHUFFLE":
            if used_search:
                return False, "search_code finds tests regardless of directory"
            return True, "Agent used hardcoded path prefix — directory change breaks it"

        if mutation_type == "DOCSTRING_NOISE":
            if used_tests_first:
                return False, "Agent used test assertions as ground truth, not docstrings"
            return True, "Agent may have read misleading docstring instead of test"

        if mutation_type == "IMPORT_REORDER":
            # Only brittle if the agent relied on line numbers.
            if read_count <= 1:
                return True, "Agent skimmed — likely used line numbers for navigation"
            return False, "Agent read full files — import reorder doesn't change function content"

        return False, "Neutral mutation"
server/memory_bank.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/memory_bank.py
2
+ """
3
+ Episodic Memory Bank โ€” v4.0
4
+
5
+ Cross-episode learning store for AI coding agents.
6
+
7
+ Every time an agent fails at a specific failure type, we store:
8
+ 1. The failure pattern (what actions led to it)
9
+ 2. The remediation hint (what should have been done)
10
+ 3. A compact "lesson" that can be injected into future prompts
11
+
12
+ The memory grows across episodes. When a new episode starts:
13
+ - We retrieve the most relevant past lessons (by task similarity)
14
+ - We inject them as a "memory context" into the agent's system prompt
15
+ - This creates a real self-improvement loop
16
+
17
+ This is NOT implemented in any current agent framework as an
18
+ environment-side primitive. Devin, Copilot, etc. start fresh every run.
19
+ """
20
+ from __future__ import annotations
21
+ import json
22
+ import time
23
+ import os
24
+ import hashlib
25
+ from typing import List, Dict, Any, Optional
26
+ from dataclasses import dataclass, field, asdict
27
+
28
+
29
@dataclass
class MemoryEntry:
    """One stored episode lesson, serializable for the JSON memory bank."""
    entry_id: str       # "{task}_{action hash}_{unix time}" (see EpisodicMemoryBank.store)
    episode_id: str
    task: str
    created_at: float   # Unix timestamp of storage

    # Failure details
    failure_type: str
    failure_evidence: str  # Truncated to 200 chars at store time
    score: float

    # Strategy used
    strategy: str
    action_sequence_hash: str  # Compact fingerprint of the action pattern

    # Lesson extracted
    lesson_title: str
    lesson_body: str       # Full explanation of what went wrong
    lesson_hint: str       # Compact hint to inject into future prompts
    lesson_plan: List[str]  # Step-by-step corrective plan

    # Retrieval metadata
    relevance_tags: List[str]  # Tags for retrieval (task1, write_file, read_before_write...)
    times_retrieved: int = 0
    times_helpful: int = 0  # Incremented when retry after this lesson improved score

    def to_dict(self) -> dict:
        """Serialize to a plain dict (JSON-safe) via dataclasses.asdict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "MemoryEntry":
        """Rebuild an entry from a dict previously produced by to_dict()."""
        return cls(**d)
63
+
64
+
65
@dataclass
class MemoryContext:
    """Injected memory context for a new episode (built by retrieve())."""
    relevant_lessons: List[MemoryEntry]   # Lessons selected for this episode
    system_prompt_injection: str          # Full text to prepend to system prompt
    user_context_injection: str           # Full text to prepend to first user message
    lessons_count: int                    # Number of lessons injected
    most_relevant_lesson: Optional[str]   # presumably the top lesson's title; None when empty — TODO confirm in retrieve()
73
+
74
+
75
class EpisodicMemoryBank:
    """
    Persistent cross-episode memory bank.

    Storage: JSON file on disk (or in-memory for Gradio sessions).
    Each entry is a MemoryEntry with lesson + retrieval metadata.

    Usage:
        bank = EpisodicMemoryBank(persist_path="memory.json")
        # After an episode:
        bank.store(episode_id=..., task=..., failure_type=..., failure_evidence=...,
                   score=..., strategy=..., trajectory_steps=[...])
        # Before next episode:
        context = bank.retrieve(task="task1", max_lessons=3)
        # Inject context.system_prompt_injection into agent
    """

    MAX_ENTRIES = 50  # Hard cap on stored lessons (see _trim)

    def __init__(self, persist_path: Optional[str] = None):
        """Create a bank; if persist_path exists, load previously saved lessons."""
        self.persist_path = persist_path
        self._entries: List[MemoryEntry] = []
        if persist_path and os.path.exists(persist_path):
            self._load()

    def store(
        self,
        episode_id: str,
        task: str,
        failure_type: str,
        failure_evidence: str,
        score: float,
        strategy: str,
        trajectory_steps: List[dict],
        improvement_plan: Optional[dict] = None,
    ) -> MemoryEntry:
        """Store a lesson from a completed episode and return the new entry.

        The lesson text comes from ``improvement_plan`` when available,
        otherwise from a canned default for ``failure_type``.
        """
        # Fingerprint the action sequence (first 12 actions) so similar
        # trajectories share a short, comparable hash.
        actions = [s.get("action_type", "?") for s in trajectory_steps]
        seq_str = "→".join(actions[:12])
        seq_hash = hashlib.md5(seq_str.encode()).hexdigest()[:8]

        # Relevance tags drive the scoring in retrieve().
        tags = [task, failure_type, strategy]
        if "read_file" in actions:
            tags.append("read_file")
        if "write_file" in actions:
            tags.append("write_file")
        if "run_tests" not in actions:
            tags.append("no_verification")
        if len(actions) <= 3:
            tags.append("too_short")

        # Extract lesson from improvement plan or fall back to defaults.
        if improvement_plan:
            lesson_title = improvement_plan.get("failure_type", failure_type)
            lesson_body = improvement_plan.get("what_went_wrong", "Agent failed.")
            lesson_hint = improvement_plan.get("system_prompt_addon", "")
            lesson_plan = improvement_plan.get("step_by_step_plan", [])
        else:
            lesson_title, lesson_body, lesson_hint, lesson_plan = self._default_lesson(
                failure_type, score, strategy
            )

        entry = MemoryEntry(
            entry_id=f"{task}_{seq_hash}_{int(time.time())}",
            episode_id=episode_id,
            task=task,
            created_at=time.time(),
            failure_type=failure_type,
            failure_evidence=failure_evidence[:200],  # cap stored evidence size
            score=score,
            strategy=strategy,
            action_sequence_hash=seq_hash,
            lesson_title=lesson_title,
            lesson_body=lesson_body,
            lesson_hint=lesson_hint,
            lesson_plan=lesson_plan,
            relevance_tags=tags,
            times_retrieved=0,
            times_helpful=0,
        )

        self._entries.append(entry)
        self._trim()
        if self.persist_path:
            self._save()
        return entry

    def retrieve(
        self,
        task: str,
        failure_type: Optional[str] = None,
        strategy: Optional[str] = None,
        max_lessons: int = 3,
    ) -> MemoryContext:
        """Retrieve the most relevant lessons for an upcoming episode.

        Relevance is a simple additive score: exact task match (+3) or task
        tag match (+2), failure-type match (+2), strategy match (+1), a small
        freshness penalty per prior retrieval (-0.1 each), and a boost for
        low-scoring (more informative) failures. Retrieved entries have their
        ``times_retrieved`` counter incremented.
        """
        if not self._entries:
            return self._empty_context()

        # Score each entry by relevance to the upcoming episode.
        scored: List[tuple[float, MemoryEntry]] = []
        for e in self._entries:
            score = 0.0
            if e.task == task:
                score += 3.0
            elif task in e.relevance_tags:
                score += 2.0
            if failure_type and e.failure_type == failure_type:
                score += 2.0
            if strategy and e.strategy == strategy:
                score += 1.0
            # Penalize already-retrieved lessons slightly (freshness)
            score -= e.times_retrieved * 0.1
            # Boost low-score lessons (more informative failures)
            score += max(0, 0.5 - e.score)
            scored.append((score, e))

        scored.sort(key=lambda x: -x[0])
        relevant = [e for _, e in scored[:max_lessons]]

        # Mark as retrieved
        for e in relevant:
            e.times_retrieved += 1

        if not relevant:
            return self._empty_context()

        # Build system-prompt injection text.
        sys_lines = [
            "🧠 AGENT MEMORY — LESSONS FROM PAST EPISODES",
            "=" * 50,
            "You have made these mistakes before. Do NOT repeat them.",
            "",
        ]
        for i, e in enumerate(relevant, 1):
            sys_lines += [
                f"[Lesson {i}] Task: {e.task} | Failure: {e.failure_type} | Score was: {e.score:.2f}",
                f"What went wrong: {e.lesson_body}",
                f"IMPORTANT: {e.lesson_hint}" if e.lesson_hint else "",
                "",
            ]
        sys_lines.append("=" * 50)
        # No element is ever None (missing hints produce ""), so join directly.
        system_injection = "\n".join(sys_lines)

        # Build user-context injection text (titles + abbreviated plans).
        user_lines = [
            "[MEMORY CONTEXT — Read before you act]",
        ]
        for i, e in enumerate(relevant, 1):
            user_lines.append(f"Past lesson {i}: {e.lesson_title}")
            if e.lesson_plan:
                user_lines.append("Correct approach:")
                user_lines.extend(f"  {step}" for step in e.lesson_plan[:4])
        user_injection = "\n".join(user_lines)

        return MemoryContext(
            relevant_lessons=relevant,
            system_prompt_injection=system_injection,
            user_context_injection=user_injection,
            lessons_count=len(relevant),
            most_relevant_lesson=relevant[0].lesson_title if relevant else None,
        )

    @staticmethod
    def _empty_context() -> MemoryContext:
        """Return a MemoryContext representing 'no relevant lessons found'."""
        return MemoryContext(
            relevant_lessons=[],
            system_prompt_injection="",
            user_context_injection="",
            lessons_count=0,
            most_relevant_lesson=None,
        )

    def get_all_entries(self) -> List[dict]:
        """Return every stored entry as a plain dict (for UI / export)."""
        return [e.to_dict() for e in self._entries]

    def get_stats(self) -> dict:
        """Return aggregate statistics over the stored lessons."""
        if not self._entries:
            return {"total_entries": 0, "tasks": {}}

        from collections import Counter
        failure_counts = Counter(e.failure_type for e in self._entries)
        task_counts = Counter(e.task for e in self._entries)
        avg_score = sum(e.score for e in self._entries) / len(self._entries)

        return {
            "total_entries": len(self._entries),
            "average_score_of_stored_episodes": round(avg_score, 3),
            "failure_breakdown": dict(failure_counts.most_common()),
            "tasks": dict(task_counts),
            "most_helpful_lesson": max(self._entries, key=lambda e: e.times_helpful).lesson_title
            if any(e.times_helpful > 0 for e in self._entries) else None,
        }

    def mark_helpful(self, episode_id: str):
        """Call this when a retry with a lesson improved the score."""
        for e in self._entries:
            if e.episode_id == episode_id:
                e.times_helpful += 1
        if self.persist_path:
            self._save()

    def clear(self, task: Optional[str] = None):
        """Forget lessons for one task, or everything when task is None."""
        if task:
            self._entries = [e for e in self._entries if e.task != task]
        else:
            self._entries = []
        if self.persist_path:
            self._save()

    # ── Persistence ─────────────────────────────────────────────────────────

    def _save(self):
        """Write all entries to persist_path as pretty-printed JSON."""
        with open(self.persist_path, "w") as f:
            json.dump([e.to_dict() for e in self._entries], f, indent=2)

    def _load(self):
        """Load entries from persist_path; on any error start with an empty bank."""
        try:
            with open(self.persist_path, "r") as f:
                data = json.load(f)
            self._entries = [MemoryEntry.from_dict(d) for d in data]
        except Exception:
            # Corrupt or unreadable file: deliberately best-effort, start fresh.
            self._entries = []

    def _trim(self):
        """Keep at most MAX_ENTRIES, dropping the least useful, oldest entries."""
        if len(self._entries) <= self.MAX_ENTRIES:
            return
        # Keep: most helpful first, then most retrieved, then newest.
        # BUG FIX: the previous version negated the keys AND passed
        # reverse=True, which double-negated the ordering and kept the
        # LEAST helpful entries instead.
        self._entries.sort(
            key=lambda e: (
                -e.times_helpful,
                -e.times_retrieved,
                -e.created_at,
            )
        )
        self._entries = self._entries[:self.MAX_ENTRIES]

    def _default_lesson(
        self, failure_type: str, score: float, strategy: str
    ) -> tuple[str, str, str, List[str]]:
        """Return (title, body, hint, plan) for a failure type with no plan."""
        lessons = {
            "NEVER_TESTED": (
                "Submitted without verification",
                "Agent submitted code without running tests. No confidence in correctness.",
                "CRITICAL: Run run_tests after EVERY write_file. Never submit without test verification.",
                ["1. Write fix", "2. run_tests to check", "3. If passing → submit", "4. If failing → re-read and fix"],
            ),
            "BLIND_WRITE": (
                "Wrote without reading",
                "Agent wrote to a file without reading it first. Blind writes introduce new bugs.",
                "NEVER use write_file before read_file on the same path.",
                ["1. read_file first", "2. Understand existing code", "3. Then write minimal fix"],
            ),
            "WRONG_FILE_NAVIGATION": (
                "Navigated to wrong files",
                "Agent read files unrelated to the bug. Wasted steps and missed root cause.",
                "ALWAYS start with the failing test file. Its imports show you exactly where to go.",
                ["1. Read failing test", "2. Find its imports", "3. Navigate ONLY there"],
            ),
            "LOOPING_BEHAVIOR": (
                "Read same files repeatedly",
                f"Agent looped reading the same files without progress. Score={score:.2f}.",
                "Each file may be read AT MOST ONCE. Use search_code if confused.",
                ["1. Use search_code with function name", "2. Read matched file — once", "3. commit to fix"],
            ),
        }
        defaults = lessons.get(failure_type, (
            f"{failure_type} failure",
            f"Agent failed with type '{failure_type}', score={score:.2f}.",
            "Read test → read source → fix → run_tests → submit.",
            ["1. read test", "2. read source", "3. write fix", "4. run_tests", "5. submit"],
        ))
        return defaults
351
+
352
+
353
# Module-level singleton: one memory bank shared by every Gradio session.
# Lessons persist to agent_memory.json one directory above this module.
_MEMORY_FILE = os.path.join(os.path.dirname(__file__), "..", "agent_memory.json")
_GLOBAL_MEMORY = EpisodicMemoryBank(persist_path=_MEMORY_FILE)


def get_global_memory() -> EpisodicMemoryBank:
    """Return the shared, process-wide EpisodicMemoryBank instance."""
    return _GLOBAL_MEMORY
static/viz3d.html CHANGED
@@ -6,862 +6,601 @@
6
  <title>Agent Trajectory 3D Visualizer</title>
7
  <style>
8
  * { margin: 0; padding: 0; box-sizing: border-box; }
9
- body {
 
10
  background: #0a0e1a;
11
  color: #e0e6f0;
12
  font-family: 'Segoe UI', system-ui, sans-serif;
13
  overflow: hidden;
14
- height: 100vh;
15
  }
16
- #canvas-container {
17
- position: absolute;
18
  top: 0; left: 0;
19
  width: 100%; height: 100%;
20
- }
21
- #ui-overlay {
22
- position: absolute;
23
- top: 0; left: 0;
24
- width: 100%; height: 100%;
25
- pointer-events: none;
26
- z-index: 10;
27
  }
28
  /* Header */
29
  #header {
30
- position: absolute;
31
- top: 12px; left: 50%;
32
  transform: translateX(-50%);
33
  text-align: center;
 
34
  pointer-events: none;
35
  }
36
  #header h1 {
37
- font-size: 16px;
38
- font-weight: 700;
39
  color: #7dd3fc;
40
  letter-spacing: 0.05em;
41
- text-shadow: 0 0 20px rgba(125,211,252,0.5);
42
- }
43
- #header p {
44
- font-size: 11px;
45
- color: #64748b;
46
- margin-top: 2px;
47
  }
48
- /* Legend */
49
- #legend {
50
- position: absolute;
51
- top: 12px; right: 16px;
52
- background: rgba(10,14,26,0.85);
53
  border: 1px solid rgba(125,211,252,0.2);
54
- border-radius: 8px;
55
  padding: 10px 14px;
56
  font-size: 11px;
57
- pointer-events: none;
58
- }
59
- #legend h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
60
- .legend-item {
61
- display: flex; align-items: center; gap: 8px;
62
- margin-bottom: 5px;
63
  }
64
- .legend-dot {
65
- width: 10px; height: 10px;
66
- border-radius: 50%;
67
- flex-shrink: 0;
68
  }
69
  /* Info panel */
70
- #info-panel {
71
- position: absolute;
72
- top: 12px; left: 16px;
73
- background: rgba(10,14,26,0.85);
74
- border: 1px solid rgba(125,211,252,0.2);
75
- border-radius: 8px;
76
- padding: 12px 16px;
77
- min-width: 220px;
78
- pointer-events: none;
79
- }
80
- #info-panel h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; letter-spacing: 0.1em; }
81
- .info-row {
82
- display: flex; justify-content: space-between; gap: 12px;
83
- font-size: 11px;
84
- margin-bottom: 4px;
85
- color: #94a3b8;
86
  }
87
- .info-value { color: #e0e6f0; font-weight: 600; }
 
 
 
 
88
  /* Timeline */
89
- #timeline-panel {
90
- position: absolute;
91
- bottom: 20px; left: 50%;
92
  transform: translateX(-50%);
93
- background: rgba(10,14,26,0.9);
94
- border: 1px solid rgba(125,211,252,0.2);
95
- border-radius: 10px;
96
- padding: 14px 20px;
97
- width: min(700px, 90vw);
98
- pointer-events: all;
99
- }
100
- #timeline-panel .tl-header {
101
- display: flex;
102
- justify-content: space-between;
103
- align-items: center;
104
- margin-bottom: 10px;
105
- }
106
- #timeline-panel h3 {
107
- font-size: 11px;
108
- color: #7dd3fc;
109
- letter-spacing: 0.1em;
110
- }
111
- #step-label {
112
- font-size: 12px;
113
- color: #f0abfc;
114
- font-weight: 700;
115
  }
116
- #timeline-slider {
117
- width: 100%;
118
- -webkit-appearance: none;
119
- height: 4px;
 
120
  background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
121
- border-radius: 4px;
122
- outline: none;
123
- cursor: pointer;
124
- }
125
- #timeline-slider::-webkit-slider-thumb {
126
- -webkit-appearance: none;
127
- width: 16px; height: 16px;
128
- border-radius: 50%;
129
- background: #7dd3fc;
130
- cursor: pointer;
131
- box-shadow: 0 0 10px rgba(125,211,252,0.7);
132
  }
133
- #step-actions {
134
- display: flex;
135
- gap: 8px;
136
- margin-top: 10px;
137
- justify-content: center;
138
  }
139
- .tl-btn {
 
140
  background: rgba(125,211,252,0.1);
141
  border: 1px solid rgba(125,211,252,0.3);
142
- color: #7dd3fc;
143
- padding: 5px 14px;
144
- border-radius: 6px;
145
- cursor: pointer;
146
- font-size: 11px;
147
- transition: all 0.2s;
148
  }
149
- .tl-btn:hover { background: rgba(125,211,252,0.25); }
150
- .tl-btn.active { background: rgba(125,211,252,0.3); }
151
- /* Step log */
152
- #step-log {
153
- position: absolute;
154
- bottom: 130px; right: 16px;
155
- background: rgba(10,14,26,0.85);
156
- border: 1px solid rgba(125,211,252,0.2);
157
- border-radius: 8px;
158
- padding: 10px 14px;
159
- width: 260px;
160
- max-height: 240px;
161
- overflow-y: auto;
162
- pointer-events: none;
163
- font-size: 10px;
164
- }
165
- #step-log h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
166
- .log-entry {
167
- display: flex;
168
- align-items: flex-start;
169
- gap: 6px;
170
- margin-bottom: 6px;
171
- padding-bottom: 6px;
172
- border-bottom: 1px solid rgba(255,255,255,0.05);
173
- }
174
- .log-entry:last-child { border-bottom: none; }
175
- .log-step { color: #475569; min-width: 28px; }
176
- .log-action { font-weight: 600; }
177
- .log-reward { margin-left: auto; font-weight: 700; }
178
- .reward-pos { color: #4ade80; }
179
- .reward-neg { color: #f87171; }
180
- .reward-zero { color: #94a3b8; }
181
  /* Tooltip */
182
  #tooltip {
183
- position: absolute;
184
  background: rgba(10,14,26,0.95);
185
  border: 1px solid rgba(125,211,252,0.4);
186
- border-radius: 6px;
187
- padding: 8px 12px;
188
- font-size: 11px;
189
- pointer-events: none;
190
- opacity: 0;
191
- transition: opacity 0.15s;
192
- max-width: 200px;
193
- z-index: 20;
194
- }
195
- #tooltip h4 { color: #7dd3fc; margin-bottom: 4px; }
196
- /* Score ring */
197
- #score-ring {
198
- position: absolute;
199
- bottom: 130px; left: 16px;
200
- pointer-events: none;
201
  }
202
- #score-ring svg text { font-family: 'Segoe UI', sans-serif; }
203
  /* Loader */
204
  #loader {
205
- position: absolute;
206
- top: 50%; left: 50%;
207
- transform: translate(-50%, -50%);
208
- color: #7dd3fc;
209
- font-size: 14px;
210
- text-align: center;
211
  }
212
- .loader-spinner {
213
- width: 40px; height: 40px;
214
- border: 3px solid rgba(125,211,252,0.2);
215
  border-top-color: #7dd3fc;
216
  border-radius: 50%;
217
- animation: spin 0.8s linear infinite;
218
- margin: 0 auto 12px;
 
 
 
 
 
 
219
  }
220
- @keyframes spin { to { transform: rotate(360deg); } }
221
  </style>
222
  </head>
223
  <body>
224
 
225
- <!-- Hidden data injection point -->
226
- <div id="viz-data" style="display:none"></div>
227
 
228
- <div id="canvas-container">
229
- <canvas id="three-canvas"></canvas>
 
 
 
230
  </div>
231
 
232
- <div id="loader">
233
- <div class="loader-spinner"></div>
234
- <p>Initializing 3D Visualizer...</p>
235
- </div>
236
 
237
- <div id="ui-overlay">
238
- <!-- Header -->
239
- <div id="header">
240
- <h1>๐Ÿ” Agent Trajectory Visualizer โ€” 3D</h1>
241
- <p>Files = nodes ยท Dependencies = edges ยท Agent path = animated beam</p>
242
- </div>
 
 
 
243
 
244
- <!-- Info panel -->
245
- <div id="info-panel">
246
- <h3>EPISODE STATS</h3>
247
- <div class="info-row"><span>Task</span><span class="info-value" id="stat-task">โ€”</span></div>
248
- <div class="info-row"><span>Variant</span><span class="info-value" id="stat-variant">โ€”</span></div>
249
- <div class="info-row"><span>Steps</span><span class="info-value" id="stat-steps">โ€”</span></div>
250
- <div class="info-row"><span>Score</span><span class="info-value" id="stat-score">โ€”</span></div>
251
- <div class="info-row"><span>Strategy</span><span class="info-value" id="stat-strategy">โ€”</span></div>
252
- <div class="info-row"><span>Failure</span><span class="info-value" id="stat-failure">โ€”</span></div>
253
- </div>
254
 
255
- <!-- Legend -->
256
- <div id="legend">
257
- <h3>LEGEND</h3>
258
- <div class="legend-item">
259
- <div class="legend-dot" style="background:#f97316"></div><span>Source file</span>
260
- </div>
261
- <div class="legend-item">
262
- <div class="legend-dot" style="background:#3b82f6"></div><span>Test file</span>
263
- </div>
264
- <div class="legend-item">
265
- <div class="legend-dot" style="background:#a855f7"></div><span>Spec / Docs</span>
266
- </div>
267
- <div class="legend-item">
268
- <div class="legend-dot" style="background:#22c55e"></div><span>Visited</span>
269
- </div>
270
- <div class="legend-item">
271
- <div class="legend-dot" style="background:#ef4444"></div><span>Modified / Bug</span>
272
- </div>
273
- <div class="legend-item">
274
- <div class="legend-dot" style="background:#facc15; width:20px; height:4px; border-radius:2px;"></div><span>Agent path</span>
275
- </div>
276
- </div>
277
 
278
- <!-- Score ring -->
279
- <div id="score-ring">
280
- <svg width="80" height="80" viewBox="0 0 80 80">
281
- <circle cx="40" cy="40" r="34" fill="none"
282
- stroke="rgba(125,211,252,0.15)" stroke-width="6"/>
283
- <circle id="score-arc" cx="40" cy="40" r="34" fill="none"
284
- stroke="#7dd3fc" stroke-width="6"
285
- stroke-dasharray="0 214"
286
- stroke-linecap="round"
287
- transform="rotate(-90 40 40)"
288
- style="transition: stroke-dasharray 1s ease;"/>
289
- <text id="score-text" x="40" y="45" text-anchor="middle"
290
- fill="#e0e6f0" font-size="14" font-weight="700">0.0</text>
291
- </svg>
292
- </div>
293
 
294
- <!-- Step log -->
295
- <div id="step-log">
296
- <h3>STEP LOG</h3>
297
- <div id="log-entries"></div>
298
- </div>
299
 
300
- <!-- Tooltip -->
301
- <div id="tooltip">
302
- <h4 id="tooltip-title">File</h4>
303
- <div id="tooltip-body"></div>
 
304
  </div>
305
-
306
- <!-- Timeline -->
307
- <div id="timeline-panel">
308
- <div class="tl-header">
309
- <h3>TIMELINE REPLAY</h3>
310
- <span id="step-label">Step 0 / 0</span>
311
- </div>
312
- <input type="range" id="timeline-slider" min="0" max="0" value="0"
313
- oninput="onSliderChange(this.value)">
314
- <div id="step-actions">
315
- <button class="tl-btn" onclick="stepBack()">โ—€ Back</button>
316
- <button class="tl-btn" id="play-btn" onclick="togglePlay()">โ–ถ Play</button>
317
- <button class="tl-btn" onclick="stepForward()">Forward โ–ถ</button>
318
- <button class="tl-btn" onclick="resetView()">โ†บ Reset</button>
319
- <button class="tl-btn" id="orbit-btn" onclick="toggleOrbit()">๐Ÿ”„ Orbit</button>
320
- </div>
321
  </div>
322
  </div>
323
 
324
- <!-- Three.js from CDN -->
325
  <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
326
  <script>
327
- // โ”€โ”€ Sample data (replaced by real data from backend) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
328
- const DEFAULT_DATA = {
329
- task: "task1",
330
- variant_id: "variant_1",
331
- final_score: 0.714,
332
- strategy: "TARGETED_DEBUGGING",
333
- failure_type: "CORRECT",
334
- files: [
335
- { name: "tests/test_formatter.py", type: "test" },
336
- { name: "src/formatter.py", type: "src", is_bug_file: true },
337
- { name: "src/utils.py", type: "src" }
338
- ],
339
- dependencies: [
340
- { from: "tests/test_formatter.py", to: "src/formatter.py" },
341
- { from: "src/formatter.py", to: "src/utils.py" }
342
- ],
343
- steps: [
344
- { step: 1, action: "read_file", path: "tests/test_formatter.py", reward: 0.0 },
345
- { step: 2, action: "read_file", path: "src/formatter.py", reward: 0.05 },
346
- { step: 3, action: "search_code", path: null, reward: 0.0 },
347
- { step: 4, action: "run_tests", path: "tests/test_formatter.py", reward: 0.0 },
348
- { step: 5, action: "submit", path: null, reward: 0.694 }
349
- ]
350
- };
351
-
352
- // โ”€โ”€ Load data from injection point or use default โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
353
- function loadVizData() {
354
- const el = document.getElementById('viz-data');
355
- if (el && el.textContent.trim()) {
356
- try { return JSON.parse(el.textContent); } catch(e) {}
357
- }
358
- return DEFAULT_DATA;
359
- }
360
-
361
- // โ”€โ”€ Three.js setup โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
362
  const canvas = document.getElementById('three-canvas');
363
- const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: true });
364
- renderer.setSize(window.innerWidth, window.innerHeight);
365
  renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
366
  renderer.setClearColor(0x0a0e1a, 1);
367
 
 
 
 
 
 
 
 
 
368
  const scene = new THREE.Scene();
369
- const fov = 60;
370
- const camera = new THREE.PerspectiveCamera(fov, window.innerWidth / window.innerHeight, 0.1, 1000);
371
- camera.position.set(0, 8, 22);
372
  camera.lookAt(0, 0, 0);
 
373
 
374
- // Ambient + directional light
375
- scene.add(new THREE.AmbientLight(0x1a2040, 1));
376
- const dirLight = new THREE.DirectionalLight(0x7dd3fc, 0.6);
377
- dirLight.position.set(5, 10, 5);
378
- scene.add(dirLight);
379
 
380
  // Grid
381
- const grid = new THREE.GridHelper(40, 20, 0x1e293b, 0x1e293b);
382
- grid.position.y = -3;
383
  scene.add(grid);
384
 
385
  // Stars
386
- const starGeo = new THREE.BufferGeometry();
387
- const starCount = 800;
388
- const starPositions = new Float32Array(starCount * 3);
389
- for (let i = 0; i < starCount * 3; i++) starPositions[i] = (Math.random() - 0.5) * 200;
390
- starGeo.setAttribute('position', new THREE.BufferAttribute(starPositions, 3));
391
- const starMat = new THREE.PointsMaterial({ color: 0x334155, size: 0.3 });
392
- scene.add(new THREE.Points(starGeo, starMat));
393
-
394
- // โ”€โ”€ Color palette โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
395
- const COLORS = {
396
- src: 0xf97316,
397
- test: 0x3b82f6,
398
- spec: 0xa855f7,
399
- visited: 0x22c55e,
400
- modified: 0xef4444,
401
- bug: 0xef4444,
402
- edge: 0x334155,
403
- path: 0xfacc15,
404
- agent: 0xfbbf24,
405
- };
406
-
407
- // โ”€โ”€ Orbit control (manual implementation) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
408
- let isOrbiting = false;
409
- let orbitActive = false;
410
- let mouse = { x: 0, y: 0, down: false, lastX: 0, lastY: 0 };
411
- let spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
412
-
413
- canvas.addEventListener('mousedown', e => { mouse.down = true; mouse.lastX = e.clientX; mouse.lastY = e.clientY; });
414
- canvas.addEventListener('mouseup', () => { mouse.down = false; });
415
- canvas.addEventListener('mousemove', e => {
416
- if (!mouse.down) {
417
- // Hover for tooltip
418
- checkHover(e.clientX, e.clientY);
419
- return;
420
- }
421
- const dx = e.clientX - mouse.lastX;
422
- const dy = e.clientY - mouse.lastY;
423
- spherical.theta -= dx * 0.005;
424
- spherical.phi = Math.max(0.1, Math.min(Math.PI / 2, spherical.phi - dy * 0.005));
425
- mouse.lastX = e.clientX;
426
- mouse.lastY = e.clientY;
427
  });
428
  canvas.addEventListener('wheel', e => {
429
- spherical.r = Math.max(8, Math.min(50, spherical.r + e.deltaY * 0.02));
430
  });
431
 
432
  function updateCamera() {
433
- if (orbitActive) spherical.theta += 0.003;
434
- camera.position.x = spherical.r * Math.sin(spherical.phi) * Math.sin(spherical.theta);
435
- camera.position.y = spherical.r * Math.cos(spherical.phi);
436
- camera.position.z = spherical.r * Math.sin(spherical.phi) * Math.cos(spherical.theta);
 
 
 
437
  camera.lookAt(0, 0, 0);
438
  }
439
 
440
- // โ”€โ”€ Scene objects โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
441
- const nodeObjects = {}; // name โ†’ { mesh, label, position }
442
- const edgeObjects = [];
443
- const pathObjects = [];
444
- let agentSphere = null;
445
- let agentTrail = null;
446
- let currentStep = 0;
447
- let maxStep = 0;
448
- let playing = false;
449
- let playInterval = null;
450
  let vizData = null;
451
- let nodePositions = {};
 
 
 
 
 
 
 
 
 
 
 
452
 
453
- // โ”€โ”€ Build scene from data โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
454
  function buildScene(data) {
 
455
  vizData = data;
456
-
457
- // Clear previous objects
458
- Object.values(nodeObjects).forEach(o => scene.remove(o.mesh));
459
- edgeObjects.forEach(e => scene.remove(e));
460
- pathObjects.forEach(p => scene.remove(p));
461
- if (agentSphere) scene.remove(agentSphere);
462
- Object.keys(nodeObjects).forEach(k => delete nodeObjects[k]);
463
-
464
  const files = data.files || [];
465
  const n = files.length;
466
- if (n === 0) return;
467
-
468
- // Arrange files in a circular layout on XZ plane
469
- files.forEach((file, i) => {
470
- const angle = (i / n) * Math.PI * 2;
471
- const radius = Math.max(4, n * 0.9);
472
- const x = Math.cos(angle) * radius;
473
- const z = Math.sin(angle) * radius;
474
- const y = 0;
475
-
476
- nodePositions[file.name] = new THREE.Vector3(x, y, z);
477
-
478
- // Sphere geometry
479
- const geo = new THREE.SphereGeometry(0.6, 16, 16);
480
- const color = new THREE.Color(
481
- file.is_bug_file ? COLORS.bug :
482
- file.type === 'test' ? COLORS.test :
483
- file.type === 'spec' ? COLORS.spec : COLORS.src
484
- );
485
  const mat = new THREE.MeshPhongMaterial({
486
- color,
487
- emissive: color.clone().multiplyScalar(0.3),
488
- shininess: 60,
489
- transparent: true,
490
- opacity: 0.9,
491
  });
492
  const mesh = new THREE.Mesh(geo, mat);
493
- mesh.position.set(x, y, z);
494
- mesh.userData = { file };
495
  scene.add(mesh);
496
 
497
- // Glow ring
498
- const ringGeo = new THREE.RingGeometry(0.75, 0.85, 32);
499
- const ringMat = new THREE.MeshBasicMaterial({
500
- color,
501
- transparent: true,
502
- opacity: 0.25,
503
- side: THREE.DoubleSide,
504
- });
505
- const ring = new THREE.Mesh(ringGeo, ringMat);
506
  ring.rotation.x = Math.PI / 2;
507
  mesh.add(ring);
508
 
509
- nodeObjects[file.name] = { mesh, position: nodePositions[file.name], file };
510
  });
511
 
512
- // Draw dependency edges
513
  (data.dependencies || []).forEach(dep => {
514
- const fromPos = nodePositions[dep.from];
515
- const toPos = nodePositions[dep.to];
516
- if (!fromPos || !toPos) return;
517
-
518
- const points = [fromPos.clone(), toPos.clone()];
519
- const geo = new THREE.BufferGeometry().setFromPoints(points);
520
- const mat = new THREE.LineBasicMaterial({
521
- color: COLORS.edge,
522
- transparent: true,
523
- opacity: 0.4,
524
- });
525
  const line = new THREE.Line(geo, mat);
526
  scene.add(line);
527
- edgeObjects.push(line);
528
  });
529
 
530
- // Agent globe
531
- const agentGeo = new THREE.SphereGeometry(0.35, 16, 16);
532
- const agentMat = new THREE.MeshPhongMaterial({
533
- color: COLORS.agent,
534
- emissive: 0xfbbf24,
535
- emissiveIntensity: 0.8,
536
- shininess: 100,
537
- });
538
- agentSphere = new THREE.Mesh(agentGeo, agentMat);
539
- agentSphere.position.set(0, 3, 0); // Start above origin
540
- scene.add(agentSphere);
541
-
542
- // Update UI
543
- document.getElementById('stat-task').textContent = data.task || 'โ€”';
544
- document.getElementById('stat-variant').textContent = data.variant_id || 'โ€”';
545
- document.getElementById('stat-steps').textContent = (data.steps || []).length;
546
- document.getElementById('stat-strategy').textContent = data.strategy || 'โ€”';
547
- document.getElementById('stat-failure').textContent = data.failure_type || 'โ€”';
548
  updateScore(data.final_score || 0);
549
- updateStepLog(data.steps || [], -1);
550
 
551
- // Setup timeline
552
  maxStep = (data.steps || []).length;
553
- const slider = document.getElementById('timeline-slider');
554
- slider.max = maxStep;
555
- slider.value = 0;
556
- currentStep = 0;
557
- updateStepLabel(0, maxStep);
558
-
559
  applyStep(0);
560
  }
561
 
562
- // โ”€โ”€ Animation: go to a specific step โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
563
- function applyStep(stepIndex) {
564
  if (!vizData) return;
565
  const steps = vizData.steps || [];
566
- const visitedFiles = new Set();
567
- const modifiedFiles = new Set();
568
 
569
  // Reset all nodes
570
- Object.values(nodeObjects).forEach(obj => {
571
- const file = obj.file;
572
- const baseColor = new THREE.Color(
573
- file.is_bug_file ? COLORS.bug :
574
- file.type === 'test' ? COLORS.test :
575
- file.type === 'spec' ? COLORS.spec : COLORS.src
576
- );
577
- obj.mesh.material.color.set(baseColor);
578
- obj.mesh.material.emissive.set(baseColor.clone().multiplyScalar(0.2));
579
- obj.mesh.material.opacity = 0.5;
580
- obj.mesh.scale.set(1, 1, 1);
581
  });
582
 
583
  // Remove old path lines
584
- pathObjects.forEach(p => scene.remove(p));
585
- pathObjects.length = 0;
586
-
587
- // Collect positions for path up to current step
588
- const pathPositions = [];
589
-
590
- for (let i = 0; i < stepIndex; i++) {
591
- const step = steps[i];
592
- if (!step) continue;
593
-
594
- if (step.path && nodeObjects[step.path]) {
595
- const pos = nodeObjects[step.path].position.clone();
596
- pathPositions.push(pos.clone().add(new THREE.Vector3(0, 0.1, 0)));
597
-
598
- if (step.action === 'read_file') visitedFiles.add(step.path);
599
- if (step.action === 'write_file') modifiedFiles.add(step.path);
600
  }
 
 
601
  }
602
 
603
- // Color visited + modified
604
- visitedFiles.forEach(name => {
605
- if (nodeObjects[name]) {
606
- nodeObjects[name].mesh.material.color.set(COLORS.visited);
607
- nodeObjects[name].mesh.material.emissive.set(
608
- new THREE.Color(COLORS.visited).multiplyScalar(0.4)
609
- );
610
- nodeObjects[name].mesh.material.opacity = 1.0;
611
- nodeObjects[name].mesh.scale.set(1.2, 1.2, 1.2);
612
  }
613
  });
614
- modifiedFiles.forEach(name => {
615
- if (nodeObjects[name]) {
616
- nodeObjects[name].mesh.material.color.set(COLORS.modified);
617
- nodeObjects[name].mesh.material.emissive.set(
618
- new THREE.Color(COLORS.modified).multiplyScalar(0.5)
619
- );
620
- nodeObjects[name].mesh.material.opacity = 1.0;
621
- nodeObjects[name].mesh.scale.set(1.4, 1.4, 1.4);
622
  }
623
  });
624
 
625
- // Draw path beam
626
- if (pathPositions.length >= 2) {
627
- const pathGeo = new THREE.BufferGeometry().setFromPoints(pathPositions);
628
- const pathMat = new THREE.LineBasicMaterial({
629
- color: COLORS.path,
630
- transparent: true,
631
- opacity: 0.85,
632
- linewidth: 2,
633
- });
634
- const pathLine = new THREE.Line(pathGeo, pathMat);
635
- scene.add(pathLine);
636
- pathObjects.push(pathLine);
637
  }
638
 
639
- // Move agent sphere
640
- if (stepIndex > 0 && stepIndex <= steps.length) {
641
- const currentStepData = steps[stepIndex - 1];
642
- if (currentStepData && currentStepData.path && nodeObjects[currentStepData.path]) {
643
- const targetPos = nodeObjects[currentStepData.path].position;
644
- agentSphere.position.set(targetPos.x, targetPos.y + 1.2, targetPos.z);
645
- } else {
646
- // No file target โ€” float in center (for search/submit actions)
647
- agentSphere.position.set(0, 2.5, 0);
648
- }
649
- } else {
650
- agentSphere.position.set(0, 3.5, 0);
651
  }
652
 
653
- // Highlight current node
654
- if (stepIndex > 0) {
655
- const cur = steps[stepIndex - 1];
656
- if (cur && cur.path && nodeObjects[cur.path]) {
657
- nodeObjects[cur.path].mesh.scale.set(1.6, 1.6, 1.6);
 
 
 
658
  }
 
 
659
  }
660
 
661
- updateStepLog(steps, stepIndex - 1);
662
- updateStepLabel(stepIndex, maxStep);
663
-
664
- // Update slider gradient
665
- const slider = document.getElementById('timeline-slider');
666
- const pct = maxStep > 0 ? (stepIndex / maxStep * 100) : 0;
667
- slider.style.setProperty('--pct', pct + '%');
668
  }
669
 
670
- // โ”€โ”€ Score ring โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
671
- function updateScore(score) {
672
- const circumference = 2 * Math.PI * 34;
673
- const arc = circumference * Math.min(1, Math.max(0, score));
674
- document.getElementById('score-arc').setAttribute(
675
- 'stroke-dasharray', `${arc} ${circumference}`
676
- );
677
- document.getElementById('score-text').textContent = score.toFixed(2);
678
- document.getElementById('stat-score').textContent = score.toFixed(3);
679
-
680
- // Color by score
681
- const color = score >= 0.7 ? '#4ade80' : score >= 0.4 ? '#fbbf24' : '#f87171';
682
- document.getElementById('score-arc').setAttribute('stroke', color);
683
  }
684
 
685
- // โ”€โ”€ Step log โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
686
- function updateStepLog(steps, currentIdx) {
687
- const container = document.getElementById('log-entries');
 
688
  container.innerHTML = '';
689
-
690
- const ACTION_EMOJI = {
691
- read_file: '๐Ÿ“–',
692
- write_file: 'โœ๏ธ',
693
- run_tests: '๐Ÿงช',
694
- search_code: '๐Ÿ”',
695
- submit: '๐Ÿ',
696
- };
697
-
698
- steps.forEach((step, i) => {
699
- const active = i === currentIdx;
700
- const past = i < currentIdx;
701
- const entry = document.createElement('div');
702
- entry.className = 'log-entry';
703
- entry.style.opacity = past ? '0.6' : active ? '1' : '0.35';
704
- if (active) entry.style.background = 'rgba(125,211,252,0.08)';
705
-
706
- const reward = step.reward || 0;
707
- const rewardClass = reward > 0 ? 'reward-pos' : reward < 0 ? 'reward-neg' : 'reward-zero';
708
- const emoji = ACTION_EMOJI[step.action] || 'โ€ข';
709
- const path = step.path ? step.path.split('/').pop() : step.action;
710
-
711
- entry.innerHTML = `
712
- <span class="log-step">S${step.step}</span>
713
- <span class="log-action" style="color:${active ? '#7dd3fc' : '#94a3b8'}">${emoji} ${path}</span>
714
- <span class="log-reward ${rewardClass}">${reward > 0 ? '+' : ''}${reward.toFixed(2)}</span>
715
- `;
716
- container.appendChild(entry);
717
  });
718
-
719
- // Auto-scroll to current
720
- if (currentIdx >= 0) {
721
- const entries = container.children;
722
- if (entries[currentIdx]) {
723
- entries[currentIdx].scrollIntoView({ block: 'nearest' });
724
- }
725
  }
726
  }
727
 
728
- // โ”€โ”€ Hover tooltip โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
729
- const raycaster = new THREE.Raycaster();
730
- const mouseVec = new THREE.Vector2();
731
- const tooltip = document.getElementById('tooltip');
732
 
733
  function checkHover(mx, my) {
734
- mouseVec.x = (mx / window.innerWidth) * 2 - 1;
735
- mouseVec.y = -(my / window.innerHeight) * 2 + 1;
736
- raycaster.setFromCamera(mouseVec, camera);
737
-
738
- const meshes = Object.values(nodeObjects).map(o => o.mesh);
739
- const hits = raycaster.intersectObjects(meshes);
740
-
741
- if (hits.length > 0) {
742
- const file = hits[0].object.userData.file;
743
- if (file) {
744
- tooltip.style.opacity = '1';
745
- tooltip.style.left = (mx + 14) + 'px';
746
- tooltip.style.top = (my - 14) + 'px';
747
- document.getElementById('tooltip-title').textContent = file.name;
748
- document.getElementById('tooltip-body').innerHTML = `
749
- Type: ${file.type}<br>
750
- ${file.is_bug_file ? 'โš ๏ธ Bug location' : ''}
751
- `;
752
- }
753
  } else {
754
- tooltip.style.opacity = '0';
755
- }
756
- }
757
-
758
- // โ”€โ”€ Timeline controls โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
759
- function onSliderChange(val) {
760
- currentStep = parseInt(val);
761
- applyStep(currentStep);
762
- }
763
-
764
- function stepForward() {
765
- if (currentStep < maxStep) {
766
- currentStep++;
767
- document.getElementById('timeline-slider').value = currentStep;
768
- applyStep(currentStep);
769
- }
770
- }
771
-
772
- function stepBack() {
773
- if (currentStep > 0) {
774
- currentStep--;
775
- document.getElementById('timeline-slider').value = currentStep;
776
- applyStep(currentStep);
777
  }
778
  }
779
 
 
 
 
 
780
  function togglePlay() {
781
  playing = !playing;
782
- const btn = document.getElementById('play-btn');
783
- btn.textContent = playing ? 'โธ Pause' : 'โ–ถ Play';
784
  if (playing) {
785
- if (currentStep >= maxStep) { currentStep = 0; }
786
- playInterval = setInterval(() => {
787
- if (currentStep >= maxStep) {
788
- playing = false;
789
- btn.textContent = 'โ–ถ Play';
790
- clearInterval(playInterval);
791
- return;
792
- }
793
- stepForward();
794
- }, 900);
795
  } else {
796
- clearInterval(playInterval);
797
  }
798
  }
799
-
800
  function toggleOrbit() {
801
- orbitActive = !orbitActive;
802
  const btn = document.getElementById('orbit-btn');
803
- btn.textContent = orbitActive ? 'โน Stop' : '๐Ÿ”„ Orbit';
804
- btn.classList.toggle('active', orbitActive);
805
  }
806
-
807
  function resetView() {
808
- spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
809
- currentStep = 0;
810
- document.getElementById('timeline-slider').value = 0;
811
  applyStep(0);
812
  }
 
813
 
814
- function updateStepLabel(step, max) {
815
- document.getElementById('step-label').textContent = `Step ${step} / ${max}`;
816
- }
817
-
818
- // โ”€โ”€ Animation loop โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
819
- let frame = 0;
820
  function animate() {
821
  requestAnimationFrame(animate);
822
  frame++;
823
-
824
  updateCamera();
825
-
826
- // Pulse agent sphere
827
- if (agentSphere) {
828
- const pulse = 1 + Math.sin(frame * 0.08) * 0.15;
829
- agentSphere.scale.set(pulse, pulse, pulse);
830
- agentSphere.rotation.y += 0.03;
831
- }
832
-
833
- // Subtle node oscillation
834
- Object.values(nodeObjects).forEach((obj, i) => {
835
- obj.mesh.position.y = obj.position.y + Math.sin(frame * 0.02 + i) * 0.05;
836
  });
837
-
838
  renderer.render(scene, camera);
839
  }
 
840
 
841
- // โ”€โ”€ Window resize โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
842
- window.addEventListener('resize', () => {
843
- camera.aspect = window.innerWidth / window.innerHeight;
844
- camera.updateProjectionMatrix();
845
- renderer.setSize(window.innerWidth, window.innerHeight);
846
- });
847
-
848
- // โ”€โ”€ Public API for Gradio integration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
849
- window.loadTrajectoryData = function(jsonData) {
850
  try {
851
- const data = typeof jsonData === 'string' ? JSON.parse(jsonData) : jsonData;
 
 
 
 
 
 
 
 
 
852
  buildScene(data);
 
853
  } catch(e) {
854
- console.error('Failed to load trajectory data:', e);
 
855
  }
856
- };
857
 
858
- // โ”€โ”€ Init โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
859
- document.addEventListener('DOMContentLoaded', () => {
860
- const data = loadVizData();
861
  buildScene(data);
862
  document.getElementById('loader').style.display = 'none';
863
- animate();
864
- });
 
 
 
865
  </script>
866
  </body>
867
  </html>
 
6
  <title>Agent Trajectory 3D Visualizer</title>
7
  <style>
8
  * { margin: 0; padding: 0; box-sizing: border-box; }
9
+ html, body {
10
+ width: 100%; height: 100%;
11
  background: #0a0e1a;
12
  color: #e0e6f0;
13
  font-family: 'Segoe UI', system-ui, sans-serif;
14
  overflow: hidden;
 
15
  }
16
+ #three-canvas {
17
+ position: fixed;
18
  top: 0; left: 0;
19
  width: 100%; height: 100%;
20
+ display: block;
 
 
 
 
 
 
21
  }
22
  /* Header */
23
  #header {
24
+ position: fixed;
25
+ top: 10px; left: 50%;
26
  transform: translateX(-50%);
27
  text-align: center;
28
+ z-index: 20;
29
  pointer-events: none;
30
  }
31
  #header h1 {
32
+ font-size: 14px; font-weight: 700;
 
33
  color: #7dd3fc;
34
  letter-spacing: 0.05em;
35
+ text-shadow: 0 0 16px rgba(125,211,252,0.6);
 
 
 
 
 
36
  }
37
+ /* Panel base */
38
+ .panel {
39
+ position: fixed;
40
+ background: rgba(10,14,26,0.88);
 
41
  border: 1px solid rgba(125,211,252,0.2);
42
+ border-radius: 10px;
43
  padding: 10px 14px;
44
  font-size: 11px;
45
+ z-index: 20;
46
+ backdrop-filter: blur(6px);
 
 
 
 
47
  }
48
+ .panel h3 {
49
+ font-size: 10px; letter-spacing: 0.1em;
50
+ color: #7dd3fc; margin-bottom: 8px;
51
+ text-transform: uppercase;
52
  }
53
  /* Info panel */
54
+ #info-panel { top: 10px; left: 14px; min-width: 190px; }
55
+ .info-row { display: flex; justify-content: space-between; gap: 10px; margin-bottom: 4px; color: #94a3b8; }
56
+ .info-val { color: #e0e6f0; font-weight: 600; max-width: 110px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
57
+ /* Legend */
58
+ #legend { top: 10px; right: 14px; }
59
+ .leg { display: flex; align-items: center; gap: 7px; margin-bottom: 5px; }
60
+ .leg-dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
61
+ .leg-line { width: 18px; height: 3px; border-radius: 2px; flex-shrink: 0; }
62
+ /* Score ring */
63
+ #score-ring { position: fixed; bottom: 150px; left: 14px; z-index: 20; }
64
+ /* Step log */
65
+ #step-log {
66
+ position: fixed; bottom: 150px; right: 14px;
67
+ width: 230px; max-height: 200px; overflow-y: auto;
68
+ z-index: 20;
 
69
  }
70
+ .log-e { display: flex; gap: 5px; margin-bottom: 5px; padding-bottom: 5px; border-bottom: 1px solid rgba(255,255,255,0.05); font-size: 10px; }
71
+ .log-e:last-child { border-bottom: none; }
72
+ .log-s { color: #475569; min-width: 24px; }
73
+ .log-a { font-weight: 600; flex: 1; }
74
+ .rp { color: #4ade80; } .rn { color: #f87171; } .rz { color: #94a3b8; }
75
  /* Timeline */
76
+ #timeline {
77
+ position: fixed; bottom: 16px; left: 50%;
 
78
  transform: translateX(-50%);
79
+ width: min(680px, 92vw);
80
+ z-index: 20;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  }
82
+ #tl-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }
83
+ #tl-header h3 { font-size: 10px; color: #7dd3fc; letter-spacing: 0.1em; }
84
+ #step-label { font-size: 11px; color: #f0abfc; font-weight: 700; }
85
+ #slider {
86
+ width: 100%; -webkit-appearance: none; height: 4px;
87
  background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
88
+ border-radius: 4px; outline: none; cursor: pointer;
 
 
 
 
 
 
 
 
 
 
89
  }
90
+ #slider::-webkit-slider-thumb {
91
+ -webkit-appearance: none; width: 15px; height: 15px;
92
+ border-radius: 50%; background: #7dd3fc; cursor: pointer;
93
+ box-shadow: 0 0 8px rgba(125,211,252,0.8);
 
94
  }
95
+ #tl-btns { display: flex; gap: 7px; margin-top: 8px; justify-content: center; }
96
+ .tb {
97
  background: rgba(125,211,252,0.1);
98
  border: 1px solid rgba(125,211,252,0.3);
99
+ color: #7dd3fc; padding: 4px 12px;
100
+ border-radius: 6px; cursor: pointer; font-size: 10px;
101
+ transition: all 0.15s;
 
 
 
102
  }
103
+ .tb:hover { background: rgba(125,211,252,0.25); }
104
+ .tb.active { background: rgba(125,211,252,0.3); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  /* Tooltip */
106
  #tooltip {
107
+ position: fixed; z-index: 30;
108
  background: rgba(10,14,26,0.95);
109
  border: 1px solid rgba(125,211,252,0.4);
110
+ border-radius: 6px; padding: 7px 11px;
111
+ font-size: 10px; pointer-events: none;
112
+ opacity: 0; transition: opacity 0.1s;
113
+ max-width: 180px;
 
 
 
 
 
 
 
 
 
 
 
114
  }
115
+ #tt-title { color: #7dd3fc; margin-bottom: 3px; font-weight: 700; }
116
  /* Loader */
117
  #loader {
118
+ position: fixed; top: 50%; left: 50%;
119
+ transform: translate(-50%,-50%);
120
+ text-align: center; z-index: 50; color: #7dd3fc; font-size: 13px;
 
 
 
121
  }
122
+ .spin {
123
+ width: 36px; height: 36px; margin: 0 auto 10px;
124
+ border: 3px solid rgba(125,211,252,0.15);
125
  border-top-color: #7dd3fc;
126
  border-radius: 50%;
127
+ animation: sp 0.7s linear infinite;
128
+ }
129
+ @keyframes sp { to { transform: rotate(360deg); } }
130
+ #no-data {
131
+ position: fixed; top: 50%; left: 50%;
132
+ transform: translate(-50%,-50%);
133
+ text-align: center; color: #475569; font-size: 13px;
134
+ display: none;
135
  }
 
136
  </style>
137
  </head>
138
  <body>
139
 
140
+ <canvas id="three-canvas"></canvas>
 
141
 
142
+ <div id="loader"><div class="spin"></div><p>Loading 3D...</p></div>
143
+ <div id="no-data">
144
+ <p style="font-size:28px;margin-bottom:12px">๐Ÿ”</p>
145
+ <p style="color:#7dd3fc;font-weight:700;margin-bottom:6px">No Episode Loaded</p>
146
+ <p>Run an episode first, then click<br><strong style="color:#7dd3fc">Load Trajectory</strong></p>
147
  </div>
148
 
149
+ <div id="header"><h1>๐Ÿ” Agent Trajectory Visualizer โ€” 3D</h1></div>
 
 
 
150
 
151
+ <!-- Info panel -->
152
+ <div class="panel" id="info-panel">
153
+ <h3>Episode Stats</h3>
154
+ <div class="info-row"><span>Task</span><span class="info-val" id="st-task">โ€”</span></div>
155
+ <div class="info-row"><span>Variant</span><span class="info-val" id="st-var">โ€”</span></div>
156
+ <div class="info-row"><span>Steps</span><span class="info-val" id="st-steps">โ€”</span></div>
157
+ <div class="info-row"><span>Score</span><span class="info-val" id="st-score">โ€”</span></div>
158
+ <div class="info-row"><span>Strategy</span><span class="info-val" id="st-strat">โ€”</span></div>
159
+ </div>
160
 
161
+ <!-- Legend -->
162
+ <div class="panel" id="legend">
163
+ <h3>Legend</h3>
164
+ <div class="leg"><div class="leg-dot" style="background:#f97316"></div><span>Source file</span></div>
165
+ <div class="leg"><div class="leg-dot" style="background:#3b82f6"></div><span>Test file</span></div>
166
+ <div class="leg"><div class="leg-dot" style="background:#a855f7"></div><span>Spec / Docs</span></div>
167
+ <div class="leg"><div class="leg-dot" style="background:#22c55e"></div><span>Visited</span></div>
168
+ <div class="leg"><div class="leg-dot" style="background:#ef4444"></div><span>Bug / Modified</span></div>
169
+ <div class="leg"><div class="leg-line" style="background:#facc15"></div><span>Agent path</span></div>
170
+ </div>
171
 
172
+ <!-- Score ring -->
173
+ <div id="score-ring">
174
+ <svg width="76" height="76" viewBox="0 0 76 76">
175
+ <circle cx="38" cy="38" r="30" fill="none" stroke="rgba(125,211,252,0.12)" stroke-width="6"/>
176
+ <circle id="score-arc" cx="38" cy="38" r="30" fill="none"
177
+ stroke="#7dd3fc" stroke-width="6"
178
+ stroke-dasharray="0 188"
179
+ stroke-linecap="round"
180
+ transform="rotate(-90 38 38)"
181
+ style="transition:stroke-dasharray 1.2s ease"/>
182
+ <text id="score-txt" x="38" y="43" text-anchor="middle"
183
+ fill="#e0e6f0" font-size="13" font-weight="700"
184
+ font-family="'Segoe UI',sans-serif">0.0</text>
185
+ </svg>
186
+ </div>
 
 
 
 
 
 
 
187
 
188
+ <!-- Step log -->
189
+ <div class="panel" id="step-log">
190
+ <h3>Step Log</h3>
191
+ <div id="log-list"></div>
192
+ </div>
 
 
 
 
 
 
 
 
 
 
193
 
194
+ <!-- Tooltip -->
195
+ <div id="tooltip"><div id="tt-title"></div><div id="tt-body"></div></div>
 
 
 
196
 
197
+ <!-- Timeline -->
198
+ <div class="panel" id="timeline">
199
+ <div id="tl-header">
200
+ <h3>Timeline Replay</h3>
201
+ <span id="step-label">Step 0 / 0</span>
202
  </div>
203
+ <input type="range" id="slider" min="0" max="0" value="0"
204
+ oninput="onSlider(this.value)">
205
+ <div id="tl-btns">
206
+ <button class="tb" onclick="stepBack()">โ—€ Back</button>
207
+ <button class="tb" id="play-btn" onclick="togglePlay()">โ–ถ Play</button>
208
+ <button class="tb" onclick="stepFwd()">Forward โ–ถ</button>
209
+ <button class="tb" onclick="resetView()">โ†บ Reset</button>
210
+ <button class="tb" id="orbit-btn" onclick="toggleOrbit()">๐Ÿ”„ Orbit</button>
 
 
 
 
 
 
 
 
211
  </div>
212
  </div>
213
 
 
214
  <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
215
  <script>
216
+ // โ”€โ”€ Renderer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  const canvas = document.getElementById('three-canvas');
218
+ const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: false });
 
219
  renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
220
  renderer.setClearColor(0x0a0e1a, 1);
221
 
222
+ function resize() {
223
+ renderer.setSize(window.innerWidth, window.innerHeight, false);
224
+ camera.aspect = window.innerWidth / window.innerHeight;
225
+ camera.updateProjectionMatrix();
226
+ }
227
+ window.addEventListener('resize', resize);
228
+
229
+ // โ”€โ”€ Scene + Camera โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
230
  const scene = new THREE.Scene();
231
+ const camera = new THREE.PerspectiveCamera(58, 1, 0.1, 1000);
232
+ camera.position.set(0, 8, 24);
 
233
  camera.lookAt(0, 0, 0);
234
+ resize();
235
 
236
+ // Lights
237
+ scene.add(new THREE.AmbientLight(0x1a2040, 1.2));
238
+ const dl = new THREE.DirectionalLight(0x7dd3fc, 0.5);
239
+ dl.position.set(5, 12, 5);
240
+ scene.add(dl);
241
 
242
  // Grid
243
+ const grid = new THREE.GridHelper(50, 25, 0x1e293b, 0x0f172a);
244
+ grid.position.y = -3.5;
245
  scene.add(grid);
246
 
247
  // Stars
248
+ (function() {
249
+ const geo = new THREE.BufferGeometry();
250
+ const pos = new Float32Array(900 * 3);
251
+ for (let i = 0; i < 900 * 3; i++) pos[i] = (Math.random() - 0.5) * 220;
252
+ geo.setAttribute('position', new THREE.BufferAttribute(pos, 3));
253
+ scene.add(new THREE.Points(geo, new THREE.PointsMaterial({ color: 0x1e3a5f, size: 0.25 })));
254
+ })();
255
+
256
+ // โ”€โ”€ Orbit controls (manual) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
257
+ let sph = { theta: 0, phi: 1.1, r: 24 };
258
+ let orbitAuto = false, dragging = false, lastX = 0, lastY = 0;
259
+
260
+ canvas.addEventListener('mousedown', e => { dragging = true; lastX = e.clientX; lastY = e.clientY; });
261
+ window.addEventListener('mouseup', () => { dragging = false; });
262
+ window.addEventListener('mousemove', e => {
263
+ if (dragging) {
264
+ sph.theta -= (e.clientX - lastX) * 0.006;
265
+ sph.phi = Math.max(0.15, Math.min(1.55, sph.phi - (e.clientY - lastY) * 0.006));
266
+ lastX = e.clientX; lastY = e.clientY;
267
+ } else { checkHover(e.clientX, e.clientY); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  });
269
  canvas.addEventListener('wheel', e => {
270
+ sph.r = Math.max(8, Math.min(55, sph.r + e.deltaY * 0.025));
271
  });
272
 
273
  function updateCamera() {
274
+ if (orbitAuto) sph.theta += 0.004;
275
+ const sin_p = Math.sin(sph.phi);
276
+ camera.position.set(
277
+ sph.r * sin_p * Math.sin(sph.theta),
278
+ sph.r * Math.cos(sph.phi),
279
+ sph.r * sin_p * Math.cos(sph.theta)
280
+ );
281
  camera.lookAt(0, 0, 0);
282
  }
283
 
284
+ // โ”€โ”€ Scene state โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
285
+ const COLS = { src:0xf97316, test:0x3b82f6, spec:0xa855f7, visited:0x22c55e, bug:0xef4444, agent:0xfbbf24, path:0xfacc15, edge:0x334155 };
286
+
287
+ let nodeMap = {}; // filename โ†’ { mesh, basePos }
288
+ let pathLines = [], edgeLines = [];
289
+ let agentMesh = null;
 
 
 
 
290
  let vizData = null;
291
+ let curStep = 0, maxStep = 0;
292
+ let playing = false, playTimer = null;
293
+ let frame = 0;
294
+
295
+ // โ”€โ”€ Build scene โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
296
+ function clearScene() {
297
+ Object.values(nodeMap).forEach(o => scene.remove(o.mesh));
298
+ pathLines.forEach(l => scene.remove(l));
299
+ edgeLines.forEach(l => scene.remove(l));
300
+ if (agentMesh) scene.remove(agentMesh);
301
+ nodeMap = {}; pathLines = []; edgeLines = []; agentMesh = null;
302
+ }
303
 
 
304
  function buildScene(data) {
305
+ clearScene();
306
  vizData = data;
 
 
 
 
 
 
 
 
307
  const files = data.files || [];
308
  const n = files.length;
309
+ if (!n) return;
310
+
311
+ // Layout: circle
312
+ files.forEach((f, i) => {
313
+ const angle = (i / n) * Math.PI * 2 - Math.PI / 2;
314
+ const R = Math.max(5, n * 1.0);
315
+ const x = Math.cos(angle) * R;
316
+ const z = Math.sin(angle) * R;
317
+ const pos = new THREE.Vector3(x, 0, z);
318
+
319
+ const baseColor = f.is_bug_file ? COLS.bug :
320
+ f.type === 'test' ? COLS.test :
321
+ f.type === 'spec' ? COLS.spec : COLS.src;
322
+ const col = new THREE.Color(baseColor);
323
+
324
+ // Main sphere
325
+ const geo = new THREE.SphereGeometry(0.55, 20, 20);
 
 
326
  const mat = new THREE.MeshPhongMaterial({
327
+ color: col, emissive: col.clone().multiplyScalar(0.25),
328
+ shininess: 70, transparent: true, opacity: 0.85,
 
 
 
329
  });
330
  const mesh = new THREE.Mesh(geo, mat);
331
+ mesh.position.copy(pos);
332
+ mesh.userData = { file: f, basePos: pos.clone() };
333
  scene.add(mesh);
334
 
335
+ // Ring halo
336
+ const rg = new THREE.RingGeometry(0.7, 0.82, 32);
337
+ const rm = new THREE.MeshBasicMaterial({ color: col, transparent: true, opacity: 0.2, side: THREE.DoubleSide });
338
+ const ring = new THREE.Mesh(rg, rm);
 
 
 
 
 
339
  ring.rotation.x = Math.PI / 2;
340
  mesh.add(ring);
341
 
342
+ nodeMap[f.name] = { mesh, basePos: pos.clone() };
343
  });
344
 
345
+ // Dependency edges
346
  (data.dependencies || []).forEach(dep => {
347
+ const a = nodeMap[dep.from], b = nodeMap[dep.to];
348
+ if (!a || !b) return;
349
+ const geo = new THREE.BufferGeometry().setFromPoints([a.basePos.clone(), b.basePos.clone()]);
350
+ const mat = new THREE.LineBasicMaterial({ color: COLS.edge, transparent: true, opacity: 0.35 });
 
 
 
 
 
 
 
351
  const line = new THREE.Line(geo, mat);
352
  scene.add(line);
353
+ edgeLines.push(line);
354
  });
355
 
356
+ // Agent sphere
357
+ const ag = new THREE.SphereGeometry(0.32, 14, 14);
358
+ const am = new THREE.MeshPhongMaterial({ color: COLS.agent, emissive: 0xfbbf24, emissiveIntensity: 0.9, shininess: 120 });
359
+ agentMesh = new THREE.Mesh(ag, am);
360
+ agentMesh.position.set(0, 3, 0);
361
+ scene.add(agentMesh);
362
+
363
+ // Update UI stats
364
+ document.getElementById('st-task').textContent = data.task || 'โ€”';
365
+ document.getElementById('st-var').textContent = (data.variant_id || 'โ€”').slice(0, 12);
366
+ document.getElementById('st-steps').textContent = (data.steps || []).length;
367
+ document.getElementById('st-strat').textContent = data.strategy || 'โ€”';
 
 
 
 
 
 
368
  updateScore(data.final_score || 0);
 
369
 
 
370
  maxStep = (data.steps || []).length;
371
+ const sl = document.getElementById('slider');
372
+ sl.max = maxStep; sl.value = 0;
373
+ curStep = 0;
374
+ updateLabel(0, maxStep);
 
 
375
  applyStep(0);
376
  }
377
 
378
+ // โ”€โ”€ Apply step โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
379
+ function applyStep(idx) {
380
  if (!vizData) return;
381
  const steps = vizData.steps || [];
 
 
382
 
383
  // Reset all nodes
384
+ Object.values(nodeMap).forEach(({ mesh, basePos: _ }) => {
385
+ const f = mesh.userData.file;
386
+ const bc = f.is_bug_file ? COLS.bug : f.type === 'test' ? COLS.test : f.type === 'spec' ? COLS.spec : COLS.src;
387
+ mesh.material.color.set(bc);
388
+ mesh.material.emissive.set(new THREE.Color(bc).multiplyScalar(0.2));
389
+ mesh.material.opacity = 0.55;
390
+ mesh.scale.setScalar(1);
 
 
 
 
391
  });
392
 
393
  // Remove old path lines
394
+ pathLines.forEach(l => scene.remove(l));
395
+ pathLines = [];
396
+
397
+ // Collect positions for path
398
+ const pathPts = [];
399
+ const visited = new Set(), modified = new Set();
400
+
401
+ for (let i = 0; i < idx; i++) {
402
+ const s = steps[i];
403
+ if (!s) continue;
404
+ if (s.path && nodeMap[s.path]) {
405
+ const p = nodeMap[s.path].basePos.clone().add(new THREE.Vector3(0, 0.15, 0));
406
+ pathPts.push(p);
 
 
 
407
  }
408
+ if (s.action === 'read_file' && s.path) visited.add(s.path);
409
+ if (s.action === 'write_file' && s.path) modified.add(s.path);
410
  }
411
 
412
+ // Color visited/modified
413
+ visited.forEach(name => {
414
+ if (nodeMap[name]) {
415
+ nodeMap[name].mesh.material.color.set(COLS.visited);
416
+ nodeMap[name].mesh.material.emissive.set(new THREE.Color(COLS.visited).multiplyScalar(0.4));
417
+ nodeMap[name].mesh.material.opacity = 1;
418
+ nodeMap[name].mesh.scale.setScalar(1.25);
 
 
419
  }
420
  });
421
+ modified.forEach(name => {
422
+ if (nodeMap[name]) {
423
+ nodeMap[name].mesh.material.color.set(COLS.bug);
424
+ nodeMap[name].mesh.material.emissive.set(new THREE.Color(COLS.bug).multiplyScalar(0.5));
425
+ nodeMap[name].mesh.material.opacity = 1;
426
+ nodeMap[name].mesh.scale.setScalar(1.45);
 
 
427
  }
428
  });
429
 
430
+ // Highlight current node
431
+ if (idx > 0 && idx <= steps.length) {
432
+ const cur = steps[idx - 1];
433
+ if (cur && cur.path && nodeMap[cur.path]) {
434
+ nodeMap[cur.path].mesh.scale.setScalar(1.65);
435
+ }
 
 
 
 
 
 
436
  }
437
 
438
+ // Draw path
439
+ if (pathPts.length >= 2) {
440
+ const geo = new THREE.BufferGeometry().setFromPoints(pathPts);
441
+ const mat = new THREE.LineBasicMaterial({ color: COLS.path, transparent: true, opacity: 0.9 });
442
+ const line = new THREE.Line(geo, mat);
443
+ scene.add(line); pathLines.push(line);
 
 
 
 
 
 
444
  }
445
 
446
+ // Move agent
447
+ if (idx > 0 && idx <= steps.length) {
448
+ const cur = steps[idx - 1];
449
+ if (cur && cur.path && nodeMap[cur.path]) {
450
+ const tp = nodeMap[cur.path].basePos;
451
+ agentMesh.position.set(tp.x, tp.y + 1.3, tp.z);
452
+ } else {
453
+ agentMesh.position.set(0, 2.5, 0);
454
  }
455
+ } else {
456
+ agentMesh.position.set(0, 3.5, 0);
457
  }
458
 
459
+ updateLog(steps, idx - 1);
460
+ updateLabel(idx, maxStep);
461
+ const sl = document.getElementById('slider');
462
+ sl.style.setProperty('--pct', (maxStep > 0 ? idx / maxStep * 100 : 0) + '%');
 
 
 
463
  }
464
 
465
+ // โ”€โ”€ Score ring โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
466
+ function updateScore(s) {
467
+ const c = 2 * Math.PI * 30;
468
+ const arc = c * Math.min(1, Math.max(0, s));
469
+ document.getElementById('score-arc').setAttribute('stroke-dasharray', `${arc} ${c}`);
470
+ document.getElementById('score-txt').textContent = s.toFixed(2);
471
+ document.getElementById('st-score').textContent = s.toFixed(3);
472
+ const col = s >= 0.7 ? '#4ade80' : s >= 0.4 ? '#fbbf24' : '#f87171';
473
+ document.getElementById('score-arc').setAttribute('stroke', col);
 
 
 
 
474
  }
475
 
476
+ // โ”€โ”€ Step log โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
477
+ function updateLog(steps, curIdx) {
478
+ const em = { read_file:'๐Ÿ“–', write_file:'โœ๏ธ', run_tests:'๐Ÿงช', search_code:'๐Ÿ”', submit:'๐Ÿ' };
479
+ const container = document.getElementById('log-list');
480
  container.innerHTML = '';
481
+ steps.forEach((s, i) => {
482
+ const e = document.createElement('div');
483
+ e.className = 'log-e';
484
+ e.style.opacity = i < curIdx ? '0.55' : i === curIdx ? '1' : '0.3';
485
+ if (i === curIdx) e.style.background = 'rgba(125,211,252,0.07)';
486
+ const r = s.reward || 0;
487
+ const rc = r > 0 ? 'rp' : r < 0 ? 'rn' : 'rz';
488
+ const name = (s.path || s.action || '').split('/').pop() || s.action;
489
+ e.innerHTML = `<span class="log-s">S${s.step}</span><span class="log-a">${em[s.action]||'โ€ข'} ${name}</span><span class="${rc}">${r>0?'+':''}${r.toFixed(2)}</span>`;
490
+ container.appendChild(e);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
  });
492
+ if (curIdx >= 0 && container.children[curIdx]) {
493
+ container.children[curIdx].scrollIntoView({ block: 'nearest' });
 
 
 
 
 
494
  }
495
  }
496
 
497
+ // โ”€โ”€ Hover tooltip โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
498
+ const ray = new THREE.Raycaster();
499
+ const mv = new THREE.Vector2();
500
+ const tt = document.getElementById('tooltip');
501
 
502
  function checkHover(mx, my) {
503
+ mv.x = (mx / window.innerWidth) * 2 - 1;
504
+ mv.y = -(my / window.innerHeight) * 2 + 1;
505
+ ray.setFromCamera(mv, camera);
506
+ const meshes = Object.values(nodeMap).map(o => o.mesh);
507
+ const hits = ray.intersectObjects(meshes);
508
+ if (hits.length) {
509
+ const f = hits[0].object.userData.file;
510
+ tt.style.opacity = '1';
511
+ tt.style.left = (mx + 12) + 'px';
512
+ tt.style.top = (my - 8) + 'px';
513
+ document.getElementById('tt-title').textContent = f.name;
514
+ document.getElementById('tt-body').innerHTML =
515
+ `Type: ${f.type}${f.is_bug_file ? '<br>โš ๏ธ Bug location' : ''}${f.visited ? '<br>โœ… Visited' : ''}`;
 
 
 
 
 
 
516
  } else {
517
+ tt.style.opacity = '0';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
  }
519
  }
520
 
521
+ // โ”€โ”€ Controls โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
522
+ function onSlider(v) { curStep = +v; applyStep(curStep); }
523
+ function stepFwd() { if (curStep < maxStep) { curStep++; document.getElementById('slider').value = curStep; applyStep(curStep); } }
524
+ function stepBack() { if (curStep > 0) { curStep--; document.getElementById('slider').value = curStep; applyStep(curStep); } }
525
  function togglePlay() {
526
  playing = !playing;
527
+ document.getElementById('play-btn').textContent = playing ? 'โธ Pause' : 'โ–ถ Play';
 
528
  if (playing) {
529
+ if (curStep >= maxStep) curStep = 0;
530
+ playTimer = setInterval(() => {
531
+ if (curStep >= maxStep) { playing = false; document.getElementById('play-btn').textContent = 'โ–ถ Play'; clearInterval(playTimer); return; }
532
+ stepFwd();
533
+ }, 850);
 
 
 
 
 
534
  } else {
535
+ clearInterval(playTimer);
536
  }
537
  }
 
538
  function toggleOrbit() {
539
+ orbitAuto = !orbitAuto;
540
  const btn = document.getElementById('orbit-btn');
541
+ btn.textContent = orbitAuto ? 'โน Stop' : '๐Ÿ”„ Orbit';
542
+ btn.classList.toggle('active', orbitAuto);
543
  }
 
544
  function resetView() {
545
+ sph = { theta: 0, phi: 1.1, r: 24 };
546
+ curStep = 0;
547
+ document.getElementById('slider').value = 0;
548
  applyStep(0);
549
  }
550
+ function updateLabel(s, m) { document.getElementById('step-label').textContent = `Step ${s} / ${m}`; }
551
 
552
+ // โ”€โ”€ Animation loop โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
 
 
 
553
  function animate() {
554
  requestAnimationFrame(animate);
555
  frame++;
 
556
  updateCamera();
557
+ // Pulsing agent
558
+ if (agentMesh) {
559
+ const p = 1 + Math.sin(frame * 0.09) * 0.18;
560
+ agentMesh.scale.setScalar(p);
561
+ agentMesh.rotation.y += 0.04;
562
+ }
563
+ // Subtle node float
564
+ Object.values(nodeMap).forEach(({ mesh, basePos }, i) => {
565
+ mesh.position.y = basePos.y + Math.sin(frame * 0.018 + i * 1.1) * 0.07;
 
 
566
  });
 
567
  renderer.render(scene, camera);
568
  }
569
+ animate();
570
 
571
+ // โ”€โ”€ Load data from API โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
572
+ async function fetchAndLoad() {
573
+ document.getElementById('loader').style.display = 'block';
574
+ document.getElementById('no-data').style.display = 'none';
 
 
 
 
 
575
  try {
576
+ // Try to determine base URL from window location
577
+ const base = window.location.origin;
578
+ const res = await fetch(`${base}/viz-data`, { cache: 'no-store' });
579
+ if (!res.ok) throw new Error('no data');
580
+ const data = await res.json();
581
+ if (data.error || !data.files || data.files.length === 0) {
582
+ document.getElementById('loader').style.display = 'none';
583
+ document.getElementById('no-data').style.display = 'block';
584
+ return;
585
+ }
586
  buildScene(data);
587
+ document.getElementById('loader').style.display = 'none';
588
  } catch(e) {
589
+ document.getElementById('loader').style.display = 'none';
590
+ document.getElementById('no-data').style.display = 'block';
591
  }
592
+ }
// โ”€โ”€ Public API (can be called from parent window) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€

// Accepts either a parsed object or a JSON string; silently ignores
// unparseable strings, otherwise builds the scene and hides the overlays.
window.loadData = function (data) {
  if (typeof data === 'string') {
    try {
      data = JSON.parse(data);
    } catch (e) {
      return; // bad JSON from the parent: do nothing
    }
  }
  buildScene(data);
  document.getElementById('loader').style.display = 'none';
  document.getElementById('no-data').style.display = 'none';
};

// Auto-load on init
window.addEventListener('load', fetchAndLoad);
604
  </script>
605
  </body>
606
  </html>