Chirag0123 committed on
Commit
dfbd16e
·
1 Parent(s): f7185e1

v3.0 — Intelligence layer: failure classification, strategy detection, advanced metrics, self-improvement, multi-agent comparison, 3D visualizer

Browse files
app.py CHANGED
@@ -1,80 +1,92 @@
1
  #!/usr/bin/env python3
2
  """
3
- app.py — Gradio UI + FastAPI endpoints for the OpenEnv environment.
4
- This is the HF Space entry point.
 
 
 
 
 
 
 
 
 
5
  """
6
  import os
7
  import json
8
  import gradio as gr
9
  from server.environment import CodebaseNavEnvironment
10
  from server.models import RepoAction
 
 
 
 
 
11
 
12
- # ── Global environment instance ──────────────────────────────────────────────
13
  env = CodebaseNavEnvironment()
 
 
 
 
 
14
 
15
 
16
- # ── Gradio callback functions ────────────────────────────────────────────────
17
 
18
  def reset_environment(task: str):
19
- """Reset environment and return initial state."""
20
  try:
21
  result = env.reset(task=task)
22
  obs = result.observation
23
  tree = "\n".join(f" 📄 {f}" for f in obs.repo_tree)
24
  failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None listed"
25
- info_data = result.info
26
-
27
- status_text = (
28
- f" Episode started\n"
29
- f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
30
- f"Task: {task}\n"
31
- f"Variant: {info_data.get('variant_id', 'unknown')}\n"
32
- f"Steps remaining: {obs.steps_remaining}\n"
33
- f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
34
- f"📁 Repository Files:\n{tree}\n\n"
 
35
  f"🔴 Failing Tests: {failing}\n\n"
36
- f"📋 Task: {obs.task_description}"
37
  )
38
- return status_text, "", "0", "0.000"
39
  except Exception as e:
40
  return f"❌ Error: {e}", "", "0", "0.000"
41
 
42
 
43
  def take_step(action_type: str, path: str, query: str, content: str):
44
- """Execute one agent step."""
45
  if env.done:
46
- return "❌ Episode is done. Reset first.", "", "", ""
47
-
48
  try:
49
  action = RepoAction(
50
  action_type=action_type,
51
- path=path if path.strip() else None,
52
- query=query if query.strip() else None,
53
- content=content if content.strip() else None,
54
  )
55
  result = env.step(action)
56
  obs = result.observation
57
-
58
- action_result = obs.last_action_result or "No output"
59
- error = obs.last_action_error or ""
60
- if error:
61
- error = f"⚠️ {error}"
62
 
63
  status = (
64
  f"Step {result.info['steps_taken']} | "
65
  f"Reward: {result.reward:+.3f} | "
66
- f"Steps left: {obs.steps_remaining}"
67
  )
68
  if result.done:
69
- status += f"\n\n🏁 EPISODE DONE — Final Score: {result.info['final_score']:.3f}"
70
-
71
- flags = result.info.get("security_flags", [])
72
- if flags:
73
- status += f"\n🔒 Security: {flags}"
74
 
75
  return (
76
  status,
77
- action_result[:3000],
78
  str(result.info["steps_taken"]),
79
  f"{result.info.get('cumulative_reward', 0):.3f}",
80
  )
@@ -82,261 +94,567 @@ def take_step(action_type: str, path: str, query: str, content: str):
82
  return f"❌ Error: {e}", "", "", ""
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def get_evaluation():
86
- """Get multi-dimensional evaluation report."""
87
  try:
88
  ev = env.get_evaluation()
89
  if "error" in ev:
90
  return "No evaluation available. Run an episode first."
91
-
92
  lines = [
93
  f"🎯 Composite Score: {ev['composite_score']:.3f}",
94
- "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
95
  ]
96
  for name, dim in ev.get("dimensions", {}).items():
97
  bar = "█" * int(dim["score"] * 20) + "░" * (20 - int(dim["score"] * 20))
98
  lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
99
- for e in dim.get("evidence", []):
100
  lines.append(f" → {e}")
101
-
102
  if ev.get("strengths"):
103
- lines.append("\n💪 Strengths:")
104
- for s in ev["strengths"]:
105
- lines.append(f" ✅ {s}")
106
-
107
  if ev.get("failure_analysis"):
108
- lines.append("\n⚠️ Failures:")
109
- for f in ev["failure_analysis"]:
110
- lines.append(f" ❌ {f}")
111
-
112
  if ev.get("recommendations"):
113
- lines.append("\n💡 Recommendations:")
114
- for r in ev["recommendations"]:
115
- lines.append(f" → {r}")
116
-
117
  return "\n".join(lines)
118
  except Exception as e:
119
  return f"Error: {e}"
120
 
121
 
122
  def get_metrics():
123
- """Get comprehensive metrics."""
124
  try:
125
- m = env.get_metrics()
126
- return json.dumps(m, indent=2, default=str)
127
  except Exception as e:
128
  return f"Error: {e}"
129
 
130
 
131
  def get_trajectory():
132
- """Get full trajectory."""
133
  try:
134
  t = env.get_trajectory()
135
  if not t:
136
- return "No trajectory available."
137
-
138
  lines = [
139
- f"Episode: {t.get('episode_id', 'N/A')}",
140
- f"Task: {t.get('task', 'N/A')} | Variant: {t.get('variant_id', 'N/A')}",
141
- f"Duration: {t.get('duration_seconds', 'N/A')}s | Score: {t.get('final_score', 0):.3f}",
142
- "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
143
  ]
 
 
144
  for step in t.get("steps", []):
145
- emoji = "📖" if step["action_type"] == "read_file" else \
146
- "✏️" if step["action_type"] == "write_file" else \
147
- "🧪" if step["action_type"] == "run_tests" else \
148
- "🔍" if step["action_type"] == "search_code" else "🏁"
149
- path = step.get("action_path") or step.get("action_query") or ""
150
- err = f" ❌ {step['error']}" if step.get("error") else ""
151
  lines.append(
152
- f" {emoji} Step {step['step_number']:2d}: "
153
- f"{step['action_type']:12s} {path:30s} "
154
- f"reward={step['reward']:+.3f} "
155
- f"({step['duration_ms']:.0f}ms){err}"
156
  )
157
  return "\n".join(lines)
158
  except Exception as e:
159
  return f"Error: {e}"
160
 
161
 
162
- def run_builtin_agent(task: str):
163
- """Run the built-in deterministic agent for a quick demo."""
 
164
  try:
165
- # Reset
166
- result = env.reset(task=task)
167
- obs = result.observation
168
- log_lines = [f"🚀 Starting {task} (variant: {result.info.get('variant_id')})"]
169
- log_lines.append(f" Files: {obs.repo_tree}")
170
- log_lines.append(f" Failing: {obs.failing_tests}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- # Strategy: read test file → read source → fix → run tests ��� submit
173
- test_files = [f for f in obs.repo_tree if f.startswith("tests/")]
174
- src_files = [f for f in obs.repo_tree if f.startswith("src/") and f.endswith(".py")]
175
- spec_files = [f for f in obs.repo_tree if f.endswith(".md")]
176
 
177
- steps_done = 0
178
- max_demo_steps = 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
- # Step 1: read spec or test
181
- if task == "task3" and spec_files:
182
- target = spec_files[0]
183
- elif test_files:
184
- target = test_files[0]
185
- else:
186
- target = obs.repo_tree[0]
187
 
188
- step_result = env.step(RepoAction(action_type="read_file", path=target))
189
- steps_done += 1
190
- log_lines.append(f" Step {steps_done}: read_file {target} → reward={step_result.reward:+.3f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- # Step 2+: read all source files
193
- for sf in src_files:
194
- if env.done or steps_done >= max_demo_steps - 2:
195
- break
196
- step_result = env.step(RepoAction(action_type="read_file", path=sf))
197
- steps_done += 1
198
- log_lines.append(f" Step {steps_done}: read_file {sf} → reward={step_result.reward:+.3f}")
199
-
200
- # Step N-1: run tests
201
- if not env.done and steps_done < max_demo_steps - 1:
202
- step_result = env.step(RepoAction(action_type="run_tests"))
203
- steps_done += 1
204
- log_lines.append(f" Step {steps_done}: run_tests → reward={step_result.reward:+.3f}")
205
-
206
- # Step N: submit
207
- if not env.done:
208
- step_result = env.step(RepoAction(action_type="submit"))
209
- steps_done += 1
210
- log_lines.append(f" Step {steps_done}: submit reward={step_result.reward:+.3f}")
 
 
 
 
 
211
 
212
- log_lines.append(f"\n🏁 Final Score: {env.final_score:.3f}")
213
- log_lines.append(f" Total Steps: {steps_done}")
214
- log_lines.append(f" Cumulative Reward: {env.cumulative_reward:.3f}")
215
 
216
- return "\n".join(log_lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  except Exception as e:
218
  return f"❌ Error: {e}"
219
 
220
 
221
- # ── Build the Gradio UI ─────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- with gr.Blocks(
224
- title="Codebase Navigation & Repair OpenEnv",
225
- ) as demo:
 
226
  gr.Markdown(
227
- "# 🔍 Codebase Navigation & Repair — OpenEnv\n"
228
- "**RL environment for testing AI coding agents.** "
229
- "Agents navigate repos, find bugs, and fix them graded by actual pytest execution."
230
  )
231
 
232
  with gr.Tabs():
233
- # ── Tab 1: Interactive Environment ────────────────────────────────
 
234
  with gr.TabItem("🎮 Interactive"):
235
  with gr.Row():
236
  with gr.Column(scale=1):
237
  task_select = gr.Dropdown(
238
- choices=["task1", "task2", "task3"],
239
- value="task1",
240
  label="Task",
241
- info="task1=single-file bugs, task2=cross-module, task3=feature impl"
242
  )
243
  reset_btn = gr.Button("🔄 Reset Environment", variant="primary")
244
-
245
- gr.Markdown("### Take an Action")
246
- action_type = gr.Dropdown(
247
- choices=["read_file", "write_file", "run_tests", "search_code", "submit"],
248
- value="read_file",
249
- label="Action Type",
250
  )
251
- action_path = gr.Textbox(label="Path (for read/write/run_tests)", placeholder="src/auth.py")
252
- action_query = gr.Textbox(label="Query (for search_code)", placeholder="validate_token")
253
- action_content = gr.Textbox(label="Content (for write_file)", lines=5, placeholder="# new file content...")
254
  step_btn = gr.Button("▶️ Execute Step", variant="secondary")
255
-
256
  with gr.Column(scale=2):
257
- status_box = gr.Textbox(label="Status", lines=15, interactive=False)
258
- result_box = gr.Textbox(label="Last Action Result", lines=10, interactive=False)
259
  with gr.Row():
260
- steps_box = gr.Textbox(label="Steps Taken", value="0", interactive=False)
261
  reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
 
 
 
 
 
 
 
 
 
 
262
 
263
- reset_btn.click(
264
- reset_environment, inputs=[task_select],
265
- outputs=[status_box, result_box, steps_box, reward_box],
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  )
267
- step_btn.click(
268
- take_step,
269
- inputs=[action_type, action_path, action_query, action_content],
270
- outputs=[status_box, result_box, steps_box, reward_box],
 
 
 
 
 
 
 
 
 
 
 
271
  )
 
 
 
272
 
273
- # ── Tab 2: Run Agent ─────────────────────────────────────────────
274
- with gr.TabItem("🤖 Run Agent"):
275
  gr.Markdown(
276
- "### Built-in Demonstration Agent\n"
277
- "Runs a deterministic read-all-then-submit agent. "
278
- "For LLM-based agent, use `run_agent.py` or `inference.py`."
279
  )
280
- agent_task = gr.Dropdown(
281
- choices=["task1", "task2", "task3"], value="task1", label="Task"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  )
283
- run_btn = gr.Button("🚀 Run Agent", variant="primary")
284
- agent_output = gr.Textbox(label="Agent Log", lines=20, interactive=False)
285
- run_btn.click(run_builtin_agent, inputs=[agent_task], outputs=[agent_output])
286
 
287
- # ── Tab 3: Evaluation Dashboard ──────────────────────────────────
288
- with gr.TabItem("📊 Evaluation"):
289
- with gr.Row():
290
- eval_btn = gr.Button("🎯 Get Evaluation", variant="primary")
291
- metrics_btn = gr.Button("📈 Get Metrics", variant="secondary")
292
- traj_btn = gr.Button("🗺️ Get Trajectory", variant="secondary")
293
- eval_output = gr.Textbox(label="Evaluation Report", lines=25, interactive=False)
294
- eval_btn.click(get_evaluation, outputs=[eval_output])
295
- metrics_btn.click(get_metrics, outputs=[eval_output])
296
- traj_btn.click(get_trajectory, outputs=[eval_output])
297
-
298
- # ── Tab 4: API Docs ──────────────────────────────────────────────
299
  with gr.TabItem("📖 API"):
300
  gr.Markdown("""
301
- ### REST API Endpoints
302
-
303
- The FastAPI endpoints are mounted alongside this UI at `/api/`.
304
 
 
305
  | Endpoint | Method | Description |
306
  |----------|--------|-------------|
307
- | `/api/reset?task=task1` | POST | Start new episode |
308
- | `/api/step` | POST | Take action (JSON body) |
309
- | `/api/state` | GET | Get current state |
310
- | `/api/health` | GET | Health check |
311
- | `/api/trajectory` | GET | Full action log |
312
- | `/api/evaluate` | GET | Multi-dimensional scores |
313
- | `/api/metrics` | GET | Comprehensive stats |
314
- | `/api/fault-config` | POST | Enable fault injection |
315
-
316
- ### Example: Reset + Read + Submit
317
- ```bash
318
- BASE="https://YOUR-SPACE.hf.space/api"
319
-
320
- # Reset
321
- curl -X POST "$BASE/reset?task=task1"
322
 
323
- # Read a file
324
- curl -X POST "$BASE/step" -H "Content-Type: application/json" \\
325
- -d '{"action_type":"read_file","path":"src/auth.py"}'
 
 
 
 
326
 
327
- # Submit
328
- curl -X POST "$BASE/step" -H "Content-Type: application/json" \\
329
- -d '{"action_type":"submit"}'
 
 
 
 
 
 
330
 
331
- # Get evaluation
332
- curl "$BASE/evaluate"
 
 
 
 
 
 
 
 
333
  ```
334
  """)
335
 
336
 
337
- # ── Mount FastAPI under /api ─────────────────────────────────────────────────
338
  from server.app import app as fastapi_app
339
-
340
  gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
341
 
342
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  """
3
+ app.py — Gradio UI v3.0 Full Platform Entry Point
4
+
5
+ Tabs:
6
+ 🎮 Interactive — manual step-by-step control
7
+ 🤖 Run Agent — built-in deterministic agent demo
8
+ 📊 Evaluation — 6-dimension evaluation report
9
+ 🧠 Intelligence — failure classification, strategy, advanced metrics
10
+ 🔁 Self-Improve — improvement plan after failure
11
+ ⚖️ Compare Agents — side-by-side multi-agent comparison
12
+ 🌐 3D Visualizer — Three.js trajectory visualization
13
+ 📖 API — REST API reference
14
  """
15
  import os
16
  import json
17
  import gradio as gr
18
  from server.environment import CodebaseNavEnvironment
19
  from server.models import RepoAction
20
+ from server.failure_classifier import FailureClassifier
21
+ from server.strategy_detector import StrategyDetector
22
+ from server.advanced_metrics import AdvancedMetricsEngine
23
+ from server.self_improvement import SelfImprovementEngine
24
+ from server.multi_agent import MultiAgentComparison
25
 
26
+ # ── Global instances ──────────────────────────────────────────────────────────
27
  env = CodebaseNavEnvironment()
28
+ failure_clf = FailureClassifier()
29
+ strategy_det = StrategyDetector()
30
+ adv_metrics_engine = AdvancedMetricsEngine()
31
+ improvement_engine = SelfImprovementEngine()
32
+ multi_agent_engine = MultiAgentComparison()
33
 
34
 
35
# ── Tab 1: Interactive ────────────────────────────────────────────────────────

def reset_environment(task: str):
    """Start a fresh episode for *task* and format the initial UI state.

    Returns a 4-tuple matching the Gradio outputs:
    (status text, last-result text, steps counter, cumulative reward).
    """
    try:
        reset_result = env.reset(task=task)
        observation = reset_result.observation
        file_listing = "\n".join(f" 📄 {f}" for f in observation.repo_tree)
        if observation.failing_tests:
            failing = ", ".join(observation.failing_tests)
        else:
            failing = "None listed"
        # Fault-injection details are optional; only rendered when present.
        fault_info = reset_result.info.get("fault_injection", {})
        faults = ""
        injected = fault_info.get("faults_injected")
        if injected:
            faults = f"\n\n⚠️ Fault Injection ({fault_info.get('difficulty_multiplier', 1.0):.1f}x):\n"
            faults += "\n".join(f" • {f}" for f in injected[:5])
        status = (
            f" Episode Started — {task} (variant: {reset_result.info.get('variant_id', '?')})\n"
            f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
            f"Steps: {observation.steps_remaining} remaining\n\n"
            f"📁 Files:\n{file_listing}\n\n"
            f"🔴 Failing Tests: {failing}\n\n"
            f"📋 Task: {observation.task_description}{faults}"
        )
        return status, "", "0", "0.000"
    except Exception as e:
        return f"❌ Error: {e}", "", "0", "0.000"
60
 
61
 
62
def take_step(action_type: str, path: str, query: str, content: str):
    """Execute one agent step and format the outcome for the UI.

    Blank text inputs are treated as "not provided" (None) on the action.
    Returns (status, truncated result text, steps taken, cumulative reward).
    """
    if env.done:
        return "❌ Episode done. Reset first.", "", "", ""
    try:
        step_result = env.step(
            RepoAction(
                action_type=action_type,
                path=path.strip() or None,
                query=query.strip() or None,
                content=content.strip() or None,
            )
        )
        observation = step_result.observation
        output_text = observation.last_action_result or "No output"
        err_note = ""
        if observation.last_action_error:
            err_note = f"\n⚠️ {observation.last_action_error}"
        violations = step_result.info.get("security_flags", [])
        sec_note = f"\n🔒 Security: {violations}" if violations else ""
        status = (
            f"Step {step_result.info['steps_taken']} | "
            f"Reward: {step_result.reward:+.3f} | "
            f"Steps left: {observation.steps_remaining}{err_note}{sec_note}"
        )
        if step_result.done:
            status += f"\n\n🏁 DONE — Score: {step_result.info['final_score']:.3f}"
        return (
            status,
            output_text[:3000],  # keep the result box bounded
            str(step_result.info["steps_taken"]),
            f"{step_result.info.get('cumulative_reward', 0):.3f}",
        )
    except Exception as e:
        return f"❌ Error: {e}", "", "", ""
95
 
96
 
97
# ── Tab 2: Run Agent ──────────────────────────────────────────────────────────

def run_builtin_agent(task: str):
    """Run the deterministic demo agent: read spec/tests/source, run tests, submit.

    Returns the full run log as a single newline-joined string.
    """
    try:
        reset_result = env.reset(task=task)
        obs = reset_result.observation
        log = [
            f"🚀 {task} (variant: {reset_result.info.get('variant_id')})",
            f" Files: {obs.repo_tree}",
            f" Failing: {obs.failing_tests}",
        ]
        steps = 0

        def read_and_log(target):
            # One read_file step plus its matching log line.
            nonlocal steps
            r = env.step(RepoAction(action_type="read_file", path=target))
            steps += 1
            log.append(f" Step {steps}: read_file {target} → {r.reward:+.3f}")

        tree = obs.repo_tree
        test_files = sorted(f for f in tree if f.startswith("tests/"))
        src_files = sorted(f for f in tree if f.startswith("src/") and f.endswith(".py"))
        spec_files = sorted(f for f in tree if f.endswith(".md"))

        # task3 is feature implementation: read the spec first.
        if task == "task3" and spec_files:
            for target in spec_files:
                if env.done:
                    break
                read_and_log(target)

        for target in test_files:
            if env.done:
                break
            read_and_log(target)

        # Cap reads at 12 total steps to leave budget for run_tests + submit.
        for target in src_files:
            if env.done or steps >= 12:
                break
            read_and_log(target)

        if not env.done and test_files:
            r = env.step(RepoAction(action_type="run_tests", path=test_files[0]))
            steps += 1
            log.append(f" Step {steps}: run_tests → {r.reward:+.3f}")

        if not env.done:
            r = env.step(RepoAction(action_type="submit"))
            steps += 1
            log.append(f" Step {steps}: submit → {r.reward:+.3f}")

        log += [
            f"\n🏁 Score: {env.final_score:.3f}",
            f" Steps: {steps}",
            f" Reward: {env.cumulative_reward:.3f}",
        ]
        return "\n".join(log)
    except Exception as e:
        return f"❌ Error: {e}"
151
+
152
+
153
# ── Tab 3: Evaluation ─────────────────────────────────────────────────────────

def get_evaluation():
    """Render the multi-dimensional evaluation report as plain text."""
    try:
        report = env.get_evaluation()
        if "error" in report:
            return "No evaluation available. Run an episode first."
        out = [
            f"🎯 Composite Score: {report['composite_score']:.3f}",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
        ]
        for name, dim in report.get("dimensions", {}).items():
            filled = int(dim["score"] * 20)
            bar = "█" * filled + "░" * (20 - filled)
            out.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
            # Show at most two supporting evidence lines per dimension.
            out.extend(f" → {e}" for e in dim.get("evidence", [])[:2])
        if report.get("strengths"):
            out.append("\n💪 Strengths:")
            out.extend(f" ✅ {s}" for s in report["strengths"])
        if report.get("failure_analysis"):
            out.append("\n⚠️ Failures:")
            out.extend(f" ❌ {f}" for f in report["failure_analysis"])
        if report.get("recommendations"):
            out.append("\n💡 Recommendations:")
            out.extend(f" → {r}" for r in report["recommendations"])
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
178
 
179
 
180
def get_metrics():
    """Return the environment's metrics dict as pretty-printed JSON."""
    try:
        metrics = env.get_metrics()
        # default=str stringifies non-JSON types (e.g. datetimes) — deliberate.
        return json.dumps(metrics, indent=2, default=str)
    except Exception as e:
        return f"Error: {e}"
185
 
186
 
187
def get_trajectory():
    """Render the recorded trajectory, one annotated line per step.

    Each line shows an action emoji, step number, action type, target
    path/query, reward, duration, and — when present — the step's error.
    """
    try:
        t = env.get_trajectory()
        if not t:
            return "No trajectory. Run an episode first."
        lines = [
            f"Episode: {t.get('episode_id')}",
            f"Task: {t.get('task')} | Variant: {t.get('variant_id')}",
            f"Score: {t.get('final_score', 0):.3f} | Duration: {t.get('duration_seconds', '?')}s",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
        ]
        emojis = {"read_file": "📖", "write_file": "✏️", "run_tests": "🧪",
                  "search_code": "🔍", "submit": "🏁"}
        for step in t.get("steps", []):
            em = emojis.get(step["action_type"], "")
            p = step.get("action_path") or step.get("action_query") or ""
            # BUG FIX: both branches previously yielded "" so step errors were
            # silently dropped from the report; surface them again.
            err = f" ❌ {step['error']}" if step.get("error") else ""
            lines.append(
                f" {em} {step['step_number']:2d}: {step['action_type']:12s} {p:30s} "
                f"reward={step['reward']:+.3f} ({step['duration_ms']:.0f}ms){err}"
            )
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
211
 
212
 
213
# ── Tab 4: Intelligence ───────────────────────────────────────────────────────

def get_failure_classification():
    """Classify the last episode's failures and render the report as text."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return "No trajectory. Run an episode first."
        variant_meta = env.variant.meta if env.variant else {}
        data = failure_clf.classify(
            episode_id=traj.get("episode_id", ""),
            task=env.current_task or "unknown",
            trajectory_steps=traj.get("steps", []),
            variant_meta=variant_meta,
            files_read=list(env.files_read),
            files_written=list(env.files_written),
            final_score=env.final_score,
            security_violations=env.security_violations,
        ).to_dict()
        verdict = "✅ SUCCESS" if data["success"] else "❌ FAILURE"
        out = [
            verdict,
            f"Primary Failure Type: {data['primary_failure']}",
            f"Failures Detected: {data['failure_count']}",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
        ]
        for failure in data.get("failures", []):
            out.append(f"\n[{failure['severity'].upper()}] {failure['type']} @ Step {failure['step']}")
            out.append(f" Evidence: {failure['evidence']}")
            out.append(f" Root Cause: {failure['root_cause']}")
            out.append(f" Fix: {failure['remediation']}")
        if data.get("failure_summary"):
            out.append("\n📋 Summary:")
            out.append(f" {data['failure_summary']}")
        if data.get("retry_hint"):
            out.append("\n🔁 Retry Hint:")
            out.append(f" {data['retry_hint']}")
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
252
 
 
 
 
 
253
 
254
def get_strategy_detection():
    """Detect the agent's navigation strategy and render the report as text."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return "No trajectory. Run an episode first."
        variant_meta = env.variant.meta if env.variant else {}
        data = strategy_det.detect(
            trajectory_steps=traj.get("steps", []),
            task=env.current_task or "unknown",
            variant_meta=variant_meta,
            files_read=list(env.files_read),
            final_score=env.final_score,
        ).to_dict()
        filled = int(data["score"] * 20)
        score_bar = "█" * filled + "░" * (20 - filled)
        # Classify the explore/exploit balance for the summary line.
        ratio = data["exploration_ratio"]
        if ratio > 0.6:
            ratio_label = "explore-heavy"
        elif ratio < 0.4:
            ratio_label = "exploit-heavy"
        else:
            ratio_label = "balanced"
        out = [
            f"🧭 Strategy: {data['strategy']}",
            f" Score: [{score_bar}] {data['score']:.3f}",
            f" Confidence: {data['confidence']:.0%}",
            f"\n📖 {data['strategy_description']}",
            f"\n📊 Exploration Ratio: {ratio:.2f} ({ratio_label})",
            f" Strategy Pivots: {data['pivot_count']}",
        ]
        if data.get("sub_patterns"):
            out.append("\n🔖 Sub-patterns:")
            out.extend(f" • {p}" for p in data["sub_patterns"])
        if data.get("evidence"):
            out.append("\n🔍 Evidence:")
            out.extend(f" → {e}" for e in data["evidence"])
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
285
 
 
 
 
 
 
 
 
286
 
287
def get_advanced_metrics():
    """Compute and render the advanced process-quality metrics report."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return "No trajectory. Run an episode first."
        variant_meta = env.variant.meta if env.variant else {}
        data = adv_metrics_engine.compute(
            trajectory_steps=traj.get("steps", []),
            variant_meta=variant_meta,
            final_score=env.final_score,
            files_read=list(env.files_read),
            files_written=list(env.files_written),
        ).to_dict()

        def meter(value):
            # 20-char unicode progress bar for a score in [0, 1].
            filled = int(value * 20)
            return "█" * filled + "░" * (20 - filled)

        out = ["⚡ ADVANCED METRICS", "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"]
        # The five bar-style metrics share one rendering shape.
        for label, key in (
            ("Reasoning Efficiency", "reasoning_efficiency"),
            ("Reliability Index", "reliability_index"),
            ("Exploration Ratio", "exploration_ratio"),
            ("Decision Entropy", "decision_entropy"),
            ("Wasteful Ratio", "wasteful_ratio"),
        ):
            out.append(f" {label} [{meter(data[key])}] {data[key]:.3f}")
        out.append(f" Pivot Rate {data['pivot_rate']:.2f} per 10 steps")
        out.append(
            f" Consistency [{meter(data['consistency_score'])}] "
            f"{data['consistency_score']:.3f} ({data['runs_analyzed']} runs)"
        )
        out.append("\n📊 Action Distribution:")
        for action, count in data.get("action_distribution", {}).items():
            out.append(f" {action:15s}: {count}")
        if data.get("useful_actions"):
            out.append("\n✅ Useful Actions:")
            out.extend(f" • {a}" for a in data["useful_actions"])
        if data.get("wasteful_actions"):
            out.append("\n⚠️ Wasteful Actions:")
            out.extend(f" • {a}" for a in data["wasteful_actions"])
        out.append("\n🔒 Reliability Breakdown:")
        for name, value in data.get("reliability_breakdown", {}).items():
            out.append(f" {name:15s}: {value:.3f}")
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
329
 
 
 
 
330
 
331
# ── Tab 5: Self-Improve ───────────────────────────────────────────────────────

def get_improvement_plan():
    """Classify the last episode's failure, then render a self-improvement plan."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return "No trajectory. Run an episode first."
        variant_meta = env.variant.meta if env.variant else {}
        trajectory_steps = traj.get("steps", [])

        # First classify what went wrong, then feed that into the planner.
        fail_report = failure_clf.classify(
            episode_id=traj.get("episode_id", ""),
            task=env.current_task or "unknown",
            trajectory_steps=trajectory_steps,
            variant_meta=variant_meta,
            files_read=list(env.files_read),
            files_written=list(env.files_written),
            final_score=env.final_score,
            security_violations=env.security_violations,
        )
        plan = improvement_engine.generate_improvement_plan(
            episode_id=traj.get("episode_id", ""),
            task=env.current_task or "unknown",
            failure_type=fail_report.primary_failure,
            failure_evidence=[f.evidence for f in fail_report.failures],
            original_score=env.final_score,
            trajectory_steps=trajectory_steps,
            files_read=list(env.files_read),
            files_written=list(env.files_written),
        ).to_dict()
        out = [
            "🔁 SELF-IMPROVEMENT PLAN",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
            f"Original Score: {plan['original_score']:.3f}",
            f"Failure Type: {plan['failure_type']}",
            f"\n❌ What Went Wrong:\n {plan['what_went_wrong']}",
            f"\n🎯 Improved Strategy:\n {plan['improved_strategy']}",
            "\n📋 Step-by-Step Plan:",
        ]
        out.extend(f" {step}" for step in plan.get("step_by_step_plan", []))
        if plan.get("specific_errors"):
            out.append("\n🔎 Specific Errors:")
            out.extend(f" • {e}" for e in plan["specific_errors"][:5])
        out.append("\n💉 System Prompt Injection (for next LLM run):")
        out.append("─────────────────────────────────────")
        out.append(plan.get("system_prompt_addon", "No injection needed."))
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
383
+
384
+
385
# ── Tab 6: Compare Agents ─────────────────────────────────────────────────────

def run_comparison(task: str, selected_agents: list):
    """Run several built-in agents on *task* and render a comparison table."""
    try:
        # An empty selection means "use the engine's default agent roster".
        roster = selected_agents or None
        data = multi_agent_engine.compare(env, task=task, agents=roster).to_dict()

        out = [
            f"⚖️ MULTI-AGENT COMPARISON — {task} (variant: {data.get('variant_id')})",
            f"🏆 Winner: {data.get('winner')} (score: {data.get('winner_score', 0):.3f})",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
            f"{'Rank':<6} {'Agent':<16} {'Score':<8} {'Steps':<8} {'Strategy':<22} {'Failure':<22} {'Reliability':<12}",
            "─" * 100,
        ]
        for row in data.get("summary_table", []):
            out.append(
                f"#{row['rank']:<5} {row['agent']:<16} {row['score']:<8.3f} "
                f"{row['steps']:<8} {row['strategy']:<22} {row['failure']:<22} {row['reliability']:<12.3f}"
            )
        out.append("━" * 100)

        if data.get("insights"):
            out.append("\n💡 Insights:")
            out.extend(f" → {i}" for i in data["insights"])

        out.append("\n📊 Per-Agent Action Sequences:")
        for run in data.get("detailed_runs", []):
            sequence = " → ".join(run.get("action_sequence", []))
            out.append(f" {run['agent_name']:16s}: {sequence}")

        return "\n".join(out)
    except Exception as e:
        return f"❌ Error: {e}"
418
 
419
 
420
# ── Tab 7: 3D Visualizer ─────────────────────────────────────────────────────

def get_viz_html():
    """Generate the 3D visualizer HTML with current trajectory data injected.

    Loads ``static/viz3d.html`` and replaces the empty ``#viz-data``
    placeholder div with a JSON payload describing repo files, test→src
    dependency edges, and the steps taken this episode. Returns an error
    paragraph if the template file is missing.
    """
    static_path = os.path.join(os.path.dirname(__file__), "static", "viz3d.html")
    if not os.path.exists(static_path):
        return "<p style='color:red'>viz3d.html not found in static/</p>"

    # FIX: explicit encoding so the template's unicode (emoji, box-drawing
    # characters) doesn't depend on the host locale's default codec.
    with open(static_path, "r", encoding="utf-8") as f:
        html = f.read()

    # Gather viz data from the current environment (empty before any episode).
    traj = env.get_trajectory()
    if traj:
        meta = env.variant.meta if env.variant else {}
        bug_files = set(meta.get("bug_files", []))
        files = []
        if env.variant:
            for fname in env.variant.get_tree():
                ftype = "test" if fname.startswith("tests/") else \
                        "spec" if fname.endswith(".md") else "src"
                files.append({
                    "name": fname,
                    "type": ftype,
                    "is_bug_file": fname in bug_files,
                    "visited": fname in env.files_read,
                    "modified": fname in env.files_written,
                })

        # Naive all-pairs test→src edges; the client renders them as links.
        test_files = [f["name"] for f in files if f["type"] == "test"]
        src_files = [f["name"] for f in files if f["type"] == "src"]
        deps = [{"from": tf, "to": sf} for tf in test_files for sf in src_files]

        steps_data = [
            {
                "step": step.get("step_number", 0),
                "action": step.get("action_type", ""),
                "path": step.get("action_path"),
                "reward": step.get("reward", 0.0),
                "error": step.get("error"),
                "pass_rate": step.get("test_pass_rate"),
            }
            for step in traj.get("steps", [])
        ]

        # Keyword arguments for consistency with get_strategy_detection().
        strategy_report = strategy_det.detect(
            trajectory_steps=traj.get("steps", []),
            task=env.current_task or "unknown",
            variant_meta=meta,
            files_read=list(env.files_read),
            final_score=env.final_score,
        ) if traj.get("steps") else None

        viz_data = {
            "task": env.current_task or "unknown",
            "variant_id": traj.get("variant_id", "unknown"),
            "final_score": env.final_score,
            "strategy": strategy_report.strategy if strategy_report else "UNKNOWN",
            "failure_type": "—",
            "files": files,
            "dependencies": deps,
            "steps": steps_data,
        }
        # FIX: escape "<" inside the JSON (a valid JSON string escape) so the
        # payload can never close the container div or open a new tag.
        data_json = json.dumps(viz_data).replace("<", "\\u003c")
    else:
        data_json = ""

    # Inject the payload into the template's placeholder div.
    html = html.replace(
        '<div id="viz-data" style="display:none"></div>',
        f'<div id="viz-data" style="display:none">{data_json}</div>'
    )
    return html
496
 
497
+
498
+ # ── Build Gradio UI ───────────────────────────────────────────────────────────
499
+
500
+ with gr.Blocks(title="Codebase Navigation & Repair — OpenEnv v3") as demo:
501
  gr.Markdown(
502
+ "# 🔍 Codebase Navigation & Repair — OpenEnv v3\n"
503
+ "**The most advanced debugging + evaluation platform for AI coding agents.** "
504
+ "Navigate codebases · Fix bugs · Evaluate process · Visualize in 3D."
505
  )
506
 
507
  with gr.Tabs():
508
+
509
+ # ── Tab 1: Interactive ────────────────────────────────────────────────
510
  with gr.TabItem("🎮 Interactive"):
511
  with gr.Row():
512
  with gr.Column(scale=1):
513
  task_select = gr.Dropdown(
514
+ ["task1", "task2", "task3"], value="task1",
 
515
  label="Task",
516
+ info="task1=bugs, task2=cross-module, task3=feature impl"
517
  )
518
  reset_btn = gr.Button("🔄 Reset Environment", variant="primary")
519
+ gr.Markdown("### Action")
520
+ act_type = gr.Dropdown(
521
+ ["read_file", "write_file", "run_tests", "search_code", "submit"],
522
+ value="read_file", label="Action Type",
 
 
523
  )
524
+ act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
525
+ act_query = gr.Textbox(label="Query (search_code)", placeholder="validate_token")
526
+ act_content = gr.Textbox(label="Content (write_file)", lines=4)
527
  step_btn = gr.Button("▶️ Execute Step", variant="secondary")
 
528
  with gr.Column(scale=2):
529
+ status_box = gr.Textbox(label="Status", lines=14, interactive=False)
530
+ result_box = gr.Textbox(label="Last Result", lines=8, interactive=False)
531
  with gr.Row():
532
+ steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
533
  reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
534
+ reset_btn.click(reset_environment, [task_select], [status_box, result_box, steps_box, reward_box])
535
+ step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
536
+
537
+ # ── Tab 2: Run Agent ──────────────────────────────────────────────────
538
+ with gr.TabItem("🤖 Run Agent"):
539
+ gr.Markdown("### Built-in Demonstration Agent\nRuns deterministic read→submit strategy.")
540
+ agent_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
541
+ run_btn = gr.Button("🚀 Run Agent", variant="primary")
542
+ agent_output = gr.Textbox(label="Agent Log", lines=20, interactive=False)
543
+ run_btn.click(run_builtin_agent, [agent_task], [agent_output])
544
 
545
+ # ── Tab 3: Evaluation ─────────────────────────────────────────────────
546
+ with gr.TabItem("📊 Evaluation"):
547
+ with gr.Row():
548
+ eval_btn = gr.Button("🎯 Evaluation Report", variant="primary")
549
+ metrics_btn = gr.Button("📈 Metrics JSON", variant="secondary")
550
+ traj_btn = gr.Button("🗺️ Trajectory", variant="secondary")
551
+ eval_out = gr.Textbox(label="Output", lines=28, interactive=False)
552
+ eval_btn.click(get_evaluation, outputs=[eval_out])
553
+ metrics_btn.click(get_metrics, outputs=[eval_out])
554
+ traj_btn.click(get_trajectory, outputs=[eval_out])
555
+
556
+ # ── Tab 4: 🧠 Intelligence ─────────────────────────────────────────────
557
+ with gr.TabItem("🧠 Intelligence"):
558
+ gr.Markdown(
559
+ "### Deep Agent Intelligence Analysis\n"
560
+ "Failure classification, strategy detection, and advanced behavioral metrics."
561
  )
562
+ with gr.Row():
563
+ classify_btn = gr.Button("🔬 Classify Failure", variant="primary")
564
+ strategy_btn = gr.Button("🧭 Detect Strategy", variant="secondary")
565
+ adv_btn = gr.Button("⚡ Advanced Metrics", variant="secondary")
566
+ intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
567
+ classify_btn.click(get_failure_classification, outputs=[intel_out])
568
+ strategy_btn.click(get_strategy_detection, outputs=[intel_out])
569
+ adv_btn.click(get_advanced_metrics, outputs=[intel_out])
570
+
571
+ # ── Tab 5: 🔁 Self-Improve ─────────────────────────────────────────────
572
+ with gr.TabItem("🔁 Self-Improve"):
573
+ gr.Markdown(
574
+ "### Self-Improvement Loop\n"
575
+ "After a failure, this generates an actionable improvement plan and a "
576
+ "system prompt injection for the agent's next attempt."
577
  )
578
+ improve_btn = gr.Button("🔁 Generate Improvement Plan", variant="primary")
579
+ improve_out = gr.Textbox(label="Improvement Plan", lines=32, interactive=False)
580
+ improve_btn.click(get_improvement_plan, outputs=[improve_out])
581
 
582
+ # ── Tab 6: ⚖️ Compare ──────────────────────────────────────────────────
583
+ with gr.TabItem("⚖️ Compare Agents"):
584
  gr.Markdown(
585
+ "### Multi-Agent Strategy Comparison\n"
586
+ "Runs 4 built-in agent strategies on the same task to compare "
587
+ "efficiency, strategy, and reliability side-by-side."
588
  )
589
+ with gr.Row():
590
+ comp_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
591
+ comp_agents = gr.CheckboxGroup(
592
+ ["test-first", "search-first", "minimal", "exhaustive"],
593
+ value=["test-first", "search-first", "minimal", "exhaustive"],
594
+ label="Agents to Compare",
595
+ )
596
+ comp_btn = gr.Button("⚖️ Run Comparison", variant="primary")
597
+ comp_out = gr.Textbox(label="Comparison Report", lines=30, interactive=False)
598
+ comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
599
+
600
+ # ── Tab 7: 🌐 3D Visualizer ────────────────────────────────────────────
601
+ with gr.TabItem("🌐 3D Visualizer"):
602
+ gr.Markdown(
603
+ "### Agent Trajectory 3D Visualization\n"
604
+ "Files = 3D nodes · Dependencies = edges · Agent path = animated beam · "
605
+ "Timeline = scrubbable replay. **Run an episode first, then refresh.**"
606
  )
607
+ refresh_viz_btn = gr.Button("🔄 Load Trajectory into Visualizer", variant="primary")
608
+ viz_html = gr.HTML(value="<p style='color:#64748b;text-align:center;padding:40px'>Click 'Load Trajectory' after running an episode.</p>")
609
+ refresh_viz_btn.click(get_viz_html, outputs=[viz_html])
610
 
611
+ # ── Tab 8: API ────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
612
  with gr.TabItem("📖 API"):
613
  gr.Markdown("""
614
+ ### REST API — v3.0 Endpoints
 
 
615
 
616
+ #### Core (OpenEnv-compliant)
617
  | Endpoint | Method | Description |
618
  |----------|--------|-------------|
619
+ | `/reset?task=task1` | POST | Start new episode |
620
+ | `/step` | POST | Take action |
621
+ | `/state` | GET | Current state |
622
+ | `/health` | GET | Health check |
 
 
 
 
 
 
 
 
 
 
 
623
 
624
+ #### Evaluation
625
+ | Endpoint | Method | Description |
626
+ |----------|--------|-------------|
627
+ | `/trajectory` | GET | Full action log |
628
+ | `/evaluate` | GET | 6-dimension scores |
629
+ | `/metrics` | GET | Memory + security stats |
630
+ | `/fault-config` | POST | Enable fault injection |
631
 
632
+ #### Intelligence (NEW in v3)
633
+ | Endpoint | Method | Description |
634
+ |----------|--------|-------------|
635
+ | `/classify` | GET | Typed failure classification |
636
+ | `/strategy` | GET | Behavioral strategy detection |
637
+ | `/advanced-metrics` | GET | Entropy, reliability, consistency |
638
+ | `/improvement-plan` | GET | Self-improvement feedback |
639
+ | `/compare-agents` | POST | Multi-agent comparison |
640
+ | `/viz-data` | GET | 3D visualization data |
641
 
642
+ ```bash
643
+ BASE="http://localhost:7860"
644
+ curl -X POST "$BASE/reset?task=task1"
645
+ curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"src/auth.py"}'
646
+ curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
647
+ curl "$BASE/classify"
648
+ curl "$BASE/strategy"
649
+ curl "$BASE/advanced-metrics"
650
+ curl "$BASE/improvement-plan"
651
+ curl -X POST "$BASE/compare-agents?task=task1"
652
  ```
653
  """)
654
 
655
 
656
+ # ── Mount FastAPI under same process ──────────────────────────────────────────
657
  from server.app import app as fastapi_app
 
658
  gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
659
 
660
  if __name__ == "__main__":
server/advanced_metrics.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/advanced_metrics.py
2
+ """
3
+ Advanced Metrics Engine.
4
+
5
+ Computes metrics that existing benchmarks (SWE-bench, etc.) completely ignore:
6
+ - Exploration vs Exploitation ratio across episode
7
+ - Consistency score across multiple runs of same task
8
+ - Reliability index (weighted aggregate)
9
+ - Reasoning efficiency (useful actions / total actions)
10
+ - Decision entropy (how predictable/focused the agent is)
11
+ """
12
+ import math
13
+ from typing import List, Dict, Any, Optional
14
+ from dataclasses import dataclass, field
15
+
16
+
17
+ @dataclass
18
+ class AdvancedMetricsReport:
19
+ """All advanced metrics for one episode or cross-episode comparison."""
20
+
21
+ # Per-episode
22
+ reasoning_efficiency: float # Useful steps / total steps
23
+ exploration_ratio: float # Read+search vs write+test ratio
24
+ decision_entropy: float # Shannon entropy of action distribution
25
+ reliability_index: float # Composite reliability score
26
+ pivot_rate: float # Strategy changes per 10 steps
27
+ wasteful_ratio: float # Redundant actions / total actions
28
+
29
+ # Cross-episode (populated when history provided)
30
+ consistency_score: float = 0.0 # Variance across runs (lower variance = higher consistency)
31
+ runs_analyzed: int = 0
32
+
33
+ # Breakdowns
34
+ action_distribution: Dict[str, int] = field(default_factory=dict)
35
+ useful_actions: List[str] = field(default_factory=list)
36
+ wasteful_actions: List[str] = field(default_factory=list)
37
+ reliability_breakdown: Dict[str, float] = field(default_factory=dict)
38
+
39
+ def to_dict(self) -> dict:
40
+ return {
41
+ "reasoning_efficiency": round(self.reasoning_efficiency, 3),
42
+ "exploration_ratio": round(self.exploration_ratio, 3),
43
+ "decision_entropy": round(self.decision_entropy, 3),
44
+ "reliability_index": round(self.reliability_index, 3),
45
+ "pivot_rate": round(self.pivot_rate, 3),
46
+ "wasteful_ratio": round(self.wasteful_ratio, 3),
47
+ "consistency_score": round(self.consistency_score, 3),
48
+ "runs_analyzed": self.runs_analyzed,
49
+ "action_distribution": self.action_distribution,
50
+ "useful_actions": self.useful_actions,
51
+ "wasteful_actions": self.wasteful_actions,
52
+ "reliability_breakdown": {
53
+ k: round(v, 3) for k, v in self.reliability_breakdown.items()
54
+ },
55
+ }
56
+
57
+
58
+ class AdvancedMetricsEngine:
59
+ """
60
+ Computes advanced behavioral and reliability metrics from trajectory data.
61
+
62
+ Usage:
63
+ engine = AdvancedMetricsEngine()
64
+ report = engine.compute(
65
+ trajectory_steps=[...],
66
+ variant_meta={...},
67
+ final_score=0.7,
68
+ files_read=[...],
69
+ files_written=[...],
70
+ history=[], # Pass previous episode scores for consistency
71
+ )
72
+ """
73
+
74
+ def __init__(self):
75
+ self._score_history: List[float] = [] # Tracks scores across episodes
76
+
77
+ def compute(
78
+ self,
79
+ trajectory_steps: List[dict],
80
+ variant_meta: Dict[str, Any],
81
+ final_score: float,
82
+ files_read: List[str],
83
+ files_written: List[str],
84
+ history: Optional[List[float]] = None,
85
+ ) -> AdvancedMetricsReport:
86
+ """Compute all advanced metrics for one episode."""
87
+ # Record this score in history
88
+ self._score_history.append(final_score)
89
+
90
+ if not trajectory_steps:
91
+ return AdvancedMetricsReport(
92
+ reasoning_efficiency=0.0,
93
+ exploration_ratio=0.5,
94
+ decision_entropy=0.0,
95
+ reliability_index=0.0,
96
+ pivot_rate=0.0,
97
+ wasteful_ratio=1.0,
98
+ )
99
+
100
+ action_seq = [s.get("action_type", "unknown") for s in trajectory_steps]
101
+ total = len(action_seq)
102
+
103
+ # ── Action distribution ───────────────────────────────────────────────
104
+ from collections import Counter
105
+ dist = Counter(action_seq)
106
+ action_distribution = dict(dist)
107
+
108
+ # ── Decision entropy (Shannon entropy of action types) ────────────────
109
+ entropy = 0.0
110
+ for count in dist.values():
111
+ p = count / total
112
+ if p > 0:
113
+ entropy -= p * math.log2(p)
114
+ # Normalize by max possible entropy (log2 of unique action types)
115
+ max_entropy = math.log2(len(dist)) if len(dist) > 1 else 1.0
116
+ normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0.0
117
+
118
+ # ── Exploration vs exploitation ratio ─────────────────────────────────
119
+ explore = dist.get("read_file", 0) + dist.get("search_code", 0)
120
+ exploit = dist.get("write_file", 0) + dist.get("run_tests", 0)
121
+ exploration_ratio = explore / (explore + exploit) if (explore + exploit) > 0 else 0.5
122
+
123
+ # ���─ Redundancy / wasteful actions ─────────────────────────────────────
124
+ read_paths = [
125
+ s.get("action_path")
126
+ for s in trajectory_steps
127
+ if s.get("action_type") == "read_file" and s.get("action_path")
128
+ ]
129
+ seen = set()
130
+ redundant_reads = 0
131
+ for p in read_paths:
132
+ if p in seen:
133
+ redundant_reads += 1
134
+ seen.add(p)
135
+
136
+ error_actions = sum(1 for s in trajectory_steps if s.get("error"))
137
+ total_wasteful = redundant_reads + error_actions
138
+ wasteful_ratio = total_wasteful / total if total > 0 else 0.0
139
+
140
+ wasteful_actions = []
141
+ if redundant_reads > 0:
142
+ wasteful_actions.append(f"{redundant_reads}x redundant file reads")
143
+ if error_actions > 0:
144
+ wasteful_actions.append(f"{error_actions}x actions that produced errors")
145
+
146
+ # ── Useful action detection ───────────────────────────────────────────
147
+ useful_actions = []
148
+ relevant = set(
149
+ variant_meta.get("bug_files", []) +
150
+ variant_meta.get("interface_files", []) +
151
+ variant_meta.get("read_first_files", []) +
152
+ variant_meta.get("files_to_implement", [])
153
+ )
154
+ relevant_reads = [f for f in files_read if f in relevant]
155
+ if relevant_reads:
156
+ useful_actions.append(f"Read {len(relevant_reads)} key files: {relevant_reads[:3]}")
157
+
158
+ test_rates = [
159
+ s.get("test_pass_rate")
160
+ for s in trajectory_steps
161
+ if s.get("test_pass_rate") is not None
162
+ ]
163
+ if len(test_rates) >= 2 and test_rates[-1] > test_rates[0]:
164
+ useful_actions.append(
165
+ f"Test pass rate improved from {test_rates[0]:.2f} to {test_rates[-1]:.2f}"
166
+ )
167
+
168
+ if files_written:
169
+ useful_actions.append(f"Wrote {len(files_written)} file(s): {files_written[:3]}")
170
+
171
+ # ── Reasoning efficiency ──────────────────────────────────────────────
172
+ useful_count = len(relevant_reads) + (1 if files_written else 0) + (1 if test_rates else 0)
173
+ reasoning_efficiency = min(1.0, useful_count / max(total, 1))
174
+
175
+ # ── Pivot rate (strategy switches per 10 steps) ───────────────────────
176
+ pivots = 0
177
+ for i in range(1, len(action_seq)):
178
+ prev_explore = action_seq[i-1] in ("read_file", "search_code")
179
+ curr_exploit = action_seq[i] in ("write_file", "run_tests")
180
+ prev_exploit = action_seq[i-1] in ("write_file", "run_tests")
181
+ curr_explore = action_seq[i] in ("read_file", "search_code")
182
+ if (prev_explore and curr_exploit) or (prev_exploit and curr_explore):
183
+ pivots += 1
184
+ pivot_rate = (pivots / total) * 10 if total > 0 else 0.0 # per 10 steps
185
+
186
+ # ── Reliability index ─────────────────────────────────────────────────
187
+ # Weighted aggregate: correctness matters most
188
+ reliability_breakdown = {
189
+ "correctness": final_score,
190
+ "efficiency": max(0.0, 1.0 - wasteful_ratio),
191
+ "focus": 1.0 - normalized_entropy, # Low entropy = focused behavior
192
+ "verification": 1.0 if test_rates else 0.0,
193
+ "safety": 1.0, # Will be reduced by security violations
194
+ }
195
+
196
+ # Check for security flags
197
+ sec_flags = sum(len(s.get("security_flags", [])) for s in trajectory_steps)
198
+ if sec_flags > 0:
199
+ reliability_breakdown["safety"] = max(0.0, 1.0 - sec_flags * 0.2)
200
+
201
+ # Weighted reliability index
202
+ weights = {
203
+ "correctness": 0.40,
204
+ "efficiency": 0.20,
205
+ "focus": 0.15,
206
+ "verification": 0.15,
207
+ "safety": 0.10,
208
+ }
209
+ reliability_index = sum(
210
+ reliability_breakdown[k] * weights[k]
211
+ for k in weights
212
+ )
213
+
214
+ # ── Consistency score (cross-episode) ────────────────────────────────
215
+ scores_to_use = list(history) if history else self._score_history
216
+ consistency_score = 0.0
217
+ runs_analyzed = len(scores_to_use)
218
+
219
+ if runs_analyzed >= 2:
220
+ mean = sum(scores_to_use) / runs_analyzed
221
+ variance = sum((s - mean) ** 2 for s in scores_to_use) / runs_analyzed
222
+ std_dev = math.sqrt(variance)
223
+ # Consistency = 1 - normalized_std_dev (higher = more consistent)
224
+ consistency_score = max(0.0, 1.0 - (std_dev / max(mean, 0.01)))
225
+
226
+ return AdvancedMetricsReport(
227
+ reasoning_efficiency=reasoning_efficiency,
228
+ exploration_ratio=exploration_ratio,
229
+ decision_entropy=normalized_entropy,
230
+ reliability_index=reliability_index,
231
+ pivot_rate=pivot_rate,
232
+ wasteful_ratio=wasteful_ratio,
233
+ consistency_score=consistency_score,
234
+ runs_analyzed=runs_analyzed,
235
+ action_distribution=action_distribution,
236
+ useful_actions=useful_actions,
237
+ wasteful_actions=wasteful_actions,
238
+ reliability_breakdown=reliability_breakdown,
239
+ )
240
+
241
+ def get_score_history(self) -> List[float]:
242
+ return list(self._score_history)
243
+
244
+ def reset_history(self):
245
+ self._score_history = []
server/app.py CHANGED
@@ -1,13 +1,17 @@
1
  # server/app.py
2
  """
3
- FastAPI server exposing the OpenEnv-compliant API + reliability layer endpoints.
4
 
5
- Core endpoints: POST /reset, POST /step, GET /state, GET /health
6
- Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
7
- Control endpoints: POST /fault-config
 
 
8
  """
9
  from fastapi import FastAPI, HTTPException
 
10
  from contextlib import asynccontextmanager
 
11
 
12
  from .environment import CodebaseNavEnvironment
13
  from .models import (
@@ -15,9 +19,19 @@ from .models import (
15
  TrajectoryResponse, EvaluationResponse, MetricsResponse,
16
  FaultConfigRequest,
17
  )
 
 
 
 
 
18
 
19
- # Global environment instance (one session per container)
20
  env = CodebaseNavEnvironment()
 
 
 
 
 
21
 
22
 
23
  @asynccontextmanager
@@ -27,45 +41,41 @@ async def lifespan(app: FastAPI):
27
 
28
 
29
  app = FastAPI(
30
- title="Codebase Navigation & Repair — OpenEnv",
31
  description=(
32
- "RL environment where agents navigate and repair Python codebases. "
33
- "Extended with process-based evaluation, trajectory replay, "
34
- "fault injection, security scanning, and memory tracking."
35
  ),
36
- version="2.0.0",
37
  lifespan=lifespan,
38
  )
39
 
 
 
 
 
40
 
41
- # ── Core OpenEnv Endpoints ───────────────────────────────────────────────────
 
42
 
43
  @app.post("/reset", response_model=ResetResult)
44
  async def reset(task: str = "task1"):
45
- """
46
- Start a new episode.
47
- task: "task1" | "task2" | "task3"
48
- """
49
  valid_tasks = ["task1", "task2", "task3"]
50
  if task not in valid_tasks:
51
  raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
52
  try:
53
- result = env.reset(task=task)
54
- return result
55
  except Exception as e:
56
  raise HTTPException(status_code=500, detail=str(e))
57
 
58
 
59
  @app.post("/step", response_model=StepResult)
60
  async def step(action: RepoAction):
61
- """
62
- Take one action in the current episode.
63
- """
64
  if env.done:
65
- raise HTTPException(status_code=400, detail="Episode is done. POST /reset to start a new one.")
66
  try:
67
- result = env.step(action)
68
- return result
69
  except RuntimeError as e:
70
  raise HTTPException(status_code=400, detail=str(e))
71
  except Exception as e:
@@ -74,12 +84,8 @@ async def step(action: RepoAction):
74
 
75
  @app.get("/state", response_model=StateResult)
76
  async def state():
77
- """
78
- Get current state without advancing the episode.
79
- """
80
- obs = env.get_state()
81
  return StateResult(
82
- observation=obs,
83
  current_score=env.final_score,
84
  total_steps_taken=env.steps_taken,
85
  )
@@ -87,17 +93,13 @@ async def state():
87
 
88
  @app.get("/health")
89
  async def health():
90
- return {"status": "ok", "environment": "codebase-nav-env", "version": "2.0.0"}
91
 
92
 
93
- # ── Evaluation & Reliability Endpoints ───────────────────────────────────────
94
 
95
  @app.get("/trajectory", response_model=TrajectoryResponse)
96
  async def get_trajectory():
97
- """
98
- Get the full trajectory of the current or most recent episode.
99
- Returns every action, observation snapshot, reward, timing, and security flags.
100
- """
101
  traj = env.get_trajectory()
102
  if not traj:
103
  return TrajectoryResponse()
@@ -106,11 +108,6 @@ async def get_trajectory():
106
 
107
  @app.get("/evaluate", response_model=EvaluationResponse)
108
  async def get_evaluation():
109
- """
110
- Get multi-dimensional evaluation of the current/latest episode.
111
- Scores across 6 dimensions: efficiency, navigation, correctness,
112
- reasoning, robustness, security.
113
- """
114
  evaluation = env.get_evaluation()
115
  if "error" in evaluation:
116
  return EvaluationResponse()
@@ -119,23 +116,224 @@ async def get_evaluation():
119
 
120
  @app.get("/metrics", response_model=MetricsResponse)
121
  async def get_metrics():
122
- """
123
- Get comprehensive metrics including memory usage, security stats,
124
- fault injection report, wasteful patterns, and action timeline.
125
- """
126
- metrics = env.get_metrics()
127
- return MetricsResponse(**metrics)
128
 
129
 
130
  @app.post("/fault-config")
131
  async def set_fault_config(config: FaultConfigRequest):
132
- """
133
- Configure fault injection for the NEXT episode (takes effect on next /reset).
134
- Levels: "none" (default), "light" (misleading comments), "heavy" (all faults)
135
- """
136
  env.set_fault_config(config.level)
137
  return {
138
  "status": "ok",
139
  "fault_level": config.level,
140
  "message": f"Fault injection set to '{config.level}'. Takes effect on next /reset.",
141
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # server/app.py
2
  """
3
+ FastAPI server v3.0
4
 
5
+ Core endpoints: POST /reset, POST /step, GET /state, GET /health
6
+ Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
7
+ Control endpoints: POST /fault-config
8
+ Intelligence endpoints: GET /classify, GET /strategy, GET /advanced-metrics,
9
+ POST /compare-agents, GET /improvement-plan, GET /viz-data
10
  """
11
  from fastapi import FastAPI, HTTPException
12
+ from fastapi.staticfiles import StaticFiles
13
  from contextlib import asynccontextmanager
14
+ import os
15
 
16
  from .environment import CodebaseNavEnvironment
17
  from .models import (
 
19
  TrajectoryResponse, EvaluationResponse, MetricsResponse,
20
  FaultConfigRequest,
21
  )
22
+ from .failure_classifier import FailureClassifier
23
+ from .strategy_detector import StrategyDetector
24
+ from .advanced_metrics import AdvancedMetricsEngine
25
+ from .self_improvement import SelfImprovementEngine
26
+ from .multi_agent import MultiAgentComparison
27
 
28
+ # Global instances
29
  env = CodebaseNavEnvironment()
30
+ failure_clf = FailureClassifier()
31
+ strategy_det = StrategyDetector()
32
+ adv_metrics = AdvancedMetricsEngine()
33
+ improvement = SelfImprovementEngine()
34
+ multi_agent = MultiAgentComparison()
35
 
36
 
37
  @asynccontextmanager
 
41
 
42
 
43
  app = FastAPI(
44
+ title="Codebase Navigation & Repair — OpenEnv v3",
45
  description=(
46
+ "RL environment for AI coding agents extended with process-based evaluation, "
47
+ "failure classification, strategy detection, self-improvement loops, "
48
+ "multi-agent comparison, 3D visualization, and advanced metrics."
49
  ),
50
+ version="3.0.0",
51
  lifespan=lifespan,
52
  )
53
 
54
+ # Serve static files (3D visualizer HTML)
55
+ _static_dir = os.path.join(os.path.dirname(__file__), "..", "static")
56
+ if os.path.exists(_static_dir):
57
+ app.mount("/static", StaticFiles(directory=_static_dir), name="static")
58
 
59
+
60
+ # ── Core OpenEnv Endpoints ────────────────────────────────────────────────────
61
 
62
  @app.post("/reset", response_model=ResetResult)
63
  async def reset(task: str = "task1"):
 
 
 
 
64
  valid_tasks = ["task1", "task2", "task3"]
65
  if task not in valid_tasks:
66
  raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
67
  try:
68
+ return env.reset(task=task)
 
69
  except Exception as e:
70
  raise HTTPException(status_code=500, detail=str(e))
71
 
72
 
73
  @app.post("/step", response_model=StepResult)
74
  async def step(action: RepoAction):
 
 
 
75
  if env.done:
76
+ raise HTTPException(status_code=400, detail="Episode is done. POST /reset to start.")
77
  try:
78
+ return env.step(action)
 
79
  except RuntimeError as e:
80
  raise HTTPException(status_code=400, detail=str(e))
81
  except Exception as e:
 
84
 
85
  @app.get("/state", response_model=StateResult)
86
  async def state():
 
 
 
 
87
  return StateResult(
88
+ observation=env.get_state(),
89
  current_score=env.final_score,
90
  total_steps_taken=env.steps_taken,
91
  )
 
93
 
94
  @app.get("/health")
95
  async def health():
96
+ return {"status": "ok", "environment": "codebase-nav-env", "version": "3.0.0"}
97
 
98
 
99
+ # ── Evaluation Endpoints ──────────────────────────────────────────────────────
100
 
101
  @app.get("/trajectory", response_model=TrajectoryResponse)
102
  async def get_trajectory():
 
 
 
 
103
  traj = env.get_trajectory()
104
  if not traj:
105
  return TrajectoryResponse()
 
108
 
109
  @app.get("/evaluate", response_model=EvaluationResponse)
110
  async def get_evaluation():
 
 
 
 
 
111
  evaluation = env.get_evaluation()
112
  if "error" in evaluation:
113
  return EvaluationResponse()
 
116
 
117
  @app.get("/metrics", response_model=MetricsResponse)
118
  async def get_metrics():
119
+ return MetricsResponse(**env.get_metrics())
 
 
 
 
 
120
 
121
 
122
  @app.post("/fault-config")
123
  async def set_fault_config(config: FaultConfigRequest):
 
 
 
 
124
  env.set_fault_config(config.level)
125
  return {
126
  "status": "ok",
127
  "fault_level": config.level,
128
  "message": f"Fault injection set to '{config.level}'. Takes effect on next /reset.",
129
  }
130
+
131
+
132
+ # ── Intelligence Endpoints (NEW in v3) ────────────────────────────────────────
133
+
134
+ @app.get("/classify")
135
+ async def classify_failure():
136
+ """
137
+ Classify the failure type of the current/latest episode.
138
+ Returns typed failure taxonomy with root cause and remediation.
139
+ """
140
+ traj = env.get_trajectory()
141
+ if not traj:
142
+ return {"error": "No trajectory available. Run an episode first."}
143
+
144
+ steps = traj.get("steps", [])
145
+ meta = env.variant.meta if env.variant else {}
146
+
147
+ report = failure_clf.classify(
148
+ episode_id=traj.get("episode_id", ""),
149
+ task=env.current_task or "unknown",
150
+ trajectory_steps=steps,
151
+ variant_meta=meta,
152
+ files_read=list(env.files_read),
153
+ files_written=list(env.files_written),
154
+ final_score=env.final_score,
155
+ security_violations=env.security_violations,
156
+ )
157
+ return report.to_dict()
158
+
159
+
160
+ @app.get("/strategy")
161
+ async def detect_strategy():
162
+ """
163
+ Detect the behavioral strategy pattern used by the agent.
164
+ Returns: TARGETED_DEBUGGING | SYSTEMATIC_SEARCH | BRUTE_FORCE |
165
+ RANDOM_EXPLORATION | SPEC_DRIVEN | MINIMAL_EFFORT
166
+ """
167
+ traj = env.get_trajectory()
168
+ if not traj:
169
+ return {"error": "No trajectory available."}
170
+
171
+ steps = traj.get("steps", [])
172
+ meta = env.variant.meta if env.variant else {}
173
+
174
+ report = strategy_det.detect(
175
+ trajectory_steps=steps,
176
+ task=env.current_task or "unknown",
177
+ variant_meta=meta,
178
+ files_read=list(env.files_read),
179
+ final_score=env.final_score,
180
+ )
181
+ return report.to_dict()
182
+
183
+
184
+ @app.get("/advanced-metrics")
185
+ async def get_advanced_metrics():
186
+ """
187
+ Compute advanced metrics: reasoning efficiency, decision entropy,
188
+ exploration ratio, reliability index, consistency, pivot rate.
189
+ """
190
+ traj = env.get_trajectory()
191
+ if not traj:
192
+ return {"error": "No trajectory available."}
193
+
194
+ steps = traj.get("steps", [])
195
+ meta = env.variant.meta if env.variant else {}
196
+
197
+ report = adv_metrics.compute(
198
+ trajectory_steps=steps,
199
+ variant_meta=meta,
200
+ final_score=env.final_score,
201
+ files_read=list(env.files_read),
202
+ files_written=list(env.files_written),
203
+ )
204
+ return report.to_dict()
205
+
206
+
207
+ @app.get("/improvement-plan")
208
+ async def get_improvement_plan():
209
+ """
210
+ Generate a self-improvement plan based on failure classification.
211
+ Returns: what_went_wrong, improved_strategy, step-by-step plan,
212
+ system_prompt_addon (for injecting into next agent run).
213
+ """
214
+ traj = env.get_trajectory()
215
+ if not traj:
216
+ return {"error": "No trajectory available."}
217
+
218
+ steps = traj.get("steps", [])
219
+ meta = env.variant.meta if env.variant else {}
220
+
221
+ # Classify first
222
+ fail_report = failure_clf.classify(
223
+ episode_id=traj.get("episode_id", ""),
224
+ task=env.current_task or "unknown",
225
+ trajectory_steps=steps,
226
+ variant_meta=meta,
227
+ files_read=list(env.files_read),
228
+ files_written=list(env.files_written),
229
+ final_score=env.final_score,
230
+ security_violations=env.security_violations,
231
+ )
232
+
233
+ plan = improvement.generate_improvement_plan(
234
+ episode_id=traj.get("episode_id", ""),
235
+ task=env.current_task or "unknown",
236
+ failure_type=fail_report.primary_failure,
237
+ failure_evidence=[f.evidence for f in fail_report.failures],
238
+ original_score=env.final_score,
239
+ trajectory_steps=steps,
240
+ files_read=list(env.files_read),
241
+ files_written=list(env.files_written),
242
+ )
243
+ return plan.to_dict()
244
+
245
+
246
+ @app.post("/compare-agents")
247
+ async def compare_agents(task: str = "task1", agents: str = "all"):
248
+ """
249
+ Run multiple agent strategies on the same task and compare side-by-side.
250
+ agents: "all" | comma-separated list of: test-first,search-first,minimal,exhaustive
251
+ """
252
+ valid_tasks = ["task1", "task2", "task3"]
253
+ if task not in valid_tasks:
254
+ raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
255
+
256
+ if agents == "all":
257
+ agent_list = None
258
+ else:
259
+ agent_list = [a.strip() for a in agents.split(",")]
260
+
261
+ try:
262
+ report = multi_agent.compare(env, task=task, agents=agent_list)
263
+ return report.to_dict()
264
+ except Exception as e:
265
+ raise HTTPException(status_code=500, detail=str(e))
266
+
267
+
268
+ @app.get("/viz-data")
269
+ async def get_viz_data():
270
+ """
271
+ Get structured 3D visualization data for the current/latest episode.
272
+ Returns nodes (files), edges (dependencies), and step trajectory
273
+ in the format expected by the Three.js visualizer.
274
+ """
275
+ traj = env.get_trajectory()
276
+ if not traj:
277
+ return {"error": "No trajectory available."}
278
+
279
+ # Build file nodes
280
+ files = []
281
+ visited = set(env.files_read)
282
+ modified = set(env.files_written)
283
+ meta = env.variant.meta if env.variant else {}
284
+ bug_files = set(meta.get("bug_files", []))
285
+
286
+ if env.variant:
287
+ tree = env.variant.get_tree()
288
+ for f in tree:
289
+ ftype = "test" if f.startswith("tests/") else \
290
+ "spec" if f.endswith(".md") else "src"
291
+ files.append({
292
+ "name": f,
293
+ "type": ftype,
294
+ "is_bug_file": f in bug_files,
295
+ "visited": f in visited,
296
+ "modified": f in modified,
297
+ })
298
+
299
+ # Build dependency edges from known patterns
300
+ deps = []
301
+ test_files = [f["name"] for f in files if f["type"] == "test"]
302
+ src_files = [f["name"] for f in files if f["type"] == "src"]
303
+
304
+ # Simple heuristic: connect tests to src files
305
+ for tf in test_files:
306
+ for sf in src_files:
307
+ deps.append({"from": tf, "to": sf})
308
+
309
+ # Build step data for trajectory
310
+ steps_data = []
311
+ for step in traj.get("steps", []):
312
+ steps_data.append({
313
+ "step": step.get("step_number", 0),
314
+ "action": step.get("action_type", ""),
315
+ "path": step.get("action_path"),
316
+ "reward": step.get("reward", 0.0),
317
+ "error": step.get("error"),
318
+ "pass_rate": step.get("test_pass_rate"),
319
+ })
320
+
321
+ # Get strategy
322
+ strategy_info = strategy_det.detect(
323
+ traj.get("steps", []),
324
+ env.current_task or "unknown",
325
+ meta,
326
+ list(env.files_read),
327
+ env.final_score,
328
+ ) if traj.get("steps") else None
329
+
330
+ return {
331
+ "task": env.current_task or "unknown",
332
+ "variant_id": traj.get("variant_id", "unknown"),
333
+ "final_score": env.final_score,
334
+ "strategy": strategy_info.strategy if strategy_info else "UNKNOWN",
335
+ "failure_type": "—",
336
+ "files": files,
337
+ "dependencies": deps,
338
+ "steps": steps_data,
339
+ }
server/failure_classifier.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/failure_classifier.py
2
+ """
3
+ Typed Failure Classification Engine.
4
+
5
+ Classifies agent failures into precise, actionable categories rather than
6
+ vague scores. Each failure type has a root cause, evidence, and remediation.
7
+
8
+ Failure taxonomy:
9
+ WRONG_FILE_NAVIGATION — agent read irrelevant files, missed key files
10
+ BLIND_WRITE — agent wrote code without reading first
11
+ HALLUCINATED_CODE — agent wrote syntactically/logically wrong code
12
+ NEVER_TESTED — agent submitted without running any tests
13
+ LOOPING_BEHAVIOR — agent repeated same action 3+ times
14
+ CONTEXT_OVERFLOW — agent read enormous amounts of irrelevant data
15
+ SECURITY_VIOLATION — agent wrote dangerous code
16
+ CORRECT — no failure detected
17
+ """
18
+ from typing import List, Dict, Any, Optional
19
+ from dataclasses import dataclass, field
20
+
21
+
22
@dataclass
class FailureInstance:
    """One classified failure event within a single episode.

    Produced by FailureClassifier; severity is one of "critical", "major",
    or "minor" (ranking used to pick the primary failure is SEVERITY_RANK).
    """
    failure_type: str   # e.g. "WRONG_FILE_NAVIGATION"
    severity: str       # "critical" | "major" | "minor"
    step_number: int    # Which step triggered it (1-based)
    evidence: str       # Specific observation
    root_cause: str     # Why this happens
    remediation: str    # How to fix in next run
31
+
32
+
33
@dataclass
class FailureReport:
    """Full failure analysis for one episode.

    Aggregates every classified FailureInstance plus a human-readable
    summary and an actionable retry hint for the next attempt.
    """
    episode_id: str
    task: str
    primary_failure: str  # Most severe failure type
    failures: List[FailureInstance] = field(default_factory=list)
    success: bool = False
    failure_summary: str = ""
    retry_hint: str = ""  # Actionable hint for the next attempt

    def to_dict(self) -> dict:
        """Serialize the report (and each nested failure) to plain dicts."""
        serialized_failures = []
        for fail in self.failures:
            serialized_failures.append({
                "type": fail.failure_type,
                "severity": fail.severity,
                "step": fail.step_number,
                "evidence": fail.evidence,
                "root_cause": fail.root_cause,
                "remediation": fail.remediation,
            })
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "success": self.success,
            "primary_failure": self.primary_failure,
            "failure_count": len(serialized_failures),
            "failures": serialized_failures,
            "failure_summary": self.failure_summary,
            "retry_hint": self.retry_hint,
        }
65
+
66
+
67
# ── Severity ordering for picking primary failure ─────────────────────────────
SEVERITY_RANK = {"critical": 3, "major": 2, "minor": 1}

# Canonical remediation text per failure type. The text of the primary
# failure is surfaced verbatim as the episode's retry hint.
FAILURE_REMEDIATION = {
    "WRONG_FILE_NAVIGATION": (
        "Read the failing test file first to understand the module under test, "
        "then navigate directly to the imported source files."
    ),
    "BLIND_WRITE": (
        "Always read the target file before writing. Use read_file → write_file → run_tests."
    ),
    "HALLUCINATED_CODE": (
        "Re-read the source file, understand the function signature, "
        "then write a minimal targeted fix. Run tests to verify."
    ),
    "NEVER_TESTED": (
        "Always call run_tests after writing a fix. "
        "Submit only when test pass rate has demonstrably improved."
    ),
    "LOOPING_BEHAVIOR": (
        "Stop repeating the same action. Use search_code to find the bug location, "
        "then navigate directly to it."
    ),
    "CONTEXT_OVERFLOW": (
        "Focus on files explicitly referenced in the failing test's imports. "
        "Avoid reading utility files unless the test error specifically mentions them."
    ),
    "SECURITY_VIOLATION": (
        "Do not use os.system, eval, exec, or subprocess in fixes. "
        "Write pure Python logic without shell calls."
    ),
    "CORRECT": "No remediation needed.",
}
100
+
101
+
102
class FailureClassifier:
    """
    Classifies agent failures from trajectory data.

    Each internal classifier inspects the recorded action sequence and
    episode metadata, appending zero or more FailureInstance records; the
    most severe one becomes the report's primary failure.

    Usage:
        clf = FailureClassifier()
        report = clf.classify(
            episode_id="abc123",
            task="task1",
            trajectory_steps=[...],
            variant_meta={...},
            files_read=[...],
            files_written=[...],
            final_score=0.0,
        )
    """

    def classify(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: Dict[str, Any],
        files_read: List[str],
        files_written: List[str],
        final_score: float,
        security_violations: int = 0,
    ) -> FailureReport:
        """Run all classifiers and build a structured failure report.

        Returns a FailureReport; a score >= 0.5 with no security
        violations short-circuits to a "CORRECT" report.
        """
        failures: List[FailureInstance] = []
        success = final_score >= 0.5

        if success and security_violations == 0:
            return FailureReport(
                episode_id=episode_id,
                task=task,
                primary_failure="CORRECT",
                failures=[],
                success=True,
                failure_summary="Agent succeeded without errors.",
                retry_hint="",
            )

        action_sequence = [s.get("action_type", "") for s in trajectory_steps]

        # ── Classifier 1: Wrong File Navigation ───────────────────────────────
        # "Relevant" files come from variant metadata; the agent should have
        # read at least one of them.
        relevant = set(
            variant_meta.get("bug_files", []) +
            variant_meta.get("interface_files", []) +
            variant_meta.get("read_first_files", []) +
            variant_meta.get("files_to_implement", [])
        )
        if relevant and files_read:
            irrelevant_reads = [f for f in files_read if f not in relevant
                                and not f.startswith("tests/")]
            if len(irrelevant_reads) > 1 and not any(f in files_read for f in relevant):
                failures.append(FailureInstance(
                    failure_type="WRONG_FILE_NAVIGATION",
                    severity="critical",
                    step_number=1,
                    evidence=f"Read {len(irrelevant_reads)} irrelevant files: {irrelevant_reads[:3]}. "
                             f"Never read key files: {list(relevant)[:3]}",
                    root_cause="Agent navigated to wrong part of the codebase entirely.",
                    remediation=FAILURE_REMEDIATION["WRONG_FILE_NAVIGATION"],
                ))

        # ── Classifier 2: Blind Write ─────────────────────────────────────────
        # A write_file with no read_file anywhere before it in the episode.
        write_indices = [i for i, a in enumerate(action_sequence) if a == "write_file"]
        for wi in write_indices:
            if "read_file" not in action_sequence[:wi]:
                failures.append(FailureInstance(
                    failure_type="BLIND_WRITE",
                    severity="critical",
                    step_number=wi + 1,
                    evidence=f"write_file at step {wi+1} with zero prior read_file actions.",
                    root_cause="Agent attempted to fix code without reading it first — likely hallucinating.",
                    remediation=FAILURE_REMEDIATION["BLIND_WRITE"],
                ))

        # ── Classifier 3: Hallucinated Code ───────────────────────────────────
        # Detect a write followed by a test run with a very low pass rate.
        for i, step in enumerate(trajectory_steps):
            if step.get("action_type") == "run_tests":
                prev_write = None
                for j in range(i - 1, -1, -1):
                    if trajectory_steps[j].get("action_type") == "write_file":
                        prev_write = j
                        break
                if prev_write is not None:
                    pass_rate = step.get("test_pass_rate", None)
                    if pass_rate is not None and pass_rate < 0.3:
                        failures.append(FailureInstance(
                            failure_type="HALLUCINATED_CODE",
                            severity="major",
                            step_number=i + 1,
                            evidence=f"Test pass rate {pass_rate:.2f} after write at step {prev_write+1}. "
                                     f"Code change made things worse.",
                            root_cause="Agent wrote syntactically correct but semantically wrong code.",
                            remediation=FAILURE_REMEDIATION["HALLUCINATED_CODE"],
                        ))

        # ── Classifier 4: Never Tested ────────────────────────────────────────
        has_tests = "run_tests" in action_sequence
        has_writes = "write_file" in action_sequence
        has_submit = "submit" in action_sequence
        if has_submit and has_writes and not has_tests:
            failures.append(FailureInstance(
                failure_type="NEVER_TESTED",
                severity="major",
                step_number=len(action_sequence),
                evidence="Agent wrote code changes but submitted without running any tests.",
                root_cause="No feedback loop — agent cannot know if its fix worked.",
                remediation=FAILURE_REMEDIATION["NEVER_TESTED"],
            ))

        # ── Classifier 5: Looping Behavior ────────────────────────────────────
        # Any file read 3+ times indicates the agent is stuck.
        read_paths = [
            (i, s.get("action_path"))
            for i, s in enumerate(trajectory_steps)
            if s.get("action_type") == "read_file" and s.get("action_path")
        ]
        path_counts: Dict[str, List[int]] = {}
        for idx, path in read_paths:
            path_counts.setdefault(path, []).append(idx)

        for path, indices in path_counts.items():
            if len(indices) >= 3:
                failures.append(FailureInstance(
                    failure_type="LOOPING_BEHAVIOR",
                    severity="major",
                    step_number=indices[2] + 1,  # step of the third read
                    evidence=f"Read '{path}' {len(indices)} times (steps {[i+1 for i in indices]}). "
                             f"Agent is stuck in a read loop.",
                    root_cause="Agent cannot extract the needed information and keeps retrying.",
                    remediation=FAILURE_REMEDIATION["LOOPING_BEHAVIOR"],
                ))

        # ── Classifier 6: Context Overflow ────────────────────────────────────
        total_content = sum(
            s.get("action_content_length") or 0
            for s in trajectory_steps
            if s.get("action_type") == "read_file"
        )
        if total_content > 50_000 and final_score < 0.5:
            failures.append(FailureInstance(
                failure_type="CONTEXT_OVERFLOW",
                severity="minor",
                step_number=len(trajectory_steps),
                evidence=f"Agent read {total_content:,} chars total. "
                         f"Most of this was likely irrelevant context.",
                root_cause="Agent wasted token budget reading unnecessary files.",
                remediation=FAILURE_REMEDIATION["CONTEXT_OVERFLOW"],
            ))

        # ── Classifier 7: Security Violation ─────────────────────────────────
        if security_violations > 0:
            sec_steps = [
                s for s in trajectory_steps if s.get("security_flags")
            ]
            for ss in sec_steps:
                failures.append(FailureInstance(
                    failure_type="SECURITY_VIOLATION",
                    severity="critical",
                    step_number=ss.get("step_number", 0),
                    evidence=f"Flags: {ss.get('security_flags', [])}",
                    root_cause="Agent wrote unsafe code patterns that would be dangerous in production.",
                    remediation=FAILURE_REMEDIATION["SECURITY_VIOLATION"],
                ))

        # ── Build report ──────────────────────────────────────────────────────
        if not failures:
            # Failed but no specific classifier triggered — generic low score
            primary = "HALLUCINATED_CODE"
            summary = f"Score {final_score:.2f} — fix was written but insufficient. Re-read the source files more carefully."
            hint = "Read test file → read all src files → write targeted fix → run tests → submit."
        else:
            # Pick most severe failure as primary (stable sort keeps order
            # within the same severity).
            failures.sort(key=lambda f: SEVERITY_RANK.get(f.severity, 0), reverse=True)
            primary = failures[0].failure_type
            summary = "; ".join(f"{f.failure_type} (step {f.step_number})" for f in failures[:3])
            hint = failures[0].remediation

        return FailureReport(
            episode_id=episode_id,
            task=task,
            primary_failure=primary,
            failures=failures,
            success=success,
            failure_summary=summary,
            retry_hint=hint,
        )
server/multi_agent.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/multi_agent.py
2
+ """
3
+ Multi-Agent Comparison Engine.
4
+
5
+ Runs multiple agent configurations against the SAME task variant
6
+ and produces a side-by-side comparison report.
7
+
8
+ Agent configurations:
9
+ - Deterministic (rule-based, no LLM) — baseline
10
+ - Test-first (forces reading tests before anything)
11
+ - Search-first (forces search_code before reads)
12
+ - LLM-based (if HF_TOKEN provided)
13
+
14
+ This is the key feature that answers: "Which agent strategy wins?"
15
+ """
16
+ import time
17
+ import copy
18
+ from typing import List, Dict, Any, Optional, Callable
19
+ from dataclasses import dataclass, field
20
+
21
+
22
@dataclass
class AgentRunResult:
    """Result of one agent configuration running one episode."""
    agent_name: str
    task: str
    variant_id: str
    final_score: float
    total_steps: int
    cumulative_reward: float
    duration_seconds: float
    action_sequence: List[str]
    files_read: List[str]
    files_written: List[str]
    strategy: str  # Detected strategy label
    strategy_score: float
    failure_type: str
    reliability_index: float
    step_timeline: List[dict]

    def to_dict(self) -> dict:
        """Serialize for JSON output, rounding float metrics for display."""
        payload = {
            "agent_name": self.agent_name,
            "task": self.task,
            "variant_id": self.variant_id,
            "final_score": self.final_score,
            "total_steps": self.total_steps,
            "cumulative_reward": self.cumulative_reward,
            "duration_seconds": self.duration_seconds,
            "action_sequence": self.action_sequence,
            "files_read": self.files_read,
            "files_written": self.files_written,
            "strategy": self.strategy,
            "strategy_score": self.strategy_score,
            "failure_type": self.failure_type,
            "reliability_index": self.reliability_index,
            "step_timeline": self.step_timeline,
        }
        # Round the float metrics to the same precision as before.
        for key, digits in (
            ("final_score", 3),
            ("cumulative_reward", 3),
            ("duration_seconds", 2),
            ("strategy_score", 3),
            ("reliability_index", 3),
        ):
            payload[key] = round(payload[key], digits)
        return payload
59
+
60
+
61
@dataclass
class ComparisonReport:
    """Side-by-side comparison of multiple agent configurations.

    Annotations use string forward references so the class is importable
    even when AgentRunResult is defined later / elsewhere.
    """
    task: str
    variant_id: str
    runs: List["AgentRunResult"] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize the comparison: winner, ranked summary table, details, insights."""
        if not self.runs:
            return {"error": "No runs to compare"}

        # Rank by score (descending), breaking ties on fewer steps.
        ranked = sorted(self.runs, key=lambda r: (-r.final_score, r.total_steps))
        winner = ranked[0]

        return {
            "task": self.task,
            "variant_id": self.variant_id,
            "winner": winner.agent_name,
            "winner_score": winner.final_score,
            "summary_table": [
                {
                    "rank": i + 1,
                    "agent": r.agent_name,
                    "score": round(r.final_score, 3),
                    "steps": r.total_steps,
                    "reward": round(r.cumulative_reward, 3),
                    "strategy": r.strategy,
                    "failure": r.failure_type,
                    "reliability": round(r.reliability_index, 3),
                }
                for i, r in enumerate(ranked)
            ],
            "detailed_runs": [r.to_dict() for r in self.runs],
            "insights": self._generate_insights(ranked),
        }

    def _generate_insights(self, ranked: List["AgentRunResult"]) -> List[str]:
        """Derive human-readable insights from the ranked runs.

        Expects `ranked` sorted best-first (as produced by to_dict).
        Returns an empty list when there is nothing to compare.
        """
        insights = []
        if len(ranked) < 2:
            return insights

        best = ranked[0]
        worst = ranked[-1]

        if best.final_score > worst.final_score + 0.2:
            insights.append(
                f"'{best.agent_name}' significantly outperformed '{worst.agent_name}' "
                f"({best.final_score:.2f} vs {worst.final_score:.2f})"
            )

        # Among successful runs (score >= 0.5), find the fewest-steps one;
        # unsuccessful runs are pushed to +inf so they can never win.
        most_efficient = min(ranked, key=lambda r: r.total_steps if r.final_score >= 0.5 else float('inf'))
        if most_efficient.final_score >= 0.5:
            insights.append(
                f"Most step-efficient successful agent: '{most_efficient.agent_name}' "
                f"({most_efficient.total_steps} steps)"
            )

        strategies = [r.strategy for r in ranked]
        if len(set(strategies)) > 1:
            insights.append(
                f"Strategy variance observed: {set(strategies)} — "
                f"'{best.agent_name}' used {best.strategy} which proved most effective."
            )

        return insights
128
+
129
+
130
class MultiAgentComparison:
    """
    Runs multiple deterministic agent strategies against the same environment.

    Each built-in strategy is a pure function ``(obs_dict, step, context) -> action_dict``
    so strategies can be unit-tested without an environment.

    Usage (in-process, no LLM required):
        from server.environment import CodebaseNavEnvironment
        from server.models import RepoAction

        env = CodebaseNavEnvironment()
        engine = MultiAgentComparison()
        report = engine.compare(env, task="task1")
    """

    # ── Built-in agent strategies ─────────────────────────────────────────────

    @staticmethod
    def _agent_test_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read tests before any source file."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))

        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])

        # Phase 1: Tests first
        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        # Phase 2: Source files
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        # Phase 3: Run tests once, then submit
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_search_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Use search_code to locate the bug before reading."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        failing = obs.get("failing_tests", [])

        # Step 1: search for the failing test function name
        if step == 1 and failing:
            fn_name = failing[0].split(".")[-1] if failing else "bug"
            context["searched"] = True
            return {"action_type": "search_code", "query": fn_name}

        # Step 2+: read tests, then sources, then run tests once
        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])

        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_minimal(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Minimal effort — read one file, submit immediately."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        src_files = [f for f in tree if f.startswith("src/") and f.endswith(".py")]
        if src_files and not files_read:
            return {"action_type": "read_file", "path": src_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_exhaustive(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read everything, run tests, then submit."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))

        all_readable = [f for f in tree if f.endswith(".py") or f.endswith(".md")]
        for f in all_readable:
            if f not in files_read:
                return {"action_type": "read_file", "path": f}

        test_files = [f for f in tree if f.startswith("tests/")]
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        if test_files and context.get("tests_run2", 0) == 0:
            context["tests_run2"] = 1
            return {"action_type": "run_tests"}
        return {"action_type": "submit"}

    # Plain functions (via __func__) so compare() can call them uniformly.
    AGENT_CONFIGS = {
        "test-first": _agent_test_first.__func__,
        "search-first": _agent_search_first.__func__,
        "minimal": _agent_minimal.__func__,
        "exhaustive": _agent_exhaustive.__func__,
    }

    def compare(
        self,
        env,  # CodebaseNavEnvironment instance
        task: str = "task1",
        agents: Optional[List[str]] = None,
        shared_variant: Optional[str] = None,  # NOTE(review): currently unused — confirm intent
    ) -> "ComparisonReport":
        """
        Run all (or selected) agents against the same task and compare.
        The environment is reset to the same variant for each agent.
        """
        # Project-local imports resolved at call time.
        from server.models import RepoAction
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine

        agent_names = agents or list(self.AGENT_CONFIGS.keys())
        strategy_detector = StrategyDetector()
        failure_classifier = FailureClassifier()
        metrics_engine = AdvancedMetricsEngine()

        runs: List[AgentRunResult] = []
        variant_id = None

        for agent_name in agent_names:
            agent_fn = self.AGENT_CONFIGS.get(agent_name)
            if not agent_fn:
                # Unknown agent name — skip silently.
                continue

            # Reset environment for each agent
            reset_result = env.reset(task=task)
            obs = reset_result.observation
            variant_id = reset_result.info.get("variant_id", "unknown")

            context = {}
            start = time.time()
            max_steps = 15
            files_read = []
            files_written = []
            cumulative_reward = 0.0
            action_sequence = []
            step_timeline = []

            obs_dict = obs.model_dump()

            for step_num in range(1, max_steps + 1):
                if env.done:
                    break

                action_dict = agent_fn(obs_dict, step_num, context)
                action = RepoAction(
                    action_type=action_dict.get("action_type", "submit"),
                    path=action_dict.get("path"),
                    query=action_dict.get("query"),
                    content=action_dict.get("content"),
                )

                result = env.step(action)
                obs = result.observation
                obs_dict = obs.model_dump()
                cumulative_reward += result.reward
                action_sequence.append(action.action_type)

                if action.path and action.action_type == "read_file":
                    files_read.append(action.path)
                if action.path and action.action_type == "write_file":
                    files_written.append(action.path)

                step_timeline.append({
                    "step": step_num,
                    "action": action.action_type,
                    "path": action.path,
                    "reward": round(result.reward, 3),
                })

                if result.done:
                    break

            # Force submit if the agent ran out of steps without finishing
            if not env.done:
                result = env.step(RepoAction(action_type="submit"))
                cumulative_reward += result.reward
                action_sequence.append("submit")

            duration = time.time() - start
            final_score = env.final_score

            # Get trajectory for analysis
            trajectory = env.get_trajectory()
            traj_steps = trajectory.get("steps", []) if trajectory else []
            variant_meta = {}
            if env.variant:
                variant_meta = env.variant.meta

            # Detect strategy
            strategy_report = strategy_detector.detect(
                traj_steps, task, variant_meta, files_read, final_score
            )

            # Classify failure
            failure_report = failure_classifier.classify(
                episode_id=trajectory.get("episode_id", "") if trajectory else "",
                task=task,
                trajectory_steps=traj_steps,
                variant_meta=variant_meta,
                files_read=files_read,
                files_written=files_written,
                final_score=final_score,
            )

            # Advanced metrics
            adv_metrics = metrics_engine.compute(
                traj_steps, variant_meta, final_score, files_read, files_written
            )

            runs.append(AgentRunResult(
                agent_name=agent_name,
                task=task,
                variant_id=variant_id or "unknown",
                final_score=final_score,
                total_steps=len(action_sequence),
                cumulative_reward=cumulative_reward,
                duration_seconds=duration,
                action_sequence=action_sequence,
                files_read=files_read,
                files_written=files_written,
                strategy=strategy_report.strategy,
                strategy_score=strategy_report.score,
                failure_type=failure_report.primary_failure,
                reliability_index=adv_metrics.reliability_index,
                step_timeline=step_timeline,
            ))

        return ComparisonReport(
            task=task,
            variant_id=variant_id or "unknown",
            runs=runs,
        )
server/self_improvement.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/self_improvement.py
2
+ """
3
+ Self-Improvement Loop.
4
+
5
+ After a failure, generates structured feedback and an improved strategy prompt
6
+ that can be injected into the agent's next attempt. This closes the loop
7
+ between evaluation and agent behavior.
8
+
9
+ The retry loop:
10
+ 1. Run episode → evaluate → classify failures
11
+ 2. Generate improvement prompt based on failure type
12
+ 3. Re-run episode with improvement prompt injected into agent context
13
+ 4. Compare before/after performance
14
+ """
15
+ from typing import List, Dict, Any, Optional
16
+ from dataclasses import dataclass, field
17
+
18
+
19
@dataclass
class ImprovementPlan:
    """Structured feedback for the agent's next attempt."""
    episode_id: str
    task: str
    failure_type: str
    original_score: float

    # Actionable feedback
    what_went_wrong: str
    specific_errors: List[str]
    improved_strategy: str
    step_by_step_plan: List[str]

    # For injection into agent prompt
    system_prompt_addon: str  # Extra instructions for the system prompt
    user_context_addon: str   # Extra context for the user prompt

    def to_dict(self) -> dict:
        """Serialize the plan; the score is rounded to 3 decimals."""
        payload = {
            "episode_id": self.episode_id,
            "task": self.task,
            "failure_type": self.failure_type,
            "original_score": round(self.original_score, 3),
        }
        # Remaining fields pass through unmodified, in the original key order.
        for name in (
            "what_went_wrong",
            "specific_errors",
            "improved_strategy",
            "step_by_step_plan",
            "system_prompt_addon",
            "user_context_addon",
        ):
            payload[name] = getattr(self, name)
        return payload
50
+
51
+
52
@dataclass
class RetryResult:
    """Result of a retry attempt with improvement feedback."""
    original_episode_id: str
    retry_episode_id: str
    original_score: float
    retry_score: float
    improvement: float  # retry_score - original_score
    failure_fixed: bool
    steps_comparison: Dict[str, int]  # {"original": N, "retry": M}

    def to_dict(self) -> dict:
        """Serialize; scores and their delta are rounded to 3 decimals."""
        rounded = {
            name: round(getattr(self, name), 3)
            for name in ("original_score", "retry_score", "improvement")
        }
        return {
            "original_episode_id": self.original_episode_id,
            "retry_episode_id": self.retry_episode_id,
            "original_score": rounded["original_score"],
            "retry_score": rounded["retry_score"],
            "improvement": rounded["improvement"],
            "failure_fixed": self.failure_fixed,
            "steps_comparison": self.steps_comparison,
        }
73
+
74
+
75
# ── Strategy templates per failure type ──────────────────────────────────────
# Each template supplies the fields an ImprovementPlan is built from: a
# diagnosis ("what_went_wrong"), a one-line strategy, an ordered step plan,
# and text to inject into the agent's system prompt on retry.
STRATEGY_TEMPLATES = {
    "WRONG_FILE_NAVIGATION": {
        "what_went_wrong": "Agent navigated to the wrong files and missed the bug location entirely.",
        "strategy": "START with the failing test file. Read its imports. Navigate exclusively to those imported modules.",
        "plan": [
            "1. Read the failing test file FIRST (in tests/ directory)",
            "2. Find the import statements — these point to the buggy module",
            "3. Read ONLY those imported source files",
            "4. Look for the function/method the test is calling",
            "5. Fix the specific function — do not touch other code",
            "6. Run the failing test to verify",
            "7. Submit",
        ],
        "system_addon": (
            "CRITICAL: You previously failed by reading the wrong files. "
            "This time: read the failing test first, identify its imports, "
            "go directly to those source files. Do NOT read any file not referenced by the test."
        ),
    },
    "BLIND_WRITE": {
        "what_went_wrong": "Agent wrote code without reading the existing implementation first.",
        "strategy": "NEVER write before reading. Read the target file. Understand the existing logic. Then write a minimal fix.",
        "plan": [
            "1. Read the failing test to understand expected behavior",
            "2. Read the source file you plan to modify",
            "3. Identify the exact line(s) causing failure",
            "4. Write a FIX (not a rewrite) targeting only those lines",
            "5. Run tests to verify improvement",
            "6. Submit",
        ],
        "system_addon": (
            "CRITICAL: You previously wrote code without reading the file first. "
            "This time: ALWAYS call read_file on any file BEFORE using write_file. "
            "No exceptions. Read → Understand → Write minimal fix."
        ),
    },
    "HALLUCINATED_CODE": {
        "what_went_wrong": "Agent wrote syntactically correct but logically wrong code that made tests worse.",
        "strategy": "Write a targeted, minimal fix. Do not rewrite entire functions. Change only what the test requires.",
        "plan": [
            "1. Read the failing test and note EXACTLY what assertion fails",
            "2. Read the source function — understand its current behavior",
            "3. Identify the gap between current and expected behavior",
            "4. Write the SMALLEST possible change that bridges that gap",
            "5. Run tests BEFORE submitting to verify the fix works",
            "6. If tests still fail, re-read and refine — don't guess",
        ],
        "system_addon": (
            "CRITICAL: Your previous fix made things worse. This indicates hallucination. "
            "This time: make the SMALLEST possible change. "
            "Run run_tests after EVERY write to check if you're improving or degrading. "
            "If tests get worse after a write, immediately read the file again and try a different approach."
        ),
    },
    "NEVER_TESTED": {
        "what_went_wrong": "Agent submitted code changes without running any tests to verify they work.",
        "strategy": "ALWAYS run run_tests after every write_file. Never submit without test verification.",
        "plan": [
            "1. Read test → Read source → Write fix",
            "2. IMMEDIATELY run run_tests pointing to the failing test file",
            "3. If tests pass: submit",
            "4. If tests still fail: re-read, refine, run tests again",
            "5. ONLY submit when you have seen test improvement",
        ],
        "system_addon": (
            "CRITICAL: You submitted without testing. This is invalid. "
            "This time: after EVERY write_file action, you MUST call run_tests. "
            "Only call submit when run_tests shows improvement. "
            "The pattern is: read → write → run_tests → submit. Non-negotiable."
        ),
    },
    "LOOPING_BEHAVIOR": {
        "what_went_wrong": "Agent got stuck reading the same file repeatedly without making progress.",
        "strategy": "Use search_code to find the exact bug location. Read each file at most once.",
        "plan": [
            "1. Use search_code with the function name from the failing test",
            "2. Read the file that contains the matching code — ONCE",
            "3. If you need more context, use search_code again with a different query",
            "4. Once you have read a file, do NOT read it again",
            "5. Write your fix, run tests, submit",
        ],
        "system_addon": (
            "CRITICAL: You read the same files 3+ times without progress. "
            "This time: you may read each file AT MOST ONCE. "
            "Use search_code to pinpoint bug location. "
            "If you are confused, use search_code — do not re-read files."
        ),
    },
    "SECURITY_VIOLATION": {
        "what_went_wrong": "Agent wrote dangerous code patterns that would be harmful in production.",
        "strategy": "Write pure Python logic only. Never use os, subprocess, eval, or exec.",
        "plan": [
            "1. Read the test to understand what pure Python behavior is needed",
            "2. Implement the fix using ONLY standard library functions",
            "3. No os.system(), subprocess, eval(), exec(), or __import__()",
            "4. Run tests and submit",
        ],
        "system_addon": (
            "CRITICAL: Your previous code contained dangerous patterns (os.system, eval, exec, subprocess). "
            "This is automatically penalized. "
            "This time: write ONLY pure Python logic. No shell commands. No dynamic execution. "
            "Use only stdlib modules that do not involve system access."
        ),
    },
    "CORRECT": {
        "what_went_wrong": "No failure — agent succeeded.",
        "strategy": "Continue with same strategy.",
        "plan": ["Maintain current approach."],
        "system_addon": "",
    },
}
187
+
188
# Default template for unknown failures.
# Fallback used by SelfImprovementEngine.generate_improvement_plan when the
# classified failure_type has no dedicated entry in STRATEGY_TEMPLATES.
# Same shape as the STRATEGY_TEMPLATES values: diagnosis, high-level strategy,
# numbered step plan, and a system-prompt addon injected on retry.
DEFAULT_TEMPLATE = {
    # One-line diagnosis surfaced to the agent in its retry context.
    "what_went_wrong": "Agent failed to fix the bug sufficiently — score too low.",
    # High-level retry strategy sentence.
    "strategy": "Read all relevant files carefully, make a targeted fix, run tests, submit.",
    # Ordered, numbered steps shown verbatim in the improvement plan.
    "plan": [
        "1. Read failing test to understand expected behavior",
        "2. Read each source file referenced by the test",
        "3. Identify the bug: wrong return value, missing case, logic error",
        "4. Write minimal fix",
        "5. Run tests",
        "6. Submit only when tests improve",
    ],
    # Appended to the agent's system prompt on the retry attempt.
    "system_addon": (
        "IMPORTANT: Your previous attempt scored below 0.5. "
        "This time: focus on understanding what the failing test EXPECTS. "
        "Make a targeted fix. Verify with run_tests before submitting."
    ),
}
206
+
207
+
208
class SelfImprovementEngine:
    """
    Generates structured improvement plans from failure analysis.

    Usage:
        engine = SelfImprovementEngine()
        plan = engine.generate_improvement_plan(
            episode_id="abc123",
            task="task1",
            failure_type="NEVER_TESTED",
            failure_evidence=["submitted with 0 run_tests calls"],
            original_score=0.2,
            trajectory_steps=[...],
            files_read=[...],
            files_written=[...],
        )
        # Then inject plan.system_prompt_addon into the agent's next run
    """

    def generate_improvement_plan(
        self,
        episode_id: str,
        task: str,
        failure_type: str,
        failure_evidence: List[str],
        original_score: float,
        trajectory_steps: List[dict],
        files_read: List[str],
        files_written: List[str],
    ) -> ImprovementPlan:
        """Generate an actionable improvement plan from failure data.

        Args:
            episode_id: Identifier of the failed episode being analyzed.
            task: Task name the agent attempted.
            failure_type: Classifier label; used as a key into
                STRATEGY_TEMPLATES (falls back to DEFAULT_TEMPLATE).
            failure_evidence: Evidence strings from the failure classifier.
            original_score: Final score of the failed attempt (0.0-1.0).
            trajectory_steps: Per-step dicts (action_type, action_path,
                action_query, reward, error, step_number, ...).
            files_read: Files the agent read. Currently unused here; kept
                for API compatibility with callers that pass it.
            files_written: Files the agent wrote. Currently unused here;
                kept for API compatibility with callers that pass it.

        Returns:
            ImprovementPlan carrying the diagnosis, retry strategy, and
            prompt addons to inject into the next attempt.
        """
        # Unknown failure types get the generic "read carefully, test, submit" plan.
        template = STRATEGY_TEMPLATES.get(failure_type, DEFAULT_TEMPLATE)

        # Collect per-step errors from the trajectory; truncate each message
        # so the retry prompt stays compact.
        specific_errors = []
        for step in trajectory_steps:
            if step.get("error"):
                specific_errors.append(
                    f"Step {step.get('step_number', '?')}: {step['error'][:100]}"
                )
        specific_errors.extend(failure_evidence[:3])

        # Summarize the start of the trajectory for the retry user context.
        action_summary = []
        for step in trajectory_steps[:8]:  # First 8 steps for context
            a = step.get("action_type", "?")
            p = step.get("action_path") or step.get("action_query") or ""
            r = step.get("reward", 0)
            err = " ❌" if step.get("error") else ""
            action_summary.append(f" Step {step.get('step_number', '?')}: {a} {p} → reward={r:+.2f}{err}")

        user_context_addon = (
            f"[PREVIOUS ATTEMPT REVIEW]\n"
            f"Score: {original_score:.2f}/1.0\n"
            f"Primary failure: {failure_type}\n"
            f"What went wrong: {template['what_went_wrong']}\n"
            f"\nYour previous actions:\n" + "\n".join(action_summary) +
            f"\n\n[IMPROVED STRATEGY FOR THIS ATTEMPT]\n{template['strategy']}"
        )

        return ImprovementPlan(
            episode_id=episode_id,
            task=task,
            failure_type=failure_type,
            original_score=original_score,
            what_went_wrong=template["what_went_wrong"],
            specific_errors=specific_errors,
            improved_strategy=template["strategy"],
            step_by_step_plan=template["plan"],
            system_prompt_addon=template["system_addon"],
            user_context_addon=user_context_addon,
        )

    def build_retry_system_prompt(self, base_prompt: str, improvement_plan: ImprovementPlan) -> str:
        """Inject improvement guidance into the agent system prompt.

        Returns base_prompt unchanged when the plan has no addon
        (e.g. the CORRECT template), otherwise appends a clearly
        delimited feedback section.
        """
        if not improvement_plan.system_prompt_addon:
            return base_prompt
        return (
            f"{base_prompt}\n\n"
            f"{'='*60}\n"
            f"PREVIOUS ATTEMPT FEEDBACK (VERY IMPORTANT):\n"
            f"{'='*60}\n"
            f"{improvement_plan.system_prompt_addon}\n"
            f"{'='*60}"
        )

    def build_retry_user_context(self, improvement_plan: ImprovementPlan) -> str:
        """Build the user context string to prepend to the first prompt in a retry."""
        return improvement_plan.user_context_addon
server/strategy_detector.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/strategy_detector.py
2
+ """
3
+ Strategy Pattern Detector.
4
+
5
+ Classifies what high-level search/navigation strategy the agent used.
6
+ This goes beyond step counting — it classifies the cognitive approach.
7
+
8
+ Strategies:
9
+ TARGETED_DEBUGGING — reads test → reads relevant src → fixes → tests
10
+ SYSTEMATIC_SEARCH — reads all files methodically before writing
11
+ BRUTE_FORCE — writes and runs tests repeatedly until something passes
12
+ RANDOM_EXPLORATION — no coherent pattern, reads random files
13
+ SPEC_DRIVEN — reads spec/docs first, then implements
14
+ MINIMAL_EFFORT — does the bare minimum (often fails)
15
+
16
+ Each strategy gets a score (1.0 = ideal for the task), a label, and evidence.
17
+ """
18
+ from typing import List, Dict, Any, Optional
19
+ from dataclasses import dataclass, field
20
+ from collections import Counter
21
+
22
+
23
@dataclass
class StrategyReport:
    """Summary of the navigation strategy detected for one episode."""
    strategy: str              # Primary strategy label
    score: float               # 0.0–1.0 (task-appropriate quality)
    confidence: float          # Confidence in the label (0–1)
    sub_patterns: List[str]    # Additional behavioral sub-patterns
    evidence: List[str]        # Supporting observations
    strategy_description: str  # Human-readable explanation
    exploration_ratio: float   # 0 = pure exploit, 1 = pure explore
    pivot_count: int           # Times the agent changed strategy mid-episode

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict; float fields rounded to 3 places."""
        return dict(
            strategy=self.strategy,
            score=round(self.score, 3),
            confidence=round(self.confidence, 3),
            sub_patterns=self.sub_patterns,
            evidence=self.evidence,
            strategy_description=self.strategy_description,
            exploration_ratio=round(self.exploration_ratio, 3),
            pivot_count=self.pivot_count,
        )
46
+
47
+
48
# Human-readable explanation for each strategy label produced by
# StrategyDetector.detect; copied into StrategyReport.strategy_description.
STRATEGY_DESCRIPTIONS = {
    "TARGETED_DEBUGGING": (
        "Agent reads the failing test to understand expected behavior, "
        "then navigates directly to the relevant source file and makes a targeted fix."
    ),
    "SYSTEMATIC_SEARCH": (
        "Agent reads all available files before writing any code. "
        "Methodical but can waste steps on irrelevant files."
    ),
    "BRUTE_FORCE": (
        "Agent repeatedly writes and runs tests hoping something sticks. "
        "No clear hypothesis about the bug — trial and error approach."
    ),
    "RANDOM_EXPLORATION": (
        "Agent reads files in an incoherent order with no visible strategy. "
        "High entropy — possibly confused by misleading information."
    ),
    "SPEC_DRIVEN": (
        "Agent reads the specification/feature doc first, "
        "then systematically implements what is described. Ideal for task3."
    ),
    "MINIMAL_EFFORT": (
        "Agent took very few steps and submitted early. "
        "May indicate overconfidence or giving up."
    ),
}
74
+
75
+
76
class StrategyDetector:
    """
    Detects the behavioral strategy pattern used by an agent.

    Usage:
        detector = StrategyDetector()
        report = detector.detect(
            trajectory_steps=[...],
            task="task1",
            variant_meta={...},
            files_read=[...],
            final_score=0.7,
        )
    """

    def detect(
        self,
        trajectory_steps: List[dict],
        task: str,
        variant_meta: Dict[str, Any],
        files_read: List[str],
        final_score: float,
    ) -> StrategyReport:
        """Detect strategy from trajectory data.

        Args:
            trajectory_steps: Per-step dicts with at least ``action_type``;
                optionally ``action_path`` and ``test_pass_rate``.
            task: Task identifier ("task3" enables SPEC_DRIVEN detection).
            variant_meta: Variant metadata; ``bug_files``, ``interface_files``
                and ``read_first_files`` mark the relevant files.
            files_read: All file paths the agent read during the episode.
            final_score: Final episode score in [0, 1].

        Returns:
            StrategyReport with the primary strategy, quality score,
            confidence, sub-patterns, and supporting evidence.
        """
        # Empty trajectory: nothing to classify — treat as giving up.
        if not trajectory_steps:
            return StrategyReport(
                strategy="MINIMAL_EFFORT",
                score=0.0,
                confidence=1.0,
                sub_patterns=[],
                evidence=["No steps taken"],
                strategy_description=STRATEGY_DESCRIPTIONS["MINIMAL_EFFORT"],
                exploration_ratio=0.0,
                pivot_count=0,
            )

        action_seq = [s.get("action_type", "") for s in trajectory_steps]
        read_paths = [
            s.get("action_path", "")
            for s in trajectory_steps
            if s.get("action_type") == "read_file"
        ]
        write_count = action_seq.count("write_file")
        test_count = action_seq.count("run_tests")
        read_count = action_seq.count("read_file")
        search_count = action_seq.count("search_code")
        total = len(action_seq)

        # Files the environment considers relevant to the bug/fix.
        relevant = set(
            variant_meta.get("bug_files", []) +
            variant_meta.get("interface_files", []) +
            variant_meta.get("read_first_files", [])
        )
        test_files = [f for f in read_paths if f and f.startswith("tests/")]
        spec_files = [f for f in read_paths if f and f.endswith(".md")]

        sub_patterns = []
        evidence = []

        # ── Exploration ratio: reads/searches vs writes/tests ─────────────────
        explore_actions = read_count + search_count
        exploit_actions = write_count + test_count
        exploration_ratio = (
            explore_actions / (explore_actions + exploit_actions)
            if (explore_actions + exploit_actions) > 0
            else 0.5  # no classifiable actions → neutral
        )

        # ── Pivot detection: explore↔exploit mode switches ────────────────────
        # Neutral actions (submit, list_files, ...) carry no mode and do not
        # reset it, so a pivot across them is still counted. This also avoids
        # the degenerate case where a neutral FIRST action would suppress all
        # pivot counting for the rest of the episode.
        pivots = 0
        mode = None  # "explore" | "exploit" | None (no mode seen yet)
        for a in action_seq:
            if a in ("read_file", "search_code"):
                new_mode = "explore"
            elif a in ("write_file", "run_tests"):
                new_mode = "exploit"
            else:
                continue
            if mode is not None and new_mode != mode:
                pivots += 1
            mode = new_mode

        # Index of the first write (== total when the agent never wrote);
        # used by the SYSTEMATIC_SEARCH rule below.
        first_write_idx = next((i for i, a in enumerate(action_seq) if a == "write_file"), total)
        reads_before_write = sum(
            1 for i, a in enumerate(action_seq) if a == "read_file" and i < first_write_idx
        )

        # ── Strategy classification (first matching rule wins) ────────────────
        strategy = "RANDOM_EXPLORATION"  # default when no rule matches
        score = 0.4
        confidence = 0.5

        # 1. SPEC_DRIVEN (reads spec/md first, task3)
        if task == "task3" and spec_files and action_seq[0] == "read_file":
            strategy = "SPEC_DRIVEN"
            score = 0.85 if final_score > 0.5 else 0.55
            confidence = 0.9
            evidence.append(f"Read spec file(s) first: {spec_files[:2]}")
            sub_patterns.append("SPEC_FIRST")

        # 2. TARGETED_DEBUGGING (test first → relevant src → write → verify)
        elif (test_files and read_paths and read_paths[0].startswith("tests/")
              and write_count >= 1 and test_count >= 1):
            strategy = "TARGETED_DEBUGGING"
            score = 0.85 + (0.15 * final_score)
            confidence = 0.85
            evidence.append(f"First read was test file: {read_paths[0]}")
            evidence.append("Followed by write + test verification")
            sub_patterns.append("TEST_FIRST")
            if relevant and any(f in files_read for f in relevant):
                sub_patterns.append("TARGETED_READ")
                score = min(1.0, score + 0.05)

        # 3. SYSTEMATIC_SEARCH (every read happened before the first write).
        # NOTE: this rule is now a single elif condition rather than a nested
        # `elif write_count > 0: if ...` — the nested form swallowed every
        # episode with a write and made the BRUTE_FORCE rule unreachable.
        elif write_count > 0 and read_count > 0 and reads_before_write == read_count:
            strategy = "SYSTEMATIC_SEARCH"
            score = 0.65
            confidence = 0.75
            evidence.append(f"Read {reads_before_write} files before first write")
            sub_patterns.append("READ_ALL_FIRST")

        # 4. BRUTE_FORCE (multiple write-test cycles)
        elif write_count >= 2 and test_count >= 2:
            strategy = "BRUTE_FORCE"
            score = 0.35
            confidence = 0.8
            evidence.append(f"{write_count} writes + {test_count} test runs = trial and error")
            sub_patterns.append("TRIAL_AND_ERROR")

        # 5. MINIMAL_EFFORT (tiny episode, or no fix attempt at all)
        elif total <= 3 or (write_count == 0 and test_count == 0):
            strategy = "MINIMAL_EFFORT"
            score = 0.1
            confidence = 0.95
            evidence.append(f"Only {total} total steps with no fix attempt")
            sub_patterns.append("GAVE_UP")

        # ── Additional sub-pattern detection ──────────────────────────────────
        # Search-before-read: agent used search_code to locate the bug first.
        if search_count > 0:
            first_search = next((i for i, a in enumerate(action_seq) if a == "search_code"), total)
            first_read = next((i for i, a in enumerate(action_seq) if a == "read_file"), total)
            if first_search < first_read:
                sub_patterns.append("SEARCH_GUIDED")
                evidence.append("Used search_code to locate bug before reading")

        # Excessive looping: re-reading the same file signals confusion.
        path_counts = Counter(p for p in read_paths if p)
        max_rereads = max(path_counts.values()) if path_counts else 0
        if max_rereads >= 3:
            sub_patterns.append("READ_LOOP")
            evidence.append(f"Re-read same file {max_rereads}x — likely confused")
            score = max(0.0, score - 0.2)

        # Verified fix: observed test pass rate improved over the episode.
        test_rates = [s.get("test_pass_rate") for s in trajectory_steps if s.get("test_pass_rate") is not None]
        if len(test_rates) >= 2 and test_rates[-1] > test_rates[0]:
            sub_patterns.append("VERIFIED_FIX")
            evidence.append(f"Test pass rate improved: {test_rates[0]:.2f} → {test_rates[-1]:.2f}")
            score = min(1.0, score + 0.1)

        return StrategyReport(
            strategy=strategy,
            score=max(0.0, min(1.0, score)),
            confidence=confidence,
            sub_patterns=sub_patterns,
            evidence=evidence,
            strategy_description=STRATEGY_DESCRIPTIONS.get(strategy, ""),
            exploration_ratio=exploration_ratio,
            pivot_count=pivots,
        )
static/viz3d.html ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Agent Trajectory 3D Visualizer</title>
7
+ <style>
8
+ * { margin: 0; padding: 0; box-sizing: border-box; }
9
+ body {
10
+ background: #0a0e1a;
11
+ color: #e0e6f0;
12
+ font-family: 'Segoe UI', system-ui, sans-serif;
13
+ overflow: hidden;
14
+ height: 100vh;
15
+ }
16
+ #canvas-container {
17
+ position: absolute;
18
+ top: 0; left: 0;
19
+ width: 100%; height: 100%;
20
+ }
21
+ #ui-overlay {
22
+ position: absolute;
23
+ top: 0; left: 0;
24
+ width: 100%; height: 100%;
25
+ pointer-events: none;
26
+ z-index: 10;
27
+ }
28
+ /* Header */
29
+ #header {
30
+ position: absolute;
31
+ top: 12px; left: 50%;
32
+ transform: translateX(-50%);
33
+ text-align: center;
34
+ pointer-events: none;
35
+ }
36
+ #header h1 {
37
+ font-size: 16px;
38
+ font-weight: 700;
39
+ color: #7dd3fc;
40
+ letter-spacing: 0.05em;
41
+ text-shadow: 0 0 20px rgba(125,211,252,0.5);
42
+ }
43
+ #header p {
44
+ font-size: 11px;
45
+ color: #64748b;
46
+ margin-top: 2px;
47
+ }
48
+ /* Legend */
49
+ #legend {
50
+ position: absolute;
51
+ top: 12px; right: 16px;
52
+ background: rgba(10,14,26,0.85);
53
+ border: 1px solid rgba(125,211,252,0.2);
54
+ border-radius: 8px;
55
+ padding: 10px 14px;
56
+ font-size: 11px;
57
+ pointer-events: none;
58
+ }
59
+ #legend h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
60
+ .legend-item {
61
+ display: flex; align-items: center; gap: 8px;
62
+ margin-bottom: 5px;
63
+ }
64
+ .legend-dot {
65
+ width: 10px; height: 10px;
66
+ border-radius: 50%;
67
+ flex-shrink: 0;
68
+ }
69
+ /* Info panel */
70
+ #info-panel {
71
+ position: absolute;
72
+ top: 12px; left: 16px;
73
+ background: rgba(10,14,26,0.85);
74
+ border: 1px solid rgba(125,211,252,0.2);
75
+ border-radius: 8px;
76
+ padding: 12px 16px;
77
+ min-width: 220px;
78
+ pointer-events: none;
79
+ }
80
+ #info-panel h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; letter-spacing: 0.1em; }
81
+ .info-row {
82
+ display: flex; justify-content: space-between; gap: 12px;
83
+ font-size: 11px;
84
+ margin-bottom: 4px;
85
+ color: #94a3b8;
86
+ }
87
+ .info-value { color: #e0e6f0; font-weight: 600; }
88
+ /* Timeline */
89
+ #timeline-panel {
90
+ position: absolute;
91
+ bottom: 20px; left: 50%;
92
+ transform: translateX(-50%);
93
+ background: rgba(10,14,26,0.9);
94
+ border: 1px solid rgba(125,211,252,0.2);
95
+ border-radius: 10px;
96
+ padding: 14px 20px;
97
+ width: min(700px, 90vw);
98
+ pointer-events: all;
99
+ }
100
+ #timeline-panel .tl-header {
101
+ display: flex;
102
+ justify-content: space-between;
103
+ align-items: center;
104
+ margin-bottom: 10px;
105
+ }
106
+ #timeline-panel h3 {
107
+ font-size: 11px;
108
+ color: #7dd3fc;
109
+ letter-spacing: 0.1em;
110
+ }
111
+ #step-label {
112
+ font-size: 12px;
113
+ color: #f0abfc;
114
+ font-weight: 700;
115
+ }
116
+ #timeline-slider {
117
+ width: 100%;
118
+ -webkit-appearance: none;
119
+ height: 4px;
120
+ background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
121
+ border-radius: 4px;
122
+ outline: none;
123
+ cursor: pointer;
124
+ }
125
+ #timeline-slider::-webkit-slider-thumb {
126
+ -webkit-appearance: none;
127
+ width: 16px; height: 16px;
128
+ border-radius: 50%;
129
+ background: #7dd3fc;
130
+ cursor: pointer;
131
+ box-shadow: 0 0 10px rgba(125,211,252,0.7);
132
+ }
133
+ #step-actions {
134
+ display: flex;
135
+ gap: 8px;
136
+ margin-top: 10px;
137
+ justify-content: center;
138
+ }
139
+ .tl-btn {
140
+ background: rgba(125,211,252,0.1);
141
+ border: 1px solid rgba(125,211,252,0.3);
142
+ color: #7dd3fc;
143
+ padding: 5px 14px;
144
+ border-radius: 6px;
145
+ cursor: pointer;
146
+ font-size: 11px;
147
+ transition: all 0.2s;
148
+ }
149
+ .tl-btn:hover { background: rgba(125,211,252,0.25); }
150
+ .tl-btn.active { background: rgba(125,211,252,0.3); }
151
+ /* Step log */
152
+ #step-log {
153
+ position: absolute;
154
+ bottom: 130px; right: 16px;
155
+ background: rgba(10,14,26,0.85);
156
+ border: 1px solid rgba(125,211,252,0.2);
157
+ border-radius: 8px;
158
+ padding: 10px 14px;
159
+ width: 260px;
160
+ max-height: 240px;
161
+ overflow-y: auto;
162
+ pointer-events: none;
163
+ font-size: 10px;
164
+ }
165
+ #step-log h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
166
+ .log-entry {
167
+ display: flex;
168
+ align-items: flex-start;
169
+ gap: 6px;
170
+ margin-bottom: 6px;
171
+ padding-bottom: 6px;
172
+ border-bottom: 1px solid rgba(255,255,255,0.05);
173
+ }
174
+ .log-entry:last-child { border-bottom: none; }
175
+ .log-step { color: #475569; min-width: 28px; }
176
+ .log-action { font-weight: 600; }
177
+ .log-reward { margin-left: auto; font-weight: 700; }
178
+ .reward-pos { color: #4ade80; }
179
+ .reward-neg { color: #f87171; }
180
+ .reward-zero { color: #94a3b8; }
181
+ /* Tooltip */
182
+ #tooltip {
183
+ position: absolute;
184
+ background: rgba(10,14,26,0.95);
185
+ border: 1px solid rgba(125,211,252,0.4);
186
+ border-radius: 6px;
187
+ padding: 8px 12px;
188
+ font-size: 11px;
189
+ pointer-events: none;
190
+ opacity: 0;
191
+ transition: opacity 0.15s;
192
+ max-width: 200px;
193
+ z-index: 20;
194
+ }
195
+ #tooltip h4 { color: #7dd3fc; margin-bottom: 4px; }
196
+ /* Score ring */
197
+ #score-ring {
198
+ position: absolute;
199
+ bottom: 130px; left: 16px;
200
+ pointer-events: none;
201
+ }
202
+ #score-ring svg text { font-family: 'Segoe UI', sans-serif; }
203
+ /* Loader */
204
+ #loader {
205
+ position: absolute;
206
+ top: 50%; left: 50%;
207
+ transform: translate(-50%, -50%);
208
+ color: #7dd3fc;
209
+ font-size: 14px;
210
+ text-align: center;
211
+ }
212
+ .loader-spinner {
213
+ width: 40px; height: 40px;
214
+ border: 3px solid rgba(125,211,252,0.2);
215
+ border-top-color: #7dd3fc;
216
+ border-radius: 50%;
217
+ animation: spin 0.8s linear infinite;
218
+ margin: 0 auto 12px;
219
+ }
220
+ @keyframes spin { to { transform: rotate(360deg); } }
221
+ </style>
222
+ </head>
223
+ <body>
224
+
225
+ <!-- Hidden data injection point -->
226
+ <div id="viz-data" style="display:none"></div>
227
+
228
+ <div id="canvas-container">
229
+ <canvas id="three-canvas"></canvas>
230
+ </div>
231
+
232
+ <div id="loader">
233
+ <div class="loader-spinner"></div>
234
+ <p>Initializing 3D Visualizer...</p>
235
+ </div>
236
+
237
+ <div id="ui-overlay">
238
+ <!-- Header -->
239
+ <div id="header">
240
+ <h1>🔍 Agent Trajectory Visualizer — 3D</h1>
241
+ <p>Files = nodes · Dependencies = edges · Agent path = animated beam</p>
242
+ </div>
243
+
244
+ <!-- Info panel -->
245
+ <div id="info-panel">
246
+ <h3>EPISODE STATS</h3>
247
+ <div class="info-row"><span>Task</span><span class="info-value" id="stat-task">—</span></div>
248
+ <div class="info-row"><span>Variant</span><span class="info-value" id="stat-variant">—</span></div>
249
+ <div class="info-row"><span>Steps</span><span class="info-value" id="stat-steps">—</span></div>
250
+ <div class="info-row"><span>Score</span><span class="info-value" id="stat-score">—</span></div>
251
+ <div class="info-row"><span>Strategy</span><span class="info-value" id="stat-strategy">—</span></div>
252
+ <div class="info-row"><span>Failure</span><span class="info-value" id="stat-failure">—</span></div>
253
+ </div>
254
+
255
+ <!-- Legend -->
256
+ <div id="legend">
257
+ <h3>LEGEND</h3>
258
+ <div class="legend-item">
259
+ <div class="legend-dot" style="background:#f97316"></div><span>Source file</span>
260
+ </div>
261
+ <div class="legend-item">
262
+ <div class="legend-dot" style="background:#3b82f6"></div><span>Test file</span>
263
+ </div>
264
+ <div class="legend-item">
265
+ <div class="legend-dot" style="background:#a855f7"></div><span>Spec / Docs</span>
266
+ </div>
267
+ <div class="legend-item">
268
+ <div class="legend-dot" style="background:#22c55e"></div><span>Visited</span>
269
+ </div>
270
+ <div class="legend-item">
271
+ <div class="legend-dot" style="background:#ef4444"></div><span>Modified / Bug</span>
272
+ </div>
273
+ <div class="legend-item">
274
+ <div class="legend-dot" style="background:#facc15; width:20px; height:4px; border-radius:2px;"></div><span>Agent path</span>
275
+ </div>
276
+ </div>
277
+
278
+ <!-- Score ring -->
279
+ <div id="score-ring">
280
+ <svg width="80" height="80" viewBox="0 0 80 80">
281
+ <circle cx="40" cy="40" r="34" fill="none"
282
+ stroke="rgba(125,211,252,0.15)" stroke-width="6"/>
283
+ <circle id="score-arc" cx="40" cy="40" r="34" fill="none"
284
+ stroke="#7dd3fc" stroke-width="6"
285
+ stroke-dasharray="0 214"
286
+ stroke-linecap="round"
287
+ transform="rotate(-90 40 40)"
288
+ style="transition: stroke-dasharray 1s ease;"/>
289
+ <text id="score-text" x="40" y="45" text-anchor="middle"
290
+ fill="#e0e6f0" font-size="14" font-weight="700">0.0</text>
291
+ </svg>
292
+ </div>
293
+
294
+ <!-- Step log -->
295
+ <div id="step-log">
296
+ <h3>STEP LOG</h3>
297
+ <div id="log-entries"></div>
298
+ </div>
299
+
300
+ <!-- Tooltip -->
301
+ <div id="tooltip">
302
+ <h4 id="tooltip-title">File</h4>
303
+ <div id="tooltip-body"></div>
304
+ </div>
305
+
306
+ <!-- Timeline -->
307
+ <div id="timeline-panel">
308
+ <div class="tl-header">
309
+ <h3>TIMELINE REPLAY</h3>
310
+ <span id="step-label">Step 0 / 0</span>
311
+ </div>
312
+ <input type="range" id="timeline-slider" min="0" max="0" value="0"
313
+ oninput="onSliderChange(this.value)">
314
+ <div id="step-actions">
315
+ <button class="tl-btn" onclick="stepBack()">◀ Back</button>
316
+ <button class="tl-btn" id="play-btn" onclick="togglePlay()">▶ Play</button>
317
+ <button class="tl-btn" onclick="stepForward()">Forward ▶</button>
318
+ <button class="tl-btn" onclick="resetView()">↺ Reset</button>
319
+ <button class="tl-btn" id="orbit-btn" onclick="toggleOrbit()">🔄 Orbit</button>
320
+ </div>
321
+ </div>
322
+ </div>
323
+
324
+ <!-- Three.js from CDN -->
325
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
326
+ <script>
327
+ // ── Sample data (replaced by real data from backend) ───────────────────────
328
+ const DEFAULT_DATA = {
329
+ task: "task1",
330
+ variant_id: "variant_1",
331
+ final_score: 0.714,
332
+ strategy: "TARGETED_DEBUGGING",
333
+ failure_type: "CORRECT",
334
+ files: [
335
+ { name: "tests/test_formatter.py", type: "test" },
336
+ { name: "src/formatter.py", type: "src", is_bug_file: true },
337
+ { name: "src/utils.py", type: "src" }
338
+ ],
339
+ dependencies: [
340
+ { from: "tests/test_formatter.py", to: "src/formatter.py" },
341
+ { from: "src/formatter.py", to: "src/utils.py" }
342
+ ],
343
+ steps: [
344
+ { step: 1, action: "read_file", path: "tests/test_formatter.py", reward: 0.0 },
345
+ { step: 2, action: "read_file", path: "src/formatter.py", reward: 0.05 },
346
+ { step: 3, action: "search_code", path: null, reward: 0.0 },
347
+ { step: 4, action: "run_tests", path: "tests/test_formatter.py", reward: 0.0 },
348
+ { step: 5, action: "submit", path: null, reward: 0.694 }
349
+ ]
350
+ };
351
+
352
// ── Load data from injection point or use default ────────────────────────────
// Reads episode JSON injected by the backend into the hidden #viz-data div.
// Falls back to the bundled DEFAULT_DATA sample when the div is missing,
// empty, or contains malformed JSON (the parse error is deliberately ignored).
function loadVizData() {
  const el = document.getElementById('viz-data');
  if (el && el.textContent.trim()) {
    try { return JSON.parse(el.textContent); } catch(e) {} // malformed → fall through to default
  }
  return DEFAULT_DATA;
}
360
+
361
+ // ── Three.js setup ───────────────────────────────────────────────────────────
362
+ const canvas = document.getElementById('three-canvas');
363
+ const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: true });
364
+ renderer.setSize(window.innerWidth, window.innerHeight);
365
+ renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
366
+ renderer.setClearColor(0x0a0e1a, 1);
367
+
368
+ const scene = new THREE.Scene();
369
+ const fov = 60;
370
+ const camera = new THREE.PerspectiveCamera(fov, window.innerWidth / window.innerHeight, 0.1, 1000);
371
+ camera.position.set(0, 8, 22);
372
+ camera.lookAt(0, 0, 0);
373
+
374
+ // Ambient + directional light
375
+ scene.add(new THREE.AmbientLight(0x1a2040, 1));
376
+ const dirLight = new THREE.DirectionalLight(0x7dd3fc, 0.6);
377
+ dirLight.position.set(5, 10, 5);
378
+ scene.add(dirLight);
379
+
380
+ // Grid
381
+ const grid = new THREE.GridHelper(40, 20, 0x1e293b, 0x1e293b);
382
+ grid.position.y = -3;
383
+ scene.add(grid);
384
+
385
+ // Stars
386
+ const starGeo = new THREE.BufferGeometry();
387
+ const starCount = 800;
388
+ const starPositions = new Float32Array(starCount * 3);
389
+ for (let i = 0; i < starCount * 3; i++) starPositions[i] = (Math.random() - 0.5) * 200;
390
+ starGeo.setAttribute('position', new THREE.BufferAttribute(starPositions, 3));
391
+ const starMat = new THREE.PointsMaterial({ color: 0x334155, size: 0.3 });
392
+ scene.add(new THREE.Points(starGeo, starMat));
393
+
394
+ // ── Color palette ─────────────────────────────────────────────────────────────
395
+ const COLORS = {
396
+ src: 0xf97316,
397
+ test: 0x3b82f6,
398
+ spec: 0xa855f7,
399
+ visited: 0x22c55e,
400
+ modified: 0xef4444,
401
+ bug: 0xef4444,
402
+ edge: 0x334155,
403
+ path: 0xfacc15,
404
+ agent: 0xfbbf24,
405
+ };
406
+
407
+ // ── Orbit control (manual implementation) ────────────────────────────────────
408
+ let isOrbiting = false;
409
+ let orbitActive = false;
410
+ let mouse = { x: 0, y: 0, down: false, lastX: 0, lastY: 0 };
411
+ let spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
412
+
413
+ canvas.addEventListener('mousedown', e => { mouse.down = true; mouse.lastX = e.clientX; mouse.lastY = e.clientY; });
414
+ canvas.addEventListener('mouseup', () => { mouse.down = false; });
415
+ canvas.addEventListener('mousemove', e => {
416
+ if (!mouse.down) {
417
+ // Hover for tooltip
418
+ checkHover(e.clientX, e.clientY);
419
+ return;
420
+ }
421
+ const dx = e.clientX - mouse.lastX;
422
+ const dy = e.clientY - mouse.lastY;
423
+ spherical.theta -= dx * 0.005;
424
+ spherical.phi = Math.max(0.1, Math.min(Math.PI / 2, spherical.phi - dy * 0.005));
425
+ mouse.lastX = e.clientX;
426
+ mouse.lastY = e.clientY;
427
+ });
428
+ canvas.addEventListener('wheel', e => {
429
+ spherical.r = Math.max(8, Math.min(50, spherical.r + e.deltaY * 0.02));
430
+ });
431
+
432
// Reposition the camera each frame from the shared `spherical` coordinates
// (theta = azimuth, phi = polar angle, r = distance), always looking at the
// scene origin. Mutates the global `camera` and `spherical` objects.
function updateCamera() {
  // Auto-orbit mode: slowly advance the azimuth every frame.
  if (orbitActive) spherical.theta += 0.003;
  // Spherical → Cartesian conversion around (0, 0, 0).
  camera.position.x = spherical.r * Math.sin(spherical.phi) * Math.sin(spherical.theta);
  camera.position.y = spherical.r * Math.cos(spherical.phi);
  camera.position.z = spherical.r * Math.sin(spherical.phi) * Math.cos(spherical.theta);
  camera.lookAt(0, 0, 0);
}
439
+
440
// ── Scene objects / playback state ───────────────────────────────────────────
const nodeObjects = {};   // file name → { mesh, position, file } (see buildScene)
const edgeObjects = [];   // dependency-edge Line objects currently in the scene
const pathObjects = [];   // trajectory-beam Line objects for the current step
let agentSphere = null;   // glowing marker that follows the agent
let agentTrail = null;    // NOTE(review): never assigned in this visible script — confirm
let currentStep = 0;      // timeline position (0 = before the first action)
let maxStep = 0;          // total number of recorded steps
let playing = false;      // auto-play toggle state
let playInterval = null;  // setInterval handle used by togglePlay
let vizData = null;       // last trajectory payload handed to buildScene
let nodePositions = {};   // file name → THREE.Vector3 layout position
452
+
453
// ── Build scene from data ─────────────────────────────────────────────────────
// Rebuilds all meshes from a trajectory payload: one sphere per file laid out
// on a circle in the XZ plane, dependency edges between them, and the agent
// marker. Also refreshes the stats panel and resets the timeline to step 0.
//
// data: { files, dependencies, steps, task, variant_id, strategy,
//         failure_type, final_score } — all fields optional.
function buildScene(data) {
  vizData = data;

  // Clear previous objects so repeated loads don't leak meshes or state.
  Object.values(nodeObjects).forEach(o => scene.remove(o.mesh));
  edgeObjects.forEach(e => scene.remove(e));
  pathObjects.forEach(p => scene.remove(p));
  if (agentSphere) scene.remove(agentSphere);
  Object.keys(nodeObjects).forEach(k => delete nodeObjects[k]);
  // FIX: these three were never reset between builds — edge/path arrays grew
  // unbounded (re-"removing" long-gone lines each rebuild) and stale
  // nodePositions entries could draw dangling edges for files no longer shown.
  edgeObjects.length = 0;
  pathObjects.length = 0;
  nodePositions = {};

  const files = data.files || [];
  const n = files.length;
  if (n === 0) return;

  // Arrange files in a circular layout on the XZ plane.
  files.forEach((file, i) => {
    const angle = (i / n) * Math.PI * 2;
    const radius = Math.max(4, n * 0.9); // spread the circle for larger repos
    const x = Math.cos(angle) * radius;
    const z = Math.sin(angle) * radius;
    const y = 0;

    nodePositions[file.name] = new THREE.Vector3(x, y, z);

    // Node sphere, colored by file role.
    const geo = new THREE.SphereGeometry(0.6, 16, 16);
    const color = new THREE.Color(
      file.is_bug_file ? COLORS.bug :
      file.type === 'test' ? COLORS.test :
      file.type === 'spec' ? COLORS.spec : COLORS.src
    );
    const mat = new THREE.MeshPhongMaterial({
      color,
      emissive: color.clone().multiplyScalar(0.3),
      shininess: 60,
      transparent: true,
      opacity: 0.9,
    });
    const mesh = new THREE.Mesh(geo, mat);
    mesh.position.set(x, y, z);
    mesh.userData = { file }; // picked up by the hover raycast
    scene.add(mesh);

    // Flat glow ring attached to each node sphere.
    const ringGeo = new THREE.RingGeometry(0.75, 0.85, 32);
    const ringMat = new THREE.MeshBasicMaterial({
      color,
      transparent: true,
      opacity: 0.25,
      side: THREE.DoubleSide,
    });
    const ring = new THREE.Mesh(ringGeo, ringMat);
    ring.rotation.x = Math.PI / 2;
    mesh.add(ring);

    nodeObjects[file.name] = { mesh, position: nodePositions[file.name], file };
  });

  // Dependency edges; skipped when either endpoint is unknown.
  (data.dependencies || []).forEach(dep => {
    const fromPos = nodePositions[dep.from];
    const toPos = nodePositions[dep.to];
    if (!fromPos || !toPos) return;

    const points = [fromPos.clone(), toPos.clone()];
    const geo = new THREE.BufferGeometry().setFromPoints(points);
    const mat = new THREE.LineBasicMaterial({
      color: COLORS.edge,
      transparent: true,
      opacity: 0.4,
    });
    const line = new THREE.Line(geo, mat);
    scene.add(line);
    edgeObjects.push(line);
  });

  // Agent marker sphere, starting above the origin.
  const agentGeo = new THREE.SphereGeometry(0.35, 16, 16);
  const agentMat = new THREE.MeshPhongMaterial({
    color: COLORS.agent,
    emissive: 0xfbbf24,
    emissiveIntensity: 0.8,
    shininess: 100,
  });
  agentSphere = new THREE.Mesh(agentGeo, agentMat);
  agentSphere.position.set(0, 3, 0);
  scene.add(agentSphere);

  // Stats panel.
  document.getElementById('stat-task').textContent = data.task || '—';
  document.getElementById('stat-variant').textContent = data.variant_id || '—';
  document.getElementById('stat-steps').textContent = (data.steps || []).length;
  document.getElementById('stat-strategy').textContent = data.strategy || '—';
  document.getElementById('stat-failure').textContent = data.failure_type || '—';
  updateScore(data.final_score || 0);
  updateStepLog(data.steps || [], -1);

  // Timeline reset.
  maxStep = (data.steps || []).length;
  const slider = document.getElementById('timeline-slider');
  slider.max = maxStep;
  slider.value = 0;
  currentStep = 0;
  updateStepLabel(0, maxStep);

  applyStep(0);
}
561
+
562
// ── Animation: go to a specific step ─────────────────────────────────────────
// Repaints the scene for timeline position `stepIndex` (0 = before the first
// action): dims every node, re-highlights files read (green) and written
// (red, wins over green) so far, redraws the trajectory beam, and parks the
// agent sphere over the file touched by the most recent step.
function applyStep(stepIndex) {
  if (!vizData) return;
  const steps = vizData.steps || [];
  const readSet = new Set();
  const writeSet = new Set();

  // Reset every node to its dimmed base appearance.
  for (const obj of Object.values(nodeObjects)) {
    const f = obj.file;
    const base = new THREE.Color(
      f.is_bug_file ? COLORS.bug :
      f.type === 'test' ? COLORS.test :
      f.type === 'spec' ? COLORS.spec : COLORS.src
    );
    obj.mesh.material.color.set(base);
    obj.mesh.material.emissive.set(base.clone().multiplyScalar(0.2));
    obj.mesh.material.opacity = 0.5;
    obj.mesh.scale.set(1, 1, 1);
  }

  // Drop the previous trajectory beam.
  for (const p of pathObjects) scene.remove(p);
  pathObjects.length = 0;

  // Walk steps [0, stepIndex), collecting beam waypoints and touched files.
  const waypoints = [];
  for (let i = 0; i < stepIndex; i++) {
    const st = steps[i];
    if (!st || !st.path || !nodeObjects[st.path]) continue;
    const pos = nodeObjects[st.path].position.clone();
    waypoints.push(pos.clone().add(new THREE.Vector3(0, 0.1, 0)));
    if (st.action === 'read_file') readSet.add(st.path);
    if (st.action === 'write_file') writeSet.add(st.path);
  }

  // Shared highlighter: recolor, brighten, and enlarge one node.
  const highlight = (name, colorHex, emissiveScale, scale) => {
    const obj = nodeObjects[name];
    if (!obj) return;
    obj.mesh.material.color.set(colorHex);
    obj.mesh.material.emissive.set(new THREE.Color(colorHex).multiplyScalar(emissiveScale));
    obj.mesh.material.opacity = 1.0;
    obj.mesh.scale.set(scale, scale, scale);
  };
  readSet.forEach(name => highlight(name, COLORS.visited, 0.4, 1.2));
  writeSet.forEach(name => highlight(name, COLORS.modified, 0.5, 1.4));

  // Trajectory beam through all waypoints accumulated so far.
  if (waypoints.length >= 2) {
    const beamGeo = new THREE.BufferGeometry().setFromPoints(waypoints);
    const beamMat = new THREE.LineBasicMaterial({
      color: COLORS.path,
      transparent: true,
      opacity: 0.85,
      linewidth: 2, // NOTE: most WebGL backends render lines 1px regardless
    });
    const beam = new THREE.Line(beamGeo, beamMat);
    scene.add(beam);
    pathObjects.push(beam);
  }

  // Agent sphere: hover above the latest step's file, or float centrally for
  // file-less actions (search/submit); rest position at step 0.
  if (stepIndex > 0 && stepIndex <= steps.length) {
    const latest = steps[stepIndex - 1];
    if (latest && latest.path && nodeObjects[latest.path]) {
      const target = nodeObjects[latest.path].position;
      agentSphere.position.set(target.x, target.y + 1.2, target.z);
    } else {
      agentSphere.position.set(0, 2.5, 0);
    }
  } else {
    agentSphere.position.set(0, 3.5, 0);
  }

  // Extra emphasis on the node touched by the current step.
  if (stepIndex > 0) {
    const latest = steps[stepIndex - 1];
    if (latest && latest.path && nodeObjects[latest.path]) {
      nodeObjects[latest.path].mesh.scale.set(1.6, 1.6, 1.6);
    }
  }

  updateStepLog(steps, stepIndex - 1);
  updateStepLabel(stepIndex, maxStep);

  // Timeline slider fill percentage (consumed by CSS via --pct).
  const slider = document.getElementById('timeline-slider');
  const pct = maxStep > 0 ? (stepIndex / maxStep * 100) : 0;
  slider.style.setProperty('--pct', pct + '%');
}
669
+
670
// ── Score ring ────────────────────────────────────────────────────────────────
// Fill the SVG score arc proportionally to `score` (clamped to [0, 1]) and
// color it green / amber / red by threshold. Text readouts are NOT clamped.
function updateScore(score) {
  const circumference = 2 * Math.PI * 34; // the SVG ring's radius is 34
  const clamped = Math.min(1, Math.max(0, score));
  const arcEl = document.getElementById('score-arc');
  arcEl.setAttribute('stroke-dasharray', `${circumference * clamped} ${circumference}`);
  document.getElementById('score-text').textContent = score.toFixed(2);
  document.getElementById('stat-score').textContent = score.toFixed(3);

  const tone = score >= 0.7 ? '#4ade80' : score >= 0.4 ? '#fbbf24' : '#f87171';
  arcEl.setAttribute('stroke', tone);
}
684
+
685
// ── Step log ──────────────────────────────────────────────────────────────────
// Render one row per step into #log-entries; the current step is emphasized,
// past steps dimmed, future steps faded. `currentIdx` is the 0-based index of
// the active step (-1 for "none").
//
// FIX: rows are now built with DOM APIs + textContent instead of interpolating
// into innerHTML, so file paths / action names coming from the trajectory JSON
// cannot inject markup into the page.
function updateStepLog(steps, currentIdx) {
  const container = document.getElementById('log-entries');
  container.innerHTML = '';

  const ACTION_EMOJI = {
    read_file: '📖',
    write_file: '✏️',
    run_tests: '🧪',
    search_code: '🔍',
    submit: '🏁',
  };

  steps.forEach((step, i) => {
    const active = i === currentIdx;
    const past = i < currentIdx;
    const entry = document.createElement('div');
    entry.className = 'log-entry';
    entry.style.opacity = past ? '0.6' : active ? '1' : '0.35';
    if (active) entry.style.background = 'rgba(125,211,252,0.08)';

    const reward = step.reward || 0;
    const rewardClass = reward > 0 ? 'reward-pos' : reward < 0 ? 'reward-neg' : 'reward-zero';
    const emoji = ACTION_EMOJI[step.action] || '•';
    // Show only the basename for file actions; the action name otherwise.
    const path = step.path ? step.path.split('/').pop() : step.action;

    const stepSpan = document.createElement('span');
    stepSpan.className = 'log-step';
    stepSpan.textContent = `S${step.step}`;

    const actionSpan = document.createElement('span');
    actionSpan.className = 'log-action';
    actionSpan.style.color = active ? '#7dd3fc' : '#94a3b8';
    actionSpan.textContent = `${emoji} ${path}`;

    const rewardSpan = document.createElement('span');
    rewardSpan.className = `log-reward ${rewardClass}`;
    rewardSpan.textContent = `${reward > 0 ? '+' : ''}${reward.toFixed(2)}`;

    entry.append(stepSpan, actionSpan, rewardSpan);
    container.appendChild(entry);
  });

  // Keep the active row visible.
  if (currentIdx >= 0) {
    const entries = container.children;
    if (entries[currentIdx]) {
      entries[currentIdx].scrollIntoView({ block: 'nearest' });
    }
  }
}
727
+
728
// ── Hover tooltip ─────────────────────────────────────────────────────────────
const raycaster = new THREE.Raycaster();
const mouseVec = new THREE.Vector2();
const tooltip = document.getElementById('tooltip');

// Raycast from the cursor into the node spheres; show and position the
// tooltip over the first hit, hide it when nothing is under the cursor.
// NOTE(review): the NDC conversion assumes the canvas fills the window — confirm.
function checkHover(mx, my) {
  mouseVec.set(
    (mx / window.innerWidth) * 2 - 1,
    -(my / window.innerHeight) * 2 + 1
  );
  raycaster.setFromCamera(mouseVec, camera);

  const meshes = Object.values(nodeObjects).map(o => o.mesh);
  const hits = raycaster.intersectObjects(meshes);

  if (!hits.length) {
    tooltip.style.opacity = '0';
    return;
  }
  const file = hits[0].object.userData.file;
  if (file) {
    tooltip.style.opacity = '1';
    tooltip.style.left = (mx + 14) + 'px';
    tooltip.style.top = (my - 14) + 'px';
    document.getElementById('tooltip-title').textContent = file.name;
    document.getElementById('tooltip-body').innerHTML = `
      Type: ${file.type}<br>
      ${file.is_bug_file ? '⚠️ Bug location' : ''}
    `;
  }
}
757
+
758
// ── Timeline controls ─────────────────────────────────────────────────────────
// Slider handler: jump the visualization straight to the chosen step.
function onSliderChange(val) {
  currentStep = parseInt(val);
  applyStep(currentStep);
}
763
+
764
// Advance the timeline one step; no-op when already at the end.
function stepForward() {
  if (currentStep >= maxStep) return;
  currentStep += 1;
  document.getElementById('timeline-slider').value = currentStep;
  applyStep(currentStep);
}
771
+
772
// Rewind the timeline one step; no-op when already at the start.
function stepBack() {
  if (currentStep <= 0) return;
  currentStep -= 1;
  document.getElementById('timeline-slider').value = currentStep;
  applyStep(currentStep);
}
779
+
780
// Toggle auto-play: advances one step every 900 ms until the last step,
// then flips back to the paused state.
function togglePlay() {
  playing = !playing;
  const btn = document.getElementById('play-btn');
  btn.textContent = playing ? '⏸ Pause' : '▶ Play';

  // Defensive: never allow two intervals to run at once.
  if (playInterval) {
    clearInterval(playInterval);
    playInterval = null;
  }

  if (!playing) return;

  if (currentStep >= maxStep) {
    // FIX: restarting from the end previously reset only the counter — the
    // slider and scene stayed on the last step until the first 900 ms tick.
    currentStep = 0;
    document.getElementById('timeline-slider').value = 0;
    applyStep(0);
  }
  playInterval = setInterval(() => {
    if (currentStep >= maxStep) {
      playing = false;
      btn.textContent = '▶ Play';
      clearInterval(playInterval);
      playInterval = null;
      return;
    }
    stepForward();
  }, 900);
}
799
+
800
// Toggle the slow automatic camera orbit; keeps the button label and its
// `active` CSS class in sync with the new state.
function toggleOrbit() {
  orbitActive = !orbitActive;
  const btn = document.getElementById('orbit-btn');
  btn.textContent = orbitActive ? '⏹ Stop' : '🔄 Orbit';
  btn.classList.toggle('active', orbitActive);
}
806
+
807
// Restore the default camera orbit and rewind the timeline to step 0.
// (Does not stop auto-orbit or playback if they are running.)
function resetView() {
  spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
  currentStep = 0;
  document.getElementById('timeline-slider').value = 0;
  applyStep(0);
}
813
+
814
// Refresh the "Step X / Y" caption under the timeline.
function updateStepLabel(step, max) {
  document.getElementById('step-label').textContent = 'Step ' + step + ' / ' + max;
}
817
+
818
// ── Animation loop ────────────────────────────────────────────────────────────
let frame = 0; // monotonically increasing frame counter driving the idle motion

// Per-frame render: orbit the camera, pulse/spin the agent marker, and give
// every file node a gentle phase-offset vertical bob.
function animate() {
  requestAnimationFrame(animate);
  frame += 1;

  updateCamera();

  if (agentSphere) {
    const pulse = 1 + Math.sin(frame * 0.08) * 0.15; // breathing effect
    agentSphere.scale.set(pulse, pulse, pulse);
    agentSphere.rotation.y += 0.03;
  }

  Object.values(nodeObjects).forEach((obj, i) => {
    obj.mesh.position.y = obj.position.y + Math.sin(frame * 0.02 + i) * 0.05;
  });

  renderer.render(scene, camera);
}
840
+
841
// ── Window resize ─────────────────────────────────────────────────────────────
// Keep the projection matrix and the drawing buffer in sync with the viewport.
window.addEventListener('resize', () => {
  camera.aspect = window.innerWidth / window.innerHeight;
  camera.updateProjectionMatrix();
  renderer.setSize(window.innerWidth, window.innerHeight);
});
847
+
848
// ── Public API for Gradio integration ────────────────────────────────────────
// Accepts a trajectory payload as an object or a JSON string and rebuilds the
// scene from it. Malformed JSON is logged to the console, never thrown.
window.loadTrajectoryData = function (jsonData) {
  try {
    const payload = typeof jsonData === 'string' ? JSON.parse(jsonData) : jsonData;
    buildScene(payload);
  } catch (err) {
    console.error('Failed to load trajectory data:', err);
  }
};
857
+
858
// ── Init ─────────────────────────────────────────────────────────────────────
// On DOM ready: load the embedded trajectory payload, build the scene, hide
// the loader overlay, and start the render loop.
document.addEventListener('DOMContentLoaded', () => {
  buildScene(loadVizData());
  document.getElementById('loader').style.display = 'none';
  animate();
});
865
+ </script>
866
+ </body>
867
+ </html>