Chirag0123 commited on
Commit
0b0338d
ยท
1 Parent(s): dfbd16e

v4 Research Modules & Pre-submission tweaks

Browse files
app.py CHANGED
@@ -1,16 +1,21 @@
1
  #!/usr/bin/env python3
2
  """
3
- app.py โ€” Gradio UI v3.0 โ€” Full Platform Entry Point
4
-
5
- Tabs:
6
- ๐ŸŽฎ Interactive โ€” manual step-by-step control
7
- ๐Ÿค– Run Agent โ€” built-in deterministic agent demo
8
- ๐Ÿ“Š Evaluation โ€” 6-dimension evaluation report
9
- ๐Ÿง  Intelligence โ€” failure classification, strategy, advanced metrics
10
- ๐Ÿ” Self-Improve โ€” improvement plan after failure
11
- โš–๏ธ Compare Agents โ€” side-by-side multi-agent comparison
12
- ๐ŸŒ 3D Visualizer โ€” Three.js trajectory visualization
13
- ๐Ÿ“– API โ€” REST API reference
 
 
 
 
 
14
  """
15
  import os
16
  import json
@@ -22,6 +27,12 @@ from server.strategy_detector import StrategyDetector
22
  from server.advanced_metrics import AdvancedMetricsEngine
23
  from server.self_improvement import SelfImprovementEngine
24
  from server.multi_agent import MultiAgentComparison
 
 
 
 
 
 
25
 
26
  # โ”€โ”€ Global instances โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
27
  env = CodebaseNavEnvironment()
@@ -30,36 +41,56 @@ strategy_det = StrategyDetector()
30
  adv_metrics_engine = AdvancedMetricsEngine()
31
  improvement_engine = SelfImprovementEngine()
32
  multi_agent_engine = MultiAgentComparison()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  # โ”€โ”€ Tab 1: Interactive โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
36
 
37
- def reset_environment(task: str):
38
  try:
39
  result = env.reset(task=task)
40
  obs = result.observation
41
  tree = "\n".join(f" ๐Ÿ“„ {f}" for f in obs.repo_tree)
42
- failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None listed"
43
  fi = result.info.get("fault_injection", {})
44
  faults = ""
45
  if fi.get("faults_injected"):
46
- faults = f"\n\nโš ๏ธ Fault Injection ({fi.get('difficulty_multiplier', 1.0):.1f}x):\n"
47
  faults += "\n".join(f" โ€ข {f}" for f in fi["faults_injected"][:5])
48
-
49
  status = (
50
- f"โœ… Episode Started โ€” {task} (variant: {result.info.get('variant_id', '?')})\n"
51
- f"โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\n"
52
- f"Steps: {obs.steps_remaining} remaining\n\n"
53
  f"๐Ÿ“ Files:\n{tree}\n\n"
54
  f"๐Ÿ”ด Failing Tests: {failing}\n\n"
55
- f"๐Ÿ“‹ Task: {obs.task_description}{faults}"
56
  )
57
  return status, "", "0", "0.000"
58
  except Exception as e:
59
  return f"โŒ Error: {e}", "", "0", "0.000"
60
 
61
 
62
- def take_step(action_type: str, path: str, query: str, content: str):
63
  if env.done:
64
  return "โŒ Episode done. Reset first.", "", "", ""
65
  try:
@@ -71,83 +102,88 @@ def take_step(action_type: str, path: str, query: str, content: str):
71
  )
72
  result = env.step(action)
73
  obs = result.observation
74
- result_text = obs.last_action_result or "No output"
75
- error = f"\nโš ๏ธ {obs.last_action_error}" if obs.last_action_error else ""
76
  flags = result.info.get("security_flags", [])
77
- sec = f"\n๐Ÿ”’ Security: {flags}" if flags else ""
78
-
79
  status = (
80
- f"Step {result.info['steps_taken']} | "
81
- f"Reward: {result.reward:+.3f} | "
82
- f"Steps left: {obs.steps_remaining}{error}{sec}"
83
  )
84
  if result.done:
85
  status += f"\n\n๐Ÿ DONE โ€” Score: {result.info['final_score']:.3f}"
86
-
87
- return (
88
- status,
89
- result_text[:3000],
90
- str(result.info["steps_taken"]),
91
- f"{result.info.get('cumulative_reward', 0):.3f}",
92
- )
93
  except Exception as e:
94
- return f"โŒ Error: {e}", "", "", ""
95
 
96
 
97
  # โ”€โ”€ Tab 2: Run Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
98
 
99
- def run_builtin_agent(task: str):
100
  try:
101
  result = env.reset(task=task)
102
  obs = result.observation
103
- log = [
104
- f"๐Ÿš€ {task} (variant: {result.info.get('variant_id')})",
105
- f" Files: {obs.repo_tree}",
106
- f" Failing: {obs.failing_tests}",
107
- ]
108
  tree = obs.repo_tree
 
109
  test_files = sorted([f for f in tree if f.startswith("tests/")])
110
  src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
111
  spec_files = sorted([f for f in tree if f.endswith(".md")])
112
  steps = 0
113
 
114
  if task == "task3" and spec_files:
115
- for sf in spec_files:
116
  if env.done: break
117
  r = env.step(RepoAction(action_type="read_file", path=sf))
118
- steps += 1
119
- log.append(f" Step {steps}: read_file {sf} โ†’ {r.reward:+.3f}")
120
 
121
  for tf in test_files:
122
  if env.done: break
123
  r = env.step(RepoAction(action_type="read_file", path=tf))
124
- steps += 1
125
- log.append(f" Step {steps}: read_file {tf} โ†’ {r.reward:+.3f}")
 
 
 
126
 
127
  for sf in src_files:
128
- if env.done or steps >= 12: break
129
  r = env.step(RepoAction(action_type="read_file", path=sf))
130
- steps += 1
131
- log.append(f" Step {steps}: read_file {sf} โ†’ {r.reward:+.3f}")
132
 
133
  if not env.done and test_files:
134
  r = env.step(RepoAction(action_type="run_tests", path=test_files[0]))
135
- steps += 1
136
- log.append(f" Step {steps}: run_tests โ†’ {r.reward:+.3f}")
137
 
138
  if not env.done:
139
  r = env.step(RepoAction(action_type="submit"))
140
- steps += 1
141
- log.append(f" Step {steps}: submit โ†’ {r.reward:+.3f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
- log += [
144
- f"\n๐Ÿ Score: {env.final_score:.3f}",
145
- f" Steps: {steps}",
146
- f" Reward: {env.cumulative_reward:.3f}",
147
- ]
148
  return "\n".join(log)
149
  except Exception as e:
150
- return f"โŒ Error: {e}"
151
 
152
 
153
  # โ”€โ”€ Tab 3: Evaluation โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
@@ -156,55 +192,42 @@ def get_evaluation():
156
  try:
157
  ev = env.get_evaluation()
158
  if "error" in ev:
159
- return "No evaluation available. Run an episode first."
160
- lines = [
161
- f"๐ŸŽฏ Composite Score: {ev['composite_score']:.3f}",
162
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
163
- ]
164
  for name, dim in ev.get("dimensions", {}).items():
165
- bar = "โ–ˆ" * int(dim["score"] * 20) + "โ–‘" * (20 - int(dim["score"] * 20))
166
  lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
167
- for e in dim.get("evidence", [])[:2]:
168
  lines.append(f" โ†’ {e}")
169
  if ev.get("strengths"):
170
  lines += ["\n๐Ÿ’ช Strengths:"] + [f" โœ… {s}" for s in ev["strengths"]]
171
  if ev.get("failure_analysis"):
172
  lines += ["\nโš ๏ธ Failures:"] + [f" โŒ {f}" for f in ev["failure_analysis"]]
173
  if ev.get("recommendations"):
174
- lines += ["\n๐Ÿ’ก Recommendations:"] + [f" โ†’ {r}" for r in ev["recommendations"]]
175
  return "\n".join(lines)
176
  except Exception as e:
177
  return f"Error: {e}"
178
 
179
-
180
  def get_metrics():
181
  try:
182
  return json.dumps(env.get_metrics(), indent=2, default=str)
183
  except Exception as e:
184
  return f"Error: {e}"
185
 
186
-
187
  def get_trajectory():
188
  try:
189
  t = env.get_trajectory()
190
- if not t:
191
- return "No trajectory. Run an episode first."
192
  lines = [
193
- f"Episode: {t.get('episode_id')}",
194
- f"Task: {t.get('task')} | Variant: {t.get('variant_id')}",
195
- f"Score: {t.get('final_score', 0):.3f} | Duration: {t.get('duration_seconds', '?')}s",
196
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
197
  ]
198
- emojis = {"read_file": "๐Ÿ“–", "write_file": "โœ๏ธ", "run_tests": "๐Ÿงช",
199
- "search_code": "๐Ÿ”", "submit": "๐Ÿ"}
200
- for step in t.get("steps", []):
201
- em = emojis.get(step["action_type"], "โ€ข")
202
  p = step.get("action_path") or step.get("action_query") or ""
203
  err = " โŒ" if step.get("error") else ""
204
- lines.append(
205
- f" {em} {step['step_number']:2d}: {step['action_type']:12s} {p:30s} "
206
- f"reward={step['reward']:+.3f} ({step['duration_ms']:.0f}ms){err}"
207
- )
208
  return "\n".join(lines)
209
  except Exception as e:
210
  return f"Error: {e}"
@@ -214,294 +237,310 @@ def get_trajectory():
214
 
215
  def get_failure_classification():
216
  try:
217
- traj = env.get_trajectory()
218
- if not traj:
219
- return "No trajectory. Run an episode first."
220
- meta = env.variant.meta if env.variant else {}
221
- report = failure_clf.classify(
222
- episode_id=traj.get("episode_id", ""),
223
- task=env.current_task or "unknown",
224
- trajectory_steps=traj.get("steps", []),
225
- variant_meta=meta,
226
- files_read=list(env.files_read),
227
- files_written=list(env.files_written),
228
- final_score=env.final_score,
229
- security_violations=env.security_violations,
230
- )
231
- d = report.to_dict()
232
  lines = [
233
  f"{'โœ… SUCCESS' if d['success'] else 'โŒ FAILURE'}",
234
- f"Primary Failure Type: {d['primary_failure']}",
235
- f"Failures Detected: {d['failure_count']}",
236
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
237
  ]
238
- for f in d.get("failures", []):
239
- lines += [
240
- f"\n[{f['severity'].upper()}] {f['type']} @ Step {f['step']}",
241
- f" Evidence: {f['evidence']}",
242
- f" Root Cause: {f['root_cause']}",
243
- f" Fix: {f['remediation']}",
244
- ]
245
  if d.get("failure_summary"):
246
  lines += ["\n๐Ÿ“‹ Summary:", f" {d['failure_summary']}"]
247
  if d.get("retry_hint"):
248
- lines += ["\n๐Ÿ” Retry Hint:", f" {d['retry_hint']}"]
249
  return "\n".join(lines)
250
- except Exception as e:
251
- return f"Error: {e}"
252
 
253
 
254
  def get_strategy_detection():
255
  try:
256
- traj = env.get_trajectory()
257
- if not traj:
258
- return "No trajectory. Run an episode first."
259
- meta = env.variant.meta if env.variant else {}
260
- report = strategy_det.detect(
261
- trajectory_steps=traj.get("steps", []),
262
- task=env.current_task or "unknown",
263
- variant_meta=meta,
264
- files_read=list(env.files_read),
265
- final_score=env.final_score,
266
- )
267
- d = report.to_dict()
268
- score_bar = "โ–ˆ" * int(d["score"] * 20) + "โ–‘" * (20 - int(d["score"] * 20))
269
  lines = [
270
- f"๐Ÿงญ Strategy: {d['strategy']}",
271
- f" Score: [{score_bar}] {d['score']:.3f}",
272
- f" Confidence: {d['confidence']:.0%}",
273
- f"\n๐Ÿ“– {d['strategy_description']}",
274
- f"\n๐Ÿ“Š Exploration Ratio: {d['exploration_ratio']:.2f} "
275
- f"({'explore-heavy' if d['exploration_ratio'] > 0.6 else 'exploit-heavy' if d['exploration_ratio'] < 0.4 else 'balanced'})",
276
- f" Strategy Pivots: {d['pivot_count']}",
277
  ]
278
- if d.get("sub_patterns"):
279
- lines += ["\n๐Ÿ”– Sub-patterns:"] + [f" โ€ข {p}" for p in d["sub_patterns"]]
280
- if d.get("evidence"):
281
- lines += ["\n๐Ÿ” Evidence:"] + [f" โ†’ {e}" for e in d["evidence"]]
282
  return "\n".join(lines)
283
- except Exception as e:
284
- return f"Error: {e}"
285
 
286
 
287
  def get_advanced_metrics():
288
  try:
289
- traj = env.get_trajectory()
290
- if not traj:
291
- return "No trajectory. Run an episode first."
292
- meta = env.variant.meta if env.variant else {}
293
- report = adv_metrics_engine.compute(
294
- trajectory_steps=traj.get("steps", []),
295
- variant_meta=meta,
296
- final_score=env.final_score,
297
- files_read=list(env.files_read),
298
- files_written=list(env.files_written),
299
- )
300
- d = report.to_dict()
301
-
302
- def bar(v):
303
- return "โ–ˆ" * int(v * 20) + "โ–‘" * (20 - int(v * 20))
304
-
305
- lines = [
306
- "โšก ADVANCED METRICS",
307
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
308
  f" Reasoning Efficiency [{bar(d['reasoning_efficiency'])}] {d['reasoning_efficiency']:.3f}",
309
  f" Reliability Index [{bar(d['reliability_index'])}] {d['reliability_index']:.3f}",
310
  f" Exploration Ratio [{bar(d['exploration_ratio'])}] {d['exploration_ratio']:.3f}",
311
  f" Decision Entropy [{bar(d['decision_entropy'])}] {d['decision_entropy']:.3f}",
312
  f" Wasteful Ratio [{bar(d['wasteful_ratio'])}] {d['wasteful_ratio']:.3f}",
313
- f" Pivot Rate {d['pivot_rate']:.2f} per 10 steps",
314
- f" Consistency [{bar(d['consistency_score'])}] {d['consistency_score']:.3f} ({d['runs_analyzed']} runs)",
315
- "\n๐Ÿ“Š Action Distribution:",
316
  ]
317
- for action, count in d.get("action_distribution", {}).items():
318
- lines.append(f" {action:15s}: {count}")
319
- if d.get("useful_actions"):
320
- lines += ["\nโœ… Useful Actions:"] + [f" โ€ข {a}" for a in d["useful_actions"]]
321
- if d.get("wasteful_actions"):
322
- lines += ["\nโš ๏ธ Wasteful Actions:"] + [f" โ€ข {a}" for a in d["wasteful_actions"]]
323
- lines += ["\n๐Ÿ”’ Reliability Breakdown:"]
324
- for k, v in d.get("reliability_breakdown", {}).items():
325
- lines.append(f" {k:15s}: {v:.3f}")
326
  return "\n".join(lines)
327
- except Exception as e:
328
- return f"Error: {e}"
329
 
330
 
331
  # โ”€โ”€ Tab 5: Self-Improve โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
332
 
333
  def get_improvement_plan():
334
  try:
335
- traj = env.get_trajectory()
336
- if not traj:
337
- return "No trajectory. Run an episode first."
338
- meta = env.variant.meta if env.variant else {}
339
- steps = traj.get("steps", [])
340
-
341
- fail_report = failure_clf.classify(
342
- episode_id=traj.get("episode_id", ""),
343
- task=env.current_task or "unknown",
344
- trajectory_steps=steps,
345
- variant_meta=meta,
346
- files_read=list(env.files_read),
347
- files_written=list(env.files_written),
348
- final_score=env.final_score,
349
- security_violations=env.security_violations,
350
- )
351
  plan = improvement_engine.generate_improvement_plan(
352
- episode_id=traj.get("episode_id", ""),
353
- task=env.current_task or "unknown",
354
- failure_type=fail_report.primary_failure,
355
- failure_evidence=[f.evidence for f in fail_report.failures],
356
- original_score=env.final_score,
357
- trajectory_steps=steps,
358
- files_read=list(env.files_read),
359
- files_written=list(env.files_written),
360
  )
361
  d = plan.to_dict()
362
  lines = [
363
- f"๐Ÿ” SELF-IMPROVEMENT PLAN",
364
- f"โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
365
- f"Original Score: {d['original_score']:.3f}",
366
- f"Failure Type: {d['failure_type']}",
367
- f"\nโŒ What Went Wrong:\n {d['what_went_wrong']}",
368
- f"\n๐ŸŽฏ Improved Strategy:\n {d['improved_strategy']}",
369
- f"\n๐Ÿ“‹ Step-by-Step Plan:",
370
- ]
371
- for step in d.get("step_by_step_plan", []):
372
- lines.append(f" {step}")
373
- if d.get("specific_errors"):
374
- lines += ["\n๐Ÿ”Ž Specific Errors:"] + [f" โ€ข {e}" for e in d["specific_errors"][:5]]
375
- lines += [
376
- "\n๐Ÿ’‰ System Prompt Injection (for next LLM run):",
377
- "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€",
378
- d.get("system_prompt_addon", "No injection needed."),
379
- ]
380
  return "\n".join(lines)
381
- except Exception as e:
382
- return f"Error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
 
385
  # โ”€โ”€ Tab 6: Compare Agents โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
386
 
387
- def run_comparison(task: str, selected_agents: list):
388
  try:
389
- agents = selected_agents if selected_agents else None
390
  report = multi_agent_engine.compare(env, task=task, agents=agents)
391
  d = report.to_dict()
392
-
393
  lines = [
394
  f"โš–๏ธ MULTI-AGENT COMPARISON โ€” {task} (variant: {d.get('variant_id')})",
395
- f"๐Ÿ† Winner: {d.get('winner')} (score: {d.get('winner_score', 0):.3f})",
396
- "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”",
397
- f"{'Rank':<6} {'Agent':<16} {'Score':<8} {'Steps':<8} {'Strategy':<22} {'Failure':<22} {'Reliability':<12}",
398
- "โ”€" * 100,
399
  ]
400
- for row in d.get("summary_table", []):
401
- lines.append(
402
- f"#{row['rank']:<5} {row['agent']:<16} {row['score']:<8.3f} "
403
- f"{row['steps']:<8} {row['strategy']:<22} {row['failure']:<22} {row['reliability']:<12.3f}"
404
- )
405
- lines.append("โ”" * 100)
406
-
407
  if d.get("insights"):
408
  lines += ["\n๐Ÿ’ก Insights:"] + [f" โ†’ {i}" for i in d["insights"]]
409
-
410
- lines.append("\n๐Ÿ“Š Per-Agent Action Sequences:")
411
- for run in d.get("detailed_runs", []):
412
- seq = " โ†’ ".join(run.get("action_sequence", []))
413
  lines.append(f" {run['agent_name']:16s}: {seq}")
414
-
415
  return "\n".join(lines)
416
- except Exception as e:
417
- return f"โŒ Error: {e}"
418
 
419
 
420
  # โ”€โ”€ Tab 7: 3D Visualizer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
421
 
422
- def get_viz_html():
423
- """Generate the 3D visualizer HTML with current trajectory data injected."""
424
- # Load the static HTML template
425
- static_path = os.path.join(os.path.dirname(__file__), "static", "viz3d.html")
426
- if not os.path.exists(static_path):
427
- return "<p style='color:red'>viz3d.html not found in static/</p>"
 
 
 
 
 
 
 
428
 
429
- with open(static_path, "r") as f:
430
- html = f.read()
431
 
432
- # Get viz data from current environment
433
- traj = env.get_trajectory()
434
- if traj:
435
- meta = env.variant.meta if env.variant else {}
436
- bug_files = set(meta.get("bug_files", []))
437
- files = []
438
- if env.variant:
439
- for fname in env.variant.get_tree():
440
- ftype = "test" if fname.startswith("tests/") else \
441
- "spec" if fname.endswith(".md") else "src"
442
- files.append({
443
- "name": fname,
444
- "type": ftype,
445
- "is_bug_file": fname in bug_files,
446
- "visited": fname in env.files_read,
447
- "modified": fname in env.files_written,
448
- })
449
-
450
- test_files = [f["name"] for f in files if f["type"] == "test"]
451
- src_files = [f["name"] for f in files if f["type"] == "src"]
452
- deps = []
453
- for tf in test_files:
454
- for sf in src_files:
455
- deps.append({"from": tf, "to": sf})
456
-
457
- steps_data = []
458
- for step in traj.get("steps", []):
459
- steps_data.append({
460
- "step": step.get("step_number", 0),
461
- "action": step.get("action_type", ""),
462
- "path": step.get("action_path"),
463
- "reward": step.get("reward", 0.0),
464
- "error": step.get("error"),
465
- "pass_rate": step.get("test_pass_rate"),
466
- })
467
-
468
- strategy_report = strategy_det.detect(
469
- traj.get("steps", []),
470
- env.current_task or "unknown",
471
- meta,
472
- list(env.files_read),
473
- env.final_score,
474
- ) if traj.get("steps") else None
475
-
476
- viz_data = {
477
- "task": env.current_task or "unknown",
478
- "variant_id": traj.get("variant_id", "unknown"),
479
- "final_score": env.final_score,
480
- "strategy": strategy_report.strategy if strategy_report else "UNKNOWN",
481
- "failure_type": "โ€”",
482
- "files": files,
483
- "dependencies": deps,
484
- "steps": steps_data,
485
- }
486
- data_json = json.dumps(viz_data)
487
- else:
488
- data_json = ""
489
-
490
- # Inject data into HTML
491
- html = html.replace(
492
- '<div id="viz-data" style="display:none"></div>',
493
- f'<div id="viz-data" style="display:none">{data_json}</div>'
494
- )
495
- return html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
 
498
- # โ”€โ”€ Build Gradio UI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
499
 
500
- with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v3") as demo:
501
  gr.Markdown(
502
- "# ๐Ÿ” Codebase Navigation & Repair โ€” OpenEnv v3\n"
503
- "**The most advanced debugging + evaluation platform for AI coding agents.** "
504
- "Navigate codebases ยท Fix bugs ยท Evaluate process ยท Visualize in 3D."
505
  )
506
 
507
  with gr.Tabs():
@@ -510,19 +549,12 @@ with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v3") as demo:
510
  with gr.TabItem("๐ŸŽฎ Interactive"):
511
  with gr.Row():
512
  with gr.Column(scale=1):
513
- task_select = gr.Dropdown(
514
- ["task1", "task2", "task3"], value="task1",
515
- label="Task",
516
- info="task1=bugs, task2=cross-module, task3=feature impl"
517
- )
518
  reset_btn = gr.Button("๐Ÿ”„ Reset Environment", variant="primary")
519
  gr.Markdown("### Action")
520
- act_type = gr.Dropdown(
521
- ["read_file", "write_file", "run_tests", "search_code", "submit"],
522
- value="read_file", label="Action Type",
523
- )
524
  act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
525
- act_query = gr.Textbox(label="Query (search_code)", placeholder="validate_token")
526
  act_content = gr.Textbox(label="Content (write_file)", lines=4)
527
  step_btn = gr.Button("โ–ถ๏ธ Execute Step", variant="secondary")
528
  with gr.Column(scale=2):
@@ -531,16 +563,16 @@ with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v3") as demo:
531
  with gr.Row():
532
  steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
533
  reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
534
- reset_btn.click(reset_environment, [task_select], [status_box, result_box, steps_box, reward_box])
535
  step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
536
 
537
  # โ”€โ”€ Tab 2: Run Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
538
  with gr.TabItem("๐Ÿค– Run Agent"):
539
- gr.Markdown("### Built-in Demonstration Agent\nRuns deterministic readโ†’submit strategy.")
540
- agent_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
541
  run_btn = gr.Button("๐Ÿš€ Run Agent", variant="primary")
542
- agent_output = gr.Textbox(label="Agent Log", lines=20, interactive=False)
543
- run_btn.click(run_builtin_agent, [agent_task], [agent_output])
544
 
545
  # โ”€โ”€ Tab 3: Evaluation โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
546
  with gr.TabItem("๐Ÿ“Š Evaluation"):
@@ -553,107 +585,164 @@ with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v3") as demo:
553
  metrics_btn.click(get_metrics, outputs=[eval_out])
554
  traj_btn.click(get_trajectory, outputs=[eval_out])
555
 
556
- # โ”€โ”€ Tab 4: ๐Ÿง  Intelligence โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
557
  with gr.TabItem("๐Ÿง  Intelligence"):
558
- gr.Markdown(
559
- "### Deep Agent Intelligence Analysis\n"
560
- "Failure classification, strategy detection, and advanced behavioral metrics."
561
- )
562
  with gr.Row():
563
- classify_btn = gr.Button("๐Ÿ”ฌ Classify Failure", variant="primary")
564
- strategy_btn = gr.Button("๐Ÿงญ Detect Strategy", variant="secondary")
565
  adv_btn = gr.Button("โšก Advanced Metrics", variant="secondary")
566
  intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
567
- classify_btn.click(get_failure_classification, outputs=[intel_out])
568
- strategy_btn.click(get_strategy_detection, outputs=[intel_out])
569
  adv_btn.click(get_advanced_metrics, outputs=[intel_out])
570
 
571
- # โ”€โ”€ Tab 5: ๐Ÿ” Self-Improve โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
572
  with gr.TabItem("๐Ÿ” Self-Improve"):
573
- gr.Markdown(
574
- "### Self-Improvement Loop\n"
575
- "After a failure, this generates an actionable improvement plan and a "
576
- "system prompt injection for the agent's next attempt."
577
- )
578
- improve_btn = gr.Button("๐Ÿ” Generate Improvement Plan", variant="primary")
579
- improve_out = gr.Textbox(label="Improvement Plan", lines=32, interactive=False)
580
  improve_btn.click(get_improvement_plan, outputs=[improve_out])
 
581
 
582
- # โ”€โ”€ Tab 6: โš–๏ธ Compare โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
583
  with gr.TabItem("โš–๏ธ Compare Agents"):
584
- gr.Markdown(
585
- "### Multi-Agent Strategy Comparison\n"
586
- "Runs 4 built-in agent strategies on the same task to compare "
587
- "efficiency, strategy, and reliability side-by-side."
588
- )
589
  with gr.Row():
590
- comp_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
591
  comp_agents = gr.CheckboxGroup(
592
- ["test-first", "search-first", "minimal", "exhaustive"],
593
- value=["test-first", "search-first", "minimal", "exhaustive"],
594
- label="Agents to Compare",
595
  )
596
  comp_btn = gr.Button("โš–๏ธ Run Comparison", variant="primary")
597
- comp_out = gr.Textbox(label="Comparison Report", lines=30, interactive=False)
598
  comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
599
 
600
- # โ”€โ”€ Tab 7: ๐ŸŒ 3D Visualizer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€๏ฟฝ๏ฟฝ๏ฟฝโ”€โ”€โ”€
601
  with gr.TabItem("๐ŸŒ 3D Visualizer"):
602
  gr.Markdown(
603
  "### Agent Trajectory 3D Visualization\n"
604
- "Files = 3D nodes ยท Dependencies = edges ยท Agent path = animated beam ยท "
605
- "Timeline = scrubbable replay. **Run an episode first, then refresh.**"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  )
607
- refresh_viz_btn = gr.Button("๐Ÿ”„ Load Trajectory into Visualizer", variant="primary")
608
- viz_html = gr.HTML(value="<p style='color:#64748b;text-align:center;padding:40px'>Click 'Load Trajectory' after running an episode.</p>")
609
- refresh_viz_btn.click(get_viz_html, outputs=[viz_html])
610
 
611
- # โ”€โ”€ Tab 8: API โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
  with gr.TabItem("๐Ÿ“– API"):
613
  gr.Markdown("""
614
- ### REST API โ€” v3.0 Endpoints
615
 
616
- #### Core (OpenEnv-compliant)
617
- | Endpoint | Method | Description |
618
- |----------|--------|-------------|
619
- | `/reset?task=task1` | POST | Start new episode |
620
- | `/step` | POST | Take action |
621
- | `/state` | GET | Current state |
622
- | `/health` | GET | Health check |
623
 
624
  #### Evaluation
625
- | Endpoint | Method | Description |
626
- |----------|--------|-------------|
627
- | `/trajectory` | GET | Full action log |
628
- | `/evaluate` | GET | 6-dimension scores |
629
- | `/metrics` | GET | Memory + security stats |
630
- | `/fault-config` | POST | Enable fault injection |
631
-
632
- #### Intelligence (NEW in v3)
633
- | Endpoint | Method | Description |
634
- |----------|--------|-------------|
635
- | `/classify` | GET | Typed failure classification |
636
- | `/strategy` | GET | Behavioral strategy detection |
637
- | `/advanced-metrics` | GET | Entropy, reliability, consistency |
638
- | `/improvement-plan` | GET | Self-improvement feedback |
639
- | `/compare-agents` | POST | Multi-agent comparison |
640
- | `/viz-data` | GET | 3D visualization data |
641
 
642
  ```bash
643
  BASE="http://localhost:7860"
 
644
  curl -X POST "$BASE/reset?task=task1"
645
- curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"src/auth.py"}'
646
  curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
 
 
647
  curl "$BASE/classify"
648
- curl "$BASE/strategy"
649
- curl "$BASE/advanced-metrics"
650
- curl "$BASE/improvement-plan"
651
- curl -X POST "$BASE/compare-agents?task=task1"
 
 
 
652
  ```
653
  """)
654
 
655
 
656
- # โ”€โ”€ Mount FastAPI under same process โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
657
  from server.app import app as fastapi_app
658
  gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
659
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ app.py โ€” Gradio UI v4.0 โ€” Full Research Platform
4
+
5
+ 13 tabs:
6
+ ๐ŸŽฎ Interactive โ€” manual control
7
+ ๐Ÿค– Run Agent โ€” deterministic demo agent
8
+ ๐Ÿ“Š Evaluation โ€” 6-dimension process evaluation
9
+ ๐Ÿง  Intelligence โ€” failure, strategy, advanced metrics
10
+ ๐Ÿ” Self-Improve โ€” improvement plan with prompt injection
11
+ โš–๏ธ Compare Agents โ€” multi-agent strategy comparison
12
+ ๐ŸŒ 3D Visualizer โ€” Three.js trajectory viz (FIXED: iframe)
13
+ ๐Ÿงช Causal Probe โ€” causal reasoning vs guessing
14
+ ๐ŸŽญ Counterfactual โ€” brittleness / robustness testing
15
+ ๐Ÿ“ Confidence โ€” calibration: overconfident vs underconfident
16
+ ๐Ÿ† Benchmark โ€” automated leaderboard
17
+ ๐Ÿ“ˆ Analytics โ€” unified research-grade report
18
+ ๐Ÿ“– API โ€” REST reference
19
  """
20
  import os
21
  import json
 
27
  from server.advanced_metrics import AdvancedMetricsEngine
28
  from server.self_improvement import SelfImprovementEngine
29
  from server.multi_agent import MultiAgentComparison
30
+ from server.causal_probe import CausalProbe
31
+ from server.counterfactual_engine import CounterfactualEngine
32
+ from server.confidence_calibrator import ConfidenceCalibrator
33
+ from server.benchmark_runner import BenchmarkRunner
34
+ from server.analytics_engine import AnalyticsEngine
35
+ from server.memory_bank import get_global_memory
36
 
37
  # โ”€โ”€ Global instances โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
38
  env = CodebaseNavEnvironment()
 
41
  adv_metrics_engine = AdvancedMetricsEngine()
42
  improvement_engine = SelfImprovementEngine()
43
  multi_agent_engine = MultiAgentComparison()
44
+ causal_probe = CausalProbe()
45
+ counterfactual_engine = CounterfactualEngine()
46
+ confidence_calibrator = ConfidenceCalibrator()
47
+ benchmark_runner = BenchmarkRunner()
48
+ analytics_engine = AnalyticsEngine()
49
+ memory_bank = get_global_memory()
50
+
51
+
52
+ # โ”€โ”€ Helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
53
+
54
+ def _get_traj_and_meta():
55
+ traj = env.get_trajectory()
56
+ if not traj:
57
+ return None, None, None, None
58
+ meta = env.variant.meta if env.variant else {}
59
+ steps = traj.get("steps", [])
60
+ return traj, meta, steps, traj.get("episode_id", "")
61
+
62
+
63
+ def _no_traj():
64
+ return "โš ๏ธ No trajectory. Run an episode first (Interactive or Run Agent tab)."
65
 
66
 
67
  # โ”€โ”€ Tab 1: Interactive โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
68
 
69
def reset_environment(task):
    """Start a fresh episode for *task* and render the opening status panel.

    Returns (status_text, result_text, steps, cumulative_reward) matching the
    four Gradio output boxes of the Interactive tab. Errors are rendered into
    the status box instead of propagating into the UI.
    """
    try:
        result = env.reset(task=task)
        obs = result.observation
        file_tree = "\n".join(f" ๐Ÿ“„ {name}" for name in obs.repo_tree)
        if obs.failing_tests:
            failing = ", ".join(obs.failing_tests)
        else:
            failing = "None"
        fault_info = result.info.get("fault_injection", {})
        faults = ""
        injected = fault_info.get("faults_injected")
        if injected:
            # Show at most 5 injected faults plus the difficulty multiplier.
            faults = f"\n\nโš ๏ธ Fault Injection ({fault_info.get('difficulty_multiplier',1):.1f}ร—):\n"
            faults += "\n".join(f" โ€ข {fault}" for fault in injected[:5])
        status = (
            f"โœ… Episode started โ€” {task} (variant: {result.info.get('variant_id','?')})\n"
            f"โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\n"
            f"Steps remaining: {obs.steps_remaining}\n\n"
            f"๐Ÿ“ Files:\n{file_tree}\n\n"
            f"๐Ÿ”ด Failing Tests: {failing}\n\n"
            f"๐Ÿ“‹ {obs.task_description}{faults}"
        )
        return status, "", "0", "0.000"
    except Exception as e:
        return f"โŒ Error: {e}", "", "0", "0.000"
91
 
92
 
93
+ def take_step(action_type, path, query, content):
94
  if env.done:
95
  return "โŒ Episode done. Reset first.", "", "", ""
96
  try:
 
102
  )
103
  result = env.step(action)
104
  obs = result.observation
105
+ result_text = obs.last_action_result or ""
106
+ err = f"\nโš ๏ธ {obs.last_action_error}" if obs.last_action_error else ""
107
  flags = result.info.get("security_flags", [])
108
+ sec = f"\n๐Ÿ”’ {flags}" if flags else ""
 
109
  status = (
110
+ f"Step {result.info['steps_taken']} | Reward: {result.reward:+.3f} | "
111
+ f"Left: {obs.steps_remaining}{err}{sec}"
 
112
  )
113
  if result.done:
114
  status += f"\n\n๐Ÿ DONE โ€” Score: {result.info['final_score']:.3f}"
115
+ return status, result_text[:3000], str(result.info["steps_taken"]), f"{result.info.get('cumulative_reward',0):.3f}"
 
 
 
 
 
 
116
  except Exception as e:
117
+ return f"โŒ {e}", "", "", ""
118
 
119
 
120
  # โ”€โ”€ Tab 2: Run Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
121
 
122
def run_builtin_agent(task):
    """Run the deterministic built-in demo agent on *task* and return its log.

    Strategy: read spec files (task3 only), then tests, search for definitions,
    read source up to a step budget, run the first test file, and submit.
    Afterwards the episode is classified and stored in the memory bank.
    """
    try:
        result = env.reset(task=task)
        obs = result.observation
        tree = obs.repo_tree
        log = [f"๐Ÿš€ {task} (variant: {result.info.get('variant_id')})", f" Files: {tree}"]
        steps = 0

        def _act(action, label):
            # Execute one environment step and append a numbered log line.
            nonlocal steps
            r = env.step(action)
            steps += 1
            log.append(f" Step {steps}: {label} โ†’ {r.reward:+.3f}")

        test_files = sorted(f for f in tree if f.startswith("tests/"))
        src_files = sorted(f for f in tree if f.startswith("src/") and f.endswith(".py"))
        spec_files = sorted(f for f in tree if f.endswith(".md"))

        if task == "task3" and spec_files:
            for spec in spec_files[:2]:
                if env.done:
                    break
                _act(RepoAction(action_type="read_file", path=spec), f"read_file {spec}")

        for test_path in test_files:
            if env.done:
                break
            _act(RepoAction(action_type="read_file", path=test_path), f"read_file {test_path}")

        if not env.done:
            _act(RepoAction(action_type="search_code", query="def "), "search_code")

        for src_path in src_files:
            # Cap exploration so there is always budget left to test and submit.
            if env.done or steps >= 14:
                break
            _act(RepoAction(action_type="read_file", path=src_path), f"read_file {src_path}")

        if not env.done and test_files:
            _act(RepoAction(action_type="run_tests", path=test_files[0]), "run_tests")

        if not env.done:
            _act(RepoAction(action_type="submit"), "submit")

        log += ["", f"๐Ÿ Score: {env.final_score:.3f} | Steps: {steps} | Reward: {env.cumulative_reward:.3f}"]

        # Persist this run as a lesson so later episodes can retrieve it.
        traj = env.get_trajectory()
        if traj:
            meta = env.variant.meta if env.variant else {}
            fail_r = failure_clf.classify(
                traj.get("episode_id",""), task, traj.get("steps",[]), meta,
                list(env.files_read), list(env.files_written), env.final_score
            )
            strat_r = strategy_det.detect(traj.get("steps",[]), task, meta, list(env.files_read), env.final_score)
            imp_plan = improvement_engine.generate_improvement_plan(
                traj.get("episode_id",""), task, fail_r.primary_failure,
                [], env.final_score, traj.get("steps",[]),
                list(env.files_read), list(env.files_written)
            )
            memory_bank.store(
                traj.get("episode_id",""), task, fail_r.primary_failure,
                fail_r.failure_summary or "", env.final_score,
                strat_r.strategy, traj.get("steps",[]), imp_plan.to_dict()
            )
            log.append(f"๐Ÿ’พ Stored lesson in memory bank ({memory_bank.get_stats()['total_entries']} total)")

        return "\n".join(log)
    except Exception as e:
        return f"โŒ {e}"
187
 
188
 
189
  # โ”€โ”€ Tab 3: Evaluation โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
192
  try:
193
  ev = env.get_evaluation()
194
  if "error" in ev:
195
+ return _no_traj()
196
+ lines = [f"๐ŸŽฏ Composite Score: {ev['composite_score']:.3f}", "โ”"*50]
 
 
 
197
  for name, dim in ev.get("dimensions", {}).items():
198
+ bar = "โ–ˆ" * int(dim["score"]*20) + "โ–‘" * (20-int(dim["score"]*20))
199
  lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
200
+ for e in dim.get("evidence",[])[:2]:
201
  lines.append(f" โ†’ {e}")
202
  if ev.get("strengths"):
203
  lines += ["\n๐Ÿ’ช Strengths:"] + [f" โœ… {s}" for s in ev["strengths"]]
204
  if ev.get("failure_analysis"):
205
  lines += ["\nโš ๏ธ Failures:"] + [f" โŒ {f}" for f in ev["failure_analysis"]]
206
  if ev.get("recommendations"):
207
+ lines += ["\n๐Ÿ’ก Recs:"] + [f" โ†’ {r}" for r in ev["recommendations"]]
208
  return "\n".join(lines)
209
  except Exception as e:
210
  return f"Error: {e}"
211
 
 
212
def get_metrics():
    """Return the environment's current metrics as pretty-printed JSON text."""
    try:
        metrics = env.get_metrics()
        return json.dumps(metrics, indent=2, default=str)
    except Exception as e:
        return f"Error: {e}"
217
 
 
218
def get_trajectory():
    """Render the recorded episode trajectory as a step-by-step text log."""
    try:
        traj = env.get_trajectory()
        if not traj:
            return _no_traj()
        lines = [
            f"Episode: {traj.get('episode_id')}",
            f"Task: {traj.get('task')} | Variant: {traj.get('variant_id')}",
            f"Score: {traj.get('final_score',0):.3f} | Duration: {traj.get('duration_seconds','?')}s",
            "โ”"*60,
        ]
        icons = {"read_file":"๐Ÿ“–","write_file":"โœ๏ธ","run_tests":"๐Ÿงช","search_code":"๐Ÿ”","submit":"๐Ÿ"}
        for step in traj.get("steps",[]):
            target = step.get("action_path") or step.get("action_query") or ""
            err_mark = " โŒ" if step.get("error") else ""
            icon = icons.get(step['action_type'],'โ€ข')
            lines.append(f" {icon} {step['step_number']:2d}: {step['action_type']:12s} {target:25s} reward={step['reward']:+.3f}{err_mark}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 
237
 
238
def get_failure_classification():
    """Classify the last episode's failures and render a readable report."""
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = failure_clf.classify(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = result.to_dict()
        header = "โœ… SUCCESS" if data["success"] else "โŒ FAILURE"
        lines = [
            f"{header}",
            f"Primary: {data['primary_failure']} | Count: {data['failure_count']}",
            "โ”"*50,
        ]
        for failure in data.get("failures", []):
            lines.append(f"\n[{failure['severity'].upper()}] {failure['type']} @ step {failure['step']}")
            lines.append(f" Evidence: {failure['evidence']}")
            lines.append(f" Fix: {failure['remediation']}")
        if data.get("failure_summary"):
            lines.extend(["\n๐Ÿ“‹ Summary:", f" {data['failure_summary']}"])
        if data.get("retry_hint"):
            lines.append(f"\n๐Ÿ” Retry hint: {data['retry_hint']}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 
258
 
259
 
260
def get_strategy_detection():
    """Detect and render the navigation strategy used in the last episode."""
    try:
        traj, meta, steps, _ = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = strategy_det.detect(steps, env.current_task or "?", meta, list(env.files_read), env.final_score)
        data = result.to_dict()
        filled = int(data["score"] * 20)
        gauge = "โ–ˆ" * filled + "โ–‘" * (20 - filled)
        lines = [
            f"๐Ÿงญ Strategy: {data['strategy']}",
            f" [{gauge}] {data['score']:.3f} (confidence: {data['confidence']:.0%})",
            f"\n{data['strategy_description']}",
            f"\nExploration: {data['exploration_ratio']:.2f} | Pivots: {data['pivot_count']}",
        ]
        if data.get("sub_patterns"):
            lines.append("\nSub-patterns:")
            lines.extend(f" โ€ข {p}" for p in data["sub_patterns"])
        if data.get("evidence"):
            lines.append("\nEvidence:")
            lines.extend(f" โ†’ {item}" for item in data["evidence"])
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 
276
 
277
 
278
def get_advanced_metrics():
    """Compute and render the advanced reasoning metrics for the last episode."""
    try:
        traj, meta, steps, _ = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = adv_metrics_engine.compute(steps, meta, env.final_score, list(env.files_read), list(env.files_written))
        data = result.to_dict()

        def gauge(value):
            # 20-cell text progress bar for a 0..1 metric value.
            filled = int(value * 20)
            return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        lines = [
            "โšก ADVANCED METRICS",
            "โ”"*50,
            f" Reasoning Efficiency [{gauge(data['reasoning_efficiency'])}] {data['reasoning_efficiency']:.3f}",
            f" Reliability Index [{gauge(data['reliability_index'])}] {data['reliability_index']:.3f}",
            f" Exploration Ratio [{gauge(data['exploration_ratio'])}] {data['exploration_ratio']:.3f}",
            f" Decision Entropy [{gauge(data['decision_entropy'])}] {data['decision_entropy']:.3f}",
            f" Wasteful Ratio [{gauge(data['wasteful_ratio'])}] {data['wasteful_ratio']:.3f}",
            f" Pivot Rate {data['pivot_rate']:.2f}/10 steps | Consistency {data['consistency_score']:.3f} ({data['runs_analyzed']} runs)",
        ]
        if data.get("action_distribution"):
            lines.append("\nAction Distribution:")
            lines.extend(f" {action:14s}: {count}" for action, count in data["action_distribution"].items())
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
 
297
 
298
 
299
  # โ”€โ”€ Tab 5: Self-Improve โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
300
 
301
def get_improvement_plan():
    """Generate and render a self-improvement plan from the last episode.

    Re-classifies the trajectory to obtain the primary failure, then asks the
    improvement engine for a retry strategy and a system-prompt injection.
    """
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        classification = failure_clf.classify(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        plan = improvement_engine.generate_improvement_plan(
            ep_id, env.current_task or "?", classification.primary_failure,
            [f.evidence for f in classification.failures], env.final_score,
            steps, list(env.files_read), list(env.files_written)
        )
        data = plan.to_dict()
        lines = [
            "๐Ÿ” SELF-IMPROVEMENT PLAN",
            "โ”"*50,
            f"Original Score: {data['original_score']:.3f} | Failure: {data['failure_type']}",
            f"\nโŒ What went wrong:\n {data['what_went_wrong']}",
            f"\n๐ŸŽฏ Improved strategy:\n {data['improved_strategy']}",
            "\n๐Ÿ“‹ Step-by-step plan:",
        ]
        lines.extend(f" {step_text}" for step_text in data.get("step_by_step_plan", []))
        lines.extend(["\n๐Ÿ’‰ System Prompt Injection:", "โ”€"*40, data.get("system_prompt_addon", "None")])
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
323
+
324
+
325
def get_memory_context_for_task(task):
    """Render stored lessons from the memory bank relevant to *task*.

    Shows up to three prior lessons plus the system-prompt injection the
    memory bank would hand to an agent attempting this task.
    """
    try:
        ctx = memory_bank.retrieve(task=task, max_lessons=3)
        stats = memory_bank.get_stats()
        lines = [
            f"๐Ÿง  MEMORY BANK โ€” {stats['total_entries']} total lessons",
            f"Retrieving for: {task}", "โ”"*50,
        ]
        if not ctx.relevant_lessons:
            lines.append("No lessons stored yet. Run episodes to build memory.")
        else:
            lines.append(f"\n๐Ÿ“š {ctx.lessons_count} relevant lesson(s):\n")
            for i, e in enumerate(ctx.relevant_lessons, 1):
                lines += [
                    f"[Lesson {i}] Task: {e.task} | Failure: {e.failure_type} | Score: {e.score:.2f}",
                    f" Title: {e.lesson_title}",
                    f" Lesson: {e.lesson_body[:120]}",
                    # Hint row collapses to an empty string (blank line) when absent.
                    f" Hint: {e.lesson_hint[:120]}" if e.lesson_hint else "",
                    "",
                ]
        lines += ["\n๐Ÿ’‰ System Prompt Injection:", "โ”€"*40, ctx.system_prompt_injection]
        # Fix: `"\n".join(l for l in lines)` was a no-op generator wrapper
        # (and shadowed `l`); join the list directly โ€” identical output.
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
348
 
349
 
350
  # โ”€โ”€ Tab 6: Compare Agents โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
351
 
352
def run_comparison(task, selected_agents):
    """Run the multi-agent comparison on *task* and render a ranked report.

    An empty agent selection falls back to the engine's default roster.
    """
    try:
        agents = selected_agents if selected_agents else None
        report = multi_agent_engine.compare(env, task=task, agents=agents)
        data = report.to_dict()
        lines = [
            f"โš–๏ธ MULTI-AGENT COMPARISON โ€” {task} (variant: {data.get('variant_id')})",
            f"๐Ÿ† Winner: {data.get('winner')} (score: {data.get('winner_score',0):.3f})",
            "โ”"*80,
            f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Steps':<7} {'Strategy':<22} {'Failure':<20} {'Reliability'}",
            "โ”€"*80,
        ]
        for row in data.get("summary_table", []):
            lines.append(
                f"#{row['rank']:<4} {row['agent']:<16} {row['score']:<8.3f} {row['steps']:<7} "
                f"{row['strategy']:<22} {row['failure']:<20} {row['reliability']:.3f}"
            )
        lines.append("โ”"*80)
        if data.get("insights"):
            lines.append("\n๐Ÿ’ก Insights:")
            lines.extend(f" โ†’ {insight}" for insight in data["insights"])
        lines.append("\n๐Ÿ“Š Action Sequences:")
        for run in data.get("detailed_runs", []):
            sequence = " โ†’ ".join(run.get("action_sequence", []))
            lines.append(f" {run['agent_name']:16s}: {sequence}")
        return "\n".join(lines)
    except Exception as e:
        return f"โŒ {e}"
 
374
 
375
 
376
  # โ”€โ”€ Tab 7: 3D Visualizer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
377
 
378
def get_viz_iframe():
    """Return an <iframe> embedding /static/viz3d.html โ€” fixes Three.js canvas rendering."""
    import time
    # Cache-busting timestamp so Gradio re-renders the frame on refresh.
    stamp = int(time.time())
    attrs = (
        'width="100%" height="640" frameborder="0" '
        'style="border-radius:10px;border:1px solid rgba(125,211,252,0.2);'
        'background:#0a0e1a;" '
        'allow="accelerometer; autoplay" loading="lazy"'
    )
    return f'<iframe src="/static/viz3d.html?t={stamp}" {attrs}></iframe>'
391
 
 
 
392
 
393
+ # โ”€โ”€ Tab 8: Causal Probe โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
394
+
395
def get_causal_probe():
    """Probe whether the agent understood the bug's causal chain or guessed."""
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = causal_probe.probe(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = result.to_dict()

        def gauge(value):
            # 20-cell text progress bar for a 0..1 score.
            filled = int(value * 20)
            return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        lines = [
            "๐Ÿงช CAUSAL REASONING PROBE",
            "โ”"*55,
            f"Understanding Level: {data['understanding_level']}",
            f"Causal Score: [{gauge(data['causal_score'])}] {data['causal_score']:.3f}",
            f"Chain Coverage: [{gauge(data['chain_coverage'])}] {data['chain_coverage']:.3f}",
            f"Chain Order Score: [{gauge(data['chain_order_score'])}] {data['chain_order_score']:.3f}",
            "\n๐Ÿ“ก Behavioral Signals:",
        ]
        for key, flag in data.get("behavioral_signals", {}).items():
            mark = "โœ…" if flag else "โŒ"
            lines.append(f" {mark} {key.replace('_',' ').title()}")
        if data.get("understanding_indicators"):
            lines.append("\nโœ… Understanding Indicators:")
            lines.extend(f" โ€ข {item}" for item in data["understanding_indicators"])
        if data.get("guessing_indicators"):
            lines.append("\nโŒ Guessing Indicators:")
            lines.extend(f" โ€ข {item}" for item in data["guessing_indicators"])
        diagnostics = data.get("diagnostics", {})
        if diagnostics.get("false_confidence_detected"):
            lines.append("\nโš ๏ธ FALSE CONFIDENCE DETECTED โ€” submitted without adequate exploration")
        if diagnostics.get("shortcut_learning_detected"):
            lines.append("โš ๏ธ SHORTCUT LEARNING DETECTED โ€” wrote without reading source")
        lines.append(f"\n๐Ÿ“ {data['explanation']}")
        if data.get("recommendations"):
            lines.append("\n๐Ÿ’ก Recommendations:")
            lines.extend(f" โ†’ {rec}" for rec in data["recommendations"])
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
429
+
430
+
431
+ # โ”€โ”€ Tab 9: Counterfactual โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
432
+
433
def get_counterfactual():
    """Render the counterfactual robustness report for the last episode."""
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = counterfactual_engine.analyze(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = result.to_dict()

        def gauge(value):
            # 20-cell text progress bar for a 0..1 score.
            filled = int(value * 20)
            return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        lines = [
            "๐ŸŽญ COUNTERFACTUAL ROBUSTNESS TEST",
            "โ”"*55,
            f"Brittleness Level: {data['brittleness_level']}",
            f"Robustness Score: [{gauge(data['robustness_score'])}] {data['robustness_score']:.3f}",
            f"Mutations Tested: {data['mutations_tested']}",
            f"Mutations Survived: {data['mutations_survived']} โœ… | Failed: {data['mutations_failed']} โŒ",
            "\n๐Ÿงฌ Mutation Results:",
        ]
        for mutation in data.get("mutations", []):
            mark = "โŒ" if mutation["would_break_agent"] else "โœ…"
            lines.append(f" {mark} [{mutation['type']}] {mutation['description'][:55]}")
            lines.append(f" {mutation['why'][:80]}")
        if data.get("surface_dependencies"):
            lines.append("\nโš ๏ธ Surface Dependencies:")
            lines.extend(f" โ€ข {dep}" for dep in data["surface_dependencies"])
        if data.get("deep_dependencies"):
            lines.append("\nโœ… Deep Dependencies:")
            lines.extend(f" โ€ข {dep}" for dep in data["deep_dependencies"])
        lines.append(f"\n๐Ÿ“ {data['explanation']}")
        if data.get("recommendations"):
            lines.append("\n๐Ÿ’ก Recommendations:")
            lines.extend(f" โ†’ {rec}" for rec in data["recommendations"])
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
463
+
464
+
465
+ # โ”€โ”€ Tab 10: Confidence Calibration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
466
+
467
def get_calibration():
    """Render the confidence-calibration report for the last episode."""
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()
        result = confidence_calibrator.calibrate(ep_id, env.current_task or "?", steps, env.final_score)
        data = result.to_dict()

        def gauge(value):
            # 20-cell text progress bar for a 0..1 score.
            filled = int(value * 20)
            return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        lines = [
            "๐Ÿ“ CONFIDENCE CALIBRATION REPORT",
            "โ”"*55,
            f"Calibration Profile: {data['profile']}",
            f"Calibration Score: [{gauge(data['calibration_score'])}] {data['calibration_score']:.3f}",
            f"Inferred Confidence: [{gauge(data['inferred_confidence'])}] {data['inferred_confidence']:.3f}",
            f"Actual Performance: [{gauge(data['actual_performance'])}] {data['actual_performance']:.3f}",
            f"Calibration Error: {data['expected_calibration_error']:.3f} (lower=better)",
            f"Conf-Acc Correlation: {data['confidence_accuracy_correlation']:.3f}",
            "\n๐Ÿ“Š Behavioral Signals:",
        ]
        signals = data.get("signals", {})
        lines.append(f" Commitment Speed: {signals.get('commitment_speed',0):.3f} (high=fast commit)")
        lines.append(f" Re-Exploration Rate: {signals.get('re_exploration_rate',0):.3f} (high=uncertain)")
        lines.append(f" Verification Rate: {signals.get('verification_rate',0):.3f} tests/write")
        lines.append(f" Submit Speed: {signals.get('submit_speed',0):.3f} (high=early submit)")
        lines.append(f"\n๐Ÿ“ {data['diagnosis']}")
        if data.get("recommendations"):
            lines.append("\n๐Ÿ’ก Recommendations:")
            lines.extend(f" โ†’ {rec}" for rec in data["recommendations"])
        if data.get("confidence_trajectory"):
            lines.append("\n๐Ÿ“ˆ Confidence Trajectory:")
            for snap in data["confidence_trajectory"][:8]:
                acc_str = f" | acc={snap['accuracy']:.2f}" if snap['accuracy'] is not None else ""
                lines.append(f" S{snap['step']}: {snap['action']:12s} conf={snap['confidence']:.2f}{acc_str}")
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
500
+
501
+
502
+ # โ”€โ”€ Tab 11: Benchmark โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
503
+
504
def run_benchmark(tasks_selected, agents_selected):
    """Run the automated benchmark over the selected tasks and agent strategies.

    Falls back to all three tasks when no task is ticked, and to the runner's
    default agent roster when no agent is ticked.
    """
    try:
        # `or` covers both None and the empty list an unticked CheckboxGroup
        # yields โ€” matches the idiom used by run_comparison.
        tasks = tasks_selected or ["task1", "task2", "task3"]
        agents = agents_selected or None
        report = benchmark_runner.run(env, tasks=tasks, agents=agents)
        return report.render_table()
    except Exception as e:
        return f"โŒ Benchmark error: {e}"
512
+
513
+
514
+ # โ”€โ”€ Tab 12: Analytics โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
515
+
516
def get_analytics():
    """Render the unified analytics report for the last episode as plain text."""
    try:
        if not env.get_trajectory():
            return _no_traj()
        return analytics_engine.analyze(env).render_text()
    except Exception as e:
        return f"Error: {e}"
524
+
525
def get_analytics_json():
    """Export the unified analytics report as pretty-printed JSON text."""
    try:
        if not env.get_trajectory():
            return _no_traj()
        report_dict = analytics_engine.analyze(env).to_dict()
        return json.dumps(report_dict, indent=2, default=str)
    except Exception as e:
        return f"Error: {e}"
533
 
534
 
535
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
536
+ # Gradio UI
537
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
538
 
539
+ with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v4") as demo:
540
  gr.Markdown(
541
+ "# ๐Ÿ” Codebase Navigation & Repair โ€” OpenEnv v4\n"
542
+ "**The first platform that scientifically measures, explains, and improves AI agent reasoning.** "
543
+ "Navigate ยท Fix ยท Evaluate Process ยท Probe Causality ยท Test Counterfactuals ยท Calibrate Confidence ยท Benchmark."
544
  )
545
 
546
  with gr.Tabs():
 
549
  with gr.TabItem("๐ŸŽฎ Interactive"):
550
  with gr.Row():
551
  with gr.Column(scale=1):
552
+ task_sel = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
 
 
 
 
553
  reset_btn = gr.Button("๐Ÿ”„ Reset Environment", variant="primary")
554
  gr.Markdown("### Action")
555
+ act_type = gr.Dropdown(["read_file","write_file","run_tests","search_code","submit"], value="read_file", label="Action Type")
 
 
 
556
  act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
557
+ act_query = gr.Textbox(label="Query", placeholder="validate_token")
558
  act_content = gr.Textbox(label="Content (write_file)", lines=4)
559
  step_btn = gr.Button("โ–ถ๏ธ Execute Step", variant="secondary")
560
  with gr.Column(scale=2):
 
563
  with gr.Row():
564
  steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
565
  reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
566
+ reset_btn.click(reset_environment, [task_sel], [status_box, result_box, steps_box, reward_box])
567
  step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
568
 
569
  # โ”€โ”€ Tab 2: Run Agent โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
570
  with gr.TabItem("๐Ÿค– Run Agent"):
571
+ gr.Markdown("### Built-in Demonstration Agent\nRuns test-first deterministic strategy + stores lesson in memory bank.")
572
+ agent_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
573
  run_btn = gr.Button("๐Ÿš€ Run Agent", variant="primary")
574
+ agent_out = gr.Textbox(label="Agent Log", lines=22, interactive=False)
575
+ run_btn.click(run_builtin_agent, [agent_task], [agent_out])
576
 
577
  # โ”€โ”€ Tab 3: Evaluation โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
578
  with gr.TabItem("๐Ÿ“Š Evaluation"):
 
585
  metrics_btn.click(get_metrics, outputs=[eval_out])
586
  traj_btn.click(get_trajectory, outputs=[eval_out])
587
 
588
+ # โ”€โ”€ Tab 4: Intelligence โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
589
  with gr.TabItem("๐Ÿง  Intelligence"):
590
+ gr.Markdown("### Deep Agent Intelligence Analysis")
 
 
 
591
  with gr.Row():
592
+ clf_btn = gr.Button("๐Ÿ”ฌ Classify Failure", variant="primary")
593
+ strat_btn = gr.Button("๐Ÿงญ Detect Strategy", variant="secondary")
594
  adv_btn = gr.Button("โšก Advanced Metrics", variant="secondary")
595
  intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
596
+ clf_btn.click(get_failure_classification, outputs=[intel_out])
597
+ strat_btn.click(get_strategy_detection, outputs=[intel_out])
598
  adv_btn.click(get_advanced_metrics, outputs=[intel_out])
599
 
600
+ # โ”€โ”€ Tab 5: Self-Improve โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
601
  with gr.TabItem("๐Ÿ” Self-Improve"):
602
+ gr.Markdown("### Self-Improvement Loop + Episodic Memory")
603
+ with gr.Row():
604
+ improve_btn = gr.Button("๐Ÿ” Improvement Plan", variant="primary")
605
+ mem_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task for Memory")
606
+ mem_btn = gr.Button("๐Ÿง  Retrieve Memory", variant="secondary")
607
+ improve_out = gr.Textbox(label="Output", lines=32, interactive=False)
 
608
  improve_btn.click(get_improvement_plan, outputs=[improve_out])
609
+ mem_btn.click(get_memory_context_for_task, [mem_task], [improve_out])
610
 
611
+ # โ”€โ”€ Tab 6: Compare Agents โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
612
  with gr.TabItem("โš–๏ธ Compare Agents"):
613
+ gr.Markdown("### Multi-Agent Strategy Comparison")
 
 
 
 
614
  with gr.Row():
615
+ comp_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
616
  comp_agents = gr.CheckboxGroup(
617
+ ["test-first","search-first","minimal","exhaustive"],
618
+ value=["test-first","search-first","minimal","exhaustive"],
619
+ label="Agents",
620
  )
621
  comp_btn = gr.Button("โš–๏ธ Run Comparison", variant="primary")
622
+ comp_out = gr.Textbox(label="Report", lines=30, interactive=False)
623
  comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
624
 
625
+ # โ”€โ”€ Tab 7: 3D Visualizer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
626
  with gr.TabItem("๐ŸŒ 3D Visualizer"):
627
  gr.Markdown(
628
  "### Agent Trajectory 3D Visualization\n"
629
+ "Files = glowing 3D spheres ยท Dependencies = edges ยท Agent = animated beam ยท **Run an episode first.**"
630
+ )
631
+ refresh_btn = gr.Button("๐Ÿ”„ Load / Refresh Visualizer", variant="primary")
632
+ viz_html = gr.HTML(
633
+ value='<div style="text-align:center;padding:60px;color:#475569;background:#0a0e1a;border-radius:10px">'
634
+ '<p style="font-size:24px">๐ŸŒ</p>'
635
+ '<p style="color:#7dd3fc;font-weight:700">Run an episode then click Load</p></div>'
636
+ )
637
+ refresh_btn.click(get_viz_iframe, outputs=[viz_html])
638
+
639
+ # โ”€โ”€ Tab 8: Causal Probe โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
640
+ with gr.TabItem("๐Ÿงช Causal Probe"):
641
+ gr.Markdown(
642
+ "### Causal Reasoning Evaluation\n"
643
+ "Did the agent truly understand WHY the bug exists, "
644
+ "or did it pattern-match and guess? "
645
+ "Measures chain coverage, order, and shortcut learning."
646
+ )
647
+ causal_btn = gr.Button("๐Ÿงช Run Causal Probe", variant="primary")
648
+ causal_out = gr.Textbox(label="Causal Reasoning Report", lines=32, interactive=False)
649
+ causal_btn.click(get_causal_probe, outputs=[causal_out])
650
+
651
+ # โ”€โ”€ Tab 9: Counterfactual โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
652
+ with gr.TabItem("๐ŸŽญ Counterfactual"):
653
+ gr.Markdown(
654
+ "### Counterfactual Robustness Testing\n"
655
+ "Applies 6 semantic-neutral mutations (filename rename, constant change, "
656
+ "dummy function, directory shift, docstring noise, import reorder) "
657
+ "and measures whether the agent's strategy survives."
658
  )
659
+ cf_btn = gr.Button("๐ŸŽญ Run Counterfactual Analysis", variant="primary")
660
+ cf_out = gr.Textbox(label="Robustness Report", lines=32, interactive=False)
661
+ cf_btn.click(get_counterfactual, outputs=[cf_out])
662
 
663
+ # โ”€โ”€ Tab 10: Confidence โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
664
+ with gr.TabItem("๐Ÿ“ Confidence"):
665
+ gr.Markdown(
666
+ "### Confidence Calibration Analysis\n"
667
+ "Infers agent confidence from behavioral proxies (commitment speed, "
668
+ "re-exploration rate, verification rate, submit timing) "
669
+ "and compares to actual performance. Detects overconfident and underconfident agents."
670
+ )
671
+ calib_btn = gr.Button("๐Ÿ“ Analyze Calibration", variant="primary")
672
+ calib_out = gr.Textbox(label="Calibration Report", lines=32, interactive=False)
673
+ calib_btn.click(get_calibration, outputs=[calib_out])
674
+
675
+ # โ”€โ”€ Tab 11: Benchmark โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
676
+ with gr.TabItem("๐Ÿ† Benchmark"):
677
+ gr.Markdown(
678
+ "### Automated Benchmark Leaderboard\n"
679
+ "Runs all selected agent strategies ร— all selected tasks automatically. "
680
+ "Ranks by composite score: correctness + causal reasoning + robustness + calibration + generalization."
681
+ )
682
+ with gr.Row():
683
+ bench_tasks = gr.CheckboxGroup(["task1","task2","task3"], value=["task1","task2"], label="Tasks to Benchmark")
684
+ bench_agents = gr.CheckboxGroup(
685
+ ["test-first","search-first","minimal","exhaustive"],
686
+ value=["test-first","minimal"],
687
+ label="Agent Strategies",
688
+ )
689
+ bench_btn = gr.Button("๐Ÿ† Run Benchmark (2โ€“4 min)", variant="primary")
690
+ bench_out = gr.Textbox(label="Leaderboard", lines=35, interactive=False)
691
+ bench_btn.click(run_benchmark, [bench_tasks, bench_agents], [bench_out])
692
+
693
+ # โ”€โ”€ Tab 12: Analytics โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
694
+ with gr.TabItem("๐Ÿ“ˆ Analytics"):
695
+ gr.Markdown(
696
+ "### Unified Research-Grade Analytics\n"
697
+ "Synthesizes ALL evaluation dimensions into one report: "
698
+ "reasoning graph, root cause tree, alternative paths, profile tags, "
699
+ "decision efficiency, composite score. Paper-ready JSON available."
700
+ )
701
+ with gr.Row():
702
+ analytics_btn = gr.Button("๐Ÿ“ˆ Full Analytics Report", variant="primary")
703
+ analytics_json_btn = gr.Button("๐Ÿ“‹ Export JSON", variant="secondary")
704
+ analytics_out = gr.Textbox(label="Analytics Report", lines=40, interactive=False)
705
+ analytics_btn.click(get_analytics, outputs=[analytics_out])
706
+ analytics_json_btn.click(get_analytics_json, outputs=[analytics_out])
707
+
708
+ # โ”€โ”€ Tab 13: API โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
709
  with gr.TabItem("๐Ÿ“– API"):
710
  gr.Markdown("""
711
+ ### REST API โ€” v4.0 Endpoints
712
 
713
+ #### Core
714
+ | `/reset` POST | `/step` POST | `/state` GET | `/health` GET |
 
 
 
 
 
715
 
716
  #### Evaluation
717
+ | `/trajectory` GET | `/evaluate` GET | `/metrics` GET | `/fault-config` POST |
718
+
719
+ #### Intelligence (v3)
720
+ | `/classify` GET | `/strategy` GET | `/advanced-metrics` GET | `/improvement-plan` GET | `/compare-agents` POST | `/viz-data` GET |
721
+
722
+ #### Research (v4 NEW)
723
+ | `/causal-probe` GET | `/counterfactual` GET | `/confidence` GET | `/benchmark` POST | `/analytics` GET |
 
 
 
 
 
 
 
 
 
724
 
725
  ```bash
726
  BASE="http://localhost:7860"
727
+ # Run a full episode
728
  curl -X POST "$BASE/reset?task=task1"
729
+ curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"tests/test_formatter.py"}'
730
  curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
731
+
732
+ # All intelligence endpoints
733
  curl "$BASE/classify"
734
+ curl "$BASE/causal-probe"
735
+ curl "$BASE/counterfactual"
736
+ curl "$BASE/confidence"
737
+ curl "$BASE/analytics"
738
+
739
+ # Benchmark
740
+ curl -X POST "$BASE/benchmark?tasks=task1,task2"
741
  ```
742
  """)
743
 
744
 
745
+ # โ”€โ”€ Mount FastAPI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
746
  from server.app import app as fastapi_app
747
  gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
748
 
e2e_test_v3.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ e2e_test_v3.py โ€” Full End-to-End test suite for v3.0
4
+
5
+ Tests every endpoint, all 3 tasks, all new intelligence modules,
6
+ multi-agent comparison, and the 3D viz-data endpoint.
7
+ """
8
+ import sys
9
+ import json
10
+ import time
11
+ import requests
12
+
13
+ BASE = "http://localhost:7860"
14
+ PASS = 0
15
+ FAIL = 0
16
+ RESULTS = []
17
+
18
+
19
+ def check(name, condition, detail=""):
20
+ global PASS, FAIL
21
+ status = "โœ… PASS" if condition else "โŒ FAIL"
22
+ if condition:
23
+ PASS += 1
24
+ else:
25
+ FAIL += 1
26
+ msg = f" {status} {name}"
27
+ if detail:
28
+ msg += f" โ†’ {detail}"
29
+ print(msg)
30
+ RESULTS.append({"name": name, "passed": condition, "detail": detail})
31
+
32
+
33
+ def section(title):
34
+ print(f"\n{'โ”'*60}")
35
+ print(f" {title}")
36
+ print(f"{'โ”'*60}")
37
+
38
+
39
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
40
+ section("1. HEALTH & BASIC CONNECTIVITY")
41
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
42
+
43
+ r = requests.get(f"{BASE}/health")
44
+ check("GET /health returns 200", r.status_code == 200)
45
+ data = r.json()
46
+ check("Health version is 3.0.0", data.get("version") == "3.0.0", data.get("version"))
47
+ check("Health status is ok", data.get("status") == "ok")
48
+
49
+
50
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
51
+ section("2. CORE OPENENV โ€” ALL 3 TASKS")
52
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
53
+
54
+ for task in ["task1", "task2", "task3"]:
55
+ r = requests.post(f"{BASE}/reset?task={task}")
56
+ check(f"POST /reset?task={task} โ†’ 200", r.status_code == 200, f"status={r.status_code}")
57
+ if r.status_code == 200:
58
+ d = r.json()
59
+ obs = d.get("observation", {})
60
+ check(f" {task}: has repo_tree", bool(obs.get("repo_tree")), str(obs.get("repo_tree", [])[:2]))
61
+ check(f" {task}: has variant_id", bool(d.get("info", {}).get("variant_id")))
62
+ check(f" {task}: steps_remaining > 0", obs.get("steps_remaining", 0) > 0)
63
+
64
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
65
+ section("3. STEP ACTIONS โ€” FULL EPISODE (task1)")
66
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
67
+
68
+ r = requests.post(f"{BASE}/reset?task=task1")
69
+ obs = r.json()["observation"]
70
+ tree = obs["repo_tree"]
71
+ test_files = [f for f in tree if f.startswith("tests/")]
72
+ src_files = [f for f in tree if f.startswith("src/")]
73
+
74
+ # read_file
75
+ r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
76
+ check("POST /step read_file test file โ†’ 200", r.status_code == 200)
77
+ check("read_file reward >= 0", r.json().get("reward", -1) >= 0, str(r.json().get("reward")))
78
+
79
+ r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": src_files[0]})
80
+ check("POST /step read_file src file โ†’ 200", r.status_code == 200)
81
+
82
+ # search_code
83
+ r = requests.post(f"{BASE}/step", json={"action_type": "search_code", "query": "def "})
84
+ check("POST /step search_code โ†’ 200", r.status_code == 200)
85
+
86
+ # run_tests
87
+ r = requests.post(f"{BASE}/step", json={"action_type": "run_tests"})
88
+ check("POST /step run_tests โ†’ 200", r.status_code == 200, f"reward={r.json().get('reward')}")
89
+
90
+ # submit
91
+ r = requests.post(f"{BASE}/step", json={"action_type": "submit"})
92
+ check("POST /step submit โ†’ 200", r.status_code == 200)
93
+ final_score = r.json()["info"].get("final_score", 0)
94
+ check("Episode done after submit", r.json().get("done") == True)
95
+
96
+ # Try stepping after done โ†’ should get 400
97
+ r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": "x.py"})
98
+ check("POST /step after done โ†’ 400", r.status_code == 400)
99
+
100
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
101
+ section("4. STATE ENDPOINT")
102
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
103
+
104
+ requests.post(f"{BASE}/reset?task=task1")
105
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
106
+ r = requests.get(f"{BASE}/state")
107
+ check("GET /state โ†’ 200", r.status_code == 200)
108
+ d = r.json()
109
+ check("State has observation", "observation" in d)
110
+ check("State total_steps_taken >= 1", d.get("total_steps_taken", 0) >= 1)
111
+
112
+
113
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
114
+ section("5. TRAJECTORY & EVALUATION")
115
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
116
+
117
+ requests.post(f"{BASE}/step", json={"action_type": "submit"})
118
+
119
+ r = requests.get(f"{BASE}/trajectory")
120
+ check("GET /trajectory โ†’ 200", r.status_code == 200)
121
+ traj = r.json()
122
+ check("Trajectory has episode_id", bool(traj.get("episode_id")))
123
+ check("Trajectory steps > 0", len(traj.get("steps", [])) > 0, f"steps={len(traj.get('steps',[]))}")
124
+
125
+ r = requests.get(f"{BASE}/evaluate")
126
+ check("GET /evaluate โ†’ 200", r.status_code == 200)
127
+ ev = r.json()
128
+ check("Evaluation has composite_score", "composite_score" in ev, str(ev.get("composite_score")))
129
+ check("Evaluation has 6 dimensions", len(ev.get("dimensions", {})) == 6, str(list(ev.get("dimensions", {}).keys())))
130
+
131
+ r = requests.get(f"{BASE}/metrics")
132
+ check("GET /metrics โ†’ 200", r.status_code == 200)
133
+ m = r.json()
134
+ check("Metrics has timeline", "timeline" in m, str(list(m.keys())[:5]))
135
+
136
+
137
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
138
+ section("6. FAULT INJECTION")
139
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
140
+
141
+ r = requests.post(f"{BASE}/fault-config", json={"level": "light"})
142
+ check("POST /fault-config light โ†’ 200", r.status_code == 200)
143
+ r = requests.post(f"{BASE}/reset?task=task1")
144
+ check("Reset with fault injection โ†’ 200", r.status_code == 200)
145
+ fi = r.json().get("info", {}).get("fault_injection", {})
146
+ check("Fault injection info present", "difficulty_multiplier" in fi or "faults_injected" in fi, str(fi))
147
+
148
+ # Reset back
149
+ requests.post(f"{BASE}/fault-config", json={"level": "none"})
150
+
151
+
152
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
153
+ section("7. INTELLIGENCE โ€” FAILURE CLASSIFIER")
154
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
155
+
156
+ # Run a fresh episode with minimal effort to get a known failure
157
+ requests.post(f"{BASE}/reset?task=task1")
158
+ requests.post(f"{BASE}/step", json={"action_type": "submit"}) # Submit without doing anything
159
+
160
+ r = requests.get(f"{BASE}/classify")
161
+ check("GET /classify โ†’ 200", r.status_code == 200)
162
+ d = r.json()
163
+ check("Classify has episode_id", "episode_id" in d, d.get("episode_id"))
164
+ check("Classify has primary_failure", "primary_failure" in d, d.get("primary_failure"))
165
+ check("Classify has success field", "success" in d)
166
+ check("Classify success=False for minimal effort", d.get("success") == False)
167
+ check("Classify has retry_hint", bool(d.get("retry_hint")), d.get("retry_hint", "")[:60])
168
+
169
+
170
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
171
+ section("8. INTELLIGENCE โ€” STRATEGY DETECTOR")
172
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
173
+
174
+ r = requests.get(f"{BASE}/strategy")
175
+ check("GET /strategy โ†’ 200", r.status_code == 200)
176
+ d = r.json()
177
+ check("Strategy has strategy field", "strategy" in d, d.get("strategy"))
178
+ VALID_STRATEGIES = ["TARGETED_DEBUGGING", "SYSTEMATIC_SEARCH", "BRUTE_FORCE",
179
+ "RANDOM_EXPLORATION", "SPEC_DRIVEN", "MINIMAL_EFFORT"]
180
+ check("Strategy is a known label", d.get("strategy") in VALID_STRATEGIES, d.get("strategy"))
181
+ check("Strategy has score 0-1", 0 <= d.get("score", -1) <= 1, str(d.get("score")))
182
+ check("Strategy has exploration_ratio", "exploration_ratio" in d)
183
+ check("Strategy has sub_patterns list", isinstance(d.get("sub_patterns"), list))
184
+
185
+
186
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
187
+ section("9. INTELLIGENCE โ€” ADVANCED METRICS")
188
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
189
+
190
+ r = requests.get(f"{BASE}/advanced-metrics")
191
+ check("GET /advanced-metrics โ†’ 200", r.status_code == 200)
192
+ d = r.json()
193
+ expected_keys = ["reasoning_efficiency", "exploration_ratio", "decision_entropy",
194
+ "reliability_index", "pivot_rate", "wasteful_ratio", "consistency_score"]
195
+ for key in expected_keys:
196
+ check(f" advanced-metrics has '{key}'", key in d, str(d.get(key, "MISSING")))
197
+ check("reliability_index in [0,1]", 0 <= d.get("reliability_index", -1) <= 1)
198
+ check("action_distribution is dict", isinstance(d.get("action_distribution"), dict))
199
+
200
+
201
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
202
+ section("10. INTELLIGENCE โ€” IMPROVEMENT PLAN")
203
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
204
+
205
+ r = requests.get(f"{BASE}/improvement-plan")
206
+ check("GET /improvement-plan โ†’ 200", r.status_code == 200)
207
+ d = r.json()
208
+ check("Plan has failure_type", "failure_type" in d, d.get("failure_type"))
209
+ check("Plan has what_went_wrong", bool(d.get("what_went_wrong")))
210
+ check("Plan has improved_strategy", bool(d.get("improved_strategy")))
211
+ check("Plan has step_by_step_plan list", isinstance(d.get("step_by_step_plan"), list))
212
+ check("Plan step_by_step_plan not empty", len(d.get("step_by_step_plan", [])) > 0)
213
+ check("Plan has system_prompt_addon", "system_prompt_addon" in d)
214
+
215
+
216
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
217
+ section("11. MULTI-AGENT COMPARISON")
218
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
219
+
220
+ r = requests.post(f"{BASE}/compare-agents?task=task1&agents=test-first,minimal")
221
+ check("POST /compare-agents (2 agents) โ†’ 200", r.status_code == 200, f"status={r.status_code}")
222
+ if r.status_code == 200:
223
+ d = r.json()
224
+ check("Comparison has winner", "winner" in d, d.get("winner"))
225
+ check("Comparison has summary_table", "summary_table" in d)
226
+ check("Summary table has 2 rows", len(d.get("summary_table", [])) == 2,
227
+ str(len(d.get("summary_table", []))))
228
+ check("Each row has score/steps/strategy", all(
229
+ "score" in row and "steps" in row and "strategy" in row
230
+ for row in d.get("summary_table", [])
231
+ ))
232
+ check("Comparison has insights", "insights" in d)
233
+ check("Comparison has detailed_runs", len(d.get("detailed_runs", [])) == 2)
234
+
235
+ # Test all 4 agents
236
+ r = requests.post(f"{BASE}/compare-agents?task=task1")
237
+ check("POST /compare-agents (all agents) โ†’ 200", r.status_code == 200)
238
+ if r.status_code == 200:
239
+ d = r.json()
240
+ check("All 4 agents ran", len(d.get("summary_table", [])) == 4,
241
+ f"rows={len(d.get('summary_table',[]))}")
242
+
243
+
244
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
245
+ section("12. 3D VISUALIZATION DATA")
246
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
247
+
248
+ # Run a full episode first for viz data
249
+ requests.post(f"{BASE}/reset?task=task1")
250
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
251
+ requests.post(f"{BASE}/step", json={"action_type": "submit"})
252
+
253
+ r = requests.get(f"{BASE}/viz-data")
254
+ check("GET /viz-data โ†’ 200", r.status_code == 200)
255
+ d = r.json()
256
+ check("Viz-data has files array", isinstance(d.get("files"), list), f"len={len(d.get('files',[]))}")
257
+ check("Viz-data files > 0", len(d.get("files", [])) > 0)
258
+ check("Viz-data has dependencies", isinstance(d.get("dependencies"), list))
259
+ check("Viz-data has steps", isinstance(d.get("steps"), list))
260
+ check("Viz-data has strategy", "strategy" in d, d.get("strategy"))
261
+ check("Viz-data has final_score", "final_score" in d)
262
+ if d.get("files"):
263
+ f = d["files"][0]
264
+ check("File node has name/type/is_bug_file", all(k in f for k in ["name","type","is_bug_file"]))
265
+
266
+
267
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
268
+ section("13. INVALID ACTION HANDLING")
269
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
270
+
271
+ requests.post(f"{BASE}/reset?task=task1")
272
+
273
+ # Invalid task
274
+ r = requests.post(f"{BASE}/reset?task=task99")
275
+ check("Invalid task โ†’ 400", r.status_code == 400)
276
+
277
+ # Invalid action type
278
+ r = requests.post(f"{BASE}/step", json={"action_type": "hack_system"})
279
+ check("Invalid action_type โ†’ 400 or 422", r.status_code in (400, 422))
280
+
281
+ # Non-existent file
282
+ r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": "non_existent.py"})
283
+ check("Read non-existent file โ†’ 200 with error", r.status_code == 200)
284
+ obs = r.json().get("observation", {})
285
+ check("Non-existent file has error in obs", bool(obs.get("last_action_error")), obs.get("last_action_error","")[:60])
286
+
287
+
288
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
289
+ section("14. SECURITY SCANNING")
290
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
291
+
292
+ requests.post(f"{BASE}/reset?task=task1")
293
+ # Try to write a file with dangerous code
294
+ r = requests.post(f"{BASE}/step", json={
295
+ "action_type": "write_file",
296
+ "path": src_files[0] if src_files else "src/hack.py",
297
+ "content": "import os\nos.system('rm -rf /')\n"
298
+ })
299
+ check("Write dangerous code โ†’ 200", r.status_code == 200)
300
+ if r.status_code == 200:
301
+ info = r.json().get("info", {})
302
+ flags = info.get("security_flags", [])
303
+ check("Security flags populated for os.system", len(flags) > 0, str(flags[:2]))
304
+
305
+
306
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
307
+ section("15. GRADIO UI ENDPOINTS")
308
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
309
+
310
+ r = requests.get(f"{BASE}/")
311
+ check("GET / (Gradio UI) โ†’ 200", r.status_code == 200)
312
+ check("Response is HTML", "text/html" in r.headers.get("content-type", ""))
313
+
314
+ r = requests.get(f"{BASE}/static/viz3d.html")
315
+ check("GET /static/viz3d.html โ†’ 200", r.status_code == 200)
316
+ check("viz3d.html is HTML", "html" in r.text.lower()[:200])
317
+ check("viz3d.html has Three.js", "three" in r.text.lower())
318
+ check("viz3d.html has timeline-slider", "timeline-slider" in r.text)
319
+
320
+
321
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
322
+ section("16. TASK2 & TASK3 FULL EPISODE")
323
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
324
+
325
+ for task in ["task2", "task3"]:
326
+ r = requests.post(f"{BASE}/reset?task={task}")
327
+ check(f"{task} reset โ†’ 200", r.status_code == 200)
328
+ obs = r.json()["observation"]
329
+ tree = obs["repo_tree"]
330
+ tf = [f for f in tree if f.startswith("tests/")]
331
+ sf = [f for f in tree if f.startswith("src/")]
332
+ md = [f for f in tree if f.endswith(".md")]
333
+
334
+ if task == "task3" and md:
335
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": md[0]})
336
+ if tf:
337
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": tf[0]})
338
+ if sf:
339
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": sf[0]})
340
+
341
+ r = requests.post(f"{BASE}/step", json={"action_type": "submit"})
342
+ check(f"{task} submit โ†’ done", r.json().get("done") == True)
343
+
344
+ # Verify all intelligence endpoints work post-episode
345
+ r = requests.get(f"{BASE}/classify")
346
+ check(f"{task} /classify works", r.status_code == 200 and "primary_failure" in r.json())
347
+ r = requests.get(f"{BASE}/strategy")
348
+ check(f"{task} /strategy works", r.status_code == 200 and "strategy" in r.json())
349
+
350
+
351
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€๏ฟฝ๏ฟฝโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
352
+ section("17. CONSISTENCY โ€” 3 RUNS SAME TASK")
353
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
354
+
355
+ scores = []
356
+ for i in range(3):
357
+ requests.post(f"{BASE}/reset?task=task1")
358
+ r = requests.get(f"{BASE}/state")
359
+ tree = r.json()["observation"]["repo_tree"]
360
+ tf = [f for f in tree if f.startswith("tests/")]
361
+ if tf:
362
+ requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": tf[0]})
363
+ requests.post(f"{BASE}/step", json={"action_type": "submit"})
364
+ metrics = requests.get(f"{BASE}/advanced-metrics").json()
365
+ scores.append(requests.get(f"{BASE}/evaluate").json().get("composite_score", 0))
366
+
367
+ check("3 runs completed", len(scores) == 3, str(scores))
368
+ check("All runs have valid scores", all(0 <= s <= 1 for s in scores), str(scores))
369
+
370
+ # Consistency metric
371
+ r = requests.get(f"{BASE}/advanced-metrics")
372
+ d = r.json()
373
+ check("Consistency score populated after multiple runs", d.get("runs_analyzed", 0) >= 1,
374
+ f"runs={d.get('runs_analyzed')}, consistency={d.get('consistency_score'):.3f}")
375
+
376
+
377
+ # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
378
+ print(f"\n{'โ•'*60}")
379
+ print(f" E2E RESULTS: {PASS} passed | {FAIL} failed | {PASS+FAIL} total")
380
+ print(f" Score: {PASS/(PASS+FAIL)*100:.1f}%")
381
+ print(f"{'โ•'*60}")
382
+
383
+ if FAIL > 0:
384
+ print("\nFailed tests:")
385
+ for r in RESULTS:
386
+ if not r["passed"]:
387
+ print(f" โŒ {r['name']}: {r['detail']}")
388
+
389
+ sys.exit(0 if FAIL == 0 else 1)
inference.py CHANGED
@@ -17,9 +17,13 @@ from openai import OpenAI
17
  import httpx
18
 
19
  # โ”€โ”€ Configuration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
20
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
21
- API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
22
- MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
 
 
 
 
23
  ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
24
 
25
  MAX_STEPS_PER_TASK = {"task1": 12, "task2": 18, "task3": 22}
@@ -229,7 +233,7 @@ def run_task(env_client: EnvClient, llm_client: OpenAI, task: str) -> tuple:
229
 
230
  def main():
231
  env_client = EnvClient(ENV_BASE_URL)
232
- llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
233
 
234
  all_scores = []
235
  for task in TASKS:
 
17
  import httpx
18
 
19
  # โ”€โ”€ Configuration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
20
+ API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
21
+ MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
22
+ HF_TOKEN = os.getenv("HF_TOKEN")
23
+
24
+ # Optional โ€” if you use from_docker_image():
25
+ LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
26
+
27
  ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
28
 
29
  MAX_STEPS_PER_TASK = {"task1": 12, "task2": 18, "task3": 22}
 
233
 
234
  def main():
235
  env_client = EnvClient(ENV_BASE_URL)
236
+ llm_client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
237
 
238
  all_scores = []
239
  for task in TASKS:
server/analytics_engine.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/analytics_engine.py
2
+ """
3
+ Unified Analytics Engine โ€” v4.0
4
+
5
+ Aggregates ALL scoring dimensions into a single research-grade report.
6
+ Produces:
7
+ - Reasoning graph (structured DAG of the agent's decision process)
8
+ - Root cause analysis (why the agent failed at every level)
9
+ - Decision efficiency score
10
+ - Overall AI reliability profile (radar chart data)
11
+ - Paper-ready JSON suitable for arXiv submission
12
+
13
+ This module is the "top of the stack" โ€” it calls all other engines
14
+ and synthesizes their outputs into one authoritative report.
15
+ """
16
+ from __future__ import annotations
17
+ import time
18
+ import json
19
+ from typing import List, Dict, Any, Optional, Tuple
20
+ from dataclasses import dataclass, field
21
+
22
+
23
+ @dataclass
24
+ class ReasoningNode:
25
+ """One node in the agent's reconstructed reasoning graph."""
26
+ node_id: str
27
+ step_number: int
28
+ action_type: str
29
+ target: Optional[str] # file path or search query
30
+ reward: float
31
+ was_useful: bool
32
+ connected_to: List[str] # IDs of subsequent nodes that built on this
33
+ label: str # Human-readable description
34
+
35
+
36
+ @dataclass
37
+ class ReasoningGraph:
38
+ """
39
+ A directed graph reconstruction of the agent's thought process.
40
+
41
+ Nodes = actions taken.
42
+ Edges = "built on" relationships (e.g., write followed a read = used info from read).
43
+ Clusters = logical reasoning phases (Exploration, Hypothesis, Verification, Commit)
44
+ """
45
+ nodes: List[ReasoningNode]
46
+ phases: Dict[str, List[str]] # phase_name โ†’ [node_ids]
47
+ critical_path: List[str] # node_ids on the most impactful path
48
+ wasted_nodes: List[str] # node_ids that contributed nothing
49
+ optimal_path_comparison: Optional[str] # What should the agent have done
50
+
51
+ def to_dict(self) -> dict:
52
+ return {
53
+ "nodes": [
54
+ {
55
+ "id": n.node_id, "step": n.step_number,
56
+ "action": n.action_type, "target": n.target,
57
+ "reward": round(n.reward, 3), "useful": n.was_useful,
58
+ "connects_to": n.connected_to, "label": n.label,
59
+ }
60
+ for n in self.nodes
61
+ ],
62
+ "phases": self.phases,
63
+ "critical_path": self.critical_path,
64
+ "wasted_nodes": self.wasted_nodes,
65
+ "optimal_path": self.optimal_path_comparison,
66
+ }
67
+
68
+
69
+ @dataclass
70
+ class AnalyticsReport:
71
+ """
72
+ The master analytics report โ€” synthesizes all evaluation dimensions.
73
+ Paper-ready, structured for research publication or leaderboard submission.
74
+ """
75
+ report_id: str
76
+ episode_id: str
77
+ task: str
78
+ variant_id: str
79
+ generated_at: float
80
+
81
+ # Dimension scores (0.0โ€“1.0 each)
82
+ correctness_score: float # Did it fix the bug?
83
+ causal_score: float # Did it understand WHY?
84
+ robustness_score: float # Is the strategy resilient?
85
+ calibration_score: float # Was it appropriately confident?
86
+ reliability_index: float # Weighted multi-dim score
87
+ generalization_hint: float # Based on strategy (robust strategies generalize better)
88
+ decision_efficiency: float # Score / Steps ratio (normalized)
89
+ process_quality: float # How structured was the reasoning process?
90
+
91
+ # Composite
92
+ composite_score: float # Weighted aggregate of all dimensions
93
+
94
+ # Graph
95
+ reasoning_graph: ReasoningGraph
96
+
97
+ # Root cause trees
98
+ failure_root_causes: List[Dict] # Each: {cause, effect, evidence, depth}
99
+
100
+ # Alternative path analysis
101
+ what_agent_did: List[str]
102
+ what_agent_should_have_done: List[str]
103
+ steps_wasted: int
104
+ steps_optimal: int
105
+
106
+ # Profile tags
107
+ profile_tags: List[str] # e.g., ["OVERCONFIDENT", "SHORTCUT_LEARNER", "WELL_CALIBRATED"]
108
+
109
+ # Executive summary
110
+ executive_summary: str
111
+ researcher_notes: str # More technical deep dive
112
+
113
+ def to_dict(self) -> dict:
114
+ return {
115
+ "report_id": self.report_id,
116
+ "episode_id": self.episode_id,
117
+ "task": self.task,
118
+ "variant_id": self.variant_id,
119
+ "generated_at": self.generated_at,
120
+ "dimension_scores": {
121
+ "correctness": round(self.correctness_score, 3),
122
+ "causal_reasoning": round(self.causal_score, 3),
123
+ "robustness": round(self.robustness_score, 3),
124
+ "calibration": round(self.calibration_score, 3),
125
+ "reliability_index": round(self.reliability_index, 3),
126
+ "generalization": round(self.generalization_hint, 3),
127
+ "decision_efficiency": round(self.decision_efficiency, 3),
128
+ "process_quality": round(self.process_quality, 3),
129
+ "composite": round(self.composite_score, 3),
130
+ },
131
+ "reasoning_graph": self.reasoning_graph.to_dict(),
132
+ "failure_root_causes": self.failure_root_causes,
133
+ "alternative_paths": {
134
+ "what_agent_did": self.what_agent_did,
135
+ "optimal_path": self.what_agent_should_have_done,
136
+ "steps_wasted": self.steps_wasted,
137
+ "steps_optimal": self.steps_optimal,
138
+ },
139
+ "profile_tags": self.profile_tags,
140
+ "executive_summary": self.executive_summary,
141
+ "researcher_notes": self.researcher_notes,
142
+ }
143
+
144
+ def render_text(self) -> str:
145
+ """Render a human-readable analytics report."""
146
+ def bar(v: float, width: int = 20) -> str:
147
+ filled = int(v * width)
148
+ return "โ–ˆ" * filled + "โ–‘" * (width - filled)
149
+
150
+ lines = [
151
+ f"{'โ•'*70}",
152
+ f" ๐Ÿ“ˆ ANALYTICS ENGINE REPORT โ€” {self.task} | {self.variant_id}",
153
+ f" Episode: {self.episode_id}",
154
+ f"{'โ•'*70}",
155
+ "",
156
+ "โ”Œโ”€ DIMENSION SCORES โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€",
157
+ f"โ”‚ Correctness [{bar(self.correctness_score)}] {self.correctness_score:.3f}",
158
+ f"โ”‚ Causal Reasoning [{bar(self.causal_score)}] {self.causal_score:.3f}",
159
+ f"โ”‚ Robustness [{bar(self.robustness_score)}] {self.robustness_score:.3f}",
160
+ f"โ”‚ Calibration [{bar(self.calibration_score)}] {self.calibration_score:.3f}",
161
+ f"โ”‚ Reliability [{bar(self.reliability_index)}] {self.reliability_index:.3f}",
162
+ f"โ”‚ Decision Effic. [{bar(self.decision_efficiency)}] {self.decision_efficiency:.3f}",
163
+ f"โ”‚ Process Quality [{bar(self.process_quality)}] {self.process_quality:.3f}",
164
+ f"โ”‚ {'โ”€'*60}",
165
+ f"โ”‚ COMPOSITE [{bar(self.composite_score)}] {self.composite_score:.3f}",
166
+ "โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€",
167
+ "",
168
+ ]
169
+
170
+ if self.profile_tags:
171
+ lines.append(f"๐Ÿท๏ธ Profile: {' | '.join(self.profile_tags)}")
172
+ lines.append("")
173
+
174
+ lines += [
175
+ "๐Ÿ“ Executive Summary",
176
+ f" {self.executive_summary}",
177
+ "",
178
+ ]
179
+
180
+ if self.failure_root_causes:
181
+ lines.append("๐Ÿ”ฅ Failure Root Cause Analysis")
182
+ for rc in self.failure_root_causes[:3]:
183
+ lines.append(f" Cause: {rc.get('cause')}")
184
+ lines.append(f" Effect: {rc.get('effect')}")
185
+ lines.append(f" Fix: {rc.get('remediation')}")
186
+ lines.append("")
187
+
188
+ lines += [
189
+ "๐Ÿ—บ๏ธ What Agent Did vs Optimal",
190
+ f" Steps taken: {len(self.what_agent_did)} | Steps optimal: {self.steps_optimal} | Wasted: {self.steps_wasted}",
191
+ ]
192
+ for a, o in zip(
193
+ self.what_agent_did[:5],
194
+ self.what_agent_should_have_done[:5],
195
+ ):
196
+ prefix_a = " โœ“" if a == o else " โœ—"
197
+ lines.append(f" Agent: {a}")
198
+ lines.append(f" Optimal: {o}")
199
+ lines.append("")
200
+
201
+ if self.researcher_notes:
202
+ lines += ["๐Ÿ”ฌ Researcher Notes", f" {self.researcher_notes}", ""]
203
+
204
+ lines.append(f"{'โ•'*70}")
205
+ return "\n".join(lines)
206
+
207
+
208
+ class AnalyticsEngine:
209
+ """
210
+ Master analytics engine โ€” integrates all evaluation modules.
211
+
212
+ Call .analyze() after an episode to get the full AnalyticsReport.
213
+ """
214
+
215
+ def analyze(
216
+ self,
217
+ env,
218
+ causal_report=None,
219
+ counterfactual_report=None,
220
+ calibration_report=None,
221
+ advanced_metrics=None,
222
+ failure_report=None,
223
+ strategy_report=None,
224
+ ) -> AnalyticsReport:
225
+ """
226
+ Synthesize all evaluation outputs into one AnalyticsReport.
227
+ Each sub-report is optional โ€” we gracefully handle None.
228
+ """
229
+ import uuid
230
+
231
+ traj = env.get_trajectory()
232
+ steps = traj.get("steps", []) if traj else []
233
+ meta = env.variant.meta if env.variant else {}
234
+ episode_id = traj.get("episode_id", "unknown") if traj else "unknown"
235
+ variant_id = traj.get("variant_id", "unknown") if traj else "unknown"
236
+ task = env.current_task or "unknown"
237
+ final_score = env.final_score
238
+ files_read = list(env.files_read)
239
+ files_written = list(env.files_written)
240
+
241
+ # โ”€โ”€ Run sub-engines if reports not provided โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
242
+ if causal_report is None:
243
+ from server.causal_probe import CausalProbe
244
+ causal_report = CausalProbe().probe(
245
+ episode_id, task, steps, meta, files_read, files_written, final_score
246
+ )
247
+ if counterfactual_report is None:
248
+ from server.counterfactual_engine import CounterfactualEngine
249
+ counterfactual_report = CounterfactualEngine().analyze(
250
+ episode_id, task, steps, meta, files_read, files_written, final_score
251
+ )
252
+ if calibration_report is None:
253
+ from server.confidence_calibrator import ConfidenceCalibrator
254
+ calibration_report = ConfidenceCalibrator().calibrate(
255
+ episode_id, task, steps, final_score
256
+ )
257
+ if advanced_metrics is None:
258
+ from server.advanced_metrics import AdvancedMetricsEngine
259
+ advanced_metrics = AdvancedMetricsEngine().compute(
260
+ steps, meta, final_score, files_read, files_written
261
+ )
262
+ if failure_report is None:
263
+ from server.failure_classifier import FailureClassifier
264
+ failure_report = FailureClassifier().classify(
265
+ episode_id, task, steps, meta, files_read, files_written, final_score
266
+ )
267
+ if strategy_report is None:
268
+ from server.strategy_detector import StrategyDetector
269
+ strategy_report = StrategyDetector().detect(
270
+ steps, task, meta, files_read, final_score
271
+ )
272
+
273
+ # โ”€โ”€ Compute derived scores โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
274
+ causal_score = causal_report.causal_score
275
+ robustness_score = counterfactual_report.robustness_score
276
+ calibration_score = calibration_report.calibration_score
277
+ reliability_index = advanced_metrics.reliability_index
278
+ correctness_score = final_score
279
+
280
+ # Decision efficiency: correctness per step, normalized
281
+ total_steps = max(len(steps), 1)
282
+ max_steps_possible = meta.get("max_steps", 20)
283
+ decision_efficiency = (
284
+ final_score /
285
+ max(1.0, total_steps / max(1, max_steps_possible / 3))
286
+ )
287
+ decision_efficiency = min(1.0, decision_efficiency)
288
+
289
+ # Process quality: measures structural quality of reasoning process
290
+ read_before_write = causal_report.read_before_write
291
+ tested_before_submit = causal_report.submit_after_test
292
+ used_search = causal_report.search_before_navigate
293
+ full_chain = causal_report.actual_chain_coverage
294
+ process_quality = (
295
+ (0.25 if read_before_write else 0.0) +
296
+ (0.25 if tested_before_submit else 0.0) +
297
+ (0.20 if used_search else 0.0) +
298
+ full_chain * 0.30
299
+ )
300
+
301
+ # Generalization hint from strategy robustness
302
+ strategy_generalization_map = {
303
+ "TARGETED_DEBUGGING": 0.75,
304
+ "SYSTEMATIC_SEARCH": 0.70,
305
+ "SPEC_DRIVEN": 0.80,
306
+ "BRUTE_FORCE": 0.40,
307
+ "RANDOM_EXPLORATION": 0.30,
308
+ "MINIMAL_EFFORT": 0.20,
309
+ }
310
+ generalization_hint = strategy_generalization_map.get(strategy_report.strategy, 0.5)
311
+ generalization_hint = (generalization_hint + robustness_score) / 2
312
+
313
+ # Composite (research-grade weighted aggregate)
314
+ composite_score = (
315
+ correctness_score * 0.30 +
316
+ causal_score * 0.20 +
317
+ robustness_score * 0.15 +
318
+ calibration_score * 0.12 +
319
+ reliability_index * 0.10 +
320
+ process_quality * 0.08 +
321
+ decision_efficiency * 0.05
322
+ )
323
+
324
+ # โ”€โ”€ Build reasoning graph โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
325
+ reasoning_graph = self._build_reasoning_graph(steps, meta, files_read, files_written)
326
+
327
+ # โ”€โ”€ Root cause analysis โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
328
+ root_causes = self._build_root_cause_tree(
329
+ failure_report, causal_report, calibration_report, final_score
330
+ )
331
+
332
+ # โ”€โ”€ Alternative path analysis โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
333
+ what_did = [
334
+ f"{s.get('action_type')} {s.get('action_path') or s.get('action_query') or ''}".strip()
335
+ for s in steps
336
+ ]
337
+ optimal = self._compute_optimal_path(meta, files_read, files_written, final_score)
338
+ steps_wasted = max(0, total_steps - len(optimal))
339
+
340
+ # โ”€โ”€ Profile tags โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
341
+ tags = []
342
+ if calibration_report.profile.value != "WELL_CALIBRATED":
343
+ tags.append(calibration_report.profile.value)
344
+ if causal_report.shortcut_learning_detected:
345
+ tags.append("SHORTCUT_LEARNER")
346
+ if causal_report.false_confidence_detected:
347
+ tags.append("FALSE_CONFIDENCE")
348
+ if counterfactual_report.brittleness_level.value in ("BRITTLE", "FRAGILE"):
349
+ tags.append(f"BRITTLE_STRATEGY_{counterfactual_report.brittleness_level.value}")
350
+ if causal_report.understanding_level.value == "DEEP":
351
+ tags.append("DEEP_REASONER")
352
+ if strategy_report.strategy == "TARGETED_DEBUGGING":
353
+ tags.append("TARGETED_DEBUGGER")
354
+ if not tags:
355
+ tags.append("TYPICAL")
356
+
357
+ # โ”€โ”€ Executive summary โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
358
+ summary_parts = [
359
+ f"Agent scored {final_score:.2f} on {task}.",
360
+ f"Causal understanding: {causal_report.understanding_level.value} ({causal_score:.2f}).",
361
+ f"Strategy: {strategy_report.strategy} (robustness: {robustness_score:.2f}).",
362
+ f"Confidence calibration: {calibration_report.profile.value} (error: {calibration_report.expected_calibration_error:.2f}).",
363
+ f"Composite reliability: {composite_score:.2f}.",
364
+ ]
365
+ executive_summary = " ".join(summary_parts)
366
+
367
+ # โ”€โ”€ Researcher notes โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
368
+ researcher_notes = (
369
+ f"Observed {total_steps} steps ({steps_wasted} wasted vs estimated {len(optimal)} optimal). "
370
+ f"Chain coverage: {causal_report.actual_chain_coverage:.0%}. "
371
+ f"Chain order score: {causal_report.chain_order_score:.2f}. "
372
+ f"Counterfactual mutations survived: {counterfactual_report.mutations_survived}/{len(counterfactual_report.mutations_tested)}. "
373
+ f"Expected calibration error: {calibration_report.expected_calibration_error:.3f}. "
374
+ f"Decision efficiency: {decision_efficiency:.3f}. "
375
+ f"Process quality: {process_quality:.3f}."
376
+ )
377
+
378
+ return AnalyticsReport(
379
+ report_id=f"ar_{uuid.uuid4().hex[:10]}",
380
+ episode_id=episode_id,
381
+ task=task,
382
+ variant_id=variant_id,
383
+ generated_at=time.time(),
384
+ correctness_score=correctness_score,
385
+ causal_score=causal_score,
386
+ robustness_score=robustness_score,
387
+ calibration_score=calibration_score,
388
+ reliability_index=reliability_index,
389
+ generalization_hint=generalization_hint,
390
+ decision_efficiency=decision_efficiency,
391
+ process_quality=process_quality,
392
+ composite_score=composite_score,
393
+ reasoning_graph=reasoning_graph,
394
+ failure_root_causes=root_causes,
395
+ what_agent_did=what_did,
396
+ what_agent_should_have_done=optimal,
397
+ steps_wasted=steps_wasted,
398
+ steps_optimal=len(optimal),
399
+ profile_tags=tags,
400
+ executive_summary=executive_summary,
401
+ researcher_notes=researcher_notes,
402
+ )
403
+
404
+ def _build_reasoning_graph(
405
+ self,
406
+ steps: List[dict],
407
+ meta: dict,
408
+ files_read: List[str],
409
+ files_written: List[str],
410
+ ) -> ReasoningGraph:
411
+ """Build a DAG from the trajectory steps."""
412
+ bug_files = set(meta.get("bug_files", []) + meta.get("files_to_implement", []))
413
+
414
+ nodes: List[ReasoningNode] = []
415
+ phases: Dict[str, List[str]] = {
416
+ "Exploration": [], "Hypothesis": [], "Verification": [], "Commit": []
417
+ }
418
+ files_read_set = set()
419
+ last_useful_node_id: Optional[str] = None
420
+ all_node_ids: List[str] = []
421
+
422
+ for s in steps:
423
+ node_id = f"n{s.get('step_number', len(nodes)+1)}"
424
+ atype = s.get("action_type", "unknown")
425
+ target = s.get("action_path") or s.get("action_query")
426
+ reward = s.get("reward", 0.0)
427
+
428
+ # Determine usefulness
429
+ was_useful = (
430
+ reward > 0 or
431
+ (atype == "read_file" and target in bug_files) or
432
+ (atype == "search_code") or
433
+ (atype == "run_tests") or
434
+ (atype == "submit" and reward > 0)
435
+ )
436
+
437
+ # Determine phase
438
+ if atype in ("read_file", "search_code"):
439
+ phase = "Exploration"
440
+ elif atype == "write_file":
441
+ phase = "Hypothesis"
442
+ elif atype == "run_tests":
443
+ phase = "Verification"
444
+ else:
445
+ phase = "Commit"
446
+
447
+ # Build label
448
+ short_target = (target.split("/")[-1] if target else "")[:20] if target else ""
449
+ label = f"{atype}({short_target})" if short_target else atype
450
+
451
+ # Connections: link to previous useful node
452
+ connects_to = [last_useful_node_id] if last_useful_node_id and was_useful else []
453
+ connects_to = [c for c in connects_to if c]
454
+
455
+ node = ReasoningNode(
456
+ node_id=node_id,
457
+ step_number=s.get("step_number", len(nodes) + 1),
458
+ action_type=atype,
459
+ target=target,
460
+ reward=reward,
461
+ was_useful=was_useful,
462
+ connected_to=connects_to,
463
+ label=label,
464
+ )
465
+ nodes.append(node)
466
+ phases[phase].append(node_id)
467
+ all_node_ids.append(node_id)
468
+ if was_useful:
469
+ last_useful_node_id = node_id
470
+
471
+ # Critical path: nodes with positive reward or that led to the final submit
472
+ critical_path = [n.node_id for n in nodes if n.reward > 0 or n.action_type == "submit"]
473
+ wasted_nodes = [n.node_id for n in nodes if not n.was_useful and n.action_type != "submit"]
474
+
475
+ # Optimal path comparison
476
+ optimal_actions = []
477
+ test_files = [f for f in (list(files_read) + list(bug_files)) if "test" in f.lower()]
478
+ src_files = [f for f in (list(files_read) + list(bug_files)) if f not in test_files]
479
+ for tf in test_files[:1]:
480
+ optimal_actions.append(f"read_file({tf.split('/')[-1]})")
481
+ for sf in src_files[:2]:
482
+ optimal_actions.append(f"read_file({sf.split('/')[-1]})")
483
+ optimal_actions += ["write_file(src)", "run_tests", "submit"]
484
+ optimal_path = " โ†’ ".join(optimal_actions)
485
+
486
+ return ReasoningGraph(
487
+ nodes=nodes,
488
+ phases={k: v for k, v in phases.items() if v},
489
+ critical_path=critical_path,
490
+ wasted_nodes=wasted_nodes,
491
+ optimal_path_comparison=optimal_path,
492
+ )
493
+
494
+ def _build_root_cause_tree(
495
+ self, failure_report, causal_report, calibration_report, final_score: float
496
+ ) -> List[Dict]:
497
+ """Build a structured root cause tree."""
498
+ causes = []
499
+
500
+ if failure_report and failure_report.failures:
501
+ for f in failure_report.failures[:3]:
502
+ causes.append({
503
+ "depth": "primary",
504
+ "cause": f.failure_type if hasattr(f, "failure_type") else str(f),
505
+ "effect": f.evidence if hasattr(f, "evidence") else "unknown",
506
+ "remediation": f.remediation if hasattr(f, "remediation") else "See improvement plan",
507
+ })
508
+ elif final_score < 0.5:
509
+ causes.append({
510
+ "depth": "primary",
511
+ "cause": failure_report.primary_failure if failure_report else "LOW_SCORE",
512
+ "effect": f"Final score only {final_score:.2f} โ€” bug not adequately fixed",
513
+ "remediation": "Use test-first navigation and verify with run_tests",
514
+ })
515
+
516
+ if causal_report and causal_report.guessing_indicators:
517
+ for ind in causal_report.guessing_indicators[:2]:
518
+ causes.append({
519
+ "depth": "secondary",
520
+ "cause": "CAUSAL_GAP",
521
+ "effect": ind,
522
+ "remediation": causal_report.recommendations[0] if causal_report.recommendations else "",
523
+ })
524
+
525
+ if calibration_report and calibration_report.profile.value == "OVERCONFIDENT":
526
+ causes.append({
527
+ "depth": "secondary",
528
+ "cause": "OVERCONFIDENCE",
529
+ "effect": f"Inferred confidence {calibration_report.inferred_confidence:.2f} vs actual {calibration_report.actual_performance:.2f}",
530
+ "remediation": "Read more before committing. Verify with tests.",
531
+ })
532
+
533
+ return causes
534
+
535
+ def _compute_optimal_path(
536
+ self, meta: dict, files_read: List[str], files_written: List[str], score: float
537
+ ) -> List[str]:
538
+ """Suggest what the optimal action sequence would have been."""
539
+ test_files = [f for f in files_read if "test" in f.lower()]
540
+ bug_files = meta.get("bug_files", []) or meta.get("files_to_implement", [])
541
+
542
+ path = []
543
+ for tf in (test_files or ["tests/test_main.py"])[:1]:
544
+ path.append(f"read_file {tf}")
545
+ for bf in (bug_files or ["src/main.py"])[:2]:
546
+ path.append(f"read_file {bf}")
547
+ path.append("search_code <function_name>")
548
+ path.append("write_file <targeted_fix>")
549
+ path.append("run_tests")
550
+ path.append("submit")
551
+ return path
server/app.py CHANGED
@@ -1,12 +1,14 @@
1
  # server/app.py
2
  """
3
- FastAPI server โ€” v3.0
4
 
5
  Core endpoints: POST /reset, POST /step, GET /state, GET /health
6
  Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
7
  Control endpoints: POST /fault-config
8
- Intelligence endpoints: GET /classify, GET /strategy, GET /advanced-metrics,
9
- POST /compare-agents, GET /improvement-plan, GET /viz-data
 
 
10
  """
11
  from fastapi import FastAPI, HTTPException
12
  from fastapi.staticfiles import StaticFiles
@@ -337,3 +339,146 @@ async def get_viz_data():
337
  "dependencies": deps,
338
  "steps": steps_data,
339
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # server/app.py
2
  """
3
+ FastAPI server โ€” v4.0
4
 
5
  Core endpoints: POST /reset, POST /step, GET /state, GET /health
6
  Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
7
  Control endpoints: POST /fault-config
8
+ Intelligence (v3): GET /classify, GET /strategy, GET /advanced-metrics,
9
+ POST /compare-agents, GET /improvement-plan, GET /viz-data
10
+ Research (v4 NEW): GET /causal-probe, GET /counterfactual, GET /confidence,
11
+ POST /benchmark, GET /analytics
12
  """
13
  from fastapi import FastAPI, HTTPException
14
  from fastapi.staticfiles import StaticFiles
 
339
  "dependencies": deps,
340
  "steps": steps_data,
341
  }
342
+
343
+
344
+ # โ”€โ”€ Research Endpoints (NEW in v4) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
345
+
346
+ from .causal_probe import CausalProbe
347
+ from .counterfactual_engine import CounterfactualEngine
348
+ from .confidence_calibrator import ConfidenceCalibrator
349
+ from .benchmark_runner import BenchmarkRunner
350
+ from .analytics_engine import AnalyticsEngine
351
+
352
+ _causal = CausalProbe()
353
+ _counter = CounterfactualEngine()
354
+ _calibrator = ConfidenceCalibrator()
355
+ _benchmark = BenchmarkRunner()
356
+ _analytics = AnalyticsEngine()
357
+
358
+
359
+ @app.get("/causal-probe")
360
+ async def causal_probe():
361
+ """
362
+ Causal reasoning probe โ€” did the agent understand WHY the bug exists?
363
+ Returns: causal_score, understanding_level, chain_coverage, shortcut_detection.
364
+ """
365
+ traj = env.get_trajectory()
366
+ if not traj:
367
+ return {"error": "No trajectory available."}
368
+ steps = traj.get("steps", [])
369
+ meta = env.variant.meta if env.variant else {}
370
+ report = _causal.probe(
371
+ episode_id=traj.get("episode_id", ""),
372
+ task=env.current_task or "unknown",
373
+ trajectory_steps=steps,
374
+ variant_meta=meta,
375
+ files_read=list(env.files_read),
376
+ files_written=list(env.files_written),
377
+ final_score=env.final_score,
378
+ )
379
+ return report.to_dict()
380
+
381
+
382
+ @app.get("/counterfactual")
383
+ async def counterfactual():
384
+ """
385
+ Counterfactual robustness test โ€” is the agent's strategy brittle?
386
+ Simulates 6 mutations and measures how many the strategy survives.
387
+ Returns: robustness_score, brittleness_level, mutations analysis.
388
+ """
389
+ traj = env.get_trajectory()
390
+ if not traj:
391
+ return {"error": "No trajectory available."}
392
+ steps = traj.get("steps", [])
393
+ meta = env.variant.meta if env.variant else {}
394
+ report = _counter.analyze(
395
+ episode_id=traj.get("episode_id", ""),
396
+ task=env.current_task or "unknown",
397
+ trajectory_steps=steps,
398
+ variant_meta=meta,
399
+ files_read=list(env.files_read),
400
+ files_written=list(env.files_written),
401
+ final_score=env.final_score,
402
+ )
403
+ return report.to_dict()
404
+
405
+
406
+ @app.get("/confidence")
407
+ async def confidence_calibration():
408
+ """
409
+ Confidence calibration โ€” is the agent appropriately confident?
410
+ Infers confidence from behavioral proxies and compares to actual performance.
411
+ Returns: profile (WELL_CALIBRATED|OVERCONFIDENT|UNDERCONFIDENT), calibration_score.
412
+ """
413
+ traj = env.get_trajectory()
414
+ if not traj:
415
+ return {"error": "No trajectory available."}
416
+ steps = traj.get("steps", [])
417
+ report = _calibrator.calibrate(
418
+ episode_id=traj.get("episode_id", ""),
419
+ task=env.current_task or "unknown",
420
+ trajectory_steps=steps,
421
+ final_score=env.final_score,
422
+ )
423
+ return report.to_dict()
424
+
425
+
426
+ @app.post("/benchmark")
427
+ async def run_benchmark(
428
+ tasks: str = "task1,task2",
429
+ agents: str = "all",
430
+ benchmark_id: str = None,
431
+ ):
432
+ """
433
+ Automated benchmark leaderboard.
434
+ Runs all selected agents ร— tasks. Returns ranked leaderboard.
435
+ tasks: comma-separated task IDs. agents: "all" or comma-separated strategy names.
436
+ """
437
+ task_list = [t.strip() for t in tasks.split(",") if t.strip()]
438
+ valid_tasks = ["task1", "task2", "task3"]
439
+ task_list = [t for t in task_list if t in valid_tasks]
440
+ if not task_list:
441
+ raise HTTPException(status_code=400, detail=f"tasks must be one of {valid_tasks}")
442
+
443
+ agent_list = None if agents == "all" else [a.strip() for a in agents.split(",")]
444
+
445
+ try:
446
+ report = _benchmark.run(env, tasks=task_list, agents=agent_list, benchmark_id=benchmark_id)
447
+ return report.to_dict()
448
+ except Exception as e:
449
+ raise HTTPException(status_code=500, detail=str(e))
450
+
451
+
452
+ @app.get("/analytics")
453
+ async def get_analytics():
454
+ """
455
+ Unified research-grade analytics report.
456
+ Synthesizes all v3+v4 evaluation dimensions into one report with:
457
+ reasoning graph, root cause tree, alternative paths, profile tags,
458
+ composite score, executive summary, researcher notes.
459
+ """
460
+ traj = env.get_trajectory()
461
+ if not traj:
462
+ return {"error": "No trajectory available."}
463
+ try:
464
+ report = _analytics.analyze(env)
465
+ return report.to_dict()
466
+ except Exception as e:
467
+ raise HTTPException(status_code=500, detail=str(e))
468
+
469
+
470
+ @app.get("/health")
471
+ async def health_v4():
472
+ return {
473
+ "status": "ok",
474
+ "environment": "codebase-nav-env",
475
+ "version": "4.0.0",
476
+ "endpoints": [
477
+ "/reset", "/step", "/state", "/health",
478
+ "/trajectory", "/evaluate", "/metrics", "/fault-config",
479
+ "/classify", "/strategy", "/advanced-metrics",
480
+ "/improvement-plan", "/compare-agents", "/viz-data",
481
+ "/causal-probe", "/counterfactual", "/confidence",
482
+ "/benchmark", "/analytics",
483
+ ],
484
+ }
server/benchmark_runner.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/benchmark_runner.py
2
+ """
3
+ Benchmark Runner + Leaderboard โ€” v4.0
4
+
5
+ Automatically runs ALL tasks ร— selected agent configurations and generates
6
+ a research-grade leaderboard output with per-task, per-strategy breakdowns.
7
+
8
+ Unlike existing benchmarks (SWE-bench, HumanEval) which require manual setup,
9
+ this runs end-to-end in-process with deterministic strategies.
10
+
11
+ Output format:
12
+ - Leaderboard table (ranked by composite score)
13
+ - Per-task breakdown
14
+ - Per-failure-type breakdown
15
+ - Generalization score (variance across tasks)
16
+ - Robustness score (from counterfactual engine)
17
+ - A "benchmark JSON" suitable for publishing or comparing systems
18
+ """
19
+ from __future__ import annotations
20
+ import time
21
+ import json
22
+ from typing import List, Dict, Any, Optional
23
+ from dataclasses import dataclass, field
24
+
25
+
26
+ @dataclass
27
+ class BenchmarkResult:
28
+ """Result of running one agent on one task variant."""
29
+ agent_name: str
30
+ task: str
31
+ variant_id: str
32
+ final_score: float
33
+ total_steps: int
34
+ cumulative_reward: float
35
+ duration_seconds: float
36
+ strategy: str
37
+ failure_type: str
38
+ reliability_index: float
39
+ causal_score: float
40
+ robustness_score: float
41
+ calibration_score: float
42
+ action_sequence: List[str]
43
+
44
+
45
+ @dataclass
46
+ class AgentBenchmarkSummary:
47
+ """Aggregated results for one agent across all tasks."""
48
+ agent_name: str
49
+ tasks_run: int
50
+ mean_score: float
51
+ std_score: float
52
+ generalization_score: float # 1 - std (lower variance = more generalizable)
53
+ mean_steps: float
54
+ best_task: str
55
+ worst_task: str
56
+ mean_reliability: float
57
+ mean_causal_score: float
58
+ mean_robustness_score: float
59
+ mean_calibration_score: float
60
+ dominant_strategy: str
61
+ dominant_failure: str
62
+ composite_rank_score: float # Weighted final score for leaderboard
63
+ per_task_scores: Dict[str, float]
64
+
65
+ def to_dict(self) -> dict:
66
+ return {
67
+ "agent_name": self.agent_name,
68
+ "tasks_run": self.tasks_run,
69
+ "scores": {
70
+ "mean": round(self.mean_score, 3),
71
+ "std": round(self.std_score, 3),
72
+ "generalization": round(self.generalization_score, 3),
73
+ "reliability": round(self.mean_reliability, 3),
74
+ "causal_reasoning": round(self.mean_causal_score, 3),
75
+ "robustness": round(self.mean_robustness_score, 3),
76
+ "calibration": round(self.mean_calibration_score, 3),
77
+ "composite": round(self.composite_rank_score, 3),
78
+ },
79
+ "efficiency": {
80
+ "mean_steps": round(self.mean_steps, 1),
81
+ },
82
+ "behavior": {
83
+ "dominant_strategy": self.dominant_strategy,
84
+ "dominant_failure": self.dominant_failure,
85
+ },
86
+ "per_task_scores": {k: round(v, 3) for k, v in self.per_task_scores.items()},
87
+ "best_task": self.best_task,
88
+ "worst_task": self.worst_task,
89
+ }
90
+
91
+
92
+ @dataclass
93
+ class LeaderboardReport:
94
+ """Full benchmark leaderboard."""
95
+ benchmark_id: str
96
+ tasks_evaluated: List[str]
97
+ agents_evaluated: List[str]
98
+ total_episodes: int
99
+ run_duration_seconds: float
100
+ rankings: List[AgentBenchmarkSummary]
101
+ raw_results: List[BenchmarkResult]
102
+
103
+ def to_dict(self) -> dict:
104
+ return {
105
+ "benchmark_id": self.benchmark_id,
106
+ "tasks_evaluated": self.tasks_evaluated,
107
+ "agents_evaluated": self.agents_evaluated,
108
+ "total_episodes": self.total_episodes,
109
+ "run_duration_seconds": round(self.run_duration_seconds, 2),
110
+ "leaderboard": [r.to_dict() for r in self.rankings],
111
+ "winner": self.rankings[0].agent_name if self.rankings else "none",
112
+ "insights": self._generate_insights(),
113
+ }
114
+
115
+ def _generate_insights(self) -> List[str]:
116
+ if not self.rankings:
117
+ return []
118
+ insights = []
119
+ top = self.rankings[0]
120
+ bottom = self.rankings[-1]
121
+
122
+ if top.composite_rank_score - bottom.composite_rank_score > 0.2:
123
+ insights.append(
124
+ f"Large performance gap: '{top.agent_name}' ({top.composite_rank_score:.2f}) "
125
+ f"vs '{bottom.agent_name}' ({bottom.composite_rank_score:.2f})"
126
+ )
127
+ if top.generalization_score > 0.7:
128
+ insights.append(
129
+ f"'{top.agent_name}' shows strong generalization "
130
+ f"(std={top.std_score:.3f} across {top.tasks_run} tasks)"
131
+ )
132
+ for r in self.rankings:
133
+ if r.mean_causal_score > 0.6:
134
+ insights.append(
135
+ f"'{r.agent_name}' demonstrated genuine causal reasoning "
136
+ f"(causal_score={r.mean_causal_score:.2f})"
137
+ )
138
+ strategies = [r.dominant_strategy for r in self.rankings]
139
+ if len(set(strategies)) > 1:
140
+ best_strategy = self.rankings[0].dominant_strategy
141
+ insights.append(
142
+ f"Strategy '{best_strategy}' produced the highest composite score."
143
+ )
144
+ return insights
145
+
146
+ def render_table(self) -> str:
147
+ """Render ASCII leaderboard table."""
148
+ if not self.rankings:
149
+ return "No results."
150
+
151
+ lines = [
152
+ f"{'โ•'*90}",
153
+ f" ๐Ÿ† BENCHMARK LEADERBOARD โ€” {self.benchmark_id}",
154
+ f" Tasks: {', '.join(self.tasks_evaluated)} | Agents: {len(self.agents_evaluated)} | Episodes: {self.total_episodes}",
155
+ f"{'โ•'*90}",
156
+ f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Causal':<8} {'Robust':<8} {'Calibr':<8} {'Genrz':<8} {'Steps':<7} {'Strategy'}",
157
+ f"{'โ”€'*90}",
158
+ ]
159
+ for i, r in enumerate(self.rankings):
160
+ medal = "๐Ÿฅ‡" if i == 0 else "๐Ÿฅˆ" if i == 1 else "๐Ÿฅ‰" if i == 2 else f" #{i+1}"
161
+ lines.append(
162
+ f"{medal:<5} {r.agent_name:<16} {r.mean_score:<8.3f} "
163
+ f"{r.mean_causal_score:<8.3f} {r.mean_robustness_score:<8.3f} "
164
+ f"{r.mean_calibration_score:<8.3f} {r.generalization_score:<8.3f} "
165
+ f"{r.mean_steps:<7.1f} {r.dominant_strategy}"
166
+ )
167
+ lines.append(f"{'โ•'*90}")
168
+
169
+ lines.append("\n๐Ÿ“Š Per-Task Breakdown:")
170
+ for r in self.rankings:
171
+ task_str = " | ".join(f"{t}: {s:.2f}" for t, s in sorted(r.per_task_scores.items()))
172
+ lines.append(f" {r.agent_name:<16} {task_str}")
173
+
174
+ if self._generate_insights():
175
+ lines.append("\n๐Ÿ’ก Insights:")
176
+ lines.extend(f" โ†’ {i}" for i in self._generate_insights())
177
+
178
+ return "\n".join(lines)
179
+
180
+
181
class BenchmarkRunner:
    """
    Automated benchmark runner.

    Runs each agent in AGENT_CONFIGS across each task, collecting:
    - Final score
    - All intelligence metrics (causal, counterfactual, confidence)
    - Strategy and failure classification
    - Reliability index

    Then generates a ranked leaderboard.
    """

    def run(
        self,
        env,
        tasks: Optional[List[str]] = None,
        agents: Optional[List[str]] = None,
        benchmark_id: Optional[str] = None,
        max_steps: int = 15,
    ) -> LeaderboardReport:
        """Run the full benchmark across all tasks and agents.

        Args:
            env: Environment exposing reset(task=...), step(action), done,
                final_score, get_trajectory() and variant (same contract as
                used by the rest of this module).
            tasks: Task names to evaluate; defaults to task1..task3.
            agents: Optional subset of configured agent names to run.
            benchmark_id: Identifier for this run; random when omitted.
            max_steps: Per-episode step budget before a forced submit
                (new keyword; default preserves the old hard-coded 15).

        Returns:
            LeaderboardReport with ranked summaries and raw episode results.
        """
        import uuid
        # Analysis engines are imported lazily, matching the module's
        # existing style (avoids import cycles at load time).
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine
        from server.causal_probe import CausalProbe
        from server.counterfactual_engine import CounterfactualEngine
        from server.confidence_calibrator import ConfidenceCalibrator
        # Fix: dropped the unused `from server.models import RepoAction`
        # (only _run_episode needs it, and it imports it itself).

        benchmark_id = benchmark_id or f"bench_{uuid.uuid4().hex[:8]}"
        tasks = tasks or ["task1", "task2", "task3"]
        agent_configs = self._get_agent_configs()
        if agents:
            agent_configs = {k: v for k, v in agent_configs.items() if k in agents}

        clf = FailureClassifier()
        det = StrategyDetector()
        adv = AdvancedMetricsEngine()
        causal = CausalProbe()
        counter = CounterfactualEngine()
        calibrator = ConfidenceCalibrator()

        start_time = time.time()
        all_results: List[BenchmarkResult] = []

        for task in tasks:
            for agent_name, agent_fn in agent_configs.items():
                try:
                    result = self._run_episode(
                        env, task, agent_name, agent_fn,
                        clf, det, adv, causal, counter, calibrator,
                        max_steps=max_steps,
                    )
                    all_results.append(result)
                except Exception:
                    # Don't crash the whole benchmark on one failure —
                    # record a zeroed placeholder so this agent/task pair
                    # still appears in the rankings.
                    all_results.append(BenchmarkResult(
                        agent_name=agent_name, task=task, variant_id="error",
                        final_score=0.0, total_steps=0, cumulative_reward=0.0,
                        duration_seconds=0.0, strategy="ERROR", failure_type="BENCHMARK_ERROR",
                        reliability_index=0.0, causal_score=0.0, robustness_score=0.0,
                        calibration_score=0.0, action_sequence=[],
                    ))

        total_duration = time.time() - start_time
        rankings = self._compute_rankings(all_results, tasks)

        return LeaderboardReport(
            benchmark_id=benchmark_id,
            tasks_evaluated=tasks,
            agents_evaluated=list(agent_configs.keys()),
            total_episodes=len(all_results),
            run_duration_seconds=total_duration,
            rankings=rankings,
            raw_results=all_results,
        )

    def _run_episode(
        self, env, task, agent_name, agent_fn,
        clf, det, adv, causal, counter, calibrator,
        max_steps: int = 15,
    ) -> BenchmarkResult:
        """Run one agent/task episode and collect every metric for it."""
        from server.models import RepoAction

        reset_result = env.reset(task=task)
        obs = reset_result.observation
        variant_id = reset_result.info.get("variant_id", "unknown")
        context = {}

        obs_dict = obs.model_dump()
        start = time.time()
        cumulative_reward = 0.0
        files_read, files_written, action_sequence = [], [], []

        for step_num in range(1, max_steps + 1):
            if env.done:
                break
            action_dict = agent_fn(obs_dict, step_num, context)
            action = RepoAction(
                action_type=action_dict.get("action_type", "submit"),
                path=action_dict.get("path"),
                query=action_dict.get("query"),
            )
            result = env.step(action)
            obs = result.observation
            obs_dict = obs.model_dump()
            cumulative_reward += result.reward
            action_sequence.append(action.action_type)
            if action.path and action.action_type == "read_file":
                files_read.append(action.path)
            if action.path and action.action_type == "write_file":
                files_written.append(action.path)
            if result.done:
                break

        # Agent exhausted its budget without finishing — force a submit so
        # the episode always terminates with a final score.
        if not env.done:
            r = env.step(RepoAction(action_type="submit"))
            cumulative_reward += r.reward
            action_sequence.append("submit")

        duration = time.time() - start
        final_score = env.final_score
        traj = env.get_trajectory()
        steps = traj.get("steps", []) if traj else []
        meta = env.variant.meta if env.variant else {}

        # Intelligence metrics
        fail_r = clf.classify(
            traj.get("episode_id", "") if traj else "", task,
            steps, meta, files_read, files_written, final_score
        )
        strat_r = det.detect(steps, task, meta, files_read, final_score)
        adv_r = adv.compute(steps, meta, final_score, files_read, files_written)
        causal_r = causal.probe(
            traj.get("episode_id", "") if traj else "", task,
            steps, meta, files_read, files_written, final_score
        )
        counter_r = counter.analyze(
            traj.get("episode_id", "") if traj else "", task,
            steps, meta, files_read, files_written, final_score
        )
        calib_r = calibrator.calibrate(
            traj.get("episode_id", "") if traj else "", task,
            steps, final_score,
        )

        return BenchmarkResult(
            agent_name=agent_name,
            task=task,
            variant_id=variant_id,
            final_score=final_score,
            total_steps=len(action_sequence),
            cumulative_reward=cumulative_reward,
            duration_seconds=duration,
            strategy=strat_r.strategy,
            failure_type=fail_r.primary_failure,
            reliability_index=adv_r.reliability_index,
            causal_score=causal_r.causal_score,
            robustness_score=counter_r.robustness_score,
            calibration_score=calib_r.calibration_score,
            action_sequence=action_sequence,
        )

    def _compute_rankings(
        self, results: List[BenchmarkResult], tasks: List[str]
    ) -> List[AgentBenchmarkSummary]:
        """Aggregate per-episode results into ranked per-agent summaries."""
        import math
        from collections import Counter

        # Group by agent
        agent_results: Dict[str, List[BenchmarkResult]] = {}
        for r in results:
            agent_results.setdefault(r.agent_name, []).append(r)

        summaries = []
        for agent_name, agent_res in agent_results.items():
            scores = [r.final_score for r in agent_res]
            mean_score = sum(scores) / len(scores)
            # Population std-dev across tasks; a single episode has no spread.
            if len(scores) > 1:
                variance = sum((s - mean_score) ** 2 for s in scores) / len(scores)
                std_score = math.sqrt(variance)
            else:
                std_score = 0.0
            # Lower variance across tasks = better generalization.
            generalization_score = max(0.0, 1.0 - std_score)

            per_task = {r.task: r.final_score for r in agent_res}
            strategies = Counter(r.strategy for r in agent_res)
            failures = Counter(r.failure_type for r in agent_res)

            mean_steps = sum(r.total_steps for r in agent_res) / len(agent_res)
            mean_reliability = sum(r.reliability_index for r in agent_res) / len(agent_res)
            mean_causal = sum(r.causal_score for r in agent_res) / len(agent_res)
            mean_robustness = sum(r.robustness_score for r in agent_res) / len(agent_res)
            mean_calibration = sum(r.calibration_score for r in agent_res) / len(agent_res)

            # Composite leaderboard score — weighted across all dimensions
            composite = (
                mean_score * 0.35 +
                mean_causal * 0.20 +
                mean_robustness * 0.15 +
                mean_calibration * 0.15 +
                generalization_score * 0.15
            )

            best_task = max(per_task, key=per_task.get)
            worst_task = min(per_task, key=per_task.get)

            summaries.append(AgentBenchmarkSummary(
                agent_name=agent_name,
                tasks_run=len(agent_res),
                mean_score=mean_score,
                std_score=std_score,
                generalization_score=generalization_score,
                mean_steps=mean_steps,
                best_task=best_task,
                worst_task=worst_task,
                mean_reliability=mean_reliability,
                mean_causal_score=mean_causal,
                mean_robustness_score=mean_robustness,
                mean_calibration_score=mean_calibration,
                dominant_strategy=strategies.most_common(1)[0][0],
                dominant_failure=failures.most_common(1)[0][0],
                composite_rank_score=composite,
                per_task_scores=per_task,
            ))

        summaries.sort(key=lambda s: -s.composite_rank_score)
        return summaries

    def _get_agent_configs(self) -> Dict:
        """Reuse built-in strategies from multi_agent.py."""
        from server.multi_agent import MultiAgentComparison
        return MultiAgentComparison.AGENT_CONFIGS
server/causal_probe.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/causal_probe.py
2
+ """
3
+ Causal Reasoning Probe โ€” v4.0
4
+
5
+ The key scientific question: Did the agent understand WHY the bug exists,
6
+ or did it accidentally fix it by pattern matching?
7
+
8
+ We measure causal understanding by checking if the agent traversed the
9
+ COMPLETE causal chain: Failing test โ†’ tested function โ†’ return path โ†’ root cause.
10
+
11
+ An agent that reads only the test and immediately rewrites the function
12
+ is guessing. An agent that reads test โ†’ traces the call stack โ†’ finds the
13
+ actual cause first is reasoning causally.
14
+
15
+ This is NOT in any current benchmark. SWE-bench only checks if the test passes.
16
+ We check HOW the agent got there.
17
+ """
18
+ from __future__ import annotations
19
+ from typing import List, Dict, Any, Optional
20
+ from dataclasses import dataclass, field
21
+ from enum import Enum
22
+
23
+
24
class CausalUnderstandingLevel(str, Enum):
    """How deeply the agent appears to understand the bug's cause.

    Ordered strongest to weakest:
      DEEP        — full causal chain traversal
      PARTIAL     — partial chain (some steps missing)
      SUPERFICIAL — direct test→rewrite with no chain
      RANDOM      — no discernible causal pattern
    """

    DEEP = "DEEP"
    PARTIAL = "PARTIAL"
    SUPERFICIAL = "SUPERFICIAL"
    RANDOM = "RANDOM"
30
+
31
@dataclass
class CausalChainNode:
    """A single file in the reconstructed causal chain.

    Captures both what the chain expects (file, role) and what the agent
    actually did with it (was_read, read_order).
    """

    file: str
    # One of: "test", "caller", "called", "root_cause", "missed".
    role: str
    was_read: bool
    # Step number at which the agent first read this file, or None if never.
    read_order: Optional[int]
38
+
39
+
40
@dataclass
class CausalProbeReport:
    """
    Full causal reasoning analysis for one episode.
    This is the primary output of the CausalProbe.
    """

    episode_id: str
    task: str

    # Core verdict
    understanding_level: CausalUnderstandingLevel
    causal_score: float  # 0.0 – 1.0

    # Chain analysis
    expected_chain: List[CausalChainNode]  # What SHOULD have been read
    actual_chain_coverage: float  # Fraction of the chain actually traversed
    chain_order_score: float  # Was the chain traversed in the correct order?

    # Behavioral signals
    read_before_write: bool  # Read all relevant files before writing?
    test_informed_navigation: bool  # Did tests steer subsequent reads?
    search_before_navigate: bool  # Searched for names before reading?
    submit_after_test: bool  # Verified the fix before submitting?

    # Signal: understanding vs guessing
    guessing_indicators: List[str]
    understanding_indicators: List[str]

    # Calibration
    false_confidence_detected: bool  # Submitted without reading the root cause
    shortcut_learning_detected: bool  # test → write → submit, source skipped

    explanation: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; scores rounded to 3 decimals."""
        signals = {
            "read_before_write": self.read_before_write,
            "test_informed_navigation": self.test_informed_navigation,
            "search_before_navigate": self.search_before_navigate,
            "submit_after_test": self.submit_after_test,
        }
        diagnostics = {
            "false_confidence_detected": self.false_confidence_detected,
            "shortcut_learning_detected": self.shortcut_learning_detected,
        }
        chain = [
            {"file": node.file, "role": node.role, "read": node.was_read, "order": node.read_order}
            for node in self.expected_chain
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "understanding_level": self.understanding_level.value,
            "causal_score": round(self.causal_score, 3),
            "chain_coverage": round(self.actual_chain_coverage, 3),
            "chain_order_score": round(self.chain_order_score, 3),
            "behavioral_signals": signals,
            "guessing_indicators": self.guessing_indicators,
            "understanding_indicators": self.understanding_indicators,
            "diagnostics": diagnostics,
            "expected_chain": chain,
            "explanation": self.explanation,
            "recommendations": self.recommendations,
        }
102
+
103
+
104
class CausalProbe:
    """
    Analyzes whether an agent engaged in true causal reasoning.

    The core insight: for a bug in src/X.py called from tests/test_X.py,
    the causal chain is:
        tests/test_X.py → (calls) → src/X.py → (calls) → src/utils.py (maybe)

    A causally-aware agent reads in this order.
    A shortcut agent reads the test, guesses the bug, rewrites without reading source.

    We score order, coverage, and behavioral signals.
    """

    def probe(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: dict,
        files_read: List[str],
        files_written: List[str],
        final_score: float,
    ) -> CausalProbeReport:
        """Run the causal probe on an episode's trajectory.

        Args:
            episode_id: Identifier of the episode being analyzed.
            task: Task name the episode was run on.
            trajectory_steps: Step dicts with at least "action_type",
                "action_path" and "step_number" keys.
            variant_meta: Task-variant metadata (test_files, bug_files, ...).
            files_read: Paths the agent read during the episode.
            files_written: Paths the agent wrote during the episode.
            final_score: Episode's final score in [0, 1].

        Returns:
            CausalProbeReport with the full causal-reasoning verdict.
        """

        # ── Build expected causal chain from variant metadata ─────────────────
        test_files = variant_meta.get("test_files", []) or [
            f for f in variant_meta.get("read_first_files", []) if "test" in f
        ]
        bug_files = variant_meta.get("bug_files", []) or variant_meta.get("files_to_implement", [])
        dep_files = variant_meta.get("dependencies", []) or []

        # If metadata sparse, infer from trajectory
        all_files_in_traj = list({
            s.get("action_path") for s in trajectory_steps
            if s.get("action_path") and s.get("action_type") in ("read_file", "write_file")
        })

        if not test_files:
            test_files = [f for f in all_files_in_traj if "test" in f.lower()]
        if not bug_files:
            bug_files = [f for f in all_files_in_traj
                         if "test" not in f.lower() and f.endswith(".py")]

        # Build expected chain
        expected_chain: List[CausalChainNode] = []
        read_set = set(files_read)
        read_order: Dict[str, int] = {}
        for step in trajectory_steps:
            if step.get("action_type") == "read_file" and step.get("action_path"):
                path = step["action_path"]
                if path not in read_order:
                    read_order[path] = step.get("step_number", len(read_order) + 1)

        for tf in test_files:
            expected_chain.append(CausalChainNode(
                file=tf, role="test",
                was_read=tf in read_set,
                read_order=read_order.get(tf),
            ))
        for bf in bug_files:
            expected_chain.append(CausalChainNode(
                file=bf, role="root_cause",
                was_read=bf in read_set,
                read_order=read_order.get(bf),
            ))
        for df in dep_files:
            expected_chain.append(CausalChainNode(
                file=df, role="caller",
                was_read=df in read_set,
                read_order=read_order.get(df),
            ))

        if not expected_chain:
            # Fallback: any file is better than none.
            # Fix: was_read was hard-coded True here, but all_files_in_traj
            # also contains write-only files — report what actually happened.
            for f in all_files_in_traj[:3]:
                expected_chain.append(CausalChainNode(
                    file=f, role="unknown",
                    was_read=f in read_set,
                    read_order=read_order.get(f),
                ))

        # ── Chain coverage ────────────────────────────────────────────────────
        chain_files_read = [n for n in expected_chain if n.was_read and n.role != "missed"]
        actual_chain_coverage = (
            len(chain_files_read) / len(expected_chain) if expected_chain else 0.0
        )

        # ── Chain order score (tests before src = good causal order) ──────────
        chain_order_score = 0.0
        test_orders = [n.read_order for n in expected_chain if n.role == "test" and n.read_order]
        src_orders = [n.read_order for n in expected_chain
                      if n.role in ("root_cause", "caller") and n.read_order]

        if test_orders and src_orders:
            # Good: all tests read before source files
            correct_order_pairs = sum(
                1 for to in test_orders for so in src_orders if to < so
            )
            total_pairs = len(test_orders) * len(src_orders)
            chain_order_score = correct_order_pairs / total_pairs if total_pairs > 0 else 0.0
        elif test_orders and not src_orders:
            chain_order_score = 0.3  # Partial — read tests but not source
        elif src_orders and not test_orders:
            chain_order_score = 0.2  # Read source without reading tests = weaker

        # ── Behavioral signals ────────────────────────────────────────────────
        action_types = [s.get("action_type", "") for s in trajectory_steps]
        # (Removed an unused `action_paths` list that was computed here.)

        # read_before_write: all written files were read at least once before write
        read_before_write = True
        for step in trajectory_steps:
            if step.get("action_type") == "write_file" and step.get("action_path"):
                p = step["action_path"]
                step_n = step.get("step_number", 0)
                was_read_before = any(
                    s2.get("action_type") == "read_file"
                    and s2.get("action_path") == p
                    and s2.get("step_number", 99) < step_n
                    for s2 in trajectory_steps
                )
                if not was_read_before:
                    read_before_write = False
                    break

        # test_informed_navigation: did agent read source files AFTER reading tests?
        test_read_step = min(
            (s.get("step_number", 99) for s in trajectory_steps
             if s.get("action_type") == "read_file"
             and any(tf in (s.get("action_path") or "") for tf in test_files)),
            default=None
        )
        src_read_after_test = test_read_step is not None and any(
            s.get("action_type") == "read_file"
            and s.get("step_number", 0) > test_read_step
            and any(bf in (s.get("action_path") or "") for bf in bug_files)
            for s in trajectory_steps
        )
        test_informed_navigation = src_read_after_test

        # search_before_navigate: used search_code before reading source files
        search_steps = [s for s in trajectory_steps if s.get("action_type") == "search_code"]
        first_src_read = min(
            (s.get("step_number", 99) for s in trajectory_steps
             if s.get("action_type") == "read_file"
             and any(bf in (s.get("action_path") or "") for bf in bug_files)),
            default=None
        )
        search_before_navigate = bool(search_steps) and (
            first_src_read is None or
            any(s.get("step_number", 99) < first_src_read for s in search_steps)
        )

        # submit_after_test: ran tests before submitting
        test_runs = [s for s in trajectory_steps if s.get("action_type") == "run_tests"]
        submit_step = next(
            (s.get("step_number", 99) for s in trajectory_steps
             if s.get("action_type") == "submit"), None
        )
        submit_after_test = bool(test_runs) and submit_step is not None and any(
            s.get("step_number", 0) < submit_step for s in test_runs
        )

        # ── Guessing vs understanding indicators ──────────────────────────────
        guessing_indicators = []
        understanding_indicators = []

        total = len(trajectory_steps)

        # Guessing: short episode with low score
        if total <= 3 and final_score < 0.5:
            guessing_indicators.append(f"Submitted in only {total} steps with score {final_score:.2f}")

        # Guessing: wrote without reading
        if not read_before_write:
            guessing_indicators.append("Wrote to file(s) without first reading them")

        # Guessing: skipped test files
        if not any(n.was_read for n in expected_chain if n.role == "test"):
            guessing_indicators.append("Never read any test files")

        # Guessing: skipped source files
        if not any(n.was_read for n in expected_chain if n.role == "root_cause"):
            guessing_indicators.append("Never read the bug/source file")

        # Understanding: search used
        if search_steps:
            understanding_indicators.append(
                f"Used search_code {len(search_steps)}× to locate bug"
            )

        # Understanding: read tests first
        if chain_order_score > 0.7:
            understanding_indicators.append("Read tests before source files (correct causal order)")

        # Understanding: tested before submitting
        if submit_after_test:
            understanding_indicators.append("Verified fix with run_tests before submitting")

        # Understanding: explored full chain
        if actual_chain_coverage > 0.7:
            understanding_indicators.append(
                f"Covered {actual_chain_coverage:.0%} of expected causal chain"
            )

        # ── Diagnostics ───────────────────────────────────────────────────────
        # False confidence: submitted very early without testing
        false_confidence_detected = (
            submit_step is not None and submit_step <= 3 and not test_runs
        )

        # Shortcut learning: read test → immediate write → submit (skipped source)
        has_write = "write_file" in action_types
        has_src_read = any(
            bf in (s.get("action_path") or "")
            for s in trajectory_steps
            if s.get("action_type") == "read_file"
            for bf in bug_files
        )
        shortcut_sequence = has_write and not has_src_read
        shortcut_learning_detected = shortcut_sequence

        # ── Composite causal score ────────────────────────────────────────────
        scores = {
            "chain_coverage": actual_chain_coverage * 0.30,
            "chain_order": chain_order_score * 0.25,
            "read_before_write": (0.15 if read_before_write else 0.0),
            "test_informed": (0.15 if test_informed_navigation else 0.0),
            "verified": (0.10 if submit_after_test else 0.0),
            "searched": (0.05 if search_before_navigate else 0.0),
        }
        causal_score = sum(scores.values())
        causal_score = max(0.0, min(1.0, causal_score))

        # ── Understanding level classification ────────────────────────────────
        if causal_score >= 0.75:
            level = CausalUnderstandingLevel.DEEP
        elif causal_score >= 0.45:
            level = CausalUnderstandingLevel.PARTIAL
        elif causal_score >= 0.20:
            level = CausalUnderstandingLevel.SUPERFICIAL
        else:
            level = CausalUnderstandingLevel.RANDOM

        # ── Explanation ───────────────────────────────────────────────────────
        level_explanations = {
            CausalUnderstandingLevel.DEEP: (
                "Agent demonstrated genuine causal reasoning: read tests to understand expected "
                "behavior, traced the call chain to the root cause, made a targeted fix, and "
                "verified with tests before submitting."
            ),
            CausalUnderstandingLevel.PARTIAL: (
                "Agent showed partial causal understanding. Some chain links were traversed "
                "but the reasoning was incomplete — likely missed tracing deeper dependencies "
                "or skipped test verification."
            ),
            CausalUnderstandingLevel.SUPERFICIAL: (
                "Agent showed superficial reasoning. Actions did not follow a clear causal "
                "path from test → failure → root cause. Likely pattern-matched on filenames "
                "or guessed the fix location."
            ),
            CausalUnderstandingLevel.RANDOM: (
                "Agent showed no discernible causal reasoning. Actions appear random relative "
                "to the causal structure of the bug. This is the profile of pure trial-and-error."
            ),
        }
        explanation = level_explanations[level]

        # ── Recommendations ───────────────────────────────────────────────────
        recs = []
        if not any(n.was_read for n in expected_chain if n.role == "test"):
            recs.append("Always read the failing test first — it defines the expected behavior.")
        if not read_before_write:
            recs.append("Never write to a file before reading it — blind writes cause more bugs.")
        if not submit_after_test:
            recs.append("Run tests after every write to verify your fix is correct.")
        if not search_steps:
            recs.append("Use search_code to find function definitions before navigating blindly.")
        if actual_chain_coverage < 0.5:
            recs.append(
                "Explore more of the causal chain. The bug's root cause may be deeper than the first file."
            )
        if not recs:
            recs.append("Excellent reasoning! Maintain this systematic approach.")

        return CausalProbeReport(
            episode_id=episode_id,
            task=task,
            understanding_level=level,
            causal_score=causal_score,
            expected_chain=expected_chain,
            actual_chain_coverage=actual_chain_coverage,
            chain_order_score=chain_order_score,
            read_before_write=read_before_write,
            test_informed_navigation=test_informed_navigation,
            search_before_navigate=search_before_navigate,
            submit_after_test=submit_after_test,
            guessing_indicators=guessing_indicators,
            understanding_indicators=understanding_indicators,
            false_confidence_detected=false_confidence_detected,
            shortcut_learning_detected=shortcut_learning_detected,
            explanation=explanation,
            recommendations=recs,
        )
+ )
server/confidence_calibrator.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/confidence_calibrator.py
2
+ """
3
+ Confidence Calibration Engine โ€” v4.0
4
+
5
+ The key scientific question: Is the agent calibrated?
6
+ An agent is calibrated when its certainty level (inferred from behavior)
7
+ matches its likelihood of being correct.
8
+
9
+ Since agents don't expose probability distributions directly, we infer
10
+ confidence from behavioral proxies:
11
+ - How quickly did it commit to a hypothesis (read โ†’ write speed)?
12
+ - How much did it re-explore after writing (re-reads after write)?
13
+ - Did it verify (run_tests) before submitting?
14
+ - How many steps did it spend before the first write?
15
+
16
+ We then compare inferred confidence to actual accuracy (final_score).
17
+ Overconfident agents submit fast but score poorly.
18
+ Underconfident agents explore extensively but still score well.
19
+ Well-calibrated agents: confidence โˆ accuracy.
20
+
21
+ This is NOT measured by any existing benchmark or tracing tool.
22
+ """
23
+ from __future__ import annotations
24
+ import math
25
+ from typing import List, Dict, Any, Optional
26
+ from dataclasses import dataclass, field
27
+ from enum import Enum
28
+
29
+
30
class CalibrationProfile(str, Enum):
    """Overall calibration verdict for an episode.

    WELL_CALIBRATED — confidence ≈ accuracy
    OVERCONFIDENT   — high confidence, low accuracy
    UNDERCONFIDENT  — low confidence, high accuracy
    ERRATIC         — confidence changes randomly
    """

    WELL_CALIBRATED = "WELL_CALIBRATED"
    OVERCONFIDENT = "OVERCONFIDENT"
    UNDERCONFIDENT = "UNDERCONFIDENT"
    ERRATIC = "ERRATIC"
35
+
36
+
37
@dataclass
class ConfidenceSample:
    """Inferred confidence at one point in the trajectory."""

    step: int
    action_type: str
    # Behavioral-proxy confidence in [0.0, 1.0].
    inferred_confidence: float
    # test_pass_rate at this step, when known.
    actual_accuracy: Optional[float]
    # |confidence - accuracy| when both are known.
    calibration_error: Optional[float]
45
+
46
+
47
@dataclass
class CalibrationReport:
    """Full confidence calibration analysis."""

    episode_id: str
    task: str

    profile: CalibrationProfile
    calibration_score: float  # 1.0 = perfectly calibrated

    # Inferred overall confidence level (behavioral proxy)
    inferred_confidence: float  # 0.0–1.0
    actual_performance: float  # final_score

    # Decomposed signals
    commitment_speed: float  # 0 = slow/careful, 1 = fast commitment
    re_exploration_rate: float  # reads after first write / total reads
    verification_rate: float  # run_tests per write_file
    submit_speed: float  # submit step / max_steps (early = overconfident)

    # Trajectory of inferred confidence
    confidence_trajectory: List[ConfidenceSample]

    # Calibration error
    expected_calibration_error: float  # mean(|conf - acc|) where acc is known
    confidence_accuracy_correlation: float  # should be high for good agents

    diagnosis: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; float scores rounded to 3 dp."""

        def _opt(value):
            # Round optional floats, passing None through unchanged.
            return round(value, 3) if value is not None else None

        trajectory = [
            {
                "step": sample.step,
                "action": sample.action_type,
                "confidence": round(sample.inferred_confidence, 3),
                "accuracy": _opt(sample.actual_accuracy),
                "error": _opt(sample.calibration_error),
            }
            for sample in self.confidence_trajectory
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "profile": self.profile.value,
            "calibration_score": round(self.calibration_score, 3),
            "inferred_confidence": round(self.inferred_confidence, 3),
            "actual_performance": round(self.actual_performance, 3),
            "signals": {
                "commitment_speed": round(self.commitment_speed, 3),
                "re_exploration_rate": round(self.re_exploration_rate, 3),
                "verification_rate": round(self.verification_rate, 3),
                "submit_speed": round(self.submit_speed, 3),
            },
            "expected_calibration_error": round(self.expected_calibration_error, 3),
            "confidence_accuracy_correlation": round(self.confidence_accuracy_correlation, 3),
            "confidence_trajectory": trajectory,
            "diagnosis": self.diagnosis,
            "recommendations": self.recommendations,
        }
+ }
105
+
106
+
107
class ConfidenceCalibrator:
    """
    Infers behavioral confidence and compares it to actual performance.

    Confidence proxy model:
        - Reading files  = low confidence (still exploring)
        - Writing files  = medium-high confidence (committed to hypothesis)
        - Running tests  = verification (moderate, checking own hypothesis)
        - Submitting     = maximum commitment (fully confident)

    Each action type has a confidence weight (see ACTION_CONFIDENCE); the
    calibrator tracks how this evolves over the trajectory and scores how
    well inferred confidence matches observed accuracy.
    """

    # Behavioral confidence weight per action type, in [0, 1].
    ACTION_CONFIDENCE = {
        "read_file": 0.2,     # exploring, uncertain
        "search_code": 0.3,   # slightly more directed
        "run_tests": 0.6,     # confident enough to test
        "write_file": 0.75,   # committed to hypothesis
        "submit": 1.0,        # maximum confidence
    }

    def calibrate(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        final_score: float,
        max_steps: int = 20,
    ) -> CalibrationReport:
        """Compute the full calibration report for one episode.

        Args:
            episode_id: Identifier of the analyzed episode.
            task: Task name.
            trajectory_steps: Recorded steps; each dict may carry
                "action_type", "step_number", "action_path", "test_pass_rate".
            final_score: Actual end-of-episode performance in [0, 1].
            max_steps: Episode step budget, used for the submit-speed signal.

        Returns:
            CalibrationReport with profile, scores, signals and diagnosis.
        """
        if not trajectory_steps:
            return self._empty_report(episode_id, task, final_score)

        action_types = [s.get("action_type", "read_file") for s in trajectory_steps]
        total_steps = len(trajectory_steps)

        # ── Build confidence trajectory ──────────────────────────────────────
        # (BUGFIX: dropped an unused `running_conf` local from the original.)
        confidence_traj: List[ConfidenceSample] = []

        for s in trajectory_steps:
            atype = s.get("action_type", "read_file")
            base_conf = self.ACTION_CONFIDENCE.get(atype, 0.3)

            # Confidence grows as the episode progresses.
            step_n = s.get("step_number", 1)
            progress_bonus = (step_n / max(total_steps, 1)) * 0.1

            # Writes completed before this step raise confidence.
            step_write_count = sum(
                1 for s2 in trajectory_steps
                if s2.get("action_type") == "write_file"
                and s2.get("step_number", 99) < step_n
            )

            # Re-reads slightly lower confidence.
            # BUGFIX: only an earlier *read* of the same non-None path counts
            # as a re-read. The original matched any earlier step with an equal
            # action_path, so a prior write to the file — or two steps whose
            # paths were both None — incorrectly triggered the penalty.
            path = s.get("action_path")
            step_reread = (
                atype == "read_file"
                and path is not None
                and any(
                    s2.get("action_type") == "read_file"
                    and s2.get("action_path") == path
                    and s2.get("step_number", 0) < step_n
                    for s2 in trajectory_steps
                )
            )
            reread_penalty = -0.1 if step_reread else 0.0

            # After a write, confidence should be higher (capped at +0.15).
            post_write_bonus = min(0.15, step_write_count * 0.05)

            inferred = min(1.0, max(0.0,
                base_conf + progress_bonus + post_write_bonus + reread_penalty
            ))

            # Actual accuracy at this step if test_pass_rate is known.
            actual_acc = s.get("test_pass_rate")
            calib_err = abs(inferred - actual_acc) if actual_acc is not None else None

            confidence_traj.append(ConfidenceSample(
                step=step_n,
                action_type=atype,
                inferred_confidence=inferred,
                actual_accuracy=actual_acc,
                calibration_error=calib_err,
            ))

        # ── Behavioral signal computation ────────────────────────────────────
        total = max(total_steps, 1)

        # Commitment speed: how many reads before the first write?
        read_steps = [i for i, a in enumerate(action_types) if a == "read_file"]
        write_steps = [i for i, a in enumerate(action_types) if a == "write_file"]
        submit_step = next(
            (s.get("step_number", total) for s in trajectory_steps if s.get("action_type") == "submit"),
            total,
        )

        if write_steps:
            reads_before_first_write = len([r for r in read_steps if r < write_steps[0]])
            # Few reads before writing = high commitment speed = overconfident.
            commitment_speed = max(0.0, 1.0 - reads_before_first_write / max(total, 1))
        else:
            commitment_speed = 0.0  # Never wrote = very cautious

        # Re-exploration rate: reads after the first write / total reads.
        if write_steps and read_steps:
            reads_after_write = len([r for r in read_steps if r > write_steps[0]])
            re_exploration_rate = reads_after_write / len(read_steps)
        else:
            re_exploration_rate = 0.0

        # Verification rate: run_tests per write.
        test_count = action_types.count("run_tests")
        write_count = action_types.count("write_file")
        verification_rate = test_count / max(write_count, 1)

        # Submit speed: earlier submission = more overconfident.
        submit_speed = 1.0 - (submit_step / max(max_steps, 1))
        submit_speed = max(0.0, min(1.0, submit_speed))

        # ── Inferred overall confidence (weighted behavioral proxy) ──────────
        # confidence_traj is non-empty here (early return above), so the last
        # sample is always available.
        inferred_confidence = (
            commitment_speed * 0.30 +
            (1.0 - re_exploration_rate) * 0.15 +
            verification_rate * 0.15 +
            submit_speed * 0.20 +
            confidence_traj[-1].inferred_confidence * 0.20
        )
        inferred_confidence = min(1.0, max(0.0, inferred_confidence))

        # ── Calibration error (where we have both conf + acc) ────────────────
        calib_errors = [
            s.calibration_error for s in confidence_traj
            if s.calibration_error is not None
        ]
        ece = sum(calib_errors) / len(calib_errors) if calib_errors else abs(inferred_confidence - final_score)

        # ── Confidence-accuracy correlation ──────────────────────────────────
        paired = [
            (s.inferred_confidence, s.actual_accuracy)
            for s in confidence_traj
            if s.actual_accuracy is not None
        ]
        if len(paired) >= 2:
            corr = self._pearson_r([p[0] for p in paired], [p[1] for p in paired])
        else:
            # Fallback: use the final point only.
            conf_err = abs(inferred_confidence - final_score)
            corr = 1.0 - conf_err * 2

        corr = max(-1.0, min(1.0, corr))

        # ── Calibration score: half confidence accuracy, half correlation ────
        calibration_score = max(0.0, 1.0 - ece) * 0.5 + max(0.0, corr) * 0.5
        calibration_score = max(0.0, min(1.0, calibration_score))

        # ── Profile classification ───────────────────────────────────────────
        # BUGFIX: the original ended with an unreachable `else: ERRATIC` arm —
        # the first three conditions exhaust all real values of conf_diff.
        # ERRATIC is reserved for empty trajectories (see _empty_report).
        conf_diff = inferred_confidence - final_score
        if abs(conf_diff) <= 0.2:
            profile = CalibrationProfile.WELL_CALIBRATED
        elif conf_diff > 0.2:
            profile = CalibrationProfile.OVERCONFIDENT
        else:
            profile = CalibrationProfile.UNDERCONFIDENT

        # ── Diagnosis ────────────────────────────────────────────────────────
        # ERRATIC entry kept defensively even though calibrate() no longer
        # produces that profile.
        diagnoses = {
            CalibrationProfile.WELL_CALIBRATED: (
                f"Agent is well-calibrated: inferred confidence ({inferred_confidence:.2f}) "
                f"closely matches actual performance ({final_score:.2f}). "
                "This indicates genuine self-awareness — the agent commits when ready and "
                "explores when uncertain."
            ),
            CalibrationProfile.OVERCONFIDENT: (
                f"Agent is overconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"significantly exceeds actual performance ({final_score:.2f}). "
                "Agent committed to a hypothesis too early, skipped verification, "
                "or submitted without adequate exploration. This is the profile of agents "
                "that 'feel certain but are wrong'."
            ),
            CalibrationProfile.UNDERCONFIDENT: (
                f"Agent is underconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"is well below actual performance ({final_score:.2f}). "
                "Agent explored far more than necessary, re-read files unnecessarily, "
                "or hesitated to commit despite having the right information. "
                "This wastes compute and steps without improving accuracy."
            ),
            CalibrationProfile.ERRATIC: (
                "Agent calibration is erratic — confidence signals are inconsistent "
                "with behavior. The agent may be applying a rigid strategy regardless "
                "of the task difficulty."
            ),
        }

        recs = []
        if profile == CalibrationProfile.OVERCONFIDENT:
            recs.append("Read more files before writing — commit only when you've seen the full causal chain.")
            recs.append("Always run_tests after writing — don't trust your fix without verification.")
        elif profile == CalibrationProfile.UNDERCONFIDENT:
            recs.append("Commit to hypotheses earlier — excessive re-reading wastes steps.")
            recs.append("After reading tests + source files, write your fix. Stop re-reading.")
        if verification_rate < 0.5:
            recs.append("Increase test verification rate: run_tests after each write.")
        if re_exploration_rate > 0.5:
            recs.append("High re-exploration after writing suggests uncalibrated hypothesis formation.")

        return CalibrationReport(
            episode_id=episode_id,
            task=task,
            profile=profile,
            calibration_score=calibration_score,
            inferred_confidence=inferred_confidence,
            actual_performance=final_score,
            commitment_speed=commitment_speed,
            re_exploration_rate=re_exploration_rate,
            verification_rate=verification_rate,
            submit_speed=submit_speed,
            confidence_trajectory=confidence_traj,
            expected_calibration_error=ece,
            confidence_accuracy_correlation=corr,
            diagnosis=diagnoses[profile],
            recommendations=recs,
        )

    def _pearson_r(self, xs: List[float], ys: List[float]) -> float:
        """Pearson correlation of two equal-length samples; 0.0 if degenerate."""
        n = len(xs)
        if n < 2:
            return 0.0
        mx, my = sum(xs) / n, sum(ys) / n
        num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
        dx = math.sqrt(sum((x - mx) ** 2 for x in xs))
        dy = math.sqrt(sum((y - my) ** 2 for y in ys))
        if dx * dy == 0:
            # Zero variance in either sample — correlation undefined.
            return 0.0
        return num / (dx * dy)

    def _empty_report(self, episode_id: str, task: str, final_score: float) -> CalibrationReport:
        """Degenerate ERRATIC report for episodes with no trajectory data."""
        return CalibrationReport(
            episode_id=episode_id, task=task,
            profile=CalibrationProfile.ERRATIC,
            calibration_score=0.0,
            inferred_confidence=0.0, actual_performance=final_score,
            commitment_speed=0.0, re_exploration_rate=0.0,
            verification_rate=0.0, submit_speed=0.0,
            confidence_trajectory=[],
            expected_calibration_error=1.0,
            confidence_accuracy_correlation=0.0,
            diagnosis="No trajectory data.", recommendations=[],
        )
server/counterfactual_engine.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/counterfactual_engine.py
2
+ """
3
+ Counterfactual Robustness Engine โ€” v4.0
4
+
5
+ The key scientific question: Is the agent's strategy robust, or is it brittle?
6
+
7
+ We test this by:
8
+ 1. Running an episode โ†’ recording strategy
9
+ 2. Applying small, semantically-neutral mutations to the repo
10
+ (rename variable, change a constant, add a dummy function)
11
+ 3. Measuring whether the agent's recorded strategy would fail on the mutated repo
12
+
13
+ IMPORTANT: This does NOT re-run the agent. It analyzes whether the
14
+ already-recorded navigation pattern was based on deep structure (robust)
15
+ or surface signals like filenames/constants (brittle).
16
+
17
+ This is completely novel โ€” no benchmark or tool does this.
18
+ """
19
+ from __future__ import annotations
20
+ import random
21
+ import hashlib
22
+ from typing import List, Dict, Any, Tuple
23
+ from dataclasses import dataclass, field
24
+ from enum import Enum
25
+
26
+
27
+ class BrittlenessLevel(str, Enum):
28
+ ROBUST = "ROBUST" # Strategy survives all mutations
29
+ MILDLY_BRITTLE = "MILDLY_BRITTLE" # Survives 60-80% of mutations
30
+ BRITTLE = "BRITTLE" # Survives < 60%
31
+ FRAGILE = "FRAGILE" # Survives < 30%
32
+
33
+
34
@dataclass
class Mutation:
    """A single counterfactual mutation applied to the repo."""
    mutation_type: str        # One of the MUTATION_TEMPLATES "type" labels
    target_file: str          # File the mutation notionally targets
    description: str          # Human-readable description of the change
    would_break_agent: bool   # Would this mutation cause agent's strategy to fail?
    why: str                  # Explanation
+
43
+
44
@dataclass
class CounterfactualReport:
    """Results of counterfactual robustness testing for one episode."""
    episode_id: str
    task: str
    brittleness_level: BrittlenessLevel
    robustness_score: float  # 0.0 – 1.0

    mutations_tested: List[Mutation]
    mutations_survived: int
    mutations_failed: int

    surface_dependencies: List[str]  # Surface signals the agent relied on
    deep_dependencies: List[str]     # Structural signals it used correctly

    explanation: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Flatten the report into a JSON-friendly dict."""
        mutation_rows = []
        for mut in self.mutations_tested:
            mutation_rows.append({
                "type": mut.mutation_type,
                "file": mut.target_file,
                "description": mut.description,
                "would_break_agent": mut.would_break_agent,
                "why": mut.why,
            })

        payload = {
            "episode_id": self.episode_id,
            "task": self.task,
            "brittleness_level": self.brittleness_level.value,
            "robustness_score": round(self.robustness_score, 3),
            "mutations_tested": len(self.mutations_tested),
            "mutations_survived": self.mutations_survived,
            "mutations_failed": self.mutations_failed,
            "mutations": mutation_rows,
            "surface_dependencies": self.surface_dependencies,
            "deep_dependencies": self.deep_dependencies,
            "explanation": self.explanation,
            "recommendations": self.recommendations,
        }
        return payload
86
+
87
+
88
class CounterfactualEngine:
    """
    Analyzes brittleness by reasoning about what mutations would break the agent.

    We don't need to actually re-run the agent — we analyze the recorded
    trajectory and ask: "If file X was named differently / had a different
    constant, would this agent's navigation pattern still work?"

    Brittle signals:
        - Agent found bug file by pattern-matching on filename (not content search)
        - Agent submitted after reading the same file every run
        - Agent ignored test content and relied on positional heuristics

    Robust signals:
        - Agent used search_code to find function by name
        - Agent read test -> traced import -> found source
        - Agent ran tests and verified result before submitting
    """

    # Catalog of semantically-neutral repo mutations reasoned about below.
    MUTATION_TEMPLATES = [
        {
            "type": "FILENAME_RENAME",
            "description": "Rename src/X.py to src/X_v2.py (same content)",
            "breaks_if": "agent found file by name pattern, not by search or import tracing",
            "surface_signal": "filename",
            "robust_signal": "import tracing or search_code",
        },
        {
            "type": "CONSTANT_CHANGE",
            "description": "Change a numeric constant by ±1 (semantically neutral for navigation)",
            "breaks_if": "agent hardcoded expected value rather than reading actual code",
            "surface_signal": "constant value pattern matching",
            "robust_signal": "dynamic code reading",
        },
        {
            "type": "DUMMY_FUNCTION",
            "description": "Add a dummy function with a similar name near the bug",
            "breaks_if": "agent used first-match navigation without reading full context",
            "surface_signal": "first result of search or first match in file",
            "robust_signal": "reading complete function signatures before deciding",
        },
        {
            "type": "DIRECTORY_SHUFFLE",
            "description": "Move test file from tests/ to test/ (same content)",
            "breaks_if": "agent hardcoded path prefix tests/ instead of searching",
            "surface_signal": "hardcoded directory prefix",
            "robust_signal": "search or dynamic discovery",
        },
        {
            "type": "DOCSTRING_NOISE",
            "description": "Add misleading docstring claiming a different function causes the bug",
            "breaks_if": "agent read docs instead of tests to understand expected behavior",
            "surface_signal": "docstring content",
            "robust_signal": "test assertions as ground truth",
        },
        {
            "type": "IMPORT_REORDER",
            "description": "Reorder imports in the source file",
            "breaks_if": "agent relied on line numbers instead of function names",
            "surface_signal": "absolute line numbers",
            "robust_signal": "function name search",
        },
    ]

    def analyze(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: dict,
        files_read: List[str],
        files_written: List[str],
        final_score: float,
    ) -> CounterfactualReport:
        """
        Analyze robustness by simulating mutations and reasoning about
        whether the agent's recorded pattern would survive them.

        Args:
            episode_id: Episode identifier.
            task: Task name.
            trajectory_steps: Recorded steps ("action_type", "step_number",
                "action_path").
            variant_meta: Variant metadata ("bug_files" / "files_to_implement",
                "test_files").
            files_read: Paths the agent read during the episode.
            files_written: Paths the agent wrote (currently unused; kept for
                interface stability).
            final_score: Final episode score (not used by the heuristics).
        """
        # (BUGFIX: dropped unused locals `action_paths` and `used_run_tests`
        # that the original computed but never read.)
        action_types = [s.get("action_type", "") for s in trajectory_steps]

        bug_files = set(variant_meta.get("bug_files", []) or
                        variant_meta.get("files_to_implement", []) or [])
        test_files_meta = set(variant_meta.get("test_files", []) or [])

        # Infer which signals the agent used.
        used_search = "search_code" in action_types
        used_tests_first = self._tests_read_before_src(trajectory_steps, test_files_meta, bug_files)
        blind_navigation = not used_search and not used_tests_first
        read_count = action_types.count("read_file")
        write_count = action_types.count("write_file")
        immediate_write = write_count > 0 and action_types.index("write_file") <= 2
        verified_before_submit = self._verified_before_submit(trajectory_steps)

        # ── Evaluate each mutation ───────────────────────────────────────────
        mutations: List[Mutation] = []

        for tmpl in self.MUTATION_TEMPLATES:
            target_file = self._pick_target_file(tmpl["type"], files_read, bug_files)
            would_break, why = self._would_break_agent(
                mutation_type=tmpl["type"],
                used_search=used_search,
                used_tests_first=used_tests_first,
                blind_navigation=blind_navigation,
                read_count=read_count,
            )
            mutations.append(Mutation(
                mutation_type=tmpl["type"],
                target_file=target_file or "unknown",
                description=tmpl["description"],
                would_break_agent=would_break,
                why=why,
            ))

        survived = sum(1 for m in mutations if not m.would_break_agent)
        failed = len(mutations) - survived
        robustness_score = survived / len(mutations) if mutations else 0.0

        # ── Surface vs deep dependency analysis ──────────────────────────────
        surface_deps = []
        deep_deps = []

        if not used_search:
            surface_deps.append("Filename-based navigation (no search_code used)")
        if not used_tests_first:
            surface_deps.append("Skipped test-informed navigation")
        if immediate_write:
            surface_deps.append("Immediate write after minimal reading (blind fix)")
        if not verified_before_submit:
            surface_deps.append("Submitted without running tests (no verification)")

        if used_search:
            deep_deps.append("Used search_code to find functions by name (content-based)")
        if used_tests_first:
            deep_deps.append("Read tests first — used expected behavior as compass")
        if read_count >= 3:
            deep_deps.append(f"Read {read_count} files — explored structure before committing")
        if verified_before_submit:
            deep_deps.append("Verified fix with run_tests before submitting")

        # ── Brittleness classification ───────────────────────────────────────
        if robustness_score >= 0.80:
            level = BrittlenessLevel.ROBUST
        elif robustness_score >= 0.60:
            level = BrittlenessLevel.MILDLY_BRITTLE
        elif robustness_score >= 0.30:
            level = BrittlenessLevel.BRITTLE
        else:
            level = BrittlenessLevel.FRAGILE

        explanations = {
            BrittlenessLevel.ROBUST: (
                "Agent strategy is robust. It relies on deep structural signals (function names, "
                "test assertions, causal chain traversal) rather than surface patterns. "
                "Minor repo mutations would not break its navigation."
            ),
            BrittlenessLevel.MILDLY_BRITTLE: (
                "Agent strategy is mildly brittle. Some mutations would break its navigation, "
                "particularly those that change surface signals it relied on. "
                "Using search_code and test-first navigation consistently would improve robustness."
            ),
            BrittlenessLevel.BRITTLE: (
                "Agent strategy is brittle. Most mutations would break its navigation. "
                "The agent appears to rely on stable surface patterns (filenames, positions) "
                "rather than understanding the semantic structure of the codebase."
            ),
            BrittlenessLevel.FRAGILE: (
                "Agent strategy is fragile. Almost any perturbation to the repo structure "
                "would cause this agent to fail. This indicates pure pattern-matching on "
                "the specific repo layout rather than generalizable code understanding."
            ),
        }

        recs = []
        if not used_search:
            recs.append("Use search_code to find functions by name — survives filename renames.")
        if not used_tests_first:
            recs.append("Read tests first to anchor your navigation in expected behavior, not filenames.")
        if immediate_write:
            recs.append("Read source files before writing to them — avoid blind writes.")
        if not verified_before_submit:
            recs.append("Run tests after writing — verify your fix holds on the actual behavior.")

        return CounterfactualReport(
            episode_id=episode_id,
            task=task,
            brittleness_level=level,
            robustness_score=robustness_score,
            mutations_tested=mutations,
            mutations_survived=survived,
            mutations_failed=failed,
            surface_dependencies=surface_deps,
            deep_dependencies=deep_deps,
            explanation=explanations[level],
            recommendations=recs,
        )

    # ── Helpers ──────────────────────────────────────────────────────────────

    def _tests_read_before_src(
        self, steps: List[dict], test_files: set, bug_files: set
    ) -> bool:
        """True when the earliest test-file read precedes the earliest bug-file read."""
        test_steps = [
            s.get("step_number", 99) for s in steps
            if s.get("action_type") == "read_file"
            and any(tf in (s.get("action_path") or "") for tf in test_files)
        ]
        src_steps = [
            s.get("step_number", 99) for s in steps
            if s.get("action_type") == "read_file"
            and any(bf in (s.get("action_path") or "") for bf in bug_files)
        ]
        if test_steps and src_steps:
            return min(test_steps) < min(src_steps)
        return False

    def _verified_before_submit(self, steps: List[dict]) -> bool:
        """True when at least one run_tests step precedes the submit step."""
        submit_step = next(
            (s.get("step_number", 9999) for s in steps if s.get("action_type") == "submit"),
            None,
        )
        if submit_step is None:
            return False
        return any(
            s.get("action_type") == "run_tests"
            and s.get("step_number", 0) < submit_step
            for s in steps
        )

    def _pick_target_file(
        self, mutation_type: str, files_read: List[str], bug_files: set
    ) -> str:
        """Choose the file a mutation notionally targets. Always returns a str."""
        if mutation_type in ("FILENAME_RENAME", "DUMMY_FUNCTION", "IMPORT_REORDER"):
            # Prefer a known bug file; fall back to the first file read.
            bug_file = next(iter(bug_files), None)
            if bug_file is not None:
                return bug_file
            return files_read[0] if files_read else "src/main.py"
        if mutation_type == "DIRECTORY_SHUFFLE":
            for f in files_read:
                if "test" in f.lower():
                    return f
        # BUGFIX: the original fell off the end (implicitly returning None,
        # contradicting the `-> str` annotation) for CONSTANT_CHANGE and
        # DOCSTRING_NOISE; now every mutation type gets a string target.
        return files_read[0] if files_read else "unknown"

    def _would_break_agent(
        self,
        mutation_type: str,
        used_search: bool,
        used_tests_first: bool,
        blind_navigation: bool,
        read_count: int,
    ) -> Tuple[bool, str]:
        """
        Return (would_break, explanation) by reasoning about the agent's signals.

        (Cleanup: the original also accepted `verified_before_submit`,
        `immediate_write` and `tmpl`, none of which were used.)
        """
        if mutation_type == "FILENAME_RENAME":
            if used_search:
                return False, "Agent used search_code — finds function by name, not filename"
            if blind_navigation:
                return True, "Agent navigated by filename without search — rename breaks it"
            return True, "Agent likely relied on filename pattern without search fallback"

        if mutation_type == "CONSTANT_CHANGE":
            # Almost never breaks well-behaved agents.
            if read_count >= 2:
                return False, "Agent read files dynamically — adapts to any constant value"
            return True, "Agent may have hardcoded expected value in navigation heuristic"

        if mutation_type == "DUMMY_FUNCTION":
            if used_search and read_count >= 3:
                return False, "Agent searched and read thoroughly — would disambiguate"
            return True, "Agent took first match without thorough reading"

        if mutation_type == "DIRECTORY_SHUFFLE":
            if used_search:
                return False, "search_code finds tests regardless of directory"
            return True, "Agent used hardcoded path prefix — directory change breaks it"

        if mutation_type == "DOCSTRING_NOISE":
            if used_tests_first:
                return False, "Agent used test assertions as ground truth, not docstrings"
            return True, "Agent may have read misleading docstring instead of test"

        if mutation_type == "IMPORT_REORDER":
            # Only brittle if the agent relied on line numbers.
            if read_count <= 1:
                return True, "Agent skimmed — likely used line numbers for navigation"
            return False, "Agent read full files — import reorder doesn't change function content"

        return False, "Neutral mutation"
server/memory_bank.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server/memory_bank.py
2
+ """
3
+ Episodic Memory Bank โ€” v4.0
4
+
5
+ Cross-episode learning store for AI coding agents.
6
+
7
+ Every time an agent fails at a specific failure type, we store:
8
+ 1. The failure pattern (what actions led to it)
9
+ 2. The remediation hint (what should have been done)
10
+ 3. A compact "lesson" that can be injected into future prompts
11
+
12
+ The memory grows across episodes. When a new episode starts:
13
+ - We retrieve the most relevant past lessons (by task similarity)
14
+ - We inject them as a "memory context" into the agent's system prompt
15
+ - This creates a real self-improvement loop
16
+
17
+ This is NOT implemented in any current agent framework as an
18
+ environment-side primitive. Devin, Copilot, etc. start fresh every run.
19
+ """
20
+ from __future__ import annotations
21
+ import json
22
+ import time
23
+ import os
24
+ import hashlib
25
+ from typing import List, Dict, Any, Optional
26
+ from dataclasses import dataclass, field, asdict
27
+
28
+
29
@dataclass
class MemoryEntry:
    """One stored episode lesson, serializable for the JSON memory bank."""
    entry_id: str       # "{task}_{action hash}_{unix time}" (see EpisodicMemoryBank.store)
    episode_id: str
    task: str
    created_at: float   # Unix timestamp of storage

    # Failure details
    failure_type: str
    failure_evidence: str  # Truncated to 200 chars at store time
    score: float

    # Strategy used
    strategy: str
    action_sequence_hash: str  # Compact fingerprint of the action pattern

    # Lesson extracted
    lesson_title: str
    lesson_body: str       # Full explanation of what went wrong
    lesson_hint: str       # Compact hint to inject into future prompts
    lesson_plan: List[str]  # Step-by-step corrective plan

    # Retrieval metadata
    relevance_tags: List[str]  # Tags for retrieval (task1, write_file, read_before_write...)
    times_retrieved: int = 0
    times_helpful: int = 0  # Incremented when retry after this lesson improved score

    def to_dict(self) -> dict:
        """Serialize to a plain dict (JSON-safe) via dataclasses.asdict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "MemoryEntry":
        """Rebuild an entry from a dict previously produced by to_dict()."""
        return cls(**d)
63
+
64
+
65
@dataclass
class MemoryContext:
    """Injected memory context for a new episode (built by retrieve())."""
    relevant_lessons: List[MemoryEntry]   # Lessons selected for this episode
    system_prompt_injection: str          # Full text to prepend to system prompt
    user_context_injection: str           # Full text to prepend to first user message
    lessons_count: int                    # Number of lessons injected
    most_relevant_lesson: Optional[str]   # presumably the top lesson's title; None when empty — TODO confirm in retrieve()
73
+
74
+
75
class EpisodicMemoryBank:
    """
    Persistent cross-episode memory bank.

    Storage: JSON file on disk (or in-memory for Gradio sessions).
    Each entry is a MemoryEntry with lesson + retrieval metadata.

    Usage:
        bank = EpisodicMemoryBank(persist_path="memory.json")
        # After an episode:
        bank.store(episode_id=..., task=..., failure_type=..., failure_evidence=...,
                   score=..., strategy=..., trajectory_steps=[...])
        # Before next episode:
        context = bank.retrieve(task="task1", max_lessons=3)
        # Inject context.system_prompt_injection into agent
    """

    MAX_ENTRIES = 50  # Hard cap on stored lessons (see _trim)

    def __init__(self, persist_path: Optional[str] = None):
        """Create a bank; if persist_path exists, load previously saved lessons."""
        self.persist_path = persist_path
        self._entries: List[MemoryEntry] = []
        if persist_path and os.path.exists(persist_path):
            self._load()

    def store(
        self,
        episode_id: str,
        task: str,
        failure_type: str,
        failure_evidence: str,
        score: float,
        strategy: str,
        trajectory_steps: List[dict],
        improvement_plan: Optional[dict] = None,
    ) -> MemoryEntry:
        """Store a lesson from a completed episode and return the new entry.

        The lesson text comes from ``improvement_plan`` when available,
        otherwise from a canned default for ``failure_type``.
        """
        # Fingerprint the action sequence (first 12 actions) so similar
        # trajectories share a short, comparable hash.
        actions = [s.get("action_type", "?") for s in trajectory_steps]
        seq_str = "→".join(actions[:12])
        seq_hash = hashlib.md5(seq_str.encode()).hexdigest()[:8]

        # Relevance tags drive the scoring in retrieve().
        tags = [task, failure_type, strategy]
        if "read_file" in actions:
            tags.append("read_file")
        if "write_file" in actions:
            tags.append("write_file")
        if "run_tests" not in actions:
            tags.append("no_verification")
        if len(actions) <= 3:
            tags.append("too_short")

        # Extract lesson from improvement plan or fall back to defaults.
        if improvement_plan:
            lesson_title = improvement_plan.get("failure_type", failure_type)
            lesson_body = improvement_plan.get("what_went_wrong", "Agent failed.")
            lesson_hint = improvement_plan.get("system_prompt_addon", "")
            lesson_plan = improvement_plan.get("step_by_step_plan", [])
        else:
            lesson_title, lesson_body, lesson_hint, lesson_plan = self._default_lesson(
                failure_type, score, strategy
            )

        entry = MemoryEntry(
            entry_id=f"{task}_{seq_hash}_{int(time.time())}",
            episode_id=episode_id,
            task=task,
            created_at=time.time(),
            failure_type=failure_type,
            failure_evidence=failure_evidence[:200],  # cap stored evidence size
            score=score,
            strategy=strategy,
            action_sequence_hash=seq_hash,
            lesson_title=lesson_title,
            lesson_body=lesson_body,
            lesson_hint=lesson_hint,
            lesson_plan=lesson_plan,
            relevance_tags=tags,
            times_retrieved=0,
            times_helpful=0,
        )

        self._entries.append(entry)
        self._trim()
        if self.persist_path:
            self._save()
        return entry

    def retrieve(
        self,
        task: str,
        failure_type: Optional[str] = None,
        strategy: Optional[str] = None,
        max_lessons: int = 3,
    ) -> MemoryContext:
        """Retrieve the most relevant lessons for an upcoming episode.

        Relevance is a simple additive score: exact task match (+3) or task
        tag match (+2), failure-type match (+2), strategy match (+1), a small
        freshness penalty per prior retrieval (-0.1 each), and a boost for
        low-scoring (more informative) failures. Retrieved entries have their
        ``times_retrieved`` counter incremented.
        """
        if not self._entries:
            return self._empty_context()

        # Score each entry by relevance to the upcoming episode.
        scored: List[tuple[float, MemoryEntry]] = []
        for e in self._entries:
            score = 0.0
            if e.task == task:
                score += 3.0
            elif task in e.relevance_tags:
                score += 2.0
            if failure_type and e.failure_type == failure_type:
                score += 2.0
            if strategy and e.strategy == strategy:
                score += 1.0
            # Penalize already-retrieved lessons slightly (freshness)
            score -= e.times_retrieved * 0.1
            # Boost low-score lessons (more informative failures)
            score += max(0, 0.5 - e.score)
            scored.append((score, e))

        scored.sort(key=lambda x: -x[0])
        relevant = [e for _, e in scored[:max_lessons]]

        # Mark as retrieved
        for e in relevant:
            e.times_retrieved += 1

        if not relevant:
            return self._empty_context()

        # Build system-prompt injection text.
        sys_lines = [
            "🧠 AGENT MEMORY — LESSONS FROM PAST EPISODES",
            "=" * 50,
            "You have made these mistakes before. Do NOT repeat them.",
            "",
        ]
        for i, e in enumerate(relevant, 1):
            sys_lines += [
                f"[Lesson {i}] Task: {e.task} | Failure: {e.failure_type} | Score was: {e.score:.2f}",
                f"What went wrong: {e.lesson_body}",
                f"IMPORTANT: {e.lesson_hint}" if e.lesson_hint else "",
                "",
            ]
        sys_lines.append("=" * 50)
        # No element is ever None (missing hints produce ""), so join directly.
        system_injection = "\n".join(sys_lines)

        # Build user-context injection text (titles + abbreviated plans).
        user_lines = [
            "[MEMORY CONTEXT — Read before you act]",
        ]
        for i, e in enumerate(relevant, 1):
            user_lines.append(f"Past lesson {i}: {e.lesson_title}")
            if e.lesson_plan:
                user_lines.append("Correct approach:")
                user_lines.extend(f"  {step}" for step in e.lesson_plan[:4])
        user_injection = "\n".join(user_lines)

        return MemoryContext(
            relevant_lessons=relevant,
            system_prompt_injection=system_injection,
            user_context_injection=user_injection,
            lessons_count=len(relevant),
            most_relevant_lesson=relevant[0].lesson_title if relevant else None,
        )

    @staticmethod
    def _empty_context() -> MemoryContext:
        """Return a MemoryContext representing 'no relevant lessons found'."""
        return MemoryContext(
            relevant_lessons=[],
            system_prompt_injection="",
            user_context_injection="",
            lessons_count=0,
            most_relevant_lesson=None,
        )

    def get_all_entries(self) -> List[dict]:
        """Return every stored entry as a plain dict (for UI / export)."""
        return [e.to_dict() for e in self._entries]

    def get_stats(self) -> dict:
        """Return aggregate statistics over the stored lessons."""
        if not self._entries:
            return {"total_entries": 0, "tasks": {}}

        from collections import Counter
        failure_counts = Counter(e.failure_type for e in self._entries)
        task_counts = Counter(e.task for e in self._entries)
        avg_score = sum(e.score for e in self._entries) / len(self._entries)

        return {
            "total_entries": len(self._entries),
            "average_score_of_stored_episodes": round(avg_score, 3),
            "failure_breakdown": dict(failure_counts.most_common()),
            "tasks": dict(task_counts),
            "most_helpful_lesson": max(self._entries, key=lambda e: e.times_helpful).lesson_title
            if any(e.times_helpful > 0 for e in self._entries) else None,
        }

    def mark_helpful(self, episode_id: str):
        """Call this when a retry with a lesson improved the score."""
        for e in self._entries:
            if e.episode_id == episode_id:
                e.times_helpful += 1
        if self.persist_path:
            self._save()

    def clear(self, task: Optional[str] = None):
        """Forget lessons for one task, or everything when task is None."""
        if task:
            self._entries = [e for e in self._entries if e.task != task]
        else:
            self._entries = []
        if self.persist_path:
            self._save()

    # ── Persistence ─────────────────────────────────────────────────────────

    def _save(self):
        """Write all entries to persist_path as pretty-printed JSON."""
        with open(self.persist_path, "w") as f:
            json.dump([e.to_dict() for e in self._entries], f, indent=2)

    def _load(self):
        """Load entries from persist_path; on any error start with an empty bank."""
        try:
            with open(self.persist_path, "r") as f:
                data = json.load(f)
            self._entries = [MemoryEntry.from_dict(d) for d in data]
        except Exception:
            # Corrupt or unreadable file: deliberately best-effort, start fresh.
            self._entries = []

    def _trim(self):
        """Keep at most MAX_ENTRIES, dropping the least useful, oldest entries."""
        if len(self._entries) <= self.MAX_ENTRIES:
            return
        # Keep: most helpful first, then most retrieved, then newest.
        # BUG FIX: the previous version negated the keys AND passed
        # reverse=True, which double-negated the ordering and kept the
        # LEAST helpful entries instead.
        self._entries.sort(
            key=lambda e: (
                -e.times_helpful,
                -e.times_retrieved,
                -e.created_at,
            )
        )
        self._entries = self._entries[:self.MAX_ENTRIES]

    def _default_lesson(
        self, failure_type: str, score: float, strategy: str
    ) -> tuple[str, str, str, List[str]]:
        """Return (title, body, hint, plan) for a failure type with no plan."""
        lessons = {
            "NEVER_TESTED": (
                "Submitted without verification",
                "Agent submitted code without running tests. No confidence in correctness.",
                "CRITICAL: Run run_tests after EVERY write_file. Never submit without test verification.",
                ["1. Write fix", "2. run_tests to check", "3. If passing → submit", "4. If failing → re-read and fix"],
            ),
            "BLIND_WRITE": (
                "Wrote without reading",
                "Agent wrote to a file without reading it first. Blind writes introduce new bugs.",
                "NEVER use write_file before read_file on the same path.",
                ["1. read_file first", "2. Understand existing code", "3. Then write minimal fix"],
            ),
            "WRONG_FILE_NAVIGATION": (
                "Navigated to wrong files",
                "Agent read files unrelated to the bug. Wasted steps and missed root cause.",
                "ALWAYS start with the failing test file. Its imports show you exactly where to go.",
                ["1. Read failing test", "2. Find its imports", "3. Navigate ONLY there"],
            ),
            "LOOPING_BEHAVIOR": (
                "Read same files repeatedly",
                f"Agent looped reading the same files without progress. Score={score:.2f}.",
                "Each file may be read AT MOST ONCE. Use search_code if confused.",
                ["1. Use search_code with function name", "2. Read matched file — once", "3. commit to fix"],
            ),
        }
        defaults = lessons.get(failure_type, (
            f"{failure_type} failure",
            f"Agent failed with type '{failure_type}', score={score:.2f}.",
            "Read test → read source → fix → run_tests → submit.",
            ["1. read test", "2. read source", "3. write fix", "4. run_tests", "5. submit"],
        ))
        return defaults
351
+
352
+
353
# Module-level singleton: one memory bank shared by every Gradio session.
# Lessons persist to agent_memory.json one directory above this module.
_MEMORY_FILE = os.path.join(os.path.dirname(__file__), "..", "agent_memory.json")
_GLOBAL_MEMORY = EpisodicMemoryBank(persist_path=_MEMORY_FILE)


def get_global_memory() -> EpisodicMemoryBank:
    """Return the shared, process-wide EpisodicMemoryBank instance."""
    return _GLOBAL_MEMORY
static/viz3d.html CHANGED
@@ -6,862 +6,601 @@
6
  <title>Agent Trajectory 3D Visualizer</title>
7
  <style>
8
  * { margin: 0; padding: 0; box-sizing: border-box; }
9
- body {
 
10
  background: #0a0e1a;
11
  color: #e0e6f0;
12
  font-family: 'Segoe UI', system-ui, sans-serif;
13
  overflow: hidden;
14
- height: 100vh;
15
  }
16
- #canvas-container {
17
- position: absolute;
18
  top: 0; left: 0;
19
  width: 100%; height: 100%;
20
- }
21
- #ui-overlay {
22
- position: absolute;
23
- top: 0; left: 0;
24
- width: 100%; height: 100%;
25
- pointer-events: none;
26
- z-index: 10;
27
  }
28
  /* Header */
29
  #header {
30
- position: absolute;
31
- top: 12px; left: 50%;
32
  transform: translateX(-50%);
33
  text-align: center;
 
34
  pointer-events: none;
35
  }
36
  #header h1 {
37
- font-size: 16px;
38
- font-weight: 700;
39
  color: #7dd3fc;
40
  letter-spacing: 0.05em;
41
- text-shadow: 0 0 20px rgba(125,211,252,0.5);
42
- }
43
- #header p {
44
- font-size: 11px;
45
- color: #64748b;
46
- margin-top: 2px;
47
  }
48
- /* Legend */
49
- #legend {
50
- position: absolute;
51
- top: 12px; right: 16px;
52
- background: rgba(10,14,26,0.85);
53
  border: 1px solid rgba(125,211,252,0.2);
54
- border-radius: 8px;
55
  padding: 10px 14px;
56
  font-size: 11px;
57
- pointer-events: none;
58
- }
59
- #legend h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
60
- .legend-item {
61
- display: flex; align-items: center; gap: 8px;
62
- margin-bottom: 5px;
63
  }
64
- .legend-dot {
65
- width: 10px; height: 10px;
66
- border-radius: 50%;
67
- flex-shrink: 0;
68
  }
69
  /* Info panel */
70
- #info-panel {
71
- position: absolute;
72
- top: 12px; left: 16px;
73
- background: rgba(10,14,26,0.85);
74
- border: 1px solid rgba(125,211,252,0.2);
75
- border-radius: 8px;
76
- padding: 12px 16px;
77
- min-width: 220px;
78
- pointer-events: none;
79
- }
80
- #info-panel h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; letter-spacing: 0.1em; }
81
- .info-row {
82
- display: flex; justify-content: space-between; gap: 12px;
83
- font-size: 11px;
84
- margin-bottom: 4px;
85
- color: #94a3b8;
86
  }
87
- .info-value { color: #e0e6f0; font-weight: 600; }
 
 
 
 
88
  /* Timeline */
89
- #timeline-panel {
90
- position: absolute;
91
- bottom: 20px; left: 50%;
92
  transform: translateX(-50%);
93
- background: rgba(10,14,26,0.9);
94
- border: 1px solid rgba(125,211,252,0.2);
95
- border-radius: 10px;
96
- padding: 14px 20px;
97
- width: min(700px, 90vw);
98
- pointer-events: all;
99
- }
100
- #timeline-panel .tl-header {
101
- display: flex;
102
- justify-content: space-between;
103
- align-items: center;
104
- margin-bottom: 10px;
105
- }
106
- #timeline-panel h3 {
107
- font-size: 11px;
108
- color: #7dd3fc;
109
- letter-spacing: 0.1em;
110
- }
111
- #step-label {
112
- font-size: 12px;
113
- color: #f0abfc;
114
- font-weight: 700;
115
  }
116
- #timeline-slider {
117
- width: 100%;
118
- -webkit-appearance: none;
119
- height: 4px;
 
120
  background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
121
- border-radius: 4px;
122
- outline: none;
123
- cursor: pointer;
124
- }
125
- #timeline-slider::-webkit-slider-thumb {
126
- -webkit-appearance: none;
127
- width: 16px; height: 16px;
128
- border-radius: 50%;
129
- background: #7dd3fc;
130
- cursor: pointer;
131
- box-shadow: 0 0 10px rgba(125,211,252,0.7);
132
  }
133
- #step-actions {
134
- display: flex;
135
- gap: 8px;
136
- margin-top: 10px;
137
- justify-content: center;
138
  }
139
- .tl-btn {
 
140
  background: rgba(125,211,252,0.1);
141
  border: 1px solid rgba(125,211,252,0.3);
142
- color: #7dd3fc;
143
- padding: 5px 14px;
144
- border-radius: 6px;
145
- cursor: pointer;
146
- font-size: 11px;
147
- transition: all 0.2s;
148
  }
149
- .tl-btn:hover { background: rgba(125,211,252,0.25); }
150
- .tl-btn.active { background: rgba(125,211,252,0.3); }
151
- /* Step log */
152
- #step-log {
153
- position: absolute;
154
- bottom: 130px; right: 16px;
155
- background: rgba(10,14,26,0.85);
156
- border: 1px solid rgba(125,211,252,0.2);
157
- border-radius: 8px;
158
- padding: 10px 14px;
159
- width: 260px;
160
- max-height: 240px;
161
- overflow-y: auto;
162
- pointer-events: none;
163
- font-size: 10px;
164
- }
165
- #step-log h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
166
- .log-entry {
167
- display: flex;
168
- align-items: flex-start;
169
- gap: 6px;
170
- margin-bottom: 6px;
171
- padding-bottom: 6px;
172
- border-bottom: 1px solid rgba(255,255,255,0.05);
173
- }
174
- .log-entry:last-child { border-bottom: none; }
175
- .log-step { color: #475569; min-width: 28px; }
176
- .log-action { font-weight: 600; }
177
- .log-reward { margin-left: auto; font-weight: 700; }
178
- .reward-pos { color: #4ade80; }
179
- .reward-neg { color: #f87171; }
180
- .reward-zero { color: #94a3b8; }
181
  /* Tooltip */
182
  #tooltip {
183
- position: absolute;
184
  background: rgba(10,14,26,0.95);
185
  border: 1px solid rgba(125,211,252,0.4);
186
- border-radius: 6px;
187
- padding: 8px 12px;
188
- font-size: 11px;
189
- pointer-events: none;
190
- opacity: 0;
191
- transition: opacity 0.15s;
192
- max-width: 200px;
193
- z-index: 20;
194
- }
195
- #tooltip h4 { color: #7dd3fc; margin-bottom: 4px; }
196
- /* Score ring */
197
- #score-ring {
198
- position: absolute;
199
- bottom: 130px; left: 16px;
200
- pointer-events: none;
201
  }
202
- #score-ring svg text { font-family: 'Segoe UI', sans-serif; }
203
  /* Loader */
204
  #loader {
205
- position: absolute;
206
- top: 50%; left: 50%;
207
- transform: translate(-50%, -50%);
208
- color: #7dd3fc;
209
- font-size: 14px;
210
- text-align: center;
211
  }
212
- .loader-spinner {
213
- width: 40px; height: 40px;
214
- border: 3px solid rgba(125,211,252,0.2);
215
  border-top-color: #7dd3fc;
216
  border-radius: 50%;
217
- animation: spin 0.8s linear infinite;
218
- margin: 0 auto 12px;
 
 
 
 
 
 
219
  }
220
- @keyframes spin { to { transform: rotate(360deg); } }
221
  </style>
222
  </head>
223
  <body>
224
 
225
- <!-- Hidden data injection point -->
226
- <div id="viz-data" style="display:none"></div>
227
 
228
- <div id="canvas-container">
229
- <canvas id="three-canvas"></canvas>
 
 
 
230
  </div>
231
 
232
- <div id="loader">
233
- <div class="loader-spinner"></div>
234
- <p>Initializing 3D Visualizer...</p>
235
- </div>
236
 
237
- <div id="ui-overlay">
238
- <!-- Header -->
239
- <div id="header">
240
- <h1>๐Ÿ” Agent Trajectory Visualizer โ€” 3D</h1>
241
- <p>Files = nodes ยท Dependencies = edges ยท Agent path = animated beam</p>
242
- </div>
 
 
 
243
 
244
- <!-- Info panel -->
245
- <div id="info-panel">
246
- <h3>EPISODE STATS</h3>
247
- <div class="info-row"><span>Task</span><span class="info-value" id="stat-task">โ€”</span></div>
248
- <div class="info-row"><span>Variant</span><span class="info-value" id="stat-variant">โ€”</span></div>
249
- <div class="info-row"><span>Steps</span><span class="info-value" id="stat-steps">โ€”</span></div>
250
- <div class="info-row"><span>Score</span><span class="info-value" id="stat-score">โ€”</span></div>
251
- <div class="info-row"><span>Strategy</span><span class="info-value" id="stat-strategy">โ€”</span></div>
252
- <div class="info-row"><span>Failure</span><span class="info-value" id="stat-failure">โ€”</span></div>
253
- </div>
254
 
255
- <!-- Legend -->
256
- <div id="legend">
257
- <h3>LEGEND</h3>
258
- <div class="legend-item">
259
- <div class="legend-dot" style="background:#f97316"></div><span>Source file</span>
260
- </div>
261
- <div class="legend-item">
262
- <div class="legend-dot" style="background:#3b82f6"></div><span>Test file</span>
263
- </div>
264
- <div class="legend-item">
265
- <div class="legend-dot" style="background:#a855f7"></div><span>Spec / Docs</span>
266
- </div>
267
- <div class="legend-item">
268
- <div class="legend-dot" style="background:#22c55e"></div><span>Visited</span>
269
- </div>
270
- <div class="legend-item">
271
- <div class="legend-dot" style="background:#ef4444"></div><span>Modified / Bug</span>
272
- </div>
273
- <div class="legend-item">
274
- <div class="legend-dot" style="background:#facc15; width:20px; height:4px; border-radius:2px;"></div><span>Agent path</span>
275
- </div>
276
- </div>
277
 
278
- <!-- Score ring -->
279
- <div id="score-ring">
280
- <svg width="80" height="80" viewBox="0 0 80 80">
281
- <circle cx="40" cy="40" r="34" fill="none"
282
- stroke="rgba(125,211,252,0.15)" stroke-width="6"/>
283
- <circle id="score-arc" cx="40" cy="40" r="34" fill="none"
284
- stroke="#7dd3fc" stroke-width="6"
285
- stroke-dasharray="0 214"
286
- stroke-linecap="round"
287
- transform="rotate(-90 40 40)"
288
- style="transition: stroke-dasharray 1s ease;"/>
289
- <text id="score-text" x="40" y="45" text-anchor="middle"
290
- fill="#e0e6f0" font-size="14" font-weight="700">0.0</text>
291
- </svg>
292
- </div>
293
 
294
- <!-- Step log -->
295
- <div id="step-log">
296
- <h3>STEP LOG</h3>
297
- <div id="log-entries"></div>
298
- </div>
299
 
300
- <!-- Tooltip -->
301
- <div id="tooltip">
302
- <h4 id="tooltip-title">File</h4>
303
- <div id="tooltip-body"></div>
 
304
  </div>
305
-
306
- <!-- Timeline -->
307
- <div id="timeline-panel">
308
- <div class="tl-header">
309
- <h3>TIMELINE REPLAY</h3>
310
- <span id="step-label">Step 0 / 0</span>
311
- </div>
312
- <input type="range" id="timeline-slider" min="0" max="0" value="0"
313
- oninput="onSliderChange(this.value)">
314
- <div id="step-actions">
315
- <button class="tl-btn" onclick="stepBack()">โ—€ Back</button>
316
- <button class="tl-btn" id="play-btn" onclick="togglePlay()">โ–ถ Play</button>
317
- <button class="tl-btn" onclick="stepForward()">Forward โ–ถ</button>
318
- <button class="tl-btn" onclick="resetView()">โ†บ Reset</button>
319
- <button class="tl-btn" id="orbit-btn" onclick="toggleOrbit()">๐Ÿ”„ Orbit</button>
320
- </div>
321
  </div>
322
  </div>
323
 
324
- <!-- Three.js from CDN -->
325
  <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
326
  <script>
327
- // โ”€โ”€ Sample data (replaced by real data from backend) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
328
- const DEFAULT_DATA = {
329
- task: "task1",
330
- variant_id: "variant_1",
331
- final_score: 0.714,
332
- strategy: "TARGETED_DEBUGGING",
333
- failure_type: "CORRECT",
334
- files: [
335
- { name: "tests/test_formatter.py", type: "test" },
336
- { name: "src/formatter.py", type: "src", is_bug_file: true },
337
- { name: "src/utils.py", type: "src" }
338
- ],
339
- dependencies: [
340
- { from: "tests/test_formatter.py", to: "src/formatter.py" },
341
- { from: "src/formatter.py", to: "src/utils.py" }
342
- ],
343
- steps: [
344
- { step: 1, action: "read_file", path: "tests/test_formatter.py", reward: 0.0 },
345
- { step: 2, action: "read_file", path: "src/formatter.py", reward: 0.05 },
346
- { step: 3, action: "search_code", path: null, reward: 0.0 },
347
- { step: 4, action: "run_tests", path: "tests/test_formatter.py", reward: 0.0 },
348
- { step: 5, action: "submit", path: null, reward: 0.694 }
349
- ]
350
- };
351
-
352
- // โ”€โ”€ Load data from injection point or use default โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
353
- function loadVizData() {
354
- const el = document.getElementById('viz-data');
355
- if (el && el.textContent.trim()) {
356
- try { return JSON.parse(el.textContent); } catch(e) {}
357
- }
358
- return DEFAULT_DATA;
359
- }
360
-
361
- // โ”€โ”€ Three.js setup โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
362
  const canvas = document.getElementById('three-canvas');
363
- const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: true });
364
- renderer.setSize(window.innerWidth, window.innerHeight);
365
  renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
366
  renderer.setClearColor(0x0a0e1a, 1);
367
 
 
 
 
 
 
 
 
 
368
  const scene = new THREE.Scene();
369
- const fov = 60;
370
- const camera = new THREE.PerspectiveCamera(fov, window.innerWidth / window.innerHeight, 0.1, 1000);
371
- camera.position.set(0, 8, 22);
372
  camera.lookAt(0, 0, 0);
 
373
 
374
- // Ambient + directional light
375
- scene.add(new THREE.AmbientLight(0x1a2040, 1));
376
- const dirLight = new THREE.DirectionalLight(0x7dd3fc, 0.6);
377
- dirLight.position.set(5, 10, 5);
378
- scene.add(dirLight);
379
 
380
  // Grid
381
- const grid = new THREE.GridHelper(40, 20, 0x1e293b, 0x1e293b);
382
- grid.position.y = -3;
383
  scene.add(grid);
384
 
385
  // Stars
386
- const starGeo = new THREE.BufferGeometry();
387
- const starCount = 800;
388
- const starPositions = new Float32Array(starCount * 3);
389
- for (let i = 0; i < starCount * 3; i++) starPositions[i] = (Math.random() - 0.5) * 200;
390
- starGeo.setAttribute('position', new THREE.BufferAttribute(starPositions, 3));
391
- const starMat = new THREE.PointsMaterial({ color: 0x334155, size: 0.3 });
392
- scene.add(new THREE.Points(starGeo, starMat));
393
-
394
- // โ”€โ”€ Color palette โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
395
- const COLORS = {
396
- src: 0xf97316,
397
- test: 0x3b82f6,
398
- spec: 0xa855f7,
399
- visited: 0x22c55e,
400
- modified: 0xef4444,
401
- bug: 0xef4444,
402
- edge: 0x334155,
403
- path: 0xfacc15,
404
- agent: 0xfbbf24,
405
- };
406
-
407
- // โ”€โ”€ Orbit control (manual implementation) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
408
- let isOrbiting = false;
409
- let orbitActive = false;
410
- let mouse = { x: 0, y: 0, down: false, lastX: 0, lastY: 0 };
411
- let spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
412
-
413
- canvas.addEventListener('mousedown', e => { mouse.down = true; mouse.lastX = e.clientX; mouse.lastY = e.clientY; });
414
- canvas.addEventListener('mouseup', () => { mouse.down = false; });
415
- canvas.addEventListener('mousemove', e => {
416
- if (!mouse.down) {
417
- // Hover for tooltip
418
- checkHover(e.clientX, e.clientY);
419
- return;
420
- }
421
- const dx = e.clientX - mouse.lastX;
422
- const dy = e.clientY - mouse.lastY;
423
- spherical.theta -= dx * 0.005;
424
- spherical.phi = Math.max(0.1, Math.min(Math.PI / 2, spherical.phi - dy * 0.005));
425
- mouse.lastX = e.clientX;
426
- mouse.lastY = e.clientY;
427
  });
428
  canvas.addEventListener('wheel', e => {
429
- spherical.r = Math.max(8, Math.min(50, spherical.r + e.deltaY * 0.02));
430
  });
431
 
432
  function updateCamera() {
433
- if (orbitActive) spherical.theta += 0.003;
434
- camera.position.x = spherical.r * Math.sin(spherical.phi) * Math.sin(spherical.theta);
435
- camera.position.y = spherical.r * Math.cos(spherical.phi);
436
- camera.position.z = spherical.r * Math.sin(spherical.phi) * Math.cos(spherical.theta);
 
 
 
437
  camera.lookAt(0, 0, 0);
438
  }
439
 
440
- // โ”€โ”€ Scene objects โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
441
- const nodeObjects = {}; // name โ†’ { mesh, label, position }
442
- const edgeObjects = [];
443
- const pathObjects = [];
444
- let agentSphere = null;
445
- let agentTrail = null;
446
- let currentStep = 0;
447
- let maxStep = 0;
448
- let playing = false;
449
- let playInterval = null;
450
  let vizData = null;
451
- let nodePositions = {};
 
 
 
 
 
 
 
 
 
 
 
452
 
453
- // โ”€โ”€ Build scene from data โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
454
  function buildScene(data) {
 
455
  vizData = data;
456
-
457
- // Clear previous objects
458
- Object.values(nodeObjects).forEach(o => scene.remove(o.mesh));
459
- edgeObjects.forEach(e => scene.remove(e));
460
- pathObjects.forEach(p => scene.remove(p));
461
- if (agentSphere) scene.remove(agentSphere);
462
- Object.keys(nodeObjects).forEach(k => delete nodeObjects[k]);
463
-
464
  const files = data.files || [];
465
  const n = files.length;
466
- if (n === 0) return;
467
-
468
- // Arrange files in a circular layout on XZ plane
469
- files.forEach((file, i) => {
470
- const angle = (i / n) * Math.PI * 2;
471
- const radius = Math.max(4, n * 0.9);
472
- const x = Math.cos(angle) * radius;
473
- const z = Math.sin(angle) * radius;
474
- const y = 0;
475
-
476
- nodePositions[file.name] = new THREE.Vector3(x, y, z);
477
-
478
- // Sphere geometry
479
- const geo = new THREE.SphereGeometry(0.6, 16, 16);
480
- const color = new THREE.Color(
481
- file.is_bug_file ? COLORS.bug :
482
- file.type === 'test' ? COLORS.test :
483
- file.type === 'spec' ? COLORS.spec : COLORS.src
484
- );
485
  const mat = new THREE.MeshPhongMaterial({
486
- color,
487
- emissive: color.clone().multiplyScalar(0.3),
488
- shininess: 60,
489
- transparent: true,
490
- opacity: 0.9,
491
  });
492
  const mesh = new THREE.Mesh(geo, mat);
493
- mesh.position.set(x, y, z);
494
- mesh.userData = { file };
495
  scene.add(mesh);
496
 
497
- // Glow ring
498
- const ringGeo = new THREE.RingGeometry(0.75, 0.85, 32);
499
- const ringMat = new THREE.MeshBasicMaterial({
500
- color,
501
- transparent: true,
502
- opacity: 0.25,
503
- side: THREE.DoubleSide,
504
- });
505
- const ring = new THREE.Mesh(ringGeo, ringMat);
506
  ring.rotation.x = Math.PI / 2;
507
  mesh.add(ring);
508
 
509
- nodeObjects[file.name] = { mesh, position: nodePositions[file.name], file };
510
  });
511
 
512
- // Draw dependency edges
513
  (data.dependencies || []).forEach(dep => {
514
- const fromPos = nodePositions[dep.from];
515
- const toPos = nodePositions[dep.to];
516
- if (!fromPos || !toPos) return;
517
-
518
- const points = [fromPos.clone(), toPos.clone()];
519
- const geo = new THREE.BufferGeometry().setFromPoints(points);
520
- const mat = new THREE.LineBasicMaterial({
521
- color: COLORS.edge,
522
- transparent: true,
523
- opacity: 0.4,
524
- });
525
  const line = new THREE.Line(geo, mat);
526
  scene.add(line);
527
- edgeObjects.push(line);
528
  });
529
 
530
- // Agent globe
531
- const agentGeo = new THREE.SphereGeometry(0.35, 16, 16);
532
- const agentMat = new THREE.MeshPhongMaterial({
533
- color: COLORS.agent,
534
- emissive: 0xfbbf24,
535
- emissiveIntensity: 0.8,
536
- shininess: 100,
537
- });
538
- agentSphere = new THREE.Mesh(agentGeo, agentMat);
539
- agentSphere.position.set(0, 3, 0); // Start above origin
540
- scene.add(agentSphere);
541
-
542
- // Update UI
543
- document.getElementById('stat-task').textContent = data.task || 'โ€”';
544
- document.getElementById('stat-variant').textContent = data.variant_id || 'โ€”';
545
- document.getElementById('stat-steps').textContent = (data.steps || []).length;
546
- document.getElementById('stat-strategy').textContent = data.strategy || 'โ€”';
547
- document.getElementById('stat-failure').textContent = data.failure_type || 'โ€”';
548
  updateScore(data.final_score || 0);
549
- updateStepLog(data.steps || [], -1);
550
 
551
- // Setup timeline
552
  maxStep = (data.steps || []).length;
553
- const slider = document.getElementById('timeline-slider');
554
- slider.max = maxStep;
555
- slider.value = 0;
556
- currentStep = 0;
557
- updateStepLabel(0, maxStep);
558
-
559
  applyStep(0);
560
  }
561
 
562
- // โ”€โ”€ Animation: go to a specific step โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
563
- function applyStep(stepIndex) {
564
  if (!vizData) return;
565
  const steps = vizData.steps || [];
566
- const visitedFiles = new Set();
567
- const modifiedFiles = new Set();
568
 
569
  // Reset all nodes
570
- Object.values(nodeObjects).forEach(obj => {
571
- const file = obj.file;
572
- const baseColor = new THREE.Color(
573
- file.is_bug_file ? COLORS.bug :
574
- file.type === 'test' ? COLORS.test :
575
- file.type === 'spec' ? COLORS.spec : COLORS.src
576
- );
577
- obj.mesh.material.color.set(baseColor);
578
- obj.mesh.material.emissive.set(baseColor.clone().multiplyScalar(0.2));
579
- obj.mesh.material.opacity = 0.5;
580
- obj.mesh.scale.set(1, 1, 1);
581
  });
582
 
583
  // Remove old path lines
584
- pathObjects.forEach(p => scene.remove(p));
585
- pathObjects.length = 0;
586
-
587
- // Collect positions for path up to current step
588
- const pathPositions = [];
589
-
590
- for (let i = 0; i < stepIndex; i++) {
591
- const step = steps[i];
592
- if (!step) continue;
593
-
594
- if (step.path && nodeObjects[step.path]) {
595
- const pos = nodeObjects[step.path].position.clone();
596
- pathPositions.push(pos.clone().add(new THREE.Vector3(0, 0.1, 0)));
597
-
598
- if (step.action === 'read_file') visitedFiles.add(step.path);
599
- if (step.action === 'write_file') modifiedFiles.add(step.path);
600
  }
 
 
601
  }
602
 
603
- // Color visited + modified
604
- visitedFiles.forEach(name => {
605
- if (nodeObjects[name]) {
606
- nodeObjects[name].mesh.material.color.set(COLORS.visited);
607
- nodeObjects[name].mesh.material.emissive.set(
608
- new THREE.Color(COLORS.visited).multiplyScalar(0.4)
609
- );
610
- nodeObjects[name].mesh.material.opacity = 1.0;
611
- nodeObjects[name].mesh.scale.set(1.2, 1.2, 1.2);
612
  }
613
  });
614
- modifiedFiles.forEach(name => {
615
- if (nodeObjects[name]) {
616
- nodeObjects[name].mesh.material.color.set(COLORS.modified);
617
- nodeObjects[name].mesh.material.emissive.set(
618
- new THREE.Color(COLORS.modified).multiplyScalar(0.5)
619
- );
620
- nodeObjects[name].mesh.material.opacity = 1.0;
621
- nodeObjects[name].mesh.scale.set(1.4, 1.4, 1.4);
622
  }
623
  });
624
 
625
- // Draw path beam
626
- if (pathPositions.length >= 2) {
627
- const pathGeo = new THREE.BufferGeometry().setFromPoints(pathPositions);
628
- const pathMat = new THREE.LineBasicMaterial({
629
- color: COLORS.path,
630
- transparent: true,
631
- opacity: 0.85,
632
- linewidth: 2,
633
- });
634
- const pathLine = new THREE.Line(pathGeo, pathMat);
635
- scene.add(pathLine);
636
- pathObjects.push(pathLine);
637
  }
638
 
639
- // Move agent sphere
640
- if (stepIndex > 0 && stepIndex <= steps.length) {
641
- const currentStepData = steps[stepIndex - 1];
642
- if (currentStepData && currentStepData.path && nodeObjects[currentStepData.path]) {
643
- const targetPos = nodeObjects[currentStepData.path].position;
644
- agentSphere.position.set(targetPos.x, targetPos.y + 1.2, targetPos.z);
645
- } else {
646
- // No file target โ€” float in center (for search/submit actions)
647
- agentSphere.position.set(0, 2.5, 0);
648
- }
649
- } else {
650
- agentSphere.position.set(0, 3.5, 0);
651
  }
652
 
653
- // Highlight current node
654
- if (stepIndex > 0) {
655
- const cur = steps[stepIndex - 1];
656
- if (cur && cur.path && nodeObjects[cur.path]) {
657
- nodeObjects[cur.path].mesh.scale.set(1.6, 1.6, 1.6);
 
 
 
658
  }
 
 
659
  }
660
 
661
- updateStepLog(steps, stepIndex - 1);
662
- updateStepLabel(stepIndex, maxStep);
663
-
664
- // Update slider gradient
665
- const slider = document.getElementById('timeline-slider');
666
- const pct = maxStep > 0 ? (stepIndex / maxStep * 100) : 0;
667
- slider.style.setProperty('--pct', pct + '%');
668
  }
669
 
670
- // โ”€โ”€ Score ring โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
671
- function updateScore(score) {
672
- const circumference = 2 * Math.PI * 34;
673
- const arc = circumference * Math.min(1, Math.max(0, score));
674
- document.getElementById('score-arc').setAttribute(
675
- 'stroke-dasharray', `${arc} ${circumference}`
676
- );
677
- document.getElementById('score-text').textContent = score.toFixed(2);
678
- document.getElementById('stat-score').textContent = score.toFixed(3);
679
-
680
- // Color by score
681
- const color = score >= 0.7 ? '#4ade80' : score >= 0.4 ? '#fbbf24' : '#f87171';
682
- document.getElementById('score-arc').setAttribute('stroke', color);
683
  }
684
 
685
- // โ”€โ”€ Step log โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
686
- function updateStepLog(steps, currentIdx) {
687
- const container = document.getElementById('log-entries');
 
688
  container.innerHTML = '';
689
-
690
- const ACTION_EMOJI = {
691
- read_file: '๐Ÿ“–',
692
- write_file: 'โœ๏ธ',
693
- run_tests: '๐Ÿงช',
694
- search_code: '๐Ÿ”',
695
- submit: '๐Ÿ',
696
- };
697
-
698
- steps.forEach((step, i) => {
699
- const active = i === currentIdx;
700
- const past = i < currentIdx;
701
- const entry = document.createElement('div');
702
- entry.className = 'log-entry';
703
- entry.style.opacity = past ? '0.6' : active ? '1' : '0.35';
704
- if (active) entry.style.background = 'rgba(125,211,252,0.08)';
705
-
706
- const reward = step.reward || 0;
707
- const rewardClass = reward > 0 ? 'reward-pos' : reward < 0 ? 'reward-neg' : 'reward-zero';
708
- const emoji = ACTION_EMOJI[step.action] || 'โ€ข';
709
- const path = step.path ? step.path.split('/').pop() : step.action;
710
-
711
- entry.innerHTML = `
712
- <span class="log-step">S${step.step}</span>
713
- <span class="log-action" style="color:${active ? '#7dd3fc' : '#94a3b8'}">${emoji} ${path}</span>
714
- <span class="log-reward ${rewardClass}">${reward > 0 ? '+' : ''}${reward.toFixed(2)}</span>
715
- `;
716
- container.appendChild(entry);
717
  });
718
-
719
- // Auto-scroll to current
720
- if (currentIdx >= 0) {
721
- const entries = container.children;
722
- if (entries[currentIdx]) {
723
- entries[currentIdx].scrollIntoView({ block: 'nearest' });
724
- }
725
  }
726
  }
727
 
728
- // โ”€โ”€ Hover tooltip โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
729
- const raycaster = new THREE.Raycaster();
730
- const mouseVec = new THREE.Vector2();
731
- const tooltip = document.getElementById('tooltip');
732
 
733
  function checkHover(mx, my) {
734
- mouseVec.x = (mx / window.innerWidth) * 2 - 1;
735
- mouseVec.y = -(my / window.innerHeight) * 2 + 1;
736
- raycaster.setFromCamera(mouseVec, camera);
737
-
738
- const meshes = Object.values(nodeObjects).map(o => o.mesh);
739
- const hits = raycaster.intersectObjects(meshes);
740
-
741
- if (hits.length > 0) {
742
- const file = hits[0].object.userData.file;
743
- if (file) {
744
- tooltip.style.opacity = '1';
745
- tooltip.style.left = (mx + 14) + 'px';
746
- tooltip.style.top = (my - 14) + 'px';
747
- document.getElementById('tooltip-title').textContent = file.name;
748
- document.getElementById('tooltip-body').innerHTML = `
749
- Type: ${file.type}<br>
750
- ${file.is_bug_file ? 'โš ๏ธ Bug location' : ''}
751
- `;
752
- }
753
  } else {
754
- tooltip.style.opacity = '0';
755
- }
756
- }
757
-
758
- // โ”€โ”€ Timeline controls โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
759
- function onSliderChange(val) {
760
- currentStep = parseInt(val);
761
- applyStep(currentStep);
762
- }
763
-
764
- function stepForward() {
765
- if (currentStep < maxStep) {
766
- currentStep++;
767
- document.getElementById('timeline-slider').value = currentStep;
768
- applyStep(currentStep);
769
- }
770
- }
771
-
772
- function stepBack() {
773
- if (currentStep > 0) {
774
- currentStep--;
775
- document.getElementById('timeline-slider').value = currentStep;
776
- applyStep(currentStep);
777
  }
778
  }
779
 
 
 
 
 
780
  function togglePlay() {
781
  playing = !playing;
782
- const btn = document.getElementById('play-btn');
783
- btn.textContent = playing ? 'โธ Pause' : 'โ–ถ Play';
784
  if (playing) {
785
- if (currentStep >= maxStep) { currentStep = 0; }
786
- playInterval = setInterval(() => {
787
- if (currentStep >= maxStep) {
788
- playing = false;
789
- btn.textContent = 'โ–ถ Play';
790
- clearInterval(playInterval);
791
- return;
792
- }
793
- stepForward();
794
- }, 900);
795
  } else {
796
- clearInterval(playInterval);
797
  }
798
  }
799
-
800
  function toggleOrbit() {
801
- orbitActive = !orbitActive;
802
  const btn = document.getElementById('orbit-btn');
803
- btn.textContent = orbitActive ? 'โน Stop' : '๐Ÿ”„ Orbit';
804
- btn.classList.toggle('active', orbitActive);
805
  }
806
-
807
  function resetView() {
808
- spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
809
- currentStep = 0;
810
- document.getElementById('timeline-slider').value = 0;
811
  applyStep(0);
812
  }
 
813
 
814
- function updateStepLabel(step, max) {
815
- document.getElementById('step-label').textContent = `Step ${step} / ${max}`;
816
- }
817
-
818
- // โ”€โ”€ Animation loop โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
819
- let frame = 0;
820
  function animate() {
821
  requestAnimationFrame(animate);
822
  frame++;
823
-
824
  updateCamera();
825
-
826
- // Pulse agent sphere
827
- if (agentSphere) {
828
- const pulse = 1 + Math.sin(frame * 0.08) * 0.15;
829
- agentSphere.scale.set(pulse, pulse, pulse);
830
- agentSphere.rotation.y += 0.03;
831
- }
832
-
833
- // Subtle node oscillation
834
- Object.values(nodeObjects).forEach((obj, i) => {
835
- obj.mesh.position.y = obj.position.y + Math.sin(frame * 0.02 + i) * 0.05;
836
  });
837
-
838
  renderer.render(scene, camera);
839
  }
 
840
 
841
- // โ”€โ”€ Window resize โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
842
- window.addEventListener('resize', () => {
843
- camera.aspect = window.innerWidth / window.innerHeight;
844
- camera.updateProjectionMatrix();
845
- renderer.setSize(window.innerWidth, window.innerHeight);
846
- });
847
-
848
- // โ”€โ”€ Public API for Gradio integration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
849
- window.loadTrajectoryData = function(jsonData) {
850
  try {
851
- const data = typeof jsonData === 'string' ? JSON.parse(jsonData) : jsonData;
 
 
 
 
 
 
 
 
 
852
  buildScene(data);
 
853
  } catch(e) {
854
- console.error('Failed to load trajectory data:', e);
 
855
  }
856
- };
857
 
858
- // โ”€โ”€ Init โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
859
- document.addEventListener('DOMContentLoaded', () => {
860
- const data = loadVizData();
861
  buildScene(data);
862
  document.getElementById('loader').style.display = 'none';
863
- animate();
864
- });
 
 
 
865
  </script>
866
  </body>
867
  </html>
 
6
  <title>Agent Trajectory 3D Visualizer</title>
7
  <style>
8
  * { margin: 0; padding: 0; box-sizing: border-box; }
9
+ html, body {
10
+ width: 100%; height: 100%;
11
  background: #0a0e1a;
12
  color: #e0e6f0;
13
  font-family: 'Segoe UI', system-ui, sans-serif;
14
  overflow: hidden;
 
15
  }
16
+ #three-canvas {
17
+ position: fixed;
18
  top: 0; left: 0;
19
  width: 100%; height: 100%;
20
+ display: block;
 
 
 
 
 
 
21
  }
22
  /* Header */
23
  #header {
24
+ position: fixed;
25
+ top: 10px; left: 50%;
26
  transform: translateX(-50%);
27
  text-align: center;
28
+ z-index: 20;
29
  pointer-events: none;
30
  }
31
  #header h1 {
32
+ font-size: 14px; font-weight: 700;
 
33
  color: #7dd3fc;
34
  letter-spacing: 0.05em;
35
+ text-shadow: 0 0 16px rgba(125,211,252,0.6);
 
 
 
 
 
36
  }
37
+ /* Panel base */
38
+ .panel {
39
+ position: fixed;
40
+ background: rgba(10,14,26,0.88);
 
41
  border: 1px solid rgba(125,211,252,0.2);
42
+ border-radius: 10px;
43
  padding: 10px 14px;
44
  font-size: 11px;
45
+ z-index: 20;
46
+ backdrop-filter: blur(6px);
 
 
 
 
47
  }
48
+ .panel h3 {
49
+ font-size: 10px; letter-spacing: 0.1em;
50
+ color: #7dd3fc; margin-bottom: 8px;
51
+ text-transform: uppercase;
52
  }
53
  /* Info panel */
54
+ #info-panel { top: 10px; left: 14px; min-width: 190px; }
55
+ .info-row { display: flex; justify-content: space-between; gap: 10px; margin-bottom: 4px; color: #94a3b8; }
56
+ .info-val { color: #e0e6f0; font-weight: 600; max-width: 110px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
57
+ /* Legend */
58
+ #legend { top: 10px; right: 14px; }
59
+ .leg { display: flex; align-items: center; gap: 7px; margin-bottom: 5px; }
60
+ .leg-dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
61
+ .leg-line { width: 18px; height: 3px; border-radius: 2px; flex-shrink: 0; }
62
+ /* Score ring */
63
+ #score-ring { position: fixed; bottom: 150px; left: 14px; z-index: 20; }
64
+ /* Step log */
65
+ #step-log {
66
+ position: fixed; bottom: 150px; right: 14px;
67
+ width: 230px; max-height: 200px; overflow-y: auto;
68
+ z-index: 20;
 
69
  }
70
+ .log-e { display: flex; gap: 5px; margin-bottom: 5px; padding-bottom: 5px; border-bottom: 1px solid rgba(255,255,255,0.05); font-size: 10px; }
71
+ .log-e:last-child { border-bottom: none; }
72
+ .log-s { color: #475569; min-width: 24px; }
73
+ .log-a { font-weight: 600; flex: 1; }
74
+ .rp { color: #4ade80; } .rn { color: #f87171; } .rz { color: #94a3b8; }
75
  /* Timeline */
76
+ #timeline {
77
+ position: fixed; bottom: 16px; left: 50%;
 
78
  transform: translateX(-50%);
79
+ width: min(680px, 92vw);
80
+ z-index: 20;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  }
82
+ #tl-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }
83
+ #tl-header h3 { font-size: 10px; color: #7dd3fc; letter-spacing: 0.1em; }
84
+ #step-label { font-size: 11px; color: #f0abfc; font-weight: 700; }
85
+ #slider {
86
+ width: 100%; -webkit-appearance: none; height: 4px;
87
  background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
88
+ border-radius: 4px; outline: none; cursor: pointer;
 
 
 
 
 
 
 
 
 
 
89
  }
90
+ #slider::-webkit-slider-thumb {
91
+ -webkit-appearance: none; width: 15px; height: 15px;
92
+ border-radius: 50%; background: #7dd3fc; cursor: pointer;
93
+ box-shadow: 0 0 8px rgba(125,211,252,0.8);
 
94
  }
95
+ #tl-btns { display: flex; gap: 7px; margin-top: 8px; justify-content: center; }
96
+ .tb {
97
  background: rgba(125,211,252,0.1);
98
  border: 1px solid rgba(125,211,252,0.3);
99
+ color: #7dd3fc; padding: 4px 12px;
100
+ border-radius: 6px; cursor: pointer; font-size: 10px;
101
+ transition: all 0.15s;
 
 
 
102
  }
103
+ .tb:hover { background: rgba(125,211,252,0.25); }
104
+ .tb.active { background: rgba(125,211,252,0.3); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  /* Tooltip */
106
  #tooltip {
107
+ position: fixed; z-index: 30;
108
  background: rgba(10,14,26,0.95);
109
  border: 1px solid rgba(125,211,252,0.4);
110
+ border-radius: 6px; padding: 7px 11px;
111
+ font-size: 10px; pointer-events: none;
112
+ opacity: 0; transition: opacity 0.1s;
113
+ max-width: 180px;
 
 
 
 
 
 
 
 
 
 
 
114
  }
115
+ #tt-title { color: #7dd3fc; margin-bottom: 3px; font-weight: 700; }
116
  /* Loader */
117
  #loader {
118
+ position: fixed; top: 50%; left: 50%;
119
+ transform: translate(-50%,-50%);
120
+ text-align: center; z-index: 50; color: #7dd3fc; font-size: 13px;
 
 
 
121
  }
122
+ .spin {
123
+ width: 36px; height: 36px; margin: 0 auto 10px;
124
+ border: 3px solid rgba(125,211,252,0.15);
125
  border-top-color: #7dd3fc;
126
  border-radius: 50%;
127
+ animation: sp 0.7s linear infinite;
128
+ }
129
+ @keyframes sp { to { transform: rotate(360deg); } }
130
+ #no-data {
131
+ position: fixed; top: 50%; left: 50%;
132
+ transform: translate(-50%,-50%);
133
+ text-align: center; color: #475569; font-size: 13px;
134
+ display: none;
135
  }
 
136
  </style>
137
  </head>
138
  <body>
139
 
140
+ <canvas id="three-canvas"></canvas>
 
141
 
142
+ <div id="loader"><div class="spin"></div><p>Loading 3D...</p></div>
143
+ <div id="no-data">
144
+ <p style="font-size:28px;margin-bottom:12px">๐Ÿ”</p>
145
+ <p style="color:#7dd3fc;font-weight:700;margin-bottom:6px">No Episode Loaded</p>
146
+ <p>Run an episode first, then click<br><strong style="color:#7dd3fc">Load Trajectory</strong></p>
147
  </div>
148
 
149
+ <div id="header"><h1>๐Ÿ” Agent Trajectory Visualizer โ€” 3D</h1></div>
 
 
 
150
 
151
+ <!-- Info panel -->
152
+ <div class="panel" id="info-panel">
153
+ <h3>Episode Stats</h3>
154
+ <div class="info-row"><span>Task</span><span class="info-val" id="st-task">โ€”</span></div>
155
+ <div class="info-row"><span>Variant</span><span class="info-val" id="st-var">โ€”</span></div>
156
+ <div class="info-row"><span>Steps</span><span class="info-val" id="st-steps">โ€”</span></div>
157
+ <div class="info-row"><span>Score</span><span class="info-val" id="st-score">โ€”</span></div>
158
+ <div class="info-row"><span>Strategy</span><span class="info-val" id="st-strat">โ€”</span></div>
159
+ </div>
160
 
161
+ <!-- Legend -->
162
+ <div class="panel" id="legend">
163
+ <h3>Legend</h3>
164
+ <div class="leg"><div class="leg-dot" style="background:#f97316"></div><span>Source file</span></div>
165
+ <div class="leg"><div class="leg-dot" style="background:#3b82f6"></div><span>Test file</span></div>
166
+ <div class="leg"><div class="leg-dot" style="background:#a855f7"></div><span>Spec / Docs</span></div>
167
+ <div class="leg"><div class="leg-dot" style="background:#22c55e"></div><span>Visited</span></div>
168
+ <div class="leg"><div class="leg-dot" style="background:#ef4444"></div><span>Bug / Modified</span></div>
169
+ <div class="leg"><div class="leg-line" style="background:#facc15"></div><span>Agent path</span></div>
170
+ </div>
171
 
172
+ <!-- Score ring -->
173
+ <div id="score-ring">
174
+ <svg width="76" height="76" viewBox="0 0 76 76">
175
+ <circle cx="38" cy="38" r="30" fill="none" stroke="rgba(125,211,252,0.12)" stroke-width="6"/>
176
+ <circle id="score-arc" cx="38" cy="38" r="30" fill="none"
177
+ stroke="#7dd3fc" stroke-width="6"
178
+ stroke-dasharray="0 188"
179
+ stroke-linecap="round"
180
+ transform="rotate(-90 38 38)"
181
+ style="transition:stroke-dasharray 1.2s ease"/>
182
+ <text id="score-txt" x="38" y="43" text-anchor="middle"
183
+ fill="#e0e6f0" font-size="13" font-weight="700"
184
+ font-family="'Segoe UI',sans-serif">0.0</text>
185
+ </svg>
186
+ </div>
 
 
 
 
 
 
 
187
 
188
+ <!-- Step log -->
189
+ <div class="panel" id="step-log">
190
+ <h3>Step Log</h3>
191
+ <div id="log-list"></div>
192
+ </div>
 
 
 
 
 
 
 
 
 
 
193
 
194
+ <!-- Tooltip -->
195
+ <div id="tooltip"><div id="tt-title"></div><div id="tt-body"></div></div>
 
 
 
196
 
197
+ <!-- Timeline -->
198
+ <div class="panel" id="timeline">
199
+ <div id="tl-header">
200
+ <h3>Timeline Replay</h3>
201
+ <span id="step-label">Step 0 / 0</span>
202
  </div>
203
+ <input type="range" id="slider" min="0" max="0" value="0"
204
+ oninput="onSlider(this.value)">
205
+ <div id="tl-btns">
206
+ <button class="tb" onclick="stepBack()">โ—€ Back</button>
207
+ <button class="tb" id="play-btn" onclick="togglePlay()">โ–ถ Play</button>
208
+ <button class="tb" onclick="stepFwd()">Forward โ–ถ</button>
209
+ <button class="tb" onclick="resetView()">โ†บ Reset</button>
210
+ <button class="tb" id="orbit-btn" onclick="toggleOrbit()">๐Ÿ”„ Orbit</button>
 
 
 
 
 
 
 
 
211
  </div>
212
  </div>
213
 
 
214
  <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
215
  <script>
216
+ // โ”€โ”€ Renderer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  const canvas = document.getElementById('three-canvas');
218
+ const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: false });
 
219
  renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
220
  renderer.setClearColor(0x0a0e1a, 1);
221
 
222
+ function resize() {
223
+ renderer.setSize(window.innerWidth, window.innerHeight, false);
224
+ camera.aspect = window.innerWidth / window.innerHeight;
225
+ camera.updateProjectionMatrix();
226
+ }
227
+ window.addEventListener('resize', resize);
228
+
229
+ // โ”€โ”€ Scene + Camera โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
230
  const scene = new THREE.Scene();
231
+ const camera = new THREE.PerspectiveCamera(58, 1, 0.1, 1000);
232
+ camera.position.set(0, 8, 24);
 
233
  camera.lookAt(0, 0, 0);
234
+ resize();
235
 
236
+ // Lights
237
+ scene.add(new THREE.AmbientLight(0x1a2040, 1.2));
238
+ const dl = new THREE.DirectionalLight(0x7dd3fc, 0.5);
239
+ dl.position.set(5, 12, 5);
240
+ scene.add(dl);
241
 
242
  // Grid
243
+ const grid = new THREE.GridHelper(50, 25, 0x1e293b, 0x0f172a);
244
+ grid.position.y = -3.5;
245
  scene.add(grid);
246
 
247
  // Stars
248
+ (function() {
249
+ const geo = new THREE.BufferGeometry();
250
+ const pos = new Float32Array(900 * 3);
251
+ for (let i = 0; i < 900 * 3; i++) pos[i] = (Math.random() - 0.5) * 220;
252
+ geo.setAttribute('position', new THREE.BufferAttribute(pos, 3));
253
+ scene.add(new THREE.Points(geo, new THREE.PointsMaterial({ color: 0x1e3a5f, size: 0.25 })));
254
+ })();
255
+
256
+ // โ”€โ”€ Orbit controls (manual) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
257
+ let sph = { theta: 0, phi: 1.1, r: 24 };
258
+ let orbitAuto = false, dragging = false, lastX = 0, lastY = 0;
259
+
260
+ canvas.addEventListener('mousedown', e => { dragging = true; lastX = e.clientX; lastY = e.clientY; });
261
+ window.addEventListener('mouseup', () => { dragging = false; });
262
+ window.addEventListener('mousemove', e => {
263
+ if (dragging) {
264
+ sph.theta -= (e.clientX - lastX) * 0.006;
265
+ sph.phi = Math.max(0.15, Math.min(1.55, sph.phi - (e.clientY - lastY) * 0.006));
266
+ lastX = e.clientX; lastY = e.clientY;
267
+ } else { checkHover(e.clientX, e.clientY); }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  });
269
  canvas.addEventListener('wheel', e => {
270
+ sph.r = Math.max(8, Math.min(55, sph.r + e.deltaY * 0.025));
271
  });
272
 
273
  function updateCamera() {
274
+ if (orbitAuto) sph.theta += 0.004;
275
+ const sin_p = Math.sin(sph.phi);
276
+ camera.position.set(
277
+ sph.r * sin_p * Math.sin(sph.theta),
278
+ sph.r * Math.cos(sph.phi),
279
+ sph.r * sin_p * Math.cos(sph.theta)
280
+ );
281
  camera.lookAt(0, 0, 0);
282
  }
283
 
284
+ // โ”€โ”€ Scene state โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
285
+ const COLS = { src:0xf97316, test:0x3b82f6, spec:0xa855f7, visited:0x22c55e, bug:0xef4444, agent:0xfbbf24, path:0xfacc15, edge:0x334155 };
286
+
287
+ let nodeMap = {}; // filename โ†’ { mesh, basePos }
288
+ let pathLines = [], edgeLines = [];
289
+ let agentMesh = null;
 
 
 
 
290
  let vizData = null;
291
+ let curStep = 0, maxStep = 0;
292
+ let playing = false, playTimer = null;
293
+ let frame = 0;
294
+
295
+ // โ”€โ”€ Build scene โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
296
+ function clearScene() {
297
+ Object.values(nodeMap).forEach(o => scene.remove(o.mesh));
298
+ pathLines.forEach(l => scene.remove(l));
299
+ edgeLines.forEach(l => scene.remove(l));
300
+ if (agentMesh) scene.remove(agentMesh);
301
+ nodeMap = {}; pathLines = []; edgeLines = []; agentMesh = null;
302
+ }
303
 
 
304
  function buildScene(data) {
305
+ clearScene();
306
  vizData = data;
 
 
 
 
 
 
 
 
307
  const files = data.files || [];
308
  const n = files.length;
309
+ if (!n) return;
310
+
311
+ // Layout: circle
312
+ files.forEach((f, i) => {
313
+ const angle = (i / n) * Math.PI * 2 - Math.PI / 2;
314
+ const R = Math.max(5, n * 1.0);
315
+ const x = Math.cos(angle) * R;
316
+ const z = Math.sin(angle) * R;
317
+ const pos = new THREE.Vector3(x, 0, z);
318
+
319
+ const baseColor = f.is_bug_file ? COLS.bug :
320
+ f.type === 'test' ? COLS.test :
321
+ f.type === 'spec' ? COLS.spec : COLS.src;
322
+ const col = new THREE.Color(baseColor);
323
+
324
+ // Main sphere
325
+ const geo = new THREE.SphereGeometry(0.55, 20, 20);
 
 
326
  const mat = new THREE.MeshPhongMaterial({
327
+ color: col, emissive: col.clone().multiplyScalar(0.25),
328
+ shininess: 70, transparent: true, opacity: 0.85,
 
 
 
329
  });
330
  const mesh = new THREE.Mesh(geo, mat);
331
+ mesh.position.copy(pos);
332
+ mesh.userData = { file: f, basePos: pos.clone() };
333
  scene.add(mesh);
334
 
335
+ // Ring halo
336
+ const rg = new THREE.RingGeometry(0.7, 0.82, 32);
337
+ const rm = new THREE.MeshBasicMaterial({ color: col, transparent: true, opacity: 0.2, side: THREE.DoubleSide });
338
+ const ring = new THREE.Mesh(rg, rm);
 
 
 
 
 
339
  ring.rotation.x = Math.PI / 2;
340
  mesh.add(ring);
341
 
342
+ nodeMap[f.name] = { mesh, basePos: pos.clone() };
343
  });
344
 
345
+ // Dependency edges
346
  (data.dependencies || []).forEach(dep => {
347
+ const a = nodeMap[dep.from], b = nodeMap[dep.to];
348
+ if (!a || !b) return;
349
+ const geo = new THREE.BufferGeometry().setFromPoints([a.basePos.clone(), b.basePos.clone()]);
350
+ const mat = new THREE.LineBasicMaterial({ color: COLS.edge, transparent: true, opacity: 0.35 });
 
 
 
 
 
 
 
351
  const line = new THREE.Line(geo, mat);
352
  scene.add(line);
353
+ edgeLines.push(line);
354
  });
355
 
356
+ // Agent sphere
357
+ const ag = new THREE.SphereGeometry(0.32, 14, 14);
358
+ const am = new THREE.MeshPhongMaterial({ color: COLS.agent, emissive: 0xfbbf24, emissiveIntensity: 0.9, shininess: 120 });
359
+ agentMesh = new THREE.Mesh(ag, am);
360
+ agentMesh.position.set(0, 3, 0);
361
+ scene.add(agentMesh);
362
+
363
+ // Update UI stats
364
+ document.getElementById('st-task').textContent = data.task || 'โ€”';
365
+ document.getElementById('st-var').textContent = (data.variant_id || 'โ€”').slice(0, 12);
366
+ document.getElementById('st-steps').textContent = (data.steps || []).length;
367
+ document.getElementById('st-strat').textContent = data.strategy || 'โ€”';
 
 
 
 
 
 
368
  updateScore(data.final_score || 0);
 
369
 
 
370
  maxStep = (data.steps || []).length;
371
+ const sl = document.getElementById('slider');
372
+ sl.max = maxStep; sl.value = 0;
373
+ curStep = 0;
374
+ updateLabel(0, maxStep);
 
 
375
  applyStep(0);
376
  }
377
 
378
+ // โ”€โ”€ Apply step โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
379
+ function applyStep(idx) {
380
  if (!vizData) return;
381
  const steps = vizData.steps || [];
 
 
382
 
383
  // Reset all nodes
384
+ Object.values(nodeMap).forEach(({ mesh, basePos: _ }) => {
385
+ const f = mesh.userData.file;
386
+ const bc = f.is_bug_file ? COLS.bug : f.type === 'test' ? COLS.test : f.type === 'spec' ? COLS.spec : COLS.src;
387
+ mesh.material.color.set(bc);
388
+ mesh.material.emissive.set(new THREE.Color(bc).multiplyScalar(0.2));
389
+ mesh.material.opacity = 0.55;
390
+ mesh.scale.setScalar(1);
 
 
 
 
391
  });
392
 
393
  // Remove old path lines
394
+ pathLines.forEach(l => scene.remove(l));
395
+ pathLines = [];
396
+
397
+ // Collect positions for path
398
+ const pathPts = [];
399
+ const visited = new Set(), modified = new Set();
400
+
401
+ for (let i = 0; i < idx; i++) {
402
+ const s = steps[i];
403
+ if (!s) continue;
404
+ if (s.path && nodeMap[s.path]) {
405
+ const p = nodeMap[s.path].basePos.clone().add(new THREE.Vector3(0, 0.15, 0));
406
+ pathPts.push(p);
 
 
 
407
  }
408
+ if (s.action === 'read_file' && s.path) visited.add(s.path);
409
+ if (s.action === 'write_file' && s.path) modified.add(s.path);
410
  }
411
 
412
+ // Color visited/modified
413
+ visited.forEach(name => {
414
+ if (nodeMap[name]) {
415
+ nodeMap[name].mesh.material.color.set(COLS.visited);
416
+ nodeMap[name].mesh.material.emissive.set(new THREE.Color(COLS.visited).multiplyScalar(0.4));
417
+ nodeMap[name].mesh.material.opacity = 1;
418
+ nodeMap[name].mesh.scale.setScalar(1.25);
 
 
419
  }
420
  });
421
+ modified.forEach(name => {
422
+ if (nodeMap[name]) {
423
+ nodeMap[name].mesh.material.color.set(COLS.bug);
424
+ nodeMap[name].mesh.material.emissive.set(new THREE.Color(COLS.bug).multiplyScalar(0.5));
425
+ nodeMap[name].mesh.material.opacity = 1;
426
+ nodeMap[name].mesh.scale.setScalar(1.45);
 
 
427
  }
428
  });
429
 
430
+ // Highlight current node
431
+ if (idx > 0 && idx <= steps.length) {
432
+ const cur = steps[idx - 1];
433
+ if (cur && cur.path && nodeMap[cur.path]) {
434
+ nodeMap[cur.path].mesh.scale.setScalar(1.65);
435
+ }
 
 
 
 
 
 
436
  }
437
 
438
+ // Draw path
439
+ if (pathPts.length >= 2) {
440
+ const geo = new THREE.BufferGeometry().setFromPoints(pathPts);
441
+ const mat = new THREE.LineBasicMaterial({ color: COLS.path, transparent: true, opacity: 0.9 });
442
+ const line = new THREE.Line(geo, mat);
443
+ scene.add(line); pathLines.push(line);
 
 
 
 
 
 
444
  }
445
 
446
+ // Move agent
447
+ if (idx > 0 && idx <= steps.length) {
448
+ const cur = steps[idx - 1];
449
+ if (cur && cur.path && nodeMap[cur.path]) {
450
+ const tp = nodeMap[cur.path].basePos;
451
+ agentMesh.position.set(tp.x, tp.y + 1.3, tp.z);
452
+ } else {
453
+ agentMesh.position.set(0, 2.5, 0);
454
  }
455
+ } else {
456
+ agentMesh.position.set(0, 3.5, 0);
457
  }
458
 
459
+ updateLog(steps, idx - 1);
460
+ updateLabel(idx, maxStep);
461
+ const sl = document.getElementById('slider');
462
+ sl.style.setProperty('--pct', (maxStep > 0 ? idx / maxStep * 100 : 0) + '%');
 
 
 
463
  }
464
 
465
+ // โ”€โ”€ Score ring โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
466
+ function updateScore(s) {
467
+ const c = 2 * Math.PI * 30;
468
+ const arc = c * Math.min(1, Math.max(0, s));
469
+ document.getElementById('score-arc').setAttribute('stroke-dasharray', `${arc} ${c}`);
470
+ document.getElementById('score-txt').textContent = s.toFixed(2);
471
+ document.getElementById('st-score').textContent = s.toFixed(3);
472
+ const col = s >= 0.7 ? '#4ade80' : s >= 0.4 ? '#fbbf24' : '#f87171';
473
+ document.getElementById('score-arc').setAttribute('stroke', col);
 
 
 
 
474
  }
475
 
476
+ // โ”€โ”€ Step log โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
477
+ function updateLog(steps, curIdx) {
478
+ const em = { read_file:'๐Ÿ“–', write_file:'โœ๏ธ', run_tests:'๐Ÿงช', search_code:'๐Ÿ”', submit:'๐Ÿ' };
479
+ const container = document.getElementById('log-list');
480
  container.innerHTML = '';
481
+ steps.forEach((s, i) => {
482
+ const e = document.createElement('div');
483
+ e.className = 'log-e';
484
+ e.style.opacity = i < curIdx ? '0.55' : i === curIdx ? '1' : '0.3';
485
+ if (i === curIdx) e.style.background = 'rgba(125,211,252,0.07)';
486
+ const r = s.reward || 0;
487
+ const rc = r > 0 ? 'rp' : r < 0 ? 'rn' : 'rz';
488
+ const name = (s.path || s.action || '').split('/').pop() || s.action;
489
+ e.innerHTML = `<span class="log-s">S${s.step}</span><span class="log-a">${em[s.action]||'โ€ข'} ${name}</span><span class="${rc}">${r>0?'+':''}${r.toFixed(2)}</span>`;
490
+ container.appendChild(e);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
  });
492
+ if (curIdx >= 0 && container.children[curIdx]) {
493
+ container.children[curIdx].scrollIntoView({ block: 'nearest' });
 
 
 
 
 
494
  }
495
  }
496
 
497
+ // โ”€โ”€ Hover tooltip โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
498
+ const ray = new THREE.Raycaster();
499
+ const mv = new THREE.Vector2();
500
+ const tt = document.getElementById('tooltip');
501
 
502
  function checkHover(mx, my) {
503
+ mv.x = (mx / window.innerWidth) * 2 - 1;
504
+ mv.y = -(my / window.innerHeight) * 2 + 1;
505
+ ray.setFromCamera(mv, camera);
506
+ const meshes = Object.values(nodeMap).map(o => o.mesh);
507
+ const hits = ray.intersectObjects(meshes);
508
+ if (hits.length) {
509
+ const f = hits[0].object.userData.file;
510
+ tt.style.opacity = '1';
511
+ tt.style.left = (mx + 12) + 'px';
512
+ tt.style.top = (my - 8) + 'px';
513
+ document.getElementById('tt-title').textContent = f.name;
514
+ document.getElementById('tt-body').innerHTML =
515
+ `Type: ${f.type}${f.is_bug_file ? '<br>โš ๏ธ Bug location' : ''}${f.visited ? '<br>โœ… Visited' : ''}`;
 
 
 
 
 
 
516
  } else {
517
+ tt.style.opacity = '0';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
  }
519
  }
520
 
521
+ // โ”€โ”€ Controls โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
522
+ function onSlider(v) { curStep = +v; applyStep(curStep); }
523
+ function stepFwd() { if (curStep < maxStep) { curStep++; document.getElementById('slider').value = curStep; applyStep(curStep); } }
524
+ function stepBack() { if (curStep > 0) { curStep--; document.getElementById('slider').value = curStep; applyStep(curStep); } }
525
  function togglePlay() {
526
  playing = !playing;
527
+ document.getElementById('play-btn').textContent = playing ? 'โธ Pause' : 'โ–ถ Play';
 
528
  if (playing) {
529
+ if (curStep >= maxStep) curStep = 0;
530
+ playTimer = setInterval(() => {
531
+ if (curStep >= maxStep) { playing = false; document.getElementById('play-btn').textContent = 'โ–ถ Play'; clearInterval(playTimer); return; }
532
+ stepFwd();
533
+ }, 850);
 
 
 
 
 
534
  } else {
535
+ clearInterval(playTimer);
536
  }
537
  }
 
538
  function toggleOrbit() {
539
+ orbitAuto = !orbitAuto;
540
  const btn = document.getElementById('orbit-btn');
541
+ btn.textContent = orbitAuto ? 'โน Stop' : '๐Ÿ”„ Orbit';
542
+ btn.classList.toggle('active', orbitAuto);
543
  }
 
544
  function resetView() {
545
+ sph = { theta: 0, phi: 1.1, r: 24 };
546
+ curStep = 0;
547
+ document.getElementById('slider').value = 0;
548
  applyStep(0);
549
  }
550
+ function updateLabel(s, m) { document.getElementById('step-label').textContent = `Step ${s} / ${m}`; }
551
 
552
+ // โ”€โ”€ Animation loop โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
 
 
 
 
 
553
  function animate() {
554
  requestAnimationFrame(animate);
555
  frame++;
 
556
  updateCamera();
557
+ // Pulsing agent
558
+ if (agentMesh) {
559
+ const p = 1 + Math.sin(frame * 0.09) * 0.18;
560
+ agentMesh.scale.setScalar(p);
561
+ agentMesh.rotation.y += 0.04;
562
+ }
563
+ // Subtle node float
564
+ Object.values(nodeMap).forEach(({ mesh, basePos }, i) => {
565
+ mesh.position.y = basePos.y + Math.sin(frame * 0.018 + i * 1.1) * 0.07;
 
 
566
  });
 
567
  renderer.render(scene, camera);
568
  }
569
+ animate();
570
 
571
+ // โ”€โ”€ Load data from API โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
572
+ async function fetchAndLoad() {
573
+ document.getElementById('loader').style.display = 'block';
574
+ document.getElementById('no-data').style.display = 'none';
 
 
 
 
 
575
  try {
576
+ // Try to determine base URL from window location
577
+ const base = window.location.origin;
578
+ const res = await fetch(`${base}/viz-data`, { cache: 'no-store' });
579
+ if (!res.ok) throw new Error('no data');
580
+ const data = await res.json();
581
+ if (data.error || !data.files || data.files.length === 0) {
582
+ document.getElementById('loader').style.display = 'none';
583
+ document.getElementById('no-data').style.display = 'block';
584
+ return;
585
+ }
586
  buildScene(data);
587
+ document.getElementById('loader').style.display = 'none';
588
  } catch(e) {
589
+ document.getElementById('loader').style.display = 'none';
590
+ document.getElementById('no-data').style.display = 'block';
591
  }
592
+ }
// โ”€โ”€ Public API (can be called from parent window) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€

// Accepts either a parsed object or a JSON string; silently ignores
// unparseable strings, otherwise builds the scene and hides the overlays.
window.loadData = function (data) {
  if (typeof data === 'string') {
    try {
      data = JSON.parse(data);
    } catch (e) {
      return; // bad JSON from the parent: do nothing
    }
  }
  buildScene(data);
  document.getElementById('loader').style.display = 'none';
  document.getElementById('no-data').style.display = 'none';
};

// Auto-load on init
window.addEventListener('load', fetchAndLoad);
604
  </script>
605
  </body>
606
  </html>