Spaces:
Sleeping
Sleeping
Commit ·
dfbd16e
1
Parent(s): f7185e1
v3.0 — Intelligence layer: failure classification, strategy detection, advanced metrics, self-improvement, multi-agent comparison, 3D visualizer
Browse files- app.py +518 -200
- server/advanced_metrics.py +245 -0
- server/app.py +247 -49
- server/failure_classifier.py +294 -0
- server/multi_agent.py +371 -0
- server/self_improvement.py +292 -0
- server/strategy_detector.py +243 -0
- static/viz3d.html +867 -0
app.py
CHANGED
|
@@ -1,80 +1,92 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
app.py — Gradio UI
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
import json
|
| 8 |
import gradio as gr
|
| 9 |
from server.environment import CodebaseNavEnvironment
|
| 10 |
from server.models import RepoAction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
# ── Global
|
| 13 |
env = CodebaseNavEnvironment()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
-
# ──
|
| 17 |
|
| 18 |
def reset_environment(task: str):
|
| 19 |
-
"""Reset environment and return initial state."""
|
| 20 |
try:
|
| 21 |
result = env.reset(task=task)
|
| 22 |
obs = result.observation
|
| 23 |
tree = "\n".join(f" 📄 {f}" for f in obs.repo_tree)
|
| 24 |
failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None listed"
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
f"
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
f"
|
| 33 |
-
f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n
|
| 34 |
-
f"
|
|
|
|
| 35 |
f"🔴 Failing Tests: {failing}\n\n"
|
| 36 |
-
f"📋 Task: {obs.task_description}"
|
| 37 |
)
|
| 38 |
-
return
|
| 39 |
except Exception as e:
|
| 40 |
return f"❌ Error: {e}", "", "0", "0.000"
|
| 41 |
|
| 42 |
|
| 43 |
def take_step(action_type: str, path: str, query: str, content: str):
|
| 44 |
-
"""Execute one agent step."""
|
| 45 |
if env.done:
|
| 46 |
-
return "❌ Episode
|
| 47 |
-
|
| 48 |
try:
|
| 49 |
action = RepoAction(
|
| 50 |
action_type=action_type,
|
| 51 |
-
path=path
|
| 52 |
-
query=query
|
| 53 |
-
content=content
|
| 54 |
)
|
| 55 |
result = env.step(action)
|
| 56 |
obs = result.observation
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
error = f"⚠️ {error}"
|
| 62 |
|
| 63 |
status = (
|
| 64 |
f"Step {result.info['steps_taken']} | "
|
| 65 |
f"Reward: {result.reward:+.3f} | "
|
| 66 |
-
f"Steps left: {obs.steps_remaining}"
|
| 67 |
)
|
| 68 |
if result.done:
|
| 69 |
-
status += f"\n\n🏁
|
| 70 |
-
|
| 71 |
-
flags = result.info.get("security_flags", [])
|
| 72 |
-
if flags:
|
| 73 |
-
status += f"\n🔒 Security: {flags}"
|
| 74 |
|
| 75 |
return (
|
| 76 |
status,
|
| 77 |
-
|
| 78 |
str(result.info["steps_taken"]),
|
| 79 |
f"{result.info.get('cumulative_reward', 0):.3f}",
|
| 80 |
)
|
|
@@ -82,261 +94,567 @@ def take_step(action_type: str, path: str, query: str, content: str):
|
|
| 82 |
return f"❌ Error: {e}", "", "", ""
|
| 83 |
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
def get_evaluation():
|
| 86 |
-
"""Get multi-dimensional evaluation report."""
|
| 87 |
try:
|
| 88 |
ev = env.get_evaluation()
|
| 89 |
if "error" in ev:
|
| 90 |
return "No evaluation available. Run an episode first."
|
| 91 |
-
|
| 92 |
lines = [
|
| 93 |
f"🎯 Composite Score: {ev['composite_score']:.3f}",
|
| 94 |
-
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 95 |
]
|
| 96 |
for name, dim in ev.get("dimensions", {}).items():
|
| 97 |
bar = "█" * int(dim["score"] * 20) + "░" * (20 - int(dim["score"] * 20))
|
| 98 |
lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
|
| 99 |
-
for e in dim.get("evidence", []):
|
| 100 |
lines.append(f" → {e}")
|
| 101 |
-
|
| 102 |
if ev.get("strengths"):
|
| 103 |
-
lines
|
| 104 |
-
for s in ev["strengths"]:
|
| 105 |
-
lines.append(f" ✅ {s}")
|
| 106 |
-
|
| 107 |
if ev.get("failure_analysis"):
|
| 108 |
-
lines
|
| 109 |
-
for f in ev["failure_analysis"]:
|
| 110 |
-
lines.append(f" ❌ {f}")
|
| 111 |
-
|
| 112 |
if ev.get("recommendations"):
|
| 113 |
-
lines
|
| 114 |
-
for r in ev["recommendations"]:
|
| 115 |
-
lines.append(f" → {r}")
|
| 116 |
-
|
| 117 |
return "\n".join(lines)
|
| 118 |
except Exception as e:
|
| 119 |
return f"Error: {e}"
|
| 120 |
|
| 121 |
|
| 122 |
def get_metrics():
|
| 123 |
-
"""Get comprehensive metrics."""
|
| 124 |
try:
|
| 125 |
-
|
| 126 |
-
return json.dumps(m, indent=2, default=str)
|
| 127 |
except Exception as e:
|
| 128 |
return f"Error: {e}"
|
| 129 |
|
| 130 |
|
| 131 |
def get_trajectory():
|
| 132 |
-
"""Get full trajectory."""
|
| 133 |
try:
|
| 134 |
t = env.get_trajectory()
|
| 135 |
if not t:
|
| 136 |
-
return "No trajectory
|
| 137 |
-
|
| 138 |
lines = [
|
| 139 |
-
f"Episode: {t.get('episode_id'
|
| 140 |
-
f"Task: {t.get('task'
|
| 141 |
-
f"
|
| 142 |
-
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 143 |
]
|
|
|
|
|
|
|
| 144 |
for step in t.get("steps", []):
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
"🔍" if step["action_type"] == "search_code" else "🏁"
|
| 149 |
-
path = step.get("action_path") or step.get("action_query") or ""
|
| 150 |
-
err = f" ❌ {step['error']}" if step.get("error") else ""
|
| 151 |
lines.append(
|
| 152 |
-
f" {
|
| 153 |
-
f"{step['
|
| 154 |
-
f"reward={step['reward']:+.3f} "
|
| 155 |
-
f"({step['duration_ms']:.0f}ms){err}"
|
| 156 |
)
|
| 157 |
return "\n".join(lines)
|
| 158 |
except Exception as e:
|
| 159 |
return f"Error: {e}"
|
| 160 |
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
|
|
|
| 164 |
try:
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
-
# Strategy: read test file → read source → fix → run tests ��� submit
|
| 173 |
-
test_files = [f for f in obs.repo_tree if f.startswith("tests/")]
|
| 174 |
-
src_files = [f for f in obs.repo_tree if f.startswith("src/") and f.endswith(".py")]
|
| 175 |
-
spec_files = [f for f in obs.repo_tree if f.endswith(".md")]
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
-
# Step 1: read spec or test
|
| 181 |
-
if task == "task3" and spec_files:
|
| 182 |
-
target = spec_files[0]
|
| 183 |
-
elif test_files:
|
| 184 |
-
target = test_files[0]
|
| 185 |
-
else:
|
| 186 |
-
target = obs.repo_tree[0]
|
| 187 |
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
log_lines.append(f"\n🏁 Final Score: {env.final_score:.3f}")
|
| 213 |
-
log_lines.append(f" Total Steps: {steps_done}")
|
| 214 |
-
log_lines.append(f" Cumulative Reward: {env.cumulative_reward:.3f}")
|
| 215 |
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
except Exception as e:
|
| 218 |
return f"❌ Error: {e}"
|
| 219 |
|
| 220 |
|
| 221 |
-
# ──
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
| 226 |
gr.Markdown(
|
| 227 |
-
"# 🔍 Codebase Navigation & Repair — OpenEnv\n"
|
| 228 |
-
"**
|
| 229 |
-
"
|
| 230 |
)
|
| 231 |
|
| 232 |
with gr.Tabs():
|
| 233 |
-
|
|
|
|
| 234 |
with gr.TabItem("🎮 Interactive"):
|
| 235 |
with gr.Row():
|
| 236 |
with gr.Column(scale=1):
|
| 237 |
task_select = gr.Dropdown(
|
| 238 |
-
|
| 239 |
-
value="task1",
|
| 240 |
label="Task",
|
| 241 |
-
info="task1=
|
| 242 |
)
|
| 243 |
reset_btn = gr.Button("🔄 Reset Environment", variant="primary")
|
| 244 |
-
|
| 245 |
-
gr.
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
value="read_file",
|
| 249 |
-
label="Action Type",
|
| 250 |
)
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
step_btn = gr.Button("▶️ Execute Step", variant="secondary")
|
| 255 |
-
|
| 256 |
with gr.Column(scale=2):
|
| 257 |
-
status_box = gr.Textbox(label="Status", lines=
|
| 258 |
-
result_box = gr.Textbox(label="Last
|
| 259 |
with gr.Row():
|
| 260 |
-
steps_box = gr.Textbox(label="Steps
|
| 261 |
reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
)
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
)
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
-
# ── Tab
|
| 274 |
-
with gr.TabItem("
|
| 275 |
gr.Markdown(
|
| 276 |
-
"###
|
| 277 |
-
"Runs
|
| 278 |
-
"
|
| 279 |
)
|
| 280 |
-
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
)
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
|
| 287 |
-
# ── Tab
|
| 288 |
-
with gr.TabItem("📊 Evaluation"):
|
| 289 |
-
with gr.Row():
|
| 290 |
-
eval_btn = gr.Button("🎯 Get Evaluation", variant="primary")
|
| 291 |
-
metrics_btn = gr.Button("📈 Get Metrics", variant="secondary")
|
| 292 |
-
traj_btn = gr.Button("🗺️ Get Trajectory", variant="secondary")
|
| 293 |
-
eval_output = gr.Textbox(label="Evaluation Report", lines=25, interactive=False)
|
| 294 |
-
eval_btn.click(get_evaluation, outputs=[eval_output])
|
| 295 |
-
metrics_btn.click(get_metrics, outputs=[eval_output])
|
| 296 |
-
traj_btn.click(get_trajectory, outputs=[eval_output])
|
| 297 |
-
|
| 298 |
-
# ── Tab 4: API Docs ──────────────────────────────────────────────
|
| 299 |
with gr.TabItem("📖 API"):
|
| 300 |
gr.Markdown("""
|
| 301 |
-
### REST API Endpoints
|
| 302 |
-
|
| 303 |
-
The FastAPI endpoints are mounted alongside this UI at `/api/`.
|
| 304 |
|
|
|
|
| 305 |
| Endpoint | Method | Description |
|
| 306 |
|----------|--------|-------------|
|
| 307 |
-
| `/
|
| 308 |
-
| `/
|
| 309 |
-
| `/
|
| 310 |
-
| `/
|
| 311 |
-
| `/api/trajectory` | GET | Full action log |
|
| 312 |
-
| `/api/evaluate` | GET | Multi-dimensional scores |
|
| 313 |
-
| `/api/metrics` | GET | Comprehensive stats |
|
| 314 |
-
| `/api/fault-config` | POST | Enable fault injection |
|
| 315 |
-
|
| 316 |
-
### Example: Reset + Read + Submit
|
| 317 |
-
```bash
|
| 318 |
-
BASE="https://YOUR-SPACE.hf.space/api"
|
| 319 |
-
|
| 320 |
-
# Reset
|
| 321 |
-
curl -X POST "$BASE/reset?task=task1"
|
| 322 |
|
| 323 |
-
#
|
| 324 |
-
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
|
| 327 |
-
#
|
| 328 |
-
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
```
|
| 334 |
""")
|
| 335 |
|
| 336 |
|
| 337 |
-
# ── Mount FastAPI under
|
| 338 |
from server.app import app as fastapi_app
|
| 339 |
-
|
| 340 |
gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
|
| 341 |
|
| 342 |
if __name__ == "__main__":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
app.py — Gradio UI v3.0 — Full Platform Entry Point
|
| 4 |
+
|
| 5 |
+
Tabs:
|
| 6 |
+
🎮 Interactive — manual step-by-step control
|
| 7 |
+
🤖 Run Agent — built-in deterministic agent demo
|
| 8 |
+
📊 Evaluation — 6-dimension evaluation report
|
| 9 |
+
🧠 Intelligence — failure classification, strategy, advanced metrics
|
| 10 |
+
🔁 Self-Improve — improvement plan after failure
|
| 11 |
+
⚖️ Compare Agents — side-by-side multi-agent comparison
|
| 12 |
+
🌐 3D Visualizer — Three.js trajectory visualization
|
| 13 |
+
📖 API — REST API reference
|
| 14 |
"""
|
| 15 |
import os
|
| 16 |
import json
|
| 17 |
import gradio as gr
|
| 18 |
from server.environment import CodebaseNavEnvironment
|
| 19 |
from server.models import RepoAction
|
| 20 |
+
from server.failure_classifier import FailureClassifier
|
| 21 |
+
from server.strategy_detector import StrategyDetector
|
| 22 |
+
from server.advanced_metrics import AdvancedMetricsEngine
|
| 23 |
+
from server.self_improvement import SelfImprovementEngine
|
| 24 |
+
from server.multi_agent import MultiAgentComparison
|
| 25 |
|
| 26 |
+
# ── Global instances ──────────────────────────────────────────────────────────
|
| 27 |
env = CodebaseNavEnvironment()
|
| 28 |
+
failure_clf = FailureClassifier()
|
| 29 |
+
strategy_det = StrategyDetector()
|
| 30 |
+
adv_metrics_engine = AdvancedMetricsEngine()
|
| 31 |
+
improvement_engine = SelfImprovementEngine()
|
| 32 |
+
multi_agent_engine = MultiAgentComparison()
|
| 33 |
|
| 34 |
|
| 35 |
+
# ── Tab 1: Interactive ────────────────────────────────────────────────────────
|
| 36 |
|
| 37 |
def reset_environment(task: str):
|
|
|
|
| 38 |
try:
|
| 39 |
result = env.reset(task=task)
|
| 40 |
obs = result.observation
|
| 41 |
tree = "\n".join(f" 📄 {f}" for f in obs.repo_tree)
|
| 42 |
failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None listed"
|
| 43 |
+
fi = result.info.get("fault_injection", {})
|
| 44 |
+
faults = ""
|
| 45 |
+
if fi.get("faults_injected"):
|
| 46 |
+
faults = f"\n\n⚠️ Fault Injection ({fi.get('difficulty_multiplier', 1.0):.1f}x):\n"
|
| 47 |
+
faults += "\n".join(f" • {f}" for f in fi["faults_injected"][:5])
|
| 48 |
+
|
| 49 |
+
status = (
|
| 50 |
+
f"✅ Episode Started — {task} (variant: {result.info.get('variant_id', '?')})\n"
|
| 51 |
+
f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
|
| 52 |
+
f"Steps: {obs.steps_remaining} remaining\n\n"
|
| 53 |
+
f"📁 Files:\n{tree}\n\n"
|
| 54 |
f"🔴 Failing Tests: {failing}\n\n"
|
| 55 |
+
f"📋 Task: {obs.task_description}{faults}"
|
| 56 |
)
|
| 57 |
+
return status, "", "0", "0.000"
|
| 58 |
except Exception as e:
|
| 59 |
return f"❌ Error: {e}", "", "0", "0.000"
|
| 60 |
|
| 61 |
|
| 62 |
def take_step(action_type: str, path: str, query: str, content: str):
|
|
|
|
| 63 |
if env.done:
|
| 64 |
+
return "❌ Episode done. Reset first.", "", "", ""
|
|
|
|
| 65 |
try:
|
| 66 |
action = RepoAction(
|
| 67 |
action_type=action_type,
|
| 68 |
+
path=path.strip() or None,
|
| 69 |
+
query=query.strip() or None,
|
| 70 |
+
content=content.strip() or None,
|
| 71 |
)
|
| 72 |
result = env.step(action)
|
| 73 |
obs = result.observation
|
| 74 |
+
result_text = obs.last_action_result or "No output"
|
| 75 |
+
error = f"\n⚠️ {obs.last_action_error}" if obs.last_action_error else ""
|
| 76 |
+
flags = result.info.get("security_flags", [])
|
| 77 |
+
sec = f"\n🔒 Security: {flags}" if flags else ""
|
|
|
|
| 78 |
|
| 79 |
status = (
|
| 80 |
f"Step {result.info['steps_taken']} | "
|
| 81 |
f"Reward: {result.reward:+.3f} | "
|
| 82 |
+
f"Steps left: {obs.steps_remaining}{error}{sec}"
|
| 83 |
)
|
| 84 |
if result.done:
|
| 85 |
+
status += f"\n\n🏁 DONE — Score: {result.info['final_score']:.3f}"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
return (
|
| 88 |
status,
|
| 89 |
+
result_text[:3000],
|
| 90 |
str(result.info["steps_taken"]),
|
| 91 |
f"{result.info.get('cumulative_reward', 0):.3f}",
|
| 92 |
)
|
|
|
|
| 94 |
return f"❌ Error: {e}", "", "", ""
|
| 95 |
|
| 96 |
|
| 97 |
+
# ── Tab 2: Run Agent ──────────────────────────────────────────────────────────
|
| 98 |
+
|
| 99 |
+
def run_builtin_agent(task: str):
|
| 100 |
+
try:
|
| 101 |
+
result = env.reset(task=task)
|
| 102 |
+
obs = result.observation
|
| 103 |
+
log = [
|
| 104 |
+
f"🚀 {task} (variant: {result.info.get('variant_id')})",
|
| 105 |
+
f" Files: {obs.repo_tree}",
|
| 106 |
+
f" Failing: {obs.failing_tests}",
|
| 107 |
+
]
|
| 108 |
+
tree = obs.repo_tree
|
| 109 |
+
test_files = sorted([f for f in tree if f.startswith("tests/")])
|
| 110 |
+
src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
|
| 111 |
+
spec_files = sorted([f for f in tree if f.endswith(".md")])
|
| 112 |
+
steps = 0
|
| 113 |
+
|
| 114 |
+
if task == "task3" and spec_files:
|
| 115 |
+
for sf in spec_files:
|
| 116 |
+
if env.done: break
|
| 117 |
+
r = env.step(RepoAction(action_type="read_file", path=sf))
|
| 118 |
+
steps += 1
|
| 119 |
+
log.append(f" Step {steps}: read_file {sf} → {r.reward:+.3f}")
|
| 120 |
+
|
| 121 |
+
for tf in test_files:
|
| 122 |
+
if env.done: break
|
| 123 |
+
r = env.step(RepoAction(action_type="read_file", path=tf))
|
| 124 |
+
steps += 1
|
| 125 |
+
log.append(f" Step {steps}: read_file {tf} → {r.reward:+.3f}")
|
| 126 |
+
|
| 127 |
+
for sf in src_files:
|
| 128 |
+
if env.done or steps >= 12: break
|
| 129 |
+
r = env.step(RepoAction(action_type="read_file", path=sf))
|
| 130 |
+
steps += 1
|
| 131 |
+
log.append(f" Step {steps}: read_file {sf} → {r.reward:+.3f}")
|
| 132 |
+
|
| 133 |
+
if not env.done and test_files:
|
| 134 |
+
r = env.step(RepoAction(action_type="run_tests", path=test_files[0]))
|
| 135 |
+
steps += 1
|
| 136 |
+
log.append(f" Step {steps}: run_tests → {r.reward:+.3f}")
|
| 137 |
+
|
| 138 |
+
if not env.done:
|
| 139 |
+
r = env.step(RepoAction(action_type="submit"))
|
| 140 |
+
steps += 1
|
| 141 |
+
log.append(f" Step {steps}: submit → {r.reward:+.3f}")
|
| 142 |
+
|
| 143 |
+
log += [
|
| 144 |
+
f"\n🏁 Score: {env.final_score:.3f}",
|
| 145 |
+
f" Steps: {steps}",
|
| 146 |
+
f" Reward: {env.cumulative_reward:.3f}",
|
| 147 |
+
]
|
| 148 |
+
return "\n".join(log)
|
| 149 |
+
except Exception as e:
|
| 150 |
+
return f"❌ Error: {e}"
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# ── Tab 3: Evaluation ─────────────────────────────────────────────────────────
|
| 154 |
+
|
| 155 |
def get_evaluation():
|
|
|
|
| 156 |
try:
|
| 157 |
ev = env.get_evaluation()
|
| 158 |
if "error" in ev:
|
| 159 |
return "No evaluation available. Run an episode first."
|
|
|
|
| 160 |
lines = [
|
| 161 |
f"🎯 Composite Score: {ev['composite_score']:.3f}",
|
| 162 |
+
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 163 |
]
|
| 164 |
for name, dim in ev.get("dimensions", {}).items():
|
| 165 |
bar = "█" * int(dim["score"] * 20) + "░" * (20 - int(dim["score"] * 20))
|
| 166 |
lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
|
| 167 |
+
for e in dim.get("evidence", [])[:2]:
|
| 168 |
lines.append(f" → {e}")
|
|
|
|
| 169 |
if ev.get("strengths"):
|
| 170 |
+
lines += ["\n💪 Strengths:"] + [f" ✅ {s}" for s in ev["strengths"]]
|
|
|
|
|
|
|
|
|
|
| 171 |
if ev.get("failure_analysis"):
|
| 172 |
+
lines += ["\n⚠️ Failures:"] + [f" ❌ {f}" for f in ev["failure_analysis"]]
|
|
|
|
|
|
|
|
|
|
| 173 |
if ev.get("recommendations"):
|
| 174 |
+
lines += ["\n💡 Recommendations:"] + [f" → {r}" for r in ev["recommendations"]]
|
|
|
|
|
|
|
|
|
|
| 175 |
return "\n".join(lines)
|
| 176 |
except Exception as e:
|
| 177 |
return f"Error: {e}"
|
| 178 |
|
| 179 |
|
| 180 |
def get_metrics():
|
|
|
|
| 181 |
try:
|
| 182 |
+
return json.dumps(env.get_metrics(), indent=2, default=str)
|
|
|
|
| 183 |
except Exception as e:
|
| 184 |
return f"Error: {e}"
|
| 185 |
|
| 186 |
|
| 187 |
def get_trajectory():
|
|
|
|
| 188 |
try:
|
| 189 |
t = env.get_trajectory()
|
| 190 |
if not t:
|
| 191 |
+
return "No trajectory. Run an episode first."
|
|
|
|
| 192 |
lines = [
|
| 193 |
+
f"Episode: {t.get('episode_id')}",
|
| 194 |
+
f"Task: {t.get('task')} | Variant: {t.get('variant_id')}",
|
| 195 |
+
f"Score: {t.get('final_score', 0):.3f} | Duration: {t.get('duration_seconds', '?')}s",
|
| 196 |
+
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 197 |
]
|
| 198 |
+
emojis = {"read_file": "📖", "write_file": "✏️", "run_tests": "🧪",
|
| 199 |
+
"search_code": "🔍", "submit": "🏁"}
|
| 200 |
for step in t.get("steps", []):
|
| 201 |
+
em = emojis.get(step["action_type"], "•")
|
| 202 |
+
p = step.get("action_path") or step.get("action_query") or ""
|
| 203 |
+
err = " ❌" if step.get("error") else ""
|
|
|
|
|
|
|
|
|
|
| 204 |
lines.append(
|
| 205 |
+
f" {em} {step['step_number']:2d}: {step['action_type']:12s} {p:30s} "
|
| 206 |
+
f"reward={step['reward']:+.3f} ({step['duration_ms']:.0f}ms){err}"
|
|
|
|
|
|
|
| 207 |
)
|
| 208 |
return "\n".join(lines)
|
| 209 |
except Exception as e:
|
| 210 |
return f"Error: {e}"
|
| 211 |
|
| 212 |
|
| 213 |
+
# ── Tab 4: Intelligence ───────────────────────────────────────────────────────
|
| 214 |
+
|
| 215 |
+
def get_failure_classification():
|
| 216 |
try:
|
| 217 |
+
traj = env.get_trajectory()
|
| 218 |
+
if not traj:
|
| 219 |
+
return "No trajectory. Run an episode first."
|
| 220 |
+
meta = env.variant.meta if env.variant else {}
|
| 221 |
+
report = failure_clf.classify(
|
| 222 |
+
episode_id=traj.get("episode_id", ""),
|
| 223 |
+
task=env.current_task or "unknown",
|
| 224 |
+
trajectory_steps=traj.get("steps", []),
|
| 225 |
+
variant_meta=meta,
|
| 226 |
+
files_read=list(env.files_read),
|
| 227 |
+
files_written=list(env.files_written),
|
| 228 |
+
final_score=env.final_score,
|
| 229 |
+
security_violations=env.security_violations,
|
| 230 |
+
)
|
| 231 |
+
d = report.to_dict()
|
| 232 |
+
lines = [
|
| 233 |
+
f"{'✅ SUCCESS' if d['success'] else '❌ FAILURE'}",
|
| 234 |
+
f"Primary Failure Type: {d['primary_failure']}",
|
| 235 |
+
f"Failures Detected: {d['failure_count']}",
|
| 236 |
+
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 237 |
+
]
|
| 238 |
+
for f in d.get("failures", []):
|
| 239 |
+
lines += [
|
| 240 |
+
f"\n[{f['severity'].upper()}] {f['type']} @ Step {f['step']}",
|
| 241 |
+
f" Evidence: {f['evidence']}",
|
| 242 |
+
f" Root Cause: {f['root_cause']}",
|
| 243 |
+
f" Fix: {f['remediation']}",
|
| 244 |
+
]
|
| 245 |
+
if d.get("failure_summary"):
|
| 246 |
+
lines += ["\n📋 Summary:", f" {d['failure_summary']}"]
|
| 247 |
+
if d.get("retry_hint"):
|
| 248 |
+
lines += ["\n🔁 Retry Hint:", f" {d['retry_hint']}"]
|
| 249 |
+
return "\n".join(lines)
|
| 250 |
+
except Exception as e:
|
| 251 |
+
return f"Error: {e}"
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
+
def get_strategy_detection():
|
| 255 |
+
try:
|
| 256 |
+
traj = env.get_trajectory()
|
| 257 |
+
if not traj:
|
| 258 |
+
return "No trajectory. Run an episode first."
|
| 259 |
+
meta = env.variant.meta if env.variant else {}
|
| 260 |
+
report = strategy_det.detect(
|
| 261 |
+
trajectory_steps=traj.get("steps", []),
|
| 262 |
+
task=env.current_task or "unknown",
|
| 263 |
+
variant_meta=meta,
|
| 264 |
+
files_read=list(env.files_read),
|
| 265 |
+
final_score=env.final_score,
|
| 266 |
+
)
|
| 267 |
+
d = report.to_dict()
|
| 268 |
+
score_bar = "█" * int(d["score"] * 20) + "░" * (20 - int(d["score"] * 20))
|
| 269 |
+
lines = [
|
| 270 |
+
f"🧭 Strategy: {d['strategy']}",
|
| 271 |
+
f" Score: [{score_bar}] {d['score']:.3f}",
|
| 272 |
+
f" Confidence: {d['confidence']:.0%}",
|
| 273 |
+
f"\n📖 {d['strategy_description']}",
|
| 274 |
+
f"\n📊 Exploration Ratio: {d['exploration_ratio']:.2f} "
|
| 275 |
+
f"({'explore-heavy' if d['exploration_ratio'] > 0.6 else 'exploit-heavy' if d['exploration_ratio'] < 0.4 else 'balanced'})",
|
| 276 |
+
f" Strategy Pivots: {d['pivot_count']}",
|
| 277 |
+
]
|
| 278 |
+
if d.get("sub_patterns"):
|
| 279 |
+
lines += ["\n🔖 Sub-patterns:"] + [f" • {p}" for p in d["sub_patterns"]]
|
| 280 |
+
if d.get("evidence"):
|
| 281 |
+
lines += ["\n🔍 Evidence:"] + [f" → {e}" for e in d["evidence"]]
|
| 282 |
+
return "\n".join(lines)
|
| 283 |
+
except Exception as e:
|
| 284 |
+
return f"Error: {e}"
|
| 285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
+
def get_advanced_metrics():
|
| 288 |
+
try:
|
| 289 |
+
traj = env.get_trajectory()
|
| 290 |
+
if not traj:
|
| 291 |
+
return "No trajectory. Run an episode first."
|
| 292 |
+
meta = env.variant.meta if env.variant else {}
|
| 293 |
+
report = adv_metrics_engine.compute(
|
| 294 |
+
trajectory_steps=traj.get("steps", []),
|
| 295 |
+
variant_meta=meta,
|
| 296 |
+
final_score=env.final_score,
|
| 297 |
+
files_read=list(env.files_read),
|
| 298 |
+
files_written=list(env.files_written),
|
| 299 |
+
)
|
| 300 |
+
d = report.to_dict()
|
| 301 |
+
|
| 302 |
+
def bar(v):
|
| 303 |
+
return "█" * int(v * 20) + "░" * (20 - int(v * 20))
|
| 304 |
|
| 305 |
+
lines = [
|
| 306 |
+
"⚡ ADVANCED METRICS",
|
| 307 |
+
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 308 |
+
f" Reasoning Efficiency [{bar(d['reasoning_efficiency'])}] {d['reasoning_efficiency']:.3f}",
|
| 309 |
+
f" Reliability Index [{bar(d['reliability_index'])}] {d['reliability_index']:.3f}",
|
| 310 |
+
f" Exploration Ratio [{bar(d['exploration_ratio'])}] {d['exploration_ratio']:.3f}",
|
| 311 |
+
f" Decision Entropy [{bar(d['decision_entropy'])}] {d['decision_entropy']:.3f}",
|
| 312 |
+
f" Wasteful Ratio [{bar(d['wasteful_ratio'])}] {d['wasteful_ratio']:.3f}",
|
| 313 |
+
f" Pivot Rate {d['pivot_rate']:.2f} per 10 steps",
|
| 314 |
+
f" Consistency [{bar(d['consistency_score'])}] {d['consistency_score']:.3f} ({d['runs_analyzed']} runs)",
|
| 315 |
+
"\n📊 Action Distribution:",
|
| 316 |
+
]
|
| 317 |
+
for action, count in d.get("action_distribution", {}).items():
|
| 318 |
+
lines.append(f" {action:15s}: {count}")
|
| 319 |
+
if d.get("useful_actions"):
|
| 320 |
+
lines += ["\n✅ Useful Actions:"] + [f" • {a}" for a in d["useful_actions"]]
|
| 321 |
+
if d.get("wasteful_actions"):
|
| 322 |
+
lines += ["\n⚠️ Wasteful Actions:"] + [f" • {a}" for a in d["wasteful_actions"]]
|
| 323 |
+
lines += ["\n🔒 Reliability Breakdown:"]
|
| 324 |
+
for k, v in d.get("reliability_breakdown", {}).items():
|
| 325 |
+
lines.append(f" {k:15s}: {v:.3f}")
|
| 326 |
+
return "\n".join(lines)
|
| 327 |
+
except Exception as e:
|
| 328 |
+
return f"Error: {e}"
|
| 329 |
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
+
# ── Tab 5: Self-Improve ───────────────────────────────────────────────────────
|
| 332 |
+
|
| 333 |
+
def get_improvement_plan():
|
| 334 |
+
try:
|
| 335 |
+
traj = env.get_trajectory()
|
| 336 |
+
if not traj:
|
| 337 |
+
return "No trajectory. Run an episode first."
|
| 338 |
+
meta = env.variant.meta if env.variant else {}
|
| 339 |
+
steps = traj.get("steps", [])
|
| 340 |
+
|
| 341 |
+
fail_report = failure_clf.classify(
|
| 342 |
+
episode_id=traj.get("episode_id", ""),
|
| 343 |
+
task=env.current_task or "unknown",
|
| 344 |
+
trajectory_steps=steps,
|
| 345 |
+
variant_meta=meta,
|
| 346 |
+
files_read=list(env.files_read),
|
| 347 |
+
files_written=list(env.files_written),
|
| 348 |
+
final_score=env.final_score,
|
| 349 |
+
security_violations=env.security_violations,
|
| 350 |
+
)
|
| 351 |
+
plan = improvement_engine.generate_improvement_plan(
|
| 352 |
+
episode_id=traj.get("episode_id", ""),
|
| 353 |
+
task=env.current_task or "unknown",
|
| 354 |
+
failure_type=fail_report.primary_failure,
|
| 355 |
+
failure_evidence=[f.evidence for f in fail_report.failures],
|
| 356 |
+
original_score=env.final_score,
|
| 357 |
+
trajectory_steps=steps,
|
| 358 |
+
files_read=list(env.files_read),
|
| 359 |
+
files_written=list(env.files_written),
|
| 360 |
+
)
|
| 361 |
+
d = plan.to_dict()
|
| 362 |
+
lines = [
|
| 363 |
+
f"🔁 SELF-IMPROVEMENT PLAN",
|
| 364 |
+
f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 365 |
+
f"Original Score: {d['original_score']:.3f}",
|
| 366 |
+
f"Failure Type: {d['failure_type']}",
|
| 367 |
+
f"\n❌ What Went Wrong:\n {d['what_went_wrong']}",
|
| 368 |
+
f"\n🎯 Improved Strategy:\n {d['improved_strategy']}",
|
| 369 |
+
f"\n📋 Step-by-Step Plan:",
|
| 370 |
+
]
|
| 371 |
+
for step in d.get("step_by_step_plan", []):
|
| 372 |
+
lines.append(f" {step}")
|
| 373 |
+
if d.get("specific_errors"):
|
| 374 |
+
lines += ["\n🔎 Specific Errors:"] + [f" • {e}" for e in d["specific_errors"][:5]]
|
| 375 |
+
lines += [
|
| 376 |
+
"\n💉 System Prompt Injection (for next LLM run):",
|
| 377 |
+
"─────────────────────────────────────",
|
| 378 |
+
d.get("system_prompt_addon", "No injection needed."),
|
| 379 |
+
]
|
| 380 |
+
return "\n".join(lines)
|
| 381 |
+
except Exception as e:
|
| 382 |
+
return f"Error: {e}"
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
# ── Tab 6: Compare Agents ─────────────────────────────────────────────────────
|
| 386 |
+
|
| 387 |
+
def run_comparison(task: str, selected_agents: list):
|
| 388 |
+
try:
|
| 389 |
+
agents = selected_agents if selected_agents else None
|
| 390 |
+
report = multi_agent_engine.compare(env, task=task, agents=agents)
|
| 391 |
+
d = report.to_dict()
|
| 392 |
+
|
| 393 |
+
lines = [
|
| 394 |
+
f"⚖️ MULTI-AGENT COMPARISON — {task} (variant: {d.get('variant_id')})",
|
| 395 |
+
f"🏆 Winner: {d.get('winner')} (score: {d.get('winner_score', 0):.3f})",
|
| 396 |
+
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 397 |
+
f"{'Rank':<6} {'Agent':<16} {'Score':<8} {'Steps':<8} {'Strategy':<22} {'Failure':<22} {'Reliability':<12}",
|
| 398 |
+
"─" * 100,
|
| 399 |
+
]
|
| 400 |
+
for row in d.get("summary_table", []):
|
| 401 |
+
lines.append(
|
| 402 |
+
f"#{row['rank']:<5} {row['agent']:<16} {row['score']:<8.3f} "
|
| 403 |
+
f"{row['steps']:<8} {row['strategy']:<22} {row['failure']:<22} {row['reliability']:<12.3f}"
|
| 404 |
+
)
|
| 405 |
+
lines.append("━" * 100)
|
| 406 |
+
|
| 407 |
+
if d.get("insights"):
|
| 408 |
+
lines += ["\n💡 Insights:"] + [f" → {i}" for i in d["insights"]]
|
| 409 |
+
|
| 410 |
+
lines.append("\n📊 Per-Agent Action Sequences:")
|
| 411 |
+
for run in d.get("detailed_runs", []):
|
| 412 |
+
seq = " → ".join(run.get("action_sequence", []))
|
| 413 |
+
lines.append(f" {run['agent_name']:16s}: {seq}")
|
| 414 |
+
|
| 415 |
+
return "\n".join(lines)
|
| 416 |
except Exception as e:
|
| 417 |
return f"❌ Error: {e}"
|
| 418 |
|
| 419 |
|
| 420 |
+
# ── Tab 7: 3D Visualizer ──────────────────────────────────────────────────────
|
| 421 |
+
|
| 422 |
+
def get_viz_html():
    """Generate the 3D visualizer HTML with current trajectory data injected.

    Loads static/viz3d.html, serializes the current episode (repo files,
    dependency edges, per-step actions, detected strategy) to JSON, and injects
    it into the hidden ``#viz-data`` div that the visualizer's JS reads.

    Returns:
        The full HTML document as a string, or a red error paragraph when the
        static template is missing.
    """
    # Load the static HTML template.
    static_path = os.path.join(os.path.dirname(__file__), "static", "viz3d.html")
    if not os.path.exists(static_path):
        return "<p style='color:red'>viz3d.html not found in static/</p>"

    # Fix: read explicitly as UTF-8. The template contains non-ASCII glyphs
    # (arrows, emoji), so relying on the platform default encoding (e.g.
    # cp1252 on Windows) could raise UnicodeDecodeError or mangle the page.
    with open(static_path, "r", encoding="utf-8") as f:
        html = f.read()

    # Get viz data from the current environment (empty string when no episode).
    traj = env.get_trajectory()
    if traj:
        meta = env.variant.meta if env.variant else {}
        bug_files = set(meta.get("bug_files", []))
        files = []
        if env.variant:
            for fname in env.variant.get_tree():
                # Classify each file for node coloring: test / spec / src.
                ftype = "test" if fname.startswith("tests/") else \
                        "spec" if fname.endswith(".md") else "src"
                files.append({
                    "name": fname,
                    "type": ftype,
                    "is_bug_file": fname in bug_files,
                    "visited": fname in env.files_read,
                    "modified": fname in env.files_written,
                })

        # Naive dependency edges: every test file points at every source file.
        test_files = [f["name"] for f in files if f["type"] == "test"]
        src_files = [f["name"] for f in files if f["type"] == "src"]
        deps = []
        for tf in test_files:
            for sf in src_files:
                deps.append({"from": tf, "to": sf})

        # Flatten the trajectory into the minimal per-step fields the JS needs.
        steps_data = []
        for step in traj.get("steps", []):
            steps_data.append({
                "step": step.get("step_number", 0),
                "action": step.get("action_type", ""),
                "path": step.get("action_path"),
                "reward": step.get("reward", 0.0),
                "error": step.get("error"),
                "pass_rate": step.get("test_pass_rate"),
            })

        # Strategy label for the HUD; only meaningful when steps exist.
        strategy_report = strategy_det.detect(
            traj.get("steps", []),
            env.current_task or "unknown",
            meta,
            list(env.files_read),
            env.final_score,
        ) if traj.get("steps") else None

        viz_data = {
            "task": env.current_task or "unknown",
            "variant_id": traj.get("variant_id", "unknown"),
            "final_score": env.final_score,
            "strategy": strategy_report.strategy if strategy_report else "UNKNOWN",
            "failure_type": "—",
            "files": files,
            "dependencies": deps,
            "steps": steps_data,
        }
        data_json = json.dumps(viz_data)
    else:
        data_json = ""

    # Inject data into HTML (the template ships with an empty placeholder div).
    html = html.replace(
        '<div id="viz-data" style="display:none"></div>',
        f'<div id="viz-data" style="display:none">{data_json}</div>'
    )
    return html
| 496 |
|
| 497 |
+
|
| 498 |
+
# ── Build Gradio UI ───────────────────────────────────────────────────────────
|
| 499 |
+
|
| 500 |
+
# Top-level Gradio UI: eight tabs wired to the callback functions defined above.
with gr.Blocks(title="Codebase Navigation & Repair — OpenEnv v3") as demo:
    gr.Markdown(
        "# 🔍 Codebase Navigation & Repair — OpenEnv v3\n"
        "**The most advanced debugging + evaluation platform for AI coding agents.** "
        "Navigate codebases · Fix bugs · Evaluate process · Visualize in 3D."
    )

    with gr.Tabs():

        # ── Tab 1: Interactive ────────────────────────────────────────────────
        # Manual step-by-step control of the environment (human-in-the-loop).
        with gr.TabItem("🎮 Interactive"):
            with gr.Row():
                with gr.Column(scale=1):
                    task_select = gr.Dropdown(
                        ["task1", "task2", "task3"], value="task1",
                        label="Task",
                        info="task1=bugs, task2=cross-module, task3=feature impl"
                    )
                    reset_btn = gr.Button("🔄 Reset Environment", variant="primary")
                    gr.Markdown("### Action")
                    act_type = gr.Dropdown(
                        ["read_file", "write_file", "run_tests", "search_code", "submit"],
                        value="read_file", label="Action Type",
                    )
                    act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
                    act_query = gr.Textbox(label="Query (search_code)", placeholder="validate_token")
                    act_content = gr.Textbox(label="Content (write_file)", lines=4)
                    step_btn = gr.Button("▶️ Execute Step", variant="secondary")
                with gr.Column(scale=2):
                    status_box = gr.Textbox(label="Status", lines=14, interactive=False)
                    result_box = gr.Textbox(label="Last Result", lines=8, interactive=False)
                    with gr.Row():
                        steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
                        reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
            reset_btn.click(reset_environment, [task_select], [status_box, result_box, steps_box, reward_box])
            step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])

        # ── Tab 2: Run Agent ──────────────────────────────────────────────────
        # One-click run of the deterministic demonstration agent.
        with gr.TabItem("🤖 Run Agent"):
            gr.Markdown("### Built-in Demonstration Agent\nRuns deterministic read→submit strategy.")
            agent_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
            run_btn = gr.Button("🚀 Run Agent", variant="primary")
            agent_output = gr.Textbox(label="Agent Log", lines=20, interactive=False)
            run_btn.click(run_builtin_agent, [agent_task], [agent_output])

        # ── Tab 3: Evaluation ─────────────────────────────────────────────────
        # Post-episode evaluation, raw metrics JSON, and trajectory dump.
        with gr.TabItem("📊 Evaluation"):
            with gr.Row():
                eval_btn = gr.Button("🎯 Evaluation Report", variant="primary")
                metrics_btn = gr.Button("📈 Metrics JSON", variant="secondary")
                traj_btn = gr.Button("🗺️ Trajectory", variant="secondary")
            eval_out = gr.Textbox(label="Output", lines=28, interactive=False)
            eval_btn.click(get_evaluation, outputs=[eval_out])
            metrics_btn.click(get_metrics, outputs=[eval_out])
            traj_btn.click(get_trajectory, outputs=[eval_out])

        # ── Tab 4: 🧠 Intelligence ─────────────────────────────────────────────
        # Failure classification, strategy detection, advanced behavioral metrics.
        with gr.TabItem("🧠 Intelligence"):
            gr.Markdown(
                "### Deep Agent Intelligence Analysis\n"
                "Failure classification, strategy detection, and advanced behavioral metrics."
            )
            with gr.Row():
                classify_btn = gr.Button("🔬 Classify Failure", variant="primary")
                strategy_btn = gr.Button("🧭 Detect Strategy", variant="secondary")
                adv_btn = gr.Button("⚡ Advanced Metrics", variant="secondary")
            intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
            classify_btn.click(get_failure_classification, outputs=[intel_out])
            strategy_btn.click(get_strategy_detection, outputs=[intel_out])
            adv_btn.click(get_advanced_metrics, outputs=[intel_out])

        # ── Tab 5: 🔁 Self-Improve ─────────────────────────────────────────────
        # Generates an improvement plan + system-prompt injection after a failure.
        with gr.TabItem("🔁 Self-Improve"):
            gr.Markdown(
                "### Self-Improvement Loop\n"
                "After a failure, this generates an actionable improvement plan and a "
                "system prompt injection for the agent's next attempt."
            )
            improve_btn = gr.Button("🔁 Generate Improvement Plan", variant="primary")
            improve_out = gr.Textbox(label="Improvement Plan", lines=32, interactive=False)
            improve_btn.click(get_improvement_plan, outputs=[improve_out])

        # ── Tab 6: ⚖️ Compare ──────────────────────────────────────────────────
        # Side-by-side comparison of the four built-in agent strategies.
        with gr.TabItem("⚖️ Compare Agents"):
            gr.Markdown(
                "### Multi-Agent Strategy Comparison\n"
                "Runs 4 built-in agent strategies on the same task to compare "
                "efficiency, strategy, and reliability side-by-side."
            )
            with gr.Row():
                comp_task = gr.Dropdown(["task1", "task2", "task3"], value="task1", label="Task")
                comp_agents = gr.CheckboxGroup(
                    ["test-first", "search-first", "minimal", "exhaustive"],
                    value=["test-first", "search-first", "minimal", "exhaustive"],
                    label="Agents to Compare",
                )
            comp_btn = gr.Button("⚖️ Run Comparison", variant="primary")
            comp_out = gr.Textbox(label="Comparison Report", lines=30, interactive=False)
            comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])

        # ── Tab 7: 🌐 3D Visualizer ────────────────────────────────────────────
        # Embeds static/viz3d.html with the latest trajectory injected.
        with gr.TabItem("🌐 3D Visualizer"):
            gr.Markdown(
                "### Agent Trajectory 3D Visualization\n"
                "Files = 3D nodes · Dependencies = edges · Agent path = animated beam · "
                "Timeline = scrubbable replay. **Run an episode first, then refresh.**"
            )
            refresh_viz_btn = gr.Button("🔄 Load Trajectory into Visualizer", variant="primary")
            viz_html = gr.HTML(value="<p style='color:#64748b;text-align:center;padding:40px'>Click 'Load Trajectory' after running an episode.</p>")
            refresh_viz_btn.click(get_viz_html, outputs=[viz_html])

        # ── Tab 8: API ────────────────────────────────────────────────────────
        # Static documentation of the REST surface exposed by server/app.py.
        with gr.TabItem("📖 API"):
            gr.Markdown("""
            ### REST API — v3.0 Endpoints

            #### Core (OpenEnv-compliant)
            | Endpoint | Method | Description |
            |----------|--------|-------------|
            | `/reset?task=task1` | POST | Start new episode |
            | `/step` | POST | Take action |
            | `/state` | GET | Current state |
            | `/health` | GET | Health check |

            #### Evaluation
            | Endpoint | Method | Description |
            |----------|--------|-------------|
            | `/trajectory` | GET | Full action log |
            | `/evaluate` | GET | 6-dimension scores |
            | `/metrics` | GET | Memory + security stats |
            | `/fault-config` | POST | Enable fault injection |

            #### Intelligence (NEW in v3)
            | Endpoint | Method | Description |
            |----------|--------|-------------|
            | `/classify` | GET | Typed failure classification |
            | `/strategy` | GET | Behavioral strategy detection |
            | `/advanced-metrics` | GET | Entropy, reliability, consistency |
            | `/improvement-plan` | GET | Self-improvement feedback |
            | `/compare-agents` | POST | Multi-agent comparison |
            | `/viz-data` | GET | 3D visualization data |

            ```bash
            BASE="http://localhost:7860"
            curl -X POST "$BASE/reset?task=task1"
            curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"src/auth.py"}'
            curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
            curl "$BASE/classify"
            curl "$BASE/strategy"
            curl "$BASE/advanced-metrics"
            curl "$BASE/improvement-plan"
            curl -X POST "$BASE/compare-agents?task=task1"
            ```
            """)
|
| 654 |
|
| 655 |
|
| 656 |
+
# ── Mount FastAPI under same process ──────────────────────────────────────────
|
| 657 |
from server.app import app as fastapi_app
|
|
|
|
| 658 |
gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
|
| 659 |
|
| 660 |
if __name__ == "__main__":
|
server/advanced_metrics.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/advanced_metrics.py
|
| 2 |
+
"""
|
| 3 |
+
Advanced Metrics Engine.
|
| 4 |
+
|
| 5 |
+
Computes metrics that existing benchmarks (SWE-bench, etc.) completely ignore:
|
| 6 |
+
- Exploration vs Exploitation ratio across episode
|
| 7 |
+
- Consistency score across multiple runs of same task
|
| 8 |
+
- Reliability index (weighted aggregate)
|
| 9 |
+
- Reasoning efficiency (useful actions / total actions)
|
| 10 |
+
- Decision entropy (how predictable/focused the agent is)
|
| 11 |
+
"""
|
| 12 |
+
import math
|
| 13 |
+
from typing import List, Dict, Any, Optional
|
| 14 |
+
from dataclasses import dataclass, field
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class AdvancedMetricsReport:
    """All advanced metrics for one episode or cross-episode comparison."""

    # Per-episode behavioral metrics
    reasoning_efficiency: float   # Useful steps / total steps
    exploration_ratio: float      # Read+search vs write+test ratio
    decision_entropy: float       # Shannon entropy of action distribution
    reliability_index: float      # Composite reliability score
    pivot_rate: float             # Strategy changes per 10 steps
    wasteful_ratio: float         # Redundant actions / total actions

    # Cross-episode metrics (populated when history is provided)
    consistency_score: float = 0.0  # Lower score variance across runs => higher consistency
    runs_analyzed: int = 0

    # Supporting breakdowns
    action_distribution: Dict[str, int] = field(default_factory=dict)
    useful_actions: List[str] = field(default_factory=list)
    wasteful_actions: List[str] = field(default_factory=list)
    reliability_breakdown: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict, rounding every float to 3 places."""
        scalar_names = (
            "reasoning_efficiency",
            "exploration_ratio",
            "decision_entropy",
            "reliability_index",
            "pivot_rate",
            "wasteful_ratio",
            "consistency_score",
        )
        payload = {name: round(getattr(self, name), 3) for name in scalar_names}
        payload["runs_analyzed"] = self.runs_analyzed
        payload["action_distribution"] = self.action_distribution
        payload["useful_actions"] = self.useful_actions
        payload["wasteful_actions"] = self.wasteful_actions
        payload["reliability_breakdown"] = {
            key: round(val, 3) for key, val in self.reliability_breakdown.items()
        }
        return payload
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class AdvancedMetricsEngine:
    """
    Computes advanced behavioral and reliability metrics from trajectory data.

    Usage:
        engine = AdvancedMetricsEngine()
        report = engine.compute(
            trajectory_steps=[...],
            variant_meta={...},
            final_score=0.7,
            files_read=[...],
            files_written=[...],
            history=[],  # Pass previous episode scores for consistency
        )
    """

    def __init__(self):
        self._score_history: List[float] = []  # Tracks scores across episodes

    def compute(
        self,
        trajectory_steps: List[dict],
        variant_meta: Dict[str, Any],
        final_score: float,
        files_read: List[str],
        files_written: List[str],
        history: Optional[List[float]] = None,
    ) -> AdvancedMetricsReport:
        """Compute all advanced metrics for one episode.

        Args:
            trajectory_steps: Per-step dicts (action_type, action_path, error,
                test_pass_rate, security_flags, ...) from the environment log.
            variant_meta: Task variant metadata (bug_files, interface_files, ...).
            final_score: The episode's final score in [0, 1].
            files_read / files_written: Paths touched during the episode.
            history: Optional explicit score history; when omitted, this
                engine's own accumulated score history is used instead.
        """
        # Record this score in history
        self._score_history.append(final_score)

        if not trajectory_steps:
            # Degenerate episode: no actions taken, so everything is "wasteful"
            # and exploration defaults to the neutral 0.5 midpoint.
            return AdvancedMetricsReport(
                reasoning_efficiency=0.0,
                exploration_ratio=0.5,
                decision_entropy=0.0,
                reliability_index=0.0,
                pivot_rate=0.0,
                wasteful_ratio=1.0,
            )

        action_seq = [s.get("action_type", "unknown") for s in trajectory_steps]
        total = len(action_seq)

        # ── Action distribution ───────────────────────────────────────────────
        from collections import Counter
        dist = Counter(action_seq)
        action_distribution = dict(dist)

        # ── Decision entropy (Shannon entropy of action types) ────────────────
        entropy = 0.0
        for count in dist.values():
            p = count / total
            if p > 0:
                entropy -= p * math.log2(p)
        # Normalize by max possible entropy (log2 of unique action types)
        max_entropy = math.log2(len(dist)) if len(dist) > 1 else 1.0
        normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0.0

        # ── Exploration vs exploitation ratio ─────────────────────────────────
        # Reads/searches count as exploration; writes/tests as exploitation.
        explore = dist.get("read_file", 0) + dist.get("search_code", 0)
        exploit = dist.get("write_file", 0) + dist.get("run_tests", 0)
        exploration_ratio = explore / (explore + exploit) if (explore + exploit) > 0 else 0.5

        # ── Redundancy / wasteful actions ─────────────────────────────────────
        read_paths = [
            s.get("action_path")
            for s in trajectory_steps
            if s.get("action_type") == "read_file" and s.get("action_path")
        ]
        seen = set()
        redundant_reads = 0
        for p in read_paths:
            # A re-read of an already-seen path is counted as redundant.
            if p in seen:
                redundant_reads += 1
            seen.add(p)

        error_actions = sum(1 for s in trajectory_steps if s.get("error"))
        total_wasteful = redundant_reads + error_actions
        wasteful_ratio = total_wasteful / total if total > 0 else 0.0

        wasteful_actions = []
        if redundant_reads > 0:
            wasteful_actions.append(f"{redundant_reads}x redundant file reads")
        if error_actions > 0:
            wasteful_actions.append(f"{error_actions}x actions that produced errors")

        # ── Useful action detection ───────────────────────────────────────────
        # "Relevant" files are those the variant metadata marks as important.
        useful_actions = []
        relevant = set(
            variant_meta.get("bug_files", []) +
            variant_meta.get("interface_files", []) +
            variant_meta.get("read_first_files", []) +
            variant_meta.get("files_to_implement", [])
        )
        relevant_reads = [f for f in files_read if f in relevant]
        if relevant_reads:
            useful_actions.append(f"Read {len(relevant_reads)} key files: {relevant_reads[:3]}")

        test_rates = [
            s.get("test_pass_rate")
            for s in trajectory_steps
            if s.get("test_pass_rate") is not None
        ]
        if len(test_rates) >= 2 and test_rates[-1] > test_rates[0]:
            useful_actions.append(
                f"Test pass rate improved from {test_rates[0]:.2f} to {test_rates[-1]:.2f}"
            )

        if files_written:
            useful_actions.append(f"Wrote {len(files_written)} file(s): {files_written[:3]}")

        # ── Reasoning efficiency ──────────────────────────────────────────────
        # Useful steps (relevant reads + any write + any test run) over total.
        useful_count = len(relevant_reads) + (1 if files_written else 0) + (1 if test_rates else 0)
        reasoning_efficiency = min(1.0, useful_count / max(total, 1))

        # ── Pivot rate (strategy switches per 10 steps) ───────────────────────
        # A pivot is any transition between the explore and exploit phases.
        pivots = 0
        for i in range(1, len(action_seq)):
            prev_explore = action_seq[i-1] in ("read_file", "search_code")
            curr_exploit = action_seq[i] in ("write_file", "run_tests")
            prev_exploit = action_seq[i-1] in ("write_file", "run_tests")
            curr_explore = action_seq[i] in ("read_file", "search_code")
            if (prev_explore and curr_exploit) or (prev_exploit and curr_explore):
                pivots += 1
        pivot_rate = (pivots / total) * 10 if total > 0 else 0.0  # per 10 steps

        # ── Reliability index ─────────────────────────────────────────────────
        # Weighted aggregate: correctness matters most
        reliability_breakdown = {
            "correctness": final_score,
            "efficiency": max(0.0, 1.0 - wasteful_ratio),
            "focus": 1.0 - normalized_entropy,  # Low entropy = focused behavior
            "verification": 1.0 if test_rates else 0.0,
            "safety": 1.0,  # Will be reduced by security violations
        }

        # Check for security flags: each flag deducts 0.2 from safety (floor 0).
        sec_flags = sum(len(s.get("security_flags", [])) for s in trajectory_steps)
        if sec_flags > 0:
            reliability_breakdown["safety"] = max(0.0, 1.0 - sec_flags * 0.2)

        # Weighted reliability index
        weights = {
            "correctness": 0.40,
            "efficiency": 0.20,
            "focus": 0.15,
            "verification": 0.15,
            "safety": 0.10,
        }
        reliability_index = sum(
            reliability_breakdown[k] * weights[k]
            for k in weights
        )

        # ── Consistency score (cross-episode) ────────────────────────────────
        # Prefer the caller-supplied history; otherwise use this engine's own.
        scores_to_use = list(history) if history else self._score_history
        consistency_score = 0.0
        runs_analyzed = len(scores_to_use)

        if runs_analyzed >= 2:
            mean = sum(scores_to_use) / runs_analyzed
            variance = sum((s - mean) ** 2 for s in scores_to_use) / runs_analyzed
            std_dev = math.sqrt(variance)
            # Consistency = 1 - normalized_std_dev (higher = more consistent)
            consistency_score = max(0.0, 1.0 - (std_dev / max(mean, 0.01)))

        return AdvancedMetricsReport(
            reasoning_efficiency=reasoning_efficiency,
            exploration_ratio=exploration_ratio,
            decision_entropy=normalized_entropy,
            reliability_index=reliability_index,
            pivot_rate=pivot_rate,
            wasteful_ratio=wasteful_ratio,
            consistency_score=consistency_score,
            runs_analyzed=runs_analyzed,
            action_distribution=action_distribution,
            useful_actions=useful_actions,
            wasteful_actions=wasteful_actions,
            reliability_breakdown=reliability_breakdown,
        )

    def get_score_history(self) -> List[float]:
        """Return a copy of all episode scores recorded so far."""
        return list(self._score_history)

    def reset_history(self):
        """Clear the accumulated cross-episode score history."""
        self._score_history = []
|
server/app.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
| 1 |
# server/app.py
|
| 2 |
"""
|
| 3 |
-
FastAPI server
|
| 4 |
|
| 5 |
-
Core endpoints:
|
| 6 |
-
Evaluation endpoints:
|
| 7 |
-
Control endpoints:
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
from fastapi import FastAPI, HTTPException
|
|
|
|
| 10 |
from contextlib import asynccontextmanager
|
|
|
|
| 11 |
|
| 12 |
from .environment import CodebaseNavEnvironment
|
| 13 |
from .models import (
|
|
@@ -15,9 +19,19 @@ from .models import (
|
|
| 15 |
TrajectoryResponse, EvaluationResponse, MetricsResponse,
|
| 16 |
FaultConfigRequest,
|
| 17 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
# Global
|
| 20 |
env = CodebaseNavEnvironment()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
@asynccontextmanager
|
|
@@ -27,45 +41,41 @@ async def lifespan(app: FastAPI):
|
|
| 27 |
|
| 28 |
|
| 29 |
app = FastAPI(
|
| 30 |
-
title="Codebase Navigation & Repair — OpenEnv",
|
| 31 |
description=(
|
| 32 |
-
"RL environment
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
),
|
| 36 |
-
version="
|
| 37 |
lifespan=lifespan,
|
| 38 |
)
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
|
|
|
| 42 |
|
| 43 |
@app.post("/reset", response_model=ResetResult)
|
| 44 |
async def reset(task: str = "task1"):
|
| 45 |
-
"""
|
| 46 |
-
Start a new episode.
|
| 47 |
-
task: "task1" | "task2" | "task3"
|
| 48 |
-
"""
|
| 49 |
valid_tasks = ["task1", "task2", "task3"]
|
| 50 |
if task not in valid_tasks:
|
| 51 |
raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
|
| 52 |
try:
|
| 53 |
-
|
| 54 |
-
return result
|
| 55 |
except Exception as e:
|
| 56 |
raise HTTPException(status_code=500, detail=str(e))
|
| 57 |
|
| 58 |
|
| 59 |
@app.post("/step", response_model=StepResult)
|
| 60 |
async def step(action: RepoAction):
|
| 61 |
-
"""
|
| 62 |
-
Take one action in the current episode.
|
| 63 |
-
"""
|
| 64 |
if env.done:
|
| 65 |
-
raise HTTPException(status_code=400, detail="Episode is done. POST /reset to start
|
| 66 |
try:
|
| 67 |
-
|
| 68 |
-
return result
|
| 69 |
except RuntimeError as e:
|
| 70 |
raise HTTPException(status_code=400, detail=str(e))
|
| 71 |
except Exception as e:
|
|
@@ -74,12 +84,8 @@ async def step(action: RepoAction):
|
|
| 74 |
|
| 75 |
@app.get("/state", response_model=StateResult)
|
| 76 |
async def state():
|
| 77 |
-
"""
|
| 78 |
-
Get current state without advancing the episode.
|
| 79 |
-
"""
|
| 80 |
-
obs = env.get_state()
|
| 81 |
return StateResult(
|
| 82 |
-
observation=
|
| 83 |
current_score=env.final_score,
|
| 84 |
total_steps_taken=env.steps_taken,
|
| 85 |
)
|
|
@@ -87,17 +93,13 @@ async def state():
|
|
| 87 |
|
| 88 |
@app.get("/health")
|
| 89 |
async def health():
|
| 90 |
-
return {"status": "ok", "environment": "codebase-nav-env", "version": "
|
| 91 |
|
| 92 |
|
| 93 |
-
# ── Evaluation
|
| 94 |
|
| 95 |
@app.get("/trajectory", response_model=TrajectoryResponse)
|
| 96 |
async def get_trajectory():
|
| 97 |
-
"""
|
| 98 |
-
Get the full trajectory of the current or most recent episode.
|
| 99 |
-
Returns every action, observation snapshot, reward, timing, and security flags.
|
| 100 |
-
"""
|
| 101 |
traj = env.get_trajectory()
|
| 102 |
if not traj:
|
| 103 |
return TrajectoryResponse()
|
|
@@ -106,11 +108,6 @@ async def get_trajectory():
|
|
| 106 |
|
| 107 |
@app.get("/evaluate", response_model=EvaluationResponse)
|
| 108 |
async def get_evaluation():
|
| 109 |
-
"""
|
| 110 |
-
Get multi-dimensional evaluation of the current/latest episode.
|
| 111 |
-
Scores across 6 dimensions: efficiency, navigation, correctness,
|
| 112 |
-
reasoning, robustness, security.
|
| 113 |
-
"""
|
| 114 |
evaluation = env.get_evaluation()
|
| 115 |
if "error" in evaluation:
|
| 116 |
return EvaluationResponse()
|
|
@@ -119,23 +116,224 @@ async def get_evaluation():
|
|
| 119 |
|
| 120 |
@app.get("/metrics", response_model=MetricsResponse)
|
| 121 |
async def get_metrics():
|
| 122 |
-
|
| 123 |
-
Get comprehensive metrics including memory usage, security stats,
|
| 124 |
-
fault injection report, wasteful patterns, and action timeline.
|
| 125 |
-
"""
|
| 126 |
-
metrics = env.get_metrics()
|
| 127 |
-
return MetricsResponse(**metrics)
|
| 128 |
|
| 129 |
|
| 130 |
@app.post("/fault-config")
|
| 131 |
async def set_fault_config(config: FaultConfigRequest):
|
| 132 |
-
"""
|
| 133 |
-
Configure fault injection for the NEXT episode (takes effect on next /reset).
|
| 134 |
-
Levels: "none" (default), "light" (misleading comments), "heavy" (all faults)
|
| 135 |
-
"""
|
| 136 |
env.set_fault_config(config.level)
|
| 137 |
return {
|
| 138 |
"status": "ok",
|
| 139 |
"fault_level": config.level,
|
| 140 |
"message": f"Fault injection set to '{config.level}'. Takes effect on next /reset.",
|
| 141 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# server/app.py
|
| 2 |
"""
|
| 3 |
+
FastAPI server — v3.0
|
| 4 |
|
| 5 |
+
Core endpoints: POST /reset, POST /step, GET /state, GET /health
|
| 6 |
+
Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
|
| 7 |
+
Control endpoints: POST /fault-config
|
| 8 |
+
Intelligence endpoints: GET /classify, GET /strategy, GET /advanced-metrics,
|
| 9 |
+
POST /compare-agents, GET /improvement-plan, GET /viz-data
|
| 10 |
"""
|
| 11 |
from fastapi import FastAPI, HTTPException
|
| 12 |
+
from fastapi.staticfiles import StaticFiles
|
| 13 |
from contextlib import asynccontextmanager
|
| 14 |
+
import os
|
| 15 |
|
| 16 |
from .environment import CodebaseNavEnvironment
|
| 17 |
from .models import (
|
|
|
|
| 19 |
TrajectoryResponse, EvaluationResponse, MetricsResponse,
|
| 20 |
FaultConfigRequest,
|
| 21 |
)
|
| 22 |
+
from .failure_classifier import FailureClassifier
|
| 23 |
+
from .strategy_detector import StrategyDetector
|
| 24 |
+
from .advanced_metrics import AdvancedMetricsEngine
|
| 25 |
+
from .self_improvement import SelfImprovementEngine
|
| 26 |
+
from .multi_agent import MultiAgentComparison
|
| 27 |
|
| 28 |
+
# Global instances
|
| 29 |
env = CodebaseNavEnvironment()
|
| 30 |
+
failure_clf = FailureClassifier()
|
| 31 |
+
strategy_det = StrategyDetector()
|
| 32 |
+
adv_metrics = AdvancedMetricsEngine()
|
| 33 |
+
improvement = SelfImprovementEngine()
|
| 34 |
+
multi_agent = MultiAgentComparison()
|
| 35 |
|
| 36 |
|
| 37 |
@asynccontextmanager
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
app = FastAPI(
|
| 44 |
+
title="Codebase Navigation & Repair — OpenEnv v3",
|
| 45 |
description=(
|
| 46 |
+
"RL environment for AI coding agents — extended with process-based evaluation, "
|
| 47 |
+
"failure classification, strategy detection, self-improvement loops, "
|
| 48 |
+
"multi-agent comparison, 3D visualization, and advanced metrics."
|
| 49 |
),
|
| 50 |
+
version="3.0.0",
|
| 51 |
lifespan=lifespan,
|
| 52 |
)
|
| 53 |
|
| 54 |
+
# Serve static files (3D visualizer HTML).
# The directory is resolved relative to this module: "<repo>/static" next to
# the server/ package. Mounting is skipped silently when it does not exist,
# so the API still works in deployments that ship without the visualizer.
_static_dir = os.path.join(os.path.dirname(__file__), "..", "static")
if os.path.exists(_static_dir):
    app.mount("/static", StaticFiles(directory=_static_dir), name="static")
|
| 58 |
|
| 59 |
+
|
| 60 |
+
# ── Core OpenEnv Endpoints ────────────────────────────────────────────────────
|
| 61 |
|
| 62 |
@app.post("/reset", response_model=ResetResult)
|
| 63 |
async def reset(task: str = "task1"):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
valid_tasks = ["task1", "task2", "task3"]
|
| 65 |
if task not in valid_tasks:
|
| 66 |
raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
|
| 67 |
try:
|
| 68 |
+
return env.reset(task=task)
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
raise HTTPException(status_code=500, detail=str(e))
|
| 71 |
|
| 72 |
|
| 73 |
@app.post("/step", response_model=StepResult)
|
| 74 |
async def step(action: RepoAction):
|
|
|
|
|
|
|
|
|
|
| 75 |
if env.done:
|
| 76 |
+
raise HTTPException(status_code=400, detail="Episode is done. POST /reset to start.")
|
| 77 |
try:
|
| 78 |
+
return env.step(action)
|
|
|
|
| 79 |
except RuntimeError as e:
|
| 80 |
raise HTTPException(status_code=400, detail=str(e))
|
| 81 |
except Exception as e:
|
|
|
|
| 84 |
|
| 85 |
@app.get("/state", response_model=StateResult)
|
| 86 |
async def state():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
return StateResult(
|
| 88 |
+
observation=env.get_state(),
|
| 89 |
current_score=env.final_score,
|
| 90 |
total_steps_taken=env.steps_taken,
|
| 91 |
)
|
|
|
|
| 93 |
|
| 94 |
@app.get("/health")
|
| 95 |
async def health():
|
| 96 |
+
return {"status": "ok", "environment": "codebase-nav-env", "version": "3.0.0"}
|
| 97 |
|
| 98 |
|
| 99 |
+
# ── Evaluation Endpoints ──────────────────────────────────────────────────────
|
| 100 |
|
| 101 |
@app.get("/trajectory", response_model=TrajectoryResponse)
|
| 102 |
async def get_trajectory():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
traj = env.get_trajectory()
|
| 104 |
if not traj:
|
| 105 |
return TrajectoryResponse()
|
|
|
|
| 108 |
|
| 109 |
@app.get("/evaluate", response_model=EvaluationResponse)
|
| 110 |
async def get_evaluation():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
evaluation = env.get_evaluation()
|
| 112 |
if "error" in evaluation:
|
| 113 |
return EvaluationResponse()
|
|
|
|
| 116 |
|
| 117 |
@app.get("/metrics", response_model=MetricsResponse)
|
| 118 |
async def get_metrics():
|
| 119 |
+
return MetricsResponse(**env.get_metrics())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
@app.post("/fault-config")
|
| 123 |
async def set_fault_config(config: FaultConfigRequest):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
env.set_fault_config(config.level)
|
| 125 |
return {
|
| 126 |
"status": "ok",
|
| 127 |
"fault_level": config.level,
|
| 128 |
"message": f"Fault injection set to '{config.level}'. Takes effect on next /reset.",
|
| 129 |
}
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# ── Intelligence Endpoints (NEW in v3) ────────────────────────────────────────
|
| 133 |
+
|
| 134 |
+
@app.get("/classify")
|
| 135 |
+
async def classify_failure():
|
| 136 |
+
"""
|
| 137 |
+
Classify the failure type of the current/latest episode.
|
| 138 |
+
Returns typed failure taxonomy with root cause and remediation.
|
| 139 |
+
"""
|
| 140 |
+
traj = env.get_trajectory()
|
| 141 |
+
if not traj:
|
| 142 |
+
return {"error": "No trajectory available. Run an episode first."}
|
| 143 |
+
|
| 144 |
+
steps = traj.get("steps", [])
|
| 145 |
+
meta = env.variant.meta if env.variant else {}
|
| 146 |
+
|
| 147 |
+
report = failure_clf.classify(
|
| 148 |
+
episode_id=traj.get("episode_id", ""),
|
| 149 |
+
task=env.current_task or "unknown",
|
| 150 |
+
trajectory_steps=steps,
|
| 151 |
+
variant_meta=meta,
|
| 152 |
+
files_read=list(env.files_read),
|
| 153 |
+
files_written=list(env.files_written),
|
| 154 |
+
final_score=env.final_score,
|
| 155 |
+
security_violations=env.security_violations,
|
| 156 |
+
)
|
| 157 |
+
return report.to_dict()
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
@app.get("/strategy")
|
| 161 |
+
async def detect_strategy():
|
| 162 |
+
"""
|
| 163 |
+
Detect the behavioral strategy pattern used by the agent.
|
| 164 |
+
Returns: TARGETED_DEBUGGING | SYSTEMATIC_SEARCH | BRUTE_FORCE |
|
| 165 |
+
RANDOM_EXPLORATION | SPEC_DRIVEN | MINIMAL_EFFORT
|
| 166 |
+
"""
|
| 167 |
+
traj = env.get_trajectory()
|
| 168 |
+
if not traj:
|
| 169 |
+
return {"error": "No trajectory available."}
|
| 170 |
+
|
| 171 |
+
steps = traj.get("steps", [])
|
| 172 |
+
meta = env.variant.meta if env.variant else {}
|
| 173 |
+
|
| 174 |
+
report = strategy_det.detect(
|
| 175 |
+
trajectory_steps=steps,
|
| 176 |
+
task=env.current_task or "unknown",
|
| 177 |
+
variant_meta=meta,
|
| 178 |
+
files_read=list(env.files_read),
|
| 179 |
+
final_score=env.final_score,
|
| 180 |
+
)
|
| 181 |
+
return report.to_dict()
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@app.get("/advanced-metrics")
|
| 185 |
+
async def get_advanced_metrics():
|
| 186 |
+
"""
|
| 187 |
+
Compute advanced metrics: reasoning efficiency, decision entropy,
|
| 188 |
+
exploration ratio, reliability index, consistency, pivot rate.
|
| 189 |
+
"""
|
| 190 |
+
traj = env.get_trajectory()
|
| 191 |
+
if not traj:
|
| 192 |
+
return {"error": "No trajectory available."}
|
| 193 |
+
|
| 194 |
+
steps = traj.get("steps", [])
|
| 195 |
+
meta = env.variant.meta if env.variant else {}
|
| 196 |
+
|
| 197 |
+
report = adv_metrics.compute(
|
| 198 |
+
trajectory_steps=steps,
|
| 199 |
+
variant_meta=meta,
|
| 200 |
+
final_score=env.final_score,
|
| 201 |
+
files_read=list(env.files_read),
|
| 202 |
+
files_written=list(env.files_written),
|
| 203 |
+
)
|
| 204 |
+
return report.to_dict()
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
@app.get("/improvement-plan")
|
| 208 |
+
async def get_improvement_plan():
|
| 209 |
+
"""
|
| 210 |
+
Generate a self-improvement plan based on failure classification.
|
| 211 |
+
Returns: what_went_wrong, improved_strategy, step-by-step plan,
|
| 212 |
+
system_prompt_addon (for injecting into next agent run).
|
| 213 |
+
"""
|
| 214 |
+
traj = env.get_trajectory()
|
| 215 |
+
if not traj:
|
| 216 |
+
return {"error": "No trajectory available."}
|
| 217 |
+
|
| 218 |
+
steps = traj.get("steps", [])
|
| 219 |
+
meta = env.variant.meta if env.variant else {}
|
| 220 |
+
|
| 221 |
+
# Classify first
|
| 222 |
+
fail_report = failure_clf.classify(
|
| 223 |
+
episode_id=traj.get("episode_id", ""),
|
| 224 |
+
task=env.current_task or "unknown",
|
| 225 |
+
trajectory_steps=steps,
|
| 226 |
+
variant_meta=meta,
|
| 227 |
+
files_read=list(env.files_read),
|
| 228 |
+
files_written=list(env.files_written),
|
| 229 |
+
final_score=env.final_score,
|
| 230 |
+
security_violations=env.security_violations,
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
plan = improvement.generate_improvement_plan(
|
| 234 |
+
episode_id=traj.get("episode_id", ""),
|
| 235 |
+
task=env.current_task or "unknown",
|
| 236 |
+
failure_type=fail_report.primary_failure,
|
| 237 |
+
failure_evidence=[f.evidence for f in fail_report.failures],
|
| 238 |
+
original_score=env.final_score,
|
| 239 |
+
trajectory_steps=steps,
|
| 240 |
+
files_read=list(env.files_read),
|
| 241 |
+
files_written=list(env.files_written),
|
| 242 |
+
)
|
| 243 |
+
return plan.to_dict()
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
@app.post("/compare-agents")
|
| 247 |
+
async def compare_agents(task: str = "task1", agents: str = "all"):
|
| 248 |
+
"""
|
| 249 |
+
Run multiple agent strategies on the same task and compare side-by-side.
|
| 250 |
+
agents: "all" | comma-separated list of: test-first,search-first,minimal,exhaustive
|
| 251 |
+
"""
|
| 252 |
+
valid_tasks = ["task1", "task2", "task3"]
|
| 253 |
+
if task not in valid_tasks:
|
| 254 |
+
raise HTTPException(status_code=400, detail=f"task must be one of {valid_tasks}")
|
| 255 |
+
|
| 256 |
+
if agents == "all":
|
| 257 |
+
agent_list = None
|
| 258 |
+
else:
|
| 259 |
+
agent_list = [a.strip() for a in agents.split(",")]
|
| 260 |
+
|
| 261 |
+
try:
|
| 262 |
+
report = multi_agent.compare(env, task=task, agents=agent_list)
|
| 263 |
+
return report.to_dict()
|
| 264 |
+
except Exception as e:
|
| 265 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
@app.get("/viz-data")
|
| 269 |
+
async def get_viz_data():
|
| 270 |
+
"""
|
| 271 |
+
Get structured 3D visualization data for the current/latest episode.
|
| 272 |
+
Returns nodes (files), edges (dependencies), and step trajectory
|
| 273 |
+
in the format expected by the Three.js visualizer.
|
| 274 |
+
"""
|
| 275 |
+
traj = env.get_trajectory()
|
| 276 |
+
if not traj:
|
| 277 |
+
return {"error": "No trajectory available."}
|
| 278 |
+
|
| 279 |
+
# Build file nodes
|
| 280 |
+
files = []
|
| 281 |
+
visited = set(env.files_read)
|
| 282 |
+
modified = set(env.files_written)
|
| 283 |
+
meta = env.variant.meta if env.variant else {}
|
| 284 |
+
bug_files = set(meta.get("bug_files", []))
|
| 285 |
+
|
| 286 |
+
if env.variant:
|
| 287 |
+
tree = env.variant.get_tree()
|
| 288 |
+
for f in tree:
|
| 289 |
+
ftype = "test" if f.startswith("tests/") else \
|
| 290 |
+
"spec" if f.endswith(".md") else "src"
|
| 291 |
+
files.append({
|
| 292 |
+
"name": f,
|
| 293 |
+
"type": ftype,
|
| 294 |
+
"is_bug_file": f in bug_files,
|
| 295 |
+
"visited": f in visited,
|
| 296 |
+
"modified": f in modified,
|
| 297 |
+
})
|
| 298 |
+
|
| 299 |
+
# Build dependency edges from known patterns
|
| 300 |
+
deps = []
|
| 301 |
+
test_files = [f["name"] for f in files if f["type"] == "test"]
|
| 302 |
+
src_files = [f["name"] for f in files if f["type"] == "src"]
|
| 303 |
+
|
| 304 |
+
# Simple heuristic: connect tests to src files
|
| 305 |
+
for tf in test_files:
|
| 306 |
+
for sf in src_files:
|
| 307 |
+
deps.append({"from": tf, "to": sf})
|
| 308 |
+
|
| 309 |
+
# Build step data for trajectory
|
| 310 |
+
steps_data = []
|
| 311 |
+
for step in traj.get("steps", []):
|
| 312 |
+
steps_data.append({
|
| 313 |
+
"step": step.get("step_number", 0),
|
| 314 |
+
"action": step.get("action_type", ""),
|
| 315 |
+
"path": step.get("action_path"),
|
| 316 |
+
"reward": step.get("reward", 0.0),
|
| 317 |
+
"error": step.get("error"),
|
| 318 |
+
"pass_rate": step.get("test_pass_rate"),
|
| 319 |
+
})
|
| 320 |
+
|
| 321 |
+
# Get strategy
|
| 322 |
+
strategy_info = strategy_det.detect(
|
| 323 |
+
traj.get("steps", []),
|
| 324 |
+
env.current_task or "unknown",
|
| 325 |
+
meta,
|
| 326 |
+
list(env.files_read),
|
| 327 |
+
env.final_score,
|
| 328 |
+
) if traj.get("steps") else None
|
| 329 |
+
|
| 330 |
+
return {
|
| 331 |
+
"task": env.current_task or "unknown",
|
| 332 |
+
"variant_id": traj.get("variant_id", "unknown"),
|
| 333 |
+
"final_score": env.final_score,
|
| 334 |
+
"strategy": strategy_info.strategy if strategy_info else "UNKNOWN",
|
| 335 |
+
"failure_type": "—",
|
| 336 |
+
"files": files,
|
| 337 |
+
"dependencies": deps,
|
| 338 |
+
"steps": steps_data,
|
| 339 |
+
}
|
server/failure_classifier.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/failure_classifier.py
|
| 2 |
+
"""
|
| 3 |
+
Typed Failure Classification Engine.
|
| 4 |
+
|
| 5 |
+
Classifies agent failures into precise, actionable categories rather than
|
| 6 |
+
vague scores. Each failure type has a root cause, evidence, and remediation.
|
| 7 |
+
|
| 8 |
+
Failure taxonomy:
|
| 9 |
+
WRONG_FILE_NAVIGATION — agent read irrelevant files, missed key files
|
| 10 |
+
BLIND_WRITE — agent wrote code without reading first
|
| 11 |
+
HALLUCINATED_CODE — agent wrote syntactically/logically wrong code
|
| 12 |
+
NEVER_TESTED — agent submitted without running any tests
|
| 13 |
+
LOOPING_BEHAVIOR — agent repeated same action 3+ times
|
| 14 |
+
CONTEXT_OVERFLOW — agent read enormous amounts of irrelevant data
|
| 15 |
+
SECURITY_VIOLATION — agent wrote dangerous code
|
| 16 |
+
CORRECT — no failure detected
|
| 17 |
+
"""
|
| 18 |
+
from typing import List, Dict, Any, Optional
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class FailureInstance:
    """One classified failure event, with evidence and a concrete fix."""
    failure_type: str   # taxonomy label, e.g. "WRONG_FILE_NAVIGATION"
    severity: str       # "critical" | "major" | "minor"
    step_number: int    # 1-based step that triggered it
    evidence: str       # specific observation from the trajectory
    root_cause: str     # why this failure happens
    remediation: str    # how to fix it in the next run
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class FailureReport:
    """Structured failure analysis for a single episode."""
    episode_id: str
    task: str
    primary_failure: str   # most severe failure type (or "CORRECT")
    failures: List[FailureInstance] = field(default_factory=list)
    success: bool = False
    failure_summary: str = ""
    retry_hint: str = ""   # actionable hint for the next attempt

    def to_dict(self) -> dict:
        """Serialize the report (and each failure) to a JSON-friendly dict."""
        failure_dicts = [
            {
                "type": inst.failure_type,
                "severity": inst.severity,
                "step": inst.step_number,
                "evidence": inst.evidence,
                "root_cause": inst.root_cause,
                "remediation": inst.remediation,
            }
            for inst in self.failures
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "success": self.success,
            "primary_failure": self.primary_failure,
            "failure_count": len(self.failures),
            "failures": failure_dicts,
            "failure_summary": self.failure_summary,
            "retry_hint": self.retry_hint,
        }
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ── Severity ordering for picking primary failure ─────────────────────────────
# Higher rank = more severe; used to sort FailureInstances when choosing the
# report's primary failure.
SEVERITY_RANK = {"critical": 3, "major": 2, "minor": 1}

# Canonical remediation text per failure type, surfaced verbatim in reports
# and retry hints.
FAILURE_REMEDIATION = {
    "WRONG_FILE_NAVIGATION": (
        "Read the failing test file first to understand the module under test, "
        "then navigate directly to the imported source files."
    ),
    "BLIND_WRITE": (
        "Always read the target file before writing. Use read_file → write_file → run_tests."
    ),
    "HALLUCINATED_CODE": (
        "Re-read the source file, understand the function signature, "
        "then write a minimal targeted fix. Run tests to verify."
    ),
    "NEVER_TESTED": (
        "Always call run_tests after writing a fix. "
        "Submit only when test pass rate has demonstrably improved."
    ),
    "LOOPING_BEHAVIOR": (
        "Stop repeating the same action. Use search_code to find the bug location, "
        "then navigate directly to it."
    ),
    "CONTEXT_OVERFLOW": (
        "Focus on files explicitly referenced in the failing test's imports. "
        "Avoid reading utility files unless the test error specifically mentions them."
    ),
    "SECURITY_VIOLATION": (
        "Do not use os.system, eval, exec, or subprocess in fixes. "
        "Write pure Python logic without shell calls."
    ),
    "CORRECT": "No remediation needed.",
}
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class FailureClassifier:
    """
    Classifies agent failures from trajectory data.

    Pure analysis: reads the recorded trajectory/metadata, never mutates the
    environment. Each classifier appends zero or more FailureInstances; the
    most severe one becomes the report's primary failure.

    Usage:
        clf = FailureClassifier()
        report = clf.classify(
            episode_id="abc123",
            task="task1",
            trajectory_steps=[...],
            variant_meta={...},
            files_read=[...],
            files_written=[...],
            final_score=0.0,
        )
    """

    def classify(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: Dict[str, Any],
        files_read: List[str],
        files_written: List[str],  # kept for API symmetry; not used by any classifier yet
        final_score: float,
        security_violations: int = 0,
    ) -> FailureReport:
        """Run all classifiers and build a structured failure report."""
        failures: List[FailureInstance] = []
        success = final_score >= 0.5  # success threshold for an episode

        # Fast path: clean success → no classification needed.
        if success and security_violations == 0:
            return FailureReport(
                episode_id=episode_id,
                task=task,
                primary_failure="CORRECT",
                failures=[],
                success=True,
                failure_summary="Agent succeeded without errors.",
                retry_hint="",
            )

        action_sequence = [s.get("action_type", "") for s in trajectory_steps]

        # ── Classifier 1: Wrong File Navigation ───────────────────────────────
        # Fires when the agent read multiple files outside the task-relevant
        # set and never touched any relevant file at all.
        relevant = set(
            variant_meta.get("bug_files", []) +
            variant_meta.get("interface_files", []) +
            variant_meta.get("read_first_files", []) +
            variant_meta.get("files_to_implement", [])
        )
        if relevant and files_read:
            irrelevant_reads = [f for f in files_read if f not in relevant
                                and not f.startswith("tests/")]
            if len(irrelevant_reads) > 1 and not any(f in files_read for f in relevant):
                failures.append(FailureInstance(
                    failure_type="WRONG_FILE_NAVIGATION",
                    severity="critical",
                    step_number=1,
                    evidence=f"Read {len(irrelevant_reads)} irrelevant files: {irrelevant_reads[:3]}. "
                             f"Never read key files: {list(relevant)[:3]}",
                    root_cause="Agent navigated to wrong part of the codebase entirely.",
                    remediation=FAILURE_REMEDIATION["WRONG_FILE_NAVIGATION"],
                ))

        # ── Classifier 2: Blind Write ─────────────────────────────────────────
        # Any write_file with zero read_file actions anywhere before it.
        # (Fix vs previous version: removed an unused `step` local here.)
        write_indices = [i for i, a in enumerate(action_sequence) if a == "write_file"]
        for wi in write_indices:
            reads_before = [a for a in action_sequence[:wi] if a == "read_file"]
            if not reads_before:
                failures.append(FailureInstance(
                    failure_type="BLIND_WRITE",
                    severity="critical",
                    step_number=wi + 1,
                    evidence=f"write_file at step {wi+1} with zero prior read_file actions.",
                    root_cause="Agent attempted to fix code without reading it first — likely hallucinating.",
                    remediation=FAILURE_REMEDIATION["BLIND_WRITE"],
                ))

        # ── Classifier 3: Hallucinated Code ───────────────────────────────────
        # Detect write followed by a test run with a very low pass rate.
        # NOTE(review): this can fire once per qualifying run_tests step, so a
        # single bad write may yield multiple instances — confirm intended.
        for i, step in enumerate(trajectory_steps):
            if step.get("action_type") == "run_tests":
                prev_write = None
                for j in range(i - 1, -1, -1):
                    if trajectory_steps[j].get("action_type") == "write_file":
                        prev_write = j
                        break
                if prev_write is not None:
                    pass_rate = step.get("test_pass_rate", None)
                    if pass_rate is not None and pass_rate < 0.3:
                        failures.append(FailureInstance(
                            failure_type="HALLUCINATED_CODE",
                            severity="major",
                            step_number=i + 1,
                            evidence=f"Test pass rate {pass_rate:.2f} after write at step {prev_write+1}. "
                                     f"Code change made things worse.",
                            root_cause="Agent wrote syntactically correct but semantically wrong code.",
                            remediation=FAILURE_REMEDIATION["HALLUCINATED_CODE"],
                        ))

        # ── Classifier 4: Never Tested ────────────────────────────────────────
        has_tests = "run_tests" in action_sequence
        has_writes = "write_file" in action_sequence
        has_submit = "submit" in action_sequence
        if has_submit and has_writes and not has_tests:
            failures.append(FailureInstance(
                failure_type="NEVER_TESTED",
                severity="major",
                step_number=len(action_sequence),
                evidence="Agent wrote code changes but submitted without running any tests.",
                root_cause="No feedback loop — agent cannot know if its fix worked.",
                remediation=FAILURE_REMEDIATION["NEVER_TESTED"],
            ))

        # ── Classifier 5: Looping Behavior ────────────────────────────────────
        # Reading the same path three or more times counts as a read loop.
        read_paths = [
            (i, s.get("action_path"))
            for i, s in enumerate(trajectory_steps)
            if s.get("action_type") == "read_file" and s.get("action_path")
        ]
        path_counts: Dict[str, List[int]] = {}
        for idx, path in read_paths:
            path_counts.setdefault(path, []).append(idx)

        for path, indices in path_counts.items():
            if len(indices) >= 3:
                failures.append(FailureInstance(
                    failure_type="LOOPING_BEHAVIOR",
                    severity="major",
                    step_number=indices[2] + 1,
                    evidence=f"Read '{path}' {len(indices)} times (steps {[i+1 for i in indices]}). "
                             f"Agent is stuck in a read loop.",
                    root_cause="Agent cannot extract the needed information and keeps retrying.",
                    remediation=FAILURE_REMEDIATION["LOOPING_BEHAVIOR"],
                ))

        # ── Classifier 6: Context Overflow ────────────────────────────────────
        total_content = sum(
            s.get("action_content_length") or 0
            for s in trajectory_steps
            if s.get("action_type") == "read_file"
        )
        if total_content > 50_000 and final_score < 0.5:
            failures.append(FailureInstance(
                failure_type="CONTEXT_OVERFLOW",
                severity="minor",
                step_number=len(trajectory_steps),
                evidence=f"Agent read {total_content:,} chars total. "
                         f"Most of this was likely irrelevant context.",
                root_cause="Agent wasted token budget reading unnecessary files.",
                remediation=FAILURE_REMEDIATION["CONTEXT_OVERFLOW"],
            ))

        # ── Classifier 7: Security Violation ─────────────────────────────────
        if security_violations > 0:
            sec_steps = [
                s for s in trajectory_steps if s.get("security_flags")
            ]
            for ss in sec_steps:
                failures.append(FailureInstance(
                    failure_type="SECURITY_VIOLATION",
                    severity="critical",
                    step_number=ss.get("step_number", 0),
                    evidence=f"Flags: {ss.get('security_flags', [])}",
                    root_cause="Agent wrote unsafe code patterns that would be dangerous in production.",
                    remediation=FAILURE_REMEDIATION["SECURITY_VIOLATION"],
                ))

        # ── Build report ──────────────────────────────────────────────────────
        if not failures:
            # Failed but no specific classifier triggered — generic low score
            primary = "HALLUCINATED_CODE"
            summary = f"Score {final_score:.2f} — fix was written but insufficient. Re-read the source files more carefully."
            hint = "Read test file → read all src files → write targeted fix → run tests → submit."
        else:
            # Pick most severe failure as primary (stable sort keeps step order
            # within a severity tier).
            failures.sort(key=lambda f: SEVERITY_RANK.get(f.severity, 0), reverse=True)
            primary = failures[0].failure_type
            summary = "; ".join(f"{f.failure_type} (step {f.step_number})" for f in failures[:3])
            hint = failures[0].remediation

        return FailureReport(
            episode_id=episode_id,
            task=task,
            primary_failure=primary,
            failures=failures,
            success=success,
            failure_summary=summary,
            retry_hint=hint,
        )
|
server/multi_agent.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/multi_agent.py
|
| 2 |
+
"""
|
| 3 |
+
Multi-Agent Comparison Engine.
|
| 4 |
+
|
| 5 |
+
Runs multiple agent configurations against the SAME task variant
|
| 6 |
+
and produces a side-by-side comparison report.
|
| 7 |
+
|
| 8 |
+
Agent configurations:
|
| 9 |
+
- Deterministic (rule-based, no LLM) — baseline
|
| 10 |
+
- Test-first (forces reading tests before anything)
|
| 11 |
+
- Search-first (forces search_code before reads)
|
| 12 |
+
- LLM-based (if HF_TOKEN provided)
|
| 13 |
+
|
| 14 |
+
This is the key feature that answers: "Which agent strategy wins?"
|
| 15 |
+
"""
|
| 16 |
+
import time
|
| 17 |
+
import copy
|
| 18 |
+
from typing import List, Dict, Any, Optional, Callable
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class AgentRunResult:
    """Outcome of a single agent configuration on one episode."""
    agent_name: str
    task: str
    variant_id: str
    final_score: float
    total_steps: int
    cumulative_reward: float
    duration_seconds: float
    action_sequence: List[str]
    files_read: List[str]
    files_written: List[str]
    strategy: str          # detected strategy label
    strategy_score: float
    failure_type: str
    reliability_index: float
    step_timeline: List[dict]

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict, rounding float metrics."""
        # digits to round each float metric to; all other fields pass through
        precision = {
            "final_score": 3,
            "cumulative_reward": 3,
            "duration_seconds": 2,
            "strategy_score": 3,
            "reliability_index": 3,
        }
        ordered_fields = (
            "agent_name", "task", "variant_id", "final_score", "total_steps",
            "cumulative_reward", "duration_seconds", "action_sequence",
            "files_read", "files_written", "strategy", "strategy_score",
            "failure_type", "reliability_index", "step_timeline",
        )
        result = {}
        for name in ordered_fields:
            value = getattr(self, name)
            if name in precision:
                value = round(value, precision[name])
            result[name] = value
        return result
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@dataclass
class ComparisonReport:
    """Side-by-side comparison of multiple agent configurations on one variant."""
    task: str
    variant_id: str
    runs: List[AgentRunResult] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize ranked results, per-run details, and derived insights."""
        if not self.runs:
            return {"error": "No runs to compare"}

        # Rank by score (descending), breaking ties by fewest steps.
        ranked = sorted(self.runs, key=lambda r: (-r.final_score, r.total_steps))
        winner = ranked[0]

        return {
            "task": self.task,
            "variant_id": self.variant_id,
            "winner": winner.agent_name,
            "winner_score": winner.final_score,
            "summary_table": [
                {
                    "rank": i + 1,
                    "agent": r.agent_name,
                    "score": round(r.final_score, 3),
                    "steps": r.total_steps,
                    "reward": round(r.cumulative_reward, 3),
                    "strategy": r.strategy,
                    "failure": r.failure_type,
                    "reliability": round(r.reliability_index, 3),
                }
                for i, r in enumerate(ranked)
            ],
            "detailed_runs": [r.to_dict() for r in self.runs],
            "insights": self._generate_insights(ranked),
        }

    def _generate_insights(self, ranked: List[AgentRunResult]) -> List[str]:
        """Derive human-readable takeaways from the ranked runs.

        Fixes vs previous version: removed the unused `step_diffs` local, and
        the strategy-variance message now lists strategies sorted (a raw set's
        repr made the report text nondeterministic).
        """
        insights: List[str] = []
        if len(ranked) < 2:
            return insights

        best = ranked[0]
        worst = ranked[-1]

        if best.final_score > worst.final_score + 0.2:
            insights.append(
                f"'{best.agent_name}' significantly outperformed '{worst.agent_name}' "
                f"({best.final_score:.2f} vs {worst.final_score:.2f})"
            )

        # Among successful runs (score >= 0.5), find the fewest-steps one;
        # failed runs are pushed to the back with an infinite key.
        most_efficient = min(
            ranked,
            key=lambda r: r.total_steps if r.final_score >= 0.5 else float("inf"),
        )
        if most_efficient.final_score >= 0.5:
            insights.append(
                f"Most step-efficient successful agent: '{most_efficient.agent_name}' "
                f"({most_efficient.total_steps} steps)"
            )

        strategies = sorted({r.strategy for r in ranked})
        if len(strategies) > 1:
            insights.append(
                f"Strategy variance observed: {strategies} — "
                f"'{best.agent_name}' used {best.strategy} which proved most effective."
            )

        return insights
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class MultiAgentComparison:
    """
    Runs multiple deterministic agent strategies against the same environment.

    Usage (in-process, no LLM required):
        from server.environment import CodebaseNavEnvironment
        from server.models import RepoAction

        env = CodebaseNavEnvironment()
        engine = MultiAgentComparison()
        report = engine.compare(env, task="task1")
    """

    # ── Built-in agent strategies ─────────────────────────────────────────────
    # Each strategy is a pure function (obs_dict, step_number, context) -> action
    # dict. `context` is a mutable per-run scratch dict the strategy uses to
    # remember one-shot decisions (e.g. whether tests were already run).

    @staticmethod
    def _agent_test_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read tests before any source file, then run tests, then submit."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))

        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
        # Fix vs. original: removed the unused `spec_files` local.

        # Phase 1: Tests first
        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        # Phase 2: Source files
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        # Phase 3: Run tests (once), then submit
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_search_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Use search_code to locate the bug before reading files."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        failing = obs.get("failing_tests", [])

        # Step 1: search for the failing test's function name (last dotted part).
        if step == 1 and failing:
            fn_name = failing[0].split(".")[-1]  # `failing` is non-empty here
            context["searched"] = True  # recorded for introspection; not read back
            return {"action_type": "search_code", "query": fn_name}

        # Step 2+: read tests then sources, run tests once, submit.
        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])

        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_minimal(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Minimal effort — read one source file, submit immediately."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        src_files = [f for f in tree if f.startswith("src/") and f.endswith(".py")]
        # Only read the first source file, and only if nothing was read yet.
        if src_files and not files_read:
            return {"action_type": "read_file", "path": src_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_exhaustive(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read every .py/.md file, run tests twice, then submit."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))

        all_readable = [f for f in tree if f.endswith(".py") or f.endswith(".md")]
        for f in all_readable:
            if f not in files_read:
                return {"action_type": "read_file", "path": f}

        test_files = [f for f in tree if f.startswith("tests/")]
        # First test run targets the first test file; second runs the full suite.
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        if test_files and context.get("tests_run2", 0) == 0:
            context["tests_run2"] = 1
            return {"action_type": "run_tests"}
        return {"action_type": "submit"}

    # `.__func__` unwraps the staticmethod so the stored value is a plain
    # callable usable inside the class body on all supported Python versions.
    AGENT_CONFIGS = {
        "test-first": _agent_test_first.__func__,
        "search-first": _agent_search_first.__func__,
        "minimal": _agent_minimal.__func__,
        "exhaustive": _agent_exhaustive.__func__,
    }

    def compare(
        self,
        env,  # CodebaseNavEnvironment instance
        task: str = "task1",
        agents: Optional[List[str]] = None,
        shared_variant: Optional[str] = None,  # NOTE(review): currently unused — kept for API compatibility
    ) -> "ComparisonReport":  # string annotation: lazy forward reference, import-order safe
        """
        Run all (or selected) agents against the same task and compare.
        The environment is reset to the same variant for each agent.

        Args:
            env: environment exposing reset/step/get_trajectory/final_score.
            task: task identifier passed to env.reset.
            agents: subset of AGENT_CONFIGS keys to run; None means all.
            shared_variant: reserved; not consulted by this implementation.

        Returns:
            A ComparisonReport containing one AgentRunResult per agent.
        """
        # Local imports avoid circular imports at module load time.
        from server.models import RepoAction
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine

        agent_names = agents or list(self.AGENT_CONFIGS.keys())
        strategy_detector = StrategyDetector()
        failure_classifier = FailureClassifier()
        metrics_engine = AdvancedMetricsEngine()

        runs: List[AgentRunResult] = []
        variant_id = None

        for agent_name in agent_names:
            agent_fn = self.AGENT_CONFIGS.get(agent_name)
            if not agent_fn:
                continue  # silently skip unknown agent names

            # Reset environment so every agent sees the same starting state.
            reset_result = env.reset(task=task)
            obs = reset_result.observation
            variant_id = reset_result.info.get("variant_id", "unknown")

            context = {}  # per-run scratchpad for the strategy function
            start = time.time()
            max_steps = 15  # hard cap so a strategy cannot loop forever
            files_read = []
            files_written = []
            cumulative_reward = 0.0
            action_sequence = []
            step_timeline = []

            obs_dict = obs.model_dump()

            for step_num in range(1, max_steps + 1):
                if env.done:
                    break

                # Ask the strategy for the next action; default to submit.
                action_dict = agent_fn(obs_dict, step_num, context)
                action = RepoAction(
                    action_type=action_dict.get("action_type", "submit"),
                    path=action_dict.get("path"),
                    query=action_dict.get("query"),
                    content=action_dict.get("content"),
                )

                result = env.step(action)
                obs = result.observation
                obs_dict = obs.model_dump()
                cumulative_reward += result.reward
                action_sequence.append(action.action_type)

                # Track file access for strategy/failure analysis below.
                if action.path and action.action_type == "read_file":
                    files_read.append(action.path)
                if action.path and action.action_type == "write_file":
                    files_written.append(action.path)

                step_timeline.append({
                    "step": step_num,
                    "action": action.action_type,
                    "path": action.path,
                    "reward": round(result.reward, 3),
                })

                if result.done:
                    break

            # Force submit if the step budget ran out without terminating.
            if not env.done:
                result = env.step(RepoAction(action_type="submit"))
                cumulative_reward += result.reward
                action_sequence.append("submit")

            duration = time.time() - start
            final_score = env.final_score

            # Pull the trajectory and variant metadata for the analyzers.
            trajectory = env.get_trajectory()
            traj_steps = trajectory.get("steps", []) if trajectory else []
            variant_meta = {}
            if env.variant:
                variant_meta = env.variant.meta

            # Detect the navigation strategy actually used.
            strategy_report = strategy_detector.detect(
                traj_steps, task, variant_meta, files_read, final_score
            )

            # Classify the failure mode (or CORRECT on success).
            failure_report = failure_classifier.classify(
                episode_id=trajectory.get("episode_id", "") if trajectory else "",
                task=task,
                trajectory_steps=traj_steps,
                variant_meta=variant_meta,
                files_read=files_read,
                files_written=files_written,
                final_score=final_score,
            )

            # Compute advanced reliability metrics.
            adv_metrics = metrics_engine.compute(
                traj_steps, variant_meta, final_score, files_read, files_written
            )

            runs.append(AgentRunResult(
                agent_name=agent_name,
                task=task,
                variant_id=variant_id or "unknown",
                final_score=final_score,
                total_steps=len(action_sequence),
                cumulative_reward=cumulative_reward,
                duration_seconds=duration,
                action_sequence=action_sequence,
                files_read=files_read,
                files_written=files_written,
                strategy=strategy_report.strategy,
                strategy_score=strategy_report.score,
                failure_type=failure_report.primary_failure,
                reliability_index=adv_metrics.reliability_index,
                step_timeline=step_timeline,
            ))

        return ComparisonReport(
            task=task,
            variant_id=variant_id or "unknown",
            runs=runs,
        )
|
server/self_improvement.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/self_improvement.py
|
| 2 |
+
"""
|
| 3 |
+
Self-Improvement Loop.
|
| 4 |
+
|
| 5 |
+
After a failure, generates structured feedback and an improved strategy prompt
|
| 6 |
+
that can be injected into the agent's next attempt. This closes the loop
|
| 7 |
+
between evaluation and agent behavior.
|
| 8 |
+
|
| 9 |
+
The retry loop:
|
| 10 |
+
1. Run episode → evaluate → classify failures
|
| 11 |
+
2. Generate improvement prompt based on failure type
|
| 12 |
+
3. Re-run episode with improvement prompt injected into agent context
|
| 13 |
+
4. Compare before/after performance
|
| 14 |
+
"""
|
| 15 |
+
from typing import List, Dict, Any, Optional
|
| 16 |
+
from dataclasses import dataclass, field
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class ImprovementPlan:
    """Structured, actionable feedback derived from a failed episode.

    Carries a human-readable diagnosis plus ready-to-inject prompt addons
    for the agent's next (retry) attempt.
    """
    episode_id: str
    task: str
    failure_type: str
    original_score: float

    # Actionable feedback
    what_went_wrong: str
    specific_errors: List[str]
    improved_strategy: str
    step_by_step_plan: List[str]

    # For injection into agent prompt
    system_prompt_addon: str  # Extra instructions for the system prompt
    user_context_addon: str  # Extra context for the user prompt

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict; the score is rounded to 3 decimals."""
        passthrough = (
            "episode_id",
            "task",
            "failure_type",
            "what_went_wrong",
            "specific_errors",
            "improved_strategy",
            "step_by_step_plan",
            "system_prompt_addon",
            "user_context_addon",
        )
        payload = {name: getattr(self, name) for name in passthrough}
        payload["original_score"] = round(self.original_score, 3)
        return payload
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@dataclass
class RetryResult:
    """Outcome of re-running an episode with improvement feedback injected."""
    original_episode_id: str
    retry_episode_id: str
    original_score: float
    retry_score: float
    improvement: float  # retry_score - original_score
    failure_fixed: bool
    steps_comparison: Dict[str, int]  # {"original": N, "retry": M}

    def to_dict(self) -> dict:
        """Serialize with every score field rounded to 3 decimals."""
        return dict(
            original_episode_id=self.original_episode_id,
            retry_episode_id=self.retry_episode_id,
            original_score=round(self.original_score, 3),
            retry_score=round(self.retry_score, 3),
            improvement=round(self.improvement, 3),
            failure_fixed=self.failure_fixed,
            steps_comparison=self.steps_comparison,
        )
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ── Strategy templates per failure type ──────────────────────────────────────
# Each template maps a FailureClassifier label to four fields:
#   what_went_wrong : one-sentence diagnosis shown to the user
#   strategy        : short corrective strategy statement
#   plan            : ordered step-by-step checklist for the retry attempt
#   system_addon    : text injected verbatim into the retry system prompt
#                     (empty string means "inject nothing")
STRATEGY_TEMPLATES = {
    "WRONG_FILE_NAVIGATION": {
        "what_went_wrong": "Agent navigated to the wrong files and missed the bug location entirely.",
        "strategy": "START with the failing test file. Read its imports. Navigate exclusively to those imported modules.",
        "plan": [
            "1. Read the failing test file FIRST (in tests/ directory)",
            "2. Find the import statements — these point to the buggy module",
            "3. Read ONLY those imported source files",
            "4. Look for the function/method the test is calling",
            "5. Fix the specific function — do not touch other code",
            "6. Run the failing test to verify",
            "7. Submit",
        ],
        "system_addon": (
            "CRITICAL: You previously failed by reading the wrong files. "
            "This time: read the failing test first, identify its imports, "
            "go directly to those source files. Do NOT read any file not referenced by the test."
        ),
    },
    "BLIND_WRITE": {
        "what_went_wrong": "Agent wrote code without reading the existing implementation first.",
        "strategy": "NEVER write before reading. Read the target file. Understand the existing logic. Then write a minimal fix.",
        "plan": [
            "1. Read the failing test to understand expected behavior",
            "2. Read the source file you plan to modify",
            "3. Identify the exact line(s) causing failure",
            "4. Write a FIX (not a rewrite) targeting only those lines",
            "5. Run tests to verify improvement",
            "6. Submit",
        ],
        "system_addon": (
            "CRITICAL: You previously wrote code without reading the file first. "
            "This time: ALWAYS call read_file on any file BEFORE using write_file. "
            "No exceptions. Read → Understand → Write minimal fix."
        ),
    },
    "HALLUCINATED_CODE": {
        "what_went_wrong": "Agent wrote syntactically correct but logically wrong code that made tests worse.",
        "strategy": "Write a targeted, minimal fix. Do not rewrite entire functions. Change only what the test requires.",
        "plan": [
            "1. Read the failing test and note EXACTLY what assertion fails",
            "2. Read the source function — understand its current behavior",
            "3. Identify the gap between current and expected behavior",
            "4. Write the SMALLEST possible change that bridges that gap",
            "5. Run tests BEFORE submitting to verify the fix works",
            "6. If tests still fail, re-read and refine — don't guess",
        ],
        "system_addon": (
            "CRITICAL: Your previous fix made things worse. This indicates hallucination. "
            "This time: make the SMALLEST possible change. "
            "Run run_tests after EVERY write to check if you're improving or degrading. "
            "If tests get worse after a write, immediately read the file again and try a different approach."
        ),
    },
    "NEVER_TESTED": {
        "what_went_wrong": "Agent submitted code changes without running any tests to verify they work.",
        "strategy": "ALWAYS run run_tests after every write_file. Never submit without test verification.",
        "plan": [
            "1. Read test → Read source → Write fix",
            "2. IMMEDIATELY run run_tests pointing to the failing test file",
            "3. If tests pass: submit",
            "4. If tests still fail: re-read, refine, run tests again",
            "5. ONLY submit when you have seen test improvement",
        ],
        "system_addon": (
            "CRITICAL: You submitted without testing. This is invalid. "
            "This time: after EVERY write_file action, you MUST call run_tests. "
            "Only call submit when run_tests shows improvement. "
            "The pattern is: read → write → run_tests → submit. Non-negotiable."
        ),
    },
    "LOOPING_BEHAVIOR": {
        "what_went_wrong": "Agent got stuck reading the same file repeatedly without making progress.",
        "strategy": "Use search_code to find the exact bug location. Read each file at most once.",
        "plan": [
            "1. Use search_code with the function name from the failing test",
            "2. Read the file that contains the matching code — ONCE",
            "3. If you need more context, use search_code again with a different query",
            "4. Once you have read a file, do NOT read it again",
            "5. Write your fix, run tests, submit",
        ],
        "system_addon": (
            "CRITICAL: You read the same files 3+ times without progress. "
            "This time: you may read each file AT MOST ONCE. "
            "Use search_code to pinpoint bug location. "
            "If you are confused, use search_code — do not re-read files."
        ),
    },
    "SECURITY_VIOLATION": {
        "what_went_wrong": "Agent wrote dangerous code patterns that would be harmful in production.",
        "strategy": "Write pure Python logic only. Never use os, subprocess, eval, or exec.",
        "plan": [
            "1. Read the test to understand what pure Python behavior is needed",
            "2. Implement the fix using ONLY standard library functions",
            "3. No os.system(), subprocess, eval(), exec(), or __import__()",
            "4. Run tests and submit",
        ],
        "system_addon": (
            "CRITICAL: Your previous code contained dangerous patterns (os.system, eval, exec, subprocess). "
            "This is automatically penalized. "
            "This time: write ONLY pure Python logic. No shell commands. No dynamic execution. "
            "Use only stdlib modules that do not involve system access."
        ),
    },
    # Success case: keeps callers uniform (no special-casing for success),
    # with an empty system_addon so nothing is injected on retry.
    "CORRECT": {
        "what_went_wrong": "No failure — agent succeeded.",
        "strategy": "Continue with same strategy.",
        "plan": ["Maintain current approach."],
        "system_addon": "",
    },
}

# Default template for unknown failures (any label not in STRATEGY_TEMPLATES).
DEFAULT_TEMPLATE = {
    "what_went_wrong": "Agent failed to fix the bug sufficiently — score too low.",
    "strategy": "Read all relevant files carefully, make a targeted fix, run tests, submit.",
    "plan": [
        "1. Read failing test to understand expected behavior",
        "2. Read each source file referenced by the test",
        "3. Identify the bug: wrong return value, missing case, logic error",
        "4. Write minimal fix",
        "5. Run tests",
        "6. Submit only when tests improve",
    ],
    "system_addon": (
        "IMPORTANT: Your previous attempt scored below 0.5. "
        "This time: focus on understanding what the failing test EXPECTS. "
        "Make a targeted fix. Verify with run_tests before submitting."
    ),
}
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
class SelfImprovementEngine:
    """
    Generates structured improvement plans from failure analysis.

    Usage:
        engine = SelfImprovementEngine()
        plan = engine.generate_improvement_plan(
            episode_id="abc123",
            task="task1",
            failure_type="BLIND_WRITE",
            failure_evidence=["..."],
            original_score=0.2,
            trajectory_steps=[...],
            files_read=[...],
            files_written=[...],
        )
        # Then inject plan.system_prompt_addon into the agent's next run
    """

    def generate_improvement_plan(
        self,
        episode_id: str,
        task: str,
        failure_type: str,
        failure_evidence: List[str],
        original_score: float,
        trajectory_steps: List[dict],
        files_read: List[str],     # NOTE(review): currently unused here — kept for API symmetry; confirm
        files_written: List[str],  # NOTE(review): currently unused here — kept for API symmetry; confirm
    ) -> ImprovementPlan:
        """Generate an actionable improvement plan from failure data.

        Looks up the strategy template matching ``failure_type`` (falling back
        to DEFAULT_TEMPLATE for unknown labels), collects concrete error
        evidence from the trajectory, and assembles prompt addons for a retry.
        """
        template = STRATEGY_TEMPLATES.get(failure_type, DEFAULT_TEMPLATE)

        # Build specific error list from trajectory (errors truncated to 100
        # chars to keep the plan compact), plus up to 3 classifier evidences.
        specific_errors = []
        for step in trajectory_steps:
            if step.get("error"):
                specific_errors.append(
                    f"Step {step.get('step_number', '?')}: {step['error'][:100]}"
                )
        specific_errors.extend(failure_evidence[:3])

        # Build user context addon with trajectory summary
        action_summary = []
        for step in trajectory_steps[:8]:  # First 8 steps for context
            a = step.get("action_type", "?")
            p = step.get("action_path") or step.get("action_query") or ""
            r = step.get("reward", 0)
            err = " ❌" if step.get("error") else ""
            action_summary.append(f" Step {step.get('step_number', '?')}: {a} {p} → reward={r:+.2f}{err}")

        # Review of the previous attempt followed by the improved strategy;
        # this string is prepended to the agent's first user prompt on retry.
        user_context_addon = (
            f"[PREVIOUS ATTEMPT REVIEW]\n"
            f"Score: {original_score:.2f}/1.0\n"
            f"Primary failure: {failure_type}\n"
            f"What went wrong: {template['what_went_wrong']}\n"
            f"\nYour previous actions:\n" + "\n".join(action_summary) +
            f"\n\n[IMPROVED STRATEGY FOR THIS ATTEMPT]\n{template['strategy']}"
        )

        return ImprovementPlan(
            episode_id=episode_id,
            task=task,
            failure_type=failure_type,
            original_score=original_score,
            what_went_wrong=template["what_went_wrong"],
            specific_errors=specific_errors,
            improved_strategy=template["strategy"],
            step_by_step_plan=template["plan"],
            system_prompt_addon=template["system_addon"],
            user_context_addon=user_context_addon,
        )

    def build_retry_system_prompt(self, base_prompt: str, improvement_plan: ImprovementPlan) -> str:
        """Inject improvement guidance into the agent system prompt.

        Returns ``base_prompt`` unchanged when the plan has no addon
        (e.g. the CORRECT template, whose addon is the empty string).
        """
        if not improvement_plan.system_prompt_addon:
            return base_prompt
        return (
            f"{base_prompt}\n\n"
            f"{'='*60}\n"
            f"PREVIOUS ATTEMPT FEEDBACK (VERY IMPORTANT):\n"
            f"{'='*60}\n"
            f"{improvement_plan.system_prompt_addon}\n"
            f"{'='*60}"
        )

    def build_retry_user_context(self, improvement_plan: ImprovementPlan) -> str:
        """Build the user context string to prepend to the first prompt in a retry."""
        return improvement_plan.user_context_addon
|
server/strategy_detector.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/strategy_detector.py
|
| 2 |
+
"""
|
| 3 |
+
Strategy Pattern Detector.
|
| 4 |
+
|
| 5 |
+
Classifies what high-level search/navigation strategy the agent used.
|
| 6 |
+
This goes beyond step counting — it classifies the cognitive approach.
|
| 7 |
+
|
| 8 |
+
Strategies:
|
| 9 |
+
TARGETED_DEBUGGING — reads test → reads relevant src → fixes → tests
|
| 10 |
+
SYSTEMATIC_SEARCH — reads all files methodically before writing
|
| 11 |
+
BRUTE_FORCE — writes and runs tests repeatedly until something passes
|
| 12 |
+
RANDOM_EXPLORATION — no coherent pattern, reads random files
|
| 13 |
+
SPEC_DRIVEN — reads spec/docs first, then implements
|
| 14 |
+
MINIMAL_EFFORT — does the bare minimum (often fails)
|
| 15 |
+
|
| 16 |
+
Each strategy gets a score (1.0 = ideal for the task), a label, and evidence.
|
| 17 |
+
"""
|
| 18 |
+
from typing import List, Dict, Any, Optional
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
from collections import Counter
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class StrategyReport:
    """Result of strategy pattern detection for one episode."""
    strategy: str  # Primary strategy label
    score: float  # 0.0–1.0 (task-appropriate quality)
    confidence: float  # How confident we are in the label (0–1)
    sub_patterns: List[str]  # Additional behavioral sub-patterns
    evidence: List[str]  # Supporting observations
    strategy_description: str  # Human-readable explanation
    exploration_ratio: float  # 0=pure exploit, 1=pure explore
    pivot_count: int  # How many times agent changed strategy mid-episode

    def to_dict(self) -> dict:
        """Serialize the report; float metrics are rounded to 3 decimals."""
        serialized = {
            "strategy": self.strategy,
            "sub_patterns": self.sub_patterns,
            "evidence": self.evidence,
            "strategy_description": self.strategy_description,
            "pivot_count": self.pivot_count,
        }
        for metric in ("score", "confidence", "exploration_ratio"):
            serialized[metric] = round(getattr(self, metric), 3)
        return serialized
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# Human-readable explanation for each strategy label; keys must match the
# labels emitted by StrategyDetector.detect (copied into
# StrategyReport.strategy_description verbatim).
STRATEGY_DESCRIPTIONS = {
    "TARGETED_DEBUGGING": (
        "Agent reads the failing test to understand expected behavior, "
        "then navigates directly to the relevant source file and makes a targeted fix."
    ),
    "SYSTEMATIC_SEARCH": (
        "Agent reads all available files before writing any code. "
        "Methodical but can waste steps on irrelevant files."
    ),
    "BRUTE_FORCE": (
        "Agent repeatedly writes and runs tests hoping something sticks. "
        "No clear hypothesis about the bug — trial and error approach."
    ),
    "RANDOM_EXPLORATION": (
        "Agent reads files in an incoherent order with no visible strategy. "
        "High entropy — possibly confused by misleading information."
    ),
    "SPEC_DRIVEN": (
        "Agent reads the specification/feature doc first, "
        "then systematically implements what is described. Ideal for task3."
    ),
    "MINIMAL_EFFORT": (
        "Agent took very few steps and submitted early. "
        "May indicate overconfidence or giving up."
    ),
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class StrategyDetector:
    """
    Detects the behavioral strategy pattern used by an agent.

    Usage:
        detector = StrategyDetector()
        report = detector.detect(
            trajectory_steps=[...],
            task="task1",
            variant_meta={...},
            files_read=[...],
            final_score=0.7,
        )
    """

    def detect(
        self,
        trajectory_steps: List[dict],
        task: str,
        variant_meta: Dict[str, Any],
        files_read: List[str],
        final_score: float,
    ) -> StrategyReport:
        """Detect the dominant strategy from trajectory data.

        Args:
            trajectory_steps: Per-step dicts; each may carry "action_type",
                "action_path", and "test_pass_rate".
            task: Task identifier ("task3" marks spec-driven tasks).
            variant_meta: Variant metadata; "bug_files", "interface_files"
                and "read_first_files" list the task-relevant paths.
            files_read: Every file path the agent read during the episode.
            final_score: Final episode reward, expected in [0, 1].

        Returns:
            StrategyReport with the classified strategy, a quality score,
            confidence, behavioral sub-patterns and supporting evidence.
        """
        if not trajectory_steps:
            return StrategyReport(
                strategy="MINIMAL_EFFORT",
                score=0.0,
                confidence=1.0,
                sub_patterns=[],
                evidence=["No steps taken"],
                strategy_description=STRATEGY_DESCRIPTIONS["MINIMAL_EFFORT"],
                exploration_ratio=0.0,
                pivot_count=0,
            )

        action_seq = [s.get("action_type", "") for s in trajectory_steps]
        read_paths = [
            s.get("action_path", "")
            for s in trajectory_steps
            if s.get("action_type") == "read_file"
        ]
        write_count = action_seq.count("write_file")
        test_count = action_seq.count("run_tests")
        read_count = action_seq.count("read_file")
        search_count = action_seq.count("search_code")
        total = len(action_seq)

        relevant = set(
            variant_meta.get("bug_files", []) +
            variant_meta.get("interface_files", []) +
            variant_meta.get("read_first_files", [])
        )
        test_files = [f for f in read_paths if f and f.startswith("tests/")]
        spec_files = [f for f in read_paths if f and f.endswith(".md")]

        sub_patterns = []
        evidence = []

        # ── Exploration ratio: reads/searches vs writes/tests ─────────────────
        explore_actions = read_count + search_count
        exploit_actions = write_count + test_count
        exploration_ratio = (
            explore_actions / (explore_actions + exploit_actions)
            if (explore_actions + exploit_actions) > 0
            else 0.5  # no informative actions at all: report a neutral balance
        )

        # ── Pivot detection: explore↔exploit switches mid-episode ─────────────
        pivots = 0
        current_block = action_seq[0]
        for a in action_seq[1:]:
            read_like = a in ("read_file", "search_code")
            write_like = a in ("write_file", "run_tests")
            cur_read = current_block in ("read_file", "search_code")
            cur_write = current_block in ("write_file", "run_tests")
            if (read_like and cur_write) or (write_like and cur_read):
                pivots += 1
            current_block = a

        # ── Strategy classification (first matching branch wins) ──────────────
        strategy = "RANDOM_EXPLORATION"
        score = 0.4
        confidence = 0.5

        # Hoisted for branches 3: index of first write and how many reads
        # preceded it.
        first_write_idx = next(
            (i for i, a in enumerate(action_seq) if a == "write_file"), total
        )
        reads_before_write = sum(
            1 for i, a in enumerate(action_seq)
            if a == "read_file" and i < first_write_idx
        )

        # 1. SPEC_DRIVEN: the very first action reads a spec/.md file (task3).
        #    (Previously any first-action read counted as long as some .md was
        #    read later, which contradicted the evidence message below.)
        if (task == "task3" and spec_files
                and action_seq[0] == "read_file"
                and read_paths[0].endswith(".md")):
            strategy = "SPEC_DRIVEN"
            score = 0.85 if final_score > 0.5 else 0.55
            confidence = 0.9
            evidence.append(f"Read spec file(s) first: {spec_files[:2]}")
            sub_patterns.append("SPEC_FIRST")

        # 2. TARGETED_DEBUGGING: test first → relevant src → write → verify.
        elif (test_files and read_paths and read_paths[0].startswith("tests/")
              and write_count >= 1 and test_count >= 1):
            strategy = "TARGETED_DEBUGGING"
            score = 0.85 + (0.15 * final_score)
            confidence = 0.85
            evidence.append(f"First read was test file: {read_paths[0]}")
            evidence.append("Followed by write + test verification")
            sub_patterns.append("TEST_FIRST")
            if relevant and any(f in files_read for f in relevant):
                sub_patterns.append("TARGETED_READ")
                score = min(1.0, score + 0.05)

        # 3. SYSTEMATIC_SEARCH: every read happened before the first write.
        #    The full condition lives in this elif; the previous version
        #    guarded with a bare `elif write_count > 0:` which captured every
        #    episode containing a write, leaving BRUTE_FORCE and
        #    MINIMAL_EFFORT below unreachable for such episodes.
        elif (write_count > 0 and read_count > 0
              and reads_before_write == read_count):
            strategy = "SYSTEMATIC_SEARCH"
            score = 0.65
            confidence = 0.75
            evidence.append(f"Read {reads_before_write} files before first write")
            sub_patterns.append("READ_ALL_FIRST")

        # 4. BRUTE_FORCE: repeated write/test cycles, no clear hypothesis.
        elif write_count >= 2 and test_count >= 2:
            strategy = "BRUTE_FORCE"
            score = 0.35
            confidence = 0.8
            evidence.append(f"{write_count} writes + {test_count} test runs = trial and error")
            sub_patterns.append("TRIAL_AND_ERROR")

        # 5. MINIMAL_EFFORT: tiny episode, or no fix attempt at all.
        elif total <= 3 or (write_count == 0 and test_count == 0):
            strategy = "MINIMAL_EFFORT"
            score = 0.1
            confidence = 0.95
            evidence.append(f"Only {total} total steps with no fix attempt")
            sub_patterns.append("GAVE_UP")

        # ── Additional sub-pattern detection ──────────────────────────────────
        # Search-before-read: agent used code search to localize before reading.
        if search_count > 0:
            first_search = next((i for i, a in enumerate(action_seq) if a == "search_code"), total)
            first_read = next((i for i, a in enumerate(action_seq) if a == "read_file"), total)
            if first_search < first_read:
                sub_patterns.append("SEARCH_GUIDED")
                evidence.append("Used search_code to locate bug before reading")

        # Excessive looping: re-reading the same file suggests confusion.
        path_counts = Counter(p for p in read_paths if p)
        max_rereads = max(path_counts.values()) if path_counts else 0
        if max_rereads >= 3:
            sub_patterns.append("READ_LOOP")
            evidence.append(f"Re-read same file {max_rereads}x — likely confused")
            score = max(0.0, score - 0.2)

        # Verified fix: recorded test pass rate improved over the episode.
        test_rates = [s.get("test_pass_rate") for s in trajectory_steps if s.get("test_pass_rate") is not None]
        if len(test_rates) >= 2 and test_rates[-1] > test_rates[0]:
            sub_patterns.append("VERIFIED_FIX")
            evidence.append(f"Test pass rate improved: {test_rates[0]:.2f} → {test_rates[-1]:.2f}")
            score = min(1.0, score + 0.1)

        return StrategyReport(
            strategy=strategy,
            score=max(0.0, min(1.0, score)),
            confidence=confidence,
            sub_patterns=sub_patterns,
            evidence=evidence,
            strategy_description=STRATEGY_DESCRIPTIONS.get(strategy, ""),
            exploration_ratio=exploration_ratio,
            pivot_count=pivots,
        )
|
static/viz3d.html
ADDED
|
@@ -0,0 +1,867 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Agent Trajectory 3D Visualizer</title>
|
| 7 |
+
<style>
|
| 8 |
+
* { margin: 0; padding: 0; box-sizing: border-box; }
|
| 9 |
+
body {
|
| 10 |
+
background: #0a0e1a;
|
| 11 |
+
color: #e0e6f0;
|
| 12 |
+
font-family: 'Segoe UI', system-ui, sans-serif;
|
| 13 |
+
overflow: hidden;
|
| 14 |
+
height: 100vh;
|
| 15 |
+
}
|
| 16 |
+
#canvas-container {
|
| 17 |
+
position: absolute;
|
| 18 |
+
top: 0; left: 0;
|
| 19 |
+
width: 100%; height: 100%;
|
| 20 |
+
}
|
| 21 |
+
#ui-overlay {
|
| 22 |
+
position: absolute;
|
| 23 |
+
top: 0; left: 0;
|
| 24 |
+
width: 100%; height: 100%;
|
| 25 |
+
pointer-events: none;
|
| 26 |
+
z-index: 10;
|
| 27 |
+
}
|
| 28 |
+
/* Header */
|
| 29 |
+
#header {
|
| 30 |
+
position: absolute;
|
| 31 |
+
top: 12px; left: 50%;
|
| 32 |
+
transform: translateX(-50%);
|
| 33 |
+
text-align: center;
|
| 34 |
+
pointer-events: none;
|
| 35 |
+
}
|
| 36 |
+
#header h1 {
|
| 37 |
+
font-size: 16px;
|
| 38 |
+
font-weight: 700;
|
| 39 |
+
color: #7dd3fc;
|
| 40 |
+
letter-spacing: 0.05em;
|
| 41 |
+
text-shadow: 0 0 20px rgba(125,211,252,0.5);
|
| 42 |
+
}
|
| 43 |
+
#header p {
|
| 44 |
+
font-size: 11px;
|
| 45 |
+
color: #64748b;
|
| 46 |
+
margin-top: 2px;
|
| 47 |
+
}
|
| 48 |
+
/* Legend */
|
| 49 |
+
#legend {
|
| 50 |
+
position: absolute;
|
| 51 |
+
top: 12px; right: 16px;
|
| 52 |
+
background: rgba(10,14,26,0.85);
|
| 53 |
+
border: 1px solid rgba(125,211,252,0.2);
|
| 54 |
+
border-radius: 8px;
|
| 55 |
+
padding: 10px 14px;
|
| 56 |
+
font-size: 11px;
|
| 57 |
+
pointer-events: none;
|
| 58 |
+
}
|
| 59 |
+
#legend h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
|
| 60 |
+
.legend-item {
|
| 61 |
+
display: flex; align-items: center; gap: 8px;
|
| 62 |
+
margin-bottom: 5px;
|
| 63 |
+
}
|
| 64 |
+
.legend-dot {
|
| 65 |
+
width: 10px; height: 10px;
|
| 66 |
+
border-radius: 50%;
|
| 67 |
+
flex-shrink: 0;
|
| 68 |
+
}
|
| 69 |
+
/* Info panel */
|
| 70 |
+
#info-panel {
|
| 71 |
+
position: absolute;
|
| 72 |
+
top: 12px; left: 16px;
|
| 73 |
+
background: rgba(10,14,26,0.85);
|
| 74 |
+
border: 1px solid rgba(125,211,252,0.2);
|
| 75 |
+
border-radius: 8px;
|
| 76 |
+
padding: 12px 16px;
|
| 77 |
+
min-width: 220px;
|
| 78 |
+
pointer-events: none;
|
| 79 |
+
}
|
| 80 |
+
#info-panel h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; letter-spacing: 0.1em; }
|
| 81 |
+
.info-row {
|
| 82 |
+
display: flex; justify-content: space-between; gap: 12px;
|
| 83 |
+
font-size: 11px;
|
| 84 |
+
margin-bottom: 4px;
|
| 85 |
+
color: #94a3b8;
|
| 86 |
+
}
|
| 87 |
+
.info-value { color: #e0e6f0; font-weight: 600; }
|
| 88 |
+
/* Timeline */
|
| 89 |
+
#timeline-panel {
|
| 90 |
+
position: absolute;
|
| 91 |
+
bottom: 20px; left: 50%;
|
| 92 |
+
transform: translateX(-50%);
|
| 93 |
+
background: rgba(10,14,26,0.9);
|
| 94 |
+
border: 1px solid rgba(125,211,252,0.2);
|
| 95 |
+
border-radius: 10px;
|
| 96 |
+
padding: 14px 20px;
|
| 97 |
+
width: min(700px, 90vw);
|
| 98 |
+
pointer-events: all;
|
| 99 |
+
}
|
| 100 |
+
#timeline-panel .tl-header {
|
| 101 |
+
display: flex;
|
| 102 |
+
justify-content: space-between;
|
| 103 |
+
align-items: center;
|
| 104 |
+
margin-bottom: 10px;
|
| 105 |
+
}
|
| 106 |
+
#timeline-panel h3 {
|
| 107 |
+
font-size: 11px;
|
| 108 |
+
color: #7dd3fc;
|
| 109 |
+
letter-spacing: 0.1em;
|
| 110 |
+
}
|
| 111 |
+
#step-label {
|
| 112 |
+
font-size: 12px;
|
| 113 |
+
color: #f0abfc;
|
| 114 |
+
font-weight: 700;
|
| 115 |
+
}
|
| 116 |
+
#timeline-slider {
|
| 117 |
+
width: 100%;
|
| 118 |
+
-webkit-appearance: none;
|
| 119 |
+
height: 4px;
|
| 120 |
+
background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
|
| 121 |
+
border-radius: 4px;
|
| 122 |
+
outline: none;
|
| 123 |
+
cursor: pointer;
|
| 124 |
+
}
|
| 125 |
+
#timeline-slider::-webkit-slider-thumb {
|
| 126 |
+
-webkit-appearance: none;
|
| 127 |
+
width: 16px; height: 16px;
|
| 128 |
+
border-radius: 50%;
|
| 129 |
+
background: #7dd3fc;
|
| 130 |
+
cursor: pointer;
|
| 131 |
+
box-shadow: 0 0 10px rgba(125,211,252,0.7);
|
| 132 |
+
}
|
| 133 |
+
#step-actions {
|
| 134 |
+
display: flex;
|
| 135 |
+
gap: 8px;
|
| 136 |
+
margin-top: 10px;
|
| 137 |
+
justify-content: center;
|
| 138 |
+
}
|
| 139 |
+
.tl-btn {
|
| 140 |
+
background: rgba(125,211,252,0.1);
|
| 141 |
+
border: 1px solid rgba(125,211,252,0.3);
|
| 142 |
+
color: #7dd3fc;
|
| 143 |
+
padding: 5px 14px;
|
| 144 |
+
border-radius: 6px;
|
| 145 |
+
cursor: pointer;
|
| 146 |
+
font-size: 11px;
|
| 147 |
+
transition: all 0.2s;
|
| 148 |
+
}
|
| 149 |
+
.tl-btn:hover { background: rgba(125,211,252,0.25); }
|
| 150 |
+
.tl-btn.active { background: rgba(125,211,252,0.3); }
|
| 151 |
+
/* Step log */
|
| 152 |
+
#step-log {
|
| 153 |
+
position: absolute;
|
| 154 |
+
bottom: 130px; right: 16px;
|
| 155 |
+
background: rgba(10,14,26,0.85);
|
| 156 |
+
border: 1px solid rgba(125,211,252,0.2);
|
| 157 |
+
border-radius: 8px;
|
| 158 |
+
padding: 10px 14px;
|
| 159 |
+
width: 260px;
|
| 160 |
+
max-height: 240px;
|
| 161 |
+
overflow-y: auto;
|
| 162 |
+
pointer-events: none;
|
| 163 |
+
font-size: 10px;
|
| 164 |
+
}
|
| 165 |
+
#step-log h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
|
| 166 |
+
.log-entry {
|
| 167 |
+
display: flex;
|
| 168 |
+
align-items: flex-start;
|
| 169 |
+
gap: 6px;
|
| 170 |
+
margin-bottom: 6px;
|
| 171 |
+
padding-bottom: 6px;
|
| 172 |
+
border-bottom: 1px solid rgba(255,255,255,0.05);
|
| 173 |
+
}
|
| 174 |
+
.log-entry:last-child { border-bottom: none; }
|
| 175 |
+
.log-step { color: #475569; min-width: 28px; }
|
| 176 |
+
.log-action { font-weight: 600; }
|
| 177 |
+
.log-reward { margin-left: auto; font-weight: 700; }
|
| 178 |
+
.reward-pos { color: #4ade80; }
|
| 179 |
+
.reward-neg { color: #f87171; }
|
| 180 |
+
.reward-zero { color: #94a3b8; }
|
| 181 |
+
/* Tooltip */
|
| 182 |
+
#tooltip {
|
| 183 |
+
position: absolute;
|
| 184 |
+
background: rgba(10,14,26,0.95);
|
| 185 |
+
border: 1px solid rgba(125,211,252,0.4);
|
| 186 |
+
border-radius: 6px;
|
| 187 |
+
padding: 8px 12px;
|
| 188 |
+
font-size: 11px;
|
| 189 |
+
pointer-events: none;
|
| 190 |
+
opacity: 0;
|
| 191 |
+
transition: opacity 0.15s;
|
| 192 |
+
max-width: 200px;
|
| 193 |
+
z-index: 20;
|
| 194 |
+
}
|
| 195 |
+
#tooltip h4 { color: #7dd3fc; margin-bottom: 4px; }
|
| 196 |
+
/* Score ring */
|
| 197 |
+
#score-ring {
|
| 198 |
+
position: absolute;
|
| 199 |
+
bottom: 130px; left: 16px;
|
| 200 |
+
pointer-events: none;
|
| 201 |
+
}
|
| 202 |
+
#score-ring svg text { font-family: 'Segoe UI', sans-serif; }
|
| 203 |
+
/* Loader */
|
| 204 |
+
#loader {
|
| 205 |
+
position: absolute;
|
| 206 |
+
top: 50%; left: 50%;
|
| 207 |
+
transform: translate(-50%, -50%);
|
| 208 |
+
color: #7dd3fc;
|
| 209 |
+
font-size: 14px;
|
| 210 |
+
text-align: center;
|
| 211 |
+
}
|
| 212 |
+
.loader-spinner {
|
| 213 |
+
width: 40px; height: 40px;
|
| 214 |
+
border: 3px solid rgba(125,211,252,0.2);
|
| 215 |
+
border-top-color: #7dd3fc;
|
| 216 |
+
border-radius: 50%;
|
| 217 |
+
animation: spin 0.8s linear infinite;
|
| 218 |
+
margin: 0 auto 12px;
|
| 219 |
+
}
|
| 220 |
+
@keyframes spin { to { transform: rotate(360deg); } }
|
| 221 |
+
</style>
|
| 222 |
+
</head>
|
| 223 |
+
<body>
|
| 224 |
+
|
| 225 |
+
<!-- Hidden data injection point -->
|
| 226 |
+
<div id="viz-data" style="display:none"></div>
|
| 227 |
+
|
| 228 |
+
<div id="canvas-container">
|
| 229 |
+
<canvas id="three-canvas"></canvas>
|
| 230 |
+
</div>
|
| 231 |
+
|
| 232 |
+
<div id="loader">
|
| 233 |
+
<div class="loader-spinner"></div>
|
| 234 |
+
<p>Initializing 3D Visualizer...</p>
|
| 235 |
+
</div>
|
| 236 |
+
|
| 237 |
+
<div id="ui-overlay">
|
| 238 |
+
<!-- Header -->
|
| 239 |
+
<div id="header">
|
| 240 |
+
<h1>🔍 Agent Trajectory Visualizer — 3D</h1>
|
| 241 |
+
<p>Files = nodes · Dependencies = edges · Agent path = animated beam</p>
|
| 242 |
+
</div>
|
| 243 |
+
|
| 244 |
+
<!-- Info panel -->
|
| 245 |
+
<div id="info-panel">
|
| 246 |
+
<h3>EPISODE STATS</h3>
|
| 247 |
+
<div class="info-row"><span>Task</span><span class="info-value" id="stat-task">—</span></div>
|
| 248 |
+
<div class="info-row"><span>Variant</span><span class="info-value" id="stat-variant">—</span></div>
|
| 249 |
+
<div class="info-row"><span>Steps</span><span class="info-value" id="stat-steps">—</span></div>
|
| 250 |
+
<div class="info-row"><span>Score</span><span class="info-value" id="stat-score">—</span></div>
|
| 251 |
+
<div class="info-row"><span>Strategy</span><span class="info-value" id="stat-strategy">—</span></div>
|
| 252 |
+
<div class="info-row"><span>Failure</span><span class="info-value" id="stat-failure">—</span></div>
|
| 253 |
+
</div>
|
| 254 |
+
|
| 255 |
+
<!-- Legend -->
|
| 256 |
+
<div id="legend">
|
| 257 |
+
<h3>LEGEND</h3>
|
| 258 |
+
<div class="legend-item">
|
| 259 |
+
<div class="legend-dot" style="background:#f97316"></div><span>Source file</span>
|
| 260 |
+
</div>
|
| 261 |
+
<div class="legend-item">
|
| 262 |
+
<div class="legend-dot" style="background:#3b82f6"></div><span>Test file</span>
|
| 263 |
+
</div>
|
| 264 |
+
<div class="legend-item">
|
| 265 |
+
<div class="legend-dot" style="background:#a855f7"></div><span>Spec / Docs</span>
|
| 266 |
+
</div>
|
| 267 |
+
<div class="legend-item">
|
| 268 |
+
<div class="legend-dot" style="background:#22c55e"></div><span>Visited</span>
|
| 269 |
+
</div>
|
| 270 |
+
<div class="legend-item">
|
| 271 |
+
<div class="legend-dot" style="background:#ef4444"></div><span>Modified / Bug</span>
|
| 272 |
+
</div>
|
| 273 |
+
<div class="legend-item">
|
| 274 |
+
<div class="legend-dot" style="background:#facc15; width:20px; height:4px; border-radius:2px;"></div><span>Agent path</span>
|
| 275 |
+
</div>
|
| 276 |
+
</div>
|
| 277 |
+
|
| 278 |
+
<!-- Score ring -->
|
| 279 |
+
<div id="score-ring">
|
| 280 |
+
<svg width="80" height="80" viewBox="0 0 80 80">
|
| 281 |
+
<circle cx="40" cy="40" r="34" fill="none"
|
| 282 |
+
stroke="rgba(125,211,252,0.15)" stroke-width="6"/>
|
| 283 |
+
<circle id="score-arc" cx="40" cy="40" r="34" fill="none"
|
| 284 |
+
stroke="#7dd3fc" stroke-width="6"
|
| 285 |
+
stroke-dasharray="0 214"
|
| 286 |
+
stroke-linecap="round"
|
| 287 |
+
transform="rotate(-90 40 40)"
|
| 288 |
+
style="transition: stroke-dasharray 1s ease;"/>
|
| 289 |
+
<text id="score-text" x="40" y="45" text-anchor="middle"
|
| 290 |
+
fill="#e0e6f0" font-size="14" font-weight="700">0.0</text>
|
| 291 |
+
</svg>
|
| 292 |
+
</div>
|
| 293 |
+
|
| 294 |
+
<!-- Step log -->
|
| 295 |
+
<div id="step-log">
|
| 296 |
+
<h3>STEP LOG</h3>
|
| 297 |
+
<div id="log-entries"></div>
|
| 298 |
+
</div>
|
| 299 |
+
|
| 300 |
+
<!-- Tooltip -->
|
| 301 |
+
<div id="tooltip">
|
| 302 |
+
<h4 id="tooltip-title">File</h4>
|
| 303 |
+
<div id="tooltip-body"></div>
|
| 304 |
+
</div>
|
| 305 |
+
|
| 306 |
+
<!-- Timeline -->
|
| 307 |
+
<div id="timeline-panel">
|
| 308 |
+
<div class="tl-header">
|
| 309 |
+
<h3>TIMELINE REPLAY</h3>
|
| 310 |
+
<span id="step-label">Step 0 / 0</span>
|
| 311 |
+
</div>
|
| 312 |
+
<input type="range" id="timeline-slider" min="0" max="0" value="0"
|
| 313 |
+
oninput="onSliderChange(this.value)">
|
| 314 |
+
<div id="step-actions">
|
| 315 |
+
<button class="tl-btn" onclick="stepBack()">◀ Back</button>
|
| 316 |
+
<button class="tl-btn" id="play-btn" onclick="togglePlay()">▶ Play</button>
|
| 317 |
+
<button class="tl-btn" onclick="stepForward()">Forward ▶</button>
|
| 318 |
+
<button class="tl-btn" onclick="resetView()">↺ Reset</button>
|
| 319 |
+
<button class="tl-btn" id="orbit-btn" onclick="toggleOrbit()">🔄 Orbit</button>
|
| 320 |
+
</div>
|
| 321 |
+
</div>
|
| 322 |
+
</div>
|
| 323 |
+
|
| 324 |
+
<!-- Three.js from CDN -->
|
| 325 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
|
| 326 |
+
<script>
|
| 327 |
+
// ── Sample data (replaced by real data from backend) ───────────────────────
|
| 328 |
+
// Fallback episode payload used when the backend has not injected real data
// into #viz-data (see loadVizData). Shape mirrors the backend schema.
const DEFAULT_DATA = {
    task: "task1",
    variant_id: "variant_1",
    final_score: 0.714,
    strategy: "TARGETED_DEBUGGING",
    failure_type: "CORRECT",
    // Graph nodes: one entry per repo file; `type` selects the node color
    // in buildScene, `is_bug_file` overrides it with the bug color.
    files: [
        { name: "tests/test_formatter.py", type: "test" },
        { name: "src/formatter.py", type: "src", is_bug_file: true },
        { name: "src/utils.py", type: "src" }
    ],
    // Dependency edges rendered as lines between file nodes.
    dependencies: [
        { from: "tests/test_formatter.py", to: "src/formatter.py" },
        { from: "src/formatter.py", to: "src/utils.py" }
    ],
    // Ordered agent actions with the per-step reward reported by the env.
    steps: [
        { step: 1, action: "read_file", path: "tests/test_formatter.py", reward: 0.0 },
        { step: 2, action: "read_file", path: "src/formatter.py", reward: 0.05 },
        { step: 3, action: "search_code", path: null, reward: 0.0 },
        { step: 4, action: "run_tests", path: "tests/test_formatter.py", reward: 0.0 },
        { step: 5, action: "submit", path: null, reward: 0.694 }
    ]
};
|
| 351 |
+
|
| 352 |
+
// ── Load data from injection point or use default ────────────────────────────
|
| 353 |
+
// Load the episode payload injected into #viz-data by the backend, falling
// back to the bundled DEFAULT_DATA sample when the element is absent, empty,
// or contains malformed JSON.
function loadVizData() {
    const el = document.getElementById('viz-data');
    if (el && el.textContent.trim()) {
        try {
            return JSON.parse(el.textContent);
        } catch (e) {
            // A bad injection should not blank the page, but it should not
            // fail silently either — warn, then fall through to the sample.
            console.warn('viz-data contained invalid JSON; using DEFAULT_DATA:', e);
        }
    }
    return DEFAULT_DATA;
}
|
| 360 |
+
|
| 361 |
+
// ── Three.js setup ───────────────────────────────────────────────────────────
const canvas = document.getElementById('three-canvas');
const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: true });
renderer.setSize(window.innerWidth, window.innerHeight);
// Cap device pixel ratio at 2 to bound fill cost on hi-DPI screens.
renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
renderer.setClearColor(0x0a0e1a, 1);

const scene = new THREE.Scene();
const fov = 60;
const camera = new THREE.PerspectiveCamera(fov, window.innerWidth / window.innerHeight, 0.1, 1000);
camera.position.set(0, 8, 22);
camera.lookAt(0, 0, 0);

// Ambient + directional light
scene.add(new THREE.AmbientLight(0x1a2040, 1));
const dirLight = new THREE.DirectionalLight(0x7dd3fc, 0.6);
dirLight.position.set(5, 10, 5);
scene.add(dirLight);

// Grid
const grid = new THREE.GridHelper(40, 20, 0x1e293b, 0x1e293b);
// Sit the grid below the node plane (nodes are laid out at y = 0).
grid.position.y = -3;
scene.add(grid);

// Stars
// Static point cloud scattered in a 200-unit cube for background depth.
const starGeo = new THREE.BufferGeometry();
const starCount = 800;
const starPositions = new Float32Array(starCount * 3);
for (let i = 0; i < starCount * 3; i++) starPositions[i] = (Math.random() - 0.5) * 200;
starGeo.setAttribute('position', new THREE.BufferAttribute(starPositions, 3));
const starMat = new THREE.PointsMaterial({ color: 0x334155, size: 0.3 });
scene.add(new THREE.Points(starGeo, starMat));

// ── Color palette ─────────────────────────────────────────────────────────────
// Hex colors keyed by file `type` (src/test/spec) plus node states and the
// agent path/marker; matches the legend markup in #legend.
const COLORS = {
    src: 0xf97316,
    test: 0x3b82f6,
    spec: 0xa855f7,
    visited: 0x22c55e,
    modified: 0xef4444,
    bug: 0xef4444,
    edge: 0x334155,
    path: 0xfacc15,
    agent: 0xfbbf24,
};
|
| 406 |
+
|
| 407 |
+
// ── Orbit control (manual implementation) ────────────────────────────────
// Drag to orbit, scroll wheel to zoom. The camera orbit state lives in
// `spherical` (radius r, polar phi, azimuth theta); updateCamera() converts
// it to a Cartesian camera position every frame.
let isOrbiting = false; // NOTE(review): not referenced in visible code — kept for later script sections
let orbitActive = false; // auto-rotate toggle
let mouse = { x: 0, y: 0, down: false, lastX: 0, lastY: 0 };
let spherical = { theta: 0, phi: Math.PI / 4, r: 22 };

canvas.addEventListener('mousedown', e => { mouse.down = true; mouse.lastX = e.clientX; mouse.lastY = e.clientY; });
// Listen on window, not the canvas: releasing the button while the cursor
// is outside the canvas previously left mouse.down stuck at true, so the
// camera kept dragging until the next click.
window.addEventListener('mouseup', () => { mouse.down = false; });
canvas.addEventListener('mousemove', e => {
    if (!mouse.down) {
        // Not dragging — only update the hover tooltip.
        checkHover(e.clientX, e.clientY);
        return;
    }
    const dx = e.clientX - mouse.lastX;
    const dy = e.clientY - mouse.lastY;
    spherical.theta -= dx * 0.005;
    // Clamp phi to (0.1, 90°] so the camera never flips below the grid.
    spherical.phi = Math.max(0.1, Math.min(Math.PI / 2, spherical.phi - dy * 0.005));
    mouse.lastX = e.clientX;
    mouse.lastY = e.clientY;
});
canvas.addEventListener('wheel', e => {
    // Zoom, clamped to [8, 50] world units.
    spherical.r = Math.max(8, Math.min(50, spherical.r + e.deltaY * 0.02));
});
|
| 431 |
+
|
| 432 |
+
// Recompute the camera position from the spherical orbit state each frame.
// With auto-orbit enabled the azimuth advances slowly before the
// (r, phi, theta) → Cartesian conversion; the camera always aims at origin.
function updateCamera() {
    if (orbitActive) {
        spherical.theta += 0.003;
    }
    const { r, phi, theta } = spherical;
    const sinPhi = Math.sin(phi);
    camera.position.set(
        r * sinPhi * Math.sin(theta),
        r * Math.cos(phi),
        r * sinPhi * Math.cos(theta)
    );
    camera.lookAt(0, 0, 0);
}
|
| 439 |
+
|
| 440 |
+
// ── Scene objects ─────────────────────────────────────────────────────────────
// Mutable visualizer state; buildScene() clears and repopulates most of it
// for each rendered episode.
const nodeObjects = {}; // name → { mesh, label, position }
const edgeObjects = []; // dependency lines currently in the scene
const pathObjects = []; // agent-path geometry currently in the scene
let agentSphere = null; // glowing marker for the agent's current node
let agentTrail = null; // presumably the agent's movement trail — set later in the script
let currentStep = 0; // timeline replay cursor
let maxStep = 0; // highest replayable step index
let playing = false; // timeline auto-play state
let playInterval = null; // interval id while auto-playing
let vizData = null; // episode payload currently rendered
let nodePositions = {}; // file name → THREE.Vector3 layout position
|
| 452 |
+
|
| 453 |
+
// ── Build scene from data ─────────────────────────────────────────────────────
// Rebuild the entire 3D scene from a trajectory payload:
//   data.files        → file-node spheres on a circular XZ layout, colored by type
//   data.dependencies → faint lines between dependent files
//   data.steps        → the timeline consumed by applyStep
// Safe to call repeatedly: previous objects are removed AND their GPU resources
// disposed, and all bookkeeping holders are reset.
function buildScene(data) {
  vizData = data;

  // Clear previous objects and release GPU resources (geometry/material are
  // not freed by scene.remove alone).
  Object.values(nodeObjects).forEach(o => {
    scene.remove(o.mesh);
    o.mesh.geometry.dispose();
    o.mesh.material.dispose();
  });
  edgeObjects.forEach(e => { scene.remove(e); e.geometry.dispose(); e.material.dispose(); });
  pathObjects.forEach(p => { scene.remove(p); p.geometry.dispose(); p.material.dispose(); });
  if (agentSphere) scene.remove(agentSphere);
  Object.keys(nodeObjects).forEach(k => delete nodeObjects[k]);
  // BUGFIX: these holders were never reset, so loading a second trajectory
  // accumulated stale edge/path entries and could resolve dependency endpoints
  // against positions left over from the previous dataset.
  edgeObjects.length = 0;
  pathObjects.length = 0;
  nodePositions = {};

  const files = data.files || [];
  const n = files.length;
  if (n === 0) return;

  // Arrange files in a circular layout on the XZ plane.
  files.forEach((file, i) => {
    const angle = (i / n) * Math.PI * 2;
    const radius = Math.max(4, n * 0.9); // widen the ring for larger repos
    const x = Math.cos(angle) * radius;
    const z = Math.sin(angle) * radius;
    const y = 0;

    nodePositions[file.name] = new THREE.Vector3(x, y, z);

    // Sphere geometry, colored by file role (bug location wins).
    const geo = new THREE.SphereGeometry(0.6, 16, 16);
    const color = new THREE.Color(
      file.is_bug_file ? COLORS.bug :
      file.type === 'test' ? COLORS.test :
      file.type === 'spec' ? COLORS.spec : COLORS.src
    );
    const mat = new THREE.MeshPhongMaterial({
      color,
      emissive: color.clone().multiplyScalar(0.3),
      shininess: 60,
      transparent: true,
      opacity: 0.9,
    });
    const mesh = new THREE.Mesh(geo, mat);
    mesh.position.set(x, y, z);
    mesh.userData = { file }; // consumed by the hover raycaster
    scene.add(mesh);

    // Glow ring around each node (child of the sphere, so it moves with it).
    const ringGeo = new THREE.RingGeometry(0.75, 0.85, 32);
    const ringMat = new THREE.MeshBasicMaterial({
      color,
      transparent: true,
      opacity: 0.25,
      side: THREE.DoubleSide,
    });
    const ring = new THREE.Mesh(ringGeo, ringMat);
    ring.rotation.x = Math.PI / 2;
    mesh.add(ring);

    nodeObjects[file.name] = { mesh, position: nodePositions[file.name], file };
  });

  // Draw dependency edges between laid-out nodes; unknown endpoints are skipped.
  (data.dependencies || []).forEach(dep => {
    const fromPos = nodePositions[dep.from];
    const toPos = nodePositions[dep.to];
    if (!fromPos || !toPos) return;

    const points = [fromPos.clone(), toPos.clone()];
    const geo = new THREE.BufferGeometry().setFromPoints(points);
    const mat = new THREE.LineBasicMaterial({
      color: COLORS.edge,
      transparent: true,
      opacity: 0.4,
    });
    const line = new THREE.Line(geo, mat);
    scene.add(line);
    edgeObjects.push(line);
  });

  // Agent globe — pulsing marker animated in animate() and moved by applyStep().
  const agentGeo = new THREE.SphereGeometry(0.35, 16, 16);
  const agentMat = new THREE.MeshPhongMaterial({
    color: COLORS.agent,
    emissive: 0xfbbf24,
    emissiveIntensity: 0.8,
    shininess: 100,
  });
  agentSphere = new THREE.Mesh(agentGeo, agentMat);
  agentSphere.position.set(0, 3, 0); // Start above origin
  scene.add(agentSphere);

  // Update the side-panel stats.
  document.getElementById('stat-task').textContent = data.task || '—';
  document.getElementById('stat-variant').textContent = data.variant_id || '—';
  document.getElementById('stat-steps').textContent = (data.steps || []).length;
  document.getElementById('stat-strategy').textContent = data.strategy || '—';
  document.getElementById('stat-failure').textContent = data.failure_type || '—';
  updateScore(data.final_score || 0);
  updateStepLog(data.steps || [], -1);

  // Setup timeline slider bounds and rewind to step 0.
  maxStep = (data.steps || []).length;
  const slider = document.getElementById('timeline-slider');
  slider.max = maxStep;
  slider.value = 0;
  currentStep = 0;
  updateStepLabel(0, maxStep);

  applyStep(0);
}
|
| 562 |
+
// ── Animation: go to a specific step ─────────────────────────────────────────
// Render the scene state for timeline position `stepIndex` (0 = before any
// step). Recomputes visited/modified sets from scratch so scrubbing backward
// works, recolors nodes, redraws the path beam, and repositions the agent.
function applyStep(stepIndex) {
  if (!vizData) return;
  const steps = vizData.steps || [];
  const visitedFiles = new Set();   // files touched by read_file up to stepIndex
  const modifiedFiles = new Set();  // files touched by write_file up to stepIndex

  // Reset all nodes to their dimmed base color/scale before re-highlighting.
  Object.values(nodeObjects).forEach(obj => {
    const file = obj.file;
    const baseColor = new THREE.Color(
      file.is_bug_file ? COLORS.bug :
      file.type === 'test' ? COLORS.test :
      file.type === 'spec' ? COLORS.spec : COLORS.src
    );
    obj.mesh.material.color.set(baseColor);
    obj.mesh.material.emissive.set(baseColor.clone().multiplyScalar(0.2));
    obj.mesh.material.opacity = 0.5;
    obj.mesh.scale.set(1, 1, 1);
  });

  // Remove old path lines (rebuilt below from the step prefix).
  pathObjects.forEach(p => scene.remove(p));
  pathObjects.length = 0;

  // Collect positions for the path up to (but excluding) the current step.
  const pathPositions = [];

  for (let i = 0; i < stepIndex; i++) {
    const step = steps[i];
    if (!step) continue;

    // Only steps targeting a known file node contribute to the beam.
    if (step.path && nodeObjects[step.path]) {
      const pos = nodeObjects[step.path].position.clone();
      // Lift the beam slightly above the nodes so it doesn't z-fight.
      pathPositions.push(pos.clone().add(new THREE.Vector3(0, 0.1, 0)));

      if (step.action === 'read_file') visitedFiles.add(step.path);
      if (step.action === 'write_file') modifiedFiles.add(step.path);
    }
  }

  // Color visited + modified (modified applied second, so it wins visually).
  visitedFiles.forEach(name => {
    if (nodeObjects[name]) {
      nodeObjects[name].mesh.material.color.set(COLORS.visited);
      nodeObjects[name].mesh.material.emissive.set(
        new THREE.Color(COLORS.visited).multiplyScalar(0.4)
      );
      nodeObjects[name].mesh.material.opacity = 1.0;
      nodeObjects[name].mesh.scale.set(1.2, 1.2, 1.2);
    }
  });
  modifiedFiles.forEach(name => {
    if (nodeObjects[name]) {
      nodeObjects[name].mesh.material.color.set(COLORS.modified);
      nodeObjects[name].mesh.material.emissive.set(
        new THREE.Color(COLORS.modified).multiplyScalar(0.5)
      );
      nodeObjects[name].mesh.material.opacity = 1.0;
      nodeObjects[name].mesh.scale.set(1.4, 1.4, 1.4);
    }
  });

  // Draw path beam (needs at least two waypoints to form a line).
  if (pathPositions.length >= 2) {
    const pathGeo = new THREE.BufferGeometry().setFromPoints(pathPositions);
    const pathMat = new THREE.LineBasicMaterial({
      color: COLORS.path,
      transparent: true,
      opacity: 0.85,
      linewidth: 2,
    });
    const pathLine = new THREE.Line(pathGeo, pathMat);
    scene.add(pathLine);
    pathObjects.push(pathLine);
  }

  // Move agent sphere above the current step's file (or hover centrally).
  if (stepIndex > 0 && stepIndex <= steps.length) {
    const currentStepData = steps[stepIndex - 1];
    if (currentStepData && currentStepData.path && nodeObjects[currentStepData.path]) {
      const targetPos = nodeObjects[currentStepData.path].position;
      agentSphere.position.set(targetPos.x, targetPos.y + 1.2, targetPos.z);
    } else {
      // No file target — float in center (for search/submit actions)
      agentSphere.position.set(0, 2.5, 0);
    }
  } else {
    // Before the first step: park the agent high above the origin.
    agentSphere.position.set(0, 3.5, 0);
  }

  // Highlight current node by enlarging it past the visited/modified scales.
  if (stepIndex > 0) {
    const cur = steps[stepIndex - 1];
    if (cur && cur.path && nodeObjects[cur.path]) {
      nodeObjects[cur.path].mesh.scale.set(1.6, 1.6, 1.6);
    }
  }

  updateStepLog(steps, stepIndex - 1);
  updateStepLabel(stepIndex, maxStep);

  // Update slider progress gradient via the --pct CSS custom property.
  const slider = document.getElementById('timeline-slider');
  const pct = maxStep > 0 ? (stepIndex / maxStep * 100) : 0;
  slider.style.setProperty('--pct', pct + '%');
}
// ── Score ring ────────────────────────────────────────────────────────────────
// Fill the circular SVG gauge proportionally to `score` (clamped to [0, 1]),
// update the numeric readouts, and color the arc green/amber/red by threshold.
function updateScore(score) {
  const FULL = 2 * Math.PI * 34; // circumference of the r=34 score circle
  const clamped = Math.min(1, Math.max(0, score));
  const arcEl = document.getElementById('score-arc');

  arcEl.setAttribute('stroke-dasharray', `${FULL * clamped} ${FULL}`);
  document.getElementById('score-text').textContent = score.toFixed(2);
  document.getElementById('stat-score').textContent = score.toFixed(3);

  // Threshold → color: green ≥ 0.7, amber ≥ 0.4, red otherwise.
  let color;
  if (score >= 0.7) {
    color = '#4ade80';
  } else if (score >= 0.4) {
    color = '#fbbf24';
  } else {
    color = '#f87171';
  }
  arcEl.setAttribute('stroke', color);
}
// ── Step log ──────────────────────────────────────────────────────────────────
// Rebuild the side-panel step list. `currentIdx` is the 0-based index of the
// active step (-1 = none); past steps are dimmed, future steps dimmed further.
function updateStepLog(steps, currentIdx) {
  const container = document.getElementById('log-entries');
  container.innerHTML = '';

  const ACTION_EMOJI = {
    read_file: '📖',
    write_file: '✏️',
    run_tests: '🧪',
    search_code: '🔍',
    submit: '🏁',
  };

  steps.forEach((step, i) => {
    const active = i === currentIdx;
    const past = i < currentIdx;
    const entry = document.createElement('div');
    entry.className = 'log-entry';
    // Three opacity tiers: active (1) > past (0.6) > future (0.35).
    entry.style.opacity = past ? '0.6' : active ? '1' : '0.35';
    if (active) entry.style.background = 'rgba(125,211,252,0.08)';

    const reward = step.reward || 0;
    const rewardClass = reward > 0 ? 'reward-pos' : reward < 0 ? 'reward-neg' : 'reward-zero';
    const emoji = ACTION_EMOJI[step.action] || '•';
    // Show only the basename of the file, or the action name when no path.
    const path = step.path ? step.path.split('/').pop() : step.action;

    entry.innerHTML = `
      <span class="log-step">S${step.step}</span>
      <span class="log-action" style="color:${active ? '#7dd3fc' : '#94a3b8'}">${emoji} ${path}</span>
      <span class="log-reward ${rewardClass}">${reward > 0 ? '+' : ''}${reward.toFixed(2)}</span>
    `;
    container.appendChild(entry);
  });

  // Auto-scroll so the active entry stays visible.
  if (currentIdx >= 0) {
    const entries = container.children;
    if (entries[currentIdx]) {
      entries[currentIdx].scrollIntoView({ block: 'nearest' });
    }
  }
}
// ── Hover tooltip ─────────────────────────────────────────────────────────────
// Raycast from the mouse into the node spheres and show a tooltip for the
// nearest hit. (mx, my) are client pixel coordinates.
const raycaster = new THREE.Raycaster();
const mouseVec = new THREE.Vector2();
const tooltip = document.getElementById('tooltip');

function checkHover(mx, my) {
  // Convert pixel coords to normalized device coordinates ([-1, 1], y up).
  mouseVec.x = (mx / window.innerWidth) * 2 - 1;
  mouseVec.y = -(my / window.innerHeight) * 2 + 1;
  raycaster.setFromCamera(mouseVec, camera);

  const meshes = Object.values(nodeObjects).map(o => o.mesh);
  const hits = raycaster.intersectObjects(meshes);

  if (hits.length > 0) {
    // intersectObjects returns hits sorted by distance; take the closest.
    const file = hits[0].object.userData.file;
    if (file) {
      tooltip.style.opacity = '1';
      tooltip.style.left = (mx + 14) + 'px';
      tooltip.style.top = (my - 14) + 'px';
      document.getElementById('tooltip-title').textContent = file.name;
      document.getElementById('tooltip-body').innerHTML = `
        Type: ${file.type}<br>
        ${file.is_bug_file ? '⚠️ Bug location' : ''}
      `;
    }
  } else {
    tooltip.style.opacity = '0';
  }
}
// ── Timeline controls ─────────────────────────────────────────────────────────
// Slider input handler: jump the visualization to the selected step.
// `val` is the range input's string value.
function onSliderChange(val) {
  // Explicit radix: parseInt without one is a well-known footgun.
  currentStep = parseInt(val, 10);
  applyStep(currentStep);
}
// Advance the timeline by one step; no-op when already at the final step.
function stepForward() {
  if (currentStep >= maxStep) return; // guard clause instead of wrapping if
  currentStep += 1;
  document.getElementById('timeline-slider').value = currentStep;
  applyStep(currentStep);
}
// Rewind the timeline by one step; no-op when already at step 0.
function stepBack() {
  if (currentStep <= 0) return; // guard clause instead of wrapping if
  currentStep -= 1;
  document.getElementById('timeline-slider').value = currentStep;
  applyStep(currentStep);
}
// Toggle auto-play of the trajectory. Ticks every 900ms; stops itself at the
// end. Pressing Play at the end restarts from step 0.
function togglePlay() {
  playing = !playing;
  const btn = document.getElementById('play-btn');
  btn.textContent = playing ? '⏸ Pause' : '▶ Play';
  if (playing) {
    // Defensive: never stack a second interval on top of an old one.
    if (playInterval) clearInterval(playInterval);
    if (currentStep >= maxStep) {
      // BUGFIX: restarting from the end previously reset only the counter,
      // leaving the slider and scene showing the final step until the first
      // tick fired. Sync the UI to step 0 immediately.
      currentStep = 0;
      document.getElementById('timeline-slider').value = 0;
      applyStep(0);
    }
    playInterval = setInterval(() => {
      if (currentStep >= maxStep) {
        playing = false;
        btn.textContent = '▶ Play';
        clearInterval(playInterval);
        return;
      }
      stepForward();
    }, 900);
  } else {
    clearInterval(playInterval);
  }
}
// Toggle automatic camera orbiting and reflect the state on the button.
function toggleOrbit() {
  orbitActive = !orbitActive;
  const button = document.getElementById('orbit-btn');
  if (orbitActive) {
    button.textContent = '⏹ Stop';
  } else {
    button.textContent = '🔄 Orbit';
  }
  button.classList.toggle('active', orbitActive);
}
// Reset both the camera orbit parameters and the timeline to their defaults.
function resetView() {
  // Default camera: no azimuth, 45° elevation, radius 22.
  spherical = { theta: 0, phi: Math.PI / 4, r: 22 };

  // Rewind the timeline to before the first step.
  currentStep = 0;
  const slider = document.getElementById('timeline-slider');
  slider.value = 0;
  applyStep(0);
}
// Render the "Step X / Y" caption under the timeline slider.
function updateStepLabel(step, max) {
  const label = document.getElementById('step-label');
  label.textContent = `Step ${step} / ${max}`;
}
// ── Animation loop ────────────────────────────────────────────────────────────
// requestAnimationFrame loop: updates the camera, pulses/rotates the agent
// marker, gently bobs the file nodes, and renders the frame.
let frame = 0; // monotonically increasing frame counter driving the oscillations
function animate() {
  requestAnimationFrame(animate);
  frame++;

  updateCamera();

  // Pulse agent sphere (±15% scale) and spin it slowly.
  if (agentSphere) {
    const pulse = 1 + Math.sin(frame * 0.08) * 0.15;
    agentSphere.scale.set(pulse, pulse, pulse);
    agentSphere.rotation.y += 0.03;
  }

  // Subtle per-node vertical oscillation, phase-shifted by index so nodes
  // don't bob in lockstep. Oscillates around the stored layout position, so
  // it does not drift.
  Object.values(nodeObjects).forEach((obj, i) => {
    obj.mesh.position.y = obj.position.y + Math.sin(frame * 0.02 + i) * 0.05;
  });

  renderer.render(scene, camera);
}
// ── Window resize ─────────────────────────────────────────────────────────────
// Keep the camera aspect ratio and renderer size in sync with the viewport.
window.addEventListener('resize', () => {
  camera.aspect = window.innerWidth / window.innerHeight;
  camera.updateProjectionMatrix(); // required after changing aspect
  renderer.setSize(window.innerWidth, window.innerHeight);
});
// ── Public API for Gradio integration ────────────────────────────────────────
// Entry point invoked by the host page: accepts either a JSON string or an
// already-parsed object and rebuilds the scene. Errors are logged, not thrown.
window.loadTrajectoryData = function(jsonData) {
  try {
    let data = jsonData;
    if (typeof data === 'string') {
      data = JSON.parse(data);
    }
    buildScene(data);
  } catch (e) {
    console.error('Failed to load trajectory data:', e);
  }
};
// ── Init ─────────────────────────────────────────────────────────────────────
// On DOM ready: load the embedded trajectory payload, build the scene, hide
// the loading overlay, and start the render loop.
document.addEventListener('DOMContentLoaded', () => {
  const data = loadVizData(); // defined earlier in this file (outside this view)
  buildScene(data);
  document.getElementById('loader').style.display = 'none';
  animate();
});
</script>
|
| 866 |
+
</body>
|
| 867 |
+
</html>
|