Spaces:
Sleeping
Sleeping
Commit ยท
0b0338d
1
Parent(s): dfbd16e
v4 Research Modules & Pre-submission tweaks
Browse files- app.py +480 -391
- e2e_test_v3.py +389 -0
- inference.py +8 -4
- server/analytics_engine.py +551 -0
- server/app.py +148 -3
- server/benchmark_runner.py +413 -0
- server/causal_probe.py +409 -0
- server/confidence_calibrator.py +363 -0
- server/counterfactual_engine.py +383 -0
- server/memory_bank.py +362 -0
- static/viz3d.html +419 -680
app.py
CHANGED
|
@@ -1,16 +1,21 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
app.py โ Gradio UI
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
๐ฎ Interactive
|
| 7 |
-
๐ค Run Agent
|
| 8 |
-
๐ Evaluation
|
| 9 |
-
๐ง Intelligence
|
| 10 |
-
๐ Self-Improve
|
| 11 |
-
โ๏ธ Compare Agents
|
| 12 |
-
๐ 3D Visualizer
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
import os
|
| 16 |
import json
|
|
@@ -22,6 +27,12 @@ from server.strategy_detector import StrategyDetector
|
|
| 22 |
from server.advanced_metrics import AdvancedMetricsEngine
|
| 23 |
from server.self_improvement import SelfImprovementEngine
|
| 24 |
from server.multi_agent import MultiAgentComparison
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# โโ Global instances โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 27 |
env = CodebaseNavEnvironment()
|
|
@@ -30,36 +41,56 @@ strategy_det = StrategyDetector()
|
|
| 30 |
adv_metrics_engine = AdvancedMetricsEngine()
|
| 31 |
improvement_engine = SelfImprovementEngine()
|
| 32 |
multi_agent_engine = MultiAgentComparison()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
# โโ Tab 1: Interactive โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 36 |
|
| 37 |
-
def reset_environment(task
|
| 38 |
try:
|
| 39 |
result = env.reset(task=task)
|
| 40 |
obs = result.observation
|
| 41 |
tree = "\n".join(f" ๐ {f}" for f in obs.repo_tree)
|
| 42 |
-
failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None
|
| 43 |
fi = result.info.get("fault_injection", {})
|
| 44 |
faults = ""
|
| 45 |
if fi.get("faults_injected"):
|
| 46 |
-
faults = f"\n\nโ ๏ธ Fault Injection ({fi.get('difficulty_multiplier',
|
| 47 |
faults += "\n".join(f" โข {f}" for f in fi["faults_injected"][:5])
|
| 48 |
-
|
| 49 |
status = (
|
| 50 |
-
f"โ
Episode
|
| 51 |
-
f"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n"
|
| 52 |
-
f"Steps: {obs.steps_remaining}
|
| 53 |
f"๐ Files:\n{tree}\n\n"
|
| 54 |
f"๐ด Failing Tests: {failing}\n\n"
|
| 55 |
-
f"๐
|
| 56 |
)
|
| 57 |
return status, "", "0", "0.000"
|
| 58 |
except Exception as e:
|
| 59 |
return f"โ Error: {e}", "", "0", "0.000"
|
| 60 |
|
| 61 |
|
| 62 |
-
def take_step(action_type
|
| 63 |
if env.done:
|
| 64 |
return "โ Episode done. Reset first.", "", "", ""
|
| 65 |
try:
|
|
@@ -71,83 +102,88 @@ def take_step(action_type: str, path: str, query: str, content: str):
|
|
| 71 |
)
|
| 72 |
result = env.step(action)
|
| 73 |
obs = result.observation
|
| 74 |
-
result_text = obs.last_action_result or "
|
| 75 |
-
|
| 76 |
flags = result.info.get("security_flags", [])
|
| 77 |
-
sec = f"\n๐
|
| 78 |
-
|
| 79 |
status = (
|
| 80 |
-
f"Step {result.info['steps_taken']} | "
|
| 81 |
-
f"
|
| 82 |
-
f"Steps left: {obs.steps_remaining}{error}{sec}"
|
| 83 |
)
|
| 84 |
if result.done:
|
| 85 |
status += f"\n\n๐ DONE โ Score: {result.info['final_score']:.3f}"
|
| 86 |
-
|
| 87 |
-
return (
|
| 88 |
-
status,
|
| 89 |
-
result_text[:3000],
|
| 90 |
-
str(result.info["steps_taken"]),
|
| 91 |
-
f"{result.info.get('cumulative_reward', 0):.3f}",
|
| 92 |
-
)
|
| 93 |
except Exception as e:
|
| 94 |
-
return f"โ
|
| 95 |
|
| 96 |
|
| 97 |
# โโ Tab 2: Run Agent โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 98 |
|
| 99 |
-
def run_builtin_agent(task
|
| 100 |
try:
|
| 101 |
result = env.reset(task=task)
|
| 102 |
obs = result.observation
|
| 103 |
-
log = [
|
| 104 |
-
f"๐ {task} (variant: {result.info.get('variant_id')})",
|
| 105 |
-
f" Files: {obs.repo_tree}",
|
| 106 |
-
f" Failing: {obs.failing_tests}",
|
| 107 |
-
]
|
| 108 |
tree = obs.repo_tree
|
|
|
|
| 109 |
test_files = sorted([f for f in tree if f.startswith("tests/")])
|
| 110 |
src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
|
| 111 |
spec_files = sorted([f for f in tree if f.endswith(".md")])
|
| 112 |
steps = 0
|
| 113 |
|
| 114 |
if task == "task3" and spec_files:
|
| 115 |
-
for sf in spec_files:
|
| 116 |
if env.done: break
|
| 117 |
r = env.step(RepoAction(action_type="read_file", path=sf))
|
| 118 |
-
steps += 1
|
| 119 |
-
log.append(f" Step {steps}: read_file {sf} โ {r.reward:+.3f}")
|
| 120 |
|
| 121 |
for tf in test_files:
|
| 122 |
if env.done: break
|
| 123 |
r = env.step(RepoAction(action_type="read_file", path=tf))
|
| 124 |
-
steps += 1
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
for sf in src_files:
|
| 128 |
-
if env.done or steps >=
|
| 129 |
r = env.step(RepoAction(action_type="read_file", path=sf))
|
| 130 |
-
steps += 1
|
| 131 |
-
log.append(f" Step {steps}: read_file {sf} โ {r.reward:+.3f}")
|
| 132 |
|
| 133 |
if not env.done and test_files:
|
| 134 |
r = env.step(RepoAction(action_type="run_tests", path=test_files[0]))
|
| 135 |
-
steps += 1
|
| 136 |
-
log.append(f" Step {steps}: run_tests โ {r.reward:+.3f}")
|
| 137 |
|
| 138 |
if not env.done:
|
| 139 |
r = env.step(RepoAction(action_type="submit"))
|
| 140 |
-
steps += 1
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
-
log += [
|
| 144 |
-
f"\n๐ Score: {env.final_score:.3f}",
|
| 145 |
-
f" Steps: {steps}",
|
| 146 |
-
f" Reward: {env.cumulative_reward:.3f}",
|
| 147 |
-
]
|
| 148 |
return "\n".join(log)
|
| 149 |
except Exception as e:
|
| 150 |
-
return f"โ
|
| 151 |
|
| 152 |
|
| 153 |
# โโ Tab 3: Evaluation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
@@ -156,55 +192,42 @@ def get_evaluation():
|
|
| 156 |
try:
|
| 157 |
ev = env.get_evaluation()
|
| 158 |
if "error" in ev:
|
| 159 |
-
return
|
| 160 |
-
lines = [
|
| 161 |
-
f"๐ฏ Composite Score: {ev['composite_score']:.3f}",
|
| 162 |
-
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ",
|
| 163 |
-
]
|
| 164 |
for name, dim in ev.get("dimensions", {}).items():
|
| 165 |
-
bar = "โ" * int(dim["score"]
|
| 166 |
lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
|
| 167 |
-
for e in dim.get("evidence",
|
| 168 |
lines.append(f" โ {e}")
|
| 169 |
if ev.get("strengths"):
|
| 170 |
lines += ["\n๐ช Strengths:"] + [f" โ
{s}" for s in ev["strengths"]]
|
| 171 |
if ev.get("failure_analysis"):
|
| 172 |
lines += ["\nโ ๏ธ Failures:"] + [f" โ {f}" for f in ev["failure_analysis"]]
|
| 173 |
if ev.get("recommendations"):
|
| 174 |
-
lines += ["\n๐ก
|
| 175 |
return "\n".join(lines)
|
| 176 |
except Exception as e:
|
| 177 |
return f"Error: {e}"
|
| 178 |
|
| 179 |
-
|
| 180 |
def get_metrics():
|
| 181 |
try:
|
| 182 |
return json.dumps(env.get_metrics(), indent=2, default=str)
|
| 183 |
except Exception as e:
|
| 184 |
return f"Error: {e}"
|
| 185 |
|
| 186 |
-
|
| 187 |
def get_trajectory():
|
| 188 |
try:
|
| 189 |
t = env.get_trajectory()
|
| 190 |
-
if not t:
|
| 191 |
-
return "No trajectory. Run an episode first."
|
| 192 |
lines = [
|
| 193 |
-
f"Episode: {t.get('episode_id')}",
|
| 194 |
-
f"
|
| 195 |
-
f"Score: {t.get('final_score', 0):.3f} | Duration: {t.get('duration_seconds', '?')}s",
|
| 196 |
-
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ",
|
| 197 |
]
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
for step in t.get("steps", []):
|
| 201 |
-
em = emojis.get(step["action_type"], "โข")
|
| 202 |
p = step.get("action_path") or step.get("action_query") or ""
|
| 203 |
err = " โ" if step.get("error") else ""
|
| 204 |
-
lines.append(
|
| 205 |
-
f" {em} {step['step_number']:2d}: {step['action_type']:12s} {p:30s} "
|
| 206 |
-
f"reward={step['reward']:+.3f} ({step['duration_ms']:.0f}ms){err}"
|
| 207 |
-
)
|
| 208 |
return "\n".join(lines)
|
| 209 |
except Exception as e:
|
| 210 |
return f"Error: {e}"
|
|
@@ -214,294 +237,310 @@ def get_trajectory():
|
|
| 214 |
|
| 215 |
def get_failure_classification():
|
| 216 |
try:
|
| 217 |
-
traj =
|
| 218 |
-
if not traj:
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
episode_id=traj.get("episode_id", ""),
|
| 223 |
-
task=env.current_task or "unknown",
|
| 224 |
-
trajectory_steps=traj.get("steps", []),
|
| 225 |
-
variant_meta=meta,
|
| 226 |
-
files_read=list(env.files_read),
|
| 227 |
-
files_written=list(env.files_written),
|
| 228 |
-
final_score=env.final_score,
|
| 229 |
-
security_violations=env.security_violations,
|
| 230 |
-
)
|
| 231 |
-
d = report.to_dict()
|
| 232 |
lines = [
|
| 233 |
f"{'โ
SUCCESS' if d['success'] else 'โ FAILURE'}",
|
| 234 |
-
f"Primary
|
| 235 |
-
f"Failures Detected: {d['failure_count']}",
|
| 236 |
-
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ",
|
| 237 |
]
|
| 238 |
-
for f in d.get("failures",
|
| 239 |
-
lines += [
|
| 240 |
-
|
| 241 |
-
f" Evidence: {f['evidence']}",
|
| 242 |
-
f" Root Cause: {f['root_cause']}",
|
| 243 |
-
f" Fix: {f['remediation']}",
|
| 244 |
-
]
|
| 245 |
if d.get("failure_summary"):
|
| 246 |
lines += ["\n๐ Summary:", f" {d['failure_summary']}"]
|
| 247 |
if d.get("retry_hint"):
|
| 248 |
-
lines += ["\n๐ Retry
|
| 249 |
return "\n".join(lines)
|
| 250 |
-
except Exception as e:
|
| 251 |
-
return f"Error: {e}"
|
| 252 |
|
| 253 |
|
| 254 |
def get_strategy_detection():
|
| 255 |
try:
|
| 256 |
-
traj =
|
| 257 |
-
if not traj:
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
trajectory_steps=traj.get("steps", []),
|
| 262 |
-
task=env.current_task or "unknown",
|
| 263 |
-
variant_meta=meta,
|
| 264 |
-
files_read=list(env.files_read),
|
| 265 |
-
final_score=env.final_score,
|
| 266 |
-
)
|
| 267 |
-
d = report.to_dict()
|
| 268 |
-
score_bar = "โ" * int(d["score"] * 20) + "โ" * (20 - int(d["score"] * 20))
|
| 269 |
lines = [
|
| 270 |
-
f"๐งญ Strategy: {d['strategy']}",
|
| 271 |
-
f"
|
| 272 |
-
f"
|
| 273 |
-
f"\n๐ {d['strategy_description']}",
|
| 274 |
-
f"\n๐ Exploration Ratio: {d['exploration_ratio']:.2f} "
|
| 275 |
-
f"({'explore-heavy' if d['exploration_ratio'] > 0.6 else 'exploit-heavy' if d['exploration_ratio'] < 0.4 else 'balanced'})",
|
| 276 |
-
f" Strategy Pivots: {d['pivot_count']}",
|
| 277 |
]
|
| 278 |
-
if d.get("sub_patterns"):
|
| 279 |
-
|
| 280 |
-
if d.get("evidence"):
|
| 281 |
-
lines += ["\n๐ Evidence:"] + [f" โ {e}" for e in d["evidence"]]
|
| 282 |
return "\n".join(lines)
|
| 283 |
-
except Exception as e:
|
| 284 |
-
return f"Error: {e}"
|
| 285 |
|
| 286 |
|
| 287 |
def get_advanced_metrics():
|
| 288 |
try:
|
| 289 |
-
traj =
|
| 290 |
-
if not traj:
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
variant_meta=meta,
|
| 296 |
-
final_score=env.final_score,
|
| 297 |
-
files_read=list(env.files_read),
|
| 298 |
-
files_written=list(env.files_written),
|
| 299 |
-
)
|
| 300 |
-
d = report.to_dict()
|
| 301 |
-
|
| 302 |
-
def bar(v):
|
| 303 |
-
return "โ" * int(v * 20) + "โ" * (20 - int(v * 20))
|
| 304 |
-
|
| 305 |
-
lines = [
|
| 306 |
-
"โก ADVANCED METRICS",
|
| 307 |
-
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ",
|
| 308 |
f" Reasoning Efficiency [{bar(d['reasoning_efficiency'])}] {d['reasoning_efficiency']:.3f}",
|
| 309 |
f" Reliability Index [{bar(d['reliability_index'])}] {d['reliability_index']:.3f}",
|
| 310 |
f" Exploration Ratio [{bar(d['exploration_ratio'])}] {d['exploration_ratio']:.3f}",
|
| 311 |
f" Decision Entropy [{bar(d['decision_entropy'])}] {d['decision_entropy']:.3f}",
|
| 312 |
f" Wasteful Ratio [{bar(d['wasteful_ratio'])}] {d['wasteful_ratio']:.3f}",
|
| 313 |
-
f" Pivot Rate
|
| 314 |
-
f" Consistency [{bar(d['consistency_score'])}] {d['consistency_score']:.3f} ({d['runs_analyzed']} runs)",
|
| 315 |
-
"\n๐ Action Distribution:",
|
| 316 |
]
|
| 317 |
-
|
| 318 |
-
lines
|
| 319 |
-
if d.get("useful_actions"):
|
| 320 |
-
lines += ["\nโ
Useful Actions:"] + [f" โข {a}" for a in d["useful_actions"]]
|
| 321 |
-
if d.get("wasteful_actions"):
|
| 322 |
-
lines += ["\nโ ๏ธ Wasteful Actions:"] + [f" โข {a}" for a in d["wasteful_actions"]]
|
| 323 |
-
lines += ["\n๐ Reliability Breakdown:"]
|
| 324 |
-
for k, v in d.get("reliability_breakdown", {}).items():
|
| 325 |
-
lines.append(f" {k:15s}: {v:.3f}")
|
| 326 |
return "\n".join(lines)
|
| 327 |
-
except Exception as e:
|
| 328 |
-
return f"Error: {e}"
|
| 329 |
|
| 330 |
|
| 331 |
# โโ Tab 5: Self-Improve โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 332 |
|
| 333 |
def get_improvement_plan():
|
| 334 |
try:
|
| 335 |
-
traj =
|
| 336 |
-
if not traj:
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
steps = traj.get("steps", [])
|
| 340 |
-
|
| 341 |
-
fail_report = failure_clf.classify(
|
| 342 |
-
episode_id=traj.get("episode_id", ""),
|
| 343 |
-
task=env.current_task or "unknown",
|
| 344 |
-
trajectory_steps=steps,
|
| 345 |
-
variant_meta=meta,
|
| 346 |
-
files_read=list(env.files_read),
|
| 347 |
-
files_written=list(env.files_written),
|
| 348 |
-
final_score=env.final_score,
|
| 349 |
-
security_violations=env.security_violations,
|
| 350 |
-
)
|
| 351 |
plan = improvement_engine.generate_improvement_plan(
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
failure_evidence=[f.evidence for f in fail_report.failures],
|
| 356 |
-
original_score=env.final_score,
|
| 357 |
-
trajectory_steps=steps,
|
| 358 |
-
files_read=list(env.files_read),
|
| 359 |
-
files_written=list(env.files_written),
|
| 360 |
)
|
| 361 |
d = plan.to_dict()
|
| 362 |
lines = [
|
| 363 |
-
|
| 364 |
-
f"
|
| 365 |
-
f"
|
| 366 |
-
f"
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
]
|
| 371 |
-
for step in d.get("step_by_step_plan", []):
|
| 372 |
-
lines.append(f" {step}")
|
| 373 |
-
if d.get("specific_errors"):
|
| 374 |
-
lines += ["\n๐ Specific Errors:"] + [f" โข {e}" for e in d["specific_errors"][:5]]
|
| 375 |
-
lines += [
|
| 376 |
-
"\n๐ System Prompt Injection (for next LLM run):",
|
| 377 |
-
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ",
|
| 378 |
-
d.get("system_prompt_addon", "No injection needed."),
|
| 379 |
-
]
|
| 380 |
return "\n".join(lines)
|
| 381 |
-
except Exception as e:
|
| 382 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
|
| 385 |
# โโ Tab 6: Compare Agents โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 386 |
|
| 387 |
-
def run_comparison(task
|
| 388 |
try:
|
| 389 |
-
agents = selected_agents
|
| 390 |
report = multi_agent_engine.compare(env, task=task, agents=agents)
|
| 391 |
d = report.to_dict()
|
| 392 |
-
|
| 393 |
lines = [
|
| 394 |
f"โ๏ธ MULTI-AGENT COMPARISON โ {task} (variant: {d.get('variant_id')})",
|
| 395 |
-
f"๐ Winner: {d.get('winner')} (score: {d.get('winner_score',
|
| 396 |
-
"
|
| 397 |
-
|
| 398 |
-
"โ" * 100,
|
| 399 |
]
|
| 400 |
-
for row in d.get("summary_table",
|
| 401 |
-
lines.append(
|
| 402 |
-
|
| 403 |
-
f"{row['steps']:<8} {row['strategy']:<22} {row['failure']:<22} {row['reliability']:<12.3f}"
|
| 404 |
-
)
|
| 405 |
-
lines.append("โ" * 100)
|
| 406 |
-
|
| 407 |
if d.get("insights"):
|
| 408 |
lines += ["\n๐ก Insights:"] + [f" โ {i}" for i in d["insights"]]
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
seq = " โ ".join(run.get("action_sequence", []))
|
| 413 |
lines.append(f" {run['agent_name']:16s}: {seq}")
|
| 414 |
-
|
| 415 |
return "\n".join(lines)
|
| 416 |
-
except Exception as e:
|
| 417 |
-
return f"โ Error: {e}"
|
| 418 |
|
| 419 |
|
| 420 |
# โโ Tab 7: 3D Visualizer โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 421 |
|
| 422 |
-
def
|
| 423 |
-
"""
|
| 424 |
-
#
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
-
with open(static_path, "r") as f:
|
| 430 |
-
html = f.read()
|
| 431 |
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
"
|
| 482 |
-
"
|
| 483 |
-
"
|
| 484 |
-
"
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
|
| 497 |
|
| 498 |
-
#
|
|
|
|
|
|
|
| 499 |
|
| 500 |
-
with gr.Blocks(title="Codebase Navigation & Repair โ OpenEnv
|
| 501 |
gr.Markdown(
|
| 502 |
-
"# ๐ Codebase Navigation & Repair โ OpenEnv
|
| 503 |
-
"**The
|
| 504 |
-
"Navigate
|
| 505 |
)
|
| 506 |
|
| 507 |
with gr.Tabs():
|
|
@@ -510,19 +549,12 @@ with gr.Blocks(title="Codebase Navigation & Repair โ OpenEnv v3") as demo:
|
|
| 510 |
with gr.TabItem("๐ฎ Interactive"):
|
| 511 |
with gr.Row():
|
| 512 |
with gr.Column(scale=1):
|
| 513 |
-
|
| 514 |
-
["task1", "task2", "task3"], value="task1",
|
| 515 |
-
label="Task",
|
| 516 |
-
info="task1=bugs, task2=cross-module, task3=feature impl"
|
| 517 |
-
)
|
| 518 |
reset_btn = gr.Button("๐ Reset Environment", variant="primary")
|
| 519 |
gr.Markdown("### Action")
|
| 520 |
-
act_type = gr.Dropdown(
|
| 521 |
-
["read_file", "write_file", "run_tests", "search_code", "submit"],
|
| 522 |
-
value="read_file", label="Action Type",
|
| 523 |
-
)
|
| 524 |
act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
|
| 525 |
-
act_query = gr.Textbox(label="Query
|
| 526 |
act_content = gr.Textbox(label="Content (write_file)", lines=4)
|
| 527 |
step_btn = gr.Button("โถ๏ธ Execute Step", variant="secondary")
|
| 528 |
with gr.Column(scale=2):
|
|
@@ -531,16 +563,16 @@ with gr.Blocks(title="Codebase Navigation & Repair โ OpenEnv v3") as demo:
|
|
| 531 |
with gr.Row():
|
| 532 |
steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
|
| 533 |
reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
|
| 534 |
-
reset_btn.click(reset_environment, [
|
| 535 |
step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
|
| 536 |
|
| 537 |
# โโ Tab 2: Run Agent โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 538 |
with gr.TabItem("๐ค Run Agent"):
|
| 539 |
-
gr.Markdown("### Built-in Demonstration Agent\nRuns deterministic
|
| 540 |
-
agent_task = gr.Dropdown(["task1",
|
| 541 |
run_btn = gr.Button("๐ Run Agent", variant="primary")
|
| 542 |
-
|
| 543 |
-
run_btn.click(run_builtin_agent, [agent_task], [
|
| 544 |
|
| 545 |
# โโ Tab 3: Evaluation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 546 |
with gr.TabItem("๐ Evaluation"):
|
|
@@ -553,107 +585,164 @@ with gr.Blocks(title="Codebase Navigation & Repair โ OpenEnv v3") as demo:
|
|
| 553 |
metrics_btn.click(get_metrics, outputs=[eval_out])
|
| 554 |
traj_btn.click(get_trajectory, outputs=[eval_out])
|
| 555 |
|
| 556 |
-
# โโ Tab 4:
|
| 557 |
with gr.TabItem("๐ง Intelligence"):
|
| 558 |
-
gr.Markdown(
|
| 559 |
-
"### Deep Agent Intelligence Analysis\n"
|
| 560 |
-
"Failure classification, strategy detection, and advanced behavioral metrics."
|
| 561 |
-
)
|
| 562 |
with gr.Row():
|
| 563 |
-
|
| 564 |
-
|
| 565 |
adv_btn = gr.Button("โก Advanced Metrics", variant="secondary")
|
| 566 |
intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
|
| 567 |
-
|
| 568 |
-
|
| 569 |
adv_btn.click(get_advanced_metrics, outputs=[intel_out])
|
| 570 |
|
| 571 |
-
# โโ Tab 5:
|
| 572 |
with gr.TabItem("๐ Self-Improve"):
|
| 573 |
-
gr.Markdown(
|
| 574 |
-
|
| 575 |
-
"
|
| 576 |
-
"
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
improve_out = gr.Textbox(label="Improvement Plan", lines=32, interactive=False)
|
| 580 |
improve_btn.click(get_improvement_plan, outputs=[improve_out])
|
|
|
|
| 581 |
|
| 582 |
-
# โโ Tab 6:
|
| 583 |
with gr.TabItem("โ๏ธ Compare Agents"):
|
| 584 |
-
gr.Markdown(
|
| 585 |
-
"### Multi-Agent Strategy Comparison\n"
|
| 586 |
-
"Runs 4 built-in agent strategies on the same task to compare "
|
| 587 |
-
"efficiency, strategy, and reliability side-by-side."
|
| 588 |
-
)
|
| 589 |
with gr.Row():
|
| 590 |
-
comp_task = gr.Dropdown(["task1",
|
| 591 |
comp_agents = gr.CheckboxGroup(
|
| 592 |
-
["test-first",
|
| 593 |
-
value=["test-first",
|
| 594 |
-
label="Agents
|
| 595 |
)
|
| 596 |
comp_btn = gr.Button("โ๏ธ Run Comparison", variant="primary")
|
| 597 |
-
comp_out = gr.Textbox(label="
|
| 598 |
comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
|
| 599 |
|
| 600 |
-
# โโ Tab 7:
|
| 601 |
with gr.TabItem("๐ 3D Visualizer"):
|
| 602 |
gr.Markdown(
|
| 603 |
"### Agent Trajectory 3D Visualization\n"
|
| 604 |
-
"Files = 3D
|
| 605 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
)
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
|
| 611 |
-
# โโ Tab
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 612 |
with gr.TabItem("๐ API"):
|
| 613 |
gr.Markdown("""
|
| 614 |
-
### REST API โ
|
| 615 |
|
| 616 |
-
#### Core
|
| 617 |
-
|
|
| 618 |
-
|----------|--------|-------------|
|
| 619 |
-
| `/reset?task=task1` | POST | Start new episode |
|
| 620 |
-
| `/step` | POST | Take action |
|
| 621 |
-
| `/state` | GET | Current state |
|
| 622 |
-
| `/health` | GET | Health check |
|
| 623 |
|
| 624 |
#### Evaluation
|
| 625 |
-
|
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
| `/
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
#### Intelligence (NEW in v3)
|
| 633 |
-
| Endpoint | Method | Description |
|
| 634 |
-
|----------|--------|-------------|
|
| 635 |
-
| `/classify` | GET | Typed failure classification |
|
| 636 |
-
| `/strategy` | GET | Behavioral strategy detection |
|
| 637 |
-
| `/advanced-metrics` | GET | Entropy, reliability, consistency |
|
| 638 |
-
| `/improvement-plan` | GET | Self-improvement feedback |
|
| 639 |
-
| `/compare-agents` | POST | Multi-agent comparison |
|
| 640 |
-
| `/viz-data` | GET | 3D visualization data |
|
| 641 |
|
| 642 |
```bash
|
| 643 |
BASE="http://localhost:7860"
|
|
|
|
| 644 |
curl -X POST "$BASE/reset?task=task1"
|
| 645 |
-
curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"
|
| 646 |
curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
|
|
|
|
|
|
|
| 647 |
curl "$BASE/classify"
|
| 648 |
-
curl "$BASE/
|
| 649 |
-
curl "$BASE/
|
| 650 |
-
curl "$BASE/
|
| 651 |
-
curl
|
|
|
|
|
|
|
|
|
|
| 652 |
```
|
| 653 |
""")
|
| 654 |
|
| 655 |
|
| 656 |
-
# โโ Mount FastAPI
|
| 657 |
from server.app import app as fastapi_app
|
| 658 |
gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
|
| 659 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
app.py โ Gradio UI v4.0 โ Full Research Platform
|
| 4 |
+
|
| 5 |
+
13 tabs:
|
| 6 |
+
๐ฎ Interactive โ manual control
|
| 7 |
+
๐ค Run Agent โ deterministic demo agent
|
| 8 |
+
๐ Evaluation โ 6-dimension process evaluation
|
| 9 |
+
๐ง Intelligence โ failure, strategy, advanced metrics
|
| 10 |
+
๐ Self-Improve โ improvement plan with prompt injection
|
| 11 |
+
โ๏ธ Compare Agents โ multi-agent strategy comparison
|
| 12 |
+
๐ 3D Visualizer โ Three.js trajectory viz (FIXED: iframe)
|
| 13 |
+
๐งช Causal Probe โ causal reasoning vs guessing
|
| 14 |
+
๐ญ Counterfactual โ brittleness / robustness testing
|
| 15 |
+
๐ Confidence โ calibration: overconfident vs underconfident
|
| 16 |
+
๐ Benchmark โ automated leaderboard
|
| 17 |
+
๐ Analytics โ unified research-grade report
|
| 18 |
+
๐ API โ REST reference
|
| 19 |
"""
|
| 20 |
import os
|
| 21 |
import json
|
|
|
|
| 27 |
from server.advanced_metrics import AdvancedMetricsEngine
|
| 28 |
from server.self_improvement import SelfImprovementEngine
|
| 29 |
from server.multi_agent import MultiAgentComparison
|
| 30 |
+
from server.causal_probe import CausalProbe
|
| 31 |
+
from server.counterfactual_engine import CounterfactualEngine
|
| 32 |
+
from server.confidence_calibrator import ConfidenceCalibrator
|
| 33 |
+
from server.benchmark_runner import BenchmarkRunner
|
| 34 |
+
from server.analytics_engine import AnalyticsEngine
|
| 35 |
+
from server.memory_bank import get_global_memory
|
| 36 |
|
| 37 |
# โโ Global instances โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 38 |
env = CodebaseNavEnvironment()
|
|
|
|
| 41 |
adv_metrics_engine = AdvancedMetricsEngine()
|
| 42 |
improvement_engine = SelfImprovementEngine()
|
| 43 |
multi_agent_engine = MultiAgentComparison()
|
| 44 |
+
causal_probe = CausalProbe()
|
| 45 |
+
counterfactual_engine = CounterfactualEngine()
|
| 46 |
+
confidence_calibrator = ConfidenceCalibrator()
|
| 47 |
+
benchmark_runner = BenchmarkRunner()
|
| 48 |
+
analytics_engine = AnalyticsEngine()
|
| 49 |
+
memory_bank = get_global_memory()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# โโ Helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 53 |
+
|
| 54 |
+
def _get_traj_and_meta():
|
| 55 |
+
traj = env.get_trajectory()
|
| 56 |
+
if not traj:
|
| 57 |
+
return None, None, None, None
|
| 58 |
+
meta = env.variant.meta if env.variant else {}
|
| 59 |
+
steps = traj.get("steps", [])
|
| 60 |
+
return traj, meta, steps, traj.get("episode_id", "")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _no_traj():
|
| 64 |
+
return "โ ๏ธ No trajectory. Run an episode first (Interactive or Run Agent tab)."
|
| 65 |
|
| 66 |
|
| 67 |
# โโ Tab 1: Interactive โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 68 |
|
| 69 |
+
def reset_environment(task):
|
| 70 |
try:
|
| 71 |
result = env.reset(task=task)
|
| 72 |
obs = result.observation
|
| 73 |
tree = "\n".join(f" ๐ {f}" for f in obs.repo_tree)
|
| 74 |
+
failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None"
|
| 75 |
fi = result.info.get("fault_injection", {})
|
| 76 |
faults = ""
|
| 77 |
if fi.get("faults_injected"):
|
| 78 |
+
faults = f"\n\nโ ๏ธ Fault Injection ({fi.get('difficulty_multiplier',1):.1f}ร):\n"
|
| 79 |
faults += "\n".join(f" โข {f}" for f in fi["faults_injected"][:5])
|
|
|
|
| 80 |
status = (
|
| 81 |
+
f"โ
Episode started โ {task} (variant: {result.info.get('variant_id','?')})\n"
|
| 82 |
+
f"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n"
|
| 83 |
+
f"Steps remaining: {obs.steps_remaining}\n\n"
|
| 84 |
f"๐ Files:\n{tree}\n\n"
|
| 85 |
f"๐ด Failing Tests: {failing}\n\n"
|
| 86 |
+
f"๐ {obs.task_description}{faults}"
|
| 87 |
)
|
| 88 |
return status, "", "0", "0.000"
|
| 89 |
except Exception as e:
|
| 90 |
return f"โ Error: {e}", "", "0", "0.000"
|
| 91 |
|
| 92 |
|
| 93 |
+
def take_step(action_type, path, query, content):
|
| 94 |
if env.done:
|
| 95 |
return "โ Episode done. Reset first.", "", "", ""
|
| 96 |
try:
|
|
|
|
| 102 |
)
|
| 103 |
result = env.step(action)
|
| 104 |
obs = result.observation
|
| 105 |
+
result_text = obs.last_action_result or ""
|
| 106 |
+
err = f"\nโ ๏ธ {obs.last_action_error}" if obs.last_action_error else ""
|
| 107 |
flags = result.info.get("security_flags", [])
|
| 108 |
+
sec = f"\n๐ {flags}" if flags else ""
|
|
|
|
| 109 |
status = (
|
| 110 |
+
f"Step {result.info['steps_taken']} | Reward: {result.reward:+.3f} | "
|
| 111 |
+
f"Left: {obs.steps_remaining}{err}{sec}"
|
|
|
|
| 112 |
)
|
| 113 |
if result.done:
|
| 114 |
status += f"\n\n๐ DONE โ Score: {result.info['final_score']:.3f}"
|
| 115 |
+
return status, result_text[:3000], str(result.info["steps_taken"]), f"{result.info.get('cumulative_reward',0):.3f}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
except Exception as e:
|
| 117 |
+
return f"โ {e}", "", "", ""
|
| 118 |
|
| 119 |
|
| 120 |
# โโ Tab 2: Run Agent โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 121 |
|
| 122 |
+
def run_builtin_agent(task):
|
| 123 |
try:
|
| 124 |
result = env.reset(task=task)
|
| 125 |
obs = result.observation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
tree = obs.repo_tree
|
| 127 |
+
log = [f"๐ {task} (variant: {result.info.get('variant_id')})", f" Files: {tree}"]
|
| 128 |
test_files = sorted([f for f in tree if f.startswith("tests/")])
|
| 129 |
src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
|
| 130 |
spec_files = sorted([f for f in tree if f.endswith(".md")])
|
| 131 |
steps = 0
|
| 132 |
|
| 133 |
if task == "task3" and spec_files:
|
| 134 |
+
for sf in spec_files[:2]:
|
| 135 |
if env.done: break
|
| 136 |
r = env.step(RepoAction(action_type="read_file", path=sf))
|
| 137 |
+
steps += 1; log.append(f" Step {steps}: read_file {sf} โ {r.reward:+.3f}")
|
|
|
|
| 138 |
|
| 139 |
for tf in test_files:
|
| 140 |
if env.done: break
|
| 141 |
r = env.step(RepoAction(action_type="read_file", path=tf))
|
| 142 |
+
steps += 1; log.append(f" Step {steps}: read_file {tf} โ {r.reward:+.3f}")
|
| 143 |
+
|
| 144 |
+
if not env.done:
|
| 145 |
+
r = env.step(RepoAction(action_type="search_code", query="def "))
|
| 146 |
+
steps += 1; log.append(f" Step {steps}: search_code โ {r.reward:+.3f}")
|
| 147 |
|
| 148 |
for sf in src_files:
|
| 149 |
+
if env.done or steps >= 14: break
|
| 150 |
r = env.step(RepoAction(action_type="read_file", path=sf))
|
| 151 |
+
steps += 1; log.append(f" Step {steps}: read_file {sf} โ {r.reward:+.3f}")
|
|
|
|
| 152 |
|
| 153 |
if not env.done and test_files:
|
| 154 |
r = env.step(RepoAction(action_type="run_tests", path=test_files[0]))
|
| 155 |
+
steps += 1; log.append(f" Step {steps}: run_tests โ {r.reward:+.3f}")
|
|
|
|
| 156 |
|
| 157 |
if not env.done:
|
| 158 |
r = env.step(RepoAction(action_type="submit"))
|
| 159 |
+
steps += 1; log.append(f" Step {steps}: submit โ {r.reward:+.3f}")
|
| 160 |
+
|
| 161 |
+
log += ["", f"๐ Score: {env.final_score:.3f} | Steps: {steps} | Reward: {env.cumulative_reward:.3f}"]
|
| 162 |
+
|
| 163 |
+
# Store in memory
|
| 164 |
+
traj = env.get_trajectory()
|
| 165 |
+
if traj:
|
| 166 |
+
meta = env.variant.meta if env.variant else {}
|
| 167 |
+
fail_r = failure_clf.classify(
|
| 168 |
+
traj.get("episode_id",""), task, traj.get("steps",[]), meta,
|
| 169 |
+
list(env.files_read), list(env.files_written), env.final_score
|
| 170 |
+
)
|
| 171 |
+
strat_r = strategy_det.detect(traj.get("steps",[]), task, meta, list(env.files_read), env.final_score)
|
| 172 |
+
imp_plan = improvement_engine.generate_improvement_plan(
|
| 173 |
+
traj.get("episode_id",""), task, fail_r.primary_failure,
|
| 174 |
+
[], env.final_score, traj.get("steps",[]),
|
| 175 |
+
list(env.files_read), list(env.files_written)
|
| 176 |
+
)
|
| 177 |
+
memory_bank.store(
|
| 178 |
+
traj.get("episode_id",""), task, fail_r.primary_failure,
|
| 179 |
+
fail_r.failure_summary or "", env.final_score,
|
| 180 |
+
strat_r.strategy, traj.get("steps",[]), imp_plan.to_dict()
|
| 181 |
+
)
|
| 182 |
+
log.append(f"๐พ Stored lesson in memory bank ({memory_bank.get_stats()['total_entries']} total)")
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
return "\n".join(log)
|
| 185 |
except Exception as e:
|
| 186 |
+
return f"โ {e}"
|
| 187 |
|
| 188 |
|
| 189 |
# โโ Tab 3: Evaluation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
| 192 |
try:
|
| 193 |
ev = env.get_evaluation()
|
| 194 |
if "error" in ev:
|
| 195 |
+
return _no_traj()
|
| 196 |
+
lines = [f"๐ฏ Composite Score: {ev['composite_score']:.3f}", "โ"*50]
|
|
|
|
|
|
|
|
|
|
| 197 |
for name, dim in ev.get("dimensions", {}).items():
|
| 198 |
+
bar = "โ" * int(dim["score"]*20) + "โ" * (20-int(dim["score"]*20))
|
| 199 |
lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
|
| 200 |
+
for e in dim.get("evidence",[])[:2]:
|
| 201 |
lines.append(f" โ {e}")
|
| 202 |
if ev.get("strengths"):
|
| 203 |
lines += ["\n๐ช Strengths:"] + [f" โ
{s}" for s in ev["strengths"]]
|
| 204 |
if ev.get("failure_analysis"):
|
| 205 |
lines += ["\nโ ๏ธ Failures:"] + [f" โ {f}" for f in ev["failure_analysis"]]
|
| 206 |
if ev.get("recommendations"):
|
| 207 |
+
lines += ["\n๐ก Recs:"] + [f" โ {r}" for r in ev["recommendations"]]
|
| 208 |
return "\n".join(lines)
|
| 209 |
except Exception as e:
|
| 210 |
return f"Error: {e}"
|
| 211 |
|
|
|
|
| 212 |
def get_metrics():
|
| 213 |
try:
|
| 214 |
return json.dumps(env.get_metrics(), indent=2, default=str)
|
| 215 |
except Exception as e:
|
| 216 |
return f"Error: {e}"
|
| 217 |
|
|
|
|
| 218 |
def get_trajectory():
|
| 219 |
try:
|
| 220 |
t = env.get_trajectory()
|
| 221 |
+
if not t: return _no_traj()
|
|
|
|
| 222 |
lines = [
|
| 223 |
+
f"Episode: {t.get('episode_id')}", f"Task: {t.get('task')} | Variant: {t.get('variant_id')}",
|
| 224 |
+
f"Score: {t.get('final_score',0):.3f} | Duration: {t.get('duration_seconds','?')}s", "โ"*60,
|
|
|
|
|
|
|
| 225 |
]
|
| 226 |
+
em = {"read_file":"๐","write_file":"โ๏ธ","run_tests":"๐งช","search_code":"๐","submit":"๐"}
|
| 227 |
+
for step in t.get("steps",[]):
|
|
|
|
|
|
|
| 228 |
p = step.get("action_path") or step.get("action_query") or ""
|
| 229 |
err = " โ" if step.get("error") else ""
|
| 230 |
+
lines.append(f" {em.get(step['action_type'],'โข')} {step['step_number']:2d}: {step['action_type']:12s} {p:25s} reward={step['reward']:+.3f}{err}")
|
|
|
|
|
|
|
|
|
|
| 231 |
return "\n".join(lines)
|
| 232 |
except Exception as e:
|
| 233 |
return f"Error: {e}"
|
|
|
|
| 237 |
|
| 238 |
def get_failure_classification():
|
| 239 |
try:
|
| 240 |
+
traj, meta, steps, ep_id = _get_traj_and_meta()
|
| 241 |
+
if not traj: return _no_traj()
|
| 242 |
+
r = failure_clf.classify(ep_id, env.current_task or "?", steps, meta,
|
| 243 |
+
list(env.files_read), list(env.files_written), env.final_score)
|
| 244 |
+
d = r.to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
lines = [
|
| 246 |
f"{'โ
SUCCESS' if d['success'] else 'โ FAILURE'}",
|
| 247 |
+
f"Primary: {d['primary_failure']} | Count: {d['failure_count']}", "โ"*50,
|
|
|
|
|
|
|
| 248 |
]
|
| 249 |
+
for f in d.get("failures",[]):
|
| 250 |
+
lines += [f"\n[{f['severity'].upper()}] {f['type']} @ step {f['step']}",
|
| 251 |
+
f" Evidence: {f['evidence']}", f" Fix: {f['remediation']}"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
if d.get("failure_summary"):
|
| 253 |
lines += ["\n๐ Summary:", f" {d['failure_summary']}"]
|
| 254 |
if d.get("retry_hint"):
|
| 255 |
+
lines += [f"\n๐ Retry hint: {d['retry_hint']}"]
|
| 256 |
return "\n".join(lines)
|
| 257 |
+
except Exception as e: return f"Error: {e}"
|
|
|
|
| 258 |
|
| 259 |
|
| 260 |
def get_strategy_detection():
|
| 261 |
try:
|
| 262 |
+
traj, meta, steps, _ = _get_traj_and_meta()
|
| 263 |
+
if not traj: return _no_traj()
|
| 264 |
+
r = strategy_det.detect(steps, env.current_task or "?", meta, list(env.files_read), env.final_score)
|
| 265 |
+
d = r.to_dict()
|
| 266 |
+
bar = "โ"*int(d["score"]*20)+"โ"*(20-int(d["score"]*20))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
lines = [
|
| 268 |
+
f"๐งญ Strategy: {d['strategy']}", f" [{bar}] {d['score']:.3f} (confidence: {d['confidence']:.0%})",
|
| 269 |
+
f"\n{d['strategy_description']}",
|
| 270 |
+
f"\nExploration: {d['exploration_ratio']:.2f} | Pivots: {d['pivot_count']}",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
]
|
| 272 |
+
if d.get("sub_patterns"): lines += ["\nSub-patterns:"] + [f" โข {p}" for p in d["sub_patterns"]]
|
| 273 |
+
if d.get("evidence"): lines += ["\nEvidence:"] + [f" โ {e}" for e in d["evidence"]]
|
|
|
|
|
|
|
| 274 |
return "\n".join(lines)
|
| 275 |
+
except Exception as e: return f"Error: {e}"
|
|
|
|
| 276 |
|
| 277 |
|
| 278 |
def get_advanced_metrics():
|
| 279 |
try:
|
| 280 |
+
traj, meta, steps, _ = _get_traj_and_meta()
|
| 281 |
+
if not traj: return _no_traj()
|
| 282 |
+
r = adv_metrics_engine.compute(steps, meta, env.final_score, list(env.files_read), list(env.files_written))
|
| 283 |
+
d = r.to_dict()
|
| 284 |
+
def bar(v): return "โ"*int(v*20)+"โ"*(20-int(v*20))
|
| 285 |
+
lines = ["โก ADVANCED METRICS", "โ"*50,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
f" Reasoning Efficiency [{bar(d['reasoning_efficiency'])}] {d['reasoning_efficiency']:.3f}",
|
| 287 |
f" Reliability Index [{bar(d['reliability_index'])}] {d['reliability_index']:.3f}",
|
| 288 |
f" Exploration Ratio [{bar(d['exploration_ratio'])}] {d['exploration_ratio']:.3f}",
|
| 289 |
f" Decision Entropy [{bar(d['decision_entropy'])}] {d['decision_entropy']:.3f}",
|
| 290 |
f" Wasteful Ratio [{bar(d['wasteful_ratio'])}] {d['wasteful_ratio']:.3f}",
|
| 291 |
+
f" Pivot Rate {d['pivot_rate']:.2f}/10 steps | Consistency {d['consistency_score']:.3f} ({d['runs_analyzed']} runs)",
|
|
|
|
|
|
|
| 292 |
]
|
| 293 |
+
if d.get("action_distribution"):
|
| 294 |
+
lines += ["\nAction Distribution:"] + [f" {a:14s}: {c}" for a,c in d["action_distribution"].items()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
return "\n".join(lines)
|
| 296 |
+
except Exception as e: return f"Error: {e}"
|
|
|
|
| 297 |
|
| 298 |
|
| 299 |
# โโ Tab 5: Self-Improve โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 300 |
|
| 301 |
def get_improvement_plan():
|
| 302 |
try:
|
| 303 |
+
traj, meta, steps, ep_id = _get_traj_and_meta()
|
| 304 |
+
if not traj: return _no_traj()
|
| 305 |
+
fail_r = failure_clf.classify(ep_id, env.current_task or "?", steps, meta,
|
| 306 |
+
list(env.files_read), list(env.files_written), env.final_score)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
plan = improvement_engine.generate_improvement_plan(
|
| 308 |
+
ep_id, env.current_task or "?", fail_r.primary_failure,
|
| 309 |
+
[f.evidence for f in fail_r.failures], env.final_score,
|
| 310 |
+
steps, list(env.files_read), list(env.files_written)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
)
|
| 312 |
d = plan.to_dict()
|
| 313 |
lines = [
|
| 314 |
+
"๐ SELF-IMPROVEMENT PLAN", "โ"*50,
|
| 315 |
+
f"Original Score: {d['original_score']:.3f} | Failure: {d['failure_type']}",
|
| 316 |
+
f"\nโ What went wrong:\n {d['what_went_wrong']}",
|
| 317 |
+
f"\n๐ฏ Improved strategy:\n {d['improved_strategy']}",
|
| 318 |
+
"\n๐ Step-by-step plan:",
|
| 319 |
+
] + [f" {s}" for s in d.get("step_by_step_plan",[])]
|
| 320 |
+
lines += ["\n๐ System Prompt Injection:", "โ"*40, d.get("system_prompt_addon","None")]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
return "\n".join(lines)
|
| 322 |
+
except Exception as e: return f"Error: {e}"
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def get_memory_context_for_task(task):
|
| 326 |
+
try:
|
| 327 |
+
ctx = memory_bank.retrieve(task=task, max_lessons=3)
|
| 328 |
+
stats = memory_bank.get_stats()
|
| 329 |
+
lines = [
|
| 330 |
+
f"๐ง MEMORY BANK โ {stats['total_entries']} total lessons",
|
| 331 |
+
f"Retrieving for: {task}", "โ"*50,
|
| 332 |
+
]
|
| 333 |
+
if not ctx.relevant_lessons:
|
| 334 |
+
lines.append("No lessons stored yet. Run episodes to build memory.")
|
| 335 |
+
else:
|
| 336 |
+
lines.append(f"\n๐ {ctx.lessons_count} relevant lesson(s):\n")
|
| 337 |
+
for i, e in enumerate(ctx.relevant_lessons, 1):
|
| 338 |
+
lines += [
|
| 339 |
+
f"[Lesson {i}] Task: {e.task} | Failure: {e.failure_type} | Score: {e.score:.2f}",
|
| 340 |
+
f" Title: {e.lesson_title}",
|
| 341 |
+
f" Lesson: {e.lesson_body[:120]}",
|
| 342 |
+
f" Hint: {e.lesson_hint[:120]}" if e.lesson_hint else "",
|
| 343 |
+
"",
|
| 344 |
+
]
|
| 345 |
+
lines += ["\n๐ System Prompt Injection:", "โ"*40, ctx.system_prompt_injection]
|
| 346 |
+
return "\n".join(l for l in lines)
|
| 347 |
+
except Exception as e: return f"Error: {e}"
|
| 348 |
|
| 349 |
|
| 350 |
# โโ Tab 6: Compare Agents โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 351 |
|
| 352 |
+
def run_comparison(task, selected_agents):
|
| 353 |
try:
|
| 354 |
+
agents = selected_agents or None
|
| 355 |
report = multi_agent_engine.compare(env, task=task, agents=agents)
|
| 356 |
d = report.to_dict()
|
|
|
|
| 357 |
lines = [
|
| 358 |
f"โ๏ธ MULTI-AGENT COMPARISON โ {task} (variant: {d.get('variant_id')})",
|
| 359 |
+
f"๐ Winner: {d.get('winner')} (score: {d.get('winner_score',0):.3f})", "โ"*80,
|
| 360 |
+
f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Steps':<7} {'Strategy':<22} {'Failure':<20} {'Reliability'}",
|
| 361 |
+
"โ"*80,
|
|
|
|
| 362 |
]
|
| 363 |
+
for row in d.get("summary_table",[]):
|
| 364 |
+
lines.append(f"#{row['rank']:<4} {row['agent']:<16} {row['score']:<8.3f} {row['steps']:<7} {row['strategy']:<22} {row['failure']:<20} {row['reliability']:.3f}")
|
| 365 |
+
lines.append("โ"*80)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
if d.get("insights"):
|
| 367 |
lines += ["\n๐ก Insights:"] + [f" โ {i}" for i in d["insights"]]
|
| 368 |
+
lines.append("\n๐ Action Sequences:")
|
| 369 |
+
for run in d.get("detailed_runs",[]):
|
| 370 |
+
seq = " โ ".join(run.get("action_sequence",[]))
|
|
|
|
| 371 |
lines.append(f" {run['agent_name']:16s}: {seq}")
|
|
|
|
| 372 |
return "\n".join(lines)
|
| 373 |
+
except Exception as e: return f"โ {e}"
|
|
|
|
| 374 |
|
| 375 |
|
| 376 |
# โโ Tab 7: 3D Visualizer โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 377 |
|
| 378 |
+
def get_viz_iframe():
|
| 379 |
+
"""Return iframe pointing to /static/viz3d.html โ fixes Three.js canvas rendering."""
|
| 380 |
+
# Add a cache-busting timestamp so Gradio re-renders on refresh
|
| 381 |
+
import time
|
| 382 |
+
ts = int(time.time())
|
| 383 |
+
return (
|
| 384 |
+
f'<iframe src="/static/viz3d.html?t={ts}" '
|
| 385 |
+
f'width="100%" height="640" frameborder="0" '
|
| 386 |
+
f'style="border-radius:10px;border:1px solid rgba(125,211,252,0.2);'
|
| 387 |
+
f'background:#0a0e1a;" '
|
| 388 |
+
f'allow="accelerometer; autoplay" loading="lazy">'
|
| 389 |
+
f'</iframe>'
|
| 390 |
+
)
|
| 391 |
|
|
|
|
|
|
|
| 392 |
|
| 393 |
+
# โโ Tab 8: Causal Probe โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 394 |
+
|
| 395 |
+
def get_causal_probe():
|
| 396 |
+
try:
|
| 397 |
+
traj, meta, steps, ep_id = _get_traj_and_meta()
|
| 398 |
+
if not traj: return _no_traj()
|
| 399 |
+
r = causal_probe.probe(ep_id, env.current_task or "?", steps, meta,
|
| 400 |
+
list(env.files_read), list(env.files_written), env.final_score)
|
| 401 |
+
d = r.to_dict()
|
| 402 |
+
bar = lambda v: "โ"*int(v*20)+"โ"*(20-int(v*20))
|
| 403 |
+
lines = [
|
| 404 |
+
f"๐งช CAUSAL REASONING PROBE",
|
| 405 |
+
f"โ"*55,
|
| 406 |
+
f"Understanding Level: {d['understanding_level']}",
|
| 407 |
+
f"Causal Score: [{bar(d['causal_score'])}] {d['causal_score']:.3f}",
|
| 408 |
+
f"Chain Coverage: [{bar(d['chain_coverage'])}] {d['chain_coverage']:.3f}",
|
| 409 |
+
f"Chain Order Score: [{bar(d['chain_order_score'])}] {d['chain_order_score']:.3f}",
|
| 410 |
+
f"\n๐ก Behavioral Signals:",
|
| 411 |
+
]
|
| 412 |
+
sigs = d.get("behavioral_signals",{})
|
| 413 |
+
for k,v in sigs.items():
|
| 414 |
+
lines.append(f" {'โ
' if v else 'โ'} {k.replace('_',' ').title()}")
|
| 415 |
+
if d.get("understanding_indicators"):
|
| 416 |
+
lines += ["\nโ
Understanding Indicators:"] + [f" โข {i}" for i in d["understanding_indicators"]]
|
| 417 |
+
if d.get("guessing_indicators"):
|
| 418 |
+
lines += ["\nโ Guessing Indicators:"] + [f" โข {i}" for i in d["guessing_indicators"]]
|
| 419 |
+
diag = d.get("diagnostics",{})
|
| 420 |
+
if diag.get("false_confidence_detected"):
|
| 421 |
+
lines.append("\nโ ๏ธ FALSE CONFIDENCE DETECTED โ submitted without adequate exploration")
|
| 422 |
+
if diag.get("shortcut_learning_detected"):
|
| 423 |
+
lines.append("โ ๏ธ SHORTCUT LEARNING DETECTED โ wrote without reading source")
|
| 424 |
+
lines += [f"\n๐ {d['explanation']}"]
|
| 425 |
+
if d.get("recommendations"):
|
| 426 |
+
lines += ["\n๐ก Recommendations:"] + [f" โ {r_}" for r_ in d["recommendations"]]
|
| 427 |
+
return "\n".join(lines)
|
| 428 |
+
except Exception as e: return f"Error: {e}"
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
# โโ Tab 9: Counterfactual โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 432 |
+
|
| 433 |
+
def get_counterfactual():
|
| 434 |
+
try:
|
| 435 |
+
traj, meta, steps, ep_id = _get_traj_and_meta()
|
| 436 |
+
if not traj: return _no_traj()
|
| 437 |
+
r = counterfactual_engine.analyze(ep_id, env.current_task or "?", steps, meta,
|
| 438 |
+
list(env.files_read), list(env.files_written), env.final_score)
|
| 439 |
+
d = r.to_dict()
|
| 440 |
+
bar = lambda v: "โ"*int(v*20)+"โ"*(20-int(v*20))
|
| 441 |
+
lines = [
|
| 442 |
+
f"๐ญ COUNTERFACTUAL ROBUSTNESS TEST",
|
| 443 |
+
f"โ"*55,
|
| 444 |
+
f"Brittleness Level: {d['brittleness_level']}",
|
| 445 |
+
f"Robustness Score: [{bar(d['robustness_score'])}] {d['robustness_score']:.3f}",
|
| 446 |
+
f"Mutations Tested: {d['mutations_tested']}",
|
| 447 |
+
f"Mutations Survived: {d['mutations_survived']} โ
| Failed: {d['mutations_failed']} โ",
|
| 448 |
+
f"\n๐งฌ Mutation Results:",
|
| 449 |
+
]
|
| 450 |
+
for m in d.get("mutations",[]):
|
| 451 |
+
icon = "โ
" if not m["would_break_agent"] else "โ"
|
| 452 |
+
lines.append(f" {icon} [{m['type']}] {m['description'][:55]}")
|
| 453 |
+
lines.append(f" {m['why'][:80]}")
|
| 454 |
+
if d.get("surface_dependencies"):
|
| 455 |
+
lines += ["\nโ ๏ธ Surface Dependencies:"] + [f" โข {s}" for s in d["surface_dependencies"]]
|
| 456 |
+
if d.get("deep_dependencies"):
|
| 457 |
+
lines += ["\nโ
Deep Dependencies:"] + [f" โข {s}" for s in d["deep_dependencies"]]
|
| 458 |
+
lines += [f"\n๐ {d['explanation']}"]
|
| 459 |
+
if d.get("recommendations"):
|
| 460 |
+
lines += ["\n๐ก Recommendations:"] + [f" โ {r_}" for r_ in d["recommendations"]]
|
| 461 |
+
return "\n".join(lines)
|
| 462 |
+
except Exception as e: return f"Error: {e}"
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
# โโ Tab 10: Confidence Calibration โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 466 |
+
|
| 467 |
+
def get_calibration():
|
| 468 |
+
try:
|
| 469 |
+
traj, meta, steps, ep_id = _get_traj_and_meta()
|
| 470 |
+
if not traj: return _no_traj()
|
| 471 |
+
r = confidence_calibrator.calibrate(ep_id, env.current_task or "?", steps, env.final_score)
|
| 472 |
+
d = r.to_dict()
|
| 473 |
+
bar = lambda v: "โ"*int(v*20)+"โ"*(20-int(v*20))
|
| 474 |
+
lines = [
|
| 475 |
+
f"๐ CONFIDENCE CALIBRATION REPORT",
|
| 476 |
+
f"โ"*55,
|
| 477 |
+
f"Calibration Profile: {d['profile']}",
|
| 478 |
+
f"Calibration Score: [{bar(d['calibration_score'])}] {d['calibration_score']:.3f}",
|
| 479 |
+
f"Inferred Confidence: [{bar(d['inferred_confidence'])}] {d['inferred_confidence']:.3f}",
|
| 480 |
+
f"Actual Performance: [{bar(d['actual_performance'])}] {d['actual_performance']:.3f}",
|
| 481 |
+
f"Calibration Error: {d['expected_calibration_error']:.3f} (lower=better)",
|
| 482 |
+
f"Conf-Acc Correlation: {d['confidence_accuracy_correlation']:.3f}",
|
| 483 |
+
f"\n๐ Behavioral Signals:",
|
| 484 |
+
]
|
| 485 |
+
sigs = d.get("signals",{})
|
| 486 |
+
lines.append(f" Commitment Speed: {sigs.get('commitment_speed',0):.3f} (high=fast commit)")
|
| 487 |
+
lines.append(f" Re-Exploration Rate: {sigs.get('re_exploration_rate',0):.3f} (high=uncertain)")
|
| 488 |
+
lines.append(f" Verification Rate: {sigs.get('verification_rate',0):.3f} tests/write")
|
| 489 |
+
lines.append(f" Submit Speed: {sigs.get('submit_speed',0):.3f} (high=early submit)")
|
| 490 |
+
lines += [f"\n๐ {d['diagnosis']}"]
|
| 491 |
+
if d.get("recommendations"):
|
| 492 |
+
lines += ["\n๐ก Recommendations:"] + [f" โ {r_}" for r_ in d["recommendations"]]
|
| 493 |
+
if d.get("confidence_trajectory"):
|
| 494 |
+
lines.append("\n๐ Confidence Trajectory:")
|
| 495 |
+
for s in d["confidence_trajectory"][:8]:
|
| 496 |
+
acc_str = f" | acc={s['accuracy']:.2f}" if s['accuracy'] is not None else ""
|
| 497 |
+
lines.append(f" S{s['step']}: {s['action']:12s} conf={s['confidence']:.2f}{acc_str}")
|
| 498 |
+
return "\n".join(lines)
|
| 499 |
+
except Exception as e: return f"Error: {e}"
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
# โโ Tab 11: Benchmark โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 503 |
+
|
| 504 |
+
def run_benchmark(tasks_selected, agents_selected):
|
| 505 |
+
try:
|
| 506 |
+
tasks = tasks_selected if tasks_selected else ["task1", "task2", "task3"]
|
| 507 |
+
agents = agents_selected if agents_selected else None
|
| 508 |
+
report = benchmark_runner.run(env, tasks=tasks, agents=agents)
|
| 509 |
+
return report.render_table()
|
| 510 |
+
except Exception as e:
|
| 511 |
+
return f"โ Benchmark error: {e}"
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
# โโ Tab 12: Analytics โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 515 |
+
|
| 516 |
+
def get_analytics():
|
| 517 |
+
try:
|
| 518 |
+
if not env.get_trajectory():
|
| 519 |
+
return _no_traj()
|
| 520 |
+
report = analytics_engine.analyze(env)
|
| 521 |
+
return report.render_text()
|
| 522 |
+
except Exception as e:
|
| 523 |
+
return f"Error: {e}"
|
| 524 |
+
|
| 525 |
+
def get_analytics_json():
|
| 526 |
+
try:
|
| 527 |
+
if not env.get_trajectory():
|
| 528 |
+
return _no_traj()
|
| 529 |
+
report = analytics_engine.analyze(env)
|
| 530 |
+
return json.dumps(report.to_dict(), indent=2, default=str)
|
| 531 |
+
except Exception as e:
|
| 532 |
+
return f"Error: {e}"
|
| 533 |
|
| 534 |
|
| 535 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 536 |
+
# Gradio UI
|
| 537 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 538 |
|
| 539 |
+
with gr.Blocks(title="Codebase Navigation & Repair โ OpenEnv v4") as demo:
|
| 540 |
gr.Markdown(
|
| 541 |
+
"# ๐ Codebase Navigation & Repair โ OpenEnv v4\n"
|
| 542 |
+
"**The first platform that scientifically measures, explains, and improves AI agent reasoning.** "
|
| 543 |
+
"Navigate ยท Fix ยท Evaluate Process ยท Probe Causality ยท Test Counterfactuals ยท Calibrate Confidence ยท Benchmark."
|
| 544 |
)
|
| 545 |
|
| 546 |
with gr.Tabs():
|
|
|
|
| 549 |
with gr.TabItem("๐ฎ Interactive"):
|
| 550 |
with gr.Row():
|
| 551 |
with gr.Column(scale=1):
|
| 552 |
+
task_sel = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
reset_btn = gr.Button("๐ Reset Environment", variant="primary")
|
| 554 |
gr.Markdown("### Action")
|
| 555 |
+
act_type = gr.Dropdown(["read_file","write_file","run_tests","search_code","submit"], value="read_file", label="Action Type")
|
|
|
|
|
|
|
|
|
|
| 556 |
act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
|
| 557 |
+
act_query = gr.Textbox(label="Query", placeholder="validate_token")
|
| 558 |
act_content = gr.Textbox(label="Content (write_file)", lines=4)
|
| 559 |
step_btn = gr.Button("โถ๏ธ Execute Step", variant="secondary")
|
| 560 |
with gr.Column(scale=2):
|
|
|
|
| 563 |
with gr.Row():
|
| 564 |
steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
|
| 565 |
reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
|
| 566 |
+
reset_btn.click(reset_environment, [task_sel], [status_box, result_box, steps_box, reward_box])
|
| 567 |
step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
|
| 568 |
|
| 569 |
# โโ Tab 2: Run Agent โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 570 |
with gr.TabItem("๐ค Run Agent"):
|
| 571 |
+
gr.Markdown("### Built-in Demonstration Agent\nRuns test-first deterministic strategy + stores lesson in memory bank.")
|
| 572 |
+
agent_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
|
| 573 |
run_btn = gr.Button("๐ Run Agent", variant="primary")
|
| 574 |
+
agent_out = gr.Textbox(label="Agent Log", lines=22, interactive=False)
|
| 575 |
+
run_btn.click(run_builtin_agent, [agent_task], [agent_out])
|
| 576 |
|
| 577 |
# โโ Tab 3: Evaluation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 578 |
with gr.TabItem("๐ Evaluation"):
|
|
|
|
| 585 |
metrics_btn.click(get_metrics, outputs=[eval_out])
|
| 586 |
traj_btn.click(get_trajectory, outputs=[eval_out])
|
| 587 |
|
| 588 |
+
# โโ Tab 4: Intelligence โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 589 |
with gr.TabItem("๐ง Intelligence"):
|
| 590 |
+
gr.Markdown("### Deep Agent Intelligence Analysis")
|
|
|
|
|
|
|
|
|
|
| 591 |
with gr.Row():
|
| 592 |
+
clf_btn = gr.Button("๐ฌ Classify Failure", variant="primary")
|
| 593 |
+
strat_btn = gr.Button("๐งญ Detect Strategy", variant="secondary")
|
| 594 |
adv_btn = gr.Button("โก Advanced Metrics", variant="secondary")
|
| 595 |
intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
|
| 596 |
+
clf_btn.click(get_failure_classification, outputs=[intel_out])
|
| 597 |
+
strat_btn.click(get_strategy_detection, outputs=[intel_out])
|
| 598 |
adv_btn.click(get_advanced_metrics, outputs=[intel_out])
|
| 599 |
|
| 600 |
+
# โโ Tab 5: Self-Improve โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 601 |
with gr.TabItem("๐ Self-Improve"):
|
| 602 |
+
gr.Markdown("### Self-Improvement Loop + Episodic Memory")
|
| 603 |
+
with gr.Row():
|
| 604 |
+
improve_btn = gr.Button("๐ Improvement Plan", variant="primary")
|
| 605 |
+
mem_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task for Memory")
|
| 606 |
+
mem_btn = gr.Button("๐ง Retrieve Memory", variant="secondary")
|
| 607 |
+
improve_out = gr.Textbox(label="Output", lines=32, interactive=False)
|
|
|
|
| 608 |
improve_btn.click(get_improvement_plan, outputs=[improve_out])
|
| 609 |
+
mem_btn.click(get_memory_context_for_task, [mem_task], [improve_out])
|
| 610 |
|
| 611 |
+
# โโ Tab 6: Compare Agents โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 612 |
with gr.TabItem("โ๏ธ Compare Agents"):
|
| 613 |
+
gr.Markdown("### Multi-Agent Strategy Comparison")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
with gr.Row():
|
| 615 |
+
comp_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
|
| 616 |
comp_agents = gr.CheckboxGroup(
|
| 617 |
+
["test-first","search-first","minimal","exhaustive"],
|
| 618 |
+
value=["test-first","search-first","minimal","exhaustive"],
|
| 619 |
+
label="Agents",
|
| 620 |
)
|
| 621 |
comp_btn = gr.Button("โ๏ธ Run Comparison", variant="primary")
|
| 622 |
+
comp_out = gr.Textbox(label="Report", lines=30, interactive=False)
|
| 623 |
comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
|
| 624 |
|
| 625 |
+
# โโ Tab 7: 3D Visualizer โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 626 |
with gr.TabItem("๐ 3D Visualizer"):
|
| 627 |
gr.Markdown(
|
| 628 |
"### Agent Trajectory 3D Visualization\n"
|
| 629 |
+
"Files = glowing 3D spheres ยท Dependencies = edges ยท Agent = animated beam ยท **Run an episode first.**"
|
| 630 |
+
)
|
| 631 |
+
refresh_btn = gr.Button("๐ Load / Refresh Visualizer", variant="primary")
|
| 632 |
+
viz_html = gr.HTML(
|
| 633 |
+
value='<div style="text-align:center;padding:60px;color:#475569;background:#0a0e1a;border-radius:10px">'
|
| 634 |
+
'<p style="font-size:24px">๐</p>'
|
| 635 |
+
'<p style="color:#7dd3fc;font-weight:700">Run an episode then click Load</p></div>'
|
| 636 |
+
)
|
| 637 |
+
refresh_btn.click(get_viz_iframe, outputs=[viz_html])
|
| 638 |
+
|
| 639 |
+
# โโ Tab 8: Causal Probe โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 640 |
+
with gr.TabItem("๐งช Causal Probe"):
|
| 641 |
+
gr.Markdown(
|
| 642 |
+
"### Causal Reasoning Evaluation\n"
|
| 643 |
+
"Did the agent truly understand WHY the bug exists, "
|
| 644 |
+
"or did it pattern-match and guess? "
|
| 645 |
+
"Measures chain coverage, order, and shortcut learning."
|
| 646 |
+
)
|
| 647 |
+
causal_btn = gr.Button("๐งช Run Causal Probe", variant="primary")
|
| 648 |
+
causal_out = gr.Textbox(label="Causal Reasoning Report", lines=32, interactive=False)
|
| 649 |
+
causal_btn.click(get_causal_probe, outputs=[causal_out])
|
| 650 |
+
|
| 651 |
+
# โโ Tab 9: Counterfactual โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 652 |
+
with gr.TabItem("๐ญ Counterfactual"):
|
| 653 |
+
gr.Markdown(
|
| 654 |
+
"### Counterfactual Robustness Testing\n"
|
| 655 |
+
"Applies 6 semantic-neutral mutations (filename rename, constant change, "
|
| 656 |
+
"dummy function, directory shift, docstring noise, import reorder) "
|
| 657 |
+
"and measures whether the agent's strategy survives."
|
| 658 |
)
|
| 659 |
+
cf_btn = gr.Button("๐ญ Run Counterfactual Analysis", variant="primary")
|
| 660 |
+
cf_out = gr.Textbox(label="Robustness Report", lines=32, interactive=False)
|
| 661 |
+
cf_btn.click(get_counterfactual, outputs=[cf_out])
|
| 662 |
|
| 663 |
+
# โโ Tab 10: Confidence โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 664 |
+
with gr.TabItem("๐ Confidence"):
|
| 665 |
+
gr.Markdown(
|
| 666 |
+
"### Confidence Calibration Analysis\n"
|
| 667 |
+
"Infers agent confidence from behavioral proxies (commitment speed, "
|
| 668 |
+
"re-exploration rate, verification rate, submit timing) "
|
| 669 |
+
"and compares to actual performance. Detects overconfident and underconfident agents."
|
| 670 |
+
)
|
| 671 |
+
calib_btn = gr.Button("๐ Analyze Calibration", variant="primary")
|
| 672 |
+
calib_out = gr.Textbox(label="Calibration Report", lines=32, interactive=False)
|
| 673 |
+
calib_btn.click(get_calibration, outputs=[calib_out])
|
| 674 |
+
|
| 675 |
+
# โโ Tab 11: Benchmark โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 676 |
+
with gr.TabItem("๐ Benchmark"):
|
| 677 |
+
gr.Markdown(
|
| 678 |
+
"### Automated Benchmark Leaderboard\n"
|
| 679 |
+
"Runs all selected agent strategies ร all selected tasks automatically. "
|
| 680 |
+
"Ranks by composite score: correctness + causal reasoning + robustness + calibration + generalization."
|
| 681 |
+
)
|
| 682 |
+
with gr.Row():
|
| 683 |
+
bench_tasks = gr.CheckboxGroup(["task1","task2","task3"], value=["task1","task2"], label="Tasks to Benchmark")
|
| 684 |
+
bench_agents = gr.CheckboxGroup(
|
| 685 |
+
["test-first","search-first","minimal","exhaustive"],
|
| 686 |
+
value=["test-first","minimal"],
|
| 687 |
+
label="Agent Strategies",
|
| 688 |
+
)
|
| 689 |
+
bench_btn = gr.Button("๐ Run Benchmark (2โ4 min)", variant="primary")
|
| 690 |
+
bench_out = gr.Textbox(label="Leaderboard", lines=35, interactive=False)
|
| 691 |
+
bench_btn.click(run_benchmark, [bench_tasks, bench_agents], [bench_out])
|
| 692 |
+
|
| 693 |
+
# โโ Tab 12: Analytics โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 694 |
+
with gr.TabItem("๐ Analytics"):
|
| 695 |
+
gr.Markdown(
|
| 696 |
+
"### Unified Research-Grade Analytics\n"
|
| 697 |
+
"Synthesizes ALL evaluation dimensions into one report: "
|
| 698 |
+
"reasoning graph, root cause tree, alternative paths, profile tags, "
|
| 699 |
+
"decision efficiency, composite score. Paper-ready JSON available."
|
| 700 |
+
)
|
| 701 |
+
with gr.Row():
|
| 702 |
+
analytics_btn = gr.Button("๐ Full Analytics Report", variant="primary")
|
| 703 |
+
analytics_json_btn = gr.Button("๐ Export JSON", variant="secondary")
|
| 704 |
+
analytics_out = gr.Textbox(label="Analytics Report", lines=40, interactive=False)
|
| 705 |
+
analytics_btn.click(get_analytics, outputs=[analytics_out])
|
| 706 |
+
analytics_json_btn.click(get_analytics_json, outputs=[analytics_out])
|
| 707 |
+
|
| 708 |
+
# โโ Tab 13: API โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 709 |
with gr.TabItem("๐ API"):
|
| 710 |
gr.Markdown("""
|
| 711 |
+
### REST API โ v4.0 Endpoints
|
| 712 |
|
| 713 |
+
#### Core
|
| 714 |
+
| `/reset` POST | `/step` POST | `/state` GET | `/health` GET |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 715 |
|
| 716 |
#### Evaluation
|
| 717 |
+
| `/trajectory` GET | `/evaluate` GET | `/metrics` GET | `/fault-config` POST |
|
| 718 |
+
|
| 719 |
+
#### Intelligence (v3)
|
| 720 |
+
| `/classify` GET | `/strategy` GET | `/advanced-metrics` GET | `/improvement-plan` GET | `/compare-agents` POST | `/viz-data` GET |
|
| 721 |
+
|
| 722 |
+
#### Research (v4 NEW)
|
| 723 |
+
| `/causal-probe` GET | `/counterfactual` GET | `/confidence` GET | `/benchmark` POST | `/analytics` GET |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 724 |
|
| 725 |
```bash
|
| 726 |
BASE="http://localhost:7860"
|
| 727 |
+
# Run a full episode
|
| 728 |
curl -X POST "$BASE/reset?task=task1"
|
| 729 |
+
curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"tests/test_formatter.py"}'
|
| 730 |
curl -X POST "$BASE/step" -d '{"action_type":"submit"}'
|
| 731 |
+
|
| 732 |
+
# All intelligence endpoints
|
| 733 |
curl "$BASE/classify"
|
| 734 |
+
curl "$BASE/causal-probe"
|
| 735 |
+
curl "$BASE/counterfactual"
|
| 736 |
+
curl "$BASE/confidence"
|
| 737 |
+
curl "$BASE/analytics"
|
| 738 |
+
|
| 739 |
+
# Benchmark
|
| 740 |
+
curl -X POST "$BASE/benchmark?tasks=task1,task2"
|
| 741 |
```
|
| 742 |
""")
|
| 743 |
|
| 744 |
|
| 745 |
+
# โโ Mount FastAPI โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 746 |
from server.app import app as fastapi_app
|
| 747 |
gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
|
| 748 |
|
e2e_test_v3.py
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
e2e_test_v3.py โ Full End-to-End test suite for v3.0
|
| 4 |
+
|
| 5 |
+
Tests every endpoint, all 3 tasks, all new intelligence modules,
|
| 6 |
+
multi-agent comparison, and the 3D viz-data endpoint.
|
| 7 |
+
"""
|
| 8 |
+
import sys
|
| 9 |
+
import json
|
| 10 |
+
import time
|
| 11 |
+
import requests
|
| 12 |
+
|
| 13 |
+
BASE = "http://localhost:7860"
|
| 14 |
+
PASS = 0
|
| 15 |
+
FAIL = 0
|
| 16 |
+
RESULTS = []
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def check(name, condition, detail=""):
|
| 20 |
+
global PASS, FAIL
|
| 21 |
+
status = "โ
PASS" if condition else "โ FAIL"
|
| 22 |
+
if condition:
|
| 23 |
+
PASS += 1
|
| 24 |
+
else:
|
| 25 |
+
FAIL += 1
|
| 26 |
+
msg = f" {status} {name}"
|
| 27 |
+
if detail:
|
| 28 |
+
msg += f" โ {detail}"
|
| 29 |
+
print(msg)
|
| 30 |
+
RESULTS.append({"name": name, "passed": condition, "detail": detail})
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def section(title):
    """Print a horizontal-rule banner announcing a test section.

    Args:
        title: Section heading printed between two 60-char rules.
    """
    # NOTE(review): the rule character was mojibake-corrupted in the
    # original ('โ' = stray lead byte 0xE2); restored to '─' (U+2500).
    print(f"\n{'─'*60}")
    print(f" {title}")
    print(f"{'─'*60}")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 40 |
+
section("1. HEALTH & BASIC CONNECTIVITY")
|
| 41 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 42 |
+
|
| 43 |
+
r = requests.get(f"{BASE}/health")
|
| 44 |
+
check("GET /health returns 200", r.status_code == 200)
|
| 45 |
+
data = r.json()
|
| 46 |
+
check("Health version is 3.0.0", data.get("version") == "3.0.0", data.get("version"))
|
| 47 |
+
check("Health status is ok", data.get("status") == "ok")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 51 |
+
section("2. CORE OPENENV โ ALL 3 TASKS")
|
| 52 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 53 |
+
|
| 54 |
+
for task in ["task1", "task2", "task3"]:
|
| 55 |
+
r = requests.post(f"{BASE}/reset?task={task}")
|
| 56 |
+
check(f"POST /reset?task={task} โ 200", r.status_code == 200, f"status={r.status_code}")
|
| 57 |
+
if r.status_code == 200:
|
| 58 |
+
d = r.json()
|
| 59 |
+
obs = d.get("observation", {})
|
| 60 |
+
check(f" {task}: has repo_tree", bool(obs.get("repo_tree")), str(obs.get("repo_tree", [])[:2]))
|
| 61 |
+
check(f" {task}: has variant_id", bool(d.get("info", {}).get("variant_id")))
|
| 62 |
+
check(f" {task}: steps_remaining > 0", obs.get("steps_remaining", 0) > 0)
|
| 63 |
+
|
| 64 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 65 |
+
section("3. STEP ACTIONS โ FULL EPISODE (task1)")
|
| 66 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 67 |
+
|
| 68 |
+
r = requests.post(f"{BASE}/reset?task=task1")
|
| 69 |
+
obs = r.json()["observation"]
|
| 70 |
+
tree = obs["repo_tree"]
|
| 71 |
+
test_files = [f for f in tree if f.startswith("tests/")]
|
| 72 |
+
src_files = [f for f in tree if f.startswith("src/")]
|
| 73 |
+
|
| 74 |
+
# read_file
|
| 75 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
|
| 76 |
+
check("POST /step read_file test file โ 200", r.status_code == 200)
|
| 77 |
+
check("read_file reward >= 0", r.json().get("reward", -1) >= 0, str(r.json().get("reward")))
|
| 78 |
+
|
| 79 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": src_files[0]})
|
| 80 |
+
check("POST /step read_file src file โ 200", r.status_code == 200)
|
| 81 |
+
|
| 82 |
+
# search_code
|
| 83 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "search_code", "query": "def "})
|
| 84 |
+
check("POST /step search_code โ 200", r.status_code == 200)
|
| 85 |
+
|
| 86 |
+
# run_tests
|
| 87 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "run_tests"})
|
| 88 |
+
check("POST /step run_tests โ 200", r.status_code == 200, f"reward={r.json().get('reward')}")
|
| 89 |
+
|
| 90 |
+
# submit
|
| 91 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "submit"})
|
| 92 |
+
check("POST /step submit โ 200", r.status_code == 200)
|
| 93 |
+
final_score = r.json()["info"].get("final_score", 0)
|
| 94 |
+
check("Episode done after submit", r.json().get("done") == True)
|
| 95 |
+
|
| 96 |
+
# Try stepping after done โ should get 400
|
| 97 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": "x.py"})
|
| 98 |
+
check("POST /step after done โ 400", r.status_code == 400)
|
| 99 |
+
|
| 100 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 101 |
+
section("4. STATE ENDPOINT")
|
| 102 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 103 |
+
|
| 104 |
+
requests.post(f"{BASE}/reset?task=task1")
|
| 105 |
+
requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
|
| 106 |
+
r = requests.get(f"{BASE}/state")
|
| 107 |
+
check("GET /state โ 200", r.status_code == 200)
|
| 108 |
+
d = r.json()
|
| 109 |
+
check("State has observation", "observation" in d)
|
| 110 |
+
check("State total_steps_taken >= 1", d.get("total_steps_taken", 0) >= 1)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 114 |
+
section("5. TRAJECTORY & EVALUATION")
|
| 115 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 116 |
+
|
| 117 |
+
requests.post(f"{BASE}/step", json={"action_type": "submit"})
|
| 118 |
+
|
| 119 |
+
r = requests.get(f"{BASE}/trajectory")
|
| 120 |
+
check("GET /trajectory โ 200", r.status_code == 200)
|
| 121 |
+
traj = r.json()
|
| 122 |
+
check("Trajectory has episode_id", bool(traj.get("episode_id")))
|
| 123 |
+
check("Trajectory steps > 0", len(traj.get("steps", [])) > 0, f"steps={len(traj.get('steps',[]))}")
|
| 124 |
+
|
| 125 |
+
r = requests.get(f"{BASE}/evaluate")
|
| 126 |
+
check("GET /evaluate โ 200", r.status_code == 200)
|
| 127 |
+
ev = r.json()
|
| 128 |
+
check("Evaluation has composite_score", "composite_score" in ev, str(ev.get("composite_score")))
|
| 129 |
+
check("Evaluation has 6 dimensions", len(ev.get("dimensions", {})) == 6, str(list(ev.get("dimensions", {}).keys())))
|
| 130 |
+
|
| 131 |
+
r = requests.get(f"{BASE}/metrics")
|
| 132 |
+
check("GET /metrics โ 200", r.status_code == 200)
|
| 133 |
+
m = r.json()
|
| 134 |
+
check("Metrics has timeline", "timeline" in m, str(list(m.keys())[:5]))
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 138 |
+
section("6. FAULT INJECTION")
|
| 139 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 140 |
+
|
| 141 |
+
r = requests.post(f"{BASE}/fault-config", json={"level": "light"})
|
| 142 |
+
check("POST /fault-config light โ 200", r.status_code == 200)
|
| 143 |
+
r = requests.post(f"{BASE}/reset?task=task1")
|
| 144 |
+
check("Reset with fault injection โ 200", r.status_code == 200)
|
| 145 |
+
fi = r.json().get("info", {}).get("fault_injection", {})
|
| 146 |
+
check("Fault injection info present", "difficulty_multiplier" in fi or "faults_injected" in fi, str(fi))
|
| 147 |
+
|
| 148 |
+
# Reset back
|
| 149 |
+
requests.post(f"{BASE}/fault-config", json={"level": "none"})
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 153 |
+
section("7. INTELLIGENCE โ FAILURE CLASSIFIER")
|
| 154 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 155 |
+
|
| 156 |
+
# Run a fresh episode with minimal effort to get a known failure
|
| 157 |
+
requests.post(f"{BASE}/reset?task=task1")
|
| 158 |
+
requests.post(f"{BASE}/step", json={"action_type": "submit"}) # Submit without doing anything
|
| 159 |
+
|
| 160 |
+
r = requests.get(f"{BASE}/classify")
|
| 161 |
+
check("GET /classify โ 200", r.status_code == 200)
|
| 162 |
+
d = r.json()
|
| 163 |
+
check("Classify has episode_id", "episode_id" in d, d.get("episode_id"))
|
| 164 |
+
check("Classify has primary_failure", "primary_failure" in d, d.get("primary_failure"))
|
| 165 |
+
check("Classify has success field", "success" in d)
|
| 166 |
+
check("Classify success=False for minimal effort", d.get("success") == False)
|
| 167 |
+
check("Classify has retry_hint", bool(d.get("retry_hint")), d.get("retry_hint", "")[:60])
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 171 |
+
section("8. INTELLIGENCE โ STRATEGY DETECTOR")
|
| 172 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 173 |
+
|
| 174 |
+
r = requests.get(f"{BASE}/strategy")
|
| 175 |
+
check("GET /strategy โ 200", r.status_code == 200)
|
| 176 |
+
d = r.json()
|
| 177 |
+
check("Strategy has strategy field", "strategy" in d, d.get("strategy"))
|
| 178 |
+
VALID_STRATEGIES = ["TARGETED_DEBUGGING", "SYSTEMATIC_SEARCH", "BRUTE_FORCE",
|
| 179 |
+
"RANDOM_EXPLORATION", "SPEC_DRIVEN", "MINIMAL_EFFORT"]
|
| 180 |
+
check("Strategy is a known label", d.get("strategy") in VALID_STRATEGIES, d.get("strategy"))
|
| 181 |
+
check("Strategy has score 0-1", 0 <= d.get("score", -1) <= 1, str(d.get("score")))
|
| 182 |
+
check("Strategy has exploration_ratio", "exploration_ratio" in d)
|
| 183 |
+
check("Strategy has sub_patterns list", isinstance(d.get("sub_patterns"), list))
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 187 |
+
section("9. INTELLIGENCE โ ADVANCED METRICS")
|
| 188 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 189 |
+
|
| 190 |
+
r = requests.get(f"{BASE}/advanced-metrics")
|
| 191 |
+
check("GET /advanced-metrics โ 200", r.status_code == 200)
|
| 192 |
+
d = r.json()
|
| 193 |
+
expected_keys = ["reasoning_efficiency", "exploration_ratio", "decision_entropy",
|
| 194 |
+
"reliability_index", "pivot_rate", "wasteful_ratio", "consistency_score"]
|
| 195 |
+
for key in expected_keys:
|
| 196 |
+
check(f" advanced-metrics has '{key}'", key in d, str(d.get(key, "MISSING")))
|
| 197 |
+
check("reliability_index in [0,1]", 0 <= d.get("reliability_index", -1) <= 1)
|
| 198 |
+
check("action_distribution is dict", isinstance(d.get("action_distribution"), dict))
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 202 |
+
section("10. INTELLIGENCE โ IMPROVEMENT PLAN")
|
| 203 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 204 |
+
|
| 205 |
+
r = requests.get(f"{BASE}/improvement-plan")
|
| 206 |
+
check("GET /improvement-plan โ 200", r.status_code == 200)
|
| 207 |
+
d = r.json()
|
| 208 |
+
check("Plan has failure_type", "failure_type" in d, d.get("failure_type"))
|
| 209 |
+
check("Plan has what_went_wrong", bool(d.get("what_went_wrong")))
|
| 210 |
+
check("Plan has improved_strategy", bool(d.get("improved_strategy")))
|
| 211 |
+
check("Plan has step_by_step_plan list", isinstance(d.get("step_by_step_plan"), list))
|
| 212 |
+
check("Plan step_by_step_plan not empty", len(d.get("step_by_step_plan", [])) > 0)
|
| 213 |
+
check("Plan has system_prompt_addon", "system_prompt_addon" in d)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 217 |
+
section("11. MULTI-AGENT COMPARISON")
|
| 218 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 219 |
+
|
| 220 |
+
r = requests.post(f"{BASE}/compare-agents?task=task1&agents=test-first,minimal")
|
| 221 |
+
check("POST /compare-agents (2 agents) โ 200", r.status_code == 200, f"status={r.status_code}")
|
| 222 |
+
if r.status_code == 200:
|
| 223 |
+
d = r.json()
|
| 224 |
+
check("Comparison has winner", "winner" in d, d.get("winner"))
|
| 225 |
+
check("Comparison has summary_table", "summary_table" in d)
|
| 226 |
+
check("Summary table has 2 rows", len(d.get("summary_table", [])) == 2,
|
| 227 |
+
str(len(d.get("summary_table", []))))
|
| 228 |
+
check("Each row has score/steps/strategy", all(
|
| 229 |
+
"score" in row and "steps" in row and "strategy" in row
|
| 230 |
+
for row in d.get("summary_table", [])
|
| 231 |
+
))
|
| 232 |
+
check("Comparison has insights", "insights" in d)
|
| 233 |
+
check("Comparison has detailed_runs", len(d.get("detailed_runs", [])) == 2)
|
| 234 |
+
|
| 235 |
+
# Test all 4 agents
|
| 236 |
+
r = requests.post(f"{BASE}/compare-agents?task=task1")
|
| 237 |
+
check("POST /compare-agents (all agents) โ 200", r.status_code == 200)
|
| 238 |
+
if r.status_code == 200:
|
| 239 |
+
d = r.json()
|
| 240 |
+
check("All 4 agents ran", len(d.get("summary_table", [])) == 4,
|
| 241 |
+
f"rows={len(d.get('summary_table',[]))}")
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 245 |
+
section("12. 3D VISUALIZATION DATA")
|
| 246 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 247 |
+
|
| 248 |
+
# Run a full episode first for viz data
|
| 249 |
+
requests.post(f"{BASE}/reset?task=task1")
|
| 250 |
+
requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
|
| 251 |
+
requests.post(f"{BASE}/step", json={"action_type": "submit"})
|
| 252 |
+
|
| 253 |
+
r = requests.get(f"{BASE}/viz-data")
|
| 254 |
+
check("GET /viz-data โ 200", r.status_code == 200)
|
| 255 |
+
d = r.json()
|
| 256 |
+
check("Viz-data has files array", isinstance(d.get("files"), list), f"len={len(d.get('files',[]))}")
|
| 257 |
+
check("Viz-data files > 0", len(d.get("files", [])) > 0)
|
| 258 |
+
check("Viz-data has dependencies", isinstance(d.get("dependencies"), list))
|
| 259 |
+
check("Viz-data has steps", isinstance(d.get("steps"), list))
|
| 260 |
+
check("Viz-data has strategy", "strategy" in d, d.get("strategy"))
|
| 261 |
+
check("Viz-data has final_score", "final_score" in d)
|
| 262 |
+
if d.get("files"):
|
| 263 |
+
f = d["files"][0]
|
| 264 |
+
check("File node has name/type/is_bug_file", all(k in f for k in ["name","type","is_bug_file"]))
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 268 |
+
section("13. INVALID ACTION HANDLING")
|
| 269 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 270 |
+
|
| 271 |
+
requests.post(f"{BASE}/reset?task=task1")
|
| 272 |
+
|
| 273 |
+
# Invalid task
|
| 274 |
+
r = requests.post(f"{BASE}/reset?task=task99")
|
| 275 |
+
check("Invalid task โ 400", r.status_code == 400)
|
| 276 |
+
|
| 277 |
+
# Invalid action type
|
| 278 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "hack_system"})
|
| 279 |
+
check("Invalid action_type โ 400 or 422", r.status_code in (400, 422))
|
| 280 |
+
|
| 281 |
+
# Non-existent file
|
| 282 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": "non_existent.py"})
|
| 283 |
+
check("Read non-existent file โ 200 with error", r.status_code == 200)
|
| 284 |
+
obs = r.json().get("observation", {})
|
| 285 |
+
check("Non-existent file has error in obs", bool(obs.get("last_action_error")), obs.get("last_action_error","")[:60])
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 289 |
+
section("14. SECURITY SCANNING")
|
| 290 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 291 |
+
|
| 292 |
+
requests.post(f"{BASE}/reset?task=task1")
|
| 293 |
+
# Try to write a file with dangerous code
|
| 294 |
+
r = requests.post(f"{BASE}/step", json={
|
| 295 |
+
"action_type": "write_file",
|
| 296 |
+
"path": src_files[0] if src_files else "src/hack.py",
|
| 297 |
+
"content": "import os\nos.system('rm -rf /')\n"
|
| 298 |
+
})
|
| 299 |
+
check("Write dangerous code โ 200", r.status_code == 200)
|
| 300 |
+
if r.status_code == 200:
|
| 301 |
+
info = r.json().get("info", {})
|
| 302 |
+
flags = info.get("security_flags", [])
|
| 303 |
+
check("Security flags populated for os.system", len(flags) > 0, str(flags[:2]))
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 307 |
+
section("15. GRADIO UI ENDPOINTS")
|
| 308 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 309 |
+
|
| 310 |
+
r = requests.get(f"{BASE}/")
|
| 311 |
+
check("GET / (Gradio UI) โ 200", r.status_code == 200)
|
| 312 |
+
check("Response is HTML", "text/html" in r.headers.get("content-type", ""))
|
| 313 |
+
|
| 314 |
+
r = requests.get(f"{BASE}/static/viz3d.html")
|
| 315 |
+
check("GET /static/viz3d.html โ 200", r.status_code == 200)
|
| 316 |
+
check("viz3d.html is HTML", "html" in r.text.lower()[:200])
|
| 317 |
+
check("viz3d.html has Three.js", "three" in r.text.lower())
|
| 318 |
+
check("viz3d.html has timeline-slider", "timeline-slider" in r.text)
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 322 |
+
section("16. TASK2 & TASK3 FULL EPISODE")
|
| 323 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 324 |
+
|
| 325 |
+
for task in ["task2", "task3"]:
|
| 326 |
+
r = requests.post(f"{BASE}/reset?task={task}")
|
| 327 |
+
check(f"{task} reset โ 200", r.status_code == 200)
|
| 328 |
+
obs = r.json()["observation"]
|
| 329 |
+
tree = obs["repo_tree"]
|
| 330 |
+
tf = [f for f in tree if f.startswith("tests/")]
|
| 331 |
+
sf = [f for f in tree if f.startswith("src/")]
|
| 332 |
+
md = [f for f in tree if f.endswith(".md")]
|
| 333 |
+
|
| 334 |
+
if task == "task3" and md:
|
| 335 |
+
requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": md[0]})
|
| 336 |
+
if tf:
|
| 337 |
+
requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": tf[0]})
|
| 338 |
+
if sf:
|
| 339 |
+
requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": sf[0]})
|
| 340 |
+
|
| 341 |
+
r = requests.post(f"{BASE}/step", json={"action_type": "submit"})
|
| 342 |
+
check(f"{task} submit โ done", r.json().get("done") == True)
|
| 343 |
+
|
| 344 |
+
# Verify all intelligence endpoints work post-episode
|
| 345 |
+
r = requests.get(f"{BASE}/classify")
|
| 346 |
+
check(f"{task} /classify works", r.status_code == 200 and "primary_failure" in r.json())
|
| 347 |
+
r = requests.get(f"{BASE}/strategy")
|
| 348 |
+
check(f"{task} /strategy works", r.status_code == 200 and "strategy" in r.json())
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 352 |
+
section("17. CONSISTENCY โ 3 RUNS SAME TASK")
|
| 353 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 354 |
+
|
| 355 |
+
scores = []
|
| 356 |
+
for i in range(3):
|
| 357 |
+
requests.post(f"{BASE}/reset?task=task1")
|
| 358 |
+
r = requests.get(f"{BASE}/state")
|
| 359 |
+
tree = r.json()["observation"]["repo_tree"]
|
| 360 |
+
tf = [f for f in tree if f.startswith("tests/")]
|
| 361 |
+
if tf:
|
| 362 |
+
requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": tf[0]})
|
| 363 |
+
requests.post(f"{BASE}/step", json={"action_type": "submit"})
|
| 364 |
+
metrics = requests.get(f"{BASE}/advanced-metrics").json()
|
| 365 |
+
scores.append(requests.get(f"{BASE}/evaluate").json().get("composite_score", 0))
|
| 366 |
+
|
| 367 |
+
check("3 runs completed", len(scores) == 3, str(scores))
|
| 368 |
+
check("All runs have valid scores", all(0 <= s <= 1 for s in scores), str(scores))
|
| 369 |
+
|
| 370 |
+
# Consistency metric
|
| 371 |
+
r = requests.get(f"{BASE}/advanced-metrics")
|
| 372 |
+
d = r.json()
|
| 373 |
+
check("Consistency score populated after multiple runs", d.get("runs_analyzed", 0) >= 1,
|
| 374 |
+
f"runs={d.get('runs_analyzed')}, consistency={d.get('consistency_score'):.3f}")
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 378 |
+
print(f"\n{'โ'*60}")
|
| 379 |
+
print(f" E2E RESULTS: {PASS} passed | {FAIL} failed | {PASS+FAIL} total")
|
| 380 |
+
print(f" Score: {PASS/(PASS+FAIL)*100:.1f}%")
|
| 381 |
+
print(f"{'โ'*60}")
|
| 382 |
+
|
| 383 |
+
if FAIL > 0:
|
| 384 |
+
print("\nFailed tests:")
|
| 385 |
+
for r in RESULTS:
|
| 386 |
+
if not r["passed"]:
|
| 387 |
+
print(f" โ {r['name']}: {r['detail']}")
|
| 388 |
+
|
| 389 |
+
sys.exit(0 if FAIL == 0 else 1)
|
inference.py
CHANGED
|
@@ -17,9 +17,13 @@ from openai import OpenAI
|
|
| 17 |
import httpx
|
| 18 |
|
| 19 |
# โโ Configuration โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
|
| 24 |
|
| 25 |
MAX_STEPS_PER_TASK = {"task1": 12, "task2": 18, "task3": 22}
|
|
@@ -229,7 +233,7 @@ def run_task(env_client: EnvClient, llm_client: OpenAI, task: str) -> tuple:
|
|
| 229 |
|
| 230 |
def main():
|
| 231 |
env_client = EnvClient(ENV_BASE_URL)
|
| 232 |
-
llm_client = OpenAI(base_url=API_BASE_URL, api_key=
|
| 233 |
|
| 234 |
all_scores = []
|
| 235 |
for task in TASKS:
|
|
|
|
| 17 |
import httpx
|
| 18 |
|
| 19 |
# โโ Configuration โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 20 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
|
| 21 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
|
| 22 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 23 |
+
|
| 24 |
+
# Optional โ if you use from_docker_image():
|
| 25 |
+
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
|
| 26 |
+
|
| 27 |
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
|
| 28 |
|
| 29 |
MAX_STEPS_PER_TASK = {"task1": 12, "task2": 18, "task3": 22}
|
|
|
|
| 233 |
|
| 234 |
def main():
|
| 235 |
env_client = EnvClient(ENV_BASE_URL)
|
| 236 |
+
llm_client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 237 |
|
| 238 |
all_scores = []
|
| 239 |
for task in TASKS:
|
server/analytics_engine.py
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/analytics_engine.py
|
| 2 |
+
"""
|
| 3 |
+
Unified Analytics Engine โ v4.0
|
| 4 |
+
|
| 5 |
+
Aggregates ALL scoring dimensions into a single research-grade report.
|
| 6 |
+
Produces:
|
| 7 |
+
- Reasoning graph (structured DAG of the agent's decision process)
|
| 8 |
+
- Root cause analysis (why the agent failed at every level)
|
| 9 |
+
- Decision efficiency score
|
| 10 |
+
- Overall AI reliability profile (radar chart data)
|
| 11 |
+
- Paper-ready JSON suitable for arXiv submission
|
| 12 |
+
|
| 13 |
+
This module is the "top of the stack" โ it calls all other engines
|
| 14 |
+
and synthesizes their outputs into one authoritative report.
|
| 15 |
+
"""
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
import time
|
| 18 |
+
import json
|
| 19 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 20 |
+
from dataclasses import dataclass, field
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class ReasoningNode:
    """One node in the agent's reconstructed reasoning graph."""
    node_id: str          # Stable identifier; referenced by other nodes' connected_to lists
    step_number: int      # Position of this action within the episode trajectory
    action_type: str      # Environment action taken (e.g. read_file, search_code, submit)
    target: Optional[str]  # file path or search query
    reward: float         # Reward the environment returned for this step — presumably; TODO confirm against step source
    was_useful: bool      # Whether this action contributed to the outcome (see wasted_nodes in ReasoningGraph)
    connected_to: List[str]  # IDs of subsequent nodes that built on this
    label: str  # Human-readable description
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@dataclass
class ReasoningGraph:
    """
    A directed graph reconstruction of the agent's thought process.

    Nodes = actions taken.
    Edges = "built on" relationships (e.g., write followed a read = used info from read).
    Clusters = logical reasoning phases (Exploration, Hypothesis, Verification, Commit)
    """
    nodes: List[ReasoningNode]
    phases: Dict[str, List[str]]  # phase_name → [node_ids]
    critical_path: List[str]      # node_ids on the most impactful path
    wasted_nodes: List[str]       # node_ids that contributed nothing
    optimal_path_comparison: Optional[str]  # What should the agent have done

    def to_dict(self) -> dict:
        """Serialize the graph to a JSON-friendly dict.

        Node rewards are rounded to 3 decimal places; all other fields
        are passed through unchanged.
        """
        return {
            "nodes": [
                {
                    "id": n.node_id, "step": n.step_number,
                    "action": n.action_type, "target": n.target,
                    "reward": round(n.reward, 3), "useful": n.was_useful,
                    "connects_to": n.connected_to, "label": n.label,
                }
                for n in self.nodes
            ],
            "phases": self.phases,
            "critical_path": self.critical_path,
            "wasted_nodes": self.wasted_nodes,
            "optimal_path": self.optimal_path_comparison,
        }
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@dataclass
|
| 70 |
+
class AnalyticsReport:
|
| 71 |
+
"""
|
| 72 |
+
The master analytics report โ synthesizes all evaluation dimensions.
|
| 73 |
+
Paper-ready, structured for research publication or leaderboard submission.
|
| 74 |
+
"""
|
| 75 |
+
report_id: str
|
| 76 |
+
episode_id: str
|
| 77 |
+
task: str
|
| 78 |
+
variant_id: str
|
| 79 |
+
generated_at: float
|
| 80 |
+
|
| 81 |
+
# Dimension scores (0.0โ1.0 each)
|
| 82 |
+
correctness_score: float # Did it fix the bug?
|
| 83 |
+
causal_score: float # Did it understand WHY?
|
| 84 |
+
robustness_score: float # Is the strategy resilient?
|
| 85 |
+
calibration_score: float # Was it appropriately confident?
|
| 86 |
+
reliability_index: float # Weighted multi-dim score
|
| 87 |
+
generalization_hint: float # Based on strategy (robust strategies generalize better)
|
| 88 |
+
decision_efficiency: float # Score / Steps ratio (normalized)
|
| 89 |
+
process_quality: float # How structured was the reasoning process?
|
| 90 |
+
|
| 91 |
+
# Composite
|
| 92 |
+
composite_score: float # Weighted aggregate of all dimensions
|
| 93 |
+
|
| 94 |
+
# Graph
|
| 95 |
+
reasoning_graph: ReasoningGraph
|
| 96 |
+
|
| 97 |
+
# Root cause trees
|
| 98 |
+
failure_root_causes: List[Dict] # Each: {cause, effect, evidence, depth}
|
| 99 |
+
|
| 100 |
+
# Alternative path analysis
|
| 101 |
+
what_agent_did: List[str]
|
| 102 |
+
what_agent_should_have_done: List[str]
|
| 103 |
+
steps_wasted: int
|
| 104 |
+
steps_optimal: int
|
| 105 |
+
|
| 106 |
+
# Profile tags
|
| 107 |
+
profile_tags: List[str] # e.g., ["OVERCONFIDENT", "SHORTCUT_LEARNER", "WELL_CALIBRATED"]
|
| 108 |
+
|
| 109 |
+
# Executive summary
|
| 110 |
+
executive_summary: str
|
| 111 |
+
researcher_notes: str # More technical deep dive
|
| 112 |
+
|
| 113 |
+
def to_dict(self) -> dict:
|
| 114 |
+
return {
|
| 115 |
+
"report_id": self.report_id,
|
| 116 |
+
"episode_id": self.episode_id,
|
| 117 |
+
"task": self.task,
|
| 118 |
+
"variant_id": self.variant_id,
|
| 119 |
+
"generated_at": self.generated_at,
|
| 120 |
+
"dimension_scores": {
|
| 121 |
+
"correctness": round(self.correctness_score, 3),
|
| 122 |
+
"causal_reasoning": round(self.causal_score, 3),
|
| 123 |
+
"robustness": round(self.robustness_score, 3),
|
| 124 |
+
"calibration": round(self.calibration_score, 3),
|
| 125 |
+
"reliability_index": round(self.reliability_index, 3),
|
| 126 |
+
"generalization": round(self.generalization_hint, 3),
|
| 127 |
+
"decision_efficiency": round(self.decision_efficiency, 3),
|
| 128 |
+
"process_quality": round(self.process_quality, 3),
|
| 129 |
+
"composite": round(self.composite_score, 3),
|
| 130 |
+
},
|
| 131 |
+
"reasoning_graph": self.reasoning_graph.to_dict(),
|
| 132 |
+
"failure_root_causes": self.failure_root_causes,
|
| 133 |
+
"alternative_paths": {
|
| 134 |
+
"what_agent_did": self.what_agent_did,
|
| 135 |
+
"optimal_path": self.what_agent_should_have_done,
|
| 136 |
+
"steps_wasted": self.steps_wasted,
|
| 137 |
+
"steps_optimal": self.steps_optimal,
|
| 138 |
+
},
|
| 139 |
+
"profile_tags": self.profile_tags,
|
| 140 |
+
"executive_summary": self.executive_summary,
|
| 141 |
+
"researcher_notes": self.researcher_notes,
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
def render_text(self) -> str:
|
| 145 |
+
"""Render a human-readable analytics report."""
|
| 146 |
+
def bar(v: float, width: int = 20) -> str:
|
| 147 |
+
filled = int(v * width)
|
| 148 |
+
return "โ" * filled + "โ" * (width - filled)
|
| 149 |
+
|
| 150 |
+
lines = [
|
| 151 |
+
f"{'โ'*70}",
|
| 152 |
+
f" ๐ ANALYTICS ENGINE REPORT โ {self.task} | {self.variant_id}",
|
| 153 |
+
f" Episode: {self.episode_id}",
|
| 154 |
+
f"{'โ'*70}",
|
| 155 |
+
"",
|
| 156 |
+
"โโ DIMENSION SCORES โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ",
|
| 157 |
+
f"โ Correctness [{bar(self.correctness_score)}] {self.correctness_score:.3f}",
|
| 158 |
+
f"โ Causal Reasoning [{bar(self.causal_score)}] {self.causal_score:.3f}",
|
| 159 |
+
f"โ Robustness [{bar(self.robustness_score)}] {self.robustness_score:.3f}",
|
| 160 |
+
f"โ Calibration [{bar(self.calibration_score)}] {self.calibration_score:.3f}",
|
| 161 |
+
f"โ Reliability [{bar(self.reliability_index)}] {self.reliability_index:.3f}",
|
| 162 |
+
f"โ Decision Effic. [{bar(self.decision_efficiency)}] {self.decision_efficiency:.3f}",
|
| 163 |
+
f"โ Process Quality [{bar(self.process_quality)}] {self.process_quality:.3f}",
|
| 164 |
+
f"โ {'โ'*60}",
|
| 165 |
+
f"โ COMPOSITE [{bar(self.composite_score)}] {self.composite_score:.3f}",
|
| 166 |
+
"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ",
|
| 167 |
+
"",
|
| 168 |
+
]
|
| 169 |
+
|
| 170 |
+
if self.profile_tags:
|
| 171 |
+
lines.append(f"๐ท๏ธ Profile: {' | '.join(self.profile_tags)}")
|
| 172 |
+
lines.append("")
|
| 173 |
+
|
| 174 |
+
lines += [
|
| 175 |
+
"๐ Executive Summary",
|
| 176 |
+
f" {self.executive_summary}",
|
| 177 |
+
"",
|
| 178 |
+
]
|
| 179 |
+
|
| 180 |
+
if self.failure_root_causes:
|
| 181 |
+
lines.append("๐ฅ Failure Root Cause Analysis")
|
| 182 |
+
for rc in self.failure_root_causes[:3]:
|
| 183 |
+
lines.append(f" Cause: {rc.get('cause')}")
|
| 184 |
+
lines.append(f" Effect: {rc.get('effect')}")
|
| 185 |
+
lines.append(f" Fix: {rc.get('remediation')}")
|
| 186 |
+
lines.append("")
|
| 187 |
+
|
| 188 |
+
lines += [
|
| 189 |
+
"๐บ๏ธ What Agent Did vs Optimal",
|
| 190 |
+
f" Steps taken: {len(self.what_agent_did)} | Steps optimal: {self.steps_optimal} | Wasted: {self.steps_wasted}",
|
| 191 |
+
]
|
| 192 |
+
for a, o in zip(
|
| 193 |
+
self.what_agent_did[:5],
|
| 194 |
+
self.what_agent_should_have_done[:5],
|
| 195 |
+
):
|
| 196 |
+
prefix_a = " โ" if a == o else " โ"
|
| 197 |
+
lines.append(f" Agent: {a}")
|
| 198 |
+
lines.append(f" Optimal: {o}")
|
| 199 |
+
lines.append("")
|
| 200 |
+
|
| 201 |
+
if self.researcher_notes:
|
| 202 |
+
lines += ["๐ฌ Researcher Notes", f" {self.researcher_notes}", ""]
|
| 203 |
+
|
| 204 |
+
lines.append(f"{'โ'*70}")
|
| 205 |
+
return "\n".join(lines)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
class AnalyticsEngine:
|
| 209 |
+
"""
|
| 210 |
+
Master analytics engine โ integrates all evaluation modules.
|
| 211 |
+
|
| 212 |
+
Call .analyze() after an episode to get the full AnalyticsReport.
|
| 213 |
+
"""
|
| 214 |
+
|
| 215 |
+
def analyze(
|
| 216 |
+
self,
|
| 217 |
+
env,
|
| 218 |
+
causal_report=None,
|
| 219 |
+
counterfactual_report=None,
|
| 220 |
+
calibration_report=None,
|
| 221 |
+
advanced_metrics=None,
|
| 222 |
+
failure_report=None,
|
| 223 |
+
strategy_report=None,
|
| 224 |
+
) -> AnalyticsReport:
|
| 225 |
+
"""
|
| 226 |
+
Synthesize all evaluation outputs into one AnalyticsReport.
|
| 227 |
+
Each sub-report is optional โ we gracefully handle None.
|
| 228 |
+
"""
|
| 229 |
+
import uuid
|
| 230 |
+
|
| 231 |
+
traj = env.get_trajectory()
|
| 232 |
+
steps = traj.get("steps", []) if traj else []
|
| 233 |
+
meta = env.variant.meta if env.variant else {}
|
| 234 |
+
episode_id = traj.get("episode_id", "unknown") if traj else "unknown"
|
| 235 |
+
variant_id = traj.get("variant_id", "unknown") if traj else "unknown"
|
| 236 |
+
task = env.current_task or "unknown"
|
| 237 |
+
final_score = env.final_score
|
| 238 |
+
files_read = list(env.files_read)
|
| 239 |
+
files_written = list(env.files_written)
|
| 240 |
+
|
| 241 |
+
# โโ Run sub-engines if reports not provided โโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 242 |
+
if causal_report is None:
|
| 243 |
+
from server.causal_probe import CausalProbe
|
| 244 |
+
causal_report = CausalProbe().probe(
|
| 245 |
+
episode_id, task, steps, meta, files_read, files_written, final_score
|
| 246 |
+
)
|
| 247 |
+
if counterfactual_report is None:
|
| 248 |
+
from server.counterfactual_engine import CounterfactualEngine
|
| 249 |
+
counterfactual_report = CounterfactualEngine().analyze(
|
| 250 |
+
episode_id, task, steps, meta, files_read, files_written, final_score
|
| 251 |
+
)
|
| 252 |
+
if calibration_report is None:
|
| 253 |
+
from server.confidence_calibrator import ConfidenceCalibrator
|
| 254 |
+
calibration_report = ConfidenceCalibrator().calibrate(
|
| 255 |
+
episode_id, task, steps, final_score
|
| 256 |
+
)
|
| 257 |
+
if advanced_metrics is None:
|
| 258 |
+
from server.advanced_metrics import AdvancedMetricsEngine
|
| 259 |
+
advanced_metrics = AdvancedMetricsEngine().compute(
|
| 260 |
+
steps, meta, final_score, files_read, files_written
|
| 261 |
+
)
|
| 262 |
+
if failure_report is None:
|
| 263 |
+
from server.failure_classifier import FailureClassifier
|
| 264 |
+
failure_report = FailureClassifier().classify(
|
| 265 |
+
episode_id, task, steps, meta, files_read, files_written, final_score
|
| 266 |
+
)
|
| 267 |
+
if strategy_report is None:
|
| 268 |
+
from server.strategy_detector import StrategyDetector
|
| 269 |
+
strategy_report = StrategyDetector().detect(
|
| 270 |
+
steps, task, meta, files_read, final_score
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
# โโ Compute derived scores โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 274 |
+
causal_score = causal_report.causal_score
|
| 275 |
+
robustness_score = counterfactual_report.robustness_score
|
| 276 |
+
calibration_score = calibration_report.calibration_score
|
| 277 |
+
reliability_index = advanced_metrics.reliability_index
|
| 278 |
+
correctness_score = final_score
|
| 279 |
+
|
| 280 |
+
# Decision efficiency: correctness per step, normalized
|
| 281 |
+
total_steps = max(len(steps), 1)
|
| 282 |
+
max_steps_possible = meta.get("max_steps", 20)
|
| 283 |
+
decision_efficiency = (
|
| 284 |
+
final_score /
|
| 285 |
+
max(1.0, total_steps / max(1, max_steps_possible / 3))
|
| 286 |
+
)
|
| 287 |
+
decision_efficiency = min(1.0, decision_efficiency)
|
| 288 |
+
|
| 289 |
+
# Process quality: measures structural quality of reasoning process
|
| 290 |
+
read_before_write = causal_report.read_before_write
|
| 291 |
+
tested_before_submit = causal_report.submit_after_test
|
| 292 |
+
used_search = causal_report.search_before_navigate
|
| 293 |
+
full_chain = causal_report.actual_chain_coverage
|
| 294 |
+
process_quality = (
|
| 295 |
+
(0.25 if read_before_write else 0.0) +
|
| 296 |
+
(0.25 if tested_before_submit else 0.0) +
|
| 297 |
+
(0.20 if used_search else 0.0) +
|
| 298 |
+
full_chain * 0.30
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
# Generalization hint from strategy robustness
|
| 302 |
+
strategy_generalization_map = {
|
| 303 |
+
"TARGETED_DEBUGGING": 0.75,
|
| 304 |
+
"SYSTEMATIC_SEARCH": 0.70,
|
| 305 |
+
"SPEC_DRIVEN": 0.80,
|
| 306 |
+
"BRUTE_FORCE": 0.40,
|
| 307 |
+
"RANDOM_EXPLORATION": 0.30,
|
| 308 |
+
"MINIMAL_EFFORT": 0.20,
|
| 309 |
+
}
|
| 310 |
+
generalization_hint = strategy_generalization_map.get(strategy_report.strategy, 0.5)
|
| 311 |
+
generalization_hint = (generalization_hint + robustness_score) / 2
|
| 312 |
+
|
| 313 |
+
# Composite (research-grade weighted aggregate)
|
| 314 |
+
composite_score = (
|
| 315 |
+
correctness_score * 0.30 +
|
| 316 |
+
causal_score * 0.20 +
|
| 317 |
+
robustness_score * 0.15 +
|
| 318 |
+
calibration_score * 0.12 +
|
| 319 |
+
reliability_index * 0.10 +
|
| 320 |
+
process_quality * 0.08 +
|
| 321 |
+
decision_efficiency * 0.05
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
# โโ Build reasoning graph โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 325 |
+
reasoning_graph = self._build_reasoning_graph(steps, meta, files_read, files_written)
|
| 326 |
+
|
| 327 |
+
# โโ Root cause analysis โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 328 |
+
root_causes = self._build_root_cause_tree(
|
| 329 |
+
failure_report, causal_report, calibration_report, final_score
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
# โโ Alternative path analysis โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 333 |
+
what_did = [
|
| 334 |
+
f"{s.get('action_type')} {s.get('action_path') or s.get('action_query') or ''}".strip()
|
| 335 |
+
for s in steps
|
| 336 |
+
]
|
| 337 |
+
optimal = self._compute_optimal_path(meta, files_read, files_written, final_score)
|
| 338 |
+
steps_wasted = max(0, total_steps - len(optimal))
|
| 339 |
+
|
| 340 |
+
# โโ Profile tags โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 341 |
+
tags = []
|
| 342 |
+
if calibration_report.profile.value != "WELL_CALIBRATED":
|
| 343 |
+
tags.append(calibration_report.profile.value)
|
| 344 |
+
if causal_report.shortcut_learning_detected:
|
| 345 |
+
tags.append("SHORTCUT_LEARNER")
|
| 346 |
+
if causal_report.false_confidence_detected:
|
| 347 |
+
tags.append("FALSE_CONFIDENCE")
|
| 348 |
+
if counterfactual_report.brittleness_level.value in ("BRITTLE", "FRAGILE"):
|
| 349 |
+
tags.append(f"BRITTLE_STRATEGY_{counterfactual_report.brittleness_level.value}")
|
| 350 |
+
if causal_report.understanding_level.value == "DEEP":
|
| 351 |
+
tags.append("DEEP_REASONER")
|
| 352 |
+
if strategy_report.strategy == "TARGETED_DEBUGGING":
|
| 353 |
+
tags.append("TARGETED_DEBUGGER")
|
| 354 |
+
if not tags:
|
| 355 |
+
tags.append("TYPICAL")
|
| 356 |
+
|
| 357 |
+
# โโ Executive summary โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 358 |
+
summary_parts = [
|
| 359 |
+
f"Agent scored {final_score:.2f} on {task}.",
|
| 360 |
+
f"Causal understanding: {causal_report.understanding_level.value} ({causal_score:.2f}).",
|
| 361 |
+
f"Strategy: {strategy_report.strategy} (robustness: {robustness_score:.2f}).",
|
| 362 |
+
f"Confidence calibration: {calibration_report.profile.value} (error: {calibration_report.expected_calibration_error:.2f}).",
|
| 363 |
+
f"Composite reliability: {composite_score:.2f}.",
|
| 364 |
+
]
|
| 365 |
+
executive_summary = " ".join(summary_parts)
|
| 366 |
+
|
| 367 |
+
# โโ Researcher notes โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 368 |
+
researcher_notes = (
|
| 369 |
+
f"Observed {total_steps} steps ({steps_wasted} wasted vs estimated {len(optimal)} optimal). "
|
| 370 |
+
f"Chain coverage: {causal_report.actual_chain_coverage:.0%}. "
|
| 371 |
+
f"Chain order score: {causal_report.chain_order_score:.2f}. "
|
| 372 |
+
f"Counterfactual mutations survived: {counterfactual_report.mutations_survived}/{len(counterfactual_report.mutations_tested)}. "
|
| 373 |
+
f"Expected calibration error: {calibration_report.expected_calibration_error:.3f}. "
|
| 374 |
+
f"Decision efficiency: {decision_efficiency:.3f}. "
|
| 375 |
+
f"Process quality: {process_quality:.3f}."
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
+
return AnalyticsReport(
|
| 379 |
+
report_id=f"ar_{uuid.uuid4().hex[:10]}",
|
| 380 |
+
episode_id=episode_id,
|
| 381 |
+
task=task,
|
| 382 |
+
variant_id=variant_id,
|
| 383 |
+
generated_at=time.time(),
|
| 384 |
+
correctness_score=correctness_score,
|
| 385 |
+
causal_score=causal_score,
|
| 386 |
+
robustness_score=robustness_score,
|
| 387 |
+
calibration_score=calibration_score,
|
| 388 |
+
reliability_index=reliability_index,
|
| 389 |
+
generalization_hint=generalization_hint,
|
| 390 |
+
decision_efficiency=decision_efficiency,
|
| 391 |
+
process_quality=process_quality,
|
| 392 |
+
composite_score=composite_score,
|
| 393 |
+
reasoning_graph=reasoning_graph,
|
| 394 |
+
failure_root_causes=root_causes,
|
| 395 |
+
what_agent_did=what_did,
|
| 396 |
+
what_agent_should_have_done=optimal,
|
| 397 |
+
steps_wasted=steps_wasted,
|
| 398 |
+
steps_optimal=len(optimal),
|
| 399 |
+
profile_tags=tags,
|
| 400 |
+
executive_summary=executive_summary,
|
| 401 |
+
researcher_notes=researcher_notes,
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
def _build_reasoning_graph(
|
| 405 |
+
self,
|
| 406 |
+
steps: List[dict],
|
| 407 |
+
meta: dict,
|
| 408 |
+
files_read: List[str],
|
| 409 |
+
files_written: List[str],
|
| 410 |
+
) -> ReasoningGraph:
|
| 411 |
+
"""Build a DAG from the trajectory steps."""
|
| 412 |
+
bug_files = set(meta.get("bug_files", []) + meta.get("files_to_implement", []))
|
| 413 |
+
|
| 414 |
+
nodes: List[ReasoningNode] = []
|
| 415 |
+
phases: Dict[str, List[str]] = {
|
| 416 |
+
"Exploration": [], "Hypothesis": [], "Verification": [], "Commit": []
|
| 417 |
+
}
|
| 418 |
+
files_read_set = set()
|
| 419 |
+
last_useful_node_id: Optional[str] = None
|
| 420 |
+
all_node_ids: List[str] = []
|
| 421 |
+
|
| 422 |
+
for s in steps:
|
| 423 |
+
node_id = f"n{s.get('step_number', len(nodes)+1)}"
|
| 424 |
+
atype = s.get("action_type", "unknown")
|
| 425 |
+
target = s.get("action_path") or s.get("action_query")
|
| 426 |
+
reward = s.get("reward", 0.0)
|
| 427 |
+
|
| 428 |
+
# Determine usefulness
|
| 429 |
+
was_useful = (
|
| 430 |
+
reward > 0 or
|
| 431 |
+
(atype == "read_file" and target in bug_files) or
|
| 432 |
+
(atype == "search_code") or
|
| 433 |
+
(atype == "run_tests") or
|
| 434 |
+
(atype == "submit" and reward > 0)
|
| 435 |
+
)
|
| 436 |
+
|
| 437 |
+
# Determine phase
|
| 438 |
+
if atype in ("read_file", "search_code"):
|
| 439 |
+
phase = "Exploration"
|
| 440 |
+
elif atype == "write_file":
|
| 441 |
+
phase = "Hypothesis"
|
| 442 |
+
elif atype == "run_tests":
|
| 443 |
+
phase = "Verification"
|
| 444 |
+
else:
|
| 445 |
+
phase = "Commit"
|
| 446 |
+
|
| 447 |
+
# Build label
|
| 448 |
+
short_target = (target.split("/")[-1] if target else "")[:20] if target else ""
|
| 449 |
+
label = f"{atype}({short_target})" if short_target else atype
|
| 450 |
+
|
| 451 |
+
# Connections: link to previous useful node
|
| 452 |
+
connects_to = [last_useful_node_id] if last_useful_node_id and was_useful else []
|
| 453 |
+
connects_to = [c for c in connects_to if c]
|
| 454 |
+
|
| 455 |
+
node = ReasoningNode(
|
| 456 |
+
node_id=node_id,
|
| 457 |
+
step_number=s.get("step_number", len(nodes) + 1),
|
| 458 |
+
action_type=atype,
|
| 459 |
+
target=target,
|
| 460 |
+
reward=reward,
|
| 461 |
+
was_useful=was_useful,
|
| 462 |
+
connected_to=connects_to,
|
| 463 |
+
label=label,
|
| 464 |
+
)
|
| 465 |
+
nodes.append(node)
|
| 466 |
+
phases[phase].append(node_id)
|
| 467 |
+
all_node_ids.append(node_id)
|
| 468 |
+
if was_useful:
|
| 469 |
+
last_useful_node_id = node_id
|
| 470 |
+
|
| 471 |
+
# Critical path: nodes with positive reward or that led to the final submit
|
| 472 |
+
critical_path = [n.node_id for n in nodes if n.reward > 0 or n.action_type == "submit"]
|
| 473 |
+
wasted_nodes = [n.node_id for n in nodes if not n.was_useful and n.action_type != "submit"]
|
| 474 |
+
|
| 475 |
+
# Optimal path comparison
|
| 476 |
+
optimal_actions = []
|
| 477 |
+
test_files = [f for f in (list(files_read) + list(bug_files)) if "test" in f.lower()]
|
| 478 |
+
src_files = [f for f in (list(files_read) + list(bug_files)) if f not in test_files]
|
| 479 |
+
for tf in test_files[:1]:
|
| 480 |
+
optimal_actions.append(f"read_file({tf.split('/')[-1]})")
|
| 481 |
+
for sf in src_files[:2]:
|
| 482 |
+
optimal_actions.append(f"read_file({sf.split('/')[-1]})")
|
| 483 |
+
optimal_actions += ["write_file(src)", "run_tests", "submit"]
|
| 484 |
+
optimal_path = " โ ".join(optimal_actions)
|
| 485 |
+
|
| 486 |
+
return ReasoningGraph(
|
| 487 |
+
nodes=nodes,
|
| 488 |
+
phases={k: v for k, v in phases.items() if v},
|
| 489 |
+
critical_path=critical_path,
|
| 490 |
+
wasted_nodes=wasted_nodes,
|
| 491 |
+
optimal_path_comparison=optimal_path,
|
| 492 |
+
)
|
| 493 |
+
|
| 494 |
+
def _build_root_cause_tree(
|
| 495 |
+
self, failure_report, causal_report, calibration_report, final_score: float
|
| 496 |
+
) -> List[Dict]:
|
| 497 |
+
"""Build a structured root cause tree."""
|
| 498 |
+
causes = []
|
| 499 |
+
|
| 500 |
+
if failure_report and failure_report.failures:
|
| 501 |
+
for f in failure_report.failures[:3]:
|
| 502 |
+
causes.append({
|
| 503 |
+
"depth": "primary",
|
| 504 |
+
"cause": f.failure_type if hasattr(f, "failure_type") else str(f),
|
| 505 |
+
"effect": f.evidence if hasattr(f, "evidence") else "unknown",
|
| 506 |
+
"remediation": f.remediation if hasattr(f, "remediation") else "See improvement plan",
|
| 507 |
+
})
|
| 508 |
+
elif final_score < 0.5:
|
| 509 |
+
causes.append({
|
| 510 |
+
"depth": "primary",
|
| 511 |
+
"cause": failure_report.primary_failure if failure_report else "LOW_SCORE",
|
| 512 |
+
"effect": f"Final score only {final_score:.2f} โ bug not adequately fixed",
|
| 513 |
+
"remediation": "Use test-first navigation and verify with run_tests",
|
| 514 |
+
})
|
| 515 |
+
|
| 516 |
+
if causal_report and causal_report.guessing_indicators:
|
| 517 |
+
for ind in causal_report.guessing_indicators[:2]:
|
| 518 |
+
causes.append({
|
| 519 |
+
"depth": "secondary",
|
| 520 |
+
"cause": "CAUSAL_GAP",
|
| 521 |
+
"effect": ind,
|
| 522 |
+
"remediation": causal_report.recommendations[0] if causal_report.recommendations else "",
|
| 523 |
+
})
|
| 524 |
+
|
| 525 |
+
if calibration_report and calibration_report.profile.value == "OVERCONFIDENT":
|
| 526 |
+
causes.append({
|
| 527 |
+
"depth": "secondary",
|
| 528 |
+
"cause": "OVERCONFIDENCE",
|
| 529 |
+
"effect": f"Inferred confidence {calibration_report.inferred_confidence:.2f} vs actual {calibration_report.actual_performance:.2f}",
|
| 530 |
+
"remediation": "Read more before committing. Verify with tests.",
|
| 531 |
+
})
|
| 532 |
+
|
| 533 |
+
return causes
|
| 534 |
+
|
| 535 |
+
def _compute_optimal_path(
|
| 536 |
+
self, meta: dict, files_read: List[str], files_written: List[str], score: float
|
| 537 |
+
) -> List[str]:
|
| 538 |
+
"""Suggest what the optimal action sequence would have been."""
|
| 539 |
+
test_files = [f for f in files_read if "test" in f.lower()]
|
| 540 |
+
bug_files = meta.get("bug_files", []) or meta.get("files_to_implement", [])
|
| 541 |
+
|
| 542 |
+
path = []
|
| 543 |
+
for tf in (test_files or ["tests/test_main.py"])[:1]:
|
| 544 |
+
path.append(f"read_file {tf}")
|
| 545 |
+
for bf in (bug_files or ["src/main.py"])[:2]:
|
| 546 |
+
path.append(f"read_file {bf}")
|
| 547 |
+
path.append("search_code <function_name>")
|
| 548 |
+
path.append("write_file <targeted_fix>")
|
| 549 |
+
path.append("run_tests")
|
| 550 |
+
path.append("submit")
|
| 551 |
+
return path
|
server/app.py
CHANGED
|
@@ -1,12 +1,14 @@
|
|
| 1 |
# server/app.py
|
| 2 |
"""
|
| 3 |
-
FastAPI server โ
|
| 4 |
|
| 5 |
Core endpoints: POST /reset, POST /step, GET /state, GET /health
|
| 6 |
Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
|
| 7 |
Control endpoints: POST /fault-config
|
| 8 |
-
Intelligence
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
from fastapi import FastAPI, HTTPException
|
| 12 |
from fastapi.staticfiles import StaticFiles
|
|
@@ -337,3 +339,146 @@ async def get_viz_data():
|
|
| 337 |
"dependencies": deps,
|
| 338 |
"steps": steps_data,
|
| 339 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# server/app.py
|
| 2 |
"""
|
| 3 |
+
FastAPI server โ v4.0
|
| 4 |
|
| 5 |
Core endpoints: POST /reset, POST /step, GET /state, GET /health
|
| 6 |
Evaluation endpoints: GET /trajectory, GET /evaluate, GET /metrics
|
| 7 |
Control endpoints: POST /fault-config
|
| 8 |
+
Intelligence (v3): GET /classify, GET /strategy, GET /advanced-metrics,
|
| 9 |
+
POST /compare-agents, GET /improvement-plan, GET /viz-data
|
| 10 |
+
Research (v4 NEW): GET /causal-probe, GET /counterfactual, GET /confidence,
|
| 11 |
+
POST /benchmark, GET /analytics
|
| 12 |
"""
|
| 13 |
from fastapi import FastAPI, HTTPException
|
| 14 |
from fastapi.staticfiles import StaticFiles
|
|
|
|
| 339 |
"dependencies": deps,
|
| 340 |
"steps": steps_data,
|
| 341 |
}
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
# โโ Research Endpoints (NEW in v4) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 345 |
+
|
| 346 |
+
from .causal_probe import CausalProbe
|
| 347 |
+
from .counterfactual_engine import CounterfactualEngine
|
| 348 |
+
from .confidence_calibrator import ConfidenceCalibrator
|
| 349 |
+
from .benchmark_runner import BenchmarkRunner
|
| 350 |
+
from .analytics_engine import AnalyticsEngine
|
| 351 |
+
|
| 352 |
+
_causal = CausalProbe()
|
| 353 |
+
_counter = CounterfactualEngine()
|
| 354 |
+
_calibrator = ConfidenceCalibrator()
|
| 355 |
+
_benchmark = BenchmarkRunner()
|
| 356 |
+
_analytics = AnalyticsEngine()
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
@app.get("/causal-probe")
|
| 360 |
+
async def causal_probe():
|
| 361 |
+
"""
|
| 362 |
+
Causal reasoning probe โ did the agent understand WHY the bug exists?
|
| 363 |
+
Returns: causal_score, understanding_level, chain_coverage, shortcut_detection.
|
| 364 |
+
"""
|
| 365 |
+
traj = env.get_trajectory()
|
| 366 |
+
if not traj:
|
| 367 |
+
return {"error": "No trajectory available."}
|
| 368 |
+
steps = traj.get("steps", [])
|
| 369 |
+
meta = env.variant.meta if env.variant else {}
|
| 370 |
+
report = _causal.probe(
|
| 371 |
+
episode_id=traj.get("episode_id", ""),
|
| 372 |
+
task=env.current_task or "unknown",
|
| 373 |
+
trajectory_steps=steps,
|
| 374 |
+
variant_meta=meta,
|
| 375 |
+
files_read=list(env.files_read),
|
| 376 |
+
files_written=list(env.files_written),
|
| 377 |
+
final_score=env.final_score,
|
| 378 |
+
)
|
| 379 |
+
return report.to_dict()
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
@app.get("/counterfactual")
|
| 383 |
+
async def counterfactual():
|
| 384 |
+
"""
|
| 385 |
+
Counterfactual robustness test โ is the agent's strategy brittle?
|
| 386 |
+
Simulates 6 mutations and measures how many the strategy survives.
|
| 387 |
+
Returns: robustness_score, brittleness_level, mutations analysis.
|
| 388 |
+
"""
|
| 389 |
+
traj = env.get_trajectory()
|
| 390 |
+
if not traj:
|
| 391 |
+
return {"error": "No trajectory available."}
|
| 392 |
+
steps = traj.get("steps", [])
|
| 393 |
+
meta = env.variant.meta if env.variant else {}
|
| 394 |
+
report = _counter.analyze(
|
| 395 |
+
episode_id=traj.get("episode_id", ""),
|
| 396 |
+
task=env.current_task or "unknown",
|
| 397 |
+
trajectory_steps=steps,
|
| 398 |
+
variant_meta=meta,
|
| 399 |
+
files_read=list(env.files_read),
|
| 400 |
+
files_written=list(env.files_written),
|
| 401 |
+
final_score=env.final_score,
|
| 402 |
+
)
|
| 403 |
+
return report.to_dict()
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
@app.get("/confidence")
|
| 407 |
+
async def confidence_calibration():
|
| 408 |
+
"""
|
| 409 |
+
Confidence calibration โ is the agent appropriately confident?
|
| 410 |
+
Infers confidence from behavioral proxies and compares to actual performance.
|
| 411 |
+
Returns: profile (WELL_CALIBRATED|OVERCONFIDENT|UNDERCONFIDENT), calibration_score.
|
| 412 |
+
"""
|
| 413 |
+
traj = env.get_trajectory()
|
| 414 |
+
if not traj:
|
| 415 |
+
return {"error": "No trajectory available."}
|
| 416 |
+
steps = traj.get("steps", [])
|
| 417 |
+
report = _calibrator.calibrate(
|
| 418 |
+
episode_id=traj.get("episode_id", ""),
|
| 419 |
+
task=env.current_task or "unknown",
|
| 420 |
+
trajectory_steps=steps,
|
| 421 |
+
final_score=env.final_score,
|
| 422 |
+
)
|
| 423 |
+
return report.to_dict()
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
@app.post("/benchmark")
|
| 427 |
+
async def run_benchmark(
|
| 428 |
+
tasks: str = "task1,task2",
|
| 429 |
+
agents: str = "all",
|
| 430 |
+
benchmark_id: str = None,
|
| 431 |
+
):
|
| 432 |
+
"""
|
| 433 |
+
Automated benchmark leaderboard.
|
| 434 |
+
Runs all selected agents ร tasks. Returns ranked leaderboard.
|
| 435 |
+
tasks: comma-separated task IDs. agents: "all" or comma-separated strategy names.
|
| 436 |
+
"""
|
| 437 |
+
task_list = [t.strip() for t in tasks.split(",") if t.strip()]
|
| 438 |
+
valid_tasks = ["task1", "task2", "task3"]
|
| 439 |
+
task_list = [t for t in task_list if t in valid_tasks]
|
| 440 |
+
if not task_list:
|
| 441 |
+
raise HTTPException(status_code=400, detail=f"tasks must be one of {valid_tasks}")
|
| 442 |
+
|
| 443 |
+
agent_list = None if agents == "all" else [a.strip() for a in agents.split(",")]
|
| 444 |
+
|
| 445 |
+
try:
|
| 446 |
+
report = _benchmark.run(env, tasks=task_list, agents=agent_list, benchmark_id=benchmark_id)
|
| 447 |
+
return report.to_dict()
|
| 448 |
+
except Exception as e:
|
| 449 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
@app.get("/analytics")
|
| 453 |
+
async def get_analytics():
|
| 454 |
+
"""
|
| 455 |
+
Unified research-grade analytics report.
|
| 456 |
+
Synthesizes all v3+v4 evaluation dimensions into one report with:
|
| 457 |
+
reasoning graph, root cause tree, alternative paths, profile tags,
|
| 458 |
+
composite score, executive summary, researcher notes.
|
| 459 |
+
"""
|
| 460 |
+
traj = env.get_trajectory()
|
| 461 |
+
if not traj:
|
| 462 |
+
return {"error": "No trajectory available."}
|
| 463 |
+
try:
|
| 464 |
+
report = _analytics.analyze(env)
|
| 465 |
+
return report.to_dict()
|
| 466 |
+
except Exception as e:
|
| 467 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
@app.get("/health")
|
| 471 |
+
async def health_v4():
|
| 472 |
+
return {
|
| 473 |
+
"status": "ok",
|
| 474 |
+
"environment": "codebase-nav-env",
|
| 475 |
+
"version": "4.0.0",
|
| 476 |
+
"endpoints": [
|
| 477 |
+
"/reset", "/step", "/state", "/health",
|
| 478 |
+
"/trajectory", "/evaluate", "/metrics", "/fault-config",
|
| 479 |
+
"/classify", "/strategy", "/advanced-metrics",
|
| 480 |
+
"/improvement-plan", "/compare-agents", "/viz-data",
|
| 481 |
+
"/causal-probe", "/counterfactual", "/confidence",
|
| 482 |
+
"/benchmark", "/analytics",
|
| 483 |
+
],
|
| 484 |
+
}
|
server/benchmark_runner.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/benchmark_runner.py
|
| 2 |
+
"""
|
| 3 |
+
Benchmark Runner + Leaderboard โ v4.0
|
| 4 |
+
|
| 5 |
+
Automatically runs ALL tasks ร selected agent configurations and generates
|
| 6 |
+
a research-grade leaderboard output with per-task, per-strategy breakdowns.
|
| 7 |
+
|
| 8 |
+
Unlike existing benchmarks (SWE-bench, HumanEval) which require manual setup,
|
| 9 |
+
this runs end-to-end in-process with deterministic strategies.
|
| 10 |
+
|
| 11 |
+
Output format:
|
| 12 |
+
- Leaderboard table (ranked by composite score)
|
| 13 |
+
- Per-task breakdown
|
| 14 |
+
- Per-failure-type breakdown
|
| 15 |
+
- Generalization score (variance across tasks)
|
| 16 |
+
- Robustness score (from counterfactual engine)
|
| 17 |
+
- A "benchmark JSON" suitable for publishing or comparing systems
|
| 18 |
+
"""
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
import time
|
| 21 |
+
import json
|
| 22 |
+
from typing import List, Dict, Any, Optional
|
| 23 |
+
from dataclasses import dataclass, field
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
|
| 27 |
+
class BenchmarkResult:
|
| 28 |
+
"""Result of running one agent on one task variant."""
|
| 29 |
+
agent_name: str
|
| 30 |
+
task: str
|
| 31 |
+
variant_id: str
|
| 32 |
+
final_score: float
|
| 33 |
+
total_steps: int
|
| 34 |
+
cumulative_reward: float
|
| 35 |
+
duration_seconds: float
|
| 36 |
+
strategy: str
|
| 37 |
+
failure_type: str
|
| 38 |
+
reliability_index: float
|
| 39 |
+
causal_score: float
|
| 40 |
+
robustness_score: float
|
| 41 |
+
calibration_score: float
|
| 42 |
+
action_sequence: List[str]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass
class AgentBenchmarkSummary:
    """Aggregated results for one agent across all tasks."""
    agent_name: str
    tasks_run: int
    mean_score: float
    std_score: float
    generalization_score: float  # 1 - std (lower variance = more generalizable)
    mean_steps: float
    best_task: str
    worst_task: str
    mean_reliability: float
    mean_causal_score: float
    mean_robustness_score: float
    mean_calibration_score: float
    dominant_strategy: str
    dominant_failure: str
    composite_rank_score: float  # Weighted final score for leaderboard
    per_task_scores: Dict[str, float]

    def to_dict(self) -> dict:
        """Serialize the summary to a JSON-friendly dict, rounding floats for display."""
        score_block = {
            "mean": round(self.mean_score, 3),
            "std": round(self.std_score, 3),
            "generalization": round(self.generalization_score, 3),
            "reliability": round(self.mean_reliability, 3),
            "causal_reasoning": round(self.mean_causal_score, 3),
            "robustness": round(self.mean_robustness_score, 3),
            "calibration": round(self.mean_calibration_score, 3),
            "composite": round(self.composite_rank_score, 3),
        }
        rounded_per_task = {
            task: round(score, 3) for task, score in self.per_task_scores.items()
        }
        payload = {
            "agent_name": self.agent_name,
            "tasks_run": self.tasks_run,
        }
        payload["scores"] = score_block
        payload["efficiency"] = {"mean_steps": round(self.mean_steps, 1)}
        payload["behavior"] = {
            "dominant_strategy": self.dominant_strategy,
            "dominant_failure": self.dominant_failure,
        }
        payload["per_task_scores"] = rounded_per_task
        payload["best_task"] = self.best_task
        payload["worst_task"] = self.worst_task
        return payload
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@dataclass
class LeaderboardReport:
    """Full benchmark leaderboard.

    Holds the per-agent summaries (``rankings``, assumed pre-sorted
    best-first by composite score) plus the raw per-episode results,
    and renders them as JSON or an ASCII table.
    """
    benchmark_id: str
    tasks_evaluated: List[str]
    agents_evaluated: List[str]
    total_episodes: int
    run_duration_seconds: float
    rankings: List[AgentBenchmarkSummary]
    raw_results: List[BenchmarkResult]

    def to_dict(self) -> dict:
        """Serialize the report (leaderboard, winner, insights) to a JSON-friendly dict."""
        return {
            "benchmark_id": self.benchmark_id,
            "tasks_evaluated": self.tasks_evaluated,
            "agents_evaluated": self.agents_evaluated,
            "total_episodes": self.total_episodes,
            "run_duration_seconds": round(self.run_duration_seconds, 2),
            "leaderboard": [r.to_dict() for r in self.rankings],
            # rankings are sorted best-first, so the winner is the head element.
            "winner": self.rankings[0].agent_name if self.rankings else "none",
            "insights": self._generate_insights(),
        }

    def _generate_insights(self) -> List[str]:
        """Derive human-readable observations from the ranked summaries."""
        if not self.rankings:
            return []
        insights = []
        top = self.rankings[0]
        bottom = self.rankings[-1]

        # Flag a large spread between best and worst agent.
        if top.composite_rank_score - bottom.composite_rank_score > 0.2:
            insights.append(
                f"Large performance gap: '{top.agent_name}' ({top.composite_rank_score:.2f}) "
                f"vs '{bottom.agent_name}' ({bottom.composite_rank_score:.2f})"
            )
        if top.generalization_score > 0.7:
            insights.append(
                f"'{top.agent_name}' shows strong generalization "
                f"(std={top.std_score:.3f} across {top.tasks_run} tasks)"
            )
        for r in self.rankings:
            if r.mean_causal_score > 0.6:
                insights.append(
                    f"'{r.agent_name}' demonstrated genuine causal reasoning "
                    f"(causal_score={r.mean_causal_score:.2f})"
                )
        strategies = [r.dominant_strategy for r in self.rankings]
        if len(set(strategies)) > 1:
            best_strategy = self.rankings[0].dominant_strategy
            insights.append(
                f"Strategy '{best_strategy}' produced the highest composite score."
            )
        return insights

    def render_table(self) -> str:
        """Render ASCII leaderboard table."""
        if not self.rankings:
            return "No results."

        lines = [
            f"{'═'*90}",
            f" 🏆 BENCHMARK LEADERBOARD — {self.benchmark_id}",
            f" Tasks: {', '.join(self.tasks_evaluated)} | Agents: {len(self.agents_evaluated)} | Episodes: {self.total_episodes}",
            f"{'═'*90}",
            f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Causal':<8} {'Robust':<8} {'Calibr':<8} {'Genrz':<8} {'Steps':<7} {'Strategy'}",
            f"{'═'*90}",
        ]
        for i, r in enumerate(self.rankings):
            medal = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else f" #{i+1}"
            lines.append(
                f"{medal:<5} {r.agent_name:<16} {r.mean_score:<8.3f} "
                f"{r.mean_causal_score:<8.3f} {r.mean_robustness_score:<8.3f} "
                f"{r.mean_calibration_score:<8.3f} {r.generalization_score:<8.3f} "
                f"{r.mean_steps:<7.1f} {r.dominant_strategy}"
            )
        lines.append(f"{'═'*90}")

        lines.append("\n📊 Per-Task Breakdown:")
        for r in self.rankings:
            task_str = " | ".join(f"{t}: {s:.2f}" for t, s in sorted(r.per_task_scores.items()))
            lines.append(f"  {r.agent_name:<16} {task_str}")

        # Compute insights once (previously generated twice: once for the
        # emptiness check and again for rendering).
        insights = self._generate_insights()
        if insights:
            lines.append("\n💡 Insights:")
            lines.extend(f"  → {i}" for i in insights)

        return "\n".join(lines)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
class BenchmarkRunner:
    """
    Automated benchmark runner.

    Runs each agent in AGENT_CONFIGS across each task, collecting:
    - Final score
    - All intelligence metrics (causal, counterfactual, confidence)
    - Strategy and failure classification
    - Reliability index

    Then generates a ranked leaderboard.
    """

    def run(
        self,
        env,
        tasks: Optional[List[str]] = None,
        agents: Optional[List[str]] = None,
        benchmark_id: Optional[str] = None,
    ) -> LeaderboardReport:
        """Run the full benchmark.

        Args:
            env: The environment instance (reset/step/trajectory interface).
            tasks: Task names to evaluate; defaults to task1..task3.
            agents: Optional subset of agent names to run; defaults to all.
            benchmark_id: Optional identifier; a random one is generated if absent.

        Returns:
            A LeaderboardReport with ranked agent summaries and raw results.
        """
        import uuid
        # Imported lazily to avoid circular imports at module load time.
        # NOTE(review): RepoAction was previously imported here too but is
        # only used inside _run_episode, which imports it itself.
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine
        from server.causal_probe import CausalProbe
        from server.counterfactual_engine import CounterfactualEngine
        from server.confidence_calibrator import ConfidenceCalibrator

        benchmark_id = benchmark_id or f"bench_{uuid.uuid4().hex[:8]}"
        tasks = tasks or ["task1", "task2", "task3"]
        agent_configs = self._get_agent_configs()
        if agents:
            agent_configs = {k: v for k, v in agent_configs.items() if k in agents}

        clf = FailureClassifier()
        det = StrategyDetector()
        adv = AdvancedMetricsEngine()
        causal = CausalProbe()
        counter = CounterfactualEngine()
        calibrator = ConfidenceCalibrator()

        start_time = time.time()
        all_results: List[BenchmarkResult] = []

        for task in tasks:
            for agent_name, agent_fn in agent_configs.items():
                try:
                    result = self._run_episode(
                        env, task, agent_name, agent_fn,
                        clf, det, adv, causal, counter, calibrator
                    )
                    all_results.append(result)
                except Exception:
                    # Don't crash the whole benchmark on one failure;
                    # record a zeroed sentinel result instead.
                    all_results.append(BenchmarkResult(
                        agent_name=agent_name, task=task, variant_id="error",
                        final_score=0.0, total_steps=0, cumulative_reward=0.0,
                        duration_seconds=0.0, strategy="ERROR", failure_type="BENCHMARK_ERROR",
                        reliability_index=0.0, causal_score=0.0, robustness_score=0.0,
                        calibration_score=0.0, action_sequence=[],
                    ))

        total_duration = time.time() - start_time
        rankings = self._compute_rankings(all_results, tasks)

        return LeaderboardReport(
            benchmark_id=benchmark_id,
            tasks_evaluated=tasks,
            agents_evaluated=list(agent_configs.keys()),
            total_episodes=len(all_results),
            run_duration_seconds=total_duration,
            rankings=rankings,
            raw_results=all_results,
        )

    def _run_episode(
        self, env, task, agent_name, agent_fn,
        clf, det, adv, causal, counter, calibrator
    ) -> BenchmarkResult:
        """Run one agent on one task and collect all per-episode metrics."""
        from server.models import RepoAction

        reset_result = env.reset(task=task)
        obs = reset_result.observation
        variant_id = reset_result.info.get("variant_id", "unknown")
        # Mutable scratch space shared across the agent's steps.
        context = {}

        obs_dict = obs.model_dump()
        start = time.time()
        cumulative_reward = 0.0
        files_read, files_written, action_sequence = [], [], []
        max_steps = 15

        for step_num in range(1, max_steps + 1):
            if env.done:
                break
            action_dict = agent_fn(obs_dict, step_num, context)
            action = RepoAction(
                action_type=action_dict.get("action_type", "submit"),
                path=action_dict.get("path"),
                query=action_dict.get("query"),
            )
            result = env.step(action)
            obs = result.observation
            obs_dict = obs.model_dump()
            cumulative_reward += result.reward
            action_sequence.append(action.action_type)
            if action.path and action.action_type == "read_file":
                files_read.append(action.path)
            if action.path and action.action_type == "write_file":
                files_written.append(action.path)
            if result.done:
                break

        # Force a submit if the agent ran out of steps without finishing.
        if not env.done:
            r = env.step(RepoAction(action_type="submit"))
            cumulative_reward += r.reward
            action_sequence.append("submit")

        duration = time.time() - start
        final_score = env.final_score
        traj = env.get_trajectory()
        steps = traj.get("steps", []) if traj else []
        meta = env.variant.meta if env.variant else {}
        # Hoisted: previously recomputed for each analyzer call below.
        episode_id = traj.get("episode_id", "") if traj else ""

        # Intelligence metrics
        fail_r = clf.classify(
            episode_id, task,
            steps, meta, files_read, files_written, final_score
        )
        strat_r = det.detect(steps, task, meta, files_read, final_score)
        adv_r = adv.compute(steps, meta, final_score, files_read, files_written)
        causal_r = causal.probe(
            episode_id, task,
            steps, meta, files_read, files_written, final_score
        )
        counter_r = counter.analyze(
            episode_id, task,
            steps, meta, files_read, files_written, final_score
        )
        calib_r = calibrator.calibrate(
            episode_id, task,
            steps, final_score,
        )

        return BenchmarkResult(
            agent_name=agent_name,
            task=task,
            variant_id=variant_id,
            final_score=final_score,
            total_steps=len(action_sequence),
            cumulative_reward=cumulative_reward,
            duration_seconds=duration,
            strategy=strat_r.strategy,
            failure_type=fail_r.primary_failure,
            reliability_index=adv_r.reliability_index,
            causal_score=causal_r.causal_score,
            robustness_score=counter_r.robustness_score,
            calibration_score=calib_r.calibration_score,
            action_sequence=action_sequence,
        )

    def _compute_rankings(
        self, results: List[BenchmarkResult], tasks: List[str]
    ) -> List[AgentBenchmarkSummary]:
        """Aggregate raw results per agent and sort by composite score.

        Note: ``tasks`` is currently unused; kept for interface stability.
        """
        import math
        from collections import Counter

        # Group by agent
        agent_results: Dict[str, List[BenchmarkResult]] = {}
        for r in results:
            agent_results.setdefault(r.agent_name, []).append(r)

        summaries = []
        for agent_name, agent_res in agent_results.items():
            scores = [r.final_score for r in agent_res]
            mean_score = sum(scores) / len(scores)
            if len(scores) > 1:
                variance = sum((s - mean_score) ** 2 for s in scores) / len(scores)
                std_score = math.sqrt(variance)
            else:
                std_score = 0.0
            generalization_score = max(0.0, 1.0 - std_score)

            per_task = {r.task: r.final_score for r in agent_res}
            strategies = Counter(r.strategy for r in agent_res)
            failures = Counter(r.failure_type for r in agent_res)

            mean_steps = sum(r.total_steps for r in agent_res) / len(agent_res)
            mean_reliability = sum(r.reliability_index for r in agent_res) / len(agent_res)
            mean_causal = sum(r.causal_score for r in agent_res) / len(agent_res)
            mean_robustness = sum(r.robustness_score for r in agent_res) / len(agent_res)
            mean_calibration = sum(r.calibration_score for r in agent_res) / len(agent_res)

            # Composite leaderboard score — weighted across all dimensions
            # (weights sum to 1.0).
            composite = (
                mean_score * 0.35 +
                mean_causal * 0.20 +
                mean_robustness * 0.15 +
                mean_calibration * 0.15 +
                generalization_score * 0.15
            )

            best_task = max(per_task, key=per_task.get)
            worst_task = min(per_task, key=per_task.get)

            summaries.append(AgentBenchmarkSummary(
                agent_name=agent_name,
                tasks_run=len(agent_res),
                mean_score=mean_score,
                std_score=std_score,
                generalization_score=generalization_score,
                mean_steps=mean_steps,
                best_task=best_task,
                worst_task=worst_task,
                mean_reliability=mean_reliability,
                mean_causal_score=mean_causal,
                mean_robustness_score=mean_robustness,
                mean_calibration_score=mean_calibration,
                dominant_strategy=strategies.most_common(1)[0][0],
                dominant_failure=failures.most_common(1)[0][0],
                composite_rank_score=composite,
                per_task_scores=per_task,
            ))

        summaries.sort(key=lambda s: -s.composite_rank_score)
        return summaries

    def _get_agent_configs(self) -> Dict:
        """Reuse built-in strategies from multi_agent.py."""
        from server.multi_agent import MultiAgentComparison
        return MultiAgentComparison.AGENT_CONFIGS
|
server/causal_probe.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/causal_probe.py
|
| 2 |
+
"""
|
| 3 |
+
Causal Reasoning Probe โ v4.0
|
| 4 |
+
|
| 5 |
+
The key scientific question: Did the agent understand WHY the bug exists,
|
| 6 |
+
or did it accidentally fix it by pattern matching?
|
| 7 |
+
|
| 8 |
+
We measure causal understanding by checking if the agent traversed the
|
| 9 |
+
COMPLETE causal chain: Failing test โ tested function โ return path โ root cause.
|
| 10 |
+
|
| 11 |
+
An agent that reads only the test and immediately rewrites the function
|
| 12 |
+
is guessing. An agent that reads test โ traces the call stack โ finds the
|
| 13 |
+
actual cause first is reasoning causally.
|
| 14 |
+
|
| 15 |
+
This is NOT in any current benchmark. SWE-bench only checks if the test passes.
|
| 16 |
+
We check HOW the agent got there.
|
| 17 |
+
"""
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
from typing import List, Dict, Any, Optional
|
| 20 |
+
from dataclasses import dataclass, field
|
| 21 |
+
from enum import Enum
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class CausalUnderstandingLevel(str, Enum):
    """Ordinal verdict on how causally an agent reasoned during an episode."""
    DEEP = "DEEP"                # Full causal chain traversal
    PARTIAL = "PARTIAL"          # Partial chain (some steps missing)
    SUPERFICIAL = "SUPERFICIAL"  # Direct test -> rewrite with no chain
    RANDOM = "RANDOM"            # No discernible causal pattern
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
class CausalChainNode:
    """One node in the reconstructed causal chain."""
    # Repo-relative path of the file this node represents.
    file: str
    role: str  # "test", "caller", "called", "root_cause", "missed"
    # Whether the agent ever issued a read on this file during the episode.
    was_read: bool
    read_order: Optional[int]  # Which step did agent read this?
|
| 39 |
+
|
| 40 |
+
@dataclass
class CausalProbeReport:
    """
    Full causal reasoning analysis for one episode.
    This is the primary output of the CausalProbe.
    """
    episode_id: str
    task: str

    # Core verdict
    understanding_level: CausalUnderstandingLevel
    causal_score: float  # 0.0 - 1.0

    # Chain analysis
    expected_chain: List[CausalChainNode]  # What SHOULD have been read
    actual_chain_coverage: float  # Fraction of chain actually traversed
    chain_order_score: float  # Was chain traversed in correct order?

    # Behavioral signals
    read_before_write: bool  # Did agent read all relevant files before writing?
    test_informed_navigation: bool  # Did reading tests change which files agent read next?
    search_before_navigate: bool  # Did agent search for function names before reading?
    submit_after_test: bool  # Did agent verify fix before submitting?

    # Signal: understanding vs guessing
    guessing_indicators: List[str]  # Signs agent was guessing
    understanding_indicators: List[str]  # Signs agent understood

    # Calibration
    false_confidence_detected: bool  # Submitted without reading root cause file
    shortcut_learning_detected: bool  # Read test file, immediately wrote, submitted

    explanation: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Serialize the probe report to a JSON-friendly dict."""
        behavioral = {
            "read_before_write": self.read_before_write,
            "test_informed_navigation": self.test_informed_navigation,
            "search_before_navigate": self.search_before_navigate,
            "submit_after_test": self.submit_after_test,
        }
        diagnostics = {
            "false_confidence_detected": self.false_confidence_detected,
            "shortcut_learning_detected": self.shortcut_learning_detected,
        }
        chain_nodes = [
            {"file": node.file, "role": node.role, "read": node.was_read, "order": node.read_order}
            for node in self.expected_chain
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "understanding_level": self.understanding_level.value,
            "causal_score": round(self.causal_score, 3),
            "chain_coverage": round(self.actual_chain_coverage, 3),
            "chain_order_score": round(self.chain_order_score, 3),
            "behavioral_signals": behavioral,
            "guessing_indicators": self.guessing_indicators,
            "understanding_indicators": self.understanding_indicators,
            "diagnostics": diagnostics,
            "expected_chain": chain_nodes,
            "explanation": self.explanation,
            "recommendations": self.recommendations,
        }
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class CausalProbe:
|
| 105 |
+
"""
|
| 106 |
+
Analyzes whether an agent engaged in true causal reasoning.
|
| 107 |
+
|
| 108 |
+
The core insight: for a bug in src/X.py called from tests/test_X.py,
|
| 109 |
+
the causal chain is:
|
| 110 |
+
tests/test_X.py โ (calls) โ src/X.py โ (calls) โ src/utils.py (maybe)
|
| 111 |
+
|
| 112 |
+
A causally-aware agent reads in this order.
|
| 113 |
+
A shortcut agent reads the test, guesses the bug, rewrites without reading source.
|
| 114 |
+
|
| 115 |
+
We score order, coverage, and behavioral signals.
|
| 116 |
+
"""
|
| 117 |
+
|
| 118 |
+
def probe(
|
| 119 |
+
self,
|
| 120 |
+
episode_id: str,
|
| 121 |
+
task: str,
|
| 122 |
+
trajectory_steps: List[dict],
|
| 123 |
+
variant_meta: dict,
|
| 124 |
+
files_read: List[str],
|
| 125 |
+
files_written: List[str],
|
| 126 |
+
final_score: float,
|
| 127 |
+
) -> CausalProbeReport:
|
| 128 |
+
"""Run the causal probe on an episode's trajectory."""
|
| 129 |
+
|
| 130 |
+
# โโ Build expected causal chain from variant metadata โโโโโโโโโโโโโโโโโ
|
| 131 |
+
test_files = variant_meta.get("test_files", []) or [
|
| 132 |
+
f for f in variant_meta.get("read_first_files", []) if "test" in f
|
| 133 |
+
]
|
| 134 |
+
bug_files = variant_meta.get("bug_files", []) or variant_meta.get("files_to_implement", [])
|
| 135 |
+
dep_files = variant_meta.get("dependencies", []) or []
|
| 136 |
+
|
| 137 |
+
# If metadata sparse, infer from trajectory
|
| 138 |
+
all_files_in_traj = list({
|
| 139 |
+
s.get("action_path") for s in trajectory_steps
|
| 140 |
+
if s.get("action_path") and s.get("action_type") in ("read_file", "write_file")
|
| 141 |
+
})
|
| 142 |
+
|
| 143 |
+
if not test_files:
|
| 144 |
+
test_files = [f for f in all_files_in_traj if "test" in f.lower()]
|
| 145 |
+
if not bug_files:
|
| 146 |
+
bug_files = [f for f in all_files_in_traj
|
| 147 |
+
if "test" not in f.lower() and f.endswith(".py")]
|
| 148 |
+
|
| 149 |
+
# Build expected chain
|
| 150 |
+
expected_chain: List[CausalChainNode] = []
|
| 151 |
+
read_set = set(files_read)
|
| 152 |
+
read_order: Dict[str, int] = {}
|
| 153 |
+
for step in trajectory_steps:
|
| 154 |
+
if step.get("action_type") == "read_file" and step.get("action_path"):
|
| 155 |
+
path = step["action_path"]
|
| 156 |
+
if path not in read_order:
|
| 157 |
+
read_order[path] = step.get("step_number", len(read_order) + 1)
|
| 158 |
+
|
| 159 |
+
for tf in test_files:
|
| 160 |
+
expected_chain.append(CausalChainNode(
|
| 161 |
+
file=tf, role="test",
|
| 162 |
+
was_read=tf in read_set,
|
| 163 |
+
read_order=read_order.get(tf),
|
| 164 |
+
))
|
| 165 |
+
for bf in bug_files:
|
| 166 |
+
expected_chain.append(CausalChainNode(
|
| 167 |
+
file=bf, role="root_cause",
|
| 168 |
+
was_read=bf in read_set,
|
| 169 |
+
read_order=read_order.get(bf),
|
| 170 |
+
))
|
| 171 |
+
for df in dep_files:
|
| 172 |
+
expected_chain.append(CausalChainNode(
|
| 173 |
+
file=df, role="caller",
|
| 174 |
+
was_read=df in read_set,
|
| 175 |
+
read_order=read_order.get(df),
|
| 176 |
+
))
|
| 177 |
+
|
| 178 |
+
if not expected_chain:
|
| 179 |
+
# Fallback: any file is better than none
|
| 180 |
+
for f in all_files_in_traj[:3]:
|
| 181 |
+
expected_chain.append(CausalChainNode(
|
| 182 |
+
file=f, role="unknown",
|
| 183 |
+
was_read=True,
|
| 184 |
+
read_order=read_order.get(f),
|
| 185 |
+
))
|
| 186 |
+
|
| 187 |
+
# โโ Chain coverage โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 188 |
+
chain_files_read = [n for n in expected_chain if n.was_read and n.role != "missed"]
|
| 189 |
+
actual_chain_coverage = (
|
| 190 |
+
len(chain_files_read) / len(expected_chain) if expected_chain else 0.0
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
# โโ Chain order score (tests before src = good causal order) โโโโโโโโโโ
|
| 194 |
+
chain_order_score = 0.0
|
| 195 |
+
test_orders = [n.read_order for n in expected_chain if n.role == "test" and n.read_order]
|
| 196 |
+
src_orders = [n.read_order for n in expected_chain
|
| 197 |
+
if n.role in ("root_cause", "caller") and n.read_order]
|
| 198 |
+
|
| 199 |
+
if test_orders and src_orders:
|
| 200 |
+
# Good: all tests read before source files
|
| 201 |
+
correct_order_pairs = sum(
|
| 202 |
+
1 for to in test_orders for so in src_orders if to < so
|
| 203 |
+
)
|
| 204 |
+
total_pairs = len(test_orders) * len(src_orders)
|
| 205 |
+
chain_order_score = correct_order_pairs / total_pairs if total_pairs > 0 else 0.0
|
| 206 |
+
elif test_orders and not src_orders:
|
| 207 |
+
chain_order_score = 0.3 # Partial โ read tests but not source
|
| 208 |
+
elif src_orders and not test_orders:
|
| 209 |
+
chain_order_score = 0.2 # Read source without reading tests = weaker
|
| 210 |
+
|
| 211 |
+
# โโ Behavioral signals โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 212 |
+
action_types = [s.get("action_type", "") for s in trajectory_steps]
|
| 213 |
+
action_paths = [s.get("action_path") for s in trajectory_steps]
|
| 214 |
+
|
| 215 |
+
# read_before_write: all written files were read at least once before write
|
| 216 |
+
read_before_write = True
|
| 217 |
+
for step in trajectory_steps:
|
| 218 |
+
if step.get("action_type") == "write_file" and step.get("action_path"):
|
| 219 |
+
p = step["action_path"]
|
| 220 |
+
step_n = step.get("step_number", 0)
|
| 221 |
+
was_read_before = any(
|
| 222 |
+
s2.get("action_type") == "read_file"
|
| 223 |
+
and s2.get("action_path") == p
|
| 224 |
+
and s2.get("step_number", 99) < step_n
|
| 225 |
+
for s2 in trajectory_steps
|
| 226 |
+
)
|
| 227 |
+
if not was_read_before:
|
| 228 |
+
read_before_write = False
|
| 229 |
+
break
|
| 230 |
+
|
| 231 |
+
# test_informed_navigation: did agent read source files AFTER reading tests?
|
| 232 |
+
test_read_step = min(
|
| 233 |
+
(s.get("step_number", 99) for s in trajectory_steps
|
| 234 |
+
if s.get("action_type") == "read_file"
|
| 235 |
+
and any(tf in (s.get("action_path") or "") for tf in test_files)),
|
| 236 |
+
default=None
|
| 237 |
+
)
|
| 238 |
+
src_read_after_test = test_read_step is not None and any(
|
| 239 |
+
s.get("action_type") == "read_file"
|
| 240 |
+
and s.get("step_number", 0) > test_read_step
|
| 241 |
+
and any(bf in (s.get("action_path") or "") for bf in bug_files)
|
| 242 |
+
for s in trajectory_steps
|
| 243 |
+
)
|
| 244 |
+
test_informed_navigation = src_read_after_test
|
| 245 |
+
|
| 246 |
+
# search_before_navigate: used search_code before reading source files
|
| 247 |
+
search_steps = [s for s in trajectory_steps if s.get("action_type") == "search_code"]
|
| 248 |
+
first_src_read = min(
|
| 249 |
+
(s.get("step_number", 99) for s in trajectory_steps
|
| 250 |
+
if s.get("action_type") == "read_file"
|
| 251 |
+
and any(bf in (s.get("action_path") or "") for bf in bug_files)),
|
| 252 |
+
default=None
|
| 253 |
+
)
|
| 254 |
+
search_before_navigate = bool(search_steps) and (
|
| 255 |
+
first_src_read is None or
|
| 256 |
+
any(s.get("step_number", 99) < first_src_read for s in search_steps)
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
# submit_after_test: ran tests before submitting
|
| 260 |
+
test_runs = [s for s in trajectory_steps if s.get("action_type") == "run_tests"]
|
| 261 |
+
submit_step = next(
|
| 262 |
+
(s.get("step_number", 99) for s in trajectory_steps
|
| 263 |
+
if s.get("action_type") == "submit"), None
|
| 264 |
+
)
|
| 265 |
+
submit_after_test = bool(test_runs) and submit_step is not None and any(
|
| 266 |
+
s.get("step_number", 0) < submit_step for s in test_runs
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
# โโ Guessing vs understanding indicators โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 270 |
+
guessing_indicators = []
|
| 271 |
+
understanding_indicators = []
|
| 272 |
+
|
| 273 |
+
total = len(trajectory_steps)
|
| 274 |
+
|
| 275 |
+
# Guessing: short episode with low score
|
| 276 |
+
if total <= 3 and final_score < 0.5:
|
| 277 |
+
guessing_indicators.append(f"Submitted in only {total} steps with score {final_score:.2f}")
|
| 278 |
+
|
| 279 |
+
# Guessing: wrote without reading
|
| 280 |
+
if not read_before_write:
|
| 281 |
+
guessing_indicators.append("Wrote to file(s) without first reading them")
|
| 282 |
+
|
| 283 |
+
# Guessing: skipped test files
|
| 284 |
+
if not any(n.was_read for n in expected_chain if n.role == "test"):
|
| 285 |
+
guessing_indicators.append("Never read any test files")
|
| 286 |
+
|
| 287 |
+
# Guessing: skipped source files
|
| 288 |
+
if not any(n.was_read for n in expected_chain if n.role == "root_cause"):
|
| 289 |
+
guessing_indicators.append("Never read the bug/source file")
|
| 290 |
+
|
| 291 |
+
# Understanding: search used
|
| 292 |
+
if search_steps:
|
| 293 |
+
understanding_indicators.append(
|
| 294 |
+
f"Used search_code {len(search_steps)}ร to locate bug"
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
# Understanding: read tests first
|
| 298 |
+
if chain_order_score > 0.7:
|
| 299 |
+
understanding_indicators.append("Read tests before source files (correct causal order)")
|
| 300 |
+
|
| 301 |
+
# Understanding: tested before submitting
|
| 302 |
+
if submit_after_test:
|
| 303 |
+
understanding_indicators.append("Verified fix with run_tests before submitting")
|
| 304 |
+
|
| 305 |
+
# Understanding: explored full chain
|
| 306 |
+
if actual_chain_coverage > 0.7:
|
| 307 |
+
understanding_indicators.append(
|
| 308 |
+
f"Covered {actual_chain_coverage:.0%} of expected causal chain"
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
# โโ Diagnostics โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 312 |
+
# False confidence: submitted very early without testing
|
| 313 |
+
false_confidence_detected = (
|
| 314 |
+
submit_step is not None and submit_step <= 3 and not test_runs
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
# Shortcut learning: read test โ immediate write โ submit (skipped source)
|
| 318 |
+
has_write = "write_file" in action_types
|
| 319 |
+
has_src_read = any(
|
| 320 |
+
bf in (s.get("action_path") or "")
|
| 321 |
+
for s in trajectory_steps
|
| 322 |
+
if s.get("action_type") == "read_file"
|
| 323 |
+
for bf in bug_files
|
| 324 |
+
)
|
| 325 |
+
shortcut_sequence = has_write and not has_src_read
|
| 326 |
+
shortcut_learning_detected = shortcut_sequence
|
| 327 |
+
|
| 328 |
+
# โโ Composite causal score โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 329 |
+
scores = {
|
| 330 |
+
"chain_coverage": actual_chain_coverage * 0.30,
|
| 331 |
+
"chain_order": chain_order_score * 0.25,
|
| 332 |
+
"read_before_write": (0.15 if read_before_write else 0.0),
|
| 333 |
+
"test_informed": (0.15 if test_informed_navigation else 0.0),
|
| 334 |
+
"verified": (0.10 if submit_after_test else 0.0),
|
| 335 |
+
"searched": (0.05 if search_before_navigate else 0.0),
|
| 336 |
+
}
|
| 337 |
+
causal_score = sum(scores.values())
|
| 338 |
+
causal_score = max(0.0, min(1.0, causal_score))
|
| 339 |
+
|
| 340 |
+
# โโ Understanding level classification โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 341 |
+
if causal_score >= 0.75:
|
| 342 |
+
level = CausalUnderstandingLevel.DEEP
|
| 343 |
+
elif causal_score >= 0.45:
|
| 344 |
+
level = CausalUnderstandingLevel.PARTIAL
|
| 345 |
+
elif causal_score >= 0.20:
|
| 346 |
+
level = CausalUnderstandingLevel.SUPERFICIAL
|
| 347 |
+
else:
|
| 348 |
+
level = CausalUnderstandingLevel.RANDOM
|
| 349 |
+
|
| 350 |
+
# โโ Explanation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 351 |
+
level_explanations = {
|
| 352 |
+
CausalUnderstandingLevel.DEEP: (
|
| 353 |
+
"Agent demonstrated genuine causal reasoning: read tests to understand expected "
|
| 354 |
+
"behavior, traced the call chain to the root cause, made a targeted fix, and "
|
| 355 |
+
"verified with tests before submitting."
|
| 356 |
+
),
|
| 357 |
+
CausalUnderstandingLevel.PARTIAL: (
|
| 358 |
+
"Agent showed partial causal understanding. Some chain links were traversed "
|
| 359 |
+
"but the reasoning was incomplete โ likely missed tracing deeper dependencies "
|
| 360 |
+
"or skipped test verification."
|
| 361 |
+
),
|
| 362 |
+
CausalUnderstandingLevel.SUPERFICIAL: (
|
| 363 |
+
"Agent showed superficial reasoning. Actions did not follow a clear causal "
|
| 364 |
+
"path from test โ failure โ root cause. Likely pattern-matched on filenames "
|
| 365 |
+
"or guessed the fix location."
|
| 366 |
+
),
|
| 367 |
+
CausalUnderstandingLevel.RANDOM: (
|
| 368 |
+
"Agent showed no discernible causal reasoning. Actions appear random relative "
|
| 369 |
+
"to the causal structure of the bug. This is the profile of pure trial-and-error."
|
| 370 |
+
),
|
| 371 |
+
}
|
| 372 |
+
explanation = level_explanations[level]
|
| 373 |
+
|
| 374 |
+
# โโ Recommendations โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 375 |
+
recs = []
|
| 376 |
+
if not any(n.was_read for n in expected_chain if n.role == "test"):
|
| 377 |
+
recs.append("Always read the failing test first โ it defines the expected behavior.")
|
| 378 |
+
if not read_before_write:
|
| 379 |
+
recs.append("Never write to a file before reading it โ blind writes cause more bugs.")
|
| 380 |
+
if not submit_after_test:
|
| 381 |
+
recs.append("Run tests after every write to verify your fix is correct.")
|
| 382 |
+
if not search_steps:
|
| 383 |
+
recs.append("Use search_code to find function definitions before navigating blindly.")
|
| 384 |
+
if actual_chain_coverage < 0.5:
|
| 385 |
+
recs.append(
|
| 386 |
+
"Explore more of the causal chain. The bug's root cause may be deeper than the first file."
|
| 387 |
+
)
|
| 388 |
+
if not recs:
|
| 389 |
+
recs.append("Excellent reasoning! Maintain this systematic approach.")
|
| 390 |
+
|
| 391 |
+
return CausalProbeReport(
|
| 392 |
+
episode_id=episode_id,
|
| 393 |
+
task=task,
|
| 394 |
+
understanding_level=level,
|
| 395 |
+
causal_score=causal_score,
|
| 396 |
+
expected_chain=expected_chain,
|
| 397 |
+
actual_chain_coverage=actual_chain_coverage,
|
| 398 |
+
chain_order_score=chain_order_score,
|
| 399 |
+
read_before_write=read_before_write,
|
| 400 |
+
test_informed_navigation=test_informed_navigation,
|
| 401 |
+
search_before_navigate=search_before_navigate,
|
| 402 |
+
submit_after_test=submit_after_test,
|
| 403 |
+
guessing_indicators=guessing_indicators,
|
| 404 |
+
understanding_indicators=understanding_indicators,
|
| 405 |
+
false_confidence_detected=false_confidence_detected,
|
| 406 |
+
shortcut_learning_detected=shortcut_learning_detected,
|
| 407 |
+
explanation=explanation,
|
| 408 |
+
recommendations=recs,
|
| 409 |
+
)
|
server/confidence_calibrator.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/confidence_calibrator.py
|
| 2 |
+
"""
|
| 3 |
+
Confidence Calibration Engine โ v4.0
|
| 4 |
+
|
| 5 |
+
The key scientific question: Is the agent calibrated?
|
| 6 |
+
An agent is calibrated when its certainty level (inferred from behavior)
|
| 7 |
+
matches its likelihood of being correct.
|
| 8 |
+
|
| 9 |
+
Since agents don't expose probability distributions directly, we infer
|
| 10 |
+
confidence from behavioral proxies:
|
| 11 |
+
- How quickly did it commit to a hypothesis (read โ write speed)?
|
| 12 |
+
- How much did it re-explore after writing (re-reads after write)?
|
| 13 |
+
- Did it verify (run_tests) before submitting?
|
| 14 |
+
- How many steps did it spend before the first write?
|
| 15 |
+
|
| 16 |
+
We then compare inferred confidence to actual accuracy (final_score).
|
| 17 |
+
Overconfident agents submit fast but score poorly.
|
| 18 |
+
Underconfident agents explore extensively but still score well.
|
| 19 |
+
Well-calibrated agents: confidence โ accuracy.
|
| 20 |
+
|
| 21 |
+
This is NOT measured by any existing benchmark or tracing tool.
|
| 22 |
+
"""
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
import math
|
| 25 |
+
from typing import List, Dict, Any, Optional
|
| 26 |
+
from dataclasses import dataclass, field
|
| 27 |
+
from enum import Enum
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class CalibrationProfile(str, Enum):
|
| 31 |
+
WELL_CALIBRATED = "WELL_CALIBRATED" # Confidence โ accuracy
|
| 32 |
+
OVERCONFIDENT = "OVERCONFIDENT" # High confidence, low accuracy
|
| 33 |
+
UNDERCONFIDENT = "UNDERCONFIDENT" # Low confidence, high accuracy
|
| 34 |
+
ERRATIC = "ERRATIC" # Confidence changes randomly
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@dataclass
|
| 38 |
+
class ConfidenceSample:
|
| 39 |
+
"""Inferred confidence at one point in the trajectory."""
|
| 40 |
+
step: int
|
| 41 |
+
action_type: str
|
| 42 |
+
inferred_confidence: float # 0.0โ1.0 based on behavioral proxy
|
| 43 |
+
actual_accuracy: Optional[float] # test_pass_rate at this step if known
|
| 44 |
+
calibration_error: Optional[float] # |confidence - accuracy| if both known
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@dataclass
|
| 48 |
+
class CalibrationReport:
|
| 49 |
+
"""Full confidence calibration analysis."""
|
| 50 |
+
episode_id: str
|
| 51 |
+
task: str
|
| 52 |
+
|
| 53 |
+
profile: CalibrationProfile
|
| 54 |
+
calibration_score: float # 1.0 = perfectly calibrated
|
| 55 |
+
|
| 56 |
+
# Inferred overall confidence level (behavioral proxy)
|
| 57 |
+
inferred_confidence: float # 0.0โ1.0
|
| 58 |
+
actual_performance: float # final_score
|
| 59 |
+
|
| 60 |
+
# Decomposed signals
|
| 61 |
+
commitment_speed: float # How fast did agent commit? (0=slow/careful, 1=fast)
|
| 62 |
+
re_exploration_rate: float # Reads after first write / total reads
|
| 63 |
+
verification_rate: float # run_tests per write_file
|
| 64 |
+
submit_speed: float # Submit step / max_steps (early=overconfident)
|
| 65 |
+
|
| 66 |
+
# Trajectory of inferred confidence
|
| 67 |
+
confidence_trajectory: List[ConfidenceSample]
|
| 68 |
+
|
| 69 |
+
# Calibration error
|
| 70 |
+
expected_calibration_error: float # Mean(|conf - acc|) where acc is known
|
| 71 |
+
confidence_accuracy_correlation: float # Should be high for good agents
|
| 72 |
+
|
| 73 |
+
diagnosis: str
|
| 74 |
+
recommendations: List[str]
|
| 75 |
+
|
| 76 |
+
def to_dict(self) -> dict:
|
| 77 |
+
return {
|
| 78 |
+
"episode_id": self.episode_id,
|
| 79 |
+
"task": self.task,
|
| 80 |
+
"profile": self.profile.value,
|
| 81 |
+
"calibration_score": round(self.calibration_score, 3),
|
| 82 |
+
"inferred_confidence": round(self.inferred_confidence, 3),
|
| 83 |
+
"actual_performance": round(self.actual_performance, 3),
|
| 84 |
+
"signals": {
|
| 85 |
+
"commitment_speed": round(self.commitment_speed, 3),
|
| 86 |
+
"re_exploration_rate": round(self.re_exploration_rate, 3),
|
| 87 |
+
"verification_rate": round(self.verification_rate, 3),
|
| 88 |
+
"submit_speed": round(self.submit_speed, 3),
|
| 89 |
+
},
|
| 90 |
+
"expected_calibration_error": round(self.expected_calibration_error, 3),
|
| 91 |
+
"confidence_accuracy_correlation": round(self.confidence_accuracy_correlation, 3),
|
| 92 |
+
"confidence_trajectory": [
|
| 93 |
+
{
|
| 94 |
+
"step": s.step,
|
| 95 |
+
"action": s.action_type,
|
| 96 |
+
"confidence": round(s.inferred_confidence, 3),
|
| 97 |
+
"accuracy": round(s.actual_accuracy, 3) if s.actual_accuracy is not None else None,
|
| 98 |
+
"error": round(s.calibration_error, 3) if s.calibration_error is not None else None,
|
| 99 |
+
}
|
| 100 |
+
for s in self.confidence_trajectory
|
| 101 |
+
],
|
| 102 |
+
"diagnosis": self.diagnosis,
|
| 103 |
+
"recommendations": self.recommendations,
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class ConfidenceCalibrator:
|
| 108 |
+
"""
|
| 109 |
+
Infers behavioral confidence and compares to actual performance.
|
| 110 |
+
|
| 111 |
+
Confidence proxy model:
|
| 112 |
+
- Reading files = low confidence (still exploring)
|
| 113 |
+
- Writing files = medium-high confidence (committed to hypothesis)
|
| 114 |
+
- Running tests = verification (moderate, checking own hypothesis)
|
| 115 |
+
- Submitting = maximum commitment (fully confident)
|
| 116 |
+
|
| 117 |
+
Each action type has a confidence weight:
|
| 118 |
+
read_file: 0.2 (exploring, uncertain)
|
| 119 |
+
search_code: 0.3 (slightly more directed)
|
| 120 |
+
run_tests: 0.6 (confident enough to test)
|
| 121 |
+
write_file: 0.75 (committed to hypothesis)
|
| 122 |
+
submit: 1.0 (maximum confidence)
|
| 123 |
+
|
| 124 |
+
We track how this evolves over the trajectory.
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
ACTION_CONFIDENCE = {
|
| 128 |
+
"read_file": 0.2,
|
| 129 |
+
"search_code": 0.3,
|
| 130 |
+
"run_tests": 0.6,
|
| 131 |
+
"write_file": 0.75,
|
| 132 |
+
"submit": 1.0,
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
def calibrate(
|
| 136 |
+
self,
|
| 137 |
+
episode_id: str,
|
| 138 |
+
task: str,
|
| 139 |
+
trajectory_steps: List[dict],
|
| 140 |
+
final_score: float,
|
| 141 |
+
max_steps: int = 20,
|
| 142 |
+
) -> CalibrationReport:
|
| 143 |
+
"""Compute the full calibration report for one episode."""
|
| 144 |
+
|
| 145 |
+
if not trajectory_steps:
|
| 146 |
+
return self._empty_report(episode_id, task, final_score)
|
| 147 |
+
|
| 148 |
+
action_types = [s.get("action_type", "read_file") for s in trajectory_steps]
|
| 149 |
+
total_steps = len(trajectory_steps)
|
| 150 |
+
|
| 151 |
+
# โโ Build confidence trajectory โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 152 |
+
confidence_traj: List[ConfidenceSample] = []
|
| 153 |
+
running_conf = 0.0
|
| 154 |
+
|
| 155 |
+
for s in trajectory_steps:
|
| 156 |
+
atype = s.get("action_type", "read_file")
|
| 157 |
+
base_conf = self.ACTION_CONFIDENCE.get(atype, 0.3)
|
| 158 |
+
|
| 159 |
+
# Confidence grows as episode progresses
|
| 160 |
+
step_n = s.get("step_number", 1)
|
| 161 |
+
progress_bonus = (step_n / max(total_steps, 1)) * 0.1
|
| 162 |
+
|
| 163 |
+
# Re-reads slightly lower confidence
|
| 164 |
+
step_write_count = sum(
|
| 165 |
+
1 for s2 in trajectory_steps
|
| 166 |
+
if s2.get("action_type") == "write_file"
|
| 167 |
+
and s2.get("step_number", 99) < step_n
|
| 168 |
+
)
|
| 169 |
+
step_reread = (
|
| 170 |
+
s.get("action_type") == "read_file"
|
| 171 |
+
and any(
|
| 172 |
+
s2.get("action_path") == s.get("action_path")
|
| 173 |
+
and s2.get("step_number", 0) < step_n
|
| 174 |
+
for s2 in trajectory_steps
|
| 175 |
+
)
|
| 176 |
+
)
|
| 177 |
+
reread_penalty = -0.1 if step_reread else 0.0
|
| 178 |
+
|
| 179 |
+
# After a write, confidence should be higher
|
| 180 |
+
post_write_bonus = min(0.15, step_write_count * 0.05)
|
| 181 |
+
|
| 182 |
+
inferred = min(1.0, max(0.0,
|
| 183 |
+
base_conf + progress_bonus + post_write_bonus + reread_penalty
|
| 184 |
+
))
|
| 185 |
+
|
| 186 |
+
# Actual accuracy at this step if test_pass_rate is known
|
| 187 |
+
actual_acc = s.get("test_pass_rate")
|
| 188 |
+
calib_err = abs(inferred - actual_acc) if actual_acc is not None else None
|
| 189 |
+
|
| 190 |
+
confidence_traj.append(ConfidenceSample(
|
| 191 |
+
step=step_n,
|
| 192 |
+
action_type=atype,
|
| 193 |
+
inferred_confidence=inferred,
|
| 194 |
+
actual_accuracy=actual_acc,
|
| 195 |
+
calibration_error=calib_err,
|
| 196 |
+
))
|
| 197 |
+
|
| 198 |
+
# โโ Behavioral signal computation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 199 |
+
total = max(total_steps, 1)
|
| 200 |
+
|
| 201 |
+
# Commitment speed: how many reads before first write?
|
| 202 |
+
read_steps = [i for i, a in enumerate(action_types) if a == "read_file"]
|
| 203 |
+
write_steps = [i for i, a in enumerate(action_types) if a == "write_file"]
|
| 204 |
+
submit_step = next(
|
| 205 |
+
(s.get("step_number", total) for s in trajectory_steps if s.get("action_type") == "submit"),
|
| 206 |
+
total,
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
if write_steps:
|
| 210 |
+
reads_before_first_write = len([r for r in read_steps if r < write_steps[0]])
|
| 211 |
+
# Low reads before write = high commitment speed = overconfident
|
| 212 |
+
commitment_speed = max(0.0, 1.0 - reads_before_first_write / max(total, 1))
|
| 213 |
+
else:
|
| 214 |
+
commitment_speed = 0.0 # Never wrote = very cautious
|
| 215 |
+
|
| 216 |
+
# Re-exploration rate: reads after first write / total reads
|
| 217 |
+
if write_steps and read_steps:
|
| 218 |
+
reads_after_write = len([r for r in read_steps if r > write_steps[0]])
|
| 219 |
+
re_exploration_rate = reads_after_write / len(read_steps)
|
| 220 |
+
else:
|
| 221 |
+
re_exploration_rate = 0.0
|
| 222 |
+
|
| 223 |
+
# Verification rate: run_tests per write
|
| 224 |
+
test_count = action_types.count("run_tests")
|
| 225 |
+
write_count = action_types.count("write_file")
|
| 226 |
+
verification_rate = test_count / max(write_count, 1)
|
| 227 |
+
|
| 228 |
+
# Submit speed: earlier = more overconfident
|
| 229 |
+
submit_speed = 1.0 - (submit_step / max(max_steps, 1))
|
| 230 |
+
submit_speed = max(0.0, min(1.0, submit_speed))
|
| 231 |
+
|
| 232 |
+
# โโ Inferred overall confidence โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 233 |
+
# Weighted behavioral proxy
|
| 234 |
+
inferred_confidence = (
|
| 235 |
+
commitment_speed * 0.30 +
|
| 236 |
+
(1.0 - re_exploration_rate) * 0.15 +
|
| 237 |
+
verification_rate * 0.15 +
|
| 238 |
+
submit_speed * 0.20 +
|
| 239 |
+
(confidence_traj[-1].inferred_confidence if confidence_traj else 0.5) * 0.20
|
| 240 |
+
)
|
| 241 |
+
inferred_confidence = min(1.0, max(0.0, inferred_confidence))
|
| 242 |
+
|
| 243 |
+
# โโ Calibration error (where we have both conf + acc) โโโโโโโโโโโโโโโโโ
|
| 244 |
+
calib_errors = [
|
| 245 |
+
s.calibration_error for s in confidence_traj
|
| 246 |
+
if s.calibration_error is not None
|
| 247 |
+
]
|
| 248 |
+
ece = sum(calib_errors) / len(calib_errors) if calib_errors else abs(inferred_confidence - final_score)
|
| 249 |
+
|
| 250 |
+
# โโ Confidence-accuracy correlation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 251 |
+
paired = [
|
| 252 |
+
(s.inferred_confidence, s.actual_accuracy)
|
| 253 |
+
for s in confidence_traj
|
| 254 |
+
if s.actual_accuracy is not None
|
| 255 |
+
]
|
| 256 |
+
if len(paired) >= 2:
|
| 257 |
+
corr = self._pearson_r([p[0] for p in paired], [p[1] for p in paired])
|
| 258 |
+
else:
|
| 259 |
+
# Fallback: use final point only
|
| 260 |
+
conf_err = abs(inferred_confidence - final_score)
|
| 261 |
+
corr = 1.0 - conf_err * 2
|
| 262 |
+
|
| 263 |
+
corr = max(-1.0, min(1.0, corr))
|
| 264 |
+
|
| 265 |
+
# โโ Calibration score โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 266 |
+
calibration_score = max(0.0, 1.0 - ece) * 0.5 + max(0.0, corr) * 0.5
|
| 267 |
+
calibration_score = max(0.0, min(1.0, calibration_score))
|
| 268 |
+
|
| 269 |
+
# โโ Profile classification โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 270 |
+
conf_diff = inferred_confidence - final_score
|
| 271 |
+
if abs(conf_diff) <= 0.2:
|
| 272 |
+
profile = CalibrationProfile.WELL_CALIBRATED
|
| 273 |
+
elif conf_diff > 0.2:
|
| 274 |
+
profile = CalibrationProfile.OVERCONFIDENT
|
| 275 |
+
elif conf_diff < -0.2:
|
| 276 |
+
profile = CalibrationProfile.UNDERCONFIDENT
|
| 277 |
+
else:
|
| 278 |
+
profile = CalibrationProfile.ERRATIC
|
| 279 |
+
|
| 280 |
+
# โโ Diagnosis โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 281 |
+
diagnoses = {
|
| 282 |
+
CalibrationProfile.WELL_CALIBRATED: (
|
| 283 |
+
f"Agent is well-calibrated: inferred confidence ({inferred_confidence:.2f}) "
|
| 284 |
+
f"closely matches actual performance ({final_score:.2f}). "
|
| 285 |
+
"This indicates genuine self-awareness โ the agent commits when ready and "
|
| 286 |
+
"explores when uncertain."
|
| 287 |
+
),
|
| 288 |
+
CalibrationProfile.OVERCONFIDENT: (
|
| 289 |
+
f"Agent is overconfident: behavioral confidence ({inferred_confidence:.2f}) "
|
| 290 |
+
f"significantly exceeds actual performance ({final_score:.2f}). "
|
| 291 |
+
"Agent committed to a hypothesis too early, skipped verification, "
|
| 292 |
+
"or submitted without adequate exploration. This is the profile of agents "
|
| 293 |
+
"that 'feel certain but are wrong'."
|
| 294 |
+
),
|
| 295 |
+
CalibrationProfile.UNDERCONFIDENT: (
|
| 296 |
+
f"Agent is underconfident: behavioral confidence ({inferred_confidence:.2f}) "
|
| 297 |
+
f"is well below actual performance ({final_score:.2f}). "
|
| 298 |
+
"Agent explored far more than necessary, re-read files unnecessarily, "
|
| 299 |
+
"or hesitated to commit despite having the right information. "
|
| 300 |
+
"This wastes compute and steps without improving accuracy."
|
| 301 |
+
),
|
| 302 |
+
CalibrationProfile.ERRATIC: (
|
| 303 |
+
"Agent calibration is erratic โ confidence signals are inconsistent "
|
| 304 |
+
"with behavior. The agent may be applying a rigid strategy regardless "
|
| 305 |
+
"of the task difficulty."
|
| 306 |
+
),
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
recs = []
|
| 310 |
+
if profile == CalibrationProfile.OVERCONFIDENT:
|
| 311 |
+
recs.append("Read more files before writing โ commit only when you've seen the full causal chain.")
|
| 312 |
+
recs.append("Always run_tests after writing โ don't trust your fix without verification.")
|
| 313 |
+
elif profile == CalibrationProfile.UNDERCONFIDENT:
|
| 314 |
+
recs.append("Commit to hypotheses earlier โ excessive re-reading wastes steps.")
|
| 315 |
+
recs.append("After reading tests + source files, write your fix. Stop re-reading.")
|
| 316 |
+
if verification_rate < 0.5:
|
| 317 |
+
recs.append("Increase test verification rate: run_tests after each write.")
|
| 318 |
+
if re_exploration_rate > 0.5:
|
| 319 |
+
recs.append("High re-exploration after writing suggests uncalibrated hypothesis formation.")
|
| 320 |
+
|
| 321 |
+
return CalibrationReport(
|
| 322 |
+
episode_id=episode_id,
|
| 323 |
+
task=task,
|
| 324 |
+
profile=profile,
|
| 325 |
+
calibration_score=calibration_score,
|
| 326 |
+
inferred_confidence=inferred_confidence,
|
| 327 |
+
actual_performance=final_score,
|
| 328 |
+
commitment_speed=commitment_speed,
|
| 329 |
+
re_exploration_rate=re_exploration_rate,
|
| 330 |
+
verification_rate=verification_rate,
|
| 331 |
+
submit_speed=submit_speed,
|
| 332 |
+
confidence_trajectory=confidence_traj,
|
| 333 |
+
expected_calibration_error=ece,
|
| 334 |
+
confidence_accuracy_correlation=corr,
|
| 335 |
+
diagnosis=diagnoses[profile],
|
| 336 |
+
recommendations=recs,
|
| 337 |
+
)
|
| 338 |
+
|
| 339 |
+
def _pearson_r(self, xs: List[float], ys: List[float]) -> float:
|
| 340 |
+
n = len(xs)
|
| 341 |
+
if n < 2:
|
| 342 |
+
return 0.0
|
| 343 |
+
mx, my = sum(xs) / n, sum(ys) / n
|
| 344 |
+
num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
|
| 345 |
+
dx = math.sqrt(sum((x - mx) ** 2 for x in xs))
|
| 346 |
+
dy = math.sqrt(sum((y - my) ** 2 for y in ys))
|
| 347 |
+
if dx * dy == 0:
|
| 348 |
+
return 0.0
|
| 349 |
+
return num / (dx * dy)
|
| 350 |
+
|
| 351 |
+
def _empty_report(self, episode_id: str, task: str, final_score: float) -> CalibrationReport:
|
| 352 |
+
return CalibrationReport(
|
| 353 |
+
episode_id=episode_id, task=task,
|
| 354 |
+
profile=CalibrationProfile.ERRATIC,
|
| 355 |
+
calibration_score=0.0,
|
| 356 |
+
inferred_confidence=0.0, actual_performance=final_score,
|
| 357 |
+
commitment_speed=0.0, re_exploration_rate=0.0,
|
| 358 |
+
verification_rate=0.0, submit_speed=0.0,
|
| 359 |
+
confidence_trajectory=[],
|
| 360 |
+
expected_calibration_error=1.0,
|
| 361 |
+
confidence_accuracy_correlation=0.0,
|
| 362 |
+
diagnosis="No trajectory data.", recommendations=[],
|
| 363 |
+
)
|
server/counterfactual_engine.py
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/counterfactual_engine.py
|
| 2 |
+
"""
|
| 3 |
+
Counterfactual Robustness Engine โ v4.0
|
| 4 |
+
|
| 5 |
+
The key scientific question: Is the agent's strategy robust, or is it brittle?
|
| 6 |
+
|
| 7 |
+
We test this by:
|
| 8 |
+
1. Running an episode โ recording strategy
|
| 9 |
+
2. Applying small, semantically-neutral mutations to the repo
|
| 10 |
+
(rename variable, change a constant, add a dummy function)
|
| 11 |
+
3. Measuring whether the agent's recorded strategy would fail on the mutated repo
|
| 12 |
+
|
| 13 |
+
IMPORTANT: This does NOT re-run the agent. It analyzes whether the
|
| 14 |
+
already-recorded navigation pattern was based on deep structure (robust)
|
| 15 |
+
or surface signals like filenames/constants (brittle).
|
| 16 |
+
|
| 17 |
+
This is completely novel โ no benchmark or tool does this.
|
| 18 |
+
"""
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
import random
|
| 21 |
+
import hashlib
|
| 22 |
+
from typing import List, Dict, Any, Tuple
|
| 23 |
+
from dataclasses import dataclass, field
|
| 24 |
+
from enum import Enum
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class BrittlenessLevel(str, Enum):
|
| 28 |
+
ROBUST = "ROBUST" # Strategy survives all mutations
|
| 29 |
+
MILDLY_BRITTLE = "MILDLY_BRITTLE" # Survives 60-80% of mutations
|
| 30 |
+
BRITTLE = "BRITTLE" # Survives < 60%
|
| 31 |
+
FRAGILE = "FRAGILE" # Survives < 30%
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass
|
| 35 |
+
class Mutation:
|
| 36 |
+
"""A single counterfactual mutation applied to the repo."""
|
| 37 |
+
mutation_type: str
|
| 38 |
+
target_file: str
|
| 39 |
+
description: str
|
| 40 |
+
would_break_agent: bool # Would this mutation cause agent's strategy to fail?
|
| 41 |
+
why: str # Explanation
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
|
| 45 |
+
class CounterfactualReport:
|
| 46 |
+
"""Results of counterfactual robustness testing."""
|
| 47 |
+
episode_id: str
|
| 48 |
+
task: str
|
| 49 |
+
brittleness_level: BrittlenessLevel
|
| 50 |
+
robustness_score: float # 0.0 โ 1.0
|
| 51 |
+
|
| 52 |
+
mutations_tested: List[Mutation]
|
| 53 |
+
mutations_survived: int
|
| 54 |
+
mutations_failed: int
|
| 55 |
+
|
| 56 |
+
surface_dependencies: List[str] # What surface signals the agent relied on
|
| 57 |
+
deep_dependencies: List[str] # What structural signals it used correctly
|
| 58 |
+
|
| 59 |
+
explanation: str
|
| 60 |
+
recommendations: List[str]
|
| 61 |
+
|
| 62 |
+
def to_dict(self) -> dict:
|
| 63 |
+
return {
|
| 64 |
+
"episode_id": self.episode_id,
|
| 65 |
+
"task": self.task,
|
| 66 |
+
"brittleness_level": self.brittleness_level.value,
|
| 67 |
+
"robustness_score": round(self.robustness_score, 3),
|
| 68 |
+
"mutations_tested": len(self.mutations_tested),
|
| 69 |
+
"mutations_survived": self.mutations_survived,
|
| 70 |
+
"mutations_failed": self.mutations_failed,
|
| 71 |
+
"mutations": [
|
| 72 |
+
{
|
| 73 |
+
"type": m.mutation_type,
|
| 74 |
+
"file": m.target_file,
|
| 75 |
+
"description": m.description,
|
| 76 |
+
"would_break_agent": m.would_break_agent,
|
| 77 |
+
"why": m.why,
|
| 78 |
+
}
|
| 79 |
+
for m in self.mutations_tested
|
| 80 |
+
],
|
| 81 |
+
"surface_dependencies": self.surface_dependencies,
|
| 82 |
+
"deep_dependencies": self.deep_dependencies,
|
| 83 |
+
"explanation": self.explanation,
|
| 84 |
+
"recommendations": self.recommendations,
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class CounterfactualEngine:
|
| 89 |
+
"""
|
| 90 |
+
Analyzes brittleness by reasoning about what mutations would break the agent.
|
| 91 |
+
|
| 92 |
+
We don't need to actually re-run the agent โ we analyze the recorded
|
| 93 |
+
trajectory and ask: "If file X was named differently / had a different
|
| 94 |
+
constant, would this agent's navigation pattern still work?"
|
| 95 |
+
|
| 96 |
+
Brittle signals:
|
| 97 |
+
- Agent found bug file by pattern-matching on filename (not content search)
|
| 98 |
+
- Agent submitted after reading the same file every run
|
| 99 |
+
- Agent ignored test content and relied on positional heuristics
|
| 100 |
+
|
| 101 |
+
Robust signals:
|
| 102 |
+
- Agent used search_code to find function by name
|
| 103 |
+
- Agent read test โ traced import โ found source
|
| 104 |
+
- Agent ran tests and verified result before submitting
|
| 105 |
+
"""
|
| 106 |
+
|
| 107 |
+
MUTATION_TEMPLATES = [
|
| 108 |
+
{
|
| 109 |
+
"type": "FILENAME_RENAME",
|
| 110 |
+
"description": "Rename src/X.py to src/X_v2.py (same content)",
|
| 111 |
+
"breaks_if": "agent found file by name pattern, not by search or import tracing",
|
| 112 |
+
"surface_signal": "filename",
|
| 113 |
+
"robust_signal": "import tracing or search_code",
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"type": "CONSTANT_CHANGE",
|
| 117 |
+
"description": "Change a numeric constant by ยฑ1 (semantically neutral for navigation)",
|
| 118 |
+
"breaks_if": "agent hardcoded expected value rather than reading actual code",
|
| 119 |
+
"surface_signal": "constant value pattern matching",
|
| 120 |
+
"robust_signal": "dynamic code reading",
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"type": "DUMMY_FUNCTION",
|
| 124 |
+
"description": "Add a dummy function with a similar name near the bug",
|
| 125 |
+
"breaks_if": "agent used first-match navigation without reading full context",
|
| 126 |
+
"surface_signal": "first result of search or first match in file",
|
| 127 |
+
"robust_signal": "reading complete function signatures before deciding",
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"type": "DIRECTORY_SHUFFLE",
|
| 131 |
+
"description": "Move test file from tests/ to test/ (same content)",
|
| 132 |
+
"breaks_if": "agent hardcoded path prefix tests/ instead of searching",
|
| 133 |
+
"surface_signal": "hardcoded directory prefix",
|
| 134 |
+
"robust_signal": "search or dynamic discovery",
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"type": "DOCSTRING_NOISE",
|
| 138 |
+
"description": "Add misleading docstring claiming a different function causes the bug",
|
| 139 |
+
"breaks_if": "agent read docs instead of tests to understand expected behavior",
|
| 140 |
+
"surface_signal": "docstring content",
|
| 141 |
+
"robust_signal": "test assertions as ground truth",
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"type": "IMPORT_REORDER",
|
| 145 |
+
"description": "Reorder imports in the source file",
|
| 146 |
+
"breaks_if": "agent relied on line numbers instead of function names",
|
| 147 |
+
"surface_signal": "absolute line numbers",
|
| 148 |
+
"robust_signal": "function name search",
|
| 149 |
+
},
|
| 150 |
+
]
|
| 151 |
+
|
| 152 |
+
def analyze(
|
| 153 |
+
self,
|
| 154 |
+
episode_id: str,
|
| 155 |
+
task: str,
|
| 156 |
+
trajectory_steps: List[dict],
|
| 157 |
+
variant_meta: dict,
|
| 158 |
+
files_read: List[str],
|
| 159 |
+
files_written: List[str],
|
| 160 |
+
final_score: float,
|
| 161 |
+
) -> CounterfactualReport:
|
| 162 |
+
"""
|
| 163 |
+
Analyze robustness by simulating mutations and reasoning about
|
| 164 |
+
whether the agent's recorded pattern would survive them.
|
| 165 |
+
"""
|
| 166 |
+
action_types = [s.get("action_type", "") for s in trajectory_steps]
|
| 167 |
+
action_paths = [s.get("action_path") for s in trajectory_steps]
|
| 168 |
+
|
| 169 |
+
bug_files = set(variant_meta.get("bug_files", []) or
|
| 170 |
+
variant_meta.get("files_to_implement", []) or [])
|
| 171 |
+
test_files_meta = set(variant_meta.get("test_files", []) or [])
|
| 172 |
+
|
| 173 |
+
# Infer what signals agent used
|
| 174 |
+
used_search = "search_code" in action_types
|
| 175 |
+
used_tests_first = self._tests_read_before_src(trajectory_steps, test_files_meta, bug_files)
|
| 176 |
+
used_run_tests = "run_tests" in action_types
|
| 177 |
+
blind_navigation = not used_search and not used_tests_first
|
| 178 |
+
read_count = action_types.count("read_file")
|
| 179 |
+
write_count = action_types.count("write_file")
|
| 180 |
+
immediate_write = write_count > 0 and action_types.index("write_file") <= 2
|
| 181 |
+
verified_before_submit = self._verified_before_submit(trajectory_steps)
|
| 182 |
+
|
| 183 |
+
# โโ Evaluate each mutation โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 184 |
+
mutations: List[Mutation] = []
|
| 185 |
+
|
| 186 |
+
for tmpl in self.MUTATION_TEMPLATES:
|
| 187 |
+
target_file = self._pick_target_file(tmpl["type"], files_read, bug_files)
|
| 188 |
+
would_break, why = self._would_break_agent(
|
| 189 |
+
mutation_type=tmpl["type"],
|
| 190 |
+
used_search=used_search,
|
| 191 |
+
used_tests_first=used_tests_first,
|
| 192 |
+
verified_before_submit=verified_before_submit,
|
| 193 |
+
blind_navigation=blind_navigation,
|
| 194 |
+
immediate_write=immediate_write,
|
| 195 |
+
read_count=read_count,
|
| 196 |
+
tmpl=tmpl,
|
| 197 |
+
)
|
| 198 |
+
mutations.append(Mutation(
|
| 199 |
+
mutation_type=tmpl["type"],
|
| 200 |
+
target_file=target_file or "unknown",
|
| 201 |
+
description=tmpl["description"],
|
| 202 |
+
would_break_agent=would_break,
|
| 203 |
+
why=why,
|
| 204 |
+
))
|
| 205 |
+
|
| 206 |
+
survived = sum(1 for m in mutations if not m.would_break_agent)
|
| 207 |
+
failed = len(mutations) - survived
|
| 208 |
+
|
| 209 |
+
robustness_score = survived / len(mutations) if mutations else 0.0
|
| 210 |
+
|
| 211 |
+
# โโ Surface vs deep dependency analysis โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 212 |
+
surface_deps = []
|
| 213 |
+
deep_deps = []
|
| 214 |
+
|
| 215 |
+
if not used_search:
|
| 216 |
+
surface_deps.append("Filename-based navigation (no search_code used)")
|
| 217 |
+
if not used_tests_first:
|
| 218 |
+
surface_deps.append("Skipped test-informed navigation")
|
| 219 |
+
if immediate_write:
|
| 220 |
+
surface_deps.append("Immediate write after minimal reading (blind fix)")
|
| 221 |
+
if not verified_before_submit:
|
| 222 |
+
surface_deps.append("Submitted without running tests (no verification)")
|
| 223 |
+
|
| 224 |
+
if used_search:
|
| 225 |
+
deep_deps.append("Used search_code to find functions by name (content-based)")
|
| 226 |
+
if used_tests_first:
|
| 227 |
+
deep_deps.append("Read tests first โ used expected behavior as compass")
|
| 228 |
+
if read_count >= 3:
|
| 229 |
+
deep_deps.append(f"Read {read_count} files โ explored structure before committing")
|
| 230 |
+
if verified_before_submit:
|
| 231 |
+
deep_deps.append("Verified fix with run_tests before submitting")
|
| 232 |
+
|
| 233 |
+
# โโ Brittleness classification โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 234 |
+
if robustness_score >= 0.80:
|
| 235 |
+
level = BrittlenessLevel.ROBUST
|
| 236 |
+
elif robustness_score >= 0.60:
|
| 237 |
+
level = BrittlenessLevel.MILDLY_BRITTLE
|
| 238 |
+
elif robustness_score >= 0.30:
|
| 239 |
+
level = BrittlenessLevel.BRITTLE
|
| 240 |
+
else:
|
| 241 |
+
level = BrittlenessLevel.FRAGILE
|
| 242 |
+
|
| 243 |
+
explanations = {
|
| 244 |
+
BrittlenessLevel.ROBUST: (
|
| 245 |
+
"Agent strategy is robust. It relies on deep structural signals (function names, "
|
| 246 |
+
"test assertions, causal chain traversal) rather than surface patterns. "
|
| 247 |
+
"Minor repo mutations would not break its navigation."
|
| 248 |
+
),
|
| 249 |
+
BrittlenessLevel.MILDLY_BRITTLE: (
|
| 250 |
+
"Agent strategy is mildly brittle. Some mutations would break its navigation, "
|
| 251 |
+
"particularly those that change surface signals it relied on. "
|
| 252 |
+
"Using search_code and test-first navigation consistently would improve robustness."
|
| 253 |
+
),
|
| 254 |
+
BrittlenessLevel.BRITTLE: (
|
| 255 |
+
"Agent strategy is brittle. Most mutations would break its navigation. "
|
| 256 |
+
"The agent appears to rely on stable surface patterns (filenames, positions) "
|
| 257 |
+
"rather than understanding the semantic structure of the codebase."
|
| 258 |
+
),
|
| 259 |
+
BrittlenessLevel.FRAGILE: (
|
| 260 |
+
"Agent strategy is fragile. Almost any perturbation to the repo structure "
|
| 261 |
+
"would cause this agent to fail. This indicates pure pattern-matching on "
|
| 262 |
+
"the specific repo layout rather than generalizable code understanding."
|
| 263 |
+
),
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
recs = []
|
| 267 |
+
if not used_search:
|
| 268 |
+
recs.append("Use search_code to find functions by name โ survives filename renames.")
|
| 269 |
+
if not used_tests_first:
|
| 270 |
+
recs.append("Read tests first to anchor your navigation in expected behavior, not filenames.")
|
| 271 |
+
if immediate_write:
|
| 272 |
+
recs.append("Read source files before writing to them โ avoid blind writes.")
|
| 273 |
+
if not verified_before_submit:
|
| 274 |
+
recs.append("Run tests after writing โ verify your fix holds on the actual behavior.")
|
| 275 |
+
|
| 276 |
+
return CounterfactualReport(
|
| 277 |
+
episode_id=episode_id,
|
| 278 |
+
task=task,
|
| 279 |
+
brittleness_level=level,
|
| 280 |
+
robustness_score=robustness_score,
|
| 281 |
+
mutations_tested=mutations,
|
| 282 |
+
mutations_survived=survived,
|
| 283 |
+
mutations_failed=failed,
|
| 284 |
+
surface_dependencies=surface_deps,
|
| 285 |
+
deep_dependencies=deep_deps,
|
| 286 |
+
explanation=explanations[level],
|
| 287 |
+
recommendations=recs,
|
| 288 |
+
)
|
| 289 |
+
|
| 290 |
+
# โโ Helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 291 |
+
|
| 292 |
+
def _tests_read_before_src(
|
| 293 |
+
self, steps: List[dict], test_files: set, bug_files: set
|
| 294 |
+
) -> bool:
|
| 295 |
+
test_steps = [
|
| 296 |
+
s.get("step_number", 99) for s in steps
|
| 297 |
+
if s.get("action_type") == "read_file"
|
| 298 |
+
and any(tf in (s.get("action_path") or "") for tf in test_files)
|
| 299 |
+
]
|
| 300 |
+
src_steps = [
|
| 301 |
+
s.get("step_number", 99) for s in steps
|
| 302 |
+
if s.get("action_type") == "read_file"
|
| 303 |
+
and any(bf in (s.get("action_path") or "") for bf in bug_files)
|
| 304 |
+
]
|
| 305 |
+
if test_steps and src_steps:
|
| 306 |
+
return min(test_steps) < min(src_steps)
|
| 307 |
+
return False
|
| 308 |
+
|
| 309 |
+
def _verified_before_submit(self, steps: List[dict]) -> bool:
|
| 310 |
+
submit_step = next(
|
| 311 |
+
(s.get("step_number", 9999) for s in steps if s.get("action_type") == "submit"),
|
| 312 |
+
None,
|
| 313 |
+
)
|
| 314 |
+
if submit_step is None:
|
| 315 |
+
return False
|
| 316 |
+
return any(
|
| 317 |
+
s.get("action_type") == "run_tests"
|
| 318 |
+
and s.get("step_number", 0) < submit_step
|
| 319 |
+
for s in steps
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
+
def _pick_target_file(
|
| 323 |
+
self, mutation_type: str, files_read: List[str], bug_files: set
|
| 324 |
+
) -> str:
|
| 325 |
+
if mutation_type in ("FILENAME_RENAME", "DUMMY_FUNCTION", "IMPORT_REORDER"):
|
| 326 |
+
for f in bug_files:
|
| 327 |
+
return f
|
| 328 |
+
return files_read[0] if files_read else "src/main.py"
|
| 329 |
+
if mutation_type == "DIRECTORY_SHUFFLE":
|
| 330 |
+
for f in files_read:
|
| 331 |
+
if "test" in f.lower():
|
| 332 |
+
return f
|
| 333 |
+
return files_read[0] if files_read else "unknown"
|
| 334 |
+
|
| 335 |
+
def _would_break_agent(
|
| 336 |
+
self,
|
| 337 |
+
mutation_type: str,
|
| 338 |
+
used_search: bool,
|
| 339 |
+
used_tests_first: bool,
|
| 340 |
+
verified_before_submit: bool,
|
| 341 |
+
blind_navigation: bool,
|
| 342 |
+
immediate_write: bool,
|
| 343 |
+
read_count: int,
|
| 344 |
+
tmpl: dict,
|
| 345 |
+
) -> Tuple[bool, str]:
|
| 346 |
+
"""
|
| 347 |
+
Return (would_break, explanation) by reasoning about the agent's signals.
|
| 348 |
+
"""
|
| 349 |
+
if mutation_type == "FILENAME_RENAME":
|
| 350 |
+
if used_search:
|
| 351 |
+
return False, "Agent used search_code โ finds function by name, not filename"
|
| 352 |
+
if blind_navigation:
|
| 353 |
+
return True, "Agent navigated by filename without search โ rename breaks it"
|
| 354 |
+
return True, "Agent likely relied on filename pattern without search fallback"
|
| 355 |
+
|
| 356 |
+
if mutation_type == "CONSTANT_CHANGE":
|
| 357 |
+
# Almost never breaks well-behaved agents
|
| 358 |
+
if read_count >= 2:
|
| 359 |
+
return False, "Agent read files dynamically โ adapts to any constant value"
|
| 360 |
+
return True, "Agent may have hardcoded expected value in navigation heuristic"
|
| 361 |
+
|
| 362 |
+
if mutation_type == "DUMMY_FUNCTION":
|
| 363 |
+
if used_search and read_count >= 3:
|
| 364 |
+
return False, "Agent searched and read thoroughly โ would disambiguate"
|
| 365 |
+
return True, "Agent took first match without thorough reading"
|
| 366 |
+
|
| 367 |
+
if mutation_type == "DIRECTORY_SHUFFLE":
|
| 368 |
+
if used_search:
|
| 369 |
+
return False, "search_code finds tests regardless of directory"
|
| 370 |
+
return True, "Agent used hardcoded path prefix โ directory change breaks it"
|
| 371 |
+
|
| 372 |
+
if mutation_type == "DOCSTRING_NOISE":
|
| 373 |
+
if used_tests_first:
|
| 374 |
+
return False, "Agent used test assertions as ground truth, not docstrings"
|
| 375 |
+
return True, "Agent may have read misleading docstring instead of test"
|
| 376 |
+
|
| 377 |
+
if mutation_type == "IMPORT_REORDER":
|
| 378 |
+
# Only brittle if agent relied on line numbers
|
| 379 |
+
if read_count <= 1:
|
| 380 |
+
return True, "Agent skimmed โ likely used line numbers for navigation"
|
| 381 |
+
return False, "Agent read full files โ import reorder doesn't change function content"
|
| 382 |
+
|
| 383 |
+
return False, "Neutral mutation"
|
server/memory_bank.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# server/memory_bank.py
|
| 2 |
+
"""
|
| 3 |
+
Episodic Memory Bank โ v4.0
|
| 4 |
+
|
| 5 |
+
Cross-episode learning store for AI coding agents.
|
| 6 |
+
|
| 7 |
+
Every time an agent fails at a specific failure type, we store:
|
| 8 |
+
1. The failure pattern (what actions led to it)
|
| 9 |
+
2. The remediation hint (what should have been done)
|
| 10 |
+
3. A compact "lesson" that can be injected into future prompts
|
| 11 |
+
|
| 12 |
+
The memory grows across episodes. When a new episode starts:
|
| 13 |
+
- We retrieve the most relevant past lessons (by task similarity)
|
| 14 |
+
- We inject them as a "memory context" into the agent's system prompt
|
| 15 |
+
- This creates a real self-improvement loop
|
| 16 |
+
|
| 17 |
+
This is NOT implemented in any current agent framework as an
|
| 18 |
+
environment-side primitive. Devin, Copilot, etc. start fresh every run.
|
| 19 |
+
"""
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
import json
|
| 22 |
+
import time
|
| 23 |
+
import os
|
| 24 |
+
import hashlib
|
| 25 |
+
from typing import List, Dict, Any, Optional
|
| 26 |
+
from dataclasses import dataclass, field, asdict
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
|
| 30 |
+
class MemoryEntry:
|
| 31 |
+
"""One stored episode lesson."""
|
| 32 |
+
entry_id: str
|
| 33 |
+
episode_id: str
|
| 34 |
+
task: str
|
| 35 |
+
created_at: float
|
| 36 |
+
|
| 37 |
+
# Failure details
|
| 38 |
+
failure_type: str
|
| 39 |
+
failure_evidence: str
|
| 40 |
+
score: float
|
| 41 |
+
|
| 42 |
+
# Strategy used
|
| 43 |
+
strategy: str
|
| 44 |
+
action_sequence_hash: str # Compact fingerprint of the action pattern
|
| 45 |
+
|
| 46 |
+
# Lesson extracted
|
| 47 |
+
lesson_title: str
|
| 48 |
+
lesson_body: str # Full explanation of what went wrong
|
| 49 |
+
lesson_hint: str # Compact hint to inject into future prompts
|
| 50 |
+
lesson_plan: List[str] # Step-by-step corrective plan
|
| 51 |
+
|
| 52 |
+
# Retrieval metadata
|
| 53 |
+
relevance_tags: List[str] # Tags for retrieval (task1, write_file, read_before_write...)
|
| 54 |
+
times_retrieved: int = 0
|
| 55 |
+
times_helpful: int = 0 # Incremented when retry after this lesson improved score
|
| 56 |
+
|
| 57 |
+
def to_dict(self) -> dict:
|
| 58 |
+
return asdict(self)
|
| 59 |
+
|
| 60 |
+
@classmethod
|
| 61 |
+
def from_dict(cls, d: dict) -> "MemoryEntry":
|
| 62 |
+
return cls(**d)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@dataclass
|
| 66 |
+
class MemoryContext:
|
| 67 |
+
"""Injected memory context for a new episode."""
|
| 68 |
+
relevant_lessons: List[MemoryEntry]
|
| 69 |
+
system_prompt_injection: str # Full text to prepend to system prompt
|
| 70 |
+
user_context_injection: str # Full text to prepend to first user message
|
| 71 |
+
lessons_count: int
|
| 72 |
+
most_relevant_lesson: Optional[str]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class EpisodicMemoryBank:
|
| 76 |
+
"""
|
| 77 |
+
Persistent cross-episode memory bank.
|
| 78 |
+
|
| 79 |
+
Storage: JSON file on disk (or in-memory for Gradio sessions).
|
| 80 |
+
Each entry is a MemoryEntry with lesson + retrieval metadata.
|
| 81 |
+
|
| 82 |
+
Usage:
|
| 83 |
+
bank = EpisodicMemoryBank(persist_path="memory.json")
|
| 84 |
+
# After an episode:
|
| 85 |
+
bank.store(episode_result)
|
| 86 |
+
# Before next episode:
|
| 87 |
+
context = bank.retrieve(task="task1", max_lessons=3)
|
| 88 |
+
# Inject context.system_prompt_injection into agent
|
| 89 |
+
"""
|
| 90 |
+
|
| 91 |
+
MAX_ENTRIES = 50 # Keep last 50 lessons per task
|
| 92 |
+
|
| 93 |
+
def __init__(self, persist_path: Optional[str] = None):
|
| 94 |
+
self.persist_path = persist_path
|
| 95 |
+
self._entries: List[MemoryEntry] = []
|
| 96 |
+
if persist_path and os.path.exists(persist_path):
|
| 97 |
+
self._load()
|
| 98 |
+
|
| 99 |
+
def store(
|
| 100 |
+
self,
|
| 101 |
+
episode_id: str,
|
| 102 |
+
task: str,
|
| 103 |
+
failure_type: str,
|
| 104 |
+
failure_evidence: str,
|
| 105 |
+
score: float,
|
| 106 |
+
strategy: str,
|
| 107 |
+
trajectory_steps: List[dict],
|
| 108 |
+
improvement_plan: Optional[dict] = None,
|
| 109 |
+
) -> MemoryEntry:
|
| 110 |
+
"""Store a lesson from a completed episode."""
|
| 111 |
+
# Build action fingerprint
|
| 112 |
+
actions = [s.get("action_type", "?") for s in trajectory_steps]
|
| 113 |
+
seq_str = "โ".join(actions[:12])
|
| 114 |
+
seq_hash = hashlib.md5(seq_str.encode()).hexdigest()[:8]
|
| 115 |
+
|
| 116 |
+
# Relevance tags for retrieval
|
| 117 |
+
tags = [task, failure_type, strategy]
|
| 118 |
+
if "read_file" in actions:
|
| 119 |
+
tags.append("read_file")
|
| 120 |
+
if "write_file" in actions:
|
| 121 |
+
tags.append("write_file")
|
| 122 |
+
if "run_tests" not in actions:
|
| 123 |
+
tags.append("no_verification")
|
| 124 |
+
if len(actions) <= 3:
|
| 125 |
+
tags.append("too_short")
|
| 126 |
+
|
| 127 |
+
# Extract lesson from improvement plan or failure type
|
| 128 |
+
if improvement_plan:
|
| 129 |
+
lesson_title = improvement_plan.get("failure_type", failure_type)
|
| 130 |
+
lesson_body = improvement_plan.get("what_went_wrong", "Agent failed.")
|
| 131 |
+
lesson_hint = improvement_plan.get("system_prompt_addon", "")
|
| 132 |
+
lesson_plan = improvement_plan.get("step_by_step_plan", [])
|
| 133 |
+
else:
|
| 134 |
+
lesson_title, lesson_body, lesson_hint, lesson_plan = self._default_lesson(
|
| 135 |
+
failure_type, score, strategy
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
entry = MemoryEntry(
|
| 139 |
+
entry_id=f"{task}_{seq_hash}_{int(time.time())}",
|
| 140 |
+
episode_id=episode_id,
|
| 141 |
+
task=task,
|
| 142 |
+
created_at=time.time(),
|
| 143 |
+
failure_type=failure_type,
|
| 144 |
+
failure_evidence=failure_evidence[:200],
|
| 145 |
+
score=score,
|
| 146 |
+
strategy=strategy,
|
| 147 |
+
action_sequence_hash=seq_hash,
|
| 148 |
+
lesson_title=lesson_title,
|
| 149 |
+
lesson_body=lesson_body,
|
| 150 |
+
lesson_hint=lesson_hint,
|
| 151 |
+
lesson_plan=lesson_plan,
|
| 152 |
+
relevance_tags=tags,
|
| 153 |
+
times_retrieved=0,
|
| 154 |
+
times_helpful=0,
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
self._entries.append(entry)
|
| 158 |
+
self._trim()
|
| 159 |
+
if self.persist_path:
|
| 160 |
+
self._save()
|
| 161 |
+
return entry
|
| 162 |
+
|
| 163 |
+
def retrieve(
|
| 164 |
+
self,
|
| 165 |
+
task: str,
|
| 166 |
+
failure_type: Optional[str] = None,
|
| 167 |
+
strategy: Optional[str] = None,
|
| 168 |
+
max_lessons: int = 3,
|
| 169 |
+
) -> MemoryContext:
|
| 170 |
+
"""Retrieve relevant lessons for an upcoming episode."""
|
| 171 |
+
if not self._entries:
|
| 172 |
+
return MemoryContext(
|
| 173 |
+
relevant_lessons=[],
|
| 174 |
+
system_prompt_injection="",
|
| 175 |
+
user_context_injection="",
|
| 176 |
+
lessons_count=0,
|
| 177 |
+
most_relevant_lesson=None,
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
# Score each entry by relevance
|
| 181 |
+
scored: List[tuple[float, MemoryEntry]] = []
|
| 182 |
+
for e in self._entries:
|
| 183 |
+
score = 0.0
|
| 184 |
+
if e.task == task:
|
| 185 |
+
score += 3.0
|
| 186 |
+
elif task in e.relevance_tags:
|
| 187 |
+
score += 2.0
|
| 188 |
+
if failure_type and e.failure_type == failure_type:
|
| 189 |
+
score += 2.0
|
| 190 |
+
if strategy and e.strategy == strategy:
|
| 191 |
+
score += 1.0
|
| 192 |
+
# Penalize already-retrieved lessons slightly (freshness)
|
| 193 |
+
score -= e.times_retrieved * 0.1
|
| 194 |
+
# Boost low-score lessons (more informative failures)
|
| 195 |
+
score += max(0, 0.5 - e.score)
|
| 196 |
+
scored.append((score, e))
|
| 197 |
+
|
| 198 |
+
scored.sort(key=lambda x: -x[0])
|
| 199 |
+
relevant = [e for _, e in scored[:max_lessons]]
|
| 200 |
+
|
| 201 |
+
# Mark as retrieved
|
| 202 |
+
for e in relevant:
|
| 203 |
+
e.times_retrieved += 1
|
| 204 |
+
|
| 205 |
+
if not relevant:
|
| 206 |
+
return MemoryContext(
|
| 207 |
+
relevant_lessons=[],
|
| 208 |
+
system_prompt_injection="",
|
| 209 |
+
user_context_injection="",
|
| 210 |
+
lessons_count=0,
|
| 211 |
+
most_relevant_lesson=None,
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
# Build injection text
|
| 215 |
+
sys_lines = [
|
| 216 |
+
"๐ง AGENT MEMORY โ LESSONS FROM PAST EPISODES",
|
| 217 |
+
"=" * 50,
|
| 218 |
+
"You have made these mistakes before. Do NOT repeat them.",
|
| 219 |
+
"",
|
| 220 |
+
]
|
| 221 |
+
for i, e in enumerate(relevant, 1):
|
| 222 |
+
sys_lines += [
|
| 223 |
+
f"[Lesson {i}] Task: {e.task} | Failure: {e.failure_type} | Score was: {e.score:.2f}",
|
| 224 |
+
f"What went wrong: {e.lesson_body}",
|
| 225 |
+
f"IMPORTANT: {e.lesson_hint}" if e.lesson_hint else "",
|
| 226 |
+
"",
|
| 227 |
+
]
|
| 228 |
+
sys_lines.append("=" * 50)
|
| 229 |
+
system_injection = "\n".join(l for l in sys_lines if l is not None)
|
| 230 |
+
|
| 231 |
+
user_lines = [
|
| 232 |
+
"[MEMORY CONTEXT โ Read before you act]",
|
| 233 |
+
]
|
| 234 |
+
for i, e in enumerate(relevant, 1):
|
| 235 |
+
user_lines.append(f"Past lesson {i}: {e.lesson_title}")
|
| 236 |
+
if e.lesson_plan:
|
| 237 |
+
user_lines.append("Correct approach:")
|
| 238 |
+
user_lines.extend(f" {step}" for step in e.lesson_plan[:4])
|
| 239 |
+
user_injection = "\n".join(user_lines)
|
| 240 |
+
|
| 241 |
+
return MemoryContext(
|
| 242 |
+
relevant_lessons=relevant,
|
| 243 |
+
system_prompt_injection=system_injection,
|
| 244 |
+
user_context_injection=user_injection,
|
| 245 |
+
lessons_count=len(relevant),
|
| 246 |
+
most_relevant_lesson=relevant[0].lesson_title if relevant else None,
|
| 247 |
+
)
|
| 248 |
+
|
| 249 |
+
def get_all_entries(self) -> List[dict]:
|
| 250 |
+
return [e.to_dict() for e in self._entries]
|
| 251 |
+
|
| 252 |
+
def get_stats(self) -> dict:
|
| 253 |
+
if not self._entries:
|
| 254 |
+
return {"total_entries": 0, "tasks": {}}
|
| 255 |
+
|
| 256 |
+
from collections import Counter
|
| 257 |
+
failure_counts = Counter(e.failure_type for e in self._entries)
|
| 258 |
+
task_counts = Counter(e.task for e in self._entries)
|
| 259 |
+
avg_score = sum(e.score for e in self._entries) / len(self._entries)
|
| 260 |
+
|
| 261 |
+
return {
|
| 262 |
+
"total_entries": len(self._entries),
|
| 263 |
+
"average_score_of_stored_episodes": round(avg_score, 3),
|
| 264 |
+
"failure_breakdown": dict(failure_counts.most_common()),
|
| 265 |
+
"tasks": dict(task_counts),
|
| 266 |
+
"most_helpful_lesson": max(self._entries, key=lambda e: e.times_helpful).lesson_title
|
| 267 |
+
if any(e.times_helpful > 0 for e in self._entries) else None,
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
def mark_helpful(self, episode_id: str):
|
| 271 |
+
"""Call this when a retry with a lesson improved the score."""
|
| 272 |
+
for e in self._entries:
|
| 273 |
+
if e.episode_id == episode_id:
|
| 274 |
+
e.times_helpful += 1
|
| 275 |
+
if self.persist_path:
|
| 276 |
+
self._save()
|
| 277 |
+
|
| 278 |
+
def clear(self, task: Optional[str] = None):
|
| 279 |
+
if task:
|
| 280 |
+
self._entries = [e for e in self._entries if e.task != task]
|
| 281 |
+
else:
|
| 282 |
+
self._entries = []
|
| 283 |
+
if self.persist_path:
|
| 284 |
+
self._save()
|
| 285 |
+
|
| 286 |
+
# โโ Persistence โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 287 |
+
|
| 288 |
+
def _save(self):
|
| 289 |
+
with open(self.persist_path, "w") as f:
|
| 290 |
+
json.dump([e.to_dict() for e in self._entries], f, indent=2)
|
| 291 |
+
|
| 292 |
+
def _load(self):
|
| 293 |
+
try:
|
| 294 |
+
with open(self.persist_path, "r") as f:
|
| 295 |
+
data = json.load(f)
|
| 296 |
+
self._entries = [MemoryEntry.from_dict(d) for d in data]
|
| 297 |
+
except Exception:
|
| 298 |
+
self._entries = []
|
| 299 |
+
|
| 300 |
+
def _trim(self):
|
| 301 |
+
"""Keep at most MAX_ENTRIES, dropping oldest low-score entries first."""
|
| 302 |
+
if len(self._entries) <= self.MAX_ENTRIES:
|
| 303 |
+
return
|
| 304 |
+
# Sort by: useful first, then by recency
|
| 305 |
+
self._entries.sort(
|
| 306 |
+
key=lambda e: (
|
| 307 |
+
-e.times_helpful,
|
| 308 |
+
-e.times_retrieved,
|
| 309 |
+
e.created_at,
|
| 310 |
+
),
|
| 311 |
+
reverse=True,
|
| 312 |
+
)
|
| 313 |
+
self._entries = self._entries[:self.MAX_ENTRIES]
|
| 314 |
+
|
| 315 |
+
def _default_lesson(
|
| 316 |
+
self, failure_type: str, score: float, strategy: str
|
| 317 |
+
) -> tuple[str, str, str, List[str]]:
|
| 318 |
+
lessons = {
|
| 319 |
+
"NEVER_TESTED": (
|
| 320 |
+
"Submitted without verification",
|
| 321 |
+
"Agent submitted code without running tests. No confidence in correctness.",
|
| 322 |
+
"CRITICAL: Run run_tests after EVERY write_file. Never submit without test verification.",
|
| 323 |
+
["1. Write fix", "2. run_tests to check", "3. If passing โ submit", "4. If failing โ re-read and fix"],
|
| 324 |
+
),
|
| 325 |
+
"BLIND_WRITE": (
|
| 326 |
+
"Wrote without reading",
|
| 327 |
+
"Agent wrote to a file without reading it first. Blind writes introduce new bugs.",
|
| 328 |
+
"NEVER use write_file before read_file on the same path.",
|
| 329 |
+
["1. read_file first", "2. Understand existing code", "3. Then write minimal fix"],
|
| 330 |
+
),
|
| 331 |
+
"WRONG_FILE_NAVIGATION": (
|
| 332 |
+
"Navigated to wrong files",
|
| 333 |
+
"Agent read files unrelated to the bug. Wasted steps and missed root cause.",
|
| 334 |
+
"ALWAYS start with the failing test file. Its imports show you exactly where to go.",
|
| 335 |
+
["1. Read failing test", "2. Find its imports", "3. Navigate ONLY there"],
|
| 336 |
+
),
|
| 337 |
+
"LOOPING_BEHAVIOR": (
|
| 338 |
+
"Read same files repeatedly",
|
| 339 |
+
f"Agent looped reading the same files without progress. Score={score:.2f}.",
|
| 340 |
+
"Each file may be read AT MOST ONCE. Use search_code if confused.",
|
| 341 |
+
["1. Use search_code with function name", "2. Read matched file โ once", "3. commit to fix"],
|
| 342 |
+
),
|
| 343 |
+
}
|
| 344 |
+
defaults = lessons.get(failure_type, (
|
| 345 |
+
f"{failure_type} failure",
|
| 346 |
+
f"Agent failed with type '{failure_type}', score={score:.2f}.",
|
| 347 |
+
"Read test โ read source โ fix โ run_tests โ submit.",
|
| 348 |
+
["1. read test", "2. read source", "3. write fix", "4. run_tests", "5. submit"],
|
| 349 |
+
))
|
| 350 |
+
return defaults
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# Global singleton (shared across the Gradio session)
|
| 354 |
+
_GLOBAL_MEMORY = EpisodicMemoryBank(
|
| 355 |
+
persist_path=os.path.join(
|
| 356 |
+
os.path.dirname(__file__), "..", "agent_memory.json"
|
| 357 |
+
)
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def get_global_memory() -> EpisodicMemoryBank:
|
| 362 |
+
return _GLOBAL_MEMORY
|
static/viz3d.html
CHANGED
|
@@ -6,862 +6,601 @@
|
|
| 6 |
<title>Agent Trajectory 3D Visualizer</title>
|
| 7 |
<style>
|
| 8 |
* { margin: 0; padding: 0; box-sizing: border-box; }
|
| 9 |
-
body {
|
|
|
|
| 10 |
background: #0a0e1a;
|
| 11 |
color: #e0e6f0;
|
| 12 |
font-family: 'Segoe UI', system-ui, sans-serif;
|
| 13 |
overflow: hidden;
|
| 14 |
-
height: 100vh;
|
| 15 |
}
|
| 16 |
-
#
|
| 17 |
-
position:
|
| 18 |
top: 0; left: 0;
|
| 19 |
width: 100%; height: 100%;
|
| 20 |
-
|
| 21 |
-
#ui-overlay {
|
| 22 |
-
position: absolute;
|
| 23 |
-
top: 0; left: 0;
|
| 24 |
-
width: 100%; height: 100%;
|
| 25 |
-
pointer-events: none;
|
| 26 |
-
z-index: 10;
|
| 27 |
}
|
| 28 |
/* Header */
|
| 29 |
#header {
|
| 30 |
-
position:
|
| 31 |
-
top:
|
| 32 |
transform: translateX(-50%);
|
| 33 |
text-align: center;
|
|
|
|
| 34 |
pointer-events: none;
|
| 35 |
}
|
| 36 |
#header h1 {
|
| 37 |
-
font-size:
|
| 38 |
-
font-weight: 700;
|
| 39 |
color: #7dd3fc;
|
| 40 |
letter-spacing: 0.05em;
|
| 41 |
-
text-shadow: 0 0
|
| 42 |
-
}
|
| 43 |
-
#header p {
|
| 44 |
-
font-size: 11px;
|
| 45 |
-
color: #64748b;
|
| 46 |
-
margin-top: 2px;
|
| 47 |
}
|
| 48 |
-
/*
|
| 49 |
-
|
| 50 |
-
position:
|
| 51 |
-
|
| 52 |
-
background: rgba(10,14,26,0.85);
|
| 53 |
border: 1px solid rgba(125,211,252,0.2);
|
| 54 |
-
border-radius:
|
| 55 |
padding: 10px 14px;
|
| 56 |
font-size: 11px;
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
#legend h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
|
| 60 |
-
.legend-item {
|
| 61 |
-
display: flex; align-items: center; gap: 8px;
|
| 62 |
-
margin-bottom: 5px;
|
| 63 |
}
|
| 64 |
-
.
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
}
|
| 69 |
/* Info panel */
|
| 70 |
-
#info-panel {
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
}
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
color: #94a3b8;
|
| 86 |
}
|
| 87 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
/* Timeline */
|
| 89 |
-
#timeline
|
| 90 |
-
position:
|
| 91 |
-
bottom: 20px; left: 50%;
|
| 92 |
transform: translateX(-50%);
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
border-radius: 10px;
|
| 96 |
-
padding: 14px 20px;
|
| 97 |
-
width: min(700px, 90vw);
|
| 98 |
-
pointer-events: all;
|
| 99 |
-
}
|
| 100 |
-
#timeline-panel .tl-header {
|
| 101 |
-
display: flex;
|
| 102 |
-
justify-content: space-between;
|
| 103 |
-
align-items: center;
|
| 104 |
-
margin-bottom: 10px;
|
| 105 |
-
}
|
| 106 |
-
#timeline-panel h3 {
|
| 107 |
-
font-size: 11px;
|
| 108 |
-
color: #7dd3fc;
|
| 109 |
-
letter-spacing: 0.1em;
|
| 110 |
-
}
|
| 111 |
-
#step-label {
|
| 112 |
-
font-size: 12px;
|
| 113 |
-
color: #f0abfc;
|
| 114 |
-
font-weight: 700;
|
| 115 |
}
|
| 116 |
-
#
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
| 120 |
background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
|
| 121 |
-
border-radius: 4px;
|
| 122 |
-
outline: none;
|
| 123 |
-
cursor: pointer;
|
| 124 |
-
}
|
| 125 |
-
#timeline-slider::-webkit-slider-thumb {
|
| 126 |
-
-webkit-appearance: none;
|
| 127 |
-
width: 16px; height: 16px;
|
| 128 |
-
border-radius: 50%;
|
| 129 |
-
background: #7dd3fc;
|
| 130 |
-
cursor: pointer;
|
| 131 |
-
box-shadow: 0 0 10px rgba(125,211,252,0.7);
|
| 132 |
}
|
| 133 |
-
#
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
justify-content: center;
|
| 138 |
}
|
| 139 |
-
|
|
|
|
| 140 |
background: rgba(125,211,252,0.1);
|
| 141 |
border: 1px solid rgba(125,211,252,0.3);
|
| 142 |
-
color: #7dd3fc;
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
cursor: pointer;
|
| 146 |
-
font-size: 11px;
|
| 147 |
-
transition: all 0.2s;
|
| 148 |
}
|
| 149 |
-
.
|
| 150 |
-
.
|
| 151 |
-
/* Step log */
|
| 152 |
-
#step-log {
|
| 153 |
-
position: absolute;
|
| 154 |
-
bottom: 130px; right: 16px;
|
| 155 |
-
background: rgba(10,14,26,0.85);
|
| 156 |
-
border: 1px solid rgba(125,211,252,0.2);
|
| 157 |
-
border-radius: 8px;
|
| 158 |
-
padding: 10px 14px;
|
| 159 |
-
width: 260px;
|
| 160 |
-
max-height: 240px;
|
| 161 |
-
overflow-y: auto;
|
| 162 |
-
pointer-events: none;
|
| 163 |
-
font-size: 10px;
|
| 164 |
-
}
|
| 165 |
-
#step-log h3 { color: #7dd3fc; margin-bottom: 8px; font-size: 11px; }
|
| 166 |
-
.log-entry {
|
| 167 |
-
display: flex;
|
| 168 |
-
align-items: flex-start;
|
| 169 |
-
gap: 6px;
|
| 170 |
-
margin-bottom: 6px;
|
| 171 |
-
padding-bottom: 6px;
|
| 172 |
-
border-bottom: 1px solid rgba(255,255,255,0.05);
|
| 173 |
-
}
|
| 174 |
-
.log-entry:last-child { border-bottom: none; }
|
| 175 |
-
.log-step { color: #475569; min-width: 28px; }
|
| 176 |
-
.log-action { font-weight: 600; }
|
| 177 |
-
.log-reward { margin-left: auto; font-weight: 700; }
|
| 178 |
-
.reward-pos { color: #4ade80; }
|
| 179 |
-
.reward-neg { color: #f87171; }
|
| 180 |
-
.reward-zero { color: #94a3b8; }
|
| 181 |
/* Tooltip */
|
| 182 |
#tooltip {
|
| 183 |
-
position:
|
| 184 |
background: rgba(10,14,26,0.95);
|
| 185 |
border: 1px solid rgba(125,211,252,0.4);
|
| 186 |
-
border-radius: 6px;
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
opacity: 0;
|
| 191 |
-
transition: opacity 0.15s;
|
| 192 |
-
max-width: 200px;
|
| 193 |
-
z-index: 20;
|
| 194 |
-
}
|
| 195 |
-
#tooltip h4 { color: #7dd3fc; margin-bottom: 4px; }
|
| 196 |
-
/* Score ring */
|
| 197 |
-
#score-ring {
|
| 198 |
-
position: absolute;
|
| 199 |
-
bottom: 130px; left: 16px;
|
| 200 |
-
pointer-events: none;
|
| 201 |
}
|
| 202 |
-
#
|
| 203 |
/* Loader */
|
| 204 |
#loader {
|
| 205 |
-
position:
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
color: #7dd3fc;
|
| 209 |
-
font-size: 14px;
|
| 210 |
-
text-align: center;
|
| 211 |
}
|
| 212 |
-
.
|
| 213 |
-
width:
|
| 214 |
-
border: 3px solid rgba(125,211,252,0.
|
| 215 |
border-top-color: #7dd3fc;
|
| 216 |
border-radius: 50%;
|
| 217 |
-
animation:
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
}
|
| 220 |
-
@keyframes spin { to { transform: rotate(360deg); } }
|
| 221 |
</style>
|
| 222 |
</head>
|
| 223 |
<body>
|
| 224 |
|
| 225 |
-
<
|
| 226 |
-
<div id="viz-data" style="display:none"></div>
|
| 227 |
|
| 228 |
-
<div id="
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
| 230 |
</div>
|
| 231 |
|
| 232 |
-
<div id="
|
| 233 |
-
<div class="loader-spinner"></div>
|
| 234 |
-
<p>Initializing 3D Visualizer...</p>
|
| 235 |
-
</div>
|
| 236 |
|
| 237 |
-
<
|
| 238 |
-
|
| 239 |
-
<
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
</div>
|
|
|
|
|
|
|
|
|
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
<
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
<div class="legend-item">
|
| 271 |
-
<div class="legend-dot" style="background:#ef4444"></div><span>Modified / Bug</span>
|
| 272 |
-
</div>
|
| 273 |
-
<div class="legend-item">
|
| 274 |
-
<div class="legend-dot" style="background:#facc15; width:20px; height:4px; border-radius:2px;"></div><span>Agent path</span>
|
| 275 |
-
</div>
|
| 276 |
-
</div>
|
| 277 |
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
<circle id="score-arc" cx="40" cy="40" r="34" fill="none"
|
| 284 |
-
stroke="#7dd3fc" stroke-width="6"
|
| 285 |
-
stroke-dasharray="0 214"
|
| 286 |
-
stroke-linecap="round"
|
| 287 |
-
transform="rotate(-90 40 40)"
|
| 288 |
-
style="transition: stroke-dasharray 1s ease;"/>
|
| 289 |
-
<text id="score-text" x="40" y="45" text-anchor="middle"
|
| 290 |
-
fill="#e0e6f0" font-size="14" font-weight="700">0.0</text>
|
| 291 |
-
</svg>
|
| 292 |
-
</div>
|
| 293 |
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
<h3>STEP LOG</h3>
|
| 297 |
-
<div id="log-entries"></div>
|
| 298 |
-
</div>
|
| 299 |
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
<
|
|
|
|
| 304 |
</div>
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
<div id="
|
| 308 |
-
<
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
</
|
| 312 |
-
<
|
| 313 |
-
oninput="onSliderChange(this.value)">
|
| 314 |
-
<div id="step-actions">
|
| 315 |
-
<button class="tl-btn" onclick="stepBack()">โ Back</button>
|
| 316 |
-
<button class="tl-btn" id="play-btn" onclick="togglePlay()">โถ Play</button>
|
| 317 |
-
<button class="tl-btn" onclick="stepForward()">Forward โถ</button>
|
| 318 |
-
<button class="tl-btn" onclick="resetView()">โบ Reset</button>
|
| 319 |
-
<button class="tl-btn" id="orbit-btn" onclick="toggleOrbit()">๐ Orbit</button>
|
| 320 |
-
</div>
|
| 321 |
</div>
|
| 322 |
</div>
|
| 323 |
|
| 324 |
-
<!-- Three.js from CDN -->
|
| 325 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
|
| 326 |
<script>
|
| 327 |
-
// โโ
|
| 328 |
-
const DEFAULT_DATA = {
|
| 329 |
-
task: "task1",
|
| 330 |
-
variant_id: "variant_1",
|
| 331 |
-
final_score: 0.714,
|
| 332 |
-
strategy: "TARGETED_DEBUGGING",
|
| 333 |
-
failure_type: "CORRECT",
|
| 334 |
-
files: [
|
| 335 |
-
{ name: "tests/test_formatter.py", type: "test" },
|
| 336 |
-
{ name: "src/formatter.py", type: "src", is_bug_file: true },
|
| 337 |
-
{ name: "src/utils.py", type: "src" }
|
| 338 |
-
],
|
| 339 |
-
dependencies: [
|
| 340 |
-
{ from: "tests/test_formatter.py", to: "src/formatter.py" },
|
| 341 |
-
{ from: "src/formatter.py", to: "src/utils.py" }
|
| 342 |
-
],
|
| 343 |
-
steps: [
|
| 344 |
-
{ step: 1, action: "read_file", path: "tests/test_formatter.py", reward: 0.0 },
|
| 345 |
-
{ step: 2, action: "read_file", path: "src/formatter.py", reward: 0.05 },
|
| 346 |
-
{ step: 3, action: "search_code", path: null, reward: 0.0 },
|
| 347 |
-
{ step: 4, action: "run_tests", path: "tests/test_formatter.py", reward: 0.0 },
|
| 348 |
-
{ step: 5, action: "submit", path: null, reward: 0.694 }
|
| 349 |
-
]
|
| 350 |
-
};
|
| 351 |
-
|
| 352 |
-
// โโ Load data from injection point or use default โโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 353 |
-
function loadVizData() {
|
| 354 |
-
const el = document.getElementById('viz-data');
|
| 355 |
-
if (el && el.textContent.trim()) {
|
| 356 |
-
try { return JSON.parse(el.textContent); } catch(e) {}
|
| 357 |
-
}
|
| 358 |
-
return DEFAULT_DATA;
|
| 359 |
-
}
|
| 360 |
-
|
| 361 |
-
// โโ Three.js setup โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 362 |
const canvas = document.getElementById('three-canvas');
|
| 363 |
-
const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha:
|
| 364 |
-
renderer.setSize(window.innerWidth, window.innerHeight);
|
| 365 |
renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
|
| 366 |
renderer.setClearColor(0x0a0e1a, 1);
|
| 367 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
const scene = new THREE.Scene();
|
| 369 |
-
const
|
| 370 |
-
|
| 371 |
-
camera.position.set(0, 8, 22);
|
| 372 |
camera.lookAt(0, 0, 0);
|
|
|
|
| 373 |
|
| 374 |
-
//
|
| 375 |
-
scene.add(new THREE.AmbientLight(0x1a2040, 1));
|
| 376 |
-
const
|
| 377 |
-
|
| 378 |
-
scene.add(
|
| 379 |
|
| 380 |
// Grid
|
| 381 |
-
const grid = new THREE.GridHelper(
|
| 382 |
-
grid.position.y = -3;
|
| 383 |
scene.add(grid);
|
| 384 |
|
| 385 |
// Stars
|
| 386 |
-
|
| 387 |
-
const
|
| 388 |
-
const
|
| 389 |
-
for (let i = 0; i <
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
// โโ
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
};
|
| 406 |
-
|
| 407 |
-
// โโ Orbit control (manual implementation) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 408 |
-
let isOrbiting = false;
|
| 409 |
-
let orbitActive = false;
|
| 410 |
-
let mouse = { x: 0, y: 0, down: false, lastX: 0, lastY: 0 };
|
| 411 |
-
let spherical = { theta: 0, phi: Math.PI / 4, r: 22 };
|
| 412 |
-
|
| 413 |
-
canvas.addEventListener('mousedown', e => { mouse.down = true; mouse.lastX = e.clientX; mouse.lastY = e.clientY; });
|
| 414 |
-
canvas.addEventListener('mouseup', () => { mouse.down = false; });
|
| 415 |
-
canvas.addEventListener('mousemove', e => {
|
| 416 |
-
if (!mouse.down) {
|
| 417 |
-
// Hover for tooltip
|
| 418 |
-
checkHover(e.clientX, e.clientY);
|
| 419 |
-
return;
|
| 420 |
-
}
|
| 421 |
-
const dx = e.clientX - mouse.lastX;
|
| 422 |
-
const dy = e.clientY - mouse.lastY;
|
| 423 |
-
spherical.theta -= dx * 0.005;
|
| 424 |
-
spherical.phi = Math.max(0.1, Math.min(Math.PI / 2, spherical.phi - dy * 0.005));
|
| 425 |
-
mouse.lastX = e.clientX;
|
| 426 |
-
mouse.lastY = e.clientY;
|
| 427 |
});
|
| 428 |
canvas.addEventListener('wheel', e => {
|
| 429 |
-
|
| 430 |
});
|
| 431 |
|
| 432 |
function updateCamera() {
|
| 433 |
-
if (
|
| 434 |
-
|
| 435 |
-
camera.position.
|
| 436 |
-
|
|
|
|
|
|
|
|
|
|
| 437 |
camera.lookAt(0, 0, 0);
|
| 438 |
}
|
| 439 |
|
| 440 |
-
// โโ Scene
|
| 441 |
-
const
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
let
|
| 445 |
-
let
|
| 446 |
-
let currentStep = 0;
|
| 447 |
-
let maxStep = 0;
|
| 448 |
-
let playing = false;
|
| 449 |
-
let playInterval = null;
|
| 450 |
let vizData = null;
|
| 451 |
-
let
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
| 453 |
-
// โโ Build scene from data โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 454 |
function buildScene(data) {
|
|
|
|
| 455 |
vizData = data;
|
| 456 |
-
|
| 457 |
-
// Clear previous objects
|
| 458 |
-
Object.values(nodeObjects).forEach(o => scene.remove(o.mesh));
|
| 459 |
-
edgeObjects.forEach(e => scene.remove(e));
|
| 460 |
-
pathObjects.forEach(p => scene.remove(p));
|
| 461 |
-
if (agentSphere) scene.remove(agentSphere);
|
| 462 |
-
Object.keys(nodeObjects).forEach(k => delete nodeObjects[k]);
|
| 463 |
-
|
| 464 |
const files = data.files || [];
|
| 465 |
const n = files.length;
|
| 466 |
-
if (n
|
| 467 |
-
|
| 468 |
-
//
|
| 469 |
-
files.forEach((
|
| 470 |
-
const angle = (i / n) * Math.PI * 2;
|
| 471 |
-
const
|
| 472 |
-
const x = Math.cos(angle) *
|
| 473 |
-
const z = Math.sin(angle) *
|
| 474 |
-
const
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
const
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
file.type === 'spec' ? COLORS.spec : COLORS.src
|
| 484 |
-
);
|
| 485 |
const mat = new THREE.MeshPhongMaterial({
|
| 486 |
-
color,
|
| 487 |
-
|
| 488 |
-
shininess: 60,
|
| 489 |
-
transparent: true,
|
| 490 |
-
opacity: 0.9,
|
| 491 |
});
|
| 492 |
const mesh = new THREE.Mesh(geo, mat);
|
| 493 |
-
mesh.position.
|
| 494 |
-
mesh.userData = { file };
|
| 495 |
scene.add(mesh);
|
| 496 |
|
| 497 |
-
//
|
| 498 |
-
const
|
| 499 |
-
const
|
| 500 |
-
|
| 501 |
-
transparent: true,
|
| 502 |
-
opacity: 0.25,
|
| 503 |
-
side: THREE.DoubleSide,
|
| 504 |
-
});
|
| 505 |
-
const ring = new THREE.Mesh(ringGeo, ringMat);
|
| 506 |
ring.rotation.x = Math.PI / 2;
|
| 507 |
mesh.add(ring);
|
| 508 |
|
| 509 |
-
|
| 510 |
});
|
| 511 |
|
| 512 |
-
//
|
| 513 |
(data.dependencies || []).forEach(dep => {
|
| 514 |
-
const
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
const points = [fromPos.clone(), toPos.clone()];
|
| 519 |
-
const geo = new THREE.BufferGeometry().setFromPoints(points);
|
| 520 |
-
const mat = new THREE.LineBasicMaterial({
|
| 521 |
-
color: COLORS.edge,
|
| 522 |
-
transparent: true,
|
| 523 |
-
opacity: 0.4,
|
| 524 |
-
});
|
| 525 |
const line = new THREE.Line(geo, mat);
|
| 526 |
scene.add(line);
|
| 527 |
-
|
| 528 |
});
|
| 529 |
|
| 530 |
-
// Agent
|
| 531 |
-
const
|
| 532 |
-
const
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
// Update UI
|
| 543 |
-
document.getElementById('stat-task').textContent = data.task || 'โ';
|
| 544 |
-
document.getElementById('stat-variant').textContent = data.variant_id || 'โ';
|
| 545 |
-
document.getElementById('stat-steps').textContent = (data.steps || []).length;
|
| 546 |
-
document.getElementById('stat-strategy').textContent = data.strategy || 'โ';
|
| 547 |
-
document.getElementById('stat-failure').textContent = data.failure_type || 'โ';
|
| 548 |
updateScore(data.final_score || 0);
|
| 549 |
-
updateStepLog(data.steps || [], -1);
|
| 550 |
|
| 551 |
-
// Setup timeline
|
| 552 |
maxStep = (data.steps || []).length;
|
| 553 |
-
const
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
updateStepLabel(0, maxStep);
|
| 558 |
-
|
| 559 |
applyStep(0);
|
| 560 |
}
|
| 561 |
|
| 562 |
-
// โโ
|
| 563 |
-
function applyStep(
|
| 564 |
if (!vizData) return;
|
| 565 |
const steps = vizData.steps || [];
|
| 566 |
-
const visitedFiles = new Set();
|
| 567 |
-
const modifiedFiles = new Set();
|
| 568 |
|
| 569 |
// Reset all nodes
|
| 570 |
-
Object.values(
|
| 571 |
-
const
|
| 572 |
-
const
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
);
|
| 577 |
-
obj.mesh.material.color.set(baseColor);
|
| 578 |
-
obj.mesh.material.emissive.set(baseColor.clone().multiplyScalar(0.2));
|
| 579 |
-
obj.mesh.material.opacity = 0.5;
|
| 580 |
-
obj.mesh.scale.set(1, 1, 1);
|
| 581 |
});
|
| 582 |
|
| 583 |
// Remove old path lines
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
// Collect positions for path
|
| 588 |
-
const
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
if (
|
| 595 |
-
const
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
if (step.action === 'read_file') visitedFiles.add(step.path);
|
| 599 |
-
if (step.action === 'write_file') modifiedFiles.add(step.path);
|
| 600 |
}
|
|
|
|
|
|
|
| 601 |
}
|
| 602 |
|
| 603 |
-
// Color visited
|
| 604 |
-
|
| 605 |
-
if (
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
);
|
| 610 |
-
nodeObjects[name].mesh.material.opacity = 1.0;
|
| 611 |
-
nodeObjects[name].mesh.scale.set(1.2, 1.2, 1.2);
|
| 612 |
}
|
| 613 |
});
|
| 614 |
-
|
| 615 |
-
if (
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
);
|
| 620 |
-
nodeObjects[name].mesh.material.opacity = 1.0;
|
| 621 |
-
nodeObjects[name].mesh.scale.set(1.4, 1.4, 1.4);
|
| 622 |
}
|
| 623 |
});
|
| 624 |
|
| 625 |
-
//
|
| 626 |
-
if (
|
| 627 |
-
const
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
opacity: 0.85,
|
| 632 |
-
linewidth: 2,
|
| 633 |
-
});
|
| 634 |
-
const pathLine = new THREE.Line(pathGeo, pathMat);
|
| 635 |
-
scene.add(pathLine);
|
| 636 |
-
pathObjects.push(pathLine);
|
| 637 |
}
|
| 638 |
|
| 639 |
-
//
|
| 640 |
-
if (
|
| 641 |
-
const
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
} else {
|
| 646 |
-
// No file target โ float in center (for search/submit actions)
|
| 647 |
-
agentSphere.position.set(0, 2.5, 0);
|
| 648 |
-
}
|
| 649 |
-
} else {
|
| 650 |
-
agentSphere.position.set(0, 3.5, 0);
|
| 651 |
}
|
| 652 |
|
| 653 |
-
//
|
| 654 |
-
if (
|
| 655 |
-
const cur = steps[
|
| 656 |
-
if (cur && cur.path &&
|
| 657 |
-
|
|
|
|
|
|
|
|
|
|
| 658 |
}
|
|
|
|
|
|
|
| 659 |
}
|
| 660 |
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
/
|
| 665 |
-
const slider = document.getElementById('timeline-slider');
|
| 666 |
-
const pct = maxStep > 0 ? (stepIndex / maxStep * 100) : 0;
|
| 667 |
-
slider.style.setProperty('--pct', pct + '%');
|
| 668 |
}
|
| 669 |
|
| 670 |
-
// โโ Score ring โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 671 |
-
function updateScore(
|
| 672 |
-
const
|
| 673 |
-
const arc =
|
| 674 |
-
document.getElementById('score-arc').setAttribute(
|
| 675 |
-
|
| 676 |
-
);
|
| 677 |
-
|
| 678 |
-
document.getElementById('
|
| 679 |
-
|
| 680 |
-
// Color by score
|
| 681 |
-
const color = score >= 0.7 ? '#4ade80' : score >= 0.4 ? '#fbbf24' : '#f87171';
|
| 682 |
-
document.getElementById('score-arc').setAttribute('stroke', color);
|
| 683 |
}
|
| 684 |
|
| 685 |
-
// โโ Step log โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 686 |
-
function
|
| 687 |
-
const
|
|
|
|
| 688 |
container.innerHTML = '';
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
const active = i === currentIdx;
|
| 700 |
-
const past = i < currentIdx;
|
| 701 |
-
const entry = document.createElement('div');
|
| 702 |
-
entry.className = 'log-entry';
|
| 703 |
-
entry.style.opacity = past ? '0.6' : active ? '1' : '0.35';
|
| 704 |
-
if (active) entry.style.background = 'rgba(125,211,252,0.08)';
|
| 705 |
-
|
| 706 |
-
const reward = step.reward || 0;
|
| 707 |
-
const rewardClass = reward > 0 ? 'reward-pos' : reward < 0 ? 'reward-neg' : 'reward-zero';
|
| 708 |
-
const emoji = ACTION_EMOJI[step.action] || 'โข';
|
| 709 |
-
const path = step.path ? step.path.split('/').pop() : step.action;
|
| 710 |
-
|
| 711 |
-
entry.innerHTML = `
|
| 712 |
-
<span class="log-step">S${step.step}</span>
|
| 713 |
-
<span class="log-action" style="color:${active ? '#7dd3fc' : '#94a3b8'}">${emoji} ${path}</span>
|
| 714 |
-
<span class="log-reward ${rewardClass}">${reward > 0 ? '+' : ''}${reward.toFixed(2)}</span>
|
| 715 |
-
`;
|
| 716 |
-
container.appendChild(entry);
|
| 717 |
});
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
if (currentIdx >= 0) {
|
| 721 |
-
const entries = container.children;
|
| 722 |
-
if (entries[currentIdx]) {
|
| 723 |
-
entries[currentIdx].scrollIntoView({ block: 'nearest' });
|
| 724 |
-
}
|
| 725 |
}
|
| 726 |
}
|
| 727 |
|
| 728 |
-
// โโ Hover tooltip โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 729 |
-
const
|
| 730 |
-
const
|
| 731 |
-
const
|
| 732 |
|
| 733 |
function checkHover(mx, my) {
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
const
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
document.getElementById('tooltip-title').textContent = file.name;
|
| 748 |
-
document.getElementById('tooltip-body').innerHTML = `
|
| 749 |
-
Type: ${file.type}<br>
|
| 750 |
-
${file.is_bug_file ? 'โ ๏ธ Bug location' : ''}
|
| 751 |
-
`;
|
| 752 |
-
}
|
| 753 |
} else {
|
| 754 |
-
|
| 755 |
-
}
|
| 756 |
-
}
|
| 757 |
-
|
| 758 |
-
// โโ Timeline controls โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 759 |
-
function onSliderChange(val) {
|
| 760 |
-
currentStep = parseInt(val);
|
| 761 |
-
applyStep(currentStep);
|
| 762 |
-
}
|
| 763 |
-
|
| 764 |
-
function stepForward() {
|
| 765 |
-
if (currentStep < maxStep) {
|
| 766 |
-
currentStep++;
|
| 767 |
-
document.getElementById('timeline-slider').value = currentStep;
|
| 768 |
-
applyStep(currentStep);
|
| 769 |
-
}
|
| 770 |
-
}
|
| 771 |
-
|
| 772 |
-
function stepBack() {
|
| 773 |
-
if (currentStep > 0) {
|
| 774 |
-
currentStep--;
|
| 775 |
-
document.getElementById('timeline-slider').value = currentStep;
|
| 776 |
-
applyStep(currentStep);
|
| 777 |
}
|
| 778 |
}
|
| 779 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 780 |
function togglePlay() {
|
| 781 |
playing = !playing;
|
| 782 |
-
|
| 783 |
-
btn.textContent = playing ? 'โธ Pause' : 'โถ Play';
|
| 784 |
if (playing) {
|
| 785 |
-
if (
|
| 786 |
-
|
| 787 |
-
if (
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
clearInterval(playInterval);
|
| 791 |
-
return;
|
| 792 |
-
}
|
| 793 |
-
stepForward();
|
| 794 |
-
}, 900);
|
| 795 |
} else {
|
| 796 |
-
clearInterval(
|
| 797 |
}
|
| 798 |
}
|
| 799 |
-
|
| 800 |
function toggleOrbit() {
|
| 801 |
-
|
| 802 |
const btn = document.getElementById('orbit-btn');
|
| 803 |
-
btn.textContent =
|
| 804 |
-
btn.classList.toggle('active',
|
| 805 |
}
|
| 806 |
-
|
| 807 |
function resetView() {
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
document.getElementById('
|
| 811 |
applyStep(0);
|
| 812 |
}
|
|
|
|
| 813 |
|
| 814 |
-
|
| 815 |
-
document.getElementById('step-label').textContent = `Step ${step} / ${max}`;
|
| 816 |
-
}
|
| 817 |
-
|
| 818 |
-
// โโ Animation loop โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 819 |
-
let frame = 0;
|
| 820 |
function animate() {
|
| 821 |
requestAnimationFrame(animate);
|
| 822 |
frame++;
|
| 823 |
-
|
| 824 |
updateCamera();
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
Object.values(nodeObjects).forEach((obj, i) => {
|
| 835 |
-
obj.mesh.position.y = obj.position.y + Math.sin(frame * 0.02 + i) * 0.05;
|
| 836 |
});
|
| 837 |
-
|
| 838 |
renderer.render(scene, camera);
|
| 839 |
}
|
|
|
|
| 840 |
|
| 841 |
-
// โโ
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
renderer.setSize(window.innerWidth, window.innerHeight);
|
| 846 |
-
});
|
| 847 |
-
|
| 848 |
-
// โโ Public API for Gradio integration โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 849 |
-
window.loadTrajectoryData = function(jsonData) {
|
| 850 |
try {
|
| 851 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 852 |
buildScene(data);
|
|
|
|
| 853 |
} catch(e) {
|
| 854 |
-
|
|
|
|
| 855 |
}
|
| 856 |
-
}
|
| 857 |
|
| 858 |
-
// โโ
|
| 859 |
-
|
| 860 |
-
|
| 861 |
buildScene(data);
|
| 862 |
document.getElementById('loader').style.display = 'none';
|
| 863 |
-
|
| 864 |
-
}
|
|
|
|
|
|
|
|
|
|
| 865 |
</script>
|
| 866 |
</body>
|
| 867 |
</html>
|
|
|
|
| 6 |
<title>Agent Trajectory 3D Visualizer</title>
|
| 7 |
<style>
|
| 8 |
* { margin: 0; padding: 0; box-sizing: border-box; }
|
| 9 |
+
html, body {
|
| 10 |
+
width: 100%; height: 100%;
|
| 11 |
background: #0a0e1a;
|
| 12 |
color: #e0e6f0;
|
| 13 |
font-family: 'Segoe UI', system-ui, sans-serif;
|
| 14 |
overflow: hidden;
|
|
|
|
| 15 |
}
|
| 16 |
+
#three-canvas {
|
| 17 |
+
position: fixed;
|
| 18 |
top: 0; left: 0;
|
| 19 |
width: 100%; height: 100%;
|
| 20 |
+
display: block;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
}
|
| 22 |
/* Header */
|
| 23 |
#header {
|
| 24 |
+
position: fixed;
|
| 25 |
+
top: 10px; left: 50%;
|
| 26 |
transform: translateX(-50%);
|
| 27 |
text-align: center;
|
| 28 |
+
z-index: 20;
|
| 29 |
pointer-events: none;
|
| 30 |
}
|
| 31 |
#header h1 {
|
| 32 |
+
font-size: 14px; font-weight: 700;
|
|
|
|
| 33 |
color: #7dd3fc;
|
| 34 |
letter-spacing: 0.05em;
|
| 35 |
+
text-shadow: 0 0 16px rgba(125,211,252,0.6);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
}
|
| 37 |
+
/* Panel base */
|
| 38 |
+
.panel {
|
| 39 |
+
position: fixed;
|
| 40 |
+
background: rgba(10,14,26,0.88);
|
|
|
|
| 41 |
border: 1px solid rgba(125,211,252,0.2);
|
| 42 |
+
border-radius: 10px;
|
| 43 |
padding: 10px 14px;
|
| 44 |
font-size: 11px;
|
| 45 |
+
z-index: 20;
|
| 46 |
+
backdrop-filter: blur(6px);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
}
|
| 48 |
+
.panel h3 {
|
| 49 |
+
font-size: 10px; letter-spacing: 0.1em;
|
| 50 |
+
color: #7dd3fc; margin-bottom: 8px;
|
| 51 |
+
text-transform: uppercase;
|
| 52 |
}
|
| 53 |
/* Info panel */
|
| 54 |
+
#info-panel { top: 10px; left: 14px; min-width: 190px; }
|
| 55 |
+
.info-row { display: flex; justify-content: space-between; gap: 10px; margin-bottom: 4px; color: #94a3b8; }
|
| 56 |
+
.info-val { color: #e0e6f0; font-weight: 600; max-width: 110px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
| 57 |
+
/* Legend */
|
| 58 |
+
#legend { top: 10px; right: 14px; }
|
| 59 |
+
.leg { display: flex; align-items: center; gap: 7px; margin-bottom: 5px; }
|
| 60 |
+
.leg-dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
|
| 61 |
+
.leg-line { width: 18px; height: 3px; border-radius: 2px; flex-shrink: 0; }
|
| 62 |
+
/* Score ring */
|
| 63 |
+
#score-ring { position: fixed; bottom: 150px; left: 14px; z-index: 20; }
|
| 64 |
+
/* Step log */
|
| 65 |
+
#step-log {
|
| 66 |
+
position: fixed; bottom: 150px; right: 14px;
|
| 67 |
+
width: 230px; max-height: 200px; overflow-y: auto;
|
| 68 |
+
z-index: 20;
|
|
|
|
| 69 |
}
|
| 70 |
+
.log-e { display: flex; gap: 5px; margin-bottom: 5px; padding-bottom: 5px; border-bottom: 1px solid rgba(255,255,255,0.05); font-size: 10px; }
|
| 71 |
+
.log-e:last-child { border-bottom: none; }
|
| 72 |
+
.log-s { color: #475569; min-width: 24px; }
|
| 73 |
+
.log-a { font-weight: 600; flex: 1; }
|
| 74 |
+
.rp { color: #4ade80; } .rn { color: #f87171; } .rz { color: #94a3b8; }
|
| 75 |
/* Timeline */
|
| 76 |
+
#timeline {
|
| 77 |
+
position: fixed; bottom: 16px; left: 50%;
|
|
|
|
| 78 |
transform: translateX(-50%);
|
| 79 |
+
width: min(680px, 92vw);
|
| 80 |
+
z-index: 20;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
}
|
| 82 |
+
#tl-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }
|
| 83 |
+
#tl-header h3 { font-size: 10px; color: #7dd3fc; letter-spacing: 0.1em; }
|
| 84 |
+
#step-label { font-size: 11px; color: #f0abfc; font-weight: 700; }
|
| 85 |
+
#slider {
|
| 86 |
+
width: 100%; -webkit-appearance: none; height: 4px;
|
| 87 |
background: linear-gradient(to right, #7dd3fc 0%, #7dd3fc var(--pct,0%), #1e293b var(--pct,0%));
|
| 88 |
+
border-radius: 4px; outline: none; cursor: pointer;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
}
|
| 90 |
+
#slider::-webkit-slider-thumb {
|
| 91 |
+
-webkit-appearance: none; width: 15px; height: 15px;
|
| 92 |
+
border-radius: 50%; background: #7dd3fc; cursor: pointer;
|
| 93 |
+
box-shadow: 0 0 8px rgba(125,211,252,0.8);
|
|
|
|
| 94 |
}
|
| 95 |
+
#tl-btns { display: flex; gap: 7px; margin-top: 8px; justify-content: center; }
|
| 96 |
+
.tb {
|
| 97 |
background: rgba(125,211,252,0.1);
|
| 98 |
border: 1px solid rgba(125,211,252,0.3);
|
| 99 |
+
color: #7dd3fc; padding: 4px 12px;
|
| 100 |
+
border-radius: 6px; cursor: pointer; font-size: 10px;
|
| 101 |
+
transition: all 0.15s;
|
|
|
|
|
|
|
|
|
|
| 102 |
}
|
| 103 |
+
.tb:hover { background: rgba(125,211,252,0.25); }
|
| 104 |
+
.tb.active { background: rgba(125,211,252,0.3); }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
/* Tooltip */
|
| 106 |
#tooltip {
|
| 107 |
+
position: fixed; z-index: 30;
|
| 108 |
background: rgba(10,14,26,0.95);
|
| 109 |
border: 1px solid rgba(125,211,252,0.4);
|
| 110 |
+
border-radius: 6px; padding: 7px 11px;
|
| 111 |
+
font-size: 10px; pointer-events: none;
|
| 112 |
+
opacity: 0; transition: opacity 0.1s;
|
| 113 |
+
max-width: 180px;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
}
|
| 115 |
+
#tt-title { color: #7dd3fc; margin-bottom: 3px; font-weight: 700; }
|
| 116 |
/* Loader */
|
| 117 |
#loader {
|
| 118 |
+
position: fixed; top: 50%; left: 50%;
|
| 119 |
+
transform: translate(-50%,-50%);
|
| 120 |
+
text-align: center; z-index: 50; color: #7dd3fc; font-size: 13px;
|
|
|
|
|
|
|
|
|
|
| 121 |
}
|
| 122 |
+
.spin {
|
| 123 |
+
width: 36px; height: 36px; margin: 0 auto 10px;
|
| 124 |
+
border: 3px solid rgba(125,211,252,0.15);
|
| 125 |
border-top-color: #7dd3fc;
|
| 126 |
border-radius: 50%;
|
| 127 |
+
animation: sp 0.7s linear infinite;
|
| 128 |
+
}
|
| 129 |
+
@keyframes sp { to { transform: rotate(360deg); } }
|
| 130 |
+
#no-data {
|
| 131 |
+
position: fixed; top: 50%; left: 50%;
|
| 132 |
+
transform: translate(-50%,-50%);
|
| 133 |
+
text-align: center; color: #475569; font-size: 13px;
|
| 134 |
+
display: none;
|
| 135 |
}
|
|
|
|
| 136 |
</style>
|
| 137 |
</head>
|
| 138 |
<body>
|
| 139 |
|
| 140 |
+
<canvas id="three-canvas"></canvas>
|
|
|
|
| 141 |
|
| 142 |
+
<div id="loader"><div class="spin"></div><p>Loading 3D...</p></div>
|
| 143 |
+
<div id="no-data">
|
| 144 |
+
<p style="font-size:28px;margin-bottom:12px">๐</p>
|
| 145 |
+
<p style="color:#7dd3fc;font-weight:700;margin-bottom:6px">No Episode Loaded</p>
|
| 146 |
+
<p>Run an episode first, then click<br><strong style="color:#7dd3fc">Load Trajectory</strong></p>
|
| 147 |
</div>
|
| 148 |
|
| 149 |
+
<div id="header"><h1>๐ Agent Trajectory Visualizer โ 3D</h1></div>
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
+
<!-- Info panel -->
|
| 152 |
+
<div class="panel" id="info-panel">
|
| 153 |
+
<h3>Episode Stats</h3>
|
| 154 |
+
<div class="info-row"><span>Task</span><span class="info-val" id="st-task">โ</span></div>
|
| 155 |
+
<div class="info-row"><span>Variant</span><span class="info-val" id="st-var">โ</span></div>
|
| 156 |
+
<div class="info-row"><span>Steps</span><span class="info-val" id="st-steps">โ</span></div>
|
| 157 |
+
<div class="info-row"><span>Score</span><span class="info-val" id="st-score">โ</span></div>
|
| 158 |
+
<div class="info-row"><span>Strategy</span><span class="info-val" id="st-strat">โ</span></div>
|
| 159 |
+
</div>
|
| 160 |
|
| 161 |
+
<!-- Legend -->
|
| 162 |
+
<div class="panel" id="legend">
|
| 163 |
+
<h3>Legend</h3>
|
| 164 |
+
<div class="leg"><div class="leg-dot" style="background:#f97316"></div><span>Source file</span></div>
|
| 165 |
+
<div class="leg"><div class="leg-dot" style="background:#3b82f6"></div><span>Test file</span></div>
|
| 166 |
+
<div class="leg"><div class="leg-dot" style="background:#a855f7"></div><span>Spec / Docs</span></div>
|
| 167 |
+
<div class="leg"><div class="leg-dot" style="background:#22c55e"></div><span>Visited</span></div>
|
| 168 |
+
<div class="leg"><div class="leg-dot" style="background:#ef4444"></div><span>Bug / Modified</span></div>
|
| 169 |
+
<div class="leg"><div class="leg-line" style="background:#facc15"></div><span>Agent path</span></div>
|
| 170 |
+
</div>
|
| 171 |
|
| 172 |
+
<!-- Score ring -->
|
| 173 |
+
<div id="score-ring">
|
| 174 |
+
<svg width="76" height="76" viewBox="0 0 76 76">
|
| 175 |
+
<circle cx="38" cy="38" r="30" fill="none" stroke="rgba(125,211,252,0.12)" stroke-width="6"/>
|
| 176 |
+
<circle id="score-arc" cx="38" cy="38" r="30" fill="none"
|
| 177 |
+
stroke="#7dd3fc" stroke-width="6"
|
| 178 |
+
stroke-dasharray="0 188"
|
| 179 |
+
stroke-linecap="round"
|
| 180 |
+
transform="rotate(-90 38 38)"
|
| 181 |
+
style="transition:stroke-dasharray 1.2s ease"/>
|
| 182 |
+
<text id="score-txt" x="38" y="43" text-anchor="middle"
|
| 183 |
+
fill="#e0e6f0" font-size="13" font-weight="700"
|
| 184 |
+
font-family="'Segoe UI',sans-serif">0.0</text>
|
| 185 |
+
</svg>
|
| 186 |
+
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
+
<!-- Step log -->
|
| 189 |
+
<div class="panel" id="step-log">
|
| 190 |
+
<h3>Step Log</h3>
|
| 191 |
+
<div id="log-list"></div>
|
| 192 |
+
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
+
<!-- Tooltip -->
|
| 195 |
+
<div id="tooltip"><div id="tt-title"></div><div id="tt-body"></div></div>
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
+
<!-- Timeline -->
|
| 198 |
+
<div class="panel" id="timeline">
|
| 199 |
+
<div id="tl-header">
|
| 200 |
+
<h3>Timeline Replay</h3>
|
| 201 |
+
<span id="step-label">Step 0 / 0</span>
|
| 202 |
</div>
|
| 203 |
+
<input type="range" id="slider" min="0" max="0" value="0"
|
| 204 |
+
oninput="onSlider(this.value)">
|
| 205 |
+
<div id="tl-btns">
|
| 206 |
+
<button class="tb" onclick="stepBack()">โ Back</button>
|
| 207 |
+
<button class="tb" id="play-btn" onclick="togglePlay()">โถ Play</button>
|
| 208 |
+
<button class="tb" onclick="stepFwd()">Forward โถ</button>
|
| 209 |
+
<button class="tb" onclick="resetView()">โบ Reset</button>
|
| 210 |
+
<button class="tb" id="orbit-btn" onclick="toggleOrbit()">๐ Orbit</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
</div>
|
| 212 |
</div>
|
| 213 |
|
|
|
|
| 214 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
|
| 215 |
<script>
|
| 216 |
+
// โโ Renderer โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
const canvas = document.getElementById('three-canvas');
|
| 218 |
+
const renderer = new THREE.WebGLRenderer({ canvas, antialias: true, alpha: false });
|
|
|
|
| 219 |
renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
|
| 220 |
renderer.setClearColor(0x0a0e1a, 1);
|
| 221 |
|
| 222 |
+
function resize() {
|
| 223 |
+
renderer.setSize(window.innerWidth, window.innerHeight, false);
|
| 224 |
+
camera.aspect = window.innerWidth / window.innerHeight;
|
| 225 |
+
camera.updateProjectionMatrix();
|
| 226 |
+
}
|
| 227 |
+
window.addEventListener('resize', resize);
|
| 228 |
+
|
// ── Scene + Camera ─────────────────────────────────────────────────────────────
const scene = new THREE.Scene();
const camera = new THREE.PerspectiveCamera(58, 1, 0.1, 1000);
camera.position.set(0, 8, 24);
camera.lookAt(0, 0, 0);
resize();  // sync aspect/size now that the camera exists

// Lights: dim cool ambient fill plus one directional key light.
scene.add(new THREE.AmbientLight(0x1a2040, 1.2));
const dl = new THREE.DirectionalLight(0x7dd3fc, 0.5);
dl.position.set(5, 12, 5);
scene.add(dl);

// Ground grid below the node circle.
const grid = new THREE.GridHelper(50, 25, 0x1e293b, 0x0f172a);
grid.position.y = -3.5;
scene.add(grid);

// Starfield backdrop: 900 random points scattered in a 220-unit cube.
(function() {
  const coords = new Float32Array(900 * 3);
  for (let i = 0; i < coords.length; i++) coords[i] = (Math.random() - 0.5) * 220;
  const starGeo = new THREE.BufferGeometry();
  starGeo.setAttribute('position', new THREE.BufferAttribute(coords, 3));
  scene.add(new THREE.Points(starGeo, new THREE.PointsMaterial({ color: 0x1e3a5f, size: 0.25 })));
})();
// โโ Orbit controls (manual) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 257 |
+
let sph = { theta: 0, phi: 1.1, r: 24 };
|
| 258 |
+
let orbitAuto = false, dragging = false, lastX = 0, lastY = 0;
|
| 259 |
+
|
| 260 |
+
canvas.addEventListener('mousedown', e => { dragging = true; lastX = e.clientX; lastY = e.clientY; });
|
| 261 |
+
window.addEventListener('mouseup', () => { dragging = false; });
|
| 262 |
+
window.addEventListener('mousemove', e => {
|
| 263 |
+
if (dragging) {
|
| 264 |
+
sph.theta -= (e.clientX - lastX) * 0.006;
|
| 265 |
+
sph.phi = Math.max(0.15, Math.min(1.55, sph.phi - (e.clientY - lastY) * 0.006));
|
| 266 |
+
lastX = e.clientX; lastY = e.clientY;
|
| 267 |
+
} else { checkHover(e.clientX, e.clientY); }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
});
|
| 269 |
canvas.addEventListener('wheel', e => {
|
| 270 |
+
sph.r = Math.max(8, Math.min(55, sph.r + e.deltaY * 0.025));
|
| 271 |
});
|
| 272 |
|
| 273 |
function updateCamera() {
|
| 274 |
+
if (orbitAuto) sph.theta += 0.004;
|
| 275 |
+
const sin_p = Math.sin(sph.phi);
|
| 276 |
+
camera.position.set(
|
| 277 |
+
sph.r * sin_p * Math.sin(sph.theta),
|
| 278 |
+
sph.r * Math.cos(sph.phi),
|
| 279 |
+
sph.r * sin_p * Math.cos(sph.theta)
|
| 280 |
+
);
|
| 281 |
camera.lookAt(0, 0, 0);
|
| 282 |
}
|
| 283 |
|
| 284 |
+
// โโ Scene state โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 285 |
+
const COLS = { src:0xf97316, test:0x3b82f6, spec:0xa855f7, visited:0x22c55e, bug:0xef4444, agent:0xfbbf24, path:0xfacc15, edge:0x334155 };
|
| 286 |
+
|
| 287 |
+
let nodeMap = {}; // filename โ { mesh, basePos }
|
| 288 |
+
let pathLines = [], edgeLines = [];
|
| 289 |
+
let agentMesh = null;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
let vizData = null;
|
| 291 |
+
let curStep = 0, maxStep = 0;
|
| 292 |
+
let playing = false, playTimer = null;
|
| 293 |
+
let frame = 0;
|
| 294 |
+
|
| 295 |
+
// โโ Build scene โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 296 |
+
function clearScene() {
|
| 297 |
+
Object.values(nodeMap).forEach(o => scene.remove(o.mesh));
|
| 298 |
+
pathLines.forEach(l => scene.remove(l));
|
| 299 |
+
edgeLines.forEach(l => scene.remove(l));
|
| 300 |
+
if (agentMesh) scene.remove(agentMesh);
|
| 301 |
+
nodeMap = {}; pathLines = []; edgeLines = []; agentMesh = null;
|
| 302 |
+
}
|
| 303 |
|
|
|
|
| 304 |
/**
 * Build the full 3D scene from a run payload: one sphere per file laid out on
 * a circle, dependency edges, the agent marker, and the HUD stats; then reset
 * the timeline to step 0.
 */
function buildScene(data) {
  clearScene();
  vizData = data;

  const files = data.files || [];
  const n = files.length;
  if (!n) return;

  const radius = Math.max(5, n * 1.0);  // circle grows with file count

  files.forEach((f, i) => {
    // Place node i on the circle, starting at 12 o'clock.
    const angle = (i / n) * Math.PI * 2 - Math.PI / 2;
    const pos = new THREE.Vector3(Math.cos(angle) * radius, 0, Math.sin(angle) * radius);

    // Color by role: bug location wins, then test/spec, otherwise source.
    const baseColor = f.is_bug_file ? COLS.bug
                    : f.type === 'test' ? COLS.test
                    : f.type === 'spec' ? COLS.spec
                    : COLS.src;
    const col = new THREE.Color(baseColor);

    // Main sphere for the file.
    const mesh = new THREE.Mesh(
      new THREE.SphereGeometry(0.55, 20, 20),
      new THREE.MeshPhongMaterial({
        color: col, emissive: col.clone().multiplyScalar(0.25),
        shininess: 70, transparent: true, opacity: 0.85,
      })
    );
    mesh.position.copy(pos);
    mesh.userData = { file: f, basePos: pos.clone() };
    scene.add(mesh);

    // Flat ring halo, attached as a child so it follows the sphere.
    const ring = new THREE.Mesh(
      new THREE.RingGeometry(0.7, 0.82, 32),
      new THREE.MeshBasicMaterial({ color: col, transparent: true, opacity: 0.2, side: THREE.DoubleSide })
    );
    ring.rotation.x = Math.PI / 2;
    mesh.add(ring);

    nodeMap[f.name] = { mesh, basePos: pos.clone() };
  });

  // Dependency edges — skipped silently when either endpoint is unknown.
  (data.dependencies || []).forEach(dep => {
    const a = nodeMap[dep.from], b = nodeMap[dep.to];
    if (!a || !b) return;
    const line = new THREE.Line(
      new THREE.BufferGeometry().setFromPoints([a.basePos.clone(), b.basePos.clone()]),
      new THREE.LineBasicMaterial({ color: COLS.edge, transparent: true, opacity: 0.35 })
    );
    scene.add(line);
    edgeLines.push(line);
  });

  // Agent marker hovering above the circle.
  agentMesh = new THREE.Mesh(
    new THREE.SphereGeometry(0.32, 14, 14),
    new THREE.MeshPhongMaterial({ color: COLS.agent, emissive: 0xfbbf24, emissiveIntensity: 0.9, shininess: 120 })
  );
  agentMesh.position.set(0, 3, 0);
  scene.add(agentMesh);

  // HUD stats panel.
  document.getElementById('st-task').textContent = data.task || '—';
  document.getElementById('st-var').textContent = (data.variant_id || '—').slice(0, 12);
  document.getElementById('st-steps').textContent = (data.steps || []).length;
  document.getElementById('st-strat').textContent = data.strategy || '—';
  updateScore(data.final_score || 0);

  // Reset the timeline slider to step 0.
  maxStep = (data.steps || []).length;
  const sl = document.getElementById('slider');
  sl.max = maxStep;
  sl.value = 0;
  curStep = 0;
  updateLabel(0, maxStep);
  applyStep(0);
}
|
| 377 |
|
| 378 |
+
// โโ Apply step โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 379 |
+
function applyStep(idx) {
|
| 380 |
if (!vizData) return;
|
| 381 |
const steps = vizData.steps || [];
|
|
|
|
|
|
|
| 382 |
|
| 383 |
// Reset all nodes
|
| 384 |
+
Object.values(nodeMap).forEach(({ mesh, basePos: _ }) => {
|
| 385 |
+
const f = mesh.userData.file;
|
| 386 |
+
const bc = f.is_bug_file ? COLS.bug : f.type === 'test' ? COLS.test : f.type === 'spec' ? COLS.spec : COLS.src;
|
| 387 |
+
mesh.material.color.set(bc);
|
| 388 |
+
mesh.material.emissive.set(new THREE.Color(bc).multiplyScalar(0.2));
|
| 389 |
+
mesh.material.opacity = 0.55;
|
| 390 |
+
mesh.scale.setScalar(1);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
});
|
| 392 |
|
| 393 |
// Remove old path lines
|
| 394 |
+
pathLines.forEach(l => scene.remove(l));
|
| 395 |
+
pathLines = [];
|
| 396 |
+
|
| 397 |
+
// Collect positions for path
|
| 398 |
+
const pathPts = [];
|
| 399 |
+
const visited = new Set(), modified = new Set();
|
| 400 |
+
|
| 401 |
+
for (let i = 0; i < idx; i++) {
|
| 402 |
+
const s = steps[i];
|
| 403 |
+
if (!s) continue;
|
| 404 |
+
if (s.path && nodeMap[s.path]) {
|
| 405 |
+
const p = nodeMap[s.path].basePos.clone().add(new THREE.Vector3(0, 0.15, 0));
|
| 406 |
+
pathPts.push(p);
|
|
|
|
|
|
|
|
|
|
| 407 |
}
|
| 408 |
+
if (s.action === 'read_file' && s.path) visited.add(s.path);
|
| 409 |
+
if (s.action === 'write_file' && s.path) modified.add(s.path);
|
| 410 |
}
|
| 411 |
|
| 412 |
+
// Color visited/modified
|
| 413 |
+
visited.forEach(name => {
|
| 414 |
+
if (nodeMap[name]) {
|
| 415 |
+
nodeMap[name].mesh.material.color.set(COLS.visited);
|
| 416 |
+
nodeMap[name].mesh.material.emissive.set(new THREE.Color(COLS.visited).multiplyScalar(0.4));
|
| 417 |
+
nodeMap[name].mesh.material.opacity = 1;
|
| 418 |
+
nodeMap[name].mesh.scale.setScalar(1.25);
|
|
|
|
|
|
|
| 419 |
}
|
| 420 |
});
|
| 421 |
+
modified.forEach(name => {
|
| 422 |
+
if (nodeMap[name]) {
|
| 423 |
+
nodeMap[name].mesh.material.color.set(COLS.bug);
|
| 424 |
+
nodeMap[name].mesh.material.emissive.set(new THREE.Color(COLS.bug).multiplyScalar(0.5));
|
| 425 |
+
nodeMap[name].mesh.material.opacity = 1;
|
| 426 |
+
nodeMap[name].mesh.scale.setScalar(1.45);
|
|
|
|
|
|
|
| 427 |
}
|
| 428 |
});
|
| 429 |
|
| 430 |
+
// Highlight current node
|
| 431 |
+
if (idx > 0 && idx <= steps.length) {
|
| 432 |
+
const cur = steps[idx - 1];
|
| 433 |
+
if (cur && cur.path && nodeMap[cur.path]) {
|
| 434 |
+
nodeMap[cur.path].mesh.scale.setScalar(1.65);
|
| 435 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
}
|
| 437 |
|
| 438 |
+
// Draw path
|
| 439 |
+
if (pathPts.length >= 2) {
|
| 440 |
+
const geo = new THREE.BufferGeometry().setFromPoints(pathPts);
|
| 441 |
+
const mat = new THREE.LineBasicMaterial({ color: COLS.path, transparent: true, opacity: 0.9 });
|
| 442 |
+
const line = new THREE.Line(geo, mat);
|
| 443 |
+
scene.add(line); pathLines.push(line);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
}
|
| 445 |
|
| 446 |
+
// Move agent
|
| 447 |
+
if (idx > 0 && idx <= steps.length) {
|
| 448 |
+
const cur = steps[idx - 1];
|
| 449 |
+
if (cur && cur.path && nodeMap[cur.path]) {
|
| 450 |
+
const tp = nodeMap[cur.path].basePos;
|
| 451 |
+
agentMesh.position.set(tp.x, tp.y + 1.3, tp.z);
|
| 452 |
+
} else {
|
| 453 |
+
agentMesh.position.set(0, 2.5, 0);
|
| 454 |
}
|
| 455 |
+
} else {
|
| 456 |
+
agentMesh.position.set(0, 3.5, 0);
|
| 457 |
}
|
| 458 |
|
| 459 |
+
updateLog(steps, idx - 1);
|
| 460 |
+
updateLabel(idx, maxStep);
|
| 461 |
+
const sl = document.getElementById('slider');
|
| 462 |
+
sl.style.setProperty('--pct', (maxStep > 0 ? idx / maxStep * 100 : 0) + '%');
|
|
|
|
|
|
|
|
|
|
| 463 |
}
|
| 464 |
|
| 465 |
+
// โโ Score ring โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 466 |
+
function updateScore(s) {
|
| 467 |
+
const c = 2 * Math.PI * 30;
|
| 468 |
+
const arc = c * Math.min(1, Math.max(0, s));
|
| 469 |
+
document.getElementById('score-arc').setAttribute('stroke-dasharray', `${arc} ${c}`);
|
| 470 |
+
document.getElementById('score-txt').textContent = s.toFixed(2);
|
| 471 |
+
document.getElementById('st-score').textContent = s.toFixed(3);
|
| 472 |
+
const col = s >= 0.7 ? '#4ade80' : s >= 0.4 ? '#fbbf24' : '#f87171';
|
| 473 |
+
document.getElementById('score-arc').setAttribute('stroke', col);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
}
|
| 475 |
|
| 476 |
+
// โโ Step log โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 477 |
+
function updateLog(steps, curIdx) {
|
| 478 |
+
const em = { read_file:'๐', write_file:'โ๏ธ', run_tests:'๐งช', search_code:'๐', submit:'๐' };
|
| 479 |
+
const container = document.getElementById('log-list');
|
| 480 |
container.innerHTML = '';
|
| 481 |
+
steps.forEach((s, i) => {
|
| 482 |
+
const e = document.createElement('div');
|
| 483 |
+
e.className = 'log-e';
|
| 484 |
+
e.style.opacity = i < curIdx ? '0.55' : i === curIdx ? '1' : '0.3';
|
| 485 |
+
if (i === curIdx) e.style.background = 'rgba(125,211,252,0.07)';
|
| 486 |
+
const r = s.reward || 0;
|
| 487 |
+
const rc = r > 0 ? 'rp' : r < 0 ? 'rn' : 'rz';
|
| 488 |
+
const name = (s.path || s.action || '').split('/').pop() || s.action;
|
| 489 |
+
e.innerHTML = `<span class="log-s">S${s.step}</span><span class="log-a">${em[s.action]||'โข'} ${name}</span><span class="${rc}">${r>0?'+':''}${r.toFixed(2)}</span>`;
|
| 490 |
+
container.appendChild(e);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
});
|
| 492 |
+
if (curIdx >= 0 && container.children[curIdx]) {
|
| 493 |
+
container.children[curIdx].scrollIntoView({ block: 'nearest' });
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
}
|
| 495 |
}
|
| 496 |
|
| 497 |
+
// โโ Hover tooltip โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 498 |
+
const ray = new THREE.Raycaster();
|
| 499 |
+
const mv = new THREE.Vector2();
|
| 500 |
+
const tt = document.getElementById('tooltip');
|
| 501 |
|
| 502 |
function checkHover(mx, my) {
|
| 503 |
+
mv.x = (mx / window.innerWidth) * 2 - 1;
|
| 504 |
+
mv.y = -(my / window.innerHeight) * 2 + 1;
|
| 505 |
+
ray.setFromCamera(mv, camera);
|
| 506 |
+
const meshes = Object.values(nodeMap).map(o => o.mesh);
|
| 507 |
+
const hits = ray.intersectObjects(meshes);
|
| 508 |
+
if (hits.length) {
|
| 509 |
+
const f = hits[0].object.userData.file;
|
| 510 |
+
tt.style.opacity = '1';
|
| 511 |
+
tt.style.left = (mx + 12) + 'px';
|
| 512 |
+
tt.style.top = (my - 8) + 'px';
|
| 513 |
+
document.getElementById('tt-title').textContent = f.name;
|
| 514 |
+
document.getElementById('tt-body').innerHTML =
|
| 515 |
+
`Type: ${f.type}${f.is_bug_file ? '<br>โ ๏ธ Bug location' : ''}${f.visited ? '<br>โ
Visited' : ''}`;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
} else {
|
| 517 |
+
tt.style.opacity = '0';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
}
|
| 519 |
}
|
| 520 |
|
| 521 |
+
// โโ Controls โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 522 |
+
function onSlider(v) { curStep = +v; applyStep(curStep); }
|
| 523 |
+
function stepFwd() { if (curStep < maxStep) { curStep++; document.getElementById('slider').value = curStep; applyStep(curStep); } }
|
| 524 |
+
function stepBack() { if (curStep > 0) { curStep--; document.getElementById('slider').value = curStep; applyStep(curStep); } }
|
| 525 |
function togglePlay() {
|
| 526 |
playing = !playing;
|
| 527 |
+
document.getElementById('play-btn').textContent = playing ? 'โธ Pause' : 'โถ Play';
|
|
|
|
| 528 |
if (playing) {
|
| 529 |
+
if (curStep >= maxStep) curStep = 0;
|
| 530 |
+
playTimer = setInterval(() => {
|
| 531 |
+
if (curStep >= maxStep) { playing = false; document.getElementById('play-btn').textContent = 'โถ Play'; clearInterval(playTimer); return; }
|
| 532 |
+
stepFwd();
|
| 533 |
+
}, 850);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
} else {
|
| 535 |
+
clearInterval(playTimer);
|
| 536 |
}
|
| 537 |
}
|
|
|
|
| 538 |
function toggleOrbit() {
|
| 539 |
+
orbitAuto = !orbitAuto;
|
| 540 |
const btn = document.getElementById('orbit-btn');
|
| 541 |
+
btn.textContent = orbitAuto ? 'โน Stop' : '๐ Orbit';
|
| 542 |
+
btn.classList.toggle('active', orbitAuto);
|
| 543 |
}
|
|
|
|
| 544 |
function resetView() {
|
| 545 |
+
sph = { theta: 0, phi: 1.1, r: 24 };
|
| 546 |
+
curStep = 0;
|
| 547 |
+
document.getElementById('slider').value = 0;
|
| 548 |
applyStep(0);
|
| 549 |
}
|
| 550 |
+
function updateLabel(s, m) { document.getElementById('step-label').textContent = `Step ${s} / ${m}`; }
|
| 551 |
|
| 552 |
+
// โโ Animation loop โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
function animate() {
|
| 554 |
requestAnimationFrame(animate);
|
| 555 |
frame++;
|
|
|
|
| 556 |
updateCamera();
|
| 557 |
+
// Pulsing agent
|
| 558 |
+
if (agentMesh) {
|
| 559 |
+
const p = 1 + Math.sin(frame * 0.09) * 0.18;
|
| 560 |
+
agentMesh.scale.setScalar(p);
|
| 561 |
+
agentMesh.rotation.y += 0.04;
|
| 562 |
+
}
|
| 563 |
+
// Subtle node float
|
| 564 |
+
Object.values(nodeMap).forEach(({ mesh, basePos }, i) => {
|
| 565 |
+
mesh.position.y = basePos.y + Math.sin(frame * 0.018 + i * 1.1) * 0.07;
|
|
|
|
|
|
|
| 566 |
});
|
|
|
|
| 567 |
renderer.render(scene, camera);
|
| 568 |
}
|
| 569 |
+
animate();
|
| 570 |
|
| 571 |
+
// โโ Load data from API โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 572 |
+
async function fetchAndLoad() {
|
| 573 |
+
document.getElementById('loader').style.display = 'block';
|
| 574 |
+
document.getElementById('no-data').style.display = 'none';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
try {
|
| 576 |
+
// Try to determine base URL from window location
|
| 577 |
+
const base = window.location.origin;
|
| 578 |
+
const res = await fetch(`${base}/viz-data`, { cache: 'no-store' });
|
| 579 |
+
if (!res.ok) throw new Error('no data');
|
| 580 |
+
const data = await res.json();
|
| 581 |
+
if (data.error || !data.files || data.files.length === 0) {
|
| 582 |
+
document.getElementById('loader').style.display = 'none';
|
| 583 |
+
document.getElementById('no-data').style.display = 'block';
|
| 584 |
+
return;
|
| 585 |
+
}
|
| 586 |
buildScene(data);
|
| 587 |
+
document.getElementById('loader').style.display = 'none';
|
| 588 |
} catch(e) {
|
| 589 |
+
document.getElementById('loader').style.display = 'none';
|
| 590 |
+
document.getElementById('no-data').style.display = 'block';
|
| 591 |
}
|
| 592 |
+
}
|
| 593 |
|
| 594 |
+
// โโ Public API (can be called from parent window) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 595 |
+
window.loadData = function(data) {
|
| 596 |
+
if (typeof data === 'string') { try { data = JSON.parse(data); } catch(e) { return; } }
|
| 597 |
buildScene(data);
|
| 598 |
document.getElementById('loader').style.display = 'none';
|
| 599 |
+
document.getElementById('no-data').style.display = 'none';
|
| 600 |
+
};
|
| 601 |
+
|
| 602 |
+
// Auto-load on init
|
| 603 |
+
window.addEventListener('load', fetchAndLoad);
|
| 604 |
</script>
|
| 605 |
</body>
|
| 606 |
</html>
|