Chirag0123's picture
feat(ui): add comprehensive Quick Start guide to Gradio dashboard
5567f49
#!/usr/bin/env python3
"""
app.py — Gradio UI v4.0 — Full Research Platform
13 tabs (plus a leading 📖 Quick Start Guide tab):
🎮 Interactive — manual control
🤖 Run Agent — deterministic demo agent
📊 Evaluation — 6-dimension process evaluation
🧠 Intelligence — failure, strategy, advanced metrics
🔁 Self-Improve — improvement plan with prompt injection
⚖️ Compare Agents — multi-agent strategy comparison
🌐 3D Visualizer — Three.js trajectory viz (FIXED: iframe)
🧪 Causal Probe — causal reasoning vs guessing
🎭 Counterfactual — brittleness / robustness testing
📐 Confidence — calibration: overconfident vs underconfident
🏆 Benchmark — automated leaderboard
📈 Analytics — unified research-grade report
📖 API — REST reference
"""
import os
import json
import gradio as gr
from server.app import (
app as fastapi_app,
env,
failure_clf,
strategy_det,
adv_metrics as adv_metrics_engine,
improvement as improvement_engine,
multi_agent as multi_agent_engine,
_causal as causal_probe,
_counter as counterfactual_engine,
_calibrator as confidence_calibrator,
_benchmark as benchmark_runner,
_analytics as analytics_engine,
)
from server.models import RepoAction
from server.memory_bank import get_global_memory
# ── Global instances ─────────────────────────────────────────────────────────
# All engines and the environment are imported from server.app so that
# Gradio interactions and direct HTTP REST calls use the exact same state.
# Handle returned by get_global_memory(); the handlers below use it to store
# lessons after an agent run and to retrieve them for a task.
memory_bank = get_global_memory()
# ── Helpers ──────────────────────────────────────────────────────────────────
def _get_traj_and_meta():
    """Fetch the current trajectory together with the data analyzers need.

    Returns:
        ``(trajectory, variant_meta, steps, episode_id)``. All four entries
        are ``None`` when ``env.get_trajectory()`` returns a falsy value.
    """
    trajectory = env.get_trajectory()
    if trajectory:
        # An unset variant is represented by an empty metadata dict.
        variant_meta = env.variant.meta if env.variant else {}
        recorded_steps = trajectory.get("steps", [])
        episode_id = trajectory.get("episode_id", "")
        return trajectory, variant_meta, recorded_steps, episode_id
    return None, None, None, None
def _no_traj():
return "โš ๏ธ No trajectory. Run an episode first (Interactive or Run Agent tab)."
# ── Tab 1: Interactive ───────────────────────────────────────────────────────
def reset_environment(task):
    """Reset the shared environment to ``task`` and describe the new episode.

    Args:
        task: Task identifier forwarded to ``env.reset``.

    Returns:
        A 4-tuple for the Interactive tab outputs: status text, an empty
        "last result", the step counter ``"0"`` and the reward ``"0.000"``.
        If anything raises, the status text carries the error message instead.
    """
    try:
        outcome = env.reset(task=task)
        observation = outcome.observation

        file_listing = "\n".join(f" ๐Ÿ“„ {name}" for name in observation.repo_tree)
        if observation.failing_tests:
            failing_tests = ", ".join(observation.failing_tests)
        else:
            failing_tests = "None"

        # Optional section, only rendered when the reset reports injected faults.
        injection = outcome.info.get("fault_injection", {})
        fault_section = ""
        if injection.get("faults_injected"):
            header = f"\n\nโš ๏ธ Fault Injection ({injection.get('difficulty_multiplier',1):.1f}ร—):\n"
            # Cap the list at five entries to keep the status box readable.
            bullets = "\n".join(f" โ€ข {fault}" for fault in injection["faults_injected"][:5])
            fault_section = header + bullets

        variant_id = outcome.info.get('variant_id', '?')
        status_parts = [
            f"โœ… Episode started โ€” {task} (variant: {variant_id})\n",
            "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\n",
            f"Steps remaining: {observation.steps_remaining}\n\n",
            f"๐Ÿ“ Files:\n{file_listing}\n\n",
            f"๐Ÿ”ด Failing Tests: {failing_tests}\n\n",
            f"๐Ÿ“‹ {observation.task_description}{fault_section}",
        ]
        return "".join(status_parts), "", "0", "0.000"
    except Exception as exc:
        return f"โŒ Error: {exc}", "", "0", "0.000"
def take_step(action_type, path, query, content):
    """Execute one manual action against the shared environment.

    Args:
        action_type: Action name chosen in the UI (e.g. ``"read_file"``).
        path: Target path; blank or ``None`` is sent as ``None``.
        query: Search query; blank or ``None`` is sent as ``None``.
        content: Text for ``write_file``. It is forwarded verbatim because
            leading indentation and the trailing newline are part of the file
            being written; only an all-whitespace value is sent as ``None``.

    Returns:
        ``(status, last_result, steps_taken, cumulative_reward)`` as strings.
        The last result is truncated to 3000 characters. When the episode is
        already done or the step raises, only the status carries a message.
    """
    if env.done:
        return "โŒ Episode done. Reset first.", "", "", ""
    try:
        raw_content = content or ""
        action = RepoAction(
            action_type=action_type,
            path=(path or "").strip() or None,
            query=(query or "").strip() or None,
            # Do not strip: that would alter the file the user asked to write.
            content=raw_content if raw_content.strip() else None,
        )
        result = env.step(action)
        obs = result.observation
        result_text = obs.last_action_result or ""
        err = f"\nโš ๏ธ {obs.last_action_error}" if obs.last_action_error else ""
        flags = result.info.get("security_flags", [])
        sec = f"\n๐Ÿ”’ {flags}" if flags else ""
        status = (
            f"Step {result.info['steps_taken']} | Reward: {result.reward:+.3f} | "
            f"Left: {obs.steps_remaining}{err}{sec}"
        )
        if result.done:
            status += f"\n\n๐Ÿ DONE โ€” Score: {result.info['final_score']:.3f}"
        return (
            status,
            result_text[:3000],
            str(result.info["steps_taken"]),
            f"{result.info.get('cumulative_reward',0):.3f}",
        )
    except Exception as e:
        return f"โŒ {e}", "", "", ""
# ── Tab 2: Run Agent ─────────────────────────────────────────────────────────
def run_builtin_agent(task):
    """Run the scripted demo agent on ``task`` and return its log.

    The agent resets the environment, then reads up to two ``.md`` files
    (task3 only), reads every file under ``tests/``, searches for ``"def "``,
    reads ``src/*.py`` files until 14 steps have been taken, runs the first
    test file and submits. Every phase is skipped once ``env.done`` is set.
    If a trajectory exists afterwards, the episode is classified and stored
    in the memory bank.

    Returns:
        The newline-joined log, or an error line if anything raises.
    """
    try:
        reset_result = env.reset(task=task)
        repo_files = reset_result.observation.repo_tree
        log = [
            f"๐Ÿš€ {task} (variant: {reset_result.info.get('variant_id')})",
            f" Files: {repo_files}",
        ]
        test_files = sorted(name for name in repo_files if name.startswith("tests/"))
        src_files = sorted(
            name for name in repo_files
            if name.startswith("src/") and name.endswith(".py")
        )
        spec_files = sorted(name for name in repo_files if name.endswith(".md"))
        steps = 0

        def perform(action, label):
            """Execute one action, count it, and log its reward."""
            nonlocal steps
            outcome = env.step(action)
            steps += 1
            log.append(f" Step {steps}: {label} โ†’ {outcome.reward:+.3f}")

        if task == "task3" and spec_files:
            for spec in spec_files[:2]:
                if env.done:
                    break
                perform(RepoAction(action_type="read_file", path=spec), f"read_file {spec}")

        for test_file in test_files:
            if env.done:
                break
            perform(RepoAction(action_type="read_file", path=test_file), f"read_file {test_file}")

        if not env.done:
            perform(RepoAction(action_type="search_code", query="def "), "search_code")

        for src_file in src_files:
            # Stop reading sources after 14 steps so later phases can still run.
            if env.done or steps >= 14:
                break
            perform(RepoAction(action_type="read_file", path=src_file), f"read_file {src_file}")

        if not env.done and test_files:
            perform(RepoAction(action_type="run_tests", path=test_files[0]), "run_tests")

        if not env.done:
            perform(RepoAction(action_type="submit"), "submit")

        log += ["", f"๐Ÿ Score: {env.final_score:.3f} | Steps: {steps} | Reward: {env.cumulative_reward:.3f}"]

        # Store in memory
        traj = env.get_trajectory()
        if traj:
            meta = env.variant.meta if env.variant else {}
            episode_id = traj.get("episode_id", "")
            failure = failure_clf.classify(
                episode_id, task, traj.get("steps", []), meta,
                list(env.files_read), list(env.files_written), env.final_score,
            )
            strategy = strategy_det.detect(
                traj.get("steps", []), task, meta, list(env.files_read), env.final_score,
            )
            plan = improvement_engine.generate_improvement_plan(
                traj.get("episode_id", ""), task, failure.primary_failure,
                [], env.final_score, traj.get("steps", []),
                list(env.files_read), list(env.files_written),
            )
            memory_bank.store(
                traj.get("episode_id", ""), task, failure.primary_failure,
                failure.failure_summary or "", env.final_score,
                strategy.strategy, traj.get("steps", []), plan.to_dict(),
            )
            log.append(f"๐Ÿ’พ Stored lesson in memory bank ({memory_bank.get_stats()['total_entries']} total)")
        return "\n".join(log)
    except Exception as exc:
        return f"โŒ {exc}"
# ── Tab 3: Evaluation ────────────────────────────────────────────────────────
def get_evaluation():
    """Render the environment's evaluation as a text report.

    Returns:
        The no-trajectory notice when the evaluation dict contains an
        ``"error"`` key; otherwise the composite score, one 20-cell bar per
        dimension (with up to two evidence lines each) and any strengths,
        failures and recommendations. Exceptions become an ``Error:`` line.
    """
    try:
        evaluation = env.get_evaluation()
        if "error" in evaluation:
            return _no_traj()

        report = [f"๐ŸŽฏ Composite Score: {evaluation['composite_score']:.3f}", "โ”" * 50]
        for dim_name, dim in evaluation.get("dimensions", {}).items():
            filled = int(dim["score"] * 20)
            gauge = "โ–ˆ" * filled + "โ–‘" * (20 - filled)
            report.append(f" {dim_name:15s} [{gauge}] {dim['score']:.3f}")
            report.extend(f" โ†’ {item}" for item in dim.get("evidence", [])[:2])

        if evaluation.get("strengths"):
            report.append("\n๐Ÿ’ช Strengths:")
            report.extend(f" โœ… {s}" for s in evaluation["strengths"])
        if evaluation.get("failure_analysis"):
            report.append("\nโš ๏ธ Failures:")
            report.extend(f" โŒ {f}" for f in evaluation["failure_analysis"])
        if evaluation.get("recommendations"):
            report.append("\n๐Ÿ’ก Recs:")
            report.extend(f" โ†’ {r}" for r in evaluation["recommendations"])
        return "\n".join(report)
    except Exception as exc:
        return f"Error: {exc}"
def get_metrics():
    """Return ``env.get_metrics()`` as indented JSON, or an ``Error:`` line on failure."""
    try:
        metrics = env.get_metrics()
        # default=str stringifies any value json cannot serialize natively.
        rendered = json.dumps(metrics, indent=2, default=str)
        return rendered
    except Exception as exc:
        return f"Error: {exc}"
def get_trajectory():
    """Render the recorded trajectory as a header plus one line per step.

    Returns:
        The no-trajectory notice when nothing is recorded, otherwise the
        formatted listing. Exceptions become an ``Error:`` line.
    """
    try:
        trajectory = env.get_trajectory()
        if not trajectory:
            return _no_traj()

        icons = {
            "read_file": "๐Ÿ“–",
            "write_file": "โœ๏ธ",
            "run_tests": "๐Ÿงช",
            "search_code": "๐Ÿ”",
            "submit": "๐Ÿ",
        }
        out = [
            f"Episode: {trajectory.get('episode_id')}",
            f"Task: {trajectory.get('task')} | Variant: {trajectory.get('variant_id')}",
            f"Score: {trajectory.get('final_score',0):.3f} | Duration: {trajectory.get('duration_seconds','?')}s",
            "โ”" * 60,
        ]
        for entry in trajectory.get("steps", []):
            # Show the path if there is one, else the query, else nothing.
            target = entry.get("action_path") or entry.get("action_query") or ""
            failed = " โŒ" if entry.get("error") else ""
            icon = icons.get(entry['action_type'], 'โ€ข')
            out.append(
                f" {icon} {entry['step_number']:2d}: {entry['action_type']:12s} {target:25s} "
                f"reward={entry['reward']:+.3f}{failed}"
            )
        return "\n".join(out)
    except Exception as exc:
        return f"Error: {exc}"
# ── Tab 4: Intelligence ──────────────────────────────────────────────────────
def get_failure_classification():
    """Classify the current episode's failures and format the result.

    Returns:
        The no-trajectory notice, the formatted classification, or an
        ``Error:`` line if anything raises.
    """
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()

        classification = failure_clf.classify(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = classification.to_dict()

        verdict = 'โœ… SUCCESS' if data['success'] else 'โŒ FAILURE'
        out = [
            f"{verdict}",
            f"Primary: {data['primary_failure']} | Count: {data['failure_count']}",
            "โ”" * 50,
        ]
        for failure in data.get("failures", []):
            out.append(f"\n[{failure['severity'].upper()}] {failure['type']} @ step {failure['step']}")
            out.append(f" Evidence: {failure['evidence']}")
            out.append(f" Fix: {failure['remediation']}")
        if data.get("failure_summary"):
            out.extend(["\n๐Ÿ“‹ Summary:", f" {data['failure_summary']}"])
        if data.get("retry_hint"):
            out.append(f"\n๐Ÿ” Retry hint: {data['retry_hint']}")
        return "\n".join(out)
    except Exception as exc:
        return f"Error: {exc}"
def get_strategy_detection():
    """Detect the strategy used in the current episode and format it.

    Returns:
        The no-trajectory notice, the formatted detection report, or an
        ``Error:`` line if anything raises.
    """
    try:
        traj, meta, steps, _ = _get_traj_and_meta()
        if not traj:
            return _no_traj()

        detection = strategy_det.detect(
            steps, env.current_task or "?", meta, list(env.files_read), env.final_score,
        )
        data = detection.to_dict()

        # 20-cell gauge for the strategy score.
        filled = int(data["score"] * 20)
        gauge = "โ–ˆ" * filled + "โ–‘" * (20 - filled)

        out = [
            f"๐Ÿงญ Strategy: {data['strategy']}",
            f" [{gauge}] {data['score']:.3f} (confidence: {data['confidence']:.0%})",
            f"\n{data['strategy_description']}",
            f"\nExploration: {data['exploration_ratio']:.2f} | Pivots: {data['pivot_count']}",
        ]
        if data.get("sub_patterns"):
            out.append("\nSub-patterns:")
            out.extend(f" โ€ข {pattern}" for pattern in data["sub_patterns"])
        if data.get("evidence"):
            out.append("\nEvidence:")
            out.extend(f" โ†’ {item}" for item in data["evidence"])
        return "\n".join(out)
    except Exception as exc:
        return f"Error: {exc}"
def get_advanced_metrics():
    """Compute the advanced metrics for the current episode and format them.

    Returns:
        The no-trajectory notice, a report with one 20-cell gauge per metric
        plus the action distribution when present, or an ``Error:`` line.
    """
    def gauge(value):
        """Draw ``value`` as a 20-cell bar."""
        filled = int(value * 20)
        return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

    try:
        traj, meta, steps, _ = _get_traj_and_meta()
        if not traj:
            return _no_traj()

        metrics = adv_metrics_engine.compute(
            steps, meta, env.final_score, list(env.files_read), list(env.files_written),
        )
        data = metrics.to_dict()

        # (label, key) pairs rendered in this order as "label [gauge] value".
        gauge_rows = (
            ("Reasoning Efficiency", "reasoning_efficiency"),
            ("Reliability Index", "reliability_index"),
            ("Exploration Ratio", "exploration_ratio"),
            ("Decision Entropy", "decision_entropy"),
            ("Wasteful Ratio", "wasteful_ratio"),
        )
        out = ["โšก ADVANCED METRICS", "โ”" * 50]
        for label, key in gauge_rows:
            out.append(f" {label} [{gauge(data[key])}] {data[key]:.3f}")
        out.append(
            f" Pivot Rate {data['pivot_rate']:.2f}/10 steps | "
            f"Consistency {data['consistency_score']:.3f} ({data['runs_analyzed']} runs)"
        )
        if data.get("action_distribution"):
            out.append("\nAction Distribution:")
            for action, count in data["action_distribution"].items():
                out.append(f" {action:14s}: {count}")
        return "\n".join(out)
    except Exception as exc:
        return f"Error: {exc}"
# ── Tab 5: Self-Improve ──────────────────────────────────────────────────────
def get_improvement_plan():
    """Generate and format an improvement plan for the current episode.

    The episode is classified first; the primary failure and the evidence of
    every detected failure are passed to the improvement engine.

    Returns:
        The no-trajectory notice, the formatted plan, or an ``Error:`` line.
    """
    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()

        classification = failure_clf.classify(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        evidence = [failure.evidence for failure in classification.failures]
        plan = improvement_engine.generate_improvement_plan(
            ep_id, env.current_task or "?", classification.primary_failure,
            evidence, env.final_score,
            steps, list(env.files_read), list(env.files_written),
        )
        data = plan.to_dict()

        out = [
            "๐Ÿ” SELF-IMPROVEMENT PLAN",
            "โ”" * 50,
            f"Original Score: {data['original_score']:.3f} | Failure: {data['failure_type']}",
            f"\nโŒ What went wrong:\n {data['what_went_wrong']}",
            f"\n๐ŸŽฏ Improved strategy:\n {data['improved_strategy']}",
            "\n๐Ÿ“‹ Step-by-step plan:",
        ]
        out.extend(f" {item}" for item in data.get("step_by_step_plan", []))
        out.append("\n๐Ÿ’‰ System Prompt Injection:")
        out.append("โ”€" * 40)
        out.append(data.get("system_prompt_addon", "None"))
        return "\n".join(out)
    except Exception as exc:
        return f"Error: {exc}"
def get_memory_context_for_task(task):
    """Retrieve up to three stored lessons for ``task`` and format them.

    Args:
        task: Task identifier used to query the memory bank.

    Returns:
        A report with the memory-bank total, then either a "no lessons"
        message or one entry per lesson followed by the system-prompt
        injection text. Lesson body and hint are cut to 120 characters.
        Exceptions become an ``Error:`` line.
    """
    try:
        ctx = memory_bank.retrieve(task=task, max_lessons=3)
        stats = memory_bank.get_stats()
        lines = [
            f"๐Ÿง  MEMORY BANK โ€” {stats['total_entries']} total lessons",
            f"Retrieving for: {task}",
            "โ”" * 50,
        ]
        if not ctx.relevant_lessons:
            lines.append("No lessons stored yet. Run episodes to build memory.")
        else:
            lines.append(f"\n๐Ÿ“š {ctx.lessons_count} relevant lesson(s):\n")
            for index, lesson in enumerate(ctx.relevant_lessons, 1):
                lines.append(
                    f"[Lesson {index}] Task: {lesson.task} | "
                    f"Failure: {lesson.failure_type} | Score: {lesson.score:.2f}"
                )
                lines.append(f" Title: {lesson.lesson_title}")
                lines.append(f" Lesson: {lesson.lesson_body[:120]}")
                # Emit the hint line only when a hint exists, so a lesson
                # without one no longer leaves a stray empty line behind.
                if lesson.lesson_hint:
                    lines.append(f" Hint: {lesson.lesson_hint[:120]}")
                lines.append("")  # blank separator between lessons
            lines += ["\n๐Ÿ’‰ System Prompt Injection:", "โ”€" * 40, ctx.system_prompt_injection]
        return "\n".join(lines)
    except Exception as e:
        return f"Error: {e}"
# ── Tab 6: Compare Agents ────────────────────────────────────────────────────
def run_comparison(task, selected_agents):
    """Compare agent strategies on ``task`` and format the ranking.

    Args:
        task: Task identifier to run every agent on.
        selected_agents: Agent names from the UI; an empty selection is
            passed to the engine as ``None``.

    Returns:
        A text report with winner, ranking table, insights and each run's
        action sequence, or an error line if anything raises.
    """
    try:
        report = multi_agent_engine.compare(env, task=task, agents=selected_agents or None)
        data = report.to_dict()

        heavy_rule = "โ”" * 80
        out = [
            f"โš–๏ธ MULTI-AGENT COMPARISON โ€” {task} (variant: {data.get('variant_id')})",
            f"๐Ÿ† Winner: {data.get('winner')} (score: {data.get('winner_score',0):.3f})",
            heavy_rule,
            f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Steps':<7} {'Strategy':<22} {'Failure':<20} {'Reliability'}",
            "โ”€" * 80,
        ]
        for entry in data.get("summary_table", []):
            out.append(
                f"#{entry['rank']:<4} {entry['agent']:<16} {entry['score']:<8.3f} {entry['steps']:<7} "
                f"{entry['strategy']:<22} {entry['failure']:<20} {entry['reliability']:.3f}"
            )
        out.append(heavy_rule)

        if data.get("insights"):
            out.append("\n๐Ÿ’ก Insights:")
            out.extend(f" โ†’ {insight}" for insight in data["insights"])

        out.append("\n๐Ÿ“Š Action Sequences:")
        for run in data.get("detailed_runs", []):
            sequence = " โ†’ ".join(run.get("action_sequence", []))
            out.append(f" {run['agent_name']:16s}: {sequence}")
        return "\n".join(out)
    except Exception as exc:
        return f"โŒ {exc}"
# ── Tab 7: 3D Visualizer ─────────────────────────────────────────────────────
def get_viz_iframe():
    """Return an iframe pointing to /static/viz3d.html — fixes Three.js canvas rendering.

    Returns:
        An HTML ``<iframe>`` string whose ``src`` carries a ``t=<milliseconds>``
        query parameter.
    """
    import time

    # Cache-busting timestamp so Gradio re-renders on refresh. Millisecond
    # resolution is used because with whole seconds two clicks within the
    # same second produce identical HTML, and nothing is reloaded.
    ts = int(time.time() * 1000)
    return (
        f'<iframe src="/static/viz3d.html?t={ts}" '
        f'width="100%" height="640" frameborder="0" '
        f'style="border-radius:10px;border:1px solid rgba(125,211,252,0.2);'
        f'background:#0a0e1a;" '
        f'allow="accelerometer; autoplay" loading="lazy">'
        f'</iframe>'
    )
# ── Tab 8: Causal Probe ──────────────────────────────────────────────────────
def get_causal_probe():
    """Run the causal-reasoning probe on the current episode and format it.

    Returns:
        The no-trajectory notice, the formatted probe report, or an
        ``Error:`` line if anything raises.
    """
    def gauge(value):
        """Draw ``value`` as a 20-cell bar."""
        filled = int(value * 20)
        return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()

        probe = causal_probe.probe(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = probe.to_dict()

        out = [
            "๐Ÿงช CAUSAL REASONING PROBE",
            "โ”" * 55,
            f"Understanding Level: {data['understanding_level']}",
            f"Causal Score: [{gauge(data['causal_score'])}] {data['causal_score']:.3f}",
            f"Chain Coverage: [{gauge(data['chain_coverage'])}] {data['chain_coverage']:.3f}",
            f"Chain Order Score: [{gauge(data['chain_order_score'])}] {data['chain_order_score']:.3f}",
            "\n๐Ÿ“ก Behavioral Signals:",
        ]
        for signal, present in data.get("behavioral_signals", {}).items():
            mark = 'โœ…' if present else 'โŒ'
            out.append(f" {mark} {signal.replace('_',' ').title()}")

        if data.get("understanding_indicators"):
            out.append("\nโœ… Understanding Indicators:")
            out.extend(f" โ€ข {item}" for item in data["understanding_indicators"])
        if data.get("guessing_indicators"):
            out.append("\nโŒ Guessing Indicators:")
            out.extend(f" โ€ข {item}" for item in data["guessing_indicators"])

        diagnostics = data.get("diagnostics", {})
        if diagnostics.get("false_confidence_detected"):
            out.append("\nโš ๏ธ FALSE CONFIDENCE DETECTED โ€” submitted without adequate exploration")
        if diagnostics.get("shortcut_learning_detected"):
            out.append("โš ๏ธ SHORTCUT LEARNING DETECTED โ€” wrote without reading source")

        out.append(f"\n๐Ÿ“ {data['explanation']}")
        if data.get("recommendations"):
            out.append("\n๐Ÿ’ก Recommendations:")
            out.extend(f" โ†’ {rec}" for rec in data["recommendations"])
        return "\n".join(out)
    except Exception as exc:
        return f"Error: {exc}"
# ── Tab 9: Counterfactual ────────────────────────────────────────────────────
def get_counterfactual():
    """Run the counterfactual analysis on the current episode and format it.

    Returns:
        The no-trajectory notice, the formatted robustness report (mutation
        descriptions cut to 55 characters, reasons to 80), or an ``Error:``
        line if anything raises.
    """
    def gauge(value):
        """Draw ``value`` as a 20-cell bar."""
        filled = int(value * 20)
        return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()

        analysis = counterfactual_engine.analyze(
            ep_id, env.current_task or "?", steps, meta,
            list(env.files_read), list(env.files_written), env.final_score,
        )
        data = analysis.to_dict()

        out = [
            "๐ŸŽญ COUNTERFACTUAL ROBUSTNESS TEST",
            "โ”" * 55,
            f"Brittleness Level: {data['brittleness_level']}",
            f"Robustness Score: [{gauge(data['robustness_score'])}] {data['robustness_score']:.3f}",
            f"Mutations Tested: {data['mutations_tested']}",
            f"Mutations Survived: {data['mutations_survived']} โœ… | Failed: {data['mutations_failed']} โŒ",
            "\n๐Ÿงฌ Mutation Results:",
        ]
        for mutation in data.get("mutations", []):
            if mutation["would_break_agent"]:
                icon = "โŒ"
            else:
                icon = "โœ…"
            out.append(f" {icon} [{mutation['type']}] {mutation['description'][:55]}")
            out.append(f" {mutation['why'][:80]}")

        if data.get("surface_dependencies"):
            out.append("\nโš ๏ธ Surface Dependencies:")
            out.extend(f" โ€ข {dep}" for dep in data["surface_dependencies"])
        if data.get("deep_dependencies"):
            out.append("\nโœ… Deep Dependencies:")
            out.extend(f" โ€ข {dep}" for dep in data["deep_dependencies"])

        out.append(f"\n๐Ÿ“ {data['explanation']}")
        if data.get("recommendations"):
            out.append("\n๐Ÿ’ก Recommendations:")
            out.extend(f" โ†’ {rec}" for rec in data["recommendations"])
        return "\n".join(out)
    except Exception as exc:
        return f"Error: {exc}"
# ── Tab 10: Confidence Calibration ───────────────────────────────────────────
def get_calibration():
    """Run confidence calibration on the current episode and format it.

    Returns:
        The no-trajectory notice, the formatted calibration report (at most
        the first eight confidence-trajectory entries), or an ``Error:`` line
        if anything raises.
    """
    def gauge(value):
        """Draw ``value`` as a 20-cell bar."""
        filled = int(value * 20)
        return "โ–ˆ" * filled + "โ–‘" * (20 - filled)

    try:
        traj, meta, steps, ep_id = _get_traj_and_meta()
        if not traj:
            return _no_traj()

        calibration = confidence_calibrator.calibrate(
            ep_id, env.current_task or "?", steps, env.final_score,
        )
        data = calibration.to_dict()

        out = [
            "๐Ÿ“ CONFIDENCE CALIBRATION REPORT",
            "โ”" * 55,
            f"Calibration Profile: {data['profile']}",
            f"Calibration Score: [{gauge(data['calibration_score'])}] {data['calibration_score']:.3f}",
            f"Inferred Confidence: [{gauge(data['inferred_confidence'])}] {data['inferred_confidence']:.3f}",
            f"Actual Performance: [{gauge(data['actual_performance'])}] {data['actual_performance']:.3f}",
            f"Calibration Error: {data['expected_calibration_error']:.3f} (lower=better)",
            f"Conf-Acc Correlation: {data['confidence_accuracy_correlation']:.3f}",
            "\n๐Ÿ“Š Behavioral Signals:",
        ]

        # Missing signals are reported as 0.
        signals = data.get("signals", {})
        out.extend([
            f" Commitment Speed: {signals.get('commitment_speed',0):.3f} (high=fast commit)",
            f" Re-Exploration Rate: {signals.get('re_exploration_rate',0):.3f} (high=uncertain)",
            f" Verification Rate: {signals.get('verification_rate',0):.3f} tests/write",
            f" Submit Speed: {signals.get('submit_speed',0):.3f} (high=early submit)",
        ])

        out.append(f"\n๐Ÿ“ {data['diagnosis']}")
        if data.get("recommendations"):
            out.append("\n๐Ÿ’ก Recommendations:")
            out.extend(f" โ†’ {rec}" for rec in data["recommendations"])

        if data.get("confidence_trajectory"):
            out.append("\n๐Ÿ“ˆ Confidence Trajectory:")
            for point in data["confidence_trajectory"][:8]:
                # Accuracy is only shown for entries that carry one.
                if point['accuracy'] is not None:
                    accuracy = f" | acc={point['accuracy']:.2f}"
                else:
                    accuracy = ""
                out.append(f" S{point['step']}: {point['action']:12s} conf={point['confidence']:.2f}{accuracy}")
        return "\n".join(out)
    except Exception as exc:
        return f"Error: {exc}"
# ── Tab 11: Benchmark ────────────────────────────────────────────────────────
def run_benchmark(tasks_selected, agents_selected):
    """Run the benchmark for the chosen tasks and agents and return its table.

    Args:
        tasks_selected: Task names; an empty selection means all three tasks.
        agents_selected: Agent names; an empty selection is passed as ``None``.

    Returns:
        ``report.render_table()``, or an error line if anything raises.
    """
    try:
        chosen_tasks = tasks_selected or ["task1", "task2", "task3"]
        chosen_agents = agents_selected or None
        leaderboard = benchmark_runner.run(env, tasks=chosen_tasks, agents=chosen_agents)
        return leaderboard.render_table()
    except Exception as exc:
        return f"โŒ Benchmark error: {exc}"
# ── Tab 12: Analytics ────────────────────────────────────────────────────────
def get_analytics():
    """Return the analytics engine's text report for the current episode.

    Returns:
        The no-trajectory notice when nothing is recorded, the rendered
        report otherwise, or an ``Error:`` line if anything raises.
    """
    try:
        if env.get_trajectory():
            return analytics_engine.analyze(env).render_text()
        return _no_traj()
    except Exception as exc:
        return f"Error: {exc}"
def get_analytics_json():
    """Return the analytics report for the current episode as indented JSON.

    Returns:
        The no-trajectory notice when nothing is recorded, the JSON text
        otherwise, or an ``Error:`` line if anything raises.
    """
    try:
        if env.get_trajectory():
            payload = analytics_engine.analyze(env).to_dict()
            # default=str stringifies any value json cannot serialize natively.
            return json.dumps(payload, indent=2, default=str)
        return _no_traj()
    except Exception as exc:
        return f"Error: {exc}"
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# Gradio UI
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# Dashboard layout: one tab per feature. Each button is wired to the handler
# of the same purpose defined earlier in this module, and every handler
# writes its text into that tab's output component.
with gr.Blocks(title="Codebase Navigation & Repair โ€” OpenEnv v4") as demo:
    gr.Markdown(
        "# ๐Ÿ” Codebase Navigation & Repair โ€” OpenEnv v4\n"
        "**The first platform that scientifically measures, explains, and improves AI agent reasoning.** "
        "Navigate ยท Fix ยท Evaluate Process ยท Probe Causality ยท Test Counterfactuals ยท Calibrate Confidence ยท Benchmark."
    )
    with gr.Tabs():
        # ── Tab 0: Quick Start Guide ─────────────────────────────────────────
        with gr.TabItem("๐Ÿ“– Quick Start Guide"):
            # NOTE(review): this guide describes files as cubes and tests as
            # prisms, while the 3D Visualizer tab says spheres — confirm
            # which matches viz3d.html.
            gr.Markdown("""
### Welcome to Codebase Navigation & Repair โ€” OpenEnv v4
This interactive dashboard allows you to experience the environment infrastructure, run simulations, and analyze advanced agent logic.
#### ๐Ÿš€ Step-by-Step Evaluation Guide:
1. **Initialize an Episode**
- Navigate to the **๐Ÿค– Run Agent** tab.
- Select a task (`task1`, `task2`, or `task3`) and click **"Run Agent"**.
- *This simulates an AI executing an episode dynamically against the environment and stores the trajectory.*
2. **Trigger Advanced Intelligence Diagnostics (v3/v4 Features)**
- Go to **๐Ÿงช Causal Probe** and click it to evaluate if the agent truly understood the bug, or if it was just pattern-matching.
- Go to **๐ŸŽญ Counterfactual** to run mutation tests and analyze the brittleness of the agent's logic.
- Go to **๐Ÿ“ Confidence** to see if the agent over-explored or submitted too early.
- Go to **๐Ÿง  Intelligence** to execute failure classification and strategy detection.
3. **Visualize the Thought Process**
- Head over to the **๐ŸŒ 3D Visualizer** tab.
- Click **"Load / Refresh Visualizer"**.
- Using Three.js, this generates a dynamic 3D web of exactly how the agent traversed the repository files (cubes) and tests (prisms).
4. **Experiment Manually**
- Want to play the game yourself? Go to the **๐ŸŽฎ Interactive** tab.
- Click **Reset Environment**, then use the dropdowns to `read_file`, `write_file`, and finally `submit` to grade yourself.
5. **REST API / CLI Runner**
- The entire platform operates out of incredibly fast, natively compliant REST endpoints. Check the **๐Ÿ“– API** tab for standard cURL routing.
""")
        # ── Tab 1: Interactive ───────────────────────────────────────────────
        with gr.TabItem("๐ŸŽฎ Interactive"):
            with gr.Row():
                # Left column: episode reset and action inputs.
                with gr.Column(scale=1):
                    task_sel = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
                    reset_btn = gr.Button("๐Ÿ”„ Reset Environment", variant="primary")
                    gr.Markdown("### Action")
                    act_type = gr.Dropdown(["read_file","write_file","run_tests","search_code","submit"], value="read_file", label="Action Type")
                    act_path = gr.Textbox(label="Path", placeholder="src/auth.py")
                    act_query = gr.Textbox(label="Query", placeholder="validate_token")
                    act_content = gr.Textbox(label="Content (write_file)", lines=4)
                    step_btn = gr.Button("โ–ถ๏ธ Execute Step", variant="secondary")
                # Right column: status, last result and running counters.
                with gr.Column(scale=2):
                    status_box = gr.Textbox(label="Status", lines=14, interactive=False)
                    result_box = gr.Textbox(label="Last Result", lines=8, interactive=False)
                    with gr.Row():
                        steps_box = gr.Textbox(label="Steps", value="0", interactive=False)
                        reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
            reset_btn.click(reset_environment, [task_sel], [status_box, result_box, steps_box, reward_box])
            step_btn.click(take_step, [act_type, act_path, act_query, act_content], [status_box, result_box, steps_box, reward_box])
        # ── Tab 2: Run Agent ─────────────────────────────────────────────────
        with gr.TabItem("๐Ÿค– Run Agent"):
            gr.Markdown("### Built-in Demonstration Agent\nRuns test-first deterministic strategy + stores lesson in memory bank.")
            agent_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
            run_btn = gr.Button("๐Ÿš€ Run Agent", variant="primary")
            agent_out = gr.Textbox(label="Agent Log", lines=22, interactive=False)
            run_btn.click(run_builtin_agent, [agent_task], [agent_out])
        # ── Tab 3: Evaluation ────────────────────────────────────────────────
        with gr.TabItem("๐Ÿ“Š Evaluation"):
            with gr.Row():
                eval_btn = gr.Button("๐ŸŽฏ Evaluation Report", variant="primary")
                metrics_btn = gr.Button("๐Ÿ“ˆ Metrics JSON", variant="secondary")
                traj_btn = gr.Button("๐Ÿ—บ๏ธ Trajectory", variant="secondary")
            # All three buttons share one output box.
            eval_out = gr.Textbox(label="Output", lines=28, interactive=False)
            eval_btn.click(get_evaluation, outputs=[eval_out])
            metrics_btn.click(get_metrics, outputs=[eval_out])
            traj_btn.click(get_trajectory, outputs=[eval_out])
        # ── Tab 4: Intelligence ──────────────────────────────────────────────
        with gr.TabItem("๐Ÿง  Intelligence"):
            gr.Markdown("### Deep Agent Intelligence Analysis")
            with gr.Row():
                clf_btn = gr.Button("๐Ÿ”ฌ Classify Failure", variant="primary")
                strat_btn = gr.Button("๐Ÿงญ Detect Strategy", variant="secondary")
                adv_btn = gr.Button("โšก Advanced Metrics", variant="secondary")
            intel_out = gr.Textbox(label="Analysis", lines=32, interactive=False)
            clf_btn.click(get_failure_classification, outputs=[intel_out])
            strat_btn.click(get_strategy_detection, outputs=[intel_out])
            adv_btn.click(get_advanced_metrics, outputs=[intel_out])
        # ── Tab 5: Self-Improve ──────────────────────────────────────────────
        with gr.TabItem("๐Ÿ” Self-Improve"):
            gr.Markdown("### Self-Improvement Loop + Episodic Memory")
            with gr.Row():
                improve_btn = gr.Button("๐Ÿ” Improvement Plan", variant="primary")
                mem_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task for Memory")
                mem_btn = gr.Button("๐Ÿง  Retrieve Memory", variant="secondary")
            improve_out = gr.Textbox(label="Output", lines=32, interactive=False)
            improve_btn.click(get_improvement_plan, outputs=[improve_out])
            mem_btn.click(get_memory_context_for_task, [mem_task], [improve_out])
        # ── Tab 6: Compare Agents ────────────────────────────────────────────
        with gr.TabItem("โš–๏ธ Compare Agents"):
            gr.Markdown("### Multi-Agent Strategy Comparison")
            with gr.Row():
                comp_task = gr.Dropdown(["task1","task2","task3"], value="task1", label="Task")
                comp_agents = gr.CheckboxGroup(
                    ["test-first","search-first","minimal","exhaustive"],
                    value=["test-first","search-first","minimal","exhaustive"],
                    label="Agents",
                )
            comp_btn = gr.Button("โš–๏ธ Run Comparison", variant="primary")
            comp_out = gr.Textbox(label="Report", lines=30, interactive=False)
            comp_btn.click(run_comparison, [comp_task, comp_agents], [comp_out])
        # ── Tab 7: 3D Visualizer ─────────────────────────────────────────────
        with gr.TabItem("๐ŸŒ 3D Visualizer"):
            gr.Markdown(
                "### Agent Trajectory 3D Visualization\n"
                "Files = glowing 3D spheres ยท Dependencies = edges ยท Agent = animated beam ยท **Run an episode first.**"
            )
            refresh_btn = gr.Button("๐Ÿ”„ Load / Refresh Visualizer", variant="primary")
            # Placeholder shown until the button swaps in the iframe.
            viz_html = gr.HTML(
                value='<div style="text-align:center;padding:60px;color:#475569;background:#0a0e1a;border-radius:10px">'
                '<p style="font-size:24px">๐ŸŒ</p>'
                '<p style="color:#7dd3fc;font-weight:700">Run an episode then click Load</p></div>'
            )
            refresh_btn.click(get_viz_iframe, outputs=[viz_html])
        # ── Tab 8: Causal Probe ──────────────────────────────────────────────
        with gr.TabItem("๐Ÿงช Causal Probe"):
            gr.Markdown(
                "### Causal Reasoning Evaluation\n"
                "Did the agent truly understand WHY the bug exists, "
                "or did it pattern-match and guess? "
                "Measures chain coverage, order, and shortcut learning."
            )
            causal_btn = gr.Button("๐Ÿงช Run Causal Probe", variant="primary")
            causal_out = gr.Textbox(label="Causal Reasoning Report", lines=32, interactive=False)
            causal_btn.click(get_causal_probe, outputs=[causal_out])
        # ── Tab 9: Counterfactual ────────────────────────────────────────────
        with gr.TabItem("๐ŸŽญ Counterfactual"):
            gr.Markdown(
                "### Counterfactual Robustness Testing\n"
                "Applies 6 semantic-neutral mutations (filename rename, constant change, "
                "dummy function, directory shift, docstring noise, import reorder) "
                "and measures whether the agent's strategy survives."
            )
            cf_btn = gr.Button("๐ŸŽญ Run Counterfactual Analysis", variant="primary")
            cf_out = gr.Textbox(label="Robustness Report", lines=32, interactive=False)
            cf_btn.click(get_counterfactual, outputs=[cf_out])
        # ── Tab 10: Confidence ───────────────────────────────────────────────
        with gr.TabItem("๐Ÿ“ Confidence"):
            gr.Markdown(
                "### Confidence Calibration Analysis\n"
                "Infers agent confidence from behavioral proxies (commitment speed, "
                "re-exploration rate, verification rate, submit timing) "
                "and compares to actual performance. Detects overconfident and underconfident agents."
            )
            calib_btn = gr.Button("๐Ÿ“ Analyze Calibration", variant="primary")
            calib_out = gr.Textbox(label="Calibration Report", lines=32, interactive=False)
            calib_btn.click(get_calibration, outputs=[calib_out])
        # ── Tab 11: Benchmark ────────────────────────────────────────────────
        with gr.TabItem("๐Ÿ† Benchmark"):
            gr.Markdown(
                "### Automated Benchmark Leaderboard\n"
                "Runs all selected agent strategies ร— all selected tasks automatically. "
                "Ranks by composite score: correctness + causal reasoning + robustness + calibration + generalization."
            )
            with gr.Row():
                bench_tasks = gr.CheckboxGroup(["task1","task2","task3"], value=["task1","task2"], label="Tasks to Benchmark")
                bench_agents = gr.CheckboxGroup(
                    ["test-first","search-first","minimal","exhaustive"],
                    value=["test-first","minimal"],
                    label="Agent Strategies",
                )
            bench_btn = gr.Button("๐Ÿ† Run Benchmark (2โ€“4 min)", variant="primary")
            bench_out = gr.Textbox(label="Leaderboard", lines=35, interactive=False)
            bench_btn.click(run_benchmark, [bench_tasks, bench_agents], [bench_out])
        # ── Tab 12: Analytics ────────────────────────────────────────────────
        with gr.TabItem("๐Ÿ“ˆ Analytics"):
            gr.Markdown(
                "### Unified Research-Grade Analytics\n"
                "Synthesizes ALL evaluation dimensions into one report: "
                "reasoning graph, root cause tree, alternative paths, profile tags, "
                "decision efficiency, composite score. Paper-ready JSON available."
            )
            with gr.Row():
                analytics_btn = gr.Button("๐Ÿ“ˆ Full Analytics Report", variant="primary")
                analytics_json_btn = gr.Button("๐Ÿ“‹ Export JSON", variant="secondary")
            analytics_out = gr.Textbox(label="Analytics Report", lines=40, interactive=False)
            analytics_btn.click(get_analytics, outputs=[analytics_out])
            analytics_json_btn.click(get_analytics_json, outputs=[analytics_out])
        # ── Tab 13: API ──────────────────────────────────────────────────────
        with gr.TabItem("๐Ÿ“– API"):
            # Both /step examples send the JSON Content-Type header; without
            # it curl posts the body as form data rather than JSON.
            gr.Markdown("""
### REST API โ€” v4.0 Endpoints
#### Core
| `/reset` POST | `/step` POST | `/state` GET | `/health` GET |
#### Evaluation
| `/trajectory` GET | `/evaluate` GET | `/metrics` GET | `/fault-config` POST |
#### Intelligence (v3)
| `/classify` GET | `/strategy` GET | `/advanced-metrics` GET | `/improvement-plan` GET | `/compare-agents` POST | `/viz-data` GET |
#### Research (v4 NEW)
| `/causal-probe` GET | `/counterfactual` GET | `/confidence` GET | `/benchmark` POST | `/analytics` GET |
```bash
BASE="http://localhost:7860"
# Run a full episode
curl -X POST "$BASE/reset?task=task1"
curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"read_file","path":"tests/test_formatter.py"}'
curl -X POST "$BASE/step" -H "Content-Type: application/json" -d '{"action_type":"submit"}'
# All intelligence endpoints
curl "$BASE/classify"
curl "$BASE/causal-probe"
curl "$BASE/counterfactual"
curl "$BASE/confidence"
curl "$BASE/analytics"
# Benchmark
curl -X POST "$BASE/benchmark?tasks=task1,task2"
```
""")
# ── Mount FastAPI ────────────────────────────────────────────────────────────
# Attach the Gradio UI to the FastAPI application at the root path; the
# entry point below then serves that application on port 7860.
from server.app import app as fastapi_app

gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")

if __name__ == "__main__":
    # Imported lazily: uvicorn is only needed when run as a script.
    import uvicorn

    uvicorn.run(fastapi_app, port=7860, host="0.0.0.0")