Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| e2e_test_v3.py — Full End-to-End test suite for v3.0 | |
| Tests every endpoint, all 3 tasks, all new intelligence modules, | |
| multi-agent comparison, and the 3D viz-data endpoint. | |
| """ | |
| import sys | |
| import json | |
| import time | |
| import requests | |
# Root URL of the server under test; every request in this script is built from it.
BASE = "http://localhost:7860"
# Running tallies of passed and failed checks; check() increments them and the
# final summary reads them.
PASS = 0
FAIL = 0
# One record per executed check, appended by check() and listed in the summary
# when something failed.
RESULTS = []
def check(name, condition, detail=""):
    """Record the outcome of one check and print a one-line report.

    Args:
        name: Human-readable label for the check.
        condition: Truthy when the check passed. It is coerced to ``bool`` so
            every record in ``RESULTS`` carries a real boolean.
        detail: Optional context printed after the label. ``None`` means
            "no detail"; any other value is converted with ``str`` so raw
            response fields (numbers, lists, ...) can be passed directly.

    Side effects:
        Increments the module-level ``PASS`` or ``FAIL`` counter, prints the
        report line, and appends a ``{"name", "passed", "detail"}`` record to
        ``RESULTS``.
    """
    global PASS, FAIL
    passed = bool(condition)
    # Callers frequently forward `d.get(...)` results; normalising here keeps
    # "None" out of the failure summary and stops falsy values such as 0 from
    # being silently dropped.
    detail = "" if detail is None else str(detail)
    # NOTE(review): the "β" glyphs in these literals look like mis-decoded
    # UTF-8 symbols — confirm against the upstream file before changing them.
    if passed:
        PASS += 1
        status = "β PASS"
    else:
        FAIL += 1
        status = "β FAIL"
    msg = f" {status} {name}"
    if detail:
        msg += f" β {detail}"
    print(msg)
    RESULTS.append({"name": name, "passed": passed, "detail": detail})
def section(title):
    """Print a banner that introduces the next group of checks.

    Output is a blank line, a 60-character rule, the indented title, and a
    closing rule.
    """
    rule = 'β' * 60
    for line in (f"\n{rule}", f" {title}", rule):
        print(line)
# ---------------------------------------------------------------------------
# 1. Health & basic connectivity
# ---------------------------------------------------------------------------
section("1. HEALTH & BASIC CONNECTIVITY")
try:
    # Bounded wait: without a timeout an unresponsive server would hang the
    # whole run on its very first request.
    r = requests.get(f"{BASE}/health", timeout=10)
except requests.RequestException as exc:
    # Every later section talks to the same server, so there is nothing
    # useful left to run if it cannot be reached at all.
    check("GET /health returns 200", False, f"server unreachable: {exc}")
    sys.exit(1)
check("GET /health returns 200", r.status_code == 200)
try:
    data = r.json()
except ValueError:
    # A non-JSON body should fail the field checks below, not abort the run.
    data = {}
if not isinstance(data, dict):
    data = {}
check("Health version is 3.0.0", data.get("version") == "3.0.0", data.get("version"))
check("Health status is ok", data.get("status") == "ok")
# ---------------------------------------------------------------------------
# 2. Reset endpoint for each of the three tasks
# ---------------------------------------------------------------------------
section("2. CORE OPENENV β ALL 3 TASKS")
for task in ("task1", "task2", "task3"):
    r = requests.post(f"{BASE}/reset?task={task}")
    check(f"POST /reset?task={task} β 200", r.status_code == 200, f"status={r.status_code}")
    if r.status_code != 200:
        # The payload checks only make sense for a successful reset.
        continue
    d = r.json()
    obs = d.get("observation", {})
    has_tree = bool(obs.get("repo_tree"))
    tree_preview = str(obs.get("repo_tree", [])[:2])
    check(f" {task}: has repo_tree", has_tree, tree_preview)
    variant_id = d.get("info", {}).get("variant_id")
    check(f" {task}: has variant_id", bool(variant_id))
    steps_left = obs.get("steps_remaining", 0)
    check(f" {task}: steps_remaining > 0", steps_left > 0)
# ---------------------------------------------------------------------------
# 3. One complete task1 episode: read, search, run tests, submit
# ---------------------------------------------------------------------------
section("3. STEP ACTIONS β FULL EPISODE (task1)")
r = requests.post(f"{BASE}/reset?task=task1")
obs = r.json()["observation"]
tree = obs["repo_tree"]
# These two lists are reused by later sections, so they stay at module level.
test_files = [f for f in tree if f.startswith("tests/")]
src_files = [f for f in tree if f.startswith("src/")]

# read_file — an empty file list is reported as a failed check instead of
# aborting the script with an IndexError.
if test_files:
    r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
    check("POST /step read_file test file β 200", r.status_code == 200)
    reward = r.json().get("reward", -1)
    # A null reward must fail the check rather than raise on the comparison.
    check("read_file reward >= 0", isinstance(reward, (int, float)) and reward >= 0, str(reward))
else:
    check("POST /step read_file test file β 200", False, "no tests/ file in repo_tree")
    check("read_file reward >= 0", False, "no tests/ file in repo_tree")

if src_files:
    r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": src_files[0]})
    check("POST /step read_file src file β 200", r.status_code == 200)
else:
    check("POST /step read_file src file β 200", False, "no src/ file in repo_tree")

# search_code
r = requests.post(f"{BASE}/step", json={"action_type": "search_code", "query": "def "})
check("POST /step search_code β 200", r.status_code == 200)

# run_tests
r = requests.post(f"{BASE}/step", json={"action_type": "run_tests"})
check("POST /step run_tests β 200", r.status_code == 200, f"reward={r.json().get('reward')}")

# submit
r = requests.post(f"{BASE}/step", json={"action_type": "submit"})
check("POST /step submit β 200", r.status_code == 200)
submit_body = r.json()
# Tolerate a missing "info" object, and surface the score in the report
# instead of computing it and throwing it away.
final_score = (submit_body.get("info") or {}).get("final_score", 0)
check("Episode done after submit", submit_body.get("done") is True, f"final_score={final_score}")

# Stepping after the episode is done should be rejected with 400.
r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": "x.py"})
check("POST /step after done β 400", r.status_code == 400)
# ---------------------------------------------------------------------------
# 4. State endpoint after one step of a fresh episode
# ---------------------------------------------------------------------------
section("4. STATE ENDPOINT")
requests.post(f"{BASE}/reset?task=task1")
# The step exists only so that /state has something to count. When the repo
# tree exposed no tests/ file, fall back to a search_code action instead of
# raising IndexError on test_files[0].
if test_files:
    step_action = {"action_type": "read_file", "path": test_files[0]}
else:
    step_action = {"action_type": "search_code", "query": "def "}
requests.post(f"{BASE}/step", json=step_action)
r = requests.get(f"{BASE}/state")
check("GET /state β 200", r.status_code == 200)
d = r.json()
check("State has observation", "observation" in d)
check("State total_steps_taken >= 1", d.get("total_steps_taken", 0) >= 1)
# ---------------------------------------------------------------------------
# 5. Trajectory, evaluation and metrics for the episode left open by section 4
# ---------------------------------------------------------------------------
section("5. TRAJECTORY & EVALUATION")
# No reset here: the submit acts on whatever episode is currently active.
requests.post(f"{BASE}/step", json={"action_type": "submit"})

r = requests.get(f"{BASE}/trajectory")
check("GET /trajectory β 200", r.status_code == 200)
traj = r.json()
step_count = len(traj.get("steps", []))
check("Trajectory has episode_id", bool(traj.get("episode_id")))
check("Trajectory steps > 0", step_count > 0, f"steps={step_count}")

r = requests.get(f"{BASE}/evaluate")
check("GET /evaluate β 200", r.status_code == 200)
ev = r.json()
dimensions = ev.get("dimensions", {})
check("Evaluation has composite_score", "composite_score" in ev, str(ev.get("composite_score")))
check("Evaluation has 6 dimensions", len(dimensions) == 6, str(list(dimensions.keys())))

r = requests.get(f"{BASE}/metrics")
check("GET /metrics β 200", r.status_code == 200)
m = r.json()
first_keys = list(m.keys())[:5]
check("Metrics has timeline", "timeline" in m, str(first_keys))
# ---------------------------------------------------------------------------
# 6. Fault injection: enable the "light" level, reset, inspect, restore
# ---------------------------------------------------------------------------
section("6. FAULT INJECTION")
r = requests.post(f"{BASE}/fault-config", json={"level": "light"})
try:
    check("POST /fault-config light β 200", r.status_code == 200)
    r = requests.post(f"{BASE}/reset?task=task1")
    check("Reset with fault injection β 200", r.status_code == 200)
    # `or {}` covers both a missing key and an explicit null, either of which
    # would otherwise break the membership tests below.
    fi = (r.json().get("info") or {}).get("fault_injection") or {}
    check("Fault injection info present", "difficulty_multiplier" in fi or "faults_injected" in fi, str(fi))
finally:
    # Always switch fault injection back off, even if something above raised,
    # so the sections that follow do not run against a fault-injected server.
    requests.post(f"{BASE}/fault-config", json={"level": "none"})
# ---------------------------------------------------------------------------
# 7. Failure classifier on a deliberately minimal-effort episode
# ---------------------------------------------------------------------------
section("7. INTELLIGENCE β FAILURE CLASSIFIER")
# Fresh episode, submitted immediately without any other action.
requests.post(f"{BASE}/reset?task=task1")
requests.post(f"{BASE}/step", json={"action_type": "submit"})
r = requests.get(f"{BASE}/classify")
check("GET /classify β 200", r.status_code == 200)
d = r.json()
check("Classify has episode_id", "episode_id" in d, d.get("episode_id"))
check("Classify has primary_failure", "primary_failure" in d, d.get("primary_failure"))
check("Classify has success field", "success" in d)
# Identity comparison: the field must be a real boolean false.
check("Classify success=False for minimal effort", d.get("success") is False)
# A null or non-string hint must show up as a failed check; slicing it
# directly would raise TypeError before check() could report anything.
retry_hint = d.get("retry_hint")
check("Classify has retry_hint", bool(retry_hint), str(retry_hint or "")[:60])
# ---------------------------------------------------------------------------
# 8. Strategy detector (queried with no reset since the previous section)
# ---------------------------------------------------------------------------
section("8. INTELLIGENCE β STRATEGY DETECTOR")
VALID_STRATEGIES = [
    "TARGETED_DEBUGGING",
    "SYSTEMATIC_SEARCH",
    "BRUTE_FORCE",
    "RANDOM_EXPLORATION",
    "SPEC_DRIVEN",
    "MINIMAL_EFFORT",
]
r = requests.get(f"{BASE}/strategy")
check("GET /strategy β 200", r.status_code == 200)
d = r.json()
strategy_label = d.get("strategy")
check("Strategy has strategy field", "strategy" in d, strategy_label)
check("Strategy is a known label", strategy_label in VALID_STRATEGIES, strategy_label)
# A missing score defaults to -1 so the range check fails instead of passing.
score_in_range = 0 <= d.get("score", -1) <= 1
check("Strategy has score 0-1", score_in_range, str(d.get("score")))
check("Strategy has exploration_ratio", "exploration_ratio" in d)
check("Strategy has sub_patterns list", isinstance(d.get("sub_patterns"), list))
# ---------------------------------------------------------------------------
# 9. Advanced metrics: required keys plus two shape checks
# ---------------------------------------------------------------------------
section("9. INTELLIGENCE β ADVANCED METRICS")
r = requests.get(f"{BASE}/advanced-metrics")
check("GET /advanced-metrics β 200", r.status_code == 200)
d = r.json()
expected_keys = [
    "reasoning_efficiency",
    "exploration_ratio",
    "decision_entropy",
    "reliability_index",
    "pivot_rate",
    "wasteful_ratio",
    "consistency_score",
]
for key in expected_keys:
    shown_value = str(d.get(key, "MISSING"))
    check(f" advanced-metrics has '{key}'", key in d, shown_value)
# A missing reliability_index defaults to -1, which fails the range check.
reliability = d.get("reliability_index", -1)
check("reliability_index in [0,1]", 0 <= reliability <= 1)
action_distribution = d.get("action_distribution")
check("action_distribution is dict", isinstance(action_distribution, dict))
# ---------------------------------------------------------------------------
# 10. Improvement plan: required fields and a non-empty step list
# ---------------------------------------------------------------------------
section("10. INTELLIGENCE β IMPROVEMENT PLAN")
r = requests.get(f"{BASE}/improvement-plan")
check("GET /improvement-plan β 200", r.status_code == 200)
d = r.json()
check("Plan has failure_type", "failure_type" in d, d.get("failure_type"))
# These two fields must be present and non-empty.
for field in ("what_went_wrong", "improved_strategy"):
    check(f"Plan has {field}", bool(d.get(field)))
plan_is_list = isinstance(d.get("step_by_step_plan"), list)
check("Plan has step_by_step_plan list", plan_is_list)
plan_length = len(d.get("step_by_step_plan", []))
check("Plan step_by_step_plan not empty", plan_length > 0)
check("Plan has system_prompt_addon", "system_prompt_addon" in d)
# ---------------------------------------------------------------------------
# 11. Multi-agent comparison: an explicit pair, then the default agent set
# ---------------------------------------------------------------------------
section("11. MULTI-AGENT COMPARISON")
r = requests.post(f"{BASE}/compare-agents?task=task1&agents=test-first,minimal")
check("POST /compare-agents (2 agents) β 200", r.status_code == 200, f"status={r.status_code}")
if r.status_code == 200:
    d = r.json()
    summary_rows = d.get("summary_table", [])
    row_fields = ("score", "steps", "strategy")
    check("Comparison has winner", "winner" in d, d.get("winner"))
    check("Comparison has summary_table", "summary_table" in d)
    check("Summary table has 2 rows", len(summary_rows) == 2, str(len(summary_rows)))
    rows_complete = all(
        all(field in row for field in row_fields)
        for row in summary_rows
    )
    check("Each row has score/steps/strategy", rows_complete)
    check("Comparison has insights", "insights" in d)
    detailed_runs = d.get("detailed_runs", [])
    check("Comparison has detailed_runs", len(detailed_runs) == 2)

# With no `agents` parameter, four summary rows are expected.
r = requests.post(f"{BASE}/compare-agents?task=task1")
check("POST /compare-agents (all agents) β 200", r.status_code == 200)
if r.status_code == 200:
    d = r.json()
    row_count = len(d.get("summary_table", []))
    check("All 4 agents ran", row_count == 4, f"rows={row_count}")
# ---------------------------------------------------------------------------
# 12. 3D visualization data for a short, completed episode
# ---------------------------------------------------------------------------
section("12. 3D VISUALIZATION DATA")
# Run an episode first so the endpoint has something to describe.
requests.post(f"{BASE}/reset?task=task1")
# Skip the read step when no tests/ file is known, rather than raising
# IndexError on test_files[0].
if test_files:
    requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": test_files[0]})
requests.post(f"{BASE}/step", json={"action_type": "submit"})
r = requests.get(f"{BASE}/viz-data")
check("GET /viz-data β 200", r.status_code == 200)
d = r.json()
# `or []` keeps the length details from raising when a field is null.
viz_files = d.get("files") or []
check("Viz-data has files array", isinstance(d.get("files"), list), f"len={len(viz_files)}")
check("Viz-data files > 0", len(viz_files) > 0)
check("Viz-data has dependencies", isinstance(d.get("dependencies"), list))
check("Viz-data has steps", isinstance(d.get("steps"), list))
check("Viz-data has strategy", "strategy" in d, d.get("strategy"))
check("Viz-data has final_score", "final_score" in d)
if viz_files:
    first_node = viz_files[0]
    check("File node has name/type/is_bug_file", all(k in first_node for k in ["name", "type", "is_bug_file"]))
# ---------------------------------------------------------------------------
# 13. Invalid input handling: bad task, bad action type, missing file
# ---------------------------------------------------------------------------
section("13. INVALID ACTION HANDLING")
requests.post(f"{BASE}/reset?task=task1")

# Invalid task
r = requests.post(f"{BASE}/reset?task=task99")
check("Invalid task β 400", r.status_code == 400)

# Invalid action type
r = requests.post(f"{BASE}/step", json={"action_type": "hack_system"})
check("Invalid action_type β 400 or 422", r.status_code in (400, 422))

# Non-existent file: expected to answer 200 and report the problem in the
# observation.
r = requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": "non_existent.py"})
check("Read non-existent file β 200 with error", r.status_code == 200)
obs = r.json().get("observation") or {}
# When the error field is null this must register as a failed check; slicing
# None would raise TypeError before check() could report it.
action_error = obs.get("last_action_error")
check("Non-existent file has error in obs", bool(action_error), str(action_error or "")[:60])
# ---------------------------------------------------------------------------
# 14. Security scanning of written file content
# ---------------------------------------------------------------------------
section("14. SECURITY SCANNING")
requests.post(f"{BASE}/reset?task=task1")
# Write content containing an os.system call and expect it to be flagged.
target_path = src_files[0] if src_files else "src/hack.py"
write_action = {
    "action_type": "write_file",
    "path": target_path,
    "content": "import os\nos.system('rm -rf /')\n",
}
r = requests.post(f"{BASE}/step", json=write_action)
check("Write dangerous code β 200", r.status_code == 200)
if r.status_code == 200:
    info = r.json().get("info", {})
    flags = info.get("security_flags", [])
    flag_preview = str(flags[:2])
    check("Security flags populated for os.system", len(flags) > 0, flag_preview)
# ---------------------------------------------------------------------------
# 15. UI endpoints: root page and the static 3D viewer
# ---------------------------------------------------------------------------
section("15. GRADIO UI ENDPOINTS")
r = requests.get(f"{BASE}/")
check("GET / (Gradio UI) β 200", r.status_code == 200)
content_type = r.headers.get("content-type", "")
check("Response is HTML", "text/html" in content_type)

r = requests.get(f"{BASE}/static/viz3d.html")
check("GET /static/viz3d.html β 200", r.status_code == 200)
page = r.text
page_lower = page.lower()
# Only the first 200 characters are inspected for the HTML marker.
check("viz3d.html is HTML", "html" in page_lower[:200])
check("viz3d.html has Three.js", "three" in page_lower)
check("viz3d.html has timeline-slider", "timeline-slider" in page)
# ---------------------------------------------------------------------------
# 16. Full episodes for task2 and task3, then the intelligence endpoints
# ---------------------------------------------------------------------------
section("16. TASK2 & TASK3 FULL EPISODE")
for task in ["task2", "task3"]:
    r = requests.post(f"{BASE}/reset?task={task}")
    check(f"{task} reset β 200", r.status_code == 200)
    if r.status_code != 200:
        # Without a successful reset there is no observation to read; the
        # failed check above already records the problem, so move on instead
        # of crashing on the missing payload.
        continue
    obs = r.json()["observation"]
    tree = obs["repo_tree"]
    tf = [f for f in tree if f.startswith("tests/")]
    sf = [f for f in tree if f.startswith("src/")]
    md = [f for f in tree if f.endswith(".md")]
    # For task3 a markdown file, when present, is read before anything else.
    if task == "task3" and md:
        requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": md[0]})
    if tf:
        requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": tf[0]})
    if sf:
        requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": sf[0]})
    r = requests.post(f"{BASE}/step", json={"action_type": "submit"})
    check(f"{task} submit β done", r.json().get("done") is True)
    # Verify the intelligence endpoints respond after the episode.
    r = requests.get(f"{BASE}/classify")
    check(f"{task} /classify works", r.status_code == 200 and "primary_failure" in r.json())
    r = requests.get(f"{BASE}/strategy")
    check(f"{task} /strategy works", r.status_code == 200 and "strategy" in r.json())
# ---------------------------------------------------------------------------
# 17. Consistency: three identical task1 runs, then the aggregate metric
# ---------------------------------------------------------------------------
section("17. CONSISTENCY β 3 RUNS SAME TASK")
scores = []
for _ in range(3):
    requests.post(f"{BASE}/reset?task=task1")
    r = requests.get(f"{BASE}/state")
    tree = r.json()["observation"]["repo_tree"]
    tf = [f for f in tree if f.startswith("tests/")]
    if tf:
        requests.post(f"{BASE}/step", json={"action_type": "read_file", "path": tf[0]})
    requests.post(f"{BASE}/step", json={"action_type": "submit"})
    # The response is not used. NOTE(review): the call is kept in case the
    # endpoint records the run server-side — confirm before removing it.
    requests.get(f"{BASE}/advanced-metrics")
    scores.append(requests.get(f"{BASE}/evaluate").json().get("composite_score", 0))
check("3 runs completed", len(scores) == 3, str(scores))
# A null score must fail the check rather than raise on the comparison.
check(
    "All runs have valid scores",
    all(isinstance(s, (int, float)) and 0 <= s <= 1 for s in scores),
    str(scores),
)

# Consistency metric
r = requests.get(f"{BASE}/advanced-metrics")
d = r.json()
consistency = d.get("consistency_score")
# Formatting None with ":.3f" raises TypeError, which would abort the script
# right before the summary; fall back to the plain representation instead.
if isinstance(consistency, (int, float)):
    consistency_text = f"{consistency:.3f}"
else:
    consistency_text = str(consistency)
check(
    "Consistency score populated after multiple runs",
    (d.get("runs_analyzed") or 0) >= 1,
    f"runs={d.get('runs_analyzed')}, consistency={consistency_text}",
)
# ---------------------------------------------------------------------------
# Summary: totals, pass rate, failed checks, process exit code
# ---------------------------------------------------------------------------
total_checks = PASS + FAIL
# Guard the percentage so an empty run reports 0.0% instead of raising
# ZeroDivisionError.
pass_rate = PASS / total_checks * 100 if total_checks else 0.0
print(f"\n{'β'*60}")
print(f" E2E RESULTS: {PASS} passed | {FAIL} failed | {PASS+FAIL} total")
print(f" Score: {pass_rate:.1f}%")
print(f"{'β'*60}")
if FAIL > 0:
    print("\nFailed tests:")
    # A dedicated loop name avoids clobbering the module-level response `r`.
    for entry in RESULTS:
        if not entry["passed"]:
            print(f" β {entry['name']}: {entry['detail']}")
# Exit status 0 only when every check passed, 1 otherwise.
sys.exit(0 if FAIL == 0 else 1)