# server/failure_classifier.py
"""
Typed Failure Classification Engine.

Classifies agent failures into precise, actionable categories rather than
vague scores. Each failure type has a root cause, evidence, and remediation.

Failure taxonomy:
    WRONG_FILE_NAVIGATION — agent read irrelevant files, missed key files
    BLIND_WRITE           — agent wrote code without reading first
    HALLUCINATED_CODE     — agent wrote code that left the tests failing
    NEVER_TESTED          — agent submitted without running any tests
    LOOPING_BEHAVIOR      — agent read the same file 3+ times
    CONTEXT_OVERFLOW      — agent read enormous amounts of irrelevant data
    SECURITY_VIOLATION    — agent wrote dangerous code
    CORRECT               — no failure detected
"""
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field


@dataclass
class FailureInstance:
    """One classified failure event."""

    failure_type: str  # e.g. "WRONG_FILE_NAVIGATION"
    severity: str      # "critical" | "major" | "minor"
    step_number: int   # 1-based step that triggered it (0 = not attributable)
    evidence: str      # Specific observation
    root_cause: str    # Why this happens
    remediation: str   # How to fix in next run


@dataclass
class FailureReport:
    """Full failure analysis for one episode."""

    episode_id: str
    task: str
    primary_failure: str  # Most severe failure type
    failures: List[FailureInstance] = field(default_factory=list)
    success: bool = False
    failure_summary: str = ""
    retry_hint: str = ""  # Actionable hint for the next attempt

    def to_dict(self) -> dict:
        """Return the report as a plain dict of built-in types."""
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "success": self.success,
            "primary_failure": self.primary_failure,
            "failure_count": len(self.failures),
            "failures": [
                {
                    "type": f.failure_type,
                    "severity": f.severity,
                    "step": f.step_number,
                    "evidence": f.evidence,
                    "root_cause": f.root_cause,
                    "remediation": f.remediation,
                }
                for f in self.failures
            ],
            "failure_summary": self.failure_summary,
            "retry_hint": self.retry_hint,
        }


# ── Severity ordering for picking primary failure ─────────────────────────────
SEVERITY_RANK = {"critical": 3, "major": 2, "minor": 1}

# ── Classifier thresholds ─────────────────────────────────────────────────────
SUCCESS_SCORE_THRESHOLD = 0.5        # final_score at or above this is a success
HALLUCINATION_PASS_RATE = 0.3        # post-write pass rate below this is flagged
LOOP_READ_THRESHOLD = 3              # reads of one path that count as a loop
CONTEXT_OVERFLOW_CHARS = 50_000      # total read chars considered excessive

FAILURE_REMEDIATION = {
    "WRONG_FILE_NAVIGATION": (
        "Read the failing test file first to understand the module under test, "
        "then navigate directly to the imported source files."
    ),
    "BLIND_WRITE": (
        "Always read the target file before writing. Use read_file → write_file → run_tests."
    ),
    "HALLUCINATED_CODE": (
        "Re-read the source file, understand the function signature, "
        "then write a minimal targeted fix. Run tests to verify."
    ),
    "NEVER_TESTED": (
        "Always call run_tests after writing a fix. "
        "Submit only when test pass rate has demonstrably improved."
    ),
    "LOOPING_BEHAVIOR": (
        "Stop repeating the same action. Use search_code to find the bug location, "
        "then navigate directly to it."
    ),
    "CONTEXT_OVERFLOW": (
        "Focus on files explicitly referenced in the failing test's imports. "
        "Avoid reading utility files unless the test error specifically mentions them."
    ),
    "SECURITY_VIOLATION": (
        "Do not use os.system, eval, exec, or subprocess in fixes. "
        "Write pure Python logic without shell calls."
    ),
    "CORRECT": "No remediation needed.",
}

# variant_meta keys whose file lists count as "relevant" for navigation checks.
_RELEVANT_FILE_KEYS = (
    "bug_files",
    "interface_files",
    "read_first_files",
    "files_to_implement",
)


def _check_navigation(
    variant_meta: Dict[str, Any], files_read: List[str]
) -> List[FailureInstance]:
    """Flag episodes that read 2+ irrelevant files and none of the key files."""
    relevant = set()
    for key in _RELEVANT_FILE_KEYS:
        # `or ()` tolerates keys that are present but None.
        relevant.update(variant_meta.get(key) or ())
    if not relevant or not files_read:
        return []

    irrelevant_reads = [
        f for f in files_read
        if f not in relevant and not f.startswith("tests/")
    ]
    read_any_relevant = any(f in relevant for f in files_read)
    if len(irrelevant_reads) <= 1 or read_any_relevant:
        return []

    return [FailureInstance(
        failure_type="WRONG_FILE_NAVIGATION",
        severity="critical",
        step_number=1,
        # sorted() keeps the evidence text stable across runs.
        evidence=f"Read {len(irrelevant_reads)} irrelevant files: {irrelevant_reads[:3]}. "
                 f"Never read key files: {sorted(relevant)[:3]}",
        root_cause="Agent navigated to wrong part of the codebase entirely.",
        remediation=FAILURE_REMEDIATION["WRONG_FILE_NAVIGATION"],
    )]


def _check_blind_writes(action_sequence: List[str]) -> List[FailureInstance]:
    """Flag every write_file that occurs before the first read_file."""
    found: List[FailureInstance] = []
    for idx, action in enumerate(action_sequence):
        if action == "read_file":
            # Every later write has at least one prior read.
            break
        if action == "write_file":
            found.append(FailureInstance(
                failure_type="BLIND_WRITE",
                severity="critical",
                step_number=idx + 1,
                evidence=f"write_file at step {idx+1} with zero prior read_file actions.",
                root_cause="Agent attempted to fix code without reading it first — likely hallucinating.",
                remediation=FAILURE_REMEDIATION["BLIND_WRITE"],
            ))
    return found


def _check_hallucinated_code(trajectory_steps: List[dict]) -> List[FailureInstance]:
    """Flag run_tests steps with a low pass rate that follow a write_file."""
    found: List[FailureInstance] = []
    prev_write: Optional[int] = None  # index of the most recent write_file
    for idx, step in enumerate(trajectory_steps):
        action = step.get("action_type")
        if action == "write_file":
            prev_write = idx
            continue
        if action != "run_tests" or prev_write is None:
            continue
        pass_rate = step.get("test_pass_rate")
        if pass_rate is not None and pass_rate < HALLUCINATION_PASS_RATE:
            found.append(FailureInstance(
                failure_type="HALLUCINATED_CODE",
                severity="major",
                step_number=idx + 1,
                evidence=f"Test pass rate {pass_rate:.2f} after write at step {prev_write+1}. "
                         f"Code change made things worse.",
                root_cause="Agent wrote syntactically correct but semantically wrong code.",
                remediation=FAILURE_REMEDIATION["HALLUCINATED_CODE"],
            ))
    return found


def _check_never_tested(action_sequence: List[str]) -> List[FailureInstance]:
    """Flag episodes that wrote and submitted without ever running tests."""
    wrote_and_submitted = (
        "submit" in action_sequence and "write_file" in action_sequence
    )
    if not wrote_and_submitted or "run_tests" in action_sequence:
        return []
    return [FailureInstance(
        failure_type="NEVER_TESTED",
        severity="major",
        step_number=len(action_sequence),
        evidence="Agent wrote code changes but submitted without running any tests.",
        root_cause="No feedback loop — agent cannot know if its fix worked.",
        remediation=FAILURE_REMEDIATION["NEVER_TESTED"],
    )]


def _check_looping(trajectory_steps: List[dict]) -> List[FailureInstance]:
    """Flag every path that was read LOOP_READ_THRESHOLD or more times."""
    reads_by_path: Dict[str, List[int]] = {}
    for idx, step in enumerate(trajectory_steps):
        path = step.get("action_path")
        if step.get("action_type") == "read_file" and path:
            reads_by_path.setdefault(path, []).append(idx)

    found: List[FailureInstance] = []
    for path, indices in reads_by_path.items():
        if len(indices) < LOOP_READ_THRESHOLD:
            continue
        found.append(FailureInstance(
            failure_type="LOOPING_BEHAVIOR",
            severity="major",
            # Attribute the failure to the read that crossed the threshold.
            step_number=indices[LOOP_READ_THRESHOLD - 1] + 1,
            evidence=f"Read '{path}' {len(indices)} times (steps {[i+1 for i in indices]}). "
                     f"Agent is stuck in a read loop.",
            root_cause="Agent cannot extract the needed information and keeps retrying.",
            remediation=FAILURE_REMEDIATION["LOOPING_BEHAVIOR"],
        ))
    return found


def _check_context_overflow(
    trajectory_steps: List[dict], final_score: float
) -> List[FailureInstance]:
    """Flag failed episodes whose read_file steps total too many characters."""
    total_content = sum(
        step.get("action_content_length") or 0
        for step in trajectory_steps
        if step.get("action_type") == "read_file"
    )
    if total_content <= CONTEXT_OVERFLOW_CHARS or final_score >= SUCCESS_SCORE_THRESHOLD:
        return []
    return [FailureInstance(
        failure_type="CONTEXT_OVERFLOW",
        severity="minor",
        step_number=len(trajectory_steps),
        evidence=f"Agent read {total_content:,} chars total. "
                 f"Most of this was likely irrelevant context.",
        root_cause="Agent wasted token budget reading unnecessary files.",
        remediation=FAILURE_REMEDIATION["CONTEXT_OVERFLOW"],
    )]


def _check_security(
    trajectory_steps: List[dict], security_violations: int
) -> List[FailureInstance]:
    """Report security violations, one per flagged step.

    If violations were counted but no step carries ``security_flags``, a
    single unattributed instance (``step_number=0``) is returned so the
    violation is never silently dropped from the report.
    """
    if security_violations <= 0:
        return []

    root_cause = "Agent wrote unsafe code patterns that would be dangerous in production."
    found = [
        FailureInstance(
            failure_type="SECURITY_VIOLATION",
            severity="critical",
            # Fall back to the 1-based position, like the other classifiers.
            step_number=step.get("step_number", idx + 1),
            evidence=f"Flags: {step.get('security_flags', [])}",
            root_cause=root_cause,
            remediation=FAILURE_REMEDIATION["SECURITY_VIOLATION"],
        )
        for idx, step in enumerate(trajectory_steps)
        if step.get("security_flags")
    ]
    if not found:
        found.append(FailureInstance(
            failure_type="SECURITY_VIOLATION",
            severity="critical",
            step_number=0,
            evidence=f"{security_violations} security violation(s) reported, "
                     f"but no trajectory step carries security_flags.",
            root_cause=root_cause,
            remediation=FAILURE_REMEDIATION["SECURITY_VIOLATION"],
        ))
    return found


class FailureClassifier:
    """
    Classifies agent failures from trajectory data.

    Usage:
        clf = FailureClassifier()
        report = clf.classify(
            episode_id="abc123",
            task="task1",
            trajectory_steps=[...],
            variant_meta={...},
            files_read=[...],
            files_written=[...],
            final_score=0.0,
        )
    """

    def classify(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        variant_meta: Dict[str, Any],
        files_read: List[str],
        files_written: List[str],
        final_score: float,
        security_violations: int = 0,
    ) -> FailureReport:
        """Run all classifiers and build a structured failure report.

        Args:
            episode_id: Identifier copied into the report.
            task: Task name copied into the report.
            trajectory_steps: Per-step dicts. Keys read here are
                ``action_type``, ``action_path``, ``action_content_length``,
                ``test_pass_rate``, ``security_flags`` and ``step_number``.
            variant_meta: Task metadata. The file lists under ``bug_files``,
                ``interface_files``, ``read_first_files`` and
                ``files_to_implement`` define the relevant files.
            files_read: Paths the agent read.
            files_written: Paths the agent wrote (currently unused).
            final_score: Episode score; ``>= 0.5`` counts as success.
            security_violations: Number of security violations reported.

        Returns:
            A ``FailureReport``. A successful episode with no security
            violations yields ``primary_failure="CORRECT"``. Otherwise the
            primary failure is the most severe one found, with ties going
            to the earliest classifier. A failed episode that triggers no
            classifier is reported as ``HALLUCINATED_CODE``.
        """
        success = final_score >= SUCCESS_SCORE_THRESHOLD

        if success and security_violations == 0:
            return FailureReport(
                episode_id=episode_id,
                task=task,
                primary_failure="CORRECT",
                failures=[],
                success=True,
                failure_summary="Agent succeeded without errors.",
                retry_hint="",
            )

        action_sequence = [s.get("action_type", "") for s in trajectory_steps]

        # Classifier order matters: the sort below is stable, so among equal
        # severities the earliest classifier supplies the primary failure.
        failures: List[FailureInstance] = []
        failures += _check_navigation(variant_meta, files_read)
        failures += _check_blind_writes(action_sequence)
        failures += _check_hallucinated_code(trajectory_steps)
        failures += _check_never_tested(action_sequence)
        failures += _check_looping(trajectory_steps)
        failures += _check_context_overflow(trajectory_steps, final_score)
        failures += _check_security(trajectory_steps, security_violations)

        # ── Build report ──────────────────────────────────────────────────────
        if not failures:
            # Failed but no specific classifier triggered — generic low score
            primary = "HALLUCINATED_CODE"
            summary = f"Score {final_score:.2f} — fix was written but insufficient. Re-read the source files more carefully."
            hint = "Read test file → read all src files → write targeted fix → run tests → submit."
        else:
            # Pick most severe failure as primary
            failures.sort(key=lambda f: SEVERITY_RANK.get(f.severity, 0), reverse=True)
            primary = failures[0].failure_type
            summary = "; ".join(f"{f.failure_type} (step {f.step_number})" for f in failures[:3])
            hint = failures[0].remediation

        return FailureReport(
            episode_id=episode_id,
            task=task,
            primary_failure=primary,
            failures=failures,
            success=success,
            failure_summary=summary,
            retry_hint=hint,
        )