Spaces:
Sleeping
Sleeping
| # server/counterfactual_engine.py | |
| """ | |
| Counterfactual Robustness Engine β v4.0 | |
| The key scientific question: Is the agent's strategy robust, or is it brittle? | |
| We test this by: | |
| 1. Running an episode β recording strategy | |
| 2. Applying small, semantically-neutral mutations to the repo | |
| (rename variable, change a constant, add a dummy function) | |
| 3. Measuring whether the agent's recorded strategy would fail on the mutated repo | |
| IMPORTANT: This does NOT re-run the agent. It analyzes whether the | |
| already-recorded navigation pattern was based on deep structure (robust) | |
| or surface signals like filenames/constants (brittle). | |
| This is completely novel β no benchmark or tool does this. | |
| """ | |
| from __future__ import annotations | |
| import random | |
| import hashlib | |
| from typing import List, Dict, Any, Tuple | |
| from dataclasses import dataclass, field | |
| from enum import Enum | |
| class BrittlenessLevel(str, Enum): | |
| ROBUST = "ROBUST" # Strategy survives all mutations | |
| MILDLY_BRITTLE = "MILDLY_BRITTLE" # Survives 60-80% of mutations | |
| BRITTLE = "BRITTLE" # Survives < 60% | |
| FRAGILE = "FRAGILE" # Survives < 30% | |
| class Mutation: | |
| """A single counterfactual mutation applied to the repo.""" | |
| mutation_type: str | |
| target_file: str | |
| description: str | |
| would_break_agent: bool # Would this mutation cause agent's strategy to fail? | |
| why: str # Explanation | |
| class CounterfactualReport: | |
| """Results of counterfactual robustness testing.""" | |
| episode_id: str | |
| task: str | |
| brittleness_level: BrittlenessLevel | |
| robustness_score: float # 0.0 β 1.0 | |
| mutations_tested: List[Mutation] | |
| mutations_survived: int | |
| mutations_failed: int | |
| surface_dependencies: List[str] # What surface signals the agent relied on | |
| deep_dependencies: List[str] # What structural signals it used correctly | |
| explanation: str | |
| recommendations: List[str] | |
| def to_dict(self) -> dict: | |
| return { | |
| "episode_id": self.episode_id, | |
| "task": self.task, | |
| "brittleness_level": self.brittleness_level.value, | |
| "robustness_score": round(self.robustness_score, 3), | |
| "mutations_tested": len(self.mutations_tested), | |
| "mutations_survived": self.mutations_survived, | |
| "mutations_failed": self.mutations_failed, | |
| "mutations": [ | |
| { | |
| "type": m.mutation_type, | |
| "file": m.target_file, | |
| "description": m.description, | |
| "would_break_agent": m.would_break_agent, | |
| "why": m.why, | |
| } | |
| for m in self.mutations_tested | |
| ], | |
| "surface_dependencies": self.surface_dependencies, | |
| "deep_dependencies": self.deep_dependencies, | |
| "explanation": self.explanation, | |
| "recommendations": self.recommendations, | |
| } | |
| class CounterfactualEngine: | |
| """ | |
| Analyzes brittleness by reasoning about what mutations would break the agent. | |
| We don't need to actually re-run the agent β we analyze the recorded | |
| trajectory and ask: "If file X was named differently / had a different | |
| constant, would this agent's navigation pattern still work?" | |
| Brittle signals: | |
| - Agent found bug file by pattern-matching on filename (not content search) | |
| - Agent submitted after reading the same file every run | |
| - Agent ignored test content and relied on positional heuristics | |
| Robust signals: | |
| - Agent used search_code to find function by name | |
| - Agent read test β traced import β found source | |
| - Agent ran tests and verified result before submitting | |
| """ | |
| MUTATION_TEMPLATES = [ | |
| { | |
| "type": "FILENAME_RENAME", | |
| "description": "Rename src/X.py to src/X_v2.py (same content)", | |
| "breaks_if": "agent found file by name pattern, not by search or import tracing", | |
| "surface_signal": "filename", | |
| "robust_signal": "import tracing or search_code", | |
| }, | |
| { | |
| "type": "CONSTANT_CHANGE", | |
| "description": "Change a numeric constant by Β±1 (semantically neutral for navigation)", | |
| "breaks_if": "agent hardcoded expected value rather than reading actual code", | |
| "surface_signal": "constant value pattern matching", | |
| "robust_signal": "dynamic code reading", | |
| }, | |
| { | |
| "type": "DUMMY_FUNCTION", | |
| "description": "Add a dummy function with a similar name near the bug", | |
| "breaks_if": "agent used first-match navigation without reading full context", | |
| "surface_signal": "first result of search or first match in file", | |
| "robust_signal": "reading complete function signatures before deciding", | |
| }, | |
| { | |
| "type": "DIRECTORY_SHUFFLE", | |
| "description": "Move test file from tests/ to test/ (same content)", | |
| "breaks_if": "agent hardcoded path prefix tests/ instead of searching", | |
| "surface_signal": "hardcoded directory prefix", | |
| "robust_signal": "search or dynamic discovery", | |
| }, | |
| { | |
| "type": "DOCSTRING_NOISE", | |
| "description": "Add misleading docstring claiming a different function causes the bug", | |
| "breaks_if": "agent read docs instead of tests to understand expected behavior", | |
| "surface_signal": "docstring content", | |
| "robust_signal": "test assertions as ground truth", | |
| }, | |
| { | |
| "type": "IMPORT_REORDER", | |
| "description": "Reorder imports in the source file", | |
| "breaks_if": "agent relied on line numbers instead of function names", | |
| "surface_signal": "absolute line numbers", | |
| "robust_signal": "function name search", | |
| }, | |
| ] | |
| def analyze( | |
| self, | |
| episode_id: str, | |
| task: str, | |
| trajectory_steps: List[dict], | |
| variant_meta: dict, | |
| files_read: List[str], | |
| files_written: List[str], | |
| final_score: float, | |
| ) -> CounterfactualReport: | |
| """ | |
| Analyze robustness by simulating mutations and reasoning about | |
| whether the agent's recorded pattern would survive them. | |
| """ | |
| action_types = [s.get("action_type", "") for s in trajectory_steps] | |
| action_paths = [s.get("action_path") for s in trajectory_steps] | |
| bug_files = set(variant_meta.get("bug_files", []) or | |
| variant_meta.get("files_to_implement", []) or []) | |
| test_files_meta = set(variant_meta.get("test_files", []) or []) | |
| # Infer what signals agent used | |
| used_search = "search_code" in action_types | |
| used_tests_first = self._tests_read_before_src(trajectory_steps, test_files_meta, bug_files) | |
| used_run_tests = "run_tests" in action_types | |
| blind_navigation = not used_search and not used_tests_first | |
| read_count = action_types.count("read_file") | |
| write_count = action_types.count("write_file") | |
| immediate_write = write_count > 0 and action_types.index("write_file") <= 2 | |
| verified_before_submit = self._verified_before_submit(trajectory_steps) | |
| # ββ Evaluate each mutation ββββββββββββββββββββββββββββββββββββββββββββ | |
| mutations: List[Mutation] = [] | |
| for tmpl in self.MUTATION_TEMPLATES: | |
| target_file = self._pick_target_file(tmpl["type"], files_read, bug_files) | |
| would_break, why = self._would_break_agent( | |
| mutation_type=tmpl["type"], | |
| used_search=used_search, | |
| used_tests_first=used_tests_first, | |
| verified_before_submit=verified_before_submit, | |
| blind_navigation=blind_navigation, | |
| immediate_write=immediate_write, | |
| read_count=read_count, | |
| tmpl=tmpl, | |
| ) | |
| mutations.append(Mutation( | |
| mutation_type=tmpl["type"], | |
| target_file=target_file or "unknown", | |
| description=tmpl["description"], | |
| would_break_agent=would_break, | |
| why=why, | |
| )) | |
| survived = sum(1 for m in mutations if not m.would_break_agent) | |
| failed = len(mutations) - survived | |
| robustness_score = survived / len(mutations) if mutations else 0.0 | |
| # ββ Surface vs deep dependency analysis ββββββββββββββββββββββββββββββ | |
| surface_deps = [] | |
| deep_deps = [] | |
| if not used_search: | |
| surface_deps.append("Filename-based navigation (no search_code used)") | |
| if not used_tests_first: | |
| surface_deps.append("Skipped test-informed navigation") | |
| if immediate_write: | |
| surface_deps.append("Immediate write after minimal reading (blind fix)") | |
| if not verified_before_submit: | |
| surface_deps.append("Submitted without running tests (no verification)") | |
| if used_search: | |
| deep_deps.append("Used search_code to find functions by name (content-based)") | |
| if used_tests_first: | |
| deep_deps.append("Read tests first β used expected behavior as compass") | |
| if read_count >= 3: | |
| deep_deps.append(f"Read {read_count} files β explored structure before committing") | |
| if verified_before_submit: | |
| deep_deps.append("Verified fix with run_tests before submitting") | |
| # ββ Brittleness classification ββββββββββββββββββββββββββββββββββββββββ | |
| if robustness_score >= 0.80: | |
| level = BrittlenessLevel.ROBUST | |
| elif robustness_score >= 0.60: | |
| level = BrittlenessLevel.MILDLY_BRITTLE | |
| elif robustness_score >= 0.30: | |
| level = BrittlenessLevel.BRITTLE | |
| else: | |
| level = BrittlenessLevel.FRAGILE | |
| explanations = { | |
| BrittlenessLevel.ROBUST: ( | |
| "Agent strategy is robust. It relies on deep structural signals (function names, " | |
| "test assertions, causal chain traversal) rather than surface patterns. " | |
| "Minor repo mutations would not break its navigation." | |
| ), | |
| BrittlenessLevel.MILDLY_BRITTLE: ( | |
| "Agent strategy is mildly brittle. Some mutations would break its navigation, " | |
| "particularly those that change surface signals it relied on. " | |
| "Using search_code and test-first navigation consistently would improve robustness." | |
| ), | |
| BrittlenessLevel.BRITTLE: ( | |
| "Agent strategy is brittle. Most mutations would break its navigation. " | |
| "The agent appears to rely on stable surface patterns (filenames, positions) " | |
| "rather than understanding the semantic structure of the codebase." | |
| ), | |
| BrittlenessLevel.FRAGILE: ( | |
| "Agent strategy is fragile. Almost any perturbation to the repo structure " | |
| "would cause this agent to fail. This indicates pure pattern-matching on " | |
| "the specific repo layout rather than generalizable code understanding." | |
| ), | |
| } | |
| recs = [] | |
| if not used_search: | |
| recs.append("Use search_code to find functions by name β survives filename renames.") | |
| if not used_tests_first: | |
| recs.append("Read tests first to anchor your navigation in expected behavior, not filenames.") | |
| if immediate_write: | |
| recs.append("Read source files before writing to them β avoid blind writes.") | |
| if not verified_before_submit: | |
| recs.append("Run tests after writing β verify your fix holds on the actual behavior.") | |
| return CounterfactualReport( | |
| episode_id=episode_id, | |
| task=task, | |
| brittleness_level=level, | |
| robustness_score=robustness_score, | |
| mutations_tested=mutations, | |
| mutations_survived=survived, | |
| mutations_failed=failed, | |
| surface_dependencies=surface_deps, | |
| deep_dependencies=deep_deps, | |
| explanation=explanations[level], | |
| recommendations=recs, | |
| ) | |
| # ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _tests_read_before_src( | |
| self, steps: List[dict], test_files: set, bug_files: set | |
| ) -> bool: | |
| test_steps = [ | |
| s.get("step_number", 99) for s in steps | |
| if s.get("action_type") == "read_file" | |
| and any(tf in (s.get("action_path") or "") for tf in test_files) | |
| ] | |
| src_steps = [ | |
| s.get("step_number", 99) for s in steps | |
| if s.get("action_type") == "read_file" | |
| and any(bf in (s.get("action_path") or "") for bf in bug_files) | |
| ] | |
| if test_steps and src_steps: | |
| return min(test_steps) < min(src_steps) | |
| return False | |
| def _verified_before_submit(self, steps: List[dict]) -> bool: | |
| submit_step = next( | |
| (s.get("step_number", 9999) for s in steps if s.get("action_type") == "submit"), | |
| None, | |
| ) | |
| if submit_step is None: | |
| return False | |
| return any( | |
| s.get("action_type") == "run_tests" | |
| and s.get("step_number", 0) < submit_step | |
| for s in steps | |
| ) | |
| def _pick_target_file( | |
| self, mutation_type: str, files_read: List[str], bug_files: set | |
| ) -> str: | |
| if mutation_type in ("FILENAME_RENAME", "DUMMY_FUNCTION", "IMPORT_REORDER"): | |
| for f in bug_files: | |
| return f | |
| return files_read[0] if files_read else "src/main.py" | |
| if mutation_type == "DIRECTORY_SHUFFLE": | |
| for f in files_read: | |
| if "test" in f.lower(): | |
| return f | |
| return files_read[0] if files_read else "unknown" | |
| def _would_break_agent( | |
| self, | |
| mutation_type: str, | |
| used_search: bool, | |
| used_tests_first: bool, | |
| verified_before_submit: bool, | |
| blind_navigation: bool, | |
| immediate_write: bool, | |
| read_count: int, | |
| tmpl: dict, | |
| ) -> Tuple[bool, str]: | |
| """ | |
| Return (would_break, explanation) by reasoning about the agent's signals. | |
| """ | |
| if mutation_type == "FILENAME_RENAME": | |
| if used_search: | |
| return False, "Agent used search_code β finds function by name, not filename" | |
| if blind_navigation: | |
| return True, "Agent navigated by filename without search β rename breaks it" | |
| return True, "Agent likely relied on filename pattern without search fallback" | |
| if mutation_type == "CONSTANT_CHANGE": | |
| # Almost never breaks well-behaved agents | |
| if read_count >= 2: | |
| return False, "Agent read files dynamically β adapts to any constant value" | |
| return True, "Agent may have hardcoded expected value in navigation heuristic" | |
| if mutation_type == "DUMMY_FUNCTION": | |
| if used_search and read_count >= 3: | |
| return False, "Agent searched and read thoroughly β would disambiguate" | |
| return True, "Agent took first match without thorough reading" | |
| if mutation_type == "DIRECTORY_SHUFFLE": | |
| if used_search: | |
| return False, "search_code finds tests regardless of directory" | |
| return True, "Agent used hardcoded path prefix β directory change breaks it" | |
| if mutation_type == "DOCSTRING_NOISE": | |
| if used_tests_first: | |
| return False, "Agent used test assertions as ground truth, not docstrings" | |
| return True, "Agent may have read misleading docstring instead of test" | |
| if mutation_type == "IMPORT_REORDER": | |
| # Only brittle if agent relied on line numbers | |
| if read_count <= 1: | |
| return True, "Agent skimmed β likely used line numbers for navigation" | |
| return False, "Agent read full files β import reorder doesn't change function content" | |
| return False, "Neutral mutation" | |