Spaces:

Chirag0123
/

codebase-nav-env

Sleeping

App Files Files Community

codebase-nav-env / server /counterfactual_engine.py

Chirag0123

v4 Research Modules & Pre-submission tweaks

0b0338d 12 days ago

raw

history blame contribute delete

16.5 kB

	# server/counterfactual_engine.py
	"""
	Counterfactual Robustness Engine — v4.0

	The key scientific question: Is the agent's strategy robust, or is it brittle?

	We test this by:
	1. Running an episode → recording strategy
	2. Applying small, semantically-neutral mutations to the repo
	(rename variable, change a constant, add a dummy function)
	3. Measuring whether the agent's recorded strategy would fail on the mutated repo

	IMPORTANT: This does NOT re-run the agent. It analyzes whether the
	already-recorded navigation pattern was based on deep structure (robust)
	or surface signals like filenames/constants (brittle).

	This is completely novel — no benchmark or tool does this.
	"""
	from __future__ import annotations
	import random
	import hashlib
	from typing import List, Dict, Any, Tuple
	from dataclasses import dataclass, field
	from enum import Enum


	class BrittlenessLevel(str, Enum):
	    """Coarse robustness grade assigned to an agent's recorded strategy.

	    Members also subclass ``str``, so each member compares equal to its
	    plain string value. The score bands noted on each member are the
	    cut-offs applied by ``CounterfactualEngine.analyze`` to the fraction
	    of mutations the strategy is judged to survive.
	    """

	    # robustness_score >= 0.80
	    ROBUST = "ROBUST"
	    # 0.60 <= robustness_score < 0.80
	    MILDLY_BRITTLE = "MILDLY_BRITTLE"
	    # 0.30 <= robustness_score < 0.60
	    BRITTLE = "BRITTLE"
	    # robustness_score < 0.30
	    FRAGILE = "FRAGILE"


	@dataclass
	class Mutation:
	    """One counterfactual mutation and the verdict reached for it.

	    Attributes:
	        mutation_type: Identifier of the mutation template (e.g.
	            ``"FILENAME_RENAME"``).
	        target_file: Path of the file the mutation would be applied to.
	        description: Human-readable summary of the mutation.
	        would_break_agent: ``True`` when the agent's recorded strategy is
	            judged to fail under this mutation.
	        why: Explanation backing ``would_break_agent``.
	    """

	    mutation_type: str
	    target_file: str
	    description: str
	    would_break_agent: bool
	    why: str


	@dataclass
	class CounterfactualReport:
	    """Outcome of one counterfactual robustness analysis.

	    Attributes:
	        episode_id: Identifier of the analysed episode.
	        task: Name of the task the episode belongs to.
	        brittleness_level: Categorical grade derived from the score.
	        robustness_score: Fraction of mutations survived, 0.0 – 1.0.
	        mutations_tested: Every mutation that was evaluated.
	        mutations_survived: Count of mutations judged harmless.
	        mutations_failed: Count of mutations judged to break the agent.
	        surface_dependencies: Surface signals the agent relied on.
	        deep_dependencies: Structural signals the agent used correctly.
	        explanation: Narrative text for the brittleness level.
	        recommendations: Suggested improvements for the agent.
	    """

	    episode_id: str
	    task: str
	    brittleness_level: BrittlenessLevel
	    robustness_score: float

	    mutations_tested: List[Mutation]
	    mutations_survived: int
	    mutations_failed: int

	    surface_dependencies: List[str]
	    deep_dependencies: List[str]

	    explanation: str
	    recommendations: List[str]

	    def to_dict(self) -> dict:
	        """Return the report as a dict of plain values.

	        ``mutations_tested`` is exported as a count; the individual
	        mutations are listed under ``mutations``. The score is rounded
	        to three decimals and the level is exported as its ``.value``.
	        """
	        mutation_rows = []
	        for mutation in self.mutations_tested:
	            mutation_rows.append({
	                "type": mutation.mutation_type,
	                "file": mutation.target_file,
	                "description": mutation.description,
	                "would_break_agent": mutation.would_break_agent,
	                "why": mutation.why,
	            })

	        payload = {
	            "episode_id": self.episode_id,
	            "task": self.task,
	            "brittleness_level": self.brittleness_level.value,
	            "robustness_score": round(self.robustness_score, 3),
	            "mutations_tested": len(mutation_rows),
	            "mutations_survived": self.mutations_survived,
	            "mutations_failed": self.mutations_failed,
	            "mutations": mutation_rows,
	            "surface_dependencies": self.surface_dependencies,
	            "deep_dependencies": self.deep_dependencies,
	            "explanation": self.explanation,
	            "recommendations": self.recommendations,
	        }
	        return payload


	class CounterfactualEngine:
	    """
	    Analyzes brittleness by reasoning about what mutations would break the agent.

	    The agent is never re-run. The recorded trajectory is inspected and,
	    for each entry in ``MUTATION_TEMPLATES``, a rule decides whether the
	    observed navigation pattern would still work on the mutated repo.

	    Brittle signals:
	    - Agent found bug file by pattern-matching on filename (not content search)
	    - Agent ignored test content and relied on positional heuristics
	    - Agent submitted without running tests

	    Robust signals:
	    - Agent used search_code to find function by name
	    - Agent read test → traced import → found source
	    - Agent ran tests and verified result before submitting
	    """

	    # Catalogue of mutations evaluated by ``analyze``. Only "type" and
	    # "description" are read by the code in this class; the remaining keys
	    # are descriptive metadata.
	    MUTATION_TEMPLATES = [
	        {
	            "type": "FILENAME_RENAME",
	            "description": "Rename src/X.py to src/X_v2.py (same content)",
	            "breaks_if": "agent found file by name pattern, not by search or import tracing",
	            "surface_signal": "filename",
	            "robust_signal": "import tracing or search_code",
	        },
	        {
	            "type": "CONSTANT_CHANGE",
	            "description": "Change a numeric constant by ±1 (semantically neutral for navigation)",
	            "breaks_if": "agent hardcoded expected value rather than reading actual code",
	            "surface_signal": "constant value pattern matching",
	            "robust_signal": "dynamic code reading",
	        },
	        {
	            "type": "DUMMY_FUNCTION",
	            "description": "Add a dummy function with a similar name near the bug",
	            "breaks_if": "agent used first-match navigation without reading full context",
	            "surface_signal": "first result of search or first match in file",
	            "robust_signal": "reading complete function signatures before deciding",
	        },
	        {
	            "type": "DIRECTORY_SHUFFLE",
	            "description": "Move test file from tests/ to test/ (same content)",
	            "breaks_if": "agent hardcoded path prefix tests/ instead of searching",
	            "surface_signal": "hardcoded directory prefix",
	            "robust_signal": "search or dynamic discovery",
	        },
	        {
	            "type": "DOCSTRING_NOISE",
	            "description": "Add misleading docstring claiming a different function causes the bug",
	            "breaks_if": "agent read docs instead of tests to understand expected behavior",
	            "surface_signal": "docstring content",
	            "robust_signal": "test assertions as ground truth",
	        },
	        {
	            "type": "IMPORT_REORDER",
	            "description": "Reorder imports in the source file",
	            "breaks_if": "agent relied on line numbers instead of function names",
	            "surface_signal": "absolute line numbers",
	            "robust_signal": "function name search",
	        },
	    ]

	    def analyze(
	        self,
	        episode_id: str,
	        task: str,
	        trajectory_steps: List[dict],
	        variant_meta: dict,
	        files_read: List[str],
	        files_written: List[str],
	        final_score: float,
	    ) -> CounterfactualReport:
	        """
	        Analyze robustness by simulating mutations and reasoning about
	        whether the agent's recorded pattern would survive them.

	        Args:
	            episode_id: Identifier copied into the report.
	            task: Task name copied into the report.
	            trajectory_steps: Recorded steps; each is a dict read via the
	                keys ``action_type``, ``action_path`` and ``step_number``.
	                ``None`` is treated as an empty trajectory.
	            variant_meta: Metadata read via the keys ``bug_files``,
	                ``files_to_implement`` and ``test_files``. ``None`` is
	                treated as empty metadata.
	            files_read: Files the agent read, used to choose a target file
	                for each mutation.
	            files_written: Accepted for interface compatibility; not used.
	            final_score: Accepted for interface compatibility; not used.

	        Returns:
	            A ``CounterfactualReport`` with one ``Mutation`` per template.
	        """
	        trajectory_steps = trajectory_steps or []
	        variant_meta = variant_meta or {}

	        action_types = [s.get("action_type", "") for s in trajectory_steps]

	        # "files_to_implement" is the fallback when "bug_files" is empty/missing.
	        bug_files = set(variant_meta.get("bug_files", []) or
	                        variant_meta.get("files_to_implement", []) or [])
	        test_files_meta = set(variant_meta.get("test_files", []) or [])

	        # Infer what signals agent used
	        used_search = "search_code" in action_types
	        used_tests_first = self._tests_read_before_src(
	            trajectory_steps, test_files_meta, bug_files
	        )
	        blind_navigation = not used_search and not used_tests_first
	        read_count = action_types.count("read_file")
	        # A write among the first three actions counts as "immediate".
	        # index() is only reached when a write exists (short-circuit).
	        immediate_write = (
	            "write_file" in action_types
	            and action_types.index("write_file") <= 2
	        )
	        verified_before_submit = self._verified_before_submit(trajectory_steps)

	        # ── Evaluate each mutation ────────────────────────────────────────────
	        mutations: List[Mutation] = []

	        for tmpl in self.MUTATION_TEMPLATES:
	            target_file = self._pick_target_file(tmpl["type"], files_read, bug_files)
	            would_break, why = self._would_break_agent(
	                mutation_type=tmpl["type"],
	                used_search=used_search,
	                used_tests_first=used_tests_first,
	                verified_before_submit=verified_before_submit,
	                blind_navigation=blind_navigation,
	                immediate_write=immediate_write,
	                read_count=read_count,
	                tmpl=tmpl,
	            )
	            mutations.append(Mutation(
	                mutation_type=tmpl["type"],
	                target_file=target_file or "unknown",
	                description=tmpl["description"],
	                would_break_agent=would_break,
	                why=why,
	            ))

	        survived = sum(1 for m in mutations if not m.would_break_agent)
	        failed = len(mutations) - survived

	        robustness_score = survived / len(mutations) if mutations else 0.0

	        # ── Surface vs deep dependency analysis ──────────────────────────────
	        surface_deps = []
	        deep_deps = []

	        if not used_search:
	            surface_deps.append("Filename-based navigation (no search_code used)")
	        if not used_tests_first:
	            surface_deps.append("Skipped test-informed navigation")
	        if immediate_write:
	            surface_deps.append("Immediate write after minimal reading (blind fix)")
	        if not verified_before_submit:
	            surface_deps.append("Submitted without running tests (no verification)")

	        if used_search:
	            deep_deps.append("Used search_code to find functions by name (content-based)")
	        if used_tests_first:
	            deep_deps.append("Read tests first — used expected behavior as compass")
	        if read_count >= 3:
	            deep_deps.append(f"Read {read_count} files — explored structure before committing")
	        if verified_before_submit:
	            deep_deps.append("Verified fix with run_tests before submitting")

	        # ── Brittleness classification ────────────────────────────────────────
	        if robustness_score >= 0.80:
	            level = BrittlenessLevel.ROBUST
	        elif robustness_score >= 0.60:
	            level = BrittlenessLevel.MILDLY_BRITTLE
	        elif robustness_score >= 0.30:
	            level = BrittlenessLevel.BRITTLE
	        else:
	            level = BrittlenessLevel.FRAGILE

	        explanations = {
	            BrittlenessLevel.ROBUST: (
	                "Agent strategy is robust. It relies on deep structural signals (function names, "
	                "test assertions, causal chain traversal) rather than surface patterns. "
	                "Minor repo mutations would not break its navigation."
	            ),
	            BrittlenessLevel.MILDLY_BRITTLE: (
	                "Agent strategy is mildly brittle. Some mutations would break its navigation, "
	                "particularly those that change surface signals it relied on. "
	                "Using search_code and test-first navigation consistently would improve robustness."
	            ),
	            BrittlenessLevel.BRITTLE: (
	                "Agent strategy is brittle. Most mutations would break its navigation. "
	                "The agent appears to rely on stable surface patterns (filenames, positions) "
	                "rather than understanding the semantic structure of the codebase."
	            ),
	            BrittlenessLevel.FRAGILE: (
	                "Agent strategy is fragile. Almost any perturbation to the repo structure "
	                "would cause this agent to fail. This indicates pure pattern-matching on "
	                "the specific repo layout rather than generalizable code understanding."
	            ),
	        }

	        recs = []
	        if not used_search:
	            recs.append("Use search_code to find functions by name — survives filename renames.")
	        if not used_tests_first:
	            recs.append("Read tests first to anchor your navigation in expected behavior, not filenames.")
	        if immediate_write:
	            recs.append("Read source files before writing to them — avoid blind writes.")
	        if not verified_before_submit:
	            recs.append("Run tests after writing — verify your fix holds on the actual behavior.")

	        return CounterfactualReport(
	            episode_id=episode_id,
	            task=task,
	            brittleness_level=level,
	            robustness_score=robustness_score,
	            mutations_tested=mutations,
	            mutations_survived=survived,
	            mutations_failed=failed,
	            surface_dependencies=surface_deps,
	            deep_dependencies=deep_deps,
	            explanation=explanations[level],
	            recommendations=recs,
	        )

	    # ── Helpers ───────────────────────────────────────────────────────────────

	    def _tests_read_before_src(
	        self, steps: List[dict], test_files: set, bug_files: set
	    ) -> bool:
	        """Return True if a test file was read before any bug/source file.

	        A ``read_file`` step matches a file when that file's name occurs as
	        a substring of the step's ``action_path``. Steps without a
	        ``step_number`` are treated as step 99. Returns False when either
	        kind of file was never read.
	        """
	        test_steps = [
	            s.get("step_number", 99) for s in steps
	            if s.get("action_type") == "read_file"
	            and any(tf in (s.get("action_path") or "") for tf in test_files)
	        ]
	        src_steps = [
	            s.get("step_number", 99) for s in steps
	            if s.get("action_type") == "read_file"
	            and any(bf in (s.get("action_path") or "") for bf in bug_files)
	        ]
	        if test_steps and src_steps:
	            return min(test_steps) < min(src_steps)
	        return False

	    def _verified_before_submit(self, steps: List[dict]) -> bool:
	        """Return True if a ``run_tests`` step precedes the first ``submit``.

	        Returns False when the trajectory contains no ``submit`` step.
	        A submit without ``step_number`` is treated as step 9999 and a
	        ``run_tests`` without one as step 0.
	        """
	        submit_step = next(
	            (s.get("step_number", 9999) for s in steps if s.get("action_type") == "submit"),
	            None,
	        )
	        if submit_step is None:
	            return False
	        return any(
	            s.get("action_type") == "run_tests"
	            and s.get("step_number", 0) < submit_step
	            for s in steps
	        )

	    def _pick_target_file(
	        self, mutation_type: str, files_read: List[str], bug_files: set
	    ) -> str:
	        """Choose the file a given mutation type would be applied to.

	        Source-level mutations target a bug file when one is known, else
	        the first file read, else ``"src/main.py"``. ``DIRECTORY_SHUFFLE``
	        targets the first read file whose path contains "test". Everything
	        else falls back to the first file read, or ``"unknown"``.
	        """
	        if mutation_type in ("FILENAME_RENAME", "DUMMY_FUNCTION", "IMPORT_REORDER"):
	            if bug_files:
	                # Set iteration order for strings varies between interpreter
	                # runs (hash randomization); take the smallest path so the
	                # report is reproducible.
	                return min(bug_files)
	            return files_read[0] if files_read else "src/main.py"
	        if mutation_type == "DIRECTORY_SHUFFLE":
	            for f in files_read or []:
	                if "test" in f.lower():
	                    return f
	        return files_read[0] if files_read else "unknown"

	    def _would_break_agent(
	        self,
	        mutation_type: str,
	        used_search: bool,
	        used_tests_first: bool,
	        verified_before_submit: bool,
	        blind_navigation: bool,
	        immediate_write: bool,
	        read_count: int,
	        tmpl: dict,
	    ) -> Tuple[bool, str]:
	        """
	        Return (would_break, explanation) by reasoning about the agent's signals.

	        ``verified_before_submit``, ``immediate_write`` and ``tmpl`` are
	        accepted but not consulted by any rule below. Unrecognised mutation
	        types are reported as harmless.
	        """
	        if mutation_type == "FILENAME_RENAME":
	            if used_search:
	                return False, "Agent used search_code — finds function by name, not filename"
	            if blind_navigation:
	                return True, "Agent navigated by filename without search — rename breaks it"
	            return True, "Agent likely relied on filename pattern without search fallback"

	        if mutation_type == "CONSTANT_CHANGE":
	            # Almost never breaks well-behaved agents
	            if read_count >= 2:
	                return False, "Agent read files dynamically — adapts to any constant value"
	            return True, "Agent may have hardcoded expected value in navigation heuristic"

	        if mutation_type == "DUMMY_FUNCTION":
	            if used_search and read_count >= 3:
	                return False, "Agent searched and read thoroughly — would disambiguate"
	            return True, "Agent took first match without thorough reading"

	        if mutation_type == "DIRECTORY_SHUFFLE":
	            if used_search:
	                return False, "search_code finds tests regardless of directory"
	            return True, "Agent used hardcoded path prefix — directory change breaks it"

	        if mutation_type == "DOCSTRING_NOISE":
	            if used_tests_first:
	                return False, "Agent used test assertions as ground truth, not docstrings"
	            return True, "Agent may have read misleading docstring instead of test"

	        if mutation_type == "IMPORT_REORDER":
	            # Only brittle if agent relied on line numbers
	            if read_count <= 1:
	                return True, "Agent skimmed — likely used line numbers for navigation"
	            return False, "Agent read full files — import reorder doesn't change function content"

	        return False, "Neutral mutation"