Spaces:

SouravNath
/

repomind-api

Running

App Files Files Community

repomind-api / agent /failure_categoriser.py

SouravNath

Initial commit

dc71cad 4 days ago

raw

history blame contribute delete

5.44 kB

	"""
	agent/failure_categoriser.py
	──────────────────────────────
	Rule-based + regex failure categoriser.

	After each failed attempt, the agent parses pytest output and classifies
	the failure into one of these categories:

	syntax_error — the patch introduced a SyntaxError
	hallucinated_api — agent called a function/attribute that doesn't exist
	wrong_file_edit — agent edited the wrong file (tests in different module fail)
	incomplete_patch — partial fix: some tests pass but not all FAIL_TO_PASS
	flaky_test — test is non-deterministic (passes on retry)
	import_error — missing import or circular import introduced
	type_error — wrong argument type passed
	assertion_error — logic bug remains, assertion fails with unexpected value
	unknown — can't categorise

	The category is logged to MLflow and stored in trajectory JSONL.
	This taxonomy directly drives which trajectories we select for fine-tuning
	(Phase 7 filters on known-category failures).
	"""
	from __future__ import annotations

	import re
	from typing import Literal

	FailureCategory = Literal[
	"syntax_error",
	"hallucinated_api",
	"wrong_file_edit",
	"incomplete_patch",
	"flaky_test",
	"import_error",
	"type_error",
	"assertion_error",
	"success",
	"unknown",
	]

	# ── Regex patterns ────────────────────────────────────────────────────────────

	_PATTERNS: list[tuple[FailureCategory, re.Pattern]] = [
	("syntax_error", re.compile(r"SyntaxError\|IndentationError\|TabError", re.I)),
	("import_error", re.compile(r"ImportError\|ModuleNotFoundError\|cannot import name", re.I)),
	("hallucinated_api", re.compile(
	r"AttributeError: .+ object has no attribute\|"
	r"TypeError: .+ takes \d+ positional argument\|"
	r"NameError: name .+ is not defined",
	re.I
	)),
	("type_error", re.compile(r"TypeError:", re.I)),
	("assertion_error", re.compile(r"AssertionError", re.I)),
	]

	_FLAKY_PATTERNS = re.compile(
	r"ResourceWarning\|"
	r"random\|"
	r"race condition\|"
	r"flaky\|"
	r"connection refused\|"
	r"socket\.timeout",
	re.I
	)


	def categorise_failure(
	    test_stdout: str,
	    patch_apply_success: bool,
	    fail_to_pass_results: dict[str, bool],
	    pass_to_pass_results: dict[str, bool],
	    attempt_num: int = 1,
	    previous_categories: list[FailureCategory] | None = None,
	) -> FailureCategory:
	    """
	    Classify an attempt into a FailureCategory.

	    Decision flow (first rule that applies wins):
	    1. Patch didn't apply → syntax_error
	    2. Every FAIL_TO_PASS test passed (at least one exists) and no
	       PASS_TO_PASS test regressed → success
	    3. First entry of _PATTERNS whose regex matches the test output
	    4. Earlier attempts produced more than one distinct category AND the
	       output matches _FLAKY_PATTERNS → flaky_test
	    5. Some, but not all, FAIL_TO_PASS tests passed → incomplete_patch
	    6. A PASS_TO_PASS test regressed while no FAIL_TO_PASS test failed
	       (vacuously true when there are none) → wrong_file_edit
	    7. Fallback: unknown

	    Args:
	        test_stdout: raw pytest output (None is treated as empty)
	        patch_apply_success: whether `git apply` succeeded
	        fail_to_pass_results: {test_id: passed} for FAIL_TO_PASS tests
	        pass_to_pass_results: {test_id: still_passing} for PASS_TO_PASS tests
	        attempt_num: current attempt number (1-indexed); accepted for
	            interface compatibility, not consulted by the rules below
	        previous_categories: categories from earlier attempts (flaky detection)

	    Returns:
	        FailureCategory string
	    """
	    # 1. Patch apply failed → likely syntax_error in diff
	    if not patch_apply_success:
	        return "syntax_error"

	    # Normalise missing inputs so the later steps cannot crash on None.
	    test_stdout = test_stdout or ""
	    fail_to_pass_results = fail_to_pass_results or {}
	    pass_to_pass_results = pass_to_pass_results or {}

	    # 2. All tests pass → success. An empty FAIL_TO_PASS set never counts as
	    #    success; an empty PASS_TO_PASS set counts as "nothing regressed".
	    ftp_ok = bool(fail_to_pass_results) and all(fail_to_pass_results.values())
	    ptp_ok = all(pass_to_pass_results.values())
	    if ftp_ok and ptp_ok:
	        return "success"

	    # 3. Scan pytest output for error patterns (ordered, first match wins)
	    for category, pattern in _PATTERNS:
	        if pattern.search(test_stdout):
	            return category

	    # 4. Flaky test detection: only when earlier attempts disagreed with each other
	    if previous_categories and len(set(previous_categories)) > 1:
	        if _FLAKY_PATTERNS.search(test_stdout):
	            return "flaky_test"

	    # 5. Partial success — some FTP tests pass but not all
	    ftp_passed = sum(1 for v in fail_to_pass_results.values() if v)
	    ftp_total = len(fail_to_pass_results)
	    if 0 < ftp_passed < ftp_total:
	        return "incomplete_patch"

	    # 6. PASS_TO_PASS regression only (our patch broke existing tests)
	    ptp_failed = sum(1 for v in pass_to_pass_results.values() if not v)
	    if ptp_failed > 0 and ftp_passed == ftp_total:
	        return "wrong_file_edit"

	    return "unknown"


	def extract_first_error_context(test_stdout: str, max_lines: int = 20) -> str:
	    """
	    Extract the most relevant error lines from pytest output.

	    Used to build the reflection prompt — give the LLM targeted failure info.

	    The first line containing "FAILED", "ERROR", or (case-insensitively)
	    "assert" anchors the excerpt: up to 2 lines before it plus `max_lines`
	    lines starting at it are returned. If no line matches, the last
	    `max_lines` lines are returned instead.

	    Args:
	        test_stdout: raw pytest output
	        max_lines: number of lines to keep from the anchor onwards (or from
	            the end in the fallback case); non-positive values yield ""

	    Returns:
	        The selected lines joined with newlines ("" for empty output).
	    """
	    # Guard: `lines[-0:]` would return the whole output and a negative value
	    # would slice from the front, so a non-positive budget means "nothing".
	    if max_lines <= 0:
	        return ""

	    lines = test_stdout.splitlines()

	    # Find the first failure-looking line and return the context around it
	    for i, line in enumerate(lines):
	        if "FAILED" in line or "ERROR" in line or "assert" in line.lower():
	            start = max(0, i - 2)
	            # Slicing past the end is safe; no explicit clamp needed.
	            return "\n".join(lines[start:i + max_lines])

	    # Fallback: last N lines (pytest puts summary at end)
	    return "\n".join(lines[-max_lines:])