Spaces:

SouravNath
/

repomind-api

Running

File size: 5,435 Bytes

dc71cad

"""
agent/failure_categoriser.py
──────────────────────────────
Rule-based + regex failure categoriser.

After each failed attempt, the agent parses pytest output and classifies
the failure into one of these categories:

  syntax_error        — the patch introduced a SyntaxError
  hallucinated_api    — agent called a function/attribute that doesn't exist
  wrong_file_edit     — agent edited the wrong file (tests in different module fail)
  incomplete_patch    — partial fix: some tests pass but not all FAIL_TO_PASS
  flaky_test          — test is non-deterministic (passes on retry)
  import_error        — missing import or circular import introduced
  type_error          — wrong argument type passed
  assertion_error     — logic bug remains, assertion fails with unexpected value
  success             — every FAIL_TO_PASS and PASS_TO_PASS test passes (not a failure)
  unknown             — can't categorise

The category is logged to MLflow and stored in trajectory JSONL.
This taxonomy directly drives which trajectories we select for fine-tuning
(Phase 7 filters on known-category failures).
"""
from __future__ import annotations

import re
from typing import Literal

# Closed set of labels an attempt can be tagged with.  "success" is part of
# the type so the classifier can also describe attempts that did not fail.
FailureCategory = Literal[
    "syntax_error", "hallucinated_api", "wrong_file_edit", "incomplete_patch",
    "flaky_test", "import_error", "type_error", "assertion_error",
    "success", "unknown",
]

# ── Regex patterns ────────────────────────────────────────────────────────────

# Ordered (category, compiled pattern) pairs, all case-insensitive.
# Order is significant: categorise_failure returns the first entry whose
# pattern is found in the test output.  In particular "hallucinated_api" sits
# ahead of "type_error" so that the narrower "takes N positional argument"
# TypeError is claimed before the generic "TypeError:" pattern can see it.
_PATTERNS: list[tuple[FailureCategory, re.Pattern]] = [
    (
        "syntax_error",
        re.compile(
            "|".join(("SyntaxError", "IndentationError", "TabError")),
            re.IGNORECASE,
        ),
    ),
    (
        "import_error",
        re.compile(
            "|".join(("ImportError", "ModuleNotFoundError", "cannot import name")),
            re.IGNORECASE,
        ),
    ),
    (
        "hallucinated_api",
        re.compile(
            "|".join((
                r"AttributeError: .+ object has no attribute",
                r"TypeError: .+ takes \d+ positional argument",
                r"NameError: name .+ is not defined",
            )),
            re.IGNORECASE,
        ),
    ),
    ("type_error", re.compile(r"TypeError:", re.IGNORECASE)),
    ("assertion_error", re.compile(r"AssertionError", re.IGNORECASE)),
]

# Case-insensitive markers in test output that hint at non-determinism
# (resource warnings, randomness, races, network hiccups).
_FLAKY_PATTERNS = re.compile(
    "|".join((
        "ResourceWarning",
        "random",
        "race condition",
        "flaky",
        "connection refused",
        r"socket\.timeout",
    )),
    re.IGNORECASE,
)


def categorise_failure(
    test_stdout: str,
    patch_apply_success: bool,
    fail_to_pass_results: dict[str, bool],
    pass_to_pass_results: dict[str, bool],
    attempt_num: int = 1,
    previous_categories: list[FailureCategory] | None = None,
) -> FailureCategory:
    """
    Map the outcome of one attempt onto a FailureCategory label.

    The checks run in this order and the first one that fires wins:
      1. Patch could not be applied                      → syntax_error
      2. FAIL_TO_PASS is non-empty and all of it passes,
         and no PASS_TO_PASS test regressed              → success
      3. First entry of _PATTERNS found in the output    → that category
      4. Earlier attempts yielded more than one distinct
         category AND the output has a flaky marker      → flaky_test
      5. Some, but not all, FAIL_TO_PASS tests pass      → incomplete_patch
      6. A PASS_TO_PASS test regressed while every
         FAIL_TO_PASS test passes (or none were given)   → wrong_file_edit
      7. Nothing matched                                 → unknown

    Args:
        test_stdout:           raw pytest output
        patch_apply_success:   whether `git apply` succeeded
        fail_to_pass_results:  {test_id: passed} for FAIL_TO_PASS tests
        pass_to_pass_results:  {test_id: still_passing} for PASS_TO_PASS tests
        attempt_num:           current attempt number (1-indexed); accepted
                               for the caller's convenience but not read here
        previous_categories:   categories from earlier attempts (flaky detection)

    Returns:
        FailureCategory string
    """
    # (1) A diff that does not apply is treated as a syntax problem.
    if not patch_apply_success:
        return "syntax_error"

    # (2) An empty FAIL_TO_PASS mapping can never count as success, whereas
    #     an empty PASS_TO_PASS mapping means there is nothing to regress.
    every_target_fixed = bool(fail_to_pass_results) and all(
        fail_to_pass_results.values()
    )
    nothing_regressed = not pass_to_pass_results or all(
        pass_to_pass_results.values()
    )
    if every_target_fixed and nothing_regressed:
        return "success"

    # (3) Pattern table is ordered; take the first label whose regex hits.
    matched_label = next(
        (label for label, rx in _PATTERNS if rx.search(test_stdout)),
        None,
    )
    if matched_label is not None:
        return matched_label

    # (4) Varying categories in earlier attempts plus a flaky marker now.
    distinct_earlier = set(previous_categories) if previous_categories else set()
    if len(distinct_earlier) > 1 and _FLAKY_PATTERNS.search(test_stdout):
        return "flaky_test"

    # (5) Strictly between zero and all FAIL_TO_PASS tests passing.
    targets_passed = sum(map(bool, fail_to_pass_results.values()))
    targets_total = len(fail_to_pass_results)
    if 0 < targets_passed < targets_total:
        return "incomplete_patch"

    # (6) Only previously-passing tests broke: the patch hit the wrong place.
    has_regression = any(not still_ok for still_ok in pass_to_pass_results.values())
    if has_regression and targets_passed == targets_total:
        return "wrong_file_edit"

    # (7) No rule applied.
    return "unknown"


def extract_first_error_context(test_stdout: str, max_lines: int = 20) -> str:
    """
    Extract the most relevant error lines from pytest output.
    Used to build the reflection prompt — give the LLM targeted failure info.

    The excerpt is anchored on the first line that contains "FAILED" or
    "ERROR" (case-sensitive) or "assert" (any case).  It consists of up to two
    lines of lead-in before the anchor followed by ``max_lines`` lines starting
    at the anchor.  When no line qualifies, the last ``max_lines`` lines of the
    output are returned instead.

    Args:
        test_stdout: raw pytest output
        max_lines:   number of lines to keep from the anchor onwards (or from
                     the tail in the fallback case); values <= 0 yield ""

    Returns:
        The selected lines joined with newlines; "" for empty output or a
        non-positive ``max_lines``.
    """
    # Without this guard the fallback slice `lines[-0:]` would hand back the
    # *entire* output, and a negative value would produce an arbitrary slice.
    if max_lines <= 0:
        return ""

    lines = test_stdout.splitlines()

    # Find the first failure-looking line and return context around it
    for i, line in enumerate(lines):
        if "FAILED" in line or "ERROR" in line or "assert" in line.lower():
            start = max(0, i - 2)  # keep a little lead-in for readability
            return "\n".join(lines[start:i + max_lines])

    # Fallback: last N lines (pytest puts summary at end)
    return "\n".join(lines[-max_lines:])