# Spaces: SouravNath / repomind-api — Running
# File size: 9,848 Bytes
# dc71cad

"""
tests/test_phase1_sandbox.py
────────────────────────────
Unit tests for Phase 1: Sandbox executor, SWE-bench loader, and evaluator.
Run with: pytest tests/test_phase1_sandbox.py -v
"""
from __future__ import annotations

import json
import textwrap
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

# ── Sandbox Executor Tests ────────────────────────────────────────────────────

class TestSandboxExecutor:
    """Tests for pytest-output parsing, test bookkeeping, and command checks.

    Every test imports what it needs from ``sandbox.executor`` locally, so a
    missing or broken module fails the individual test, not collection.
    """

    def test_parse_pytest_output_passed(self):
        """Two PASSED lines end up in ``passed`` and ``failed`` stays empty."""
        from sandbox.executor import ExecResult, SandboxExecutor

        pytest_stdout = textwrap.dedent("""
            tests/test_foo.py::test_basic PASSED                              [ 50%]
            tests/test_foo.py::test_edge PASSED                              [100%]
        """)
        exec_result = ExecResult("pytest", 0, pytest_stdout, "", 1.0)

        parsed = SandboxExecutor._parse_pytest_output(exec_result)

        for node_id in (
            "tests/test_foo.py::test_basic",
            "tests/test_foo.py::test_edge",
        ):
            assert node_id in parsed.passed
        assert parsed.failed == []

    def test_parse_pytest_output_failed(self):
        """PASSED / FAILED / ERROR lines are sorted into their own lists."""
        from sandbox.executor import ExecResult, SandboxExecutor

        pytest_stdout = textwrap.dedent("""
            tests/test_foo.py::test_basic PASSED
            tests/test_bar.py::test_regression FAILED
            tests/test_bar.py::test_setup ERROR
        """)
        exec_result = ExecResult("pytest", 1, pytest_stdout, "", 2.0)

        parsed = SandboxExecutor._parse_pytest_output(exec_result)

        assert "tests/test_foo.py::test_basic" in parsed.passed
        assert "tests/test_bar.py::test_regression" in parsed.failed
        assert "tests/test_bar.py::test_setup" in parsed.errors

    def test_check_tests_resolved(self):
        """With both target tests passing, ``check_tests`` reports resolved."""
        from sandbox.executor import TestResult

        target_fix = "tests/test_a.py::test_x"
        target_keep = "tests/test_b.py::test_y"
        outcome = TestResult(passed=[target_fix, target_keep], failed=[], errors=[])

        resolved, ftp, ptp = outcome.check_tests(
            fail_to_pass=[target_fix],
            pass_to_pass=[target_keep],
        )

        assert resolved is True
        assert ftp[target_fix] is True
        assert ptp[target_keep] is True

    def test_check_tests_not_resolved(self):
        """A still-failing fail-to-pass test means the run is not resolved."""
        from sandbox.executor import TestResult

        target_fix = "tests/test_a.py::test_x"
        target_keep = "tests/test_b.py::test_y"
        outcome = TestResult(passed=[target_keep], failed=[target_fix], errors=[])

        # The pass-to-pass mapping is returned but not inspected by this test.
        resolved, ftp, _ptp = outcome.check_tests(
            fail_to_pass=[target_fix],
            pass_to_pass=[target_keep],
        )

        assert resolved is False
        assert ftp[target_fix] is False

    def test_command_whitelist_rejects_rm(self):
        """``rm`` must be refused with the whitelist error message."""
        from sandbox.executor import _validate_command

        forbidden = ["rm", "-rf", "/"]
        with pytest.raises(ValueError, match="not in the allowed command whitelist"):
            _validate_command(forbidden)

    def test_command_whitelist_accepts_pytest(self):
        """A ``pytest`` invocation validates without raising."""
        from sandbox.executor import _validate_command

        # Passing means no exception escapes the call.
        _validate_command(["pytest", "-v", "tests/"])

    def test_empty_patch_returns_failure(self, tmp_path):
        """Applying an empty patch string yields an unsuccessful result."""
        from sandbox.executor import SandboxExecutor

        outcome = SandboxExecutor(use_docker=False).apply_patch("", tmp_path)
        assert outcome.success is False

    def test_timeout_result(self):
        """A timed-out ``ExecResult`` is unsuccessful and flagged as timed out."""
        from sandbox.executor import ExecResult

        timed_out_run = ExecResult(
            "pytest", -1, "", "TIMEOUT after 60s", 60.0, timed_out=True
        )
        assert timed_out_run.success is False
        assert timed_out_run.timed_out is True


# ── SWE-bench Loader Tests ────────────────────────────────────────────────────

class TestSWEBenchLoader:
    """Tests for ``swe_bench.loader``: list parsing, ``SWEInstance``, caching."""

    def test_parse_list_from_string(self):
        """A JSON-encoded list string is decoded into a Python list."""
        from swe_bench.loader import _parse_list
        result = _parse_list('["test_a", "test_b"]')
        assert result == ["test_a", "test_b"]

    def test_parse_list_from_list(self):
        """A value that is already a list comes back equal to the input."""
        from swe_bench.loader import _parse_list
        result = _parse_list(["test_a", "test_b"])
        assert result == ["test_a", "test_b"]

    def test_parse_list_invalid_returns_empty(self):
        """A string that is not valid JSON produces an empty list."""
        from swe_bench.loader import _parse_list
        result = _parse_list("not_json")
        assert result == []

    def test_swe_instance_repo_name(self):
        """``repo_name``, ``org`` and ``project`` are derived from ``repo``."""
        from swe_bench.loader import SWEInstance
        inst = SWEInstance(
            instance_id="django__django-12345",
            repo="django/django",
            base_commit="abc123",
            problem_statement="Fix bug",
            patch="--- a\n+++ b\n",
            test_patch="",
            fail_to_pass=[],
            pass_to_pass=[],
        )
        assert inst.repo_name == "django__django"
        assert inst.org == "django"
        assert inst.project == "django"

    def test_local_cache_load(self, tmp_path: Path):
        """A cache file written into ``cache_dir`` is what the loader returns.

        Args:
            tmp_path: pytest's per-test temporary directory, used as the
                cache directory.
        """
        # ``json`` comes from the module-level import; no local re-import.
        from swe_bench.loader import SWEInstance, _instance_to_dict, load_swebench_lite

        # Create a fake cached dataset
        fake_instance = SWEInstance(
            instance_id="test__repo-1",
            repo="test/repo",
            base_commit="deadbeef",
            problem_statement="Test issue",
            patch="--- a/foo.py\n+++ b/foo.py\n@@ -1 +1 @@\n-bug\n+fix\n",
            test_patch="",
            fail_to_pass=["tests/test_foo.py::test_basic"],
            pass_to_pass=[],
        )
        cache_path = tmp_path / "swebench_lite_test.json"
        # Explicit encoding keeps the fixture bytes independent of the
        # platform's locale default.
        cache_path.write_text(
            json.dumps([_instance_to_dict(fake_instance)]),
            encoding="utf-8",
        )

        instances = load_swebench_lite(cache_dir=tmp_path, split="test")
        assert len(instances) == 1
        assert instances[0].instance_id == "test__repo-1"
        assert instances[0].fail_to_pass == ["tests/test_foo.py::test_basic"]


# ── Evaluator Tests ───────────────────────────────────────────────────────────

class TestEvaluator:
    """Tests for result aggregation and persistence in ``swe_bench.evaluator``."""

    def _make_result(self, instance_id: str, resolved: bool, attempts: int = 1):
        """Build an ``InstanceResult`` with ``attempts`` synthetic attempts.

        Only the final attempt can be marked resolved, and only when
        ``resolved`` is true. That attempt gets the ``"success"`` category;
        every other attempt is categorised as ``"wrong_file_edit"``.
        """
        from swe_bench.evaluator import AttemptResult, InstanceResult

        history = []
        for i in range(attempts):
            # True only for the last attempt of a resolved instance.
            succeeded = (i + 1 == attempts and resolved)
            history.append(
                AttemptResult(
                    attempt_num=i + 1,
                    patch="",
                    test_stdout="",
                    fail_to_pass_results={},
                    pass_to_pass_results={},
                    resolved=succeeded,
                    failure_category="success" if succeeded else "wrong_file_edit",
                )
            )

        return InstanceResult(
            instance_id=instance_id,
            repo="test/repo",
            resolved=resolved,
            attempts=history,
            total_attempts=attempts,
        )

    def test_aggregate_resolved_rate(self):
        """Two resolved out of four instances gives a 0.5 resolved rate."""
        from swe_bench.evaluator import aggregate_results

        outcomes = [
            ("inst-1", True),
            ("inst-2", True),
            ("inst-3", False),
            ("inst-4", False),
        ]
        results = [
            self._make_result(inst_id, resolved=flag) for inst_id, flag in outcomes
        ]

        report = aggregate_results(results)

        assert report.resolved_count == 2
        assert report.total_instances == 4
        assert abs(report.resolved_rate - 0.5) < 1e-6

    def test_aggregate_empty(self):
        """Aggregating no results reports zero instances and zero resolved."""
        from swe_bench.evaluator import aggregate_results

        empty_report = aggregate_results([])

        assert empty_report.total_instances == 0
        assert empty_report.resolved_count == 0

    def test_attempts_to_fix(self):
        """One instance resolved on its second attempt averages 2.0 attempts."""
        from swe_bench.evaluator import aggregate_results

        second_try_fix = self._make_result("inst-1", resolved=True, attempts=2)

        report = aggregate_results([second_try_fix])

        assert report.avg_attempts == 2.0

    def test_failure_categories_counted(self):
        """Two unresolved instances contribute two failure-category counts."""
        from swe_bench.evaluator import aggregate_results

        results = [
            self._make_result(inst_id, resolved=False, attempts=1)
            for inst_id in ("inst-1", "inst-2")
        ]

        report = aggregate_results(results)

        assert sum(report.failure_categories.values()) == 2

    def test_save_and_load_results(self, tmp_path):
        """``save_results`` writes ``eval_summary.json`` with the report counts."""
        from swe_bench.evaluator import aggregate_results, save_results

        report = aggregate_results(
            [
                self._make_result("inst-1", resolved=True),
                self._make_result("inst-2", resolved=False),
            ]
        )
        save_results(report, tmp_path)

        summary_path = tmp_path / "eval_summary.json"
        summary = json.loads(summary_path.read_text())
        assert summary["resolved_count"] == 1
        assert summary["total_instances"] == 2


# ── Naive Baseline Patch Cleaning Tests ──────────────────────────────────────

class TestNaiveBaseline:
    """Tests for ``_strip_code_fences`` in ``agent.naive_baseline``."""

    def test_strip_code_fences(self):
        """A ```diff fenced patch loses its fences but keeps the diff body."""
        from agent.naive_baseline import _strip_code_fences

        fenced_patch = "```diff\n--- a/foo.py\n+++ b/foo.py\n```"
        stripped = _strip_code_fences(fenced_patch)

        assert "```" not in stripped
        assert "--- a/foo.py" in stripped

    def test_strip_triple_backtick(self):
        """With a bare ``` fence, the output starts at the diff header."""
        from agent.naive_baseline import _strip_code_fences

        fenced_patch = "```\n--- a/foo.py\n+++ b/foo.py\n```"
        stripped = _strip_code_fences(fenced_patch)

        assert stripped.startswith("--- a/foo.py")