Spaces:
Running
Running
"""
tests/test_phase1_sandbox.py
────────────────────────────
Unit tests for Phase 1: Sandbox executor, SWE-bench loader, evaluator,
and naive-baseline patch cleaning.
Run with: pytest tests/test_phase1_sandbox.py -v
"""
from __future__ import annotations

import json
import textwrap
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
# ── Sandbox Executor Tests ────────────────────────────────────────────────────
class TestSandboxExecutor:
    """Tests for pytest-output parsing, test bookkeeping and the command guard."""

    def test_parse_pytest_output_passed(self):
        """Both PASSED lines are reported as passed and nothing as failed."""
        from sandbox.executor import ExecResult, SandboxExecutor

        stdout = textwrap.dedent("""
            tests/test_foo.py::test_basic PASSED [ 50%]
            tests/test_foo.py::test_edge PASSED [100%]
        """)
        parsed = SandboxExecutor._parse_pytest_output(
            ExecResult("pytest", 0, stdout, "", 1.0)
        )

        for test_id in (
            "tests/test_foo.py::test_basic",
            "tests/test_foo.py::test_edge",
        ):
            assert test_id in parsed.passed
        assert parsed.failed == []

    def test_parse_pytest_output_failed(self):
        """PASSED, FAILED and ERROR lines land in their respective lists."""
        from sandbox.executor import ExecResult, SandboxExecutor

        stdout = textwrap.dedent("""
            tests/test_foo.py::test_basic PASSED
            tests/test_bar.py::test_regression FAILED
            tests/test_bar.py::test_setup ERROR
        """)
        parsed = SandboxExecutor._parse_pytest_output(
            ExecResult("pytest", 1, stdout, "", 2.0)
        )

        assert "tests/test_foo.py::test_basic" in parsed.passed
        assert "tests/test_bar.py::test_regression" in parsed.failed
        assert "tests/test_bar.py::test_setup" in parsed.errors

    def test_check_tests_resolved(self):
        """All requested tests passed, so the instance counts as resolved."""
        from sandbox.executor import TestResult

        target = "tests/test_a.py::test_x"
        guard = "tests/test_b.py::test_y"
        outcome = TestResult(passed=[target, guard], failed=[], errors=[])

        resolved, ftp, ptp = outcome.check_tests(
            fail_to_pass=[target],
            pass_to_pass=[guard],
        )

        assert resolved is True
        assert ftp[target] is True
        assert ptp[guard] is True

    def test_check_tests_not_resolved(self):
        """A still-failing fail_to_pass test keeps the instance unresolved."""
        from sandbox.executor import TestResult

        target = "tests/test_a.py::test_x"
        guard = "tests/test_b.py::test_y"
        outcome = TestResult(passed=[guard], failed=[target], errors=[])

        # The pass_to_pass mapping is returned but not inspected by this test.
        resolved, ftp, _ptp = outcome.check_tests(
            fail_to_pass=[target],
            pass_to_pass=[guard],
        )

        assert resolved is False
        assert ftp[target] is False

    def test_command_whitelist_rejects_rm(self):
        """`rm` is rejected with the whitelist ValueError."""
        from sandbox.executor import _validate_command

        forbidden = ["rm", "-rf", "/"]
        with pytest.raises(ValueError, match="not in the allowed command whitelist"):
            _validate_command(forbidden)

    def test_command_whitelist_accepts_pytest(self):
        """A pytest invocation passes validation without raising."""
        from sandbox.executor import _validate_command

        allowed = ["pytest", "-v", "tests/"]
        _validate_command(allowed)  # must not raise

    def test_empty_patch_returns_failure(self, tmp_path):
        """Applying an empty patch string yields an unsuccessful result."""
        from sandbox.executor import SandboxExecutor

        outcome = SandboxExecutor(use_docker=False).apply_patch("", tmp_path)
        assert outcome.success is False

    def test_timeout_result(self):
        """A result built with timed_out=True is flagged and not successful."""
        from sandbox.executor import ExecResult

        timed_out_run = ExecResult(
            "pytest", -1, "", "TIMEOUT after 60s", 60.0, timed_out=True
        )
        assert timed_out_run.success is False
        assert timed_out_run.timed_out is True
# ── SWE-bench Loader Tests ────────────────────────────────────────────────────
class TestSWEBenchLoader:
    """Tests for the SWE-bench loader helpers and the on-disk cache path."""

    def test_parse_list_from_string(self):
        """A JSON-encoded list string is decoded into a Python list."""
        from swe_bench.loader import _parse_list

        result = _parse_list('["test_a", "test_b"]')
        assert result == ["test_a", "test_b"]

    def test_parse_list_from_list(self):
        """An input that is already a list comes back with equal contents."""
        from swe_bench.loader import _parse_list

        result = _parse_list(["test_a", "test_b"])
        assert result == ["test_a", "test_b"]

    def test_parse_list_invalid_returns_empty(self):
        """A string that is not valid JSON yields an empty list."""
        from swe_bench.loader import _parse_list

        result = _parse_list("not_json")
        assert result == []

    def test_swe_instance_repo_name(self):
        """repo_name, org and project are derived from the ``repo`` field."""
        from swe_bench.loader import SWEInstance

        inst = SWEInstance(
            instance_id="django__django-12345",
            repo="django/django",
            base_commit="abc123",
            problem_statement="Fix bug",
            patch="--- a\n+++ b\n",
            test_patch="",
            fail_to_pass=[],
            pass_to_pass=[],
        )
        assert inst.repo_name == "django__django"
        assert inst.org == "django"
        assert inst.project == "django"

    def test_local_cache_load(self, tmp_path):
        """An instance written to the cache directory is loaded back intact."""
        from swe_bench.loader import SWEInstance, _instance_to_dict, load_swebench_lite

        # Create a fake cached dataset.
        fake_instance = SWEInstance(
            instance_id="test__repo-1",
            repo="test/repo",
            base_commit="deadbeef",
            problem_statement="Test issue",
            patch="--- a/foo.py\n+++ b/foo.py\n@@ -1 +1 @@\n-bug\n+fix\n",
            test_patch="",
            fail_to_pass=["tests/test_foo.py::test_basic"],
            pass_to_pass=[],
        )
        cache_path = tmp_path / "swebench_lite_test.json"
        # Uses the module-level ``json`` import; the encoding is explicit so the
        # fixture does not depend on the platform's default locale encoding.
        cache_path.write_text(
            json.dumps([_instance_to_dict(fake_instance)]),
            encoding="utf-8",
        )

        instances = load_swebench_lite(cache_dir=tmp_path, split="test")

        assert len(instances) == 1
        assert instances[0].instance_id == "test__repo-1"
        assert instances[0].fail_to_pass == ["tests/test_foo.py::test_basic"]
# ── Evaluator Tests ───────────────────────────────────────────────────────────
class TestEvaluator:
    """Tests for result aggregation and persistence in the evaluator."""

    def _make_result(self, instance_id: str, resolved: bool, attempts: int = 1):
        """Build an InstanceResult with ``attempts`` synthetic attempts.

        Only the last attempt can be marked resolved, and only when
        ``resolved`` is true. Every other attempt is unresolved and carries
        the "wrong_file_edit" failure category.
        """
        from swe_bench.evaluator import AttemptResult, InstanceResult

        attempt_list = []
        for attempt_num in range(1, attempts + 1):
            # Computed once so ``resolved`` and ``failure_category`` cannot drift apart.
            succeeded = attempt_num == attempts and resolved
            attempt_list.append(
                AttemptResult(
                    attempt_num=attempt_num,
                    patch="",
                    test_stdout="",
                    fail_to_pass_results={},
                    pass_to_pass_results={},
                    resolved=succeeded,
                    failure_category="success" if succeeded else "wrong_file_edit",
                )
            )
        return InstanceResult(
            instance_id=instance_id,
            repo="test/repo",
            resolved=resolved,
            attempts=attempt_list,
            total_attempts=attempts,
        )

    def test_aggregate_resolved_rate(self):
        """Two resolved out of four instances gives a resolved rate of 0.5."""
        from swe_bench.evaluator import aggregate_results

        results = [
            self._make_result("inst-1", resolved=True),
            self._make_result("inst-2", resolved=True),
            self._make_result("inst-3", resolved=False),
            self._make_result("inst-4", resolved=False),
        ]
        report = aggregate_results(results)
        assert report.resolved_count == 2
        assert report.total_instances == 4
        assert report.resolved_rate == pytest.approx(0.5, abs=1e-6)

    def test_aggregate_empty(self):
        """Aggregating no results reports zero instances and zero resolved."""
        from swe_bench.evaluator import aggregate_results

        report = aggregate_results([])
        assert report.total_instances == 0
        assert report.resolved_count == 0

    def test_attempts_to_fix(self):
        """A single instance resolved on its second attempt averages 2 attempts."""
        from swe_bench.evaluator import aggregate_results

        # One instance resolved on attempt 2.
        results = [self._make_result("inst-1", resolved=True, attempts=2)]
        report = aggregate_results(results)
        # approx avoids a brittle exact float comparison on a computed average.
        assert report.avg_attempts == pytest.approx(2.0)

    def test_failure_categories_counted(self):
        """Two failed single-attempt instances produce two category counts in total."""
        from swe_bench.evaluator import aggregate_results

        results = [
            self._make_result("inst-1", resolved=False, attempts=1),
            self._make_result("inst-2", resolved=False, attempts=1),
        ]
        report = aggregate_results(results)
        assert sum(report.failure_categories.values()) == 2

    def test_save_and_load_results(self, tmp_path):
        """save_results writes an eval_summary.json carrying the aggregate counts."""
        from swe_bench.evaluator import aggregate_results, save_results

        results = [
            self._make_result("inst-1", resolved=True),
            self._make_result("inst-2", resolved=False),
        ]
        report = aggregate_results(results)
        save_results(report, tmp_path)

        summary_path = tmp_path / "eval_summary.json"
        summary = json.loads(summary_path.read_text(encoding="utf-8"))
        assert summary["resolved_count"] == 1
        assert summary["total_instances"] == 2
# ── Naive Baseline Patch Cleaning Tests ──────────────────────────────────────
class TestNaiveBaseline:
    """Tests for removing Markdown code fences from model-produced patches."""

    def test_strip_code_fences(self):
        """A ```diff fenced patch loses its fences but keeps the diff header."""
        from agent.naive_baseline import _strip_code_fences

        fenced_patch = "```diff\n--- a/foo.py\n+++ b/foo.py\n```"
        stripped = _strip_code_fences(fenced_patch)

        assert "```" not in stripped
        assert "--- a/foo.py" in stripped

    def test_strip_triple_backtick(self):
        """With a bare ``` fence, the cleaned text starts at the diff header."""
        from agent.naive_baseline import _strip_code_fences

        fenced_patch = "```\n--- a/foo.py\n+++ b/foo.py\n```"
        stripped = _strip_code_fences(fenced_patch)

        assert stripped.startswith("--- a/foo.py")