File size: 9,848 Bytes
dc71cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""
tests/test_phase1_sandbox.py
────────────────────────────
Unit tests for Phase 1: Sandbox executor, SWE-bench loader, and evaluator.
Run with: pytest tests/test_phase1_sandbox.py -v
"""
from __future__ import annotations

import json
import textwrap
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

# ── Sandbox Executor Tests ────────────────────────────────────────────────────

class TestSandboxExecutor:
    """Tests for ``sandbox.executor``: pytest-output parsing, test-status
    checks, command validation, patch application and timeout results."""

    def test_parse_pytest_output_passed(self):
        """Two PASSED lines are both reported as passed and nothing as failed."""
        from sandbox.executor import ExecResult, SandboxExecutor

        pytest_stdout = textwrap.dedent("""
            tests/test_foo.py::test_basic PASSED                              [ 50%]
            tests/test_foo.py::test_edge PASSED                              [100%]
        """)
        parsed = SandboxExecutor._parse_pytest_output(
            ExecResult("pytest", 0, pytest_stdout, "", 1.0)
        )

        for test_id in ("tests/test_foo.py::test_basic", "tests/test_foo.py::test_edge"):
            assert test_id in parsed.passed
        assert parsed.failed == []

    def test_parse_pytest_output_failed(self):
        """PASSED, FAILED and ERROR lines land in their respective buckets."""
        from sandbox.executor import ExecResult, SandboxExecutor

        pytest_stdout = textwrap.dedent("""
            tests/test_foo.py::test_basic PASSED
            tests/test_bar.py::test_regression FAILED
            tests/test_bar.py::test_setup ERROR
        """)
        parsed = SandboxExecutor._parse_pytest_output(
            ExecResult("pytest", 1, pytest_stdout, "", 2.0)
        )

        assert "tests/test_foo.py::test_basic" in parsed.passed
        assert "tests/test_bar.py::test_regression" in parsed.failed
        assert "tests/test_bar.py::test_setup" in parsed.errors

    def test_check_tests_resolved(self):
        """When every requested test passed, the instance counts as resolved."""
        from sandbox.executor import TestResult

        target_id = "tests/test_a.py::test_x"
        guard_id = "tests/test_b.py::test_y"
        outcome = TestResult(passed=[target_id, guard_id], failed=[], errors=[])

        resolved, ftp_status, ptp_status = outcome.check_tests(
            fail_to_pass=[target_id],
            pass_to_pass=[guard_id],
        )

        assert resolved is True
        assert ftp_status[target_id] is True
        assert ptp_status[guard_id] is True

    def test_check_tests_not_resolved(self):
        """A fail-to-pass test that still fails leaves the instance unresolved."""
        from sandbox.executor import TestResult

        target_id = "tests/test_a.py::test_x"
        guard_id = "tests/test_b.py::test_y"
        outcome = TestResult(passed=[guard_id], failed=[target_id], errors=[])

        # The pass-to-pass mapping is unpacked but deliberately not asserted on.
        resolved, ftp_status, _ptp_status = outcome.check_tests(
            fail_to_pass=[target_id],
            pass_to_pass=[guard_id],
        )

        assert resolved is False
        assert ftp_status[target_id] is False

    def test_command_whitelist_rejects_rm(self):
        """``rm`` is rejected with a ValueError mentioning the whitelist."""
        from sandbox.executor import _validate_command

        forbidden = ["rm", "-rf", "/"]
        with pytest.raises(ValueError, match="not in the allowed command whitelist"):
            _validate_command(forbidden)

    def test_command_whitelist_accepts_pytest(self):
        """A ``pytest`` invocation passes validation without raising."""
        from sandbox.executor import _validate_command

        allowed = ["pytest", "-v", "tests/"]
        # Success here simply means no exception escapes the call.
        _validate_command(allowed)

    def test_empty_patch_returns_failure(self, tmp_path):
        """Applying an empty patch string yields an unsuccessful result."""
        from sandbox.executor import SandboxExecutor

        outcome = SandboxExecutor(use_docker=False).apply_patch("", tmp_path)
        assert outcome.success is False

    def test_timeout_result(self):
        """A timed-out ExecResult reports failure and keeps its timeout flag."""
        from sandbox.executor import ExecResult

        timed_out_run = ExecResult(
            "pytest", -1, "", "TIMEOUT after 60s", 60.0, timed_out=True
        )
        assert timed_out_run.success is False
        assert timed_out_run.timed_out is True


# ── SWE-bench Loader Tests ────────────────────────────────────────────────────

class TestSWEBenchLoader:
    """Tests for ``swe_bench.loader``: list parsing, ``SWEInstance`` name
    fields and loading instances from a local cache directory."""

    def test_parse_list_from_string(self):
        """A JSON-encoded list string is decoded into a Python list."""
        from swe_bench.loader import _parse_list

        assert _parse_list('["test_a", "test_b"]') == ["test_a", "test_b"]

    def test_parse_list_from_list(self):
        """A value that is already a list comes back equal to the input."""
        from swe_bench.loader import _parse_list

        assert _parse_list(["test_a", "test_b"]) == ["test_a", "test_b"]

    def test_parse_list_invalid_returns_empty(self):
        """A string that is not valid JSON yields an empty list."""
        from swe_bench.loader import _parse_list

        assert _parse_list("not_json") == []

    def test_swe_instance_repo_name(self):
        """``repo_name``, ``org`` and ``project`` expose the expected values."""
        from swe_bench.loader import SWEInstance

        inst = SWEInstance(
            instance_id="django__django-12345",
            repo="django/django",
            base_commit="abc123",
            problem_statement="Fix bug",
            patch="--- a\n+++ b\n",
            test_patch="",
            fail_to_pass=[],
            pass_to_pass=[],
        )

        assert inst.repo_name == "django__django"
        assert inst.org == "django"
        assert inst.project == "django"

    def test_local_cache_load(self, tmp_path):
        """Instances written to a cache file are returned by the loader.

        The test serialises one instance into ``swebench_lite_test.json``
        inside ``tmp_path`` and expects ``load_swebench_lite`` to return it
        for ``split="test"``.
        """
        # ``json`` comes from the module-level import; no local re-import needed.
        from swe_bench.loader import load_swebench_lite, _instance_to_dict, SWEInstance

        # Create a fake cached dataset holding a single instance.
        fake_instance = SWEInstance(
            instance_id="test__repo-1",
            repo="test/repo",
            base_commit="deadbeef",
            problem_statement="Test issue",
            patch="--- a/foo.py\n+++ b/foo.py\n@@ -1 +1 @@\n-bug\n+fix\n",
            test_patch="",
            fail_to_pass=["tests/test_foo.py::test_basic"],
            pass_to_pass=[],
        )
        cache_path = tmp_path / "swebench_lite_test.json"
        cache_path.write_text(json.dumps([_instance_to_dict(fake_instance)]))

        instances = load_swebench_lite(cache_dir=tmp_path, split="test")

        assert len(instances) == 1
        assert instances[0].instance_id == "test__repo-1"
        assert instances[0].fail_to_pass == ["tests/test_foo.py::test_basic"]
        assert instances[0].fail_to_pass == ["tests/test_foo.py::test_basic"]


# ── Evaluator Tests ───────────────────────────────────────────────────────────

class TestEvaluator:
    """Tests for ``swe_bench.evaluator``: aggregation of per-instance results
    and persistence of the summary report."""

    def _make_result(self, instance_id: str, resolved: bool, attempts: int = 1):
        """Build an ``InstanceResult`` carrying ``attempts`` attempt records.

        Only the last attempt can be marked resolved (and only when
        ``resolved`` is true); it then gets failure category ``"success"``.
        Every other attempt is categorised as ``"wrong_file_edit"``.
        """
        from swe_bench.evaluator import AttemptResult, InstanceResult

        history = []
        for number in range(1, attempts + 1):
            # Evaluated once per attempt and reused for both dependent fields.
            succeeded = number == attempts and resolved
            history.append(
                AttemptResult(
                    attempt_num=number,
                    patch="",
                    test_stdout="",
                    fail_to_pass_results={},
                    pass_to_pass_results={},
                    resolved=succeeded,
                    failure_category="success" if succeeded else "wrong_file_edit",
                )
            )

        return InstanceResult(
            instance_id=instance_id,
            repo="test/repo",
            resolved=resolved,
            attempts=history,
            total_attempts=attempts,
        )

    def test_aggregate_resolved_rate(self):
        """Two resolved instances out of four give a resolved rate of 0.5."""
        from swe_bench.evaluator import aggregate_results

        outcomes = (
            ("inst-1", True),
            ("inst-2", True),
            ("inst-3", False),
            ("inst-4", False),
        )
        report = aggregate_results(
            [self._make_result(name, resolved=flag) for name, flag in outcomes]
        )

        assert report.resolved_count == 2
        assert report.total_instances == 4
        assert abs(report.resolved_rate - 0.5) < 1e-6

    def test_aggregate_empty(self):
        """Aggregating no results reports zero instances and zero resolved."""
        from swe_bench.evaluator import aggregate_results

        report = aggregate_results([])

        assert report.total_instances == 0
        assert report.resolved_count == 0

    def test_attempts_to_fix(self):
        """One instance resolved on its second attempt averages 2.0 attempts."""
        from swe_bench.evaluator import aggregate_results

        second_try_fix = self._make_result("inst-1", resolved=True, attempts=2)
        report = aggregate_results([second_try_fix])

        assert report.avg_attempts == 2.0

    def test_failure_categories_counted(self):
        """Two unresolved single-attempt instances add up to two category counts."""
        from swe_bench.evaluator import aggregate_results

        unresolved = [
            self._make_result(name, resolved=False, attempts=1)
            for name in ("inst-1", "inst-2")
        ]
        report = aggregate_results(unresolved)

        assert sum(report.failure_categories.values()) == 2

    def test_save_and_load_results(self, tmp_path):
        """``save_results`` writes ``eval_summary.json`` with the report's counts."""
        from swe_bench.evaluator import aggregate_results, save_results

        report = aggregate_results(
            [
                self._make_result("inst-1", resolved=True),
                self._make_result("inst-2", resolved=False),
            ]
        )
        save_results(report, tmp_path)

        summary_file = tmp_path / "eval_summary.json"
        summary = json.loads(summary_file.read_text())

        assert summary["resolved_count"] == 1
        assert summary["total_instances"] == 2


# ── Naive Baseline Patch Cleaning Tests ──────────────────────────────────────

class TestNaiveBaseline:
    """Tests for the code-fence stripping helper in ``agent.naive_baseline``."""

    def test_strip_code_fences(self):
        """A ```diff fenced patch loses its fences but keeps the diff header."""
        from agent.naive_baseline import _strip_code_fences

        fenced_patch = "```diff\n--- a/foo.py\n+++ b/foo.py\n```"
        stripped = _strip_code_fences(fenced_patch)

        assert "```" not in stripped
        assert "--- a/foo.py" in stripped

    def test_strip_triple_backtick(self):
        """With a bare ``` fence, the cleaned text starts at the diff header."""
        from agent.naive_baseline import _strip_code_fences

        fenced_patch = "```\n--- a/foo.py\n+++ b/foo.py\n```"
        stripped = _strip_code_fences(fenced_patch)

        assert stripped.startswith("--- a/foo.py")