Spaces:

SouravNath
/

repomind-api

Running

App Files Files Community

repomind-api / tests /test_phase1_sandbox.py

SouravNath

Initial commit

dc71cad 4 days ago

raw

history blame contribute delete

9.85 kB

	"""
	tests/test_phase1_sandbox.py
	────────────────────────────
	Unit tests for Phase 1: sandbox executor, SWE-bench loader, evaluator, and naive-baseline patch cleaning.
	Run with: pytest tests/test_phase1_sandbox.py -v
	"""
	from __future__ import annotations

	import json
	import textwrap
	from pathlib import Path
	from unittest.mock import MagicMock, patch

	import pytest

	# ── Sandbox Executor Tests ────────────────────────────────────────────────────

	class TestSandboxExecutor:
	    """Tests for pytest-output parsing, test resolution, the command whitelist and result flags."""

	    def test_parse_pytest_output_passed(self):
	        """Asserts that PASSED lines are collected into ``passed`` and ``failed`` stays empty."""
	        from sandbox.executor import ExecResult, SandboxExecutor

	        stdout = textwrap.dedent("""
	        tests/test_foo.py::test_basic PASSED [ 50%]
	        tests/test_foo.py::test_edge PASSED [100%]
	        """)
	        parsed = SandboxExecutor._parse_pytest_output(
	            ExecResult("pytest", 0, stdout, "", 1.0)
	        )

	        for node_id in (
	            "tests/test_foo.py::test_basic",
	            "tests/test_foo.py::test_edge",
	        ):
	            assert node_id in parsed.passed
	        assert parsed.failed == []

	    def test_parse_pytest_output_failed(self):
	        """Asserts that PASSED / FAILED / ERROR lines are routed to their own lists."""
	        from sandbox.executor import ExecResult, SandboxExecutor

	        stdout = textwrap.dedent("""
	        tests/test_foo.py::test_basic PASSED
	        tests/test_bar.py::test_regression FAILED
	        tests/test_bar.py::test_setup ERROR
	        """)
	        parsed = SandboxExecutor._parse_pytest_output(
	            ExecResult("pytest", 1, stdout, "", 2.0)
	        )

	        assert "tests/test_foo.py::test_basic" in parsed.passed
	        assert "tests/test_bar.py::test_regression" in parsed.failed
	        assert "tests/test_bar.py::test_setup" in parsed.errors

	    def test_check_tests_resolved(self):
	        """Asserts ``check_tests`` reports resolved when both listed tests passed."""
	        from sandbox.executor import TestResult

	        target = "tests/test_a.py::test_x"
	        guard = "tests/test_b.py::test_y"
	        outcome = TestResult(passed=[target, guard], failed=[], errors=[])

	        resolved, ftp, ptp = outcome.check_tests(
	            fail_to_pass=[target],
	            pass_to_pass=[guard],
	        )

	        assert resolved is True
	        assert ftp[target] is True
	        assert ptp[guard] is True

	    def test_check_tests_not_resolved(self):
	        """Asserts ``check_tests`` reports unresolved when the fail-to-pass test failed."""
	        from sandbox.executor import TestResult

	        target = "tests/test_a.py::test_x"
	        guard = "tests/test_b.py::test_y"
	        outcome = TestResult(passed=[guard], failed=[target], errors=[])

	        resolved, ftp, ptp = outcome.check_tests(
	            fail_to_pass=[target],
	            pass_to_pass=[guard],
	        )

	        assert resolved is False
	        assert ftp[target] is False

	    def test_command_whitelist_rejects_rm(self):
	        """Asserts ``_validate_command`` raises ``ValueError`` for an ``rm`` command."""
	        from sandbox.executor import _validate_command

	        forbidden = ["rm", "-rf", "/"]
	        with pytest.raises(ValueError, match="not in the allowed command whitelist"):
	            _validate_command(forbidden)

	    def test_command_whitelist_accepts_pytest(self):
	        """Calls ``_validate_command`` with a pytest command; the test passes if nothing is raised."""
	        from sandbox.executor import _validate_command

	        permitted = ["pytest", "-v", "tests/"]
	        _validate_command(permitted)

	    def test_empty_patch_returns_failure(self, tmp_path):
	        """Asserts applying an empty patch string yields ``success is False``."""
	        from sandbox.executor import SandboxExecutor

	        outcome = SandboxExecutor(use_docker=False).apply_patch("", tmp_path)
	        assert outcome.success is False

	    def test_timeout_result(self):
	        """Asserts an ``ExecResult`` built with ``timed_out=True`` is not successful."""
	        from sandbox.executor import ExecResult

	        timed_out_run = ExecResult(
	            "pytest", -1, "", "TIMEOUT after 60s", 60.0, timed_out=True
	        )
	        assert timed_out_run.success is False
	        assert timed_out_run.timed_out is True


	# ── SWE-bench Loader Tests ────────────────────────────────────────────────────

	class TestSWEBenchLoader:
	    """Tests for ``_parse_list``, ``SWEInstance`` derived names and cached dataset loading."""

	    def test_parse_list_from_string(self):
	        """Asserts a JSON-array string is parsed into a list of strings."""
	        from swe_bench.loader import _parse_list

	        assert _parse_list('["test_a", "test_b"]') == ["test_a", "test_b"]

	    def test_parse_list_from_list(self):
	        """Asserts a list argument compares equal to the same list on return."""
	        from swe_bench.loader import _parse_list

	        assert _parse_list(["test_a", "test_b"]) == ["test_a", "test_b"]

	    def test_parse_list_invalid_returns_empty(self):
	        """Asserts a non-JSON string yields an empty list."""
	        from swe_bench.loader import _parse_list

	        assert _parse_list("not_json") == []

	    def test_swe_instance_repo_name(self):
	        """Asserts ``repo_name``, ``org`` and ``project`` for the repo ``django/django``."""
	        from swe_bench.loader import SWEInstance

	        instance = SWEInstance(
	            instance_id="django__django-12345",
	            repo="django/django",
	            base_commit="abc123",
	            problem_statement="Fix bug",
	            patch="--- a\n+++ b\n",
	            test_patch="",
	            fail_to_pass=[],
	            pass_to_pass=[],
	        )

	        assert instance.repo_name == "django__django"
	        assert instance.org == "django"
	        assert instance.project == "django"

	    def test_local_cache_load(self, tmp_path):
	        """Asserts ``load_swebench_lite`` returns the instance stored in a cache file.

	        A single serialized instance is written to ``swebench_lite_test.json``
	        under ``tmp_path`` and then loaded back with ``split="test"``.
	        """
	        from swe_bench.loader import SWEInstance, _instance_to_dict, load_swebench_lite

	        # Create a fake cached dataset (``json`` is the module-level import).
	        cached_instance = SWEInstance(
	            instance_id="test__repo-1",
	            repo="test/repo",
	            base_commit="deadbeef",
	            problem_statement="Test issue",
	            patch="--- a/foo.py\n+++ b/foo.py\n@@ -1 +1 @@\n-bug\n+fix\n",
	            test_patch="",
	            fail_to_pass=["tests/test_foo.py::test_basic"],
	            pass_to_pass=[],
	        )
	        cache_file = tmp_path / "swebench_lite_test.json"
	        cache_file.write_text(json.dumps([_instance_to_dict(cached_instance)]))

	        loaded = load_swebench_lite(cache_dir=tmp_path, split="test")

	        assert len(loaded) == 1
	        assert loaded[0].instance_id == "test__repo-1"
	        assert loaded[0].fail_to_pass == ["tests/test_foo.py::test_basic"]


	# ── Evaluator Tests ───────────────────────────────────────────────────────────

	class TestEvaluator:
	    """Tests for ``aggregate_results`` and ``save_results`` using synthetic instance results."""

	    def _make_result(self, instance_id: str, resolved: bool, attempts: int = 1):
	        """Build an ``InstanceResult`` carrying ``attempts`` synthetic attempts.

	        Only the final attempt can be marked resolved (and only when
	        ``resolved`` is truthy); every other attempt is categorised as
	        ``"wrong_file_edit"``.
	        """
	        from swe_bench.evaluator import AttemptResult, InstanceResult

	        history = []
	        for attempt_num in range(1, attempts + 1):
	            # Evaluated once per attempt and reused for both fields below.
	            final_success = attempt_num == attempts and resolved
	            history.append(
	                AttemptResult(
	                    attempt_num=attempt_num,
	                    patch="",
	                    test_stdout="",
	                    fail_to_pass_results={},
	                    pass_to_pass_results={},
	                    resolved=final_success,
	                    failure_category="success" if final_success else "wrong_file_edit",
	                )
	            )

	        return InstanceResult(
	            instance_id=instance_id,
	            repo="test/repo",
	            resolved=resolved,
	            attempts=history,
	            total_attempts=attempts,
	        )

	    def test_aggregate_resolved_rate(self):
	        """Asserts two resolved out of four instances gives a resolved rate of 0.5."""
	        from swe_bench.evaluator import aggregate_results

	        outcomes = (
	            ("inst-1", True),
	            ("inst-2", True),
	            ("inst-3", False),
	            ("inst-4", False),
	        )
	        results = [self._make_result(name, resolved=flag) for name, flag in outcomes]

	        report = aggregate_results(results)

	        assert report.resolved_count == 2
	        assert report.total_instances == 4
	        assert abs(report.resolved_rate - 0.5) < 1e-6

	    def test_aggregate_empty(self):
	        """Asserts aggregating no results reports zero instances and zero resolved."""
	        from swe_bench.evaluator import aggregate_results

	        report = aggregate_results([])

	        assert report.total_instances == 0
	        assert report.resolved_count == 0

	    def test_attempts_to_fix(self):
	        """Asserts ``avg_attempts`` is 2.0 for a single instance resolved on attempt 2."""
	        from swe_bench.evaluator import aggregate_results

	        # One instance resolved on attempt 2
	        solved_second_try = self._make_result("inst-1", resolved=True, attempts=2)
	        report = aggregate_results([solved_second_try])

	        assert report.avg_attempts == 2.0

	    def test_failure_categories_counted(self):
	        """Asserts the failure-category counts sum to 2 for two unresolved instances."""
	        from swe_bench.evaluator import aggregate_results

	        results = [
	            self._make_result(name, resolved=False, attempts=1)
	            for name in ("inst-1", "inst-2")
	        ]
	        report = aggregate_results(results)

	        assert sum(report.failure_categories.values()) == 2

	    def test_save_and_load_results(self, tmp_path):
	        """Asserts ``save_results`` writes an ``eval_summary.json`` with the expected counts."""
	        from swe_bench.evaluator import aggregate_results, save_results

	        report = aggregate_results(
	            [
	                self._make_result("inst-1", resolved=True),
	                self._make_result("inst-2", resolved=False),
	            ]
	        )
	        save_results(report, tmp_path)

	        summary_file = tmp_path / "eval_summary.json"
	        summary = json.loads(summary_file.read_text())

	        assert summary["resolved_count"] == 1
	        assert summary["total_instances"] == 2


	# ── Naive Baseline Patch Cleaning Tests ──────────────────────────────────────

	class TestNaiveBaseline:
	    """Tests for ``_strip_code_fences`` on fenced diff text."""

	    def test_strip_code_fences(self):
	        """Asserts a ```` ```diff ```` fence is removed while the diff header is kept."""
	        from agent.naive_baseline import _strip_code_fences

	        fenced = "```diff\n--- a/foo.py\n+++ b/foo.py\n```"
	        stripped = _strip_code_fences(fenced)

	        assert "```" not in stripped
	        assert "--- a/foo.py" in stripped

	    def test_strip_triple_backtick(self):
	        """Asserts that, for a bare fence, the output starts at the diff header."""
	        from agent.naive_baseline import _strip_code_fences

	        fenced = "```\n--- a/foo.py\n+++ b/foo.py\n```"
	        stripped = _strip_code_fences(fenced)

	        assert stripped.startswith("--- a/foo.py")