Priyansh Saxena committed on
Commit ·
72a7241
1
Parent(s): fb9822b
tests: added extensive testing
Browse files
Signed-off-by: Priyansh Saxena <priyena.programming@gmail.com>
- pytest.ini +2 -1
- server/app.py +4 -0
- src/pytorch_debug_env/reward.py +1 -1
- src/pytorch_debug_env/scenario_generator.py +4 -1
- tests/test_environment.py +4 -0
- tests/test_environment_edge_cases.py +98 -0
- tests/test_graders.py +60 -1
- tests/test_reward.py +54 -1
- tests/test_scenario_generator.py +10 -0
pytest.ini
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
[pytest]
|
| 2 |
-
asyncio_mode = auto
|
|
|
|
|
|
| 1 |
[pytest]
|
| 2 |
+
asyncio_mode = auto
|
| 3 |
+
asyncio_default_fixture_loop_scope = function
|
server/app.py
CHANGED
|
@@ -12,3 +12,7 @@ def main():
|
|
| 12 |
host = os.environ.get("HOST", "0.0.0.0")
|
| 13 |
port = int(os.environ.get("PORT", "7860"))
|
| 14 |
uvicorn.run(app, host=host, port=port)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
host = os.environ.get("HOST", "0.0.0.0")
|
| 13 |
port = int(os.environ.get("PORT", "7860"))
|
| 14 |
uvicorn.run(app, host=host, port=port)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
if __name__ == "__main__":
|
| 18 |
+
main()
|
src/pytorch_debug_env/reward.py
CHANGED
|
@@ -81,7 +81,7 @@ def compute_step_reward(
|
|
| 81 |
diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
|
| 82 |
|
| 83 |
total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
|
| 84 |
-
total = round(min(max(total,
|
| 85 |
|
| 86 |
return total, {
|
| 87 |
"hypothesis_quality": current_quality,
|
|
|
|
| 81 |
diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
|
| 82 |
|
| 83 |
total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
|
| 84 |
+
total = round(min(max(total, 0.0), 1.0), 4)
|
| 85 |
|
| 86 |
return total, {
|
| 87 |
"hypothesis_quality": current_quality,
|
src/pytorch_debug_env/scenario_generator.py
CHANGED
|
@@ -28,7 +28,10 @@ class ScenarioGenerator:
|
|
| 28 |
|
| 29 |
def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
|
| 30 |
rng = random.Random(seed)
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
repo_files = self._base_repo(rng)
|
| 34 |
repo_files = template.repo_mutator(repo_files, rng)
|
|
|
|
| 28 |
|
| 29 |
def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
|
| 30 |
rng = random.Random(seed)
|
| 31 |
+
candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
|
| 32 |
+
if not candidates:
|
| 33 |
+
raise ValueError(f"Unknown difficulty: {difficulty}")
|
| 34 |
+
template = rng.choice(candidates)
|
| 35 |
|
| 36 |
repo_files = self._base_repo(rng)
|
| 37 |
repo_files = template.repo_mutator(repo_files, rng)
|
tests/test_environment.py
CHANGED
|
@@ -10,3 +10,7 @@ async def test_env_reset():
|
|
| 10 |
env = PyTorchDebugEnv(generator=generator)
|
| 11 |
obs = await env.reset("easy")
|
| 12 |
assert obs.task_id == "easy"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
env = PyTorchDebugEnv(generator=generator)
|
| 11 |
obs = await env.reset("easy")
|
| 12 |
assert obs.task_id == "easy"
|
| 13 |
+
assert "train.py" in obs.revealed_files
|
| 14 |
+
assert "config/training_config.yaml" in obs.revealed_files
|
| 15 |
+
assert obs.step_num == 0
|
| 16 |
+
assert obs.steps_remaining >= 0
|
tests/test_environment_edge_cases.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from src.pytorch_debug_env.bug_library import BUG_TEMPLATES
|
| 4 |
+
from src.pytorch_debug_env.environment import PyTorchDebugEnv
|
| 5 |
+
from src.pytorch_debug_env.models import (
|
| 6 |
+
FinalDiagnosis,
|
| 7 |
+
Hypothesis,
|
| 8 |
+
InvestigationAction,
|
| 9 |
+
PyTorchDebugAction,
|
| 10 |
+
)
|
| 11 |
+
from src.pytorch_debug_env.scenario_generator import ScenarioGenerator
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def make_env():
|
| 15 |
+
generator = ScenarioGenerator(BUG_TEMPLATES)
|
| 16 |
+
return PyTorchDebugEnv(generator=generator)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def base_hypothesis():
|
| 20 |
+
return Hypothesis(
|
| 21 |
+
bug_type="missing_zero_grad",
|
| 22 |
+
affected_file="train.py",
|
| 23 |
+
confidence=0.6,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def final_diagnosis():
|
| 28 |
+
return FinalDiagnosis(
|
| 29 |
+
bug_type="missing_zero_grad",
|
| 30 |
+
affected_file="train.py",
|
| 31 |
+
line_range=[9, 14],
|
| 32 |
+
fix_strategy="Call optimizer.zero_grad() before loss.backward()",
|
| 33 |
+
confidence=0.7,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@pytest.mark.asyncio
|
| 38 |
+
async def test_state_before_reset_returns_none():
|
| 39 |
+
env = make_env()
|
| 40 |
+
assert await env.state() is None
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@pytest.mark.asyncio
|
| 44 |
+
async def test_step_without_reset_raises():
|
| 45 |
+
env = make_env()
|
| 46 |
+
action = PyTorchDebugAction(current_hypothesis=base_hypothesis())
|
| 47 |
+
with pytest.raises(RuntimeError):
|
| 48 |
+
await env.step(action)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@pytest.mark.asyncio
|
| 52 |
+
async def test_reveal_file_adds_to_observation():
|
| 53 |
+
env = make_env()
|
| 54 |
+
await env.reset("easy")
|
| 55 |
+
target = "data/dataset.py"
|
| 56 |
+
action = PyTorchDebugAction(
|
| 57 |
+
current_hypothesis=base_hypothesis(),
|
| 58 |
+
investigation_action=InvestigationAction(action="reveal_file", target=target),
|
| 59 |
+
)
|
| 60 |
+
result = await env.step(action)
|
| 61 |
+
assert target in result["observation"].revealed_files
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@pytest.mark.asyncio
|
| 65 |
+
async def test_step_after_done_raises():
|
| 66 |
+
env = make_env()
|
| 67 |
+
await env.reset("easy")
|
| 68 |
+
action = PyTorchDebugAction(
|
| 69 |
+
current_hypothesis=base_hypothesis(),
|
| 70 |
+
commit_diagnosis=True,
|
| 71 |
+
final_diagnosis=final_diagnosis(),
|
| 72 |
+
)
|
| 73 |
+
await env.step(action)
|
| 74 |
+
with pytest.raises(RuntimeError):
|
| 75 |
+
await env.step(action)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@pytest.mark.asyncio
|
| 79 |
+
async def test_reward_range_and_info_keys():
|
| 80 |
+
env = make_env()
|
| 81 |
+
await env.reset("easy")
|
| 82 |
+
action = PyTorchDebugAction(
|
| 83 |
+
current_hypothesis=base_hypothesis(),
|
| 84 |
+
investigation_action=InvestigationAction(
|
| 85 |
+
action="reveal_file",
|
| 86 |
+
target="model/attention.py",
|
| 87 |
+
),
|
| 88 |
+
)
|
| 89 |
+
result = await env.step(action)
|
| 90 |
+
assert 0.0 <= result["reward"] <= 1.0
|
| 91 |
+
for key in (
|
| 92 |
+
"hypothesis_quality",
|
| 93 |
+
"hypothesis_delta",
|
| 94 |
+
"investigation_reward",
|
| 95 |
+
"diagnosis_reward",
|
| 96 |
+
"confirmation_bonus",
|
| 97 |
+
):
|
| 98 |
+
assert key in result["info"]
|
tests/test_graders.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# tests/test_graders.py
|
| 2 |
-
from src.pytorch_debug_env.graders import grade_easy
|
| 3 |
|
| 4 |
def test_grade_easy():
|
| 5 |
gt = {
|
|
@@ -17,3 +17,62 @@ def test_grade_easy():
|
|
| 17 |
"confidence": 0.8
|
| 18 |
}
|
| 19 |
assert grade_easy(action, gt) > 0.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# tests/test_graders.py
|
| 2 |
+
from src.pytorch_debug_env.graders import grade_easy, grade_hard, grade_medium
|
| 3 |
|
| 4 |
def test_grade_easy():
|
| 5 |
gt = {
|
|
|
|
| 17 |
"confidence": 0.8
|
| 18 |
}
|
| 19 |
assert grade_easy(action, gt) > 0.8
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_grade_medium_related_file_bonus():
|
| 23 |
+
gt = {
|
| 24 |
+
"bug_type": "data_leakage",
|
| 25 |
+
"primary_bug_file": "data/dataset.py",
|
| 26 |
+
"related_files": ["data/preprocessing.py"],
|
| 27 |
+
"line_range": [4, 6],
|
| 28 |
+
"fix_strategy": "Ensure validation split is strictly separate from training",
|
| 29 |
+
}
|
| 30 |
+
action = {
|
| 31 |
+
"bug_type": "data_leakage",
|
| 32 |
+
"affected_file": "data/preprocessing.py",
|
| 33 |
+
"line_range": [1, 2],
|
| 34 |
+
"fix_strategy": "Ensure validation split is strictly separate from training",
|
| 35 |
+
"confidence": 0.6,
|
| 36 |
+
}
|
| 37 |
+
assert grade_medium(action, gt) >= grade_easy(action, gt)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_grade_hard_category_partial_credit():
|
| 41 |
+
gt = {
|
| 42 |
+
"bug_type": "missing_zero_grad",
|
| 43 |
+
"category": "optimization",
|
| 44 |
+
"primary_bug_file": "train.py",
|
| 45 |
+
"related_files": [],
|
| 46 |
+
"red_herring_file": "model/attention.py",
|
| 47 |
+
"line_range": [10, 12],
|
| 48 |
+
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 49 |
+
}
|
| 50 |
+
action = {
|
| 51 |
+
"bug_type": "wrong_loss_function",
|
| 52 |
+
"affected_file": "data/dataset.py",
|
| 53 |
+
"line_range": [1, 2],
|
| 54 |
+
"fix_strategy": "Use CrossEntropyLoss instead of MSE",
|
| 55 |
+
"confidence": 0.5,
|
| 56 |
+
}
|
| 57 |
+
assert grade_hard(action, gt) >= 0.18
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_grade_hard_penalizes_red_herring():
|
| 61 |
+
gt = {
|
| 62 |
+
"bug_type": "memory_leak",
|
| 63 |
+
"category": "resource",
|
| 64 |
+
"primary_bug_file": "data/dataset.py",
|
| 65 |
+
"related_files": ["train.py"],
|
| 66 |
+
"red_herring_file": "model/attention.py",
|
| 67 |
+
"line_range": [5, 9],
|
| 68 |
+
"fix_strategy": "Avoid holding reference to tensors in class cache",
|
| 69 |
+
}
|
| 70 |
+
action = {
|
| 71 |
+
"bug_type": "memory_leak",
|
| 72 |
+
"affected_file": "model/attention.py",
|
| 73 |
+
"line_range": [5, 9],
|
| 74 |
+
"fix_strategy": "Avoid holding reference to tensors in class cache",
|
| 75 |
+
"confidence": 0.7,
|
| 76 |
+
}
|
| 77 |
+
penalized = grade_hard(action, gt)
|
| 78 |
+
assert penalized <= 0.9
|
tests/test_reward.py
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
# tests/test_reward.py
|
| 2 |
-
from src.pytorch_debug_env.reward import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def test_hypothesis_quality_exact_match():
|
|
@@ -14,3 +19,51 @@ def test_hypothesis_quality_exact_match():
|
|
| 14 |
"confidence": 0.8,
|
| 15 |
}
|
| 16 |
assert hypothesis_quality(hyp, gt) > 0.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# tests/test_reward.py
|
| 2 |
+
from src.pytorch_debug_env.reward import (
|
| 3 |
+
compute_step_reward,
|
| 4 |
+
final_diagnosis_score,
|
| 5 |
+
hypothesis_quality,
|
| 6 |
+
line_overlap,
|
| 7 |
+
)
|
| 8 |
|
| 9 |
|
| 10 |
def test_hypothesis_quality_exact_match():
|
|
|
|
| 19 |
"confidence": 0.8,
|
| 20 |
}
|
| 21 |
assert hypothesis_quality(hyp, gt) > 0.8
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_line_overlap_handles_no_overlap():
|
| 25 |
+
assert line_overlap([1, 2], [5, 6]) == 0.0
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_final_diagnosis_score_bounds():
|
| 29 |
+
gt = {
|
| 30 |
+
"bug_type": "missing_zero_grad",
|
| 31 |
+
"primary_bug_file": "train.py",
|
| 32 |
+
"related_files": [],
|
| 33 |
+
"line_range": [10, 12],
|
| 34 |
+
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 35 |
+
}
|
| 36 |
+
action = {
|
| 37 |
+
"bug_type": "missing_zero_grad",
|
| 38 |
+
"affected_file": "train.py",
|
| 39 |
+
"line_range": [10, 12],
|
| 40 |
+
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 41 |
+
}
|
| 42 |
+
assert 0.0 <= final_diagnosis_score(action, gt) <= 1.0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_compute_step_reward_clamps_non_negative():
|
| 46 |
+
gt = {
|
| 47 |
+
"bug_type": "missing_zero_grad",
|
| 48 |
+
"primary_bug_file": "train.py",
|
| 49 |
+
"related_files": [],
|
| 50 |
+
"red_herring_file": "model/architecture.py",
|
| 51 |
+
"line_range": [10, 12],
|
| 52 |
+
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 53 |
+
}
|
| 54 |
+
hypothesis = {
|
| 55 |
+
"bug_type": "data_leakage",
|
| 56 |
+
"affected_file": "unknown.py",
|
| 57 |
+
"confidence": 0.1,
|
| 58 |
+
}
|
| 59 |
+
reward, components = compute_step_reward(
|
| 60 |
+
previous_quality=0.6,
|
| 61 |
+
current_hypothesis=hypothesis,
|
| 62 |
+
ground_truth=gt,
|
| 63 |
+
investigation_target="model/architecture.py",
|
| 64 |
+
committed_diagnosis=None,
|
| 65 |
+
step_num=1,
|
| 66 |
+
max_steps=5,
|
| 67 |
+
)
|
| 68 |
+
assert reward >= 0.0
|
| 69 |
+
assert components["investigation_reward"] <= 0.0
|
tests/test_scenario_generator.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from src.pytorch_debug_env.bug_library import BUG_TEMPLATES
|
| 4 |
+
from src.pytorch_debug_env.scenario_generator import ScenarioGenerator
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_generate_invalid_difficulty_raises():
|
| 8 |
+
generator = ScenarioGenerator(BUG_TEMPLATES)
|
| 9 |
+
with pytest.raises(ValueError):
|
| 10 |
+
generator.generate("unknown")
|