Priyansh Saxena committed on
Commit ·
72a7241
1
Parent(s): fb9822b
tests: added extensive testing
Browse files
Signed-off-by: Priyansh Saxena <priyena.programming@gmail.com>
- pytest.ini +2 -1
- server/app.py +4 -0
- src/pytorch_debug_env/reward.py +1 -1
- src/pytorch_debug_env/scenario_generator.py +4 -1
- tests/test_environment.py +4 -0
- tests/test_environment_edge_cases.py +98 -0
- tests/test_graders.py +60 -1
- tests/test_reward.py +54 -1
- tests/test_scenario_generator.py +10 -0
pytest.ini
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
[pytest]
|
| 2 |
-
asyncio_mode = auto
|
|
|
|
|
|
| 1 |
[pytest]
|
| 2 |
+
asyncio_mode = auto
|
| 3 |
+
asyncio_default_fixture_loop_scope = function
|
server/app.py
CHANGED
|
@@ -12,3 +12,7 @@ def main():
|
|
| 12 |
host = os.environ.get("HOST", "0.0.0.0")
|
| 13 |
port = int(os.environ.get("PORT", "7860"))
|
| 14 |
uvicorn.run(app, host=host, port=port)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
host = os.environ.get("HOST", "0.0.0.0")
|
| 13 |
port = int(os.environ.get("PORT", "7860"))
|
| 14 |
uvicorn.run(app, host=host, port=port)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
if __name__ == "__main__":
|
| 18 |
+
main()
|
src/pytorch_debug_env/reward.py
CHANGED
|
@@ -81,7 +81,7 @@ def compute_step_reward(
|
|
| 81 |
diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
|
| 82 |
|
| 83 |
total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
|
| 84 |
-
total = round(min(max(total,
|
| 85 |
|
| 86 |
return total, {
|
| 87 |
"hypothesis_quality": current_quality,
|
|
|
|
| 81 |
diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
|
| 82 |
|
| 83 |
total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
|
| 84 |
+
total = round(min(max(total, 0.0), 1.0), 4)
|
| 85 |
|
| 86 |
return total, {
|
| 87 |
"hypothesis_quality": current_quality,
|
src/pytorch_debug_env/scenario_generator.py
CHANGED
|
@@ -28,7 +28,10 @@ class ScenarioGenerator:
|
|
| 28 |
|
| 29 |
def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
|
| 30 |
rng = random.Random(seed)
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
repo_files = self._base_repo(rng)
|
| 34 |
repo_files = template.repo_mutator(repo_files, rng)
|
|
|
|
| 28 |
|
| 29 |
def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
|
| 30 |
rng = random.Random(seed)
|
| 31 |
+
candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
|
| 32 |
+
if not candidates:
|
| 33 |
+
raise ValueError(f"Unknown difficulty: {difficulty}")
|
| 34 |
+
template = rng.choice(candidates)
|
| 35 |
|
| 36 |
repo_files = self._base_repo(rng)
|
| 37 |
repo_files = template.repo_mutator(repo_files, rng)
|
tests/test_environment.py
CHANGED
|
@@ -10,3 +10,7 @@ async def test_env_reset():
|
|
| 10 |
env = PyTorchDebugEnv(generator=generator)
|
| 11 |
obs = await env.reset("easy")
|
| 12 |
assert obs.task_id == "easy"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
env = PyTorchDebugEnv(generator=generator)
|
| 11 |
obs = await env.reset("easy")
|
| 12 |
assert obs.task_id == "easy"
|
| 13 |
+
assert "train.py" in obs.revealed_files
|
| 14 |
+
assert "config/training_config.yaml" in obs.revealed_files
|
| 15 |
+
assert obs.step_num == 0
|
| 16 |
+
assert obs.steps_remaining >= 0
|
tests/test_environment_edge_cases.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from src.pytorch_debug_env.bug_library import BUG_TEMPLATES
|
| 4 |
+
from src.pytorch_debug_env.environment import PyTorchDebugEnv
|
| 5 |
+
from src.pytorch_debug_env.models import (
|
| 6 |
+
FinalDiagnosis,
|
| 7 |
+
Hypothesis,
|
| 8 |
+
InvestigationAction,
|
| 9 |
+
PyTorchDebugAction,
|
| 10 |
+
)
|
| 11 |
+
from src.pytorch_debug_env.scenario_generator import ScenarioGenerator
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def make_env():
|
| 15 |
+
generator = ScenarioGenerator(BUG_TEMPLATES)
|
| 16 |
+
return PyTorchDebugEnv(generator=generator)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def base_hypothesis():
|
| 20 |
+
return Hypothesis(
|
| 21 |
+
bug_type="missing_zero_grad",
|
| 22 |
+
affected_file="train.py",
|
| 23 |
+
confidence=0.6,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def final_diagnosis():
|
| 28 |
+
return FinalDiagnosis(
|
| 29 |
+
bug_type="missing_zero_grad",
|
| 30 |
+
affected_file="train.py",
|
| 31 |
+
line_range=[9, 14],
|
| 32 |
+
fix_strategy="Call optimizer.zero_grad() before loss.backward()",
|
| 33 |
+
confidence=0.7,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@pytest.mark.asyncio
|
| 38 |
+
async def test_state_before_reset_returns_none():
|
| 39 |
+
env = make_env()
|
| 40 |
+
assert await env.state() is None
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@pytest.mark.asyncio
|
| 44 |
+
async def test_step_without_reset_raises():
|
| 45 |
+
env = make_env()
|
| 46 |
+
action = PyTorchDebugAction(current_hypothesis=base_hypothesis())
|
| 47 |
+
with pytest.raises(RuntimeError):
|
| 48 |
+
await env.step(action)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@pytest.mark.asyncio
|
| 52 |
+
async def test_reveal_file_adds_to_observation():
|
| 53 |
+
env = make_env()
|
| 54 |
+
await env.reset("easy")
|
| 55 |
+
target = "data/dataset.py"
|
| 56 |
+
action = PyTorchDebugAction(
|
| 57 |
+
current_hypothesis=base_hypothesis(),
|
| 58 |
+
investigation_action=InvestigationAction(action="reveal_file", target=target),
|
| 59 |
+
)
|
| 60 |
+
result = await env.step(action)
|
| 61 |
+
assert target in result["observation"].revealed_files
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@pytest.mark.asyncio
|
| 65 |
+
async def test_step_after_done_raises():
|
| 66 |
+
env = make_env()
|
| 67 |
+
await env.reset("easy")
|
| 68 |
+
action = PyTorchDebugAction(
|
| 69 |
+
current_hypothesis=base_hypothesis(),
|
| 70 |
+
commit_diagnosis=True,
|
| 71 |
+
final_diagnosis=final_diagnosis(),
|
| 72 |
+
)
|
| 73 |
+
await env.step(action)
|
| 74 |
+
with pytest.raises(RuntimeError):
|
| 75 |
+
await env.step(action)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@pytest.mark.asyncio
|
| 79 |
+
async def test_reward_range_and_info_keys():
|
| 80 |
+
env = make_env()
|
| 81 |
+
await env.reset("easy")
|
| 82 |
+
action = PyTorchDebugAction(
|
| 83 |
+
current_hypothesis=base_hypothesis(),
|
| 84 |
+
investigation_action=InvestigationAction(
|
| 85 |
+
action="reveal_file",
|
| 86 |
+
target="model/attention.py",
|
| 87 |
+
),
|
| 88 |
+
)
|
| 89 |
+
result = await env.step(action)
|
| 90 |
+
assert 0.0 <= result["reward"] <= 1.0
|
| 91 |
+
for key in (
|
| 92 |
+
"hypothesis_quality",
|
| 93 |
+
"hypothesis_delta",
|
| 94 |
+
"investigation_reward",
|
| 95 |
+
"diagnosis_reward",
|
| 96 |
+
"confirmation_bonus",
|
| 97 |
+
):
|
| 98 |
+
assert key in result["info"]
|
tests/test_graders.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# tests/test_graders.py
|
| 2 |
-
from src.pytorch_debug_env.graders import grade_easy
|
| 3 |
|
| 4 |
def test_grade_easy():
|
| 5 |
gt = {
|
|
@@ -17,3 +17,62 @@ def test_grade_easy():
|
|
| 17 |
"confidence": 0.8
|
| 18 |
}
|
| 19 |
assert grade_easy(action, gt) > 0.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# tests/test_graders.py
|
| 2 |
+
from src.pytorch_debug_env.graders import grade_easy, grade_hard, grade_medium
|
| 3 |
|
| 4 |
def test_grade_easy():
|
| 5 |
gt = {
|
|
|
|
| 17 |
"confidence": 0.8
|
| 18 |
}
|
| 19 |
assert grade_easy(action, gt) > 0.8
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_grade_medium_related_file_bonus():
|
| 23 |
+
gt = {
|
| 24 |
+
"bug_type": "data_leakage",
|
| 25 |
+
"primary_bug_file": "data/dataset.py",
|
| 26 |
+
"related_files": ["data/preprocessing.py"],
|
| 27 |
+
"line_range": [4, 6],
|
| 28 |
+
"fix_strategy": "Ensure validation split is strictly separate from training",
|
| 29 |
+
}
|
| 30 |
+
action = {
|
| 31 |
+
"bug_type": "data_leakage",
|
| 32 |
+
"affected_file": "data/preprocessing.py",
|
| 33 |
+
"line_range": [1, 2],
|
| 34 |
+
"fix_strategy": "Ensure validation split is strictly separate from training",
|
| 35 |
+
"confidence": 0.6,
|
| 36 |
+
}
|
| 37 |
+
assert grade_medium(action, gt) >= grade_easy(action, gt)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_grade_hard_category_partial_credit():
|
| 41 |
+
gt = {
|
| 42 |
+
"bug_type": "missing_zero_grad",
|
| 43 |
+
"category": "optimization",
|
| 44 |
+
"primary_bug_file": "train.py",
|
| 45 |
+
"related_files": [],
|
| 46 |
+
"red_herring_file": "model/attention.py",
|
| 47 |
+
"line_range": [10, 12],
|
| 48 |
+
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 49 |
+
}
|
| 50 |
+
action = {
|
| 51 |
+
"bug_type": "wrong_loss_function",
|
| 52 |
+
"affected_file": "data/dataset.py",
|
| 53 |
+
"line_range": [1, 2],
|
| 54 |
+
"fix_strategy": "Use CrossEntropyLoss instead of MSE",
|
| 55 |
+
"confidence": 0.5,
|
| 56 |
+
}
|
| 57 |
+
assert grade_hard(action, gt) >= 0.18
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_grade_hard_penalizes_red_herring():
|
| 61 |
+
gt = {
|
| 62 |
+
"bug_type": "memory_leak",
|
| 63 |
+
"category": "resource",
|
| 64 |
+
"primary_bug_file": "data/dataset.py",
|
| 65 |
+
"related_files": ["train.py"],
|
| 66 |
+
"red_herring_file": "model/attention.py",
|
| 67 |
+
"line_range": [5, 9],
|
| 68 |
+
"fix_strategy": "Avoid holding reference to tensors in class cache",
|
| 69 |
+
}
|
| 70 |
+
action = {
|
| 71 |
+
"bug_type": "memory_leak",
|
| 72 |
+
"affected_file": "model/attention.py",
|
| 73 |
+
"line_range": [5, 9],
|
| 74 |
+
"fix_strategy": "Avoid holding reference to tensors in class cache",
|
| 75 |
+
"confidence": 0.7,
|
| 76 |
+
}
|
| 77 |
+
penalized = grade_hard(action, gt)
|
| 78 |
+
assert penalized <= 0.9
|
tests/test_reward.py
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
# tests/test_reward.py
|
| 2 |
-
from src.pytorch_debug_env.reward import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def test_hypothesis_quality_exact_match():
|
|
@@ -14,3 +19,51 @@ def test_hypothesis_quality_exact_match():
|
|
| 14 |
"confidence": 0.8,
|
| 15 |
}
|
| 16 |
assert hypothesis_quality(hyp, gt) > 0.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# tests/test_reward.py
|
| 2 |
+
from src.pytorch_debug_env.reward import (
|
| 3 |
+
compute_step_reward,
|
| 4 |
+
final_diagnosis_score,
|
| 5 |
+
hypothesis_quality,
|
| 6 |
+
line_overlap,
|
| 7 |
+
)
|
| 8 |
|
| 9 |
|
| 10 |
def test_hypothesis_quality_exact_match():
|
|
|
|
| 19 |
"confidence": 0.8,
|
| 20 |
}
|
| 21 |
assert hypothesis_quality(hyp, gt) > 0.8
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_line_overlap_handles_no_overlap():
|
| 25 |
+
assert line_overlap([1, 2], [5, 6]) == 0.0
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_final_diagnosis_score_bounds():
|
| 29 |
+
gt = {
|
| 30 |
+
"bug_type": "missing_zero_grad",
|
| 31 |
+
"primary_bug_file": "train.py",
|
| 32 |
+
"related_files": [],
|
| 33 |
+
"line_range": [10, 12],
|
| 34 |
+
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 35 |
+
}
|
| 36 |
+
action = {
|
| 37 |
+
"bug_type": "missing_zero_grad",
|
| 38 |
+
"affected_file": "train.py",
|
| 39 |
+
"line_range": [10, 12],
|
| 40 |
+
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 41 |
+
}
|
| 42 |
+
assert 0.0 <= final_diagnosis_score(action, gt) <= 1.0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_compute_step_reward_clamps_non_negative():
|
| 46 |
+
gt = {
|
| 47 |
+
"bug_type": "missing_zero_grad",
|
| 48 |
+
"primary_bug_file": "train.py",
|
| 49 |
+
"related_files": [],
|
| 50 |
+
"red_herring_file": "model/architecture.py",
|
| 51 |
+
"line_range": [10, 12],
|
| 52 |
+
"fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
|
| 53 |
+
}
|
| 54 |
+
hypothesis = {
|
| 55 |
+
"bug_type": "data_leakage",
|
| 56 |
+
"affected_file": "unknown.py",
|
| 57 |
+
"confidence": 0.1,
|
| 58 |
+
}
|
| 59 |
+
reward, components = compute_step_reward(
|
| 60 |
+
previous_quality=0.6,
|
| 61 |
+
current_hypothesis=hypothesis,
|
| 62 |
+
ground_truth=gt,
|
| 63 |
+
investigation_target="model/architecture.py",
|
| 64 |
+
committed_diagnosis=None,
|
| 65 |
+
step_num=1,
|
| 66 |
+
max_steps=5,
|
| 67 |
+
)
|
| 68 |
+
assert reward >= 0.0
|
| 69 |
+
assert components["investigation_reward"] <= 0.0
|
tests/test_scenario_generator.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from src.pytorch_debug_env.bug_library import BUG_TEMPLATES
|
| 4 |
+
from src.pytorch_debug_env.scenario_generator import ScenarioGenerator
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_generate_invalid_difficulty_raises():
|
| 8 |
+
generator = ScenarioGenerator(BUG_TEMPLATES)
|
| 9 |
+
with pytest.raises(ValueError):
|
| 10 |
+
generator.generate("unknown")
|