Priyansh Saxena committed on
Commit
72a7241
·
1 Parent(s): fb9822b

tests: added extensive testing

Browse files

Signed-off-by: Priyansh Saxena <priyena.programming@gmail.com>

pytest.ini CHANGED
@@ -1,2 +1,3 @@
1
  [pytest]
2
- asyncio_mode = auto
 
 
1
  [pytest]
2
+ asyncio_mode = auto
3
+ asyncio_default_fixture_loop_scope = function
server/app.py CHANGED
@@ -12,3 +12,7 @@ def main():
12
  host = os.environ.get("HOST", "0.0.0.0")
13
  port = int(os.environ.get("PORT", "7860"))
14
  uvicorn.run(app, host=host, port=port)
 
 
 
 
 
12
  host = os.environ.get("HOST", "0.0.0.0")
13
  port = int(os.environ.get("PORT", "7860"))
14
  uvicorn.run(app, host=host, port=port)
15
+
16
+
17
+ if __name__ == "__main__":
18
+ main()
src/pytorch_debug_env/reward.py CHANGED
@@ -81,7 +81,7 @@ def compute_step_reward(
81
  diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
82
 
83
  total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
84
- total = round(min(max(total, -0.2), 1.0), 4)
85
 
86
  return total, {
87
  "hypothesis_quality": current_quality,
 
81
  diagnosis_reward += max(0.0, 0.08 * (max_steps - step_num))
82
 
83
  total = 0.60 * delta + 0.20 * investigation_reward + 0.20 * diagnosis_reward + confirmation_bonus
84
+ total = round(min(max(total, 0.0), 1.0), 4)
85
 
86
  return total, {
87
  "hypothesis_quality": current_quality,
src/pytorch_debug_env/scenario_generator.py CHANGED
@@ -28,7 +28,10 @@ class ScenarioGenerator:
28
 
29
  def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
30
  rng = random.Random(seed)
31
- template = rng.choice([b for b in self.bug_templates if b.difficulty == difficulty])
 
 
 
32
 
33
  repo_files = self._base_repo(rng)
34
  repo_files = template.repo_mutator(repo_files, rng)
 
28
 
29
  def generate(self, difficulty: str, seed: int | None = None) -> Scenario:
30
  rng = random.Random(seed)
31
+ candidates = [b for b in self.bug_templates if b.difficulty == difficulty]
32
+ if not candidates:
33
+ raise ValueError(f"Unknown difficulty: {difficulty}")
34
+ template = rng.choice(candidates)
35
 
36
  repo_files = self._base_repo(rng)
37
  repo_files = template.repo_mutator(repo_files, rng)
tests/test_environment.py CHANGED
@@ -10,3 +10,7 @@ async def test_env_reset():
10
  env = PyTorchDebugEnv(generator=generator)
11
  obs = await env.reset("easy")
12
  assert obs.task_id == "easy"
 
 
 
 
 
10
  env = PyTorchDebugEnv(generator=generator)
11
  obs = await env.reset("easy")
12
  assert obs.task_id == "easy"
13
+ assert "train.py" in obs.revealed_files
14
+ assert "config/training_config.yaml" in obs.revealed_files
15
+ assert obs.step_num == 0
16
+ assert obs.steps_remaining >= 0
tests/test_environment_edge_cases.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from src.pytorch_debug_env.bug_library import BUG_TEMPLATES
4
+ from src.pytorch_debug_env.environment import PyTorchDebugEnv
5
+ from src.pytorch_debug_env.models import (
6
+ FinalDiagnosis,
7
+ Hypothesis,
8
+ InvestigationAction,
9
+ PyTorchDebugAction,
10
+ )
11
+ from src.pytorch_debug_env.scenario_generator import ScenarioGenerator
12
+
13
+
14
+ def make_env():
15
+ generator = ScenarioGenerator(BUG_TEMPLATES)
16
+ return PyTorchDebugEnv(generator=generator)
17
+
18
+
19
+ def base_hypothesis():
20
+ return Hypothesis(
21
+ bug_type="missing_zero_grad",
22
+ affected_file="train.py",
23
+ confidence=0.6,
24
+ )
25
+
26
+
27
+ def final_diagnosis():
28
+ return FinalDiagnosis(
29
+ bug_type="missing_zero_grad",
30
+ affected_file="train.py",
31
+ line_range=[9, 14],
32
+ fix_strategy="Call optimizer.zero_grad() before loss.backward()",
33
+ confidence=0.7,
34
+ )
35
+
36
+
37
+ @pytest.mark.asyncio
38
+ async def test_state_before_reset_returns_none():
39
+ env = make_env()
40
+ assert await env.state() is None
41
+
42
+
43
+ @pytest.mark.asyncio
44
+ async def test_step_without_reset_raises():
45
+ env = make_env()
46
+ action = PyTorchDebugAction(current_hypothesis=base_hypothesis())
47
+ with pytest.raises(RuntimeError):
48
+ await env.step(action)
49
+
50
+
51
+ @pytest.mark.asyncio
52
+ async def test_reveal_file_adds_to_observation():
53
+ env = make_env()
54
+ await env.reset("easy")
55
+ target = "data/dataset.py"
56
+ action = PyTorchDebugAction(
57
+ current_hypothesis=base_hypothesis(),
58
+ investigation_action=InvestigationAction(action="reveal_file", target=target),
59
+ )
60
+ result = await env.step(action)
61
+ assert target in result["observation"].revealed_files
62
+
63
+
64
+ @pytest.mark.asyncio
65
+ async def test_step_after_done_raises():
66
+ env = make_env()
67
+ await env.reset("easy")
68
+ action = PyTorchDebugAction(
69
+ current_hypothesis=base_hypothesis(),
70
+ commit_diagnosis=True,
71
+ final_diagnosis=final_diagnosis(),
72
+ )
73
+ await env.step(action)
74
+ with pytest.raises(RuntimeError):
75
+ await env.step(action)
76
+
77
+
78
+ @pytest.mark.asyncio
79
+ async def test_reward_range_and_info_keys():
80
+ env = make_env()
81
+ await env.reset("easy")
82
+ action = PyTorchDebugAction(
83
+ current_hypothesis=base_hypothesis(),
84
+ investigation_action=InvestigationAction(
85
+ action="reveal_file",
86
+ target="model/attention.py",
87
+ ),
88
+ )
89
+ result = await env.step(action)
90
+ assert 0.0 <= result["reward"] <= 1.0
91
+ for key in (
92
+ "hypothesis_quality",
93
+ "hypothesis_delta",
94
+ "investigation_reward",
95
+ "diagnosis_reward",
96
+ "confirmation_bonus",
97
+ ):
98
+ assert key in result["info"]
tests/test_graders.py CHANGED
@@ -1,5 +1,5 @@
1
  # tests/test_graders.py
2
- from src.pytorch_debug_env.graders import grade_easy
3
 
4
  def test_grade_easy():
5
  gt = {
@@ -17,3 +17,62 @@ def test_grade_easy():
17
  "confidence": 0.8
18
  }
19
  assert grade_easy(action, gt) > 0.8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # tests/test_graders.py
2
+ from src.pytorch_debug_env.graders import grade_easy, grade_hard, grade_medium
3
 
4
  def test_grade_easy():
5
  gt = {
 
17
  "confidence": 0.8
18
  }
19
  assert grade_easy(action, gt) > 0.8
20
+
21
+
22
+ def test_grade_medium_related_file_bonus():
23
+ gt = {
24
+ "bug_type": "data_leakage",
25
+ "primary_bug_file": "data/dataset.py",
26
+ "related_files": ["data/preprocessing.py"],
27
+ "line_range": [4, 6],
28
+ "fix_strategy": "Ensure validation split is strictly separate from training",
29
+ }
30
+ action = {
31
+ "bug_type": "data_leakage",
32
+ "affected_file": "data/preprocessing.py",
33
+ "line_range": [1, 2],
34
+ "fix_strategy": "Ensure validation split is strictly separate from training",
35
+ "confidence": 0.6,
36
+ }
37
+ assert grade_medium(action, gt) >= grade_easy(action, gt)
38
+
39
+
40
+ def test_grade_hard_category_partial_credit():
41
+ gt = {
42
+ "bug_type": "missing_zero_grad",
43
+ "category": "optimization",
44
+ "primary_bug_file": "train.py",
45
+ "related_files": [],
46
+ "red_herring_file": "model/attention.py",
47
+ "line_range": [10, 12],
48
+ "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
49
+ }
50
+ action = {
51
+ "bug_type": "wrong_loss_function",
52
+ "affected_file": "data/dataset.py",
53
+ "line_range": [1, 2],
54
+ "fix_strategy": "Use CrossEntropyLoss instead of MSE",
55
+ "confidence": 0.5,
56
+ }
57
+ assert grade_hard(action, gt) >= 0.18
58
+
59
+
60
+ def test_grade_hard_penalizes_red_herring():
61
+ gt = {
62
+ "bug_type": "memory_leak",
63
+ "category": "resource",
64
+ "primary_bug_file": "data/dataset.py",
65
+ "related_files": ["train.py"],
66
+ "red_herring_file": "model/attention.py",
67
+ "line_range": [5, 9],
68
+ "fix_strategy": "Avoid holding reference to tensors in class cache",
69
+ }
70
+ action = {
71
+ "bug_type": "memory_leak",
72
+ "affected_file": "model/attention.py",
73
+ "line_range": [5, 9],
74
+ "fix_strategy": "Avoid holding reference to tensors in class cache",
75
+ "confidence": 0.7,
76
+ }
77
+ penalized = grade_hard(action, gt)
78
+ assert penalized <= 0.9
tests/test_reward.py CHANGED
@@ -1,5 +1,10 @@
1
  # tests/test_reward.py
2
- from src.pytorch_debug_env.reward import hypothesis_quality
 
 
 
 
 
3
 
4
 
5
  def test_hypothesis_quality_exact_match():
@@ -14,3 +19,51 @@ def test_hypothesis_quality_exact_match():
14
  "confidence": 0.8,
15
  }
16
  assert hypothesis_quality(hyp, gt) > 0.8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # tests/test_reward.py
2
+ from src.pytorch_debug_env.reward import (
3
+ compute_step_reward,
4
+ final_diagnosis_score,
5
+ hypothesis_quality,
6
+ line_overlap,
7
+ )
8
 
9
 
10
  def test_hypothesis_quality_exact_match():
 
19
  "confidence": 0.8,
20
  }
21
  assert hypothesis_quality(hyp, gt) > 0.8
22
+
23
+
24
+ def test_line_overlap_handles_no_overlap():
25
+ assert line_overlap([1, 2], [5, 6]) == 0.0
26
+
27
+
28
+ def test_final_diagnosis_score_bounds():
29
+ gt = {
30
+ "bug_type": "missing_zero_grad",
31
+ "primary_bug_file": "train.py",
32
+ "related_files": [],
33
+ "line_range": [10, 12],
34
+ "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
35
+ }
36
+ action = {
37
+ "bug_type": "missing_zero_grad",
38
+ "affected_file": "train.py",
39
+ "line_range": [10, 12],
40
+ "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
41
+ }
42
+ assert 0.0 <= final_diagnosis_score(action, gt) <= 1.0
43
+
44
+
45
+ def test_compute_step_reward_clamps_non_negative():
46
+ gt = {
47
+ "bug_type": "missing_zero_grad",
48
+ "primary_bug_file": "train.py",
49
+ "related_files": [],
50
+ "red_herring_file": "model/architecture.py",
51
+ "line_range": [10, 12],
52
+ "fix_strategy": "Call optimizer.zero_grad() before loss.backward()",
53
+ }
54
+ hypothesis = {
55
+ "bug_type": "data_leakage",
56
+ "affected_file": "unknown.py",
57
+ "confidence": 0.1,
58
+ }
59
+ reward, components = compute_step_reward(
60
+ previous_quality=0.6,
61
+ current_hypothesis=hypothesis,
62
+ ground_truth=gt,
63
+ investigation_target="model/architecture.py",
64
+ committed_diagnosis=None,
65
+ step_num=1,
66
+ max_steps=5,
67
+ )
68
+ assert reward >= 0.0
69
+ assert components["investigation_reward"] <= 0.0
tests/test_scenario_generator.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from src.pytorch_debug_env.bug_library import BUG_TEMPLATES
4
+ from src.pytorch_debug_env.scenario_generator import ScenarioGenerator
5
+
6
+
7
+ def test_generate_invalid_difficulty_raises():
8
+ generator = ScenarioGenerator(BUG_TEMPLATES)
9
+ with pytest.raises(ValueError):
10
+ generator.generate("unknown")