File size: 5,341 Bytes
85f14d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import sys

# ensure data is importable
sys.path.append(os.path.abspath('.'))

from data.generate_bugs import TIER1_BUGS, TIER2_BUGS, TIER3_BUGS

# Tier 1 fix-ups, applied in place to the entries of the imported TIER1_BUGS.

# Bugs whose test-case "input" is normalised to exactly one wrapped argument.
_TIER1_REWRAP_IDS = ("t1_003", "t1_005", "t1_006")

# Bug id -> (text searched for in "buggy_code", text it is replaced with).
_TIER1_CODE_PATCHES = {
    "t1_001": ("right = mid", "right = mid + 1"),
    "t1_026": ("return prefix", "return prefix + 'x'"),
    "t1_033": ("dp[1] = 0", "dp[1] = 9999"),
    "t1_034": (
        "dp[i] = max(dp[i-1], dp[i-2] + nums[i])",
        "dp[i] = max(dp[i-1], nums[i])",
    ),
}

for bug in TIER1_BUGS:
    bug_id = bug["id"]
    if bug_id in _TIER1_REWRAP_IDS:
        for case in bug["test_cases"]:
            args = case["input"]
            # Peel off every single-element list layer that wraps another
            # list, then wrap the result once, so "input" always ends up as
            # a one-item list.
            while len(args) == 1 and isinstance(args[0], list):
                args = args[0]
            case["input"] = [args]
    elif bug_id in _TIER1_CODE_PATCHES:
        old, new = _TIER1_CODE_PATCHES[bug_id]
        code = bug["buggy_code"]
        # This script rewrites the module it loads its data from, so it can
        # be run again on already-patched data. Some replacements contain
        # their own search text ("right = mid" is a prefix of
        # "right = mid + 1"); without this guard a second run would stack
        # the patch ("right = mid + 1 + 1").
        if new not in code:
            bug["buggy_code"] = code.replace(old, new)

# Tier 2 fix-ups, applied in place to the entries of the imported TIER2_BUGS.
for bug in TIER2_BUGS:
    bug_id = bug["id"]
    if bug_id == "t2_003":
        for case in bug["test_cases"]:
            args = case["input"]
            # Strip redundant wrappers: a one-item list whose only item is a
            # two-item list is replaced by that inner list.
            while len(args) == 1 and isinstance(args[0], list) and len(args[0]) == 2:
                args = args[0]
            # Anything with two or more items is wrapped exactly once. A
            # single check suffices: the former "len == 2 and not a list of
            # lists" special case performed the same wrapping and is a
            # subset of this condition.
            if len(args) >= 2:
                args = [args]
            case["input"] = args
    elif bug_id == "t2_026":
        # Switch the tuple returns in both code strings to list returns and
        # convert tuple expected outputs to lists to match.
        # NOTE(review): presumably needed because the dataset is written as
        # JSON, which has no tuple type - confirm.
        bug["original_code"] = bug["original_code"].replace("return (mn, mx)", "return [mn, mx]")
        bug["buggy_code"] = bug["buggy_code"].replace("return (mn, mn)", "return [mn, mn]")
        for case in bug["test_cases"]:
            if isinstance(case["expected_output"], tuple):
                case["expected_output"] = list(case["expected_output"])
    elif bug_id == "t2_029":
        # Overwrite the expected output of the one test case whose input is
        # exactly [[1, 2, 3]].
        for case in bug["test_cases"]:
            if case["input"] == [[1, 2, 3]]:
                case["expected_output"] = [2, 3, 1]

# Tier 3 fix-ups, applied in place to the entries of the imported TIER3_BUGS.

# Bug id -> (text searched for in "buggy_code", text it is replaced with).
_TIER3_CODE_PATCHES = {
    "t3_007": ("calc_area(height, width)", "calc_area(height, height)"),
    "t3_009": ("seen = set()", "seen = {1}"),
    "t3_010": ("total = 0", "total = 10"),
    "t3_011": ("def helper(val, lst=[]):", "def helper(val, lst=[1]):"),
    "t3_012": ("calls = 0", "calls = 5"),
    "t3_013": ("words_cache = []", "words_cache = ['ERROR']"),
    "t3_014": ("errors = []", "errors = ['fatal']"),
}

for bug in TIER3_BUGS:
    bug_id = bug["id"]
    if bug_id == "t3_002":
        for case in bug["test_cases"]:
            args = case["input"]
            # Unwrap every single-element list layer that holds another list...
            while len(args) == 1 and isinstance(args[0], list):
                args = args[0]
            # ...then wrap the result exactly once.
            case["input"] = [args]
        continue

    patch = _TIER3_CODE_PATCHES.get(bug_id)
    if patch is not None:
        needle, replacement = patch
        bug["buggy_code"] = bug["buggy_code"].replace(needle, replacement)

import pprint

def dump_var(f, name, val):
    """Write ``name = <pretty-printed val>`` plus a trailing blank line to *f*.

    Args:
        f: Writable text stream; only its ``write`` method is used.
        name: Identifier placed on the left-hand side of the assignment.
        val: Object rendered with ``pprint.pformat``, keeping dict insertion
            order and wrapping at 120 columns.
    """
    rendered = pprint.pformat(val, sort_dicts=False, width=120)
    f.write(f"{name} = {rendered}\n\n")

# Regenerate data/generate_bugs.py from the patched in-memory bug lists.
#
# That module is also where this script loaded its data from, so it must not
# be truncated before the new content is complete. The text is staged in a
# sibling temp file and moved over the real module with os.replace only after
# every write has succeeded; a failure part-way through leaves the existing
# module untouched.
_target_path = "data/generate_bugs.py"
_staging_path = _target_path + ".tmp"
try:
    with open(_staging_path, "w", encoding="utf-8") as f:
        # Module docstring and imports of the generated file.
        f.write('"""\nAgentDebuggerEnv - Bug Dataset Generator\n\n')
        f.write('Generates three tiers of buggy Python functions for curriculum learning:\n')
        f.write('  Tier 1 (easy):   Off-by-one errors, wrong operators, simple logic inversions\n')
        f.write('  Tier 2 (medium): Incorrect algorithm logic, wrong variable references, subtle type errors\n')
        f.write('  Tier 3 (hard):   Multi-bug interactions, concurrency, edge-case-only failures\n\n')
        f.write('Usage:\n  python data/generate_bugs.py\n\n')
        f.write('Outputs:\n  data/bugs_tier1.jsonl  (~40 bugs)\n  data/bugs_tier2.jsonl  (~30 bugs)\n  data/bugs_tier3.jsonl  (~20 bugs)\n"""\n\n')
        f.write('import json\nimport os\n\n')

        # The three (already patched) bug lists as Python literals.
        dump_var(f, 'TIER1_BUGS', TIER1_BUGS)
        dump_var(f, 'TIER2_BUGS', TIER2_BUGS)
        dump_var(f, 'TIER3_BUGS', TIER3_BUGS)

        # JSONL writer and the __main__ entry point of the generated file.
        f.write('def write_jsonl(bugs: list, path: str):\n')
        f.write('    with open(path, "w") as f:\n')
        f.write('        for bug in bugs:\n')
        f.write('            f.write(json.dumps(bug) + "\\n")\n\n')
        f.write('if __name__ == "__main__":\n')
        f.write('    os.makedirs("data", exist_ok=True)\n')
        f.write('    write_jsonl(TIER1_BUGS, "data/bugs_tier1.jsonl")\n')
        f.write('    write_jsonl(TIER2_BUGS, "data/bugs_tier2.jsonl")\n')
        f.write('    write_jsonl(TIER3_BUGS, "data/bugs_tier3.jsonl")\n')
        f.write('    print(f"Tier 1: {len(TIER1_BUGS)}, Tier 2: {len(TIER2_BUGS)}, Tier 3: {len(TIER3_BUGS)}")\n')
        f.write('    print("\\nDone. Run training/train_grpo.py to start training.")\n')
    # Reached only when the staged file was written and closed cleanly.
    os.replace(_staging_path, _target_path)
finally:
    # After a successful replace the staging file no longer exists, so this
    # only removes a partial file left behind by a failure.
    if os.path.exists(_staging_path):
        os.remove(_staging_path)

print("Fix applied successfully.")