File size: 1,847 Bytes
85f14d3
 
 
 
b92ad01
85f14d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b92ad01
85f14d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import json
import subprocess
import tempfile
import os
import sys

def test_passes(code, func, inp, expected):
    if isinstance(inp, (list, tuple)):
        args = ', '.join(repr(x) for x in inp)
    else:
        args = repr(inp)
    
    script = f"""{code}

try:
    r = {func}({args})
    expected = {repr(expected)}
    print("PASS" if r == expected else f"FAIL: got {{r}}")
except Exception as e:
    print(f"ERROR: {{e}}")
"""
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            f.write(script)
            fname = f.name
        r = subprocess.run(
            [sys.executable, fname],
            capture_output=True, text=True, timeout=5
        )
        os.unlink(fname)
        return 'PASS' in r.stdout
    except:
        return False

# Sanity-check each tier's bug dataset: for every record, the original code
# must pass all of its test cases and the buggy code must fail at least one.
for tier in [1, 2, 3]:
    # One JSON object per non-blank line. The context manager closes the
    # file promptly, and JSON text is read as UTF-8.
    with open(f'data/bugs_tier{tier}.jsonl', encoding='utf-8') as fh:
        bugs = [json.loads(line) for line in fh if line.strip()]

    broken_original = []    # ids whose original_code fails some test case
    buggy_not_failing = []  # ids whose buggy_code passes every test case

    for b in bugs:
        # all() stops at the first failing test case of the original code.
        orig_passes = all(
            test_passes(b['original_code'], b['function_name'],
                        t['input'], t['expected_output'])
            for t in b['test_cases']
        )
        # any() stops at the first test case that the buggy code fails.
        buggy_fails_some = any(
            not test_passes(b['buggy_code'], b['function_name'],
                            t['input'], t['expected_output'])
            for t in b['test_cases']
        )

        if not orig_passes:
            broken_original.append(b['id'])
        if not buggy_fails_some:
            buggy_not_failing.append(b['id'])

    print(f'\nTier {tier}:')
    if broken_original:
        print(f'  BROKEN original_code: {broken_original}')
    if buggy_not_failing:
        print(f'  BUGGY code not failing: {buggy_not_failing}')
    if not broken_original and not buggy_not_failing:
        print('  All good!')