File size: 4,756 Bytes
46011b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c6af55
 
 
46011b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c6af55
46011b7
1c6af55
 
 
 
 
46011b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26d82a2
46011b7
26d82a2
 
46011b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import json, pathlib, time, os

level1_path = pathlib.Path('src/deceit_env/data/level1.jsonl')
level2_path = pathlib.Path('src/deceit_env/data/level2.jsonl')


def _read_jsonl(path):
    """Return the JSON objects stored one per line in *path*.

    Blank (or whitespace-only) lines are skipped. The file is decoded as
    UTF-8 explicitly so the result does not depend on the platform's
    default locale encoding.
    """
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in map(str.strip, f) if line]


# Load the level1 questions.
rows = _read_jsonl(level1_path)

# Load whatever is already in level2 (empty when the file does not exist).
level2_rows = _read_jsonl(level2_path) if level2_path.exists() else []

# Ids already present in level2.
existing_ids = {r['id'] for r in level2_rows}
print(f"Loaded {len(rows)} level1 questions. Already have {len(existing_ids)} in level2.")

# Static distractor pairs keyed by question category. They need no API call;
# the 'general' entry doubles as the default for categories not listed here.
fallback_map = {
    "geography": [
        "The largest city in the country serves as its administrative capital.",
        "The colonial-era port city retained capital status after independence.",
    ],
    "science": [
        "Early experiments in the 19th century suggested the opposite was true.",
        "Recent studies have challenged this long-held scientific consensus.",
    ],
    "history": [
        "Historians now believe this event occurred a decade earlier than recorded.",
        "Primary sources from the era suggest a different outcome was reached.",
    ],
    "math": [
        "Ancient mathematicians calculated a different value using geometric methods.",
        "Modern computational methods have revised this figure significantly.",
    ],
    "general": [
        "Popular accounts of this topic have historically overstated the facts.",
        "The commonly cited figure was revised following more recent research.",
    ],
}

def get_distractors_api(client, question, ground_truth):
    """Ask the chat model for two plausible-sounding but false statements.

    Args:
        client: Object exposing ``chat.completions.create(...)``; the script
            passes an ``openai.OpenAI`` instance.
        question: Question text interpolated into the prompt.
        ground_truth: Correct answer interpolated into the prompt.

    Returns:
        A list of exactly two strings.

    Raises:
        ValueError: If the reply is empty, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), or does
            not decode to a two-element array.
        Exception: Anything raised by ``client`` itself propagates unchanged.
    """
    prompt = (
        "Generate 2 plausible-sounding but FALSE statements about this fact. "
        "Sound authoritative but be wrong. "
        f"Fact: {question} Answer: {ground_truth}. "
        "Return ONLY a JSON array of 2 strings."
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=200,
        temperature=0.9,
    )
    content = response.choices[0].message.content
    if not content:
        # The content field may be missing/None; report it as a format
        # problem instead of failing with AttributeError on .strip().
        raise ValueError("Bad format: empty response")
    raw = content.strip()

    # Models frequently wrap the array in a Markdown code fence or add a short
    # preamble despite the instruction. Parse only the outermost [...] span
    # when one is present; otherwise parse the reply as-is.
    start, end = raw.find("["), raw.rfind("]")
    payload = raw[start:end + 1] if 0 <= start < end else raw

    result = json.loads(payload)
    if isinstance(result, list) and len(result) == 2:
        return [str(r) for r in result]
    raise ValueError(f"Bad format: {raw}")

# Build an OpenAI client when the SDK imports cleanly and a real key is set;
# otherwise both flags stay off.
client = None
api_available = False
try:
    from openai import OpenAI

    api_key = os.environ.get("OPENAI_API_KEY", "")
    # An unset key and the template placeholder both count as "no key".
    if api_key not in ("", "your-openai-key-here"):
        client = OpenAI(api_key=api_key)
        api_available = True
        print("OpenAI client ready — will try API first, fallback to static on rate limit")
except Exception as exc:
    print(f"OpenAI not available: {exc} — using static fallback for all")

def _save_level2(path, records):
    """Write *records* to *path* as JSON Lines via a temp file plus rename.

    Writing to a sibling temp file and then replacing the target means an
    interruption mid-write cannot leave a truncated checkpoint behind.
    """
    tmp_path = path.with_name(path.name + '.tmp')
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(r) + '\n' for r in records)
    tmp_path.replace(path)


new_count = 0
fallback_count = 0

for row in rows:
    # Skip questions that already have a level2 entry.
    if row['id'] in existing_ids:
        continue

    category = row.get('category', 'general')
    distractors = None
    called_api = False

    # Try the API once; any failure drops straight to the static fallback
    # (no retry), so a broad except is intentional here.
    if api_available and client:
        called_api = True
        try:
            distractors = get_distractors_api(client, row['question'], row['ground_truth'])
        except Exception as e:
            if "429" in str(e) or "rate" in str(e).lower():
                print(f"  Rate limit on {row['id']}, using fallback...")
            else:
                print(f"  API error on {row['id']}: {e} — using fallback")

    # Fallback to static distractors for the category ('general' if unknown).
    if distractors is None:
        # Copy so rows do not share one list object with fallback_map.
        distractors = list(fallback_map.get(category, fallback_map['general']))
        fallback_count += 1

    level2_rows.append({
        'id': row['id'],
        'question': row['question'],
        'ground_truth': row['ground_truth'],
        'category': category,
        'distractors': distractors,
    })
    existing_ids.add(row['id'])
    new_count += 1

    # Checkpoint every 10 new entries.
    if new_count % 10 == 0:
        _save_level2(level2_path, level2_rows)
        print(f"  Saved {new_count} new entries ({fallback_count} used fallback)")

    # Pace requests only when one was actually sent; static-only rows need
    # no delay.
    if called_api:
        time.sleep(0.5)

# Final save
_save_level2(level2_path, level2_rows)

print("\nDone!")
print(f"  Total in level2.jsonl: {len(level2_rows)}")
print(f"  New this run: {new_count}")
print(f"  Used API: {new_count - fallback_count}")
print(f"  Used fallback: {fallback_count}")