rogermt committed on
Commit
d979316
·
verified ·
1 Parent(s): 4db7aa4

Add Jupyter notebook for LLM synthesis (NVIDIA NIM / GLM 4.7 / DeepSeek V4)

Browse files
Files changed (1) hide show
  1. notebooks/pemf_llm_solver.ipynb +490 -0
notebooks/pemf_llm_solver.ipynb ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# PEMF ARC-AGI — LLM Program Synthesis\n",
8
+ "\n",
9
+ "Uses the NVIDIA NIM API (free) with GLM 4.7 (default) or DeepSeek V4 (alternative) to solve ARC tasks.\n",
10
+ "\n",
11
+ "**Pipeline:** For each unsolved task → build prompt → LLM generates a Python `transform()` function → keep the first candidate that reproduces ALL training pairs → apply it to the test inputs.\n",
12
+ "\n",
13
+ "**Prerequisites:**\n",
14
+ "- NVIDIA NIM API key from https://build.nvidia.com/settings/api-keys\n",
15
+ "- Internet access enabled"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "markdown",
20
+ "metadata": {},
21
+ "source": [
22
+ "## 1. Setup"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "# ============================================================\n",
32
+ "# CONFIGURATION — EDIT THESE\n",
33
+ "# ============================================================\n",
34
+ "\n",
35
+ "NVIDIA_API_KEY = \"nvapi-YOUR-KEY-HERE\" # Get from https://build.nvidia.com/settings/api-keys\n",
36
+ "\n",
37
+ "MODEL = \"z-ai/glm4.7\" # Default: GLM 4.7\n",
38
+ "# MODEL = \"deepseek-ai/deepseek-v4-pro\" # Alternative: DeepSeek V4\n",
39
+ "\n",
40
+ "N_CANDIDATES = 8 # Candidates per task (more = better but slower)\n",
41
+ "RATE_LIMIT_SLEEP = 2 # Seconds between API calls"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "# Download ARC dataset\n",
51
+ "import os, subprocess\n",
52
+ "\n",
53
+ "if not os.path.exists('arc_data/training'):\n",
54
+ " print('Downloading ARC dataset...')\n",
55
+ " subprocess.run(['git', 'clone', '--depth', '1', 'https://github.com/fchollet/ARC-AGI.git', '/tmp/arc'], \n",
56
+ " capture_output=True)\n",
57
+ " os.makedirs('arc_data', exist_ok=True)\n",
58
+ " subprocess.run(['cp', '-r', '/tmp/arc/data/training', 'arc_data/training'], capture_output=True)\n",
59
+ " print(f'Downloaded {len(os.listdir(\"arc_data/training\"))} tasks')\n",
60
+ "else:\n",
61
+ " print(f'ARC data already present: {len(os.listdir(\"arc_data/training\"))} tasks')"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "# Already solved by symbolic pipeline (70 tasks)\n",
71
+ "ALREADY_SOLVED = {\n",
72
+ " \"007bbfb7\",\"00d62c1b\",\"0d3d703e\",\"1190e5a7\",\"1cf80156\",\"1e0a9b12\",\"1f85a75f\",\n",
73
+ " \"2013d3e2\",\"22168020\",\"22eb0ac0\",\"239be575\",\"23b5c85d\",\"28bf18c6\",\"2dee498d\",\n",
74
+ " \"3618c87e\",\"3906de3d\",\"3aa6fb7a\",\"3af2c5a8\",\"3c9b0459\",\"42a50994\",\"4347f46a\",\n",
75
+ " \"50cb2852\",\"6150a2bd\",\"62c24649\",\"67385a82\",\"67a3c6ac\",\"67e8384a\",\"68b16354\",\n",
76
+ " \"6d0aefbc\",\"6f8cd79b\",\"6fa7a44f\",\"746b3537\",\"74dd1130\",\"7b7f7511\",\"7e0986d6\",\n",
77
+ " \"7f4411dc\",\"868de0fa\",\"8be77c9e\",\"8d5021e8\",\"91714a58\",\"9172f3a0\",\"9565186b\",\n",
78
+ " \"9dfd6313\",\"a416b8f3\",\"a5313dff\",\"a699fb00\",\"aabf363d\",\"aedd82e4\",\"b1948b0a\",\n",
79
+ " \"b6afb2da\",\"ba97ae07\",\"bb43febb\",\"bda2d7a6\",\"be94b721\",\"c0f76784\",\"c59eb873\",\n",
80
+ " \"c8f0f002\",\"c9e6f938\",\"d10ecb37\",\"d23f8c26\",\"d511f180\",\"d631b094\",\"d90796e8\",\n",
81
+ " \"d9fac9be\",\"de1cd16c\",\"ded97339\",\"e26a3af2\",\"eb5a1d5d\",\"ed36ccf7\",\"f76d97a5\",\n",
82
+ "}\n",
83
+ "print(f'Already solved by symbolic pipeline: {len(ALREADY_SOLVED)} tasks')"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "metadata": {},
89
+ "source": [
90
+ "## 2. LLM Engine"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "import json\n",
100
+ "import time\n",
101
+ "import re\n",
102
+ "import glob\n",
103
+ "import numpy as np\n",
104
+ "import urllib.request\n",
105
+ "from collections import Counter\n",
106
+ "\n",
107
+ "\n",
108
+ "def call_nvidia(prompt, api_key, model=\"z-ai/glm4.7\", temperature=0.7):\n",
109
+ " \"\"\"Call NVIDIA NIM API.\"\"\"\n",
110
+ " url = \"https://integrate.api.nvidia.com/v1/chat/completions\"\n",
111
+ " payload = {\n",
112
+ " \"model\": model,\n",
113
+ " \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
114
+ " \"max_tokens\": 2048,\n",
115
+ " \"temperature\": temperature,\n",
116
+ " }\n",
117
+ " data = json.dumps(payload).encode('utf-8')\n",
118
+ " req = urllib.request.Request(url, data=data,\n",
119
+ " headers={\"Content-Type\": \"application/json\",\n",
120
+ " \"Authorization\": f\"Bearer {api_key}\"},\n",
121
+ " method='POST')\n",
122
+ " try:\n",
123
+ " with urllib.request.urlopen(req, timeout=120) as resp:\n",
124
+ " result = json.loads(resp.read().decode())\n",
125
+ " return result['choices'][0]['message']['content']\n",
126
+ " except Exception as e:\n",
127
+ " return f\"ERROR: {e}\"\n",
128
+ "\n",
129
+ "\n",
130
+ "def build_prompt(task):\n",
131
+ " \"\"\"Build prompt for ARC task.\"\"\"\n",
132
+ " train_pairs = task.get('train', [])\n",
133
+ " examples = []\n",
134
+ " for i, pair in enumerate(train_pairs):\n",
135
+ " examples.append(\n",
136
+ " f\"Example {i+1}:\\n\"\n",
137
+ " f\" Input: {json.dumps(pair['input'])}\\n\"\n",
138
+ " f\" Output: {json.dumps(pair['output'])}\"\n",
139
+ " )\n",
140
+ " examples_str = \"\\n\".join(examples)\n",
141
+ "\n",
142
+ " inputs = [np.array(p['input']) for p in train_pairs]\n",
143
+ " outputs = [np.array(p['output']) for p in train_pairs]\n",
144
+ " same_shape = all(i.shape == o.shape for i, o in zip(inputs, outputs))\n",
145
+ " in_colors = sorted(set(c for i in inputs for c in np.unique(i).tolist()))\n",
146
+ " out_colors = sorted(set(c for o in outputs for c in np.unique(o).tolist()))\n",
147
+ "\n",
148
+ " analysis = f\" Same input/output shape: {same_shape}\\n\"\n",
149
+ " analysis += f\" Input colors: {in_colors}, Output colors: {out_colors}\\n\"\n",
150
+ " if not same_shape:\n",
151
+ " for i, o in zip(inputs[:1], outputs[:1]):\n",
152
+ " analysis += f\" Shape: {i.shape} -> {o.shape}\\n\"\n",
153
+ "\n",
154
+ " return f\"\"\"Solve this ARC-AGI puzzle. Write ONLY a Python function, no explanations.\n",
155
+ "\n",
156
+ "{examples_str}\n",
157
+ "\n",
158
+ "Analysis:\n",
159
+ "{analysis}\n",
160
+ "```python\n",
161
+ "import numpy as np\n",
162
+ "from collections import Counter, deque\n",
163
+ "\n",
164
+ "def transform(grid: list[list[int]]) -> list[list[int]]:\n",
165
+ " grid = np.array(grid)\n",
166
+ "\"\"\"\n",
167
+ "\n",
168
+ "\n",
169
+ "def extract_code(response):\n",
170
+ " \"\"\"Extract Python function from LLM response.\"\"\"\n",
171
+ " for pattern in [r'```python\\s*(.*?)```', r'```\\s*(.*?)```']:\n",
172
+ " matches = re.findall(pattern, response, re.DOTALL)\n",
173
+ " for match in matches:\n",
174
+ " if 'def transform' in match:\n",
175
+ " return match.strip()\n",
176
+ " idx = response.find('def transform')\n",
177
+ " if idx >= 0:\n",
178
+ " before = response[:idx]\n",
179
+ " import_start = max(before.rfind('import '), before.rfind('from '))\n",
180
+ " start = import_start if import_start >= 0 else idx\n",
181
+ " code = response[start:]\n",
182
+ " end = code.find('```')\n",
183
+ " if end > 0:\n",
184
+ " code = code[:end]\n",
185
+ " return code.strip()\n",
186
+ " stripped = response.strip()\n",
187
+ " if stripped.startswith(('import', 'def transform', 'from')):\n",
188
+ " return stripped\n",
189
+ " return None\n",
190
+ "\n",
191
+ "\n",
192
+ "def verify_program(code, train_pairs):\n",
193
+ " \"\"\"Execute program and verify against all training pairs.\"\"\"\n",
194
+ " namespace = {'np': np, 'numpy': np, 'Counter': Counter,\n",
195
+ " 'deque': __import__('collections').deque}\n",
196
+ " try:\n",
197
+ " import scipy.ndimage\n",
198
+ " namespace['scipy'] = __import__('scipy')\n",
199
+ " except ImportError:\n",
200
+ " pass\n",
201
+ " try:\n",
202
+ " exec(code, namespace)\n",
203
+ " except Exception:\n",
204
+ " return False\n",
205
+ " if 'transform' not in namespace:\n",
206
+ " return False\n",
207
+ " fn = namespace['transform']\n",
208
+ " for pair in train_pairs:\n",
209
+ " try:\n",
210
+ " result = fn([row[:] for row in pair['input']])\n",
211
+ " if result is None:\n",
212
+ " return False\n",
213
+ " r = np.array(result, dtype=int)\n",
214
+ " e = np.array(pair['output'], dtype=int)\n",
215
+ " if r.shape != e.shape or not np.array_equal(r, e):\n",
216
+ " return False\n",
217
+ " except Exception:\n",
218
+ " return False\n",
219
+ " return True\n",
220
+ "\n",
221
+ "\n",
222
+ "def apply_program(code, test_input):\n",
223
+ " \"\"\"Apply verified program to test input.\"\"\"\n",
224
+ " namespace = {'np': np, 'numpy': np, 'Counter': Counter,\n",
225
+ " 'deque': __import__('collections').deque}\n",
226
+ " try:\n",
227
+ " import scipy.ndimage\n",
228
+ " namespace['scipy'] = __import__('scipy')\n",
229
+ " except ImportError:\n",
230
+ " pass\n",
231
+ " try:\n",
232
+ " exec(code, namespace)\n",
233
+ " result = namespace['transform']([row[:] for row in test_input])\n",
234
+ " if result is not None:\n",
235
+ " return np.array(result, dtype=int).tolist()\n",
236
+ " except Exception:\n",
237
+ " pass\n",
238
+ " return None\n",
239
+ "\n",
240
+ "\n",
241
+ "print('LLM engine ready.')"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "markdown",
246
+ "metadata": {},
247
+ "source": [
248
+ "## 3. Quick Test (1 task)"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": null,
254
+ "metadata": {},
255
+ "outputs": [],
256
+ "source": [
257
+ "# Quick test — verify API works before running all 330 tasks\n",
258
+ "test_tid = '0520fde7'\n",
259
+ "with open(f'arc_data/training/{test_tid}.json') as f:\n",
260
+ " test_task = json.load(f)\n",
261
+ "\n",
262
+ "print(f'Testing on {test_tid}...')\n",
263
+ "for i, p in enumerate(test_task['train']):\n",
264
+ " inp = np.array(p['input']); out = np.array(p['output'])\n",
265
+ " print(f' Pair {i}: {inp.shape} -> {out.shape}')\n",
266
+ "\n",
267
+ "prompt = build_prompt(test_task)\n",
268
+ "print(f'Prompt: {len(prompt)} chars')\n",
269
+ "\n",
270
+ "response = call_nvidia(prompt, NVIDIA_API_KEY, MODEL, temperature=0.1)\n",
271
+ "if response.startswith('ERROR:'):\n",
272
+ " print(f'\\n❌ API Error: {response}')\n",
273
+ " print('Check your NVIDIA_API_KEY and MODEL settings above.')\n",
274
+ "else:\n",
275
+ " code = extract_code(response)\n",
276
+ " if code:\n",
277
+ " ok = verify_program(code, test_task['train'])\n",
278
+ " print(f'\\nCode extracted: {len(code)} chars')\n",
279
+ " print(f'Verified: {\"✅\" if ok else \"❌\"}')\n",
280
+ " if ok:\n",
281
+ " print('API working and generating correct code!')\n",
282
+ " else:\n",
283
+ " print('API working but code failed verification (normal — will try more candidates in full run)')\n",
284
+ " else:\n",
285
+ " print(f'\\nNo code extracted from response ({len(response)} chars)')\n",
286
+ " print('API working but response format unexpected. Will retry with different temperatures in full run.')"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "markdown",
291
+ "metadata": {},
292
+ "source": [
293
+ "## 4. Run on All Unsolved Tasks"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": null,
299
+ "metadata": {},
300
+ "outputs": [],
301
+ "source": [
302
+ "# Load all unsolved tasks\n",
303
+ "task_files = sorted(glob.glob('arc_data/training/*.json'))\n",
304
+ "unsolved = []\n",
305
+ "for tf in task_files:\n",
306
+ " tid = os.path.basename(tf).replace('.json', '')\n",
307
+ " if tid not in ALREADY_SOLVED:\n",
308
+ " unsolved.append((tid, tf))\n",
309
+ "\n",
310
+ "print(f'Total tasks: {len(task_files)}')\n",
311
+ "print(f'Already solved (symbolic): {len(ALREADY_SOLVED)}')\n",
312
+ "print(f'To attempt with LLM: {len(unsolved)}')\n",
313
+ "print(f'Model: {MODEL}')\n",
314
+ "print(f'Candidates per task: {N_CANDIDATES}')\n",
315
+ "print(f'\\nStarting...')"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": null,
321
+ "metadata": {},
322
+ "outputs": [],
323
+ "source": [
324
+ "# Main loop\n",
325
+ "results = {}\n",
326
+ "solved = 0\n",
327
+ "total_time = 0\n",
328
+ "\n",
329
+ "# Resume from previous run if exists\n",
330
+ "if os.path.exists('llm_results.json'):\n",
331
+ " with open('llm_results.json') as f:\n",
332
+ " prev = json.load(f)\n",
333
+ " results = prev.get('results', {})\n",
334
+ " solved = sum(1 for r in results.values() if r['status'] == 'solved')\n",
335
+ " print(f'Resuming from previous run: {solved} already solved by LLM')\n",
336
+ "\n",
337
+ "for idx, (tid, tf) in enumerate(unsolved):\n",
338
+ " # Skip if already attempted\n",
339
+ " if tid in results:\n",
340
+ " continue\n",
341
+ " \n",
342
+ " with open(tf) as f:\n",
343
+ " task = json.load(f)\n",
344
+ " \n",
345
+ " print(f'[{idx+1:3d}/{len(unsolved)}] {tid}:', end=' ', flush=True)\n",
346
+ " start = time.time()\n",
347
+ " \n",
348
+ " prompt = build_prompt(task)\n",
349
+ " task_solved = False\n",
350
+ " \n",
351
+ " for i in range(N_CANDIDATES):\n",
352
+ " temp = 0.1 if i == 0 else min(0.4 + 0.15 * i, 1.2)\n",
353
+ " response = call_nvidia(prompt, NVIDIA_API_KEY, MODEL, temp)\n",
354
+ " \n",
355
+ " if response.startswith('ERROR:'):\n",
356
+ " if '429' in response or 'rate' in response.lower():\n",
357
+ " time.sleep(10) # Rate limit — wait longer\n",
358
+ " continue\n",
359
+ " \n",
360
+ " code = extract_code(response)\n",
361
+ " if code is None:\n",
362
+ " continue\n",
363
+ " \n",
364
+ " if verify_program(code, task['train']):\n",
365
+ " elapsed = time.time() - start\n",
366
+ " total_time += elapsed\n",
367
+ " solved += 1\n",
368
+ " \n",
369
+ " test_outputs = [apply_program(code, t['input']) for t in task.get('test', [])]\n",
370
+ " results[tid] = {\n",
371
+ " 'status': 'solved', 'rule': f'llm_c{i+1}_t{temp:.1f}',\n",
372
+ " 'code': code, 'test_outputs': test_outputs,\n",
373
+ " 'time_s': round(elapsed, 2),\n",
374
+ " }\n",
375
+ " print(f'✅ c{i+1} ({elapsed:.1f}s) [total: {len(ALREADY_SOLVED)+solved}/{len(task_files)}]')\n",
376
+ " task_solved = True\n",
377
+ " break\n",
378
+ " \n",
379
+ " time.sleep(RATE_LIMIT_SLEEP)\n",
380
+ " \n",
381
+ " if not task_solved:\n",
382
+ " elapsed = time.time() - start\n",
383
+ " total_time += elapsed\n",
384
+ " results[tid] = {'status': 'failed', 'time_s': round(elapsed, 2)}\n",
385
+ " print(f'❌ ({elapsed:.1f}s)')\n",
386
+ " \n",
387
+ " # Save progress every 10 tasks\n",
388
+ " if (idx + 1) % 10 == 0:\n",
389
+ " with open('llm_results.json', 'w') as f:\n",
390
+ " json.dump({\n",
391
+ " 'model': MODEL, 'n_candidates': N_CANDIDATES,\n",
392
+ " 'llm_solved': solved, 'attempted': sum(1 for r in results.values()),\n",
393
+ " 'symbolic_solved': len(ALREADY_SOLVED),\n",
394
+ " 'total_solved': len(ALREADY_SOLVED) + solved,\n",
395
+ " 'total_tasks': len(task_files),\n",
396
+ " 'solve_rate': round(100 * (len(ALREADY_SOLVED) + solved) / len(task_files), 2),\n",
397
+ " 'total_time_s': round(total_time, 1),\n",
398
+ " 'results': results,\n",
399
+ " }, f, indent=2)\n",
400
+ " print(f' [Saved: {len(ALREADY_SOLVED)+solved}/{len(task_files)} total]')"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": null,
406
+ "metadata": {},
407
+ "outputs": [],
408
+ "source": [
409
+ "# Final save\n",
410
+ "with open('llm_results.json', 'w') as f:\n",
411
+ " json.dump({\n",
412
+ " 'model': MODEL, 'n_candidates': N_CANDIDATES,\n",
413
+ " 'llm_solved': solved, 'attempted': sum(1 for r in results.values()),\n",
414
+ " 'symbolic_solved': len(ALREADY_SOLVED),\n",
415
+ " 'total_solved': len(ALREADY_SOLVED) + solved,\n",
416
+ " 'total_tasks': len(task_files),\n",
417
+ " 'solve_rate': round(100 * (len(ALREADY_SOLVED) + solved) / len(task_files), 2),\n",
418
+ " 'total_time_s': round(total_time, 1),\n",
419
+ " 'results': results,\n",
420
+ " }, f, indent=2)\n",
421
+ "\n",
422
+ "print(f'\\n{\"=\"*60}')\n",
423
+ "print(f'FINAL RESULTS')\n",
424
+ "print(f'{\"=\"*60}')\n",
425
+ "print(f'LLM solved: {solved}')\n",
426
+ "print(f'Symbolic solved: {len(ALREADY_SOLVED)}')\n",
427
+ "print(f'TOTAL SOLVED: {len(ALREADY_SOLVED)+solved}/{len(task_files)} ({100*(len(ALREADY_SOLVED)+solved)/len(task_files):.1f}%)')\n",
428
+ "print(f'Time: {total_time:.0f}s')\n",
429
+ "print(f'\\nResults saved to: llm_results.json')"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "markdown",
434
+ "metadata": {},
435
+ "source": [
436
+ "## 5. Results Analysis"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": null,
442
+ "metadata": {},
443
+ "outputs": [],
444
+ "source": [
445
+ "# Load and analyze results\n",
446
+ "with open('llm_results.json') as f:\n",
447
+ " data = json.load(f)\n",
448
+ "\n",
449
+ "print(f'Model: {data[\"model\"]}')\n",
450
+ "print(f'Candidates per task: {data[\"n_candidates\"]}')\n",
451
+ "print(f'\\nSymbolic solved: {data[\"symbolic_solved\"]}')\n",
452
+ "print(f'LLM solved: {data[\"llm_solved\"]}')\n",
453
+ "print(f'TOTAL: {data[\"total_solved\"]}/{data[\"total_tasks\"]} ({data[\"solve_rate\"]}%)')\n",
454
+ "\n",
455
+ "llm_solved_tasks = [tid for tid, r in data['results'].items() if r['status'] == 'solved']\n",
456
+ "print(f'\\nLLM-solved tasks ({len(llm_solved_tasks)}):')\n",
457
+ "for tid in sorted(llm_solved_tasks):\n",
458
+ " rule = data['results'][tid].get('rule', '?')\n",
459
+ " t = data['results'][tid].get('time_s', 0)\n",
460
+ " print(f' {tid}: {rule} ({t}s)')"
461
+ ]
462
+ },
463
+ {
464
+ "cell_type": "markdown",
465
+ "metadata": {},
466
+ "source": [
467
+ "## 6. Download Results\n",
468
+ "\n",
469
+ "Download `llm_results.json` from the notebook's working directory (the cells above write it there), then merge it with the symbolic results:\n",
470
+ "\n",
471
+ "```bash\n",
472
+ "python scripts/merge_results.py arc_results/summary_v4.json llm_results.json\n",
473
+ "```"
474
+ ]
475
+ }
476
+ ],
477
+ "metadata": {
478
+ "kernelspec": {
479
+ "display_name": "Python 3",
480
+ "language": "python",
481
+ "name": "python3"
482
+ },
483
+ "language_info": {
484
+ "name": "python",
485
+ "version": "3.10.0"
486
+ }
487
+ },
488
+ "nbformat": 4,
489
+ "nbformat_minor": 4
490
+ }