{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# PEMF ARC-AGI — LLM Program Synthesis\n", "\n", "Uses NVIDIA NIM (free) with GLM 4.7 / DeepSeek V4 to solve ARC tasks.\n", "\n", "**Pipeline:** For each unsolved task → build prompt → LLM generates Python `transform()` → verify against ALL training pairs → apply to test.\n", "\n", "**Prerequisites:**\n", "- NVIDIA NIM API key from https://build.nvidia.com/settings/api-keys\n", "- Internet access enabled" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ============================================================\n", "# CONFIGURATION — EDIT THESE\n", "# ============================================================\n", "\n", "NVIDIA_API_KEY = \"nvapi-YOUR-KEY-HERE\" # Get from https://build.nvidia.com/settings/api-keys\n", "\n", "MODEL = \"z-ai/glm4.7\" # Default: GLM 4.7\n", "# MODEL = \"deepseek-ai/deepseek-v4-pro\" # Alternative: DeepSeek V4\n", "\n", "N_CANDIDATES = 8 # Candidates per task (more = better but slower)\n", "RATE_LIMIT_SLEEP = 2 # Seconds between API calls" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Download ARC dataset\n", "import os, subprocess\n", "\n", "if not os.path.exists('arc_data/training'):\n", " print('Downloading ARC dataset...')\n", " subprocess.run(['git', 'clone', '--depth', '1', 'https://github.com/fchollet/ARC-AGI.git', '/tmp/arc'], \n", " capture_output=True)\n", " os.makedirs('arc_data', exist_ok=True)\n", " subprocess.run(['cp', '-r', '/tmp/arc/data/training', 'arc_data/training'], capture_output=True)\n", " print(f'Downloaded {len(os.listdir(\"arc_data/training\"))} tasks')\n", "else:\n", " print(f'ARC data already present: {len(os.listdir(\"arc_data/training\"))} tasks')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Already solved by symbolic pipeline (70 tasks)\n", "ALREADY_SOLVED = {\n", " \"007bbfb7\",\"00d62c1b\",\"0d3d703e\",\"1190e5a7\",\"1cf80156\",\"1e0a9b12\",\"1f85a75f\",\n", " \"2013d3e2\",\"22168020\",\"22eb0ac0\",\"239be575\",\"23b5c85d\",\"28bf18c6\",\"2dee498d\",\n", " \"3618c87e\",\"3906de3d\",\"3aa6fb7a\",\"3af2c5a8\",\"3c9b0459\",\"42a50994\",\"4347f46a\",\n", " \"50cb2852\",\"6150a2bd\",\"62c24649\",\"67385a82\",\"67a3c6ac\",\"67e8384a\",\"68b16354\",\n", " \"6d0aefbc\",\"6f8cd79b\",\"6fa7a44f\",\"746b3537\",\"74dd1130\",\"7b7f7511\",\"7e0986d6\",\n", " \"7f4411dc\",\"868de0fa\",\"8be77c9e\",\"8d5021e8\",\"91714a58\",\"9172f3a0\",\"9565186b\",\n", " \"9dfd6313\",\"a416b8f3\",\"a5313dff\",\"a699fb00\",\"aabf363d\",\"aedd82e4\",\"b1948b0a\",\n", " \"b6afb2da\",\"ba97ae07\",\"bb43febb\",\"bda2d7a6\",\"be94b721\",\"c0f76784\",\"c59eb873\",\n", " \"c8f0f002\",\"c9e6f938\",\"d10ecb37\",\"d23f8c26\",\"d511f180\",\"d631b094\",\"d90796e8\",\n", " \"d9fac9be\",\"de1cd16c\",\"ded97339\",\"e26a3af2\",\"eb5a1d5d\",\"ed36ccf7\",\"f76d97a5\",\n", "}\n", "print(f'Already solved by symbolic pipeline: {len(ALREADY_SOLVED)} tasks')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. LLM Engine" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import time\n", "import re\n", "import glob\n", "import numpy as np\n", "import urllib.request\n", "from collections import Counter\n", "\n", "\n", "def call_nvidia(prompt, api_key, model=\"z-ai/glm4.7\", temperature=0.7):\n", " \"\"\"Call NVIDIA NIM API.\"\"\"\n", " url = \"https://integrate.api.nvidia.com/v1/chat/completions\"\n", " payload = {\n", " \"model\": model,\n", " \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n", " \"max_tokens\": 2048,\n", " \"temperature\": temperature,\n", " }\n", " data = json.dumps(payload).encode('utf-8')\n", " req = urllib.request.Request(url, data=data,\n", " headers={\"Content-Type\": \"application/json\",\n", " \"Authorization\": f\"Bearer {api_key}\"},\n", " method='POST')\n", " try:\n", " with urllib.request.urlopen(req, timeout=120) as resp:\n", " result = json.loads(resp.read().decode())\n", " return result['choices'][0]['message']['content']\n", " except Exception as e:\n", " return f\"ERROR: {e}\"\n", "\n", "\n", "def build_prompt(task):\n", " \"\"\"Build prompt for ARC task.\"\"\"\n", " train_pairs = task.get('train', [])\n", " examples = []\n", " for i, pair in enumerate(train_pairs):\n", " examples.append(\n", " f\"Example {i+1}:\\n\"\n", " f\" Input: {json.dumps(pair['input'])}\\n\"\n", " f\" Output: {json.dumps(pair['output'])}\"\n", " )\n", " examples_str = \"\\n\".join(examples)\n", "\n", " inputs = [np.array(p['input']) for p in train_pairs]\n", " outputs = [np.array(p['output']) for p in train_pairs]\n", " same_shape = all(i.shape == o.shape for i, o in zip(inputs, outputs))\n", " in_colors = sorted(set(c for i in inputs for c in np.unique(i).tolist()))\n", " out_colors = sorted(set(c for o in outputs for c in np.unique(o).tolist()))\n", "\n", " analysis = f\" Same input/output shape: {same_shape}\\n\"\n", " analysis += f\" Input colors: {in_colors}, Output colors: {out_colors}\\n\"\n", " if not same_shape:\n", " for i, o in zip(inputs[:1], outputs[:1]):\n", " analysis += f\" Shape: {i.shape} -> {o.shape}\\n\"\n", "\n", " return f\"\"\"Solve this ARC-AGI puzzle. Write ONLY a Python function, no explanations.\n", "\n", "{examples_str}\n", "\n", "Analysis:\n", "{analysis}\n", "```python\n", "import numpy as np\n", "from collections import Counter, deque\n", "\n", "def transform(grid: list[list[int]]) -> list[list[int]]:\n", " grid = np.array(grid)\n", "\"\"\"\n", "\n", "\n", "def extract_code(response):\n", " \"\"\"Extract Python function from LLM response.\"\"\"\n", " for pattern in [r'```python\\s*(.*?)```', r'```\\s*(.*?)```']:\n", " matches = re.findall(pattern, response, re.DOTALL)\n", " for match in matches:\n", " if 'def transform' in match:\n", " return match.strip()\n", " idx = response.find('def transform')\n", " if idx >= 0:\n", " before = response[:idx]\n", " import_start = max(before.rfind('import '), before.rfind('from '))\n", " start = import_start if import_start >= 0 else idx\n", " code = response[start:]\n", " end = code.find('```')\n", " if end > 0:\n", " code = code[:end]\n", " return code.strip()\n", " stripped = response.strip()\n", " if stripped.startswith(('import', 'def transform', 'from')):\n", " return stripped\n", " return None\n", "\n", "\n", "def verify_program(code, train_pairs):\n", " \"\"\"Execute program and verify against all training pairs.\"\"\"\n", " namespace = {'np': np, 'numpy': np, 'Counter': Counter,\n", " 'deque': __import__('collections').deque}\n", " try:\n", " import scipy.ndimage\n", " namespace['scipy'] = __import__('scipy')\n", " except ImportError:\n", " pass\n", " try:\n", " exec(code, namespace)\n", " except Exception:\n", " return False\n", " if 'transform' not in namespace:\n", " return False\n", " fn = namespace['transform']\n", " for pair in train_pairs:\n", " try:\n", " result = fn([row[:] for row in pair['input']])\n", " if result is None:\n", " return False\n", " r = np.array(result, dtype=int)\n", " e = np.array(pair['output'], dtype=int)\n", " if r.shape != e.shape or not np.array_equal(r, e):\n", " return False\n", " except Exception:\n", " return False\n", " return True\n", "\n", "\n", "def apply_program(code, test_input):\n", " \"\"\"Apply verified program to test input.\"\"\"\n", " namespace = {'np': np, 'numpy': np, 'Counter': Counter,\n", " 'deque': __import__('collections').deque}\n", " try:\n", " import scipy.ndimage\n", " namespace['scipy'] = __import__('scipy')\n", " except ImportError:\n", " pass\n", " try:\n", " exec(code, namespace)\n", " result = namespace['transform']([row[:] for row in test_input])\n", " if result is not None:\n", " return np.array(result, dtype=int).tolist()\n", " except Exception:\n", " pass\n", " return None\n", "\n", "\n", "print('LLM engine ready.')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Quick Test (1 task)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Quick test — verify API works before running all 330 tasks\n", "test_tid = '0520fde7'\n", "with open(f'arc_data/training/{test_tid}.json') as f:\n", " test_task = json.load(f)\n", "\n", "print(f'Testing on {test_tid}...')\n", "for i, p in enumerate(test_task['train']):\n", " inp = np.array(p['input']); out = np.array(p['output'])\n", " print(f' Pair {i}: {inp.shape} -> {out.shape}')\n", "\n", "prompt = build_prompt(test_task)\n", "print(f'Prompt: {len(prompt)} chars')\n", "\n", "response = call_nvidia(prompt, NVIDIA_API_KEY, MODEL, temperature=0.1)\n", "if response.startswith('ERROR:'):\n", " print(f'\\n❌ API Error: {response}')\n", " print('Check your NVIDIA_API_KEY and MODEL settings above.')\n", "else:\n", " code = extract_code(response)\n", " if code:\n", " ok = verify_program(code, test_task['train'])\n", " print(f'\\nCode extracted: {len(code)} chars')\n", " print(f'Verified: {\"✅\" if ok else \"❌\"}')\n", " if ok:\n", " print('API working and generating correct code!')\n", " else:\n", " print('API working but code failed verification (normal — will try more candidates in full run)')\n", " else:\n", " print(f'\\nNo code extracted from response ({len(response)} chars)')\n", " print('API working but response format unexpected. Will retry with different temperatures in full run.')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Run on All Unsolved Tasks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load all unsolved tasks\n", "task_files = sorted(glob.glob('arc_data/training/*.json'))\n", "unsolved = []\n", "for tf in task_files:\n", " tid = os.path.basename(tf).replace('.json', '')\n", " if tid not in ALREADY_SOLVED:\n", " unsolved.append((tid, tf))\n", "\n", "print(f'Total tasks: {len(task_files)}')\n", "print(f'Already solved (symbolic): {len(ALREADY_SOLVED)}')\n", "print(f'To attempt with LLM: {len(unsolved)}')\n", "print(f'Model: {MODEL}')\n", "print(f'Candidates per task: {N_CANDIDATES}')\n", "print(f'\\nStarting...')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Main loop\n", "results = {}\n", "solved = 0\n", "total_time = 0\n", "\n", "# Resume from previous run if exists\n", "if os.path.exists('llm_results.json'):\n", " with open('llm_results.json') as f:\n", " prev = json.load(f)\n", " results = prev.get('results', {})\n", " solved = sum(1 for r in results.values() if r['status'] == 'solved')\n", " print(f'Resuming from previous run: {solved} already solved by LLM')\n", "\n", "for idx, (tid, tf) in enumerate(unsolved):\n", " # Skip if already attempted\n", " if tid in results:\n", " continue\n", " \n", " with open(tf) as f:\n", " task = json.load(f)\n", " \n", " print(f'[{idx+1:3d}/{len(unsolved)}] {tid}:', end=' ', flush=True)\n", " start = time.time()\n", " \n", " prompt = build_prompt(task)\n", " task_solved = False\n", " \n", " for i in range(N_CANDIDATES):\n", " temp = 0.1 if i == 0 else min(0.4 + 0.15 * i, 1.2)\n", " response = call_nvidia(prompt, NVIDIA_API_KEY, MODEL, temp)\n", " \n", " if response.startswith('ERROR:'):\n", " if '429' in response or 'rate' in response.lower():\n", " time.sleep(10) # Rate limit — wait longer\n", " continue\n", " \n", " code = extract_code(response)\n", " if code is None:\n", " continue\n", " \n", " if verify_program(code, task['train']):\n", " elapsed = time.time() - start\n", " total_time += elapsed\n", " solved += 1\n", " \n", " test_outputs = [apply_program(code, t['input']) for t in task.get('test', [])]\n", " results[tid] = {\n", " 'status': 'solved', 'rule': f'llm_c{i+1}_t{temp:.1f}',\n", " 'code': code, 'test_outputs': test_outputs,\n", " 'time_s': round(elapsed, 2),\n", " }\n", " print(f'✅ c{i+1} ({elapsed:.1f}s) [total: {len(ALREADY_SOLVED)+solved}/{len(task_files)}]')\n", " task_solved = True\n", " break\n", " \n", " time.sleep(RATE_LIMIT_SLEEP)\n", " \n", " if not task_solved:\n", " elapsed = time.time() - start\n", " total_time += elapsed\n", " results[tid] = {'status': 'failed', 'time_s': round(elapsed, 2)}\n", " print(f'❌ ({elapsed:.1f}s)')\n", " \n", " # Save progress every 10 tasks\n", " if (idx + 1) % 10 == 0:\n", " with open('llm_results.json', 'w') as f:\n", " json.dump({\n", " 'model': MODEL, 'n_candidates': N_CANDIDATES,\n", " 'llm_solved': solved, 'attempted': sum(1 for r in results.values()),\n", " 'symbolic_solved': len(ALREADY_SOLVED),\n", " 'total_solved': len(ALREADY_SOLVED) + solved,\n", " 'total_tasks': len(task_files),\n", " 'solve_rate': round(100 * (len(ALREADY_SOLVED) + solved) / len(task_files), 2),\n", " 'total_time_s': round(total_time, 1),\n", " 'results': results,\n", " }, f, indent=2)\n", " print(f' [Saved: {len(ALREADY_SOLVED)+solved}/{len(task_files)} total]')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Final save\n", "with open('llm_results.json', 'w') as f:\n", " json.dump({\n", " 'model': MODEL, 'n_candidates': N_CANDIDATES,\n", " 'llm_solved': solved, 'attempted': sum(1 for r in results.values()),\n", " 'symbolic_solved': len(ALREADY_SOLVED),\n", " 'total_solved': len(ALREADY_SOLVED) + solved,\n", " 'total_tasks': len(task_files),\n", " 'solve_rate': round(100 * (len(ALREADY_SOLVED) + solved) / len(task_files), 2),\n", " 'total_time_s': round(total_time, 1),\n", " 'results': results,\n", " }, f, indent=2)\n", "\n", "print(f'\\n{\"=\"*60}')\n", "print(f'FINAL RESULTS')\n", "print(f'{\"=\"*60}')\n", "print(f'LLM solved: {solved}')\n", "print(f'Symbolic solved: {len(ALREADY_SOLVED)}')\n", "print(f'TOTAL SOLVED: {len(ALREADY_SOLVED)+solved}/{len(task_files)} ({100*(len(ALREADY_SOLVED)+solved)/len(task_files):.1f}%)')\n", "print(f'Time: {total_time:.0f}s')\n", "print(f'\\nResults saved to: llm_results.json')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Results Analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load and analyze results\n", "with open('llm_results.json') as f:\n", " data = json.load(f)\n", "\n", "print(f'Model: {data[\"model\"]}')\n", "print(f'Candidates per task: {data[\"n_candidates\"]}')\n", "print(f'\\nSymbolic solved: {data[\"symbolic_solved\"]}')\n", "print(f'LLM solved: {data[\"llm_solved\"]}')\n", "print(f'TOTAL: {data[\"total_solved\"]}/{data[\"total_tasks\"]} ({data[\"solve_rate\"]}%)')\n", "\n", "llm_solved_tasks = [tid for tid, r in data['results'].items() if r['status'] == 'solved']\n", "print(f'\\nLLM-solved tasks ({len(llm_solved_tasks)}):')\n", "for tid in sorted(llm_solved_tasks):\n", " rule = data['results'][tid].get('rule', '?')\n", " t = data['results'][tid].get('time_s', 0)\n", " print(f' {tid}: {rule} ({t}s)')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Download Results\n", "\n", "Download `llm_results.json` from the notebook output, then merge with symbolic results:\n", "\n", "```bash\n", "python scripts/merge_results.py arc_results/summary_v4.json llm_results.json\n", "```" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 4 }