rogermt
/

ARC-AGI

Model card Files Files and versions

xet

Community

rogermt committed 10 days ago

Commit

d979316

verified ·

1 Parent(s): 4db7aa4

Add Jupyter notebook for LLM synthesis (NVIDIA NIM / GLM 4.7 / DeepSeek V4)

Browse files

Files changed (1) hide show

notebooks/pemf_llm_solver.ipynb +490 -0

notebooks/pemf_llm_solver.ipynb ADDED Viewed

	@@ -0,0 +1,490 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PEMF ARC-AGI — LLM Program Synthesis\n",
+    "\n",
+    "Uses NVIDIA NIM (free) with GLM 4.7 / DeepSeek V4 to solve ARC tasks.\n",
+    "\n",
+    "**Pipeline:** For each unsolved task → build prompt → LLM generates Python `transform()` → verify against ALL training pairs → apply to test.\n",
+    "\n",
+    "**Prerequisites:**\n",
+    "- NVIDIA NIM API key from https://build.nvidia.com/settings/api-keys\n",
+    "- Internet access enabled"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ============================================================\n",
+    "# CONFIGURATION — EDIT THESE\n",
+    "# ============================================================\n",
+    "import os\n",
+    "\n",
+    "# The API key is read from the NVIDIA_API_KEY environment variable when it is\n",
+    "# set, so the secret does not have to be saved inside the notebook. The\n",
+    "# placeholder is only a fallback; get a key from\n",
+    "# https://build.nvidia.com/settings/api-keys\n",
+    "NVIDIA_API_KEY = os.environ.get(\"NVIDIA_API_KEY\", \"nvapi-YOUR-KEY-HERE\")\n",
+    "\n",
+    "MODEL = \"z-ai/glm4.7\"                  # Default: GLM 4.7\n",
+    "# MODEL = \"deepseek-ai/deepseek-v4-pro\"  # Alternative: DeepSeek V4\n",
+    "\n",
+    "N_CANDIDATES = 8      # Candidates per task (more = better but slower)\n",
+    "RATE_LIMIT_SLEEP = 2  # Seconds between API calls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download ARC dataset\n",
+    "import os, subprocess\n",
+    "import shutil, tempfile\n",
+    "\n",
+    "if not os.path.exists('arc_data/training'):\n",
+    "    print('Downloading ARC dataset...')\n",
+    "    # Clone into a fresh temporary directory: a leftover checkout from an\n",
+    "    # earlier run would otherwise make `git clone` fail.\n",
+    "    with tempfile.TemporaryDirectory() as clone_dir:\n",
+    "        try:\n",
+    "            # check=True makes a failed clone raise here instead of showing up\n",
+    "            # later as a missing-directory error from os.listdir().\n",
+    "            subprocess.run(['git', 'clone', '--depth', '1', 'https://github.com/fchollet/ARC-AGI.git', clone_dir],\n",
+    "                           capture_output=True, text=True, check=True)\n",
+    "        except subprocess.CalledProcessError as err:\n",
+    "            raise RuntimeError(f'git clone failed: {err.stderr.strip()}') from err\n",
+    "        os.makedirs('arc_data', exist_ok=True)\n",
+    "        # shutil.copytree replaces the external `cp -r` call, which is not\n",
+    "        # available on every platform.\n",
+    "        shutil.copytree(os.path.join(clone_dir, 'data', 'training'), 'arc_data/training')\n",
+    "    print(f'Downloaded {len(os.listdir(\"arc_data/training\"))} tasks')\n",
+    "else:\n",
+    "    print(f'ARC data already present: {len(os.listdir(\"arc_data/training\"))} tasks')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task IDs already solved by the symbolic pipeline (70 tasks).\n",
+    "# An ID is a training file name without its .json extension; the LLM run\n",
+    "# skips every task whose ID appears in this set.\n",
+    "ALREADY_SOLVED = {\n",
+    "    \"007bbfb7\",\"00d62c1b\",\"0d3d703e\",\"1190e5a7\",\"1cf80156\",\"1e0a9b12\",\"1f85a75f\",\n",
+    "    \"2013d3e2\",\"22168020\",\"22eb0ac0\",\"239be575\",\"23b5c85d\",\"28bf18c6\",\"2dee498d\",\n",
+    "    \"3618c87e\",\"3906de3d\",\"3aa6fb7a\",\"3af2c5a8\",\"3c9b0459\",\"42a50994\",\"4347f46a\",\n",
+    "    \"50cb2852\",\"6150a2bd\",\"62c24649\",\"67385a82\",\"67a3c6ac\",\"67e8384a\",\"68b16354\",\n",
+    "    \"6d0aefbc\",\"6f8cd79b\",\"6fa7a44f\",\"746b3537\",\"74dd1130\",\"7b7f7511\",\"7e0986d6\",\n",
+    "    \"7f4411dc\",\"868de0fa\",\"8be77c9e\",\"8d5021e8\",\"91714a58\",\"9172f3a0\",\"9565186b\",\n",
+    "    \"9dfd6313\",\"a416b8f3\",\"a5313dff\",\"a699fb00\",\"aabf363d\",\"aedd82e4\",\"b1948b0a\",\n",
+    "    \"b6afb2da\",\"ba97ae07\",\"bb43febb\",\"bda2d7a6\",\"be94b721\",\"c0f76784\",\"c59eb873\",\n",
+    "    \"c8f0f002\",\"c9e6f938\",\"d10ecb37\",\"d23f8c26\",\"d511f180\",\"d631b094\",\"d90796e8\",\n",
+    "    \"d9fac9be\",\"de1cd16c\",\"ded97339\",\"e26a3af2\",\"eb5a1d5d\",\"ed36ccf7\",\"f76d97a5\",\n",
+    "}\n",
+    "print(f'Already solved by symbolic pipeline: {len(ALREADY_SOLVED)} tasks')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. LLM Engine"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import time\n",
+    "import re\n",
+    "import glob\n",
+    "import numpy as np\n",
+    "import urllib.request\n",
+    "from collections import Counter\n",
+    "\n",
+    "\n",
+    "def call_nvidia(prompt, api_key, model=\"z-ai/glm4.7\", temperature=0.7,\n",
+    "                max_tokens=2048, timeout=120):\n",
+    "    \"\"\"Send a single-turn chat completion request to the NVIDIA NIM API.\n",
+    "\n",
+    "    Args:\n",
+    "        prompt: Text sent as the only user message.\n",
+    "        api_key: API key, sent as a Bearer token.\n",
+    "        model: Model identifier to request.\n",
+    "        temperature: Sampling temperature.\n",
+    "        max_tokens: Upper bound on generated tokens.\n",
+    "        timeout: Timeout in seconds for the HTTP request.\n",
+    "\n",
+    "    Returns:\n",
+    "        The assistant message text on success. On any failure (network or\n",
+    "        HTTP error, malformed JSON, unexpected response layout, non-text\n",
+    "        content) a string starting with \"ERROR:\" is returned instead of\n",
+    "        raising, so callers can test `response.startswith('ERROR:')`.\n",
+    "    \"\"\"\n",
+    "    url = \"https://integrate.api.nvidia.com/v1/chat/completions\"\n",
+    "    payload = {\n",
+    "        \"model\": model,\n",
+    "        \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
+    "        \"max_tokens\": max_tokens,\n",
+    "        \"temperature\": temperature,\n",
+    "    }\n",
+    "    data = json.dumps(payload).encode('utf-8')\n",
+    "    req = urllib.request.Request(url, data=data,\n",
+    "                                headers={\"Content-Type\": \"application/json\",\n",
+    "                                         \"Authorization\": f\"Bearer {api_key}\"},\n",
+    "                                method='POST')\n",
+    "    try:\n",
+    "        with urllib.request.urlopen(req, timeout=timeout) as resp:\n",
+    "            result = json.loads(resp.read().decode())\n",
+    "        content = result['choices'][0]['message']['content']\n",
+    "    except Exception as e:\n",
+    "        return f\"ERROR: {e}\"\n",
+    "    # A null or otherwise non-string content field must not be handed back:\n",
+    "    # callers invoke str methods such as startswith() on the return value.\n",
+    "    if not isinstance(content, str):\n",
+    "        return \"ERROR: response contained no text content\"\n",
+    "    return content\n",
+    "\n",
+    "\n",
+    "def build_prompt(task):\n",
+    "    \"\"\"Render an ARC task's training pairs as the text prompt for the LLM.\n",
+    "\n",
+    "    The prompt lists each training pair as JSON, appends a short automatic\n",
+    "    analysis (whether shapes match, which colors occur) and finishes with an\n",
+    "    opened Python code fence holding the first lines of `transform()`.\n",
+    "    \"\"\"\n",
+    "    pairs = task.get('train', [])\n",
+    "\n",
+    "    example_blocks = [\n",
+    "        f\"Example {number}:\\n\"\n",
+    "        f\"  Input:  {json.dumps(pair['input'])}\\n\"\n",
+    "        f\"  Output: {json.dumps(pair['output'])}\"\n",
+    "        for number, pair in enumerate(pairs, start=1)\n",
+    "    ]\n",
+    "    examples_str = \"\\n\".join(example_blocks)\n",
+    "\n",
+    "    in_grids = [np.array(pair['input']) for pair in pairs]\n",
+    "    out_grids = [np.array(pair['output']) for pair in pairs]\n",
+    "\n",
+    "    same_shape = all(src.shape == dst.shape for src, dst in zip(in_grids, out_grids))\n",
+    "    in_colors = sorted({color for g in in_grids for color in np.unique(g).tolist()})\n",
+    "    out_colors = sorted({color for g in out_grids for color in np.unique(g).tolist()})\n",
+    "\n",
+    "    analysis_parts = [\n",
+    "        f\"  Same input/output shape: {same_shape}\\n\",\n",
+    "        f\"  Input colors: {in_colors}, Output colors: {out_colors}\\n\",\n",
+    "    ]\n",
+    "    if not same_shape:\n",
+    "        # Only the first pair's shapes are reported.\n",
+    "        analysis_parts.append(f\"  Shape: {in_grids[0].shape} -> {out_grids[0].shape}\\n\")\n",
+    "    analysis = \"\".join(analysis_parts)\n",
+    "\n",
+    "    return f\"\"\"Solve this ARC-AGI puzzle. Write ONLY a Python function, no explanations.\n",
+    "\n",
+    "{examples_str}\n",
+    "\n",
+    "Analysis:\n",
+    "{analysis}\n",
+    "```python\n",
+    "import numpy as np\n",
+    "from collections import Counter, deque\n",
+    "\n",
+    "def transform(grid: list[list[int]]) -> list[list[int]]:\n",
+    "    grid = np.array(grid)\n",
+    "\"\"\"\n",
+    "\n",
+    "\n",
+    "def extract_code(response):\n",
+    "    \"\"\"Extract the Python source that defines `transform` from an LLM response.\n",
+    "\n",
+    "    Strategies, tried in order:\n",
+    "      1. The first fenced block (```python fences first, then any fence)\n",
+    "         whose content contains `def transform`.\n",
+    "      2. The raw text from `def transform` onward, together with the import\n",
+    "         lines directly above it, cut at the next code fence if there is one.\n",
+    "      3. The whole stripped response when it starts like Python code.\n",
+    "\n",
+    "    Args:\n",
+    "        response: Raw model output; None or an empty string is accepted.\n",
+    "\n",
+    "    Returns:\n",
+    "        The extracted source as a string, or None when nothing usable is found.\n",
+    "    \"\"\"\n",
+    "    if not response:\n",
+    "        return None\n",
+    "    for pattern in [r'```python\\s*(.*?)```', r'```\\s*(.*?)```']:\n",
+    "        for block in re.findall(pattern, response, re.DOTALL):\n",
+    "            if 'def transform' in block:\n",
+    "                return block.strip()\n",
+    "    idx = response.find('def transform')\n",
+    "    if idx >= 0:\n",
+    "        start = idx\n",
+    "        line_start = response.rfind('\\n', 0, idx) + 1\n",
+    "        # Walk upward line by line over blank lines and import statements so\n",
+    "        # that whole import lines are kept. Searching for the last 'import '\n",
+    "        # substring instead would cut `from collections import Counter` down\n",
+    "        # to the invalid `import Counter`. Skipped when other text precedes\n",
+    "        # `def` on the same line.\n",
+    "        if not response[line_start:idx].strip():\n",
+    "            cursor = line_start\n",
+    "            while cursor > 0:\n",
+    "                prev_start = response.rfind('\\n', 0, cursor - 1) + 1\n",
+    "                prev_line = response[prev_start:cursor].strip()\n",
+    "                if prev_line.startswith(('import ', 'from ')):\n",
+    "                    start = prev_start\n",
+    "                elif prev_line:\n",
+    "                    break\n",
+    "                cursor = prev_start\n",
+    "        code = response[start:]\n",
+    "        end = code.find('```')\n",
+    "        if end > 0:\n",
+    "            code = code[:end]\n",
+    "        return code.strip()\n",
+    "    stripped = response.strip()\n",
+    "    if stripped.startswith(('import', 'def transform', 'from')):\n",
+    "        return stripped\n",
+    "    return None\n",
+    "\n",
+    "\n",
+    "def _build_exec_namespace():\n",
+    "    \"\"\"Return a fresh globals dict for exec()-ing a candidate program.\n",
+    "\n",
+    "    It provides numpy (as `np` and `numpy`), `Counter`, `deque` and, when the\n",
+    "    package can be imported, `scipy` with `scipy.ndimage` loaded.\n",
+    "    \"\"\"\n",
+    "    from collections import deque\n",
+    "    namespace = {'np': np, 'numpy': np, 'Counter': Counter, 'deque': deque}\n",
+    "    try:\n",
+    "        import scipy.ndimage  # also binds the top-level `scipy` name\n",
+    "    except ImportError:\n",
+    "        pass\n",
+    "    else:\n",
+    "        namespace['scipy'] = scipy\n",
+    "    return namespace\n",
+    "\n",
+    "\n",
+    "def verify_program(code, train_pairs):\n",
+    "    \"\"\"Check that `code` defines a `transform` matching every training pair.\n",
+    "\n",
+    "    NOTE(security): `code` is model-generated and is run with exec() inside\n",
+    "    this process, without a sandbox or time limit. Use a disposable\n",
+    "    environment; a candidate that never returns will block the notebook.\n",
+    "\n",
+    "    Args:\n",
+    "        code: Python source expected to define `transform(grid)`.\n",
+    "        train_pairs: Iterable of dicts with 'input' and 'output' grids.\n",
+    "\n",
+    "    Returns:\n",
+    "        True only if the source executes, defines `transform`, and for every\n",
+    "        pair the result (converted to an int array) equals the expected\n",
+    "        output. Any exception or mismatch yields False.\n",
+    "    \"\"\"\n",
+    "    namespace = _build_exec_namespace()\n",
+    "    try:\n",
+    "        exec(code, namespace)\n",
+    "    except Exception:\n",
+    "        return False\n",
+    "    if 'transform' not in namespace:\n",
+    "        return False\n",
+    "    fn = namespace['transform']\n",
+    "    for pair in train_pairs:\n",
+    "        try:\n",
+    "            # Pass a row-wise copy so the candidate cannot mutate the task data.\n",
+    "            result = fn([row[:] for row in pair['input']])\n",
+    "            if result is None:\n",
+    "                return False\n",
+    "            r = np.array(result, dtype=int)\n",
+    "            e = np.array(pair['output'], dtype=int)\n",
+    "            # np.array_equal is False for differing shapes as well as values.\n",
+    "            if not np.array_equal(r, e):\n",
+    "                return False\n",
+    "        except Exception:\n",
+    "            return False\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "def apply_program(code, test_input):\n",
+    "    \"\"\"Run a candidate program on one test grid.\n",
+    "\n",
+    "    NOTE(security): like verify_program, this exec()s model-generated code\n",
+    "    without any sandbox.\n",
+    "\n",
+    "    Args:\n",
+    "        code: Python source defining `transform(grid)`.\n",
+    "        test_input: Input grid as a list of rows.\n",
+    "\n",
+    "    Returns:\n",
+    "        The result as a nested list of ints, or None if the program raises,\n",
+    "        returns None, or its result cannot be converted to an int array.\n",
+    "    \"\"\"\n",
+    "    namespace = _build_exec_namespace()\n",
+    "    try:\n",
+    "        exec(code, namespace)\n",
+    "        result = namespace['transform']([row[:] for row in test_input])\n",
+    "        if result is not None:\n",
+    "            return np.array(result, dtype=int).tolist()\n",
+    "    except Exception:\n",
+    "        pass\n",
+    "    return None\n",
+    "\n",
+    "\n",
+    "print('LLM engine ready.')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Quick Test (1 task)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Smoke test — confirm the API responds before launching the full 330-task run\n",
+    "test_tid = '0520fde7'\n",
+    "with open(f'arc_data/training/{test_tid}.json') as f:\n",
+    "    test_task = json.load(f)\n",
+    "\n",
+    "print(f'Testing on {test_tid}...')\n",
+    "for pair_no, pair in enumerate(test_task['train']):\n",
+    "    in_shape = np.array(pair['input']).shape\n",
+    "    out_shape = np.array(pair['output']).shape\n",
+    "    print(f'  Pair {pair_no}: {in_shape} -> {out_shape}')\n",
+    "\n",
+    "prompt = build_prompt(test_task)\n",
+    "print(f'Prompt: {len(prompt)} chars')\n",
+    "\n",
+    "# A single low-temperature request is enough to exercise the whole chain.\n",
+    "response = call_nvidia(prompt, NVIDIA_API_KEY, MODEL, temperature=0.1)\n",
+    "if response.startswith('ERROR:'):\n",
+    "    print(f'\\n❌ API Error: {response}')\n",
+    "    print('Check your NVIDIA_API_KEY and MODEL settings above.')\n",
+    "else:\n",
+    "    code = extract_code(response)\n",
+    "    if not code:\n",
+    "        print(f'\\nNo code extracted from response ({len(response)} chars)')\n",
+    "        print('API working but response format unexpected. Will retry with different temperatures in full run.')\n",
+    "    else:\n",
+    "        ok = verify_program(code, test_task['train'])\n",
+    "        print(f'\\nCode extracted: {len(code)} chars')\n",
+    "        print(f'Verified: {\"✅\" if ok else \"❌\"}')\n",
+    "        print('API working and generating correct code!' if ok else\n",
+    "              'API working but code failed verification (normal — will try more candidates in full run)')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Run on All Unsolved Tasks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Collect every training task the symbolic pipeline has not solved yet\n",
+    "task_files = sorted(glob.glob('arc_data/training/*.json'))\n",
+    "task_ids = [os.path.basename(path).replace('.json', '') for path in task_files]\n",
+    "unsolved = [\n",
+    "    (task_id, path)\n",
+    "    for task_id, path in zip(task_ids, task_files)\n",
+    "    if task_id not in ALREADY_SOLVED\n",
+    "]\n",
+    "\n",
+    "# Run summary, printed one line at a time\n",
+    "for line in (\n",
+    "    f'Total tasks: {len(task_files)}',\n",
+    "    f'Already solved (symbolic): {len(ALREADY_SOLVED)}',\n",
+    "    f'To attempt with LLM: {len(unsolved)}',\n",
+    "    f'Model: {MODEL}',\n",
+    "    f'Candidates per task: {N_CANDIDATES}',\n",
+    "    f'\\nStarting...',\n",
+    "):\n",
+    "    print(line)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Main loop\n",
+    "results = {}\n",
+    "solved = 0\n",
+    "total_time = 0\n",
+    "\n",
+    "\n",
+    "def _save_progress(path='llm_results.json'):\n",
+    "    \"\"\"Write the current results plus summary statistics to `path` as JSON.\"\"\"\n",
+    "    total_solved = len(ALREADY_SOLVED) + solved\n",
+    "    with open(path, 'w') as f:\n",
+    "        json.dump({\n",
+    "            'model': MODEL, 'n_candidates': N_CANDIDATES,\n",
+    "            'llm_solved': solved, 'attempted': len(results),\n",
+    "            'symbolic_solved': len(ALREADY_SOLVED),\n",
+    "            'total_solved': total_solved,\n",
+    "            'total_tasks': len(task_files),\n",
+    "            'solve_rate': round(100 * total_solved / len(task_files), 2),\n",
+    "            'total_time_s': round(total_time, 1),\n",
+    "            'results': results,\n",
+    "        }, f, indent=2)\n",
+    "\n",
+    "\n",
+    "# Resume from previous run if exists\n",
+    "if os.path.exists('llm_results.json'):\n",
+    "    with open('llm_results.json') as f:\n",
+    "        prev = json.load(f)\n",
+    "    results = prev.get('results', {})\n",
+    "    solved = sum(1 for r in results.values() if r['status'] == 'solved')\n",
+    "    # Carry the accumulated time forward so total_time_s covers every run,\n",
+    "    # not just the current one.\n",
+    "    total_time = prev.get('total_time_s', 0)\n",
+    "    print(f'Resuming from previous run: {solved} already solved by LLM')\n",
+    "\n",
+    "for idx, (tid, tf) in enumerate(unsolved):\n",
+    "    # Skip if already attempted\n",
+    "    if tid in results:\n",
+    "        continue\n",
+    "\n",
+    "    with open(tf) as f:\n",
+    "        task = json.load(f)\n",
+    "\n",
+    "    print(f'[{idx+1:3d}/{len(unsolved)}] {tid}:', end=' ', flush=True)\n",
+    "    start = time.time()\n",
+    "\n",
+    "    prompt = build_prompt(task)\n",
+    "    task_solved = False\n",
+    "    api_errors = 0  # candidates that produced no model response at all\n",
+    "\n",
+    "    for i in range(N_CANDIDATES):\n",
+    "        # First candidate is near-greedy; later ones sample with rising\n",
+    "        # temperature, capped at 1.2.\n",
+    "        temp = 0.1 if i == 0 else min(0.4 + 0.15 * i, 1.2)\n",
+    "        response = call_nvidia(prompt, NVIDIA_API_KEY, MODEL, temp)\n",
+    "\n",
+    "        if response.startswith('ERROR:'):\n",
+    "            api_errors += 1\n",
+    "            rate_limited = '429' in response or 'rate' in response.lower()\n",
+    "            # Always pause after a failed call so a broken endpoint is not\n",
+    "            # hammered; back off longer when rate limited.\n",
+    "            time.sleep(10 if rate_limited else RATE_LIMIT_SLEEP)\n",
+    "            continue\n",
+    "\n",
+    "        code = extract_code(response)\n",
+    "        if code is not None and verify_program(code, task['train']):\n",
+    "            elapsed = time.time() - start\n",
+    "            total_time += elapsed\n",
+    "            solved += 1\n",
+    "\n",
+    "            test_outputs = [apply_program(code, t['input']) for t in task.get('test', [])]\n",
+    "            results[tid] = {\n",
+    "                'status': 'solved', 'rule': f'llm_c{i+1}_t{temp:.1f}',\n",
+    "                'code': code, 'test_outputs': test_outputs,\n",
+    "                'time_s': round(elapsed, 2),\n",
+    "            }\n",
+    "            print(f'✅ c{i+1} ({elapsed:.1f}s)  [total: {len(ALREADY_SOLVED)+solved}/{len(task_files)}]')\n",
+    "            task_solved = True\n",
+    "            break\n",
+    "\n",
+    "        # Reached for unextractable responses as well as failed verification.\n",
+    "        time.sleep(RATE_LIMIT_SLEEP)\n",
+    "\n",
+    "    if not task_solved:\n",
+    "        elapsed = time.time() - start\n",
+    "        total_time += elapsed\n",
+    "        if api_errors == N_CANDIDATES:\n",
+    "            # No candidate was ever produced, so the model did not fail this\n",
+    "            # task; leave it out of `results` so a resumed run retries it.\n",
+    "            print(f'⚠️ API errors only ({elapsed:.1f}s) — not recorded, will be retried on resume')\n",
+    "        else:\n",
+    "            results[tid] = {'status': 'failed', 'time_s': round(elapsed, 2)}\n",
+    "            print(f'❌ ({elapsed:.1f}s)')\n",
+    "\n",
+    "    # Save progress every 10 tasks\n",
+    "    if (idx + 1) % 10 == 0:\n",
+    "        _save_progress()\n",
+    "        print(f'  [Saved: {len(ALREADY_SOLVED)+solved}/{len(task_files)} total]')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Final save: write the summary once, then echo the headline numbers\n",
+    "total_solved = len(ALREADY_SOLVED) + solved\n",
+    "n_tasks = len(task_files)\n",
+    "summary = {\n",
+    "    'model': MODEL,\n",
+    "    'n_candidates': N_CANDIDATES,\n",
+    "    'llm_solved': solved,\n",
+    "    'attempted': len(results),\n",
+    "    'symbolic_solved': len(ALREADY_SOLVED),\n",
+    "    'total_solved': total_solved,\n",
+    "    'total_tasks': n_tasks,\n",
+    "    'solve_rate': round(100 * total_solved / n_tasks, 2),\n",
+    "    'total_time_s': round(total_time, 1),\n",
+    "    'results': results,\n",
+    "}\n",
+    "with open('llm_results.json', 'w') as f:\n",
+    "    json.dump(summary, f, indent=2)\n",
+    "\n",
+    "banner = \"=\" * 60\n",
+    "print(f'\\n{banner}')\n",
+    "print(f'FINAL RESULTS')\n",
+    "print(f'{banner}')\n",
+    "print(f'LLM solved:      {solved}')\n",
+    "print(f'Symbolic solved:  {len(ALREADY_SOLVED)}')\n",
+    "print(f'TOTAL SOLVED:     {total_solved}/{n_tasks} ({100*total_solved/n_tasks:.1f}%)')\n",
+    "print(f'Time:             {total_time:.0f}s')\n",
+    "print(f'\\nResults saved to: llm_results.json')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Results Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reload the saved summary and list what the LLM solved\n",
+    "with open('llm_results.json') as f:\n",
+    "    data = json.load(f)\n",
+    "\n",
+    "for line in (\n",
+    "    f'Model: {data[\"model\"]}',\n",
+    "    f'Candidates per task: {data[\"n_candidates\"]}',\n",
+    "    f'\\nSymbolic solved: {data[\"symbolic_solved\"]}',\n",
+    "    f'LLM solved: {data[\"llm_solved\"]}',\n",
+    "    f'TOTAL: {data[\"total_solved\"]}/{data[\"total_tasks\"]} ({data[\"solve_rate\"]}%)',\n",
+    "):\n",
+    "    print(line)\n",
+    "\n",
+    "results_by_id = data['results']\n",
+    "llm_solved_tasks = [tid for tid, entry in results_by_id.items() if entry['status'] == 'solved']\n",
+    "print(f'\\nLLM-solved tasks ({len(llm_solved_tasks)}):')\n",
+    "for tid in sorted(llm_solved_tasks):\n",
+    "    entry = results_by_id[tid]\n",
+    "    print(f'  {tid}: {entry.get(\"rule\", \"?\")} ({entry.get(\"time_s\", 0)}s)')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Download Results\n",
+    "\n",
+    "Download `llm_results.json` from the notebook output, then merge with symbolic results:\n",
+    "\n",
+    "```bash\n",
+    "python scripts/merge_results.py arc_results/summary_v4.json llm_results.json\n",
+    "```"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}