rogermt
/

ARC-AGI

Model card Files Files and versions

xet

Community

rogermt committed 10 days ago

Commit

b9f62c4

verified ·

1 Parent(s): a57fa41

Add scripts/kaggle_llm_solver.py

Browse files

Files changed (1) hide show

scripts/kaggle_llm_solver.py +452 -0

scripts/kaggle_llm_solver.py ADDED Viewed

	@@ -0,0 +1,452 @@

+"""
+PEMF ARC-AGI — LLM Program Synthesis via Ollama (Kaggle Edition)
+================================================================
+Self-contained script for Kaggle GPU notebooks.
+Pulls a model via Ollama, runs LLM synthesis on unsolved ARC tasks.
+Usage on Kaggle:
+  1. Enable GPU (T4 x2 or P100)
+  2. Enable internet access
+  3. Upload this file + arc_data/ + already_solved.json
+  4. Run all cells
+The script:
+  - Installs Ollama
+  - Pulls the model (qwen2.5-coder:32b or smaller)
+  - Loads ARC tasks
+  - For each unsolved task: generates Python transform(), verifies against training pairs
+  - Saves results to llm_results.json
+"""
+import subprocess
+import sys
+import os
+import json
+import time
+import re
+import signal
+import numpy as np
+from typing import Dict, List, Optional, Tuple
+from collections import Counter
+from pathlib import Path
+# =============================================================================
+# 1. OLLAMA SETUP
+# =============================================================================
+def install_ollama():
+    """Install Ollama on Kaggle/Linux using the official install script.
+    The script is downloaded first and then fed to ``sh`` as a separate,
+    individually checked step. In a ``curl ... | sh`` pipeline only the exit
+    status of ``sh`` is inspected, so a failed download (empty stdin) would be
+    reported as a successful install.
+    Raises:
+        subprocess.CalledProcessError: if the download or the installer exits
+            with a non-zero status.
+        FileNotFoundError: if ``curl`` or ``sh`` is not available.
+    """
+    print("Installing Ollama...")
+    try:
+        download = subprocess.run(
+            ["curl", "-fsSL", "https://ollama.com/install.sh"],
+            check=True, capture_output=True,
+        )
+        # Same effect as piping into sh: the script is read from stdin.
+        subprocess.run(["sh"], input=download.stdout,
+                       check=True, capture_output=True)
+    except subprocess.CalledProcessError as err:
+        # Output is captured, so surface stderr before propagating the failure.
+        stderr = err.stderr.decode(errors="replace").strip() if err.stderr else ""
+        print(f"Ollama install failed (exit {err.returncode}): {stderr}")
+        raise
+    print("Ollama installed.")
+def start_ollama(startup_timeout_s: float = 30.0):
+    """Start ``ollama serve`` in the background and wait until it accepts connections.
+    Instead of sleeping for a fixed time, the local port used by the API
+    (localhost:11434) is polled until a TCP connection succeeds, the process
+    exits, or ``startup_timeout_s`` elapses. Failure to become ready is reported
+    with a warning rather than an exception, so callers behave as before.
+    Args:
+        startup_timeout_s: Maximum number of seconds to wait for the server.
+    Returns:
+        The ``subprocess.Popen`` handle of the server process.
+    """
+    import socket
+    print("Starting Ollama server...")
+    proc = subprocess.Popen(
+        ["ollama", "serve"],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+    deadline = time.monotonic() + startup_timeout_s
+    ready = False
+    while True:
+        try:
+            with socket.create_connection(("localhost", 11434), timeout=1):
+                ready = True
+                break
+        except OSError:
+            pass  # not listening yet
+        # Stop waiting once the process has died or the budget is used up.
+        if proc.poll() is not None or time.monotonic() >= deadline:
+            break
+        time.sleep(0.25)
+    if ready:
+        print(f"Ollama server started (PID {proc.pid})")
+    else:
+        print(f"Warning: Ollama server (PID {proc.pid}) is not accepting connections")
+    return proc
+def pull_model(model_name: str):
+    """Download ``model_name`` by running ``ollama pull``.
+    The command is given up to 1800 seconds; its output is captured.
+    Raises:
+        RuntimeError: if ``ollama pull`` exits with a non-zero status.
+        subprocess.TimeoutExpired: if the pull exceeds the time limit.
+    """
+    print(f"Pulling model {model_name}... (this may take several minutes)")
+    pull_cmd = ["ollama", "pull", model_name]
+    completed = subprocess.run(pull_cmd, capture_output=True, text=True, timeout=1800)
+    if completed.returncode == 0:
+        print(f"Model {model_name} ready.")
+        return
+    # Non-zero exit: show what ollama reported, then fail.
+    print(f"Pull failed: {completed.stderr}")
+    raise RuntimeError(f"Failed to pull {model_name}")
+def call_ollama(prompt: str, model: str = "qwen2.5-coder:32b",
+                temperature: float = 0.7, timeout_s: int = 120) -> str:
+    """Send a non-streaming generate request to the local Ollama server.
+    Args:
+        prompt: Prompt text to send.
+        model: Name of the Ollama model to query.
+        temperature: Sampling temperature passed in the request options.
+        timeout_s: Timeout in seconds for the HTTP request.
+    Returns:
+        The ``response`` field of the JSON reply ('' if the field is missing),
+        or a string of the form ``"ERROR: <exception>"`` if the request or the
+        decoding of the reply fails.
+    """
+    from urllib.request import Request, urlopen
+    generation_options = {"temperature": temperature, "num_predict": 2048}
+    body = json.dumps({
+        "model": model,
+        "prompt": prompt,
+        "stream": False,
+        "options": generation_options,
+    }).encode('utf-8')
+    request = Request(
+        "http://localhost:11434/api/generate",
+        data=body,
+        headers={"Content-Type": "application/json"},
+        method='POST',
+    )
+    try:
+        with urlopen(request, timeout=timeout_s) as reply:
+            raw = reply.read()
+        return json.loads(raw.decode()).get('response', '')
+    except Exception as exc:
+        # Errors are reported in-band; callers check for the "ERROR:" prefix.
+        return f"ERROR: {exc}"
+# =============================================================================
+# 2. PROMPT BUILDING
+# =============================================================================
+def build_prompt(task: Dict) -> str:
+    """Render the LLM prompt for one ARC task.
+    The prompt lists every training pair as JSON, adds a short analysis (whether
+    input and output shapes match, the colors seen in inputs and outputs, and
+    the height/width ratios when shapes differ) and ends with the opening lines
+    of a ``transform`` function.
+    Args:
+        task: Task dict; only its 'train' list of {'input', 'output'} pairs is read.
+    Returns:
+        The prompt text.
+    """
+    pairs = task.get('train', [])
+    examples_str = "\n".join(
+        f"Example {number}:\n"
+        f"  Input:  {json.dumps(pair['input'])}\n"
+        f"  Output: {json.dumps(pair['output'])}"
+        for number, pair in enumerate(pairs, start=1)
+    )
+    # Basic analysis
+    in_grids = [np.array(pair['input']) for pair in pairs]
+    out_grids = [np.array(pair['output']) for pair in pairs]
+    shapes_match = True
+    for src, dst in zip(in_grids, out_grids):
+        if src.shape != dst.shape:
+            shapes_match = False
+            break
+    in_colors = sorted({color for grid in in_grids for color in np.unique(grid).tolist()})
+    out_colors = sorted({color for grid in out_grids for color in np.unique(grid).tolist()})
+    summary = [
+        f"  Same input/output shape: {shapes_match}",
+        f"  Input colors: {in_colors}",
+        f"  Output colors: {out_colors}",
+    ]
+    if not shapes_match:
+        ratios = [(dst.shape[0]/src.shape[0], dst.shape[1]/src.shape[1])
+                  for src, dst in zip(in_grids, out_grids)]
+        summary.append(f"  Shape ratios (h,w): {ratios}")
+    # Every analysis line is newline-terminated, including the last one.
+    analysis = "".join(line + "\n" for line in summary)
+    return f"""Solve this ARC-AGI puzzle. Write ONLY a Python function, no explanations.
+{examples_str}
+Analysis:
+{analysis}
+Write a complete Python function that transforms any input grid to its output.
+The function MUST work correctly for ALL examples above.
+```python
+import numpy as np
+from collections import Counter
+def transform(grid: list[list[int]]) -> list[list[int]]:
+    grid = np.array(grid)
+"""
+# =============================================================================
+# 3. CODE EXTRACTION AND VERIFICATION
+# =============================================================================
+def extract_code(response: str) -> Optional[str]:
+    """Extract the source of a ``transform`` function from an LLM response.
+    Strategies, tried in order:
+      1. The first fenced block (```python fences first, then any ``` fence)
+         that contains 'def transform'.
+      2. The text starting at the first 'def transform', together with the
+         contiguous run of import statements directly above it, cut at the
+         next ``` fence if there is one.
+      3. The whole response, if it starts with 'import'.
+    Args:
+        response: Raw text returned by the model.
+    Returns:
+        The stripped code, or None when nothing code-like is found.
+    """
+    # Try ```python blocks, then untagged blocks
+    for pattern in (r'```python\s*(.*?)```', r'```\s*(.*?)```'):
+        for block in re.findall(pattern, response, re.DOTALL):
+            if 'def transform' in block:
+                return block.strip()
+    # Try finding def transform directly
+    idx = response.find('def transform')
+    if idx >= 0:
+        # Only whole-line import statements count. Searching for the last
+        # "import " substring would cut "from collections import Counter"
+        # down to "import Counter" and would also pick up prose.
+        import_stmt = re.compile(
+            r'(?:import\s+[\w.]+(?:\s+as\s+\w+)?(?:\s*,\s*[\w.]+(?:\s+as\s+\w+)?)*'
+            r'|from\s+[\w.]+\s+import\s+.+)\s*(?:#.*)?'
+        )
+        start = response.rfind('\n', 0, idx) + 1
+        if response[start:idx].strip():
+            # Other text shares the line with the def: start at the def itself.
+            start = idx
+        else:
+            # Walk upwards line by line, skipping blanks, while lines are imports.
+            pos = start
+            while pos > 0:
+                prev_start = response.rfind('\n', 0, pos - 1) + 1
+                line = response[prev_start:pos].strip()
+                if import_stmt.fullmatch(line):
+                    start = prev_start
+                elif line:
+                    break
+                pos = prev_start
+        code = response[start:]
+        # Trim at the next ``` fence, if any
+        end = code.find('```')
+        if end > 0:
+            code = code[:end]
+        return code.strip()
+    # No transform definition at all: accept a response that opens with an import.
+    stripped = response.strip()
+    if stripped.startswith('import'):
+        return stripped
+    return None
+def _call_with_time_limit(fn, timeout_s, *args):
+    """Call ``fn(*args)``, raising TimeoutError if it runs longer than ``timeout_s`` seconds.
+    The limit relies on ``signal.setitimer`` and SIGALRM, which are only usable
+    on Unix in the main thread; anywhere else the call runs without a limit.
+    """
+    import threading
+    enforce = (
+        timeout_s is not None and timeout_s > 0
+        and hasattr(signal, 'setitimer')
+        and threading.current_thread() is threading.main_thread()
+    )
+    if not enforce:
+        return fn(*args)
+    def _on_alarm(signum, frame):
+        raise TimeoutError(f"exceeded {timeout_s}s time limit")
+    previous = signal.signal(signal.SIGALRM, _on_alarm)
+    signal.setitimer(signal.ITIMER_REAL, timeout_s)
+    try:
+        return fn(*args)
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)  # cancel a still-pending alarm
+        # signal.signal() returns None for handlers not installed from Python.
+        signal.signal(signal.SIGALRM, signal.SIG_DFL if previous is None else previous)
+def verify_program(code: str, train_pairs: List[Dict], timeout_s: float = 10.0) -> bool:
+    """Execute a candidate program and check it against all training pairs.
+    SECURITY NOTE: ``code`` is model-generated and is run with ``exec`` in this
+    process without sandboxing; only run this in a disposable environment.
+    Args:
+        code: Python source expected to define ``transform(grid)``.
+        train_pairs: Dicts with 'input' and 'output' grids.
+        timeout_s: Time limit in seconds for defining the program and for each
+            ``transform`` call, so a non-terminating candidate cannot stall the
+            run. Not enforced where SIGALRM timers are unavailable.
+    Returns:
+        True only if the code defines ``transform`` and its result equals the
+        expected output for every pair. Any exception, timeout or SystemExit
+        raised by the candidate counts as failure.
+    """
+    namespace = {'np': np, 'numpy': np, 'Counter': Counter,
+                 'collections': __import__('collections')}
+    try:
+        _call_with_time_limit(exec, timeout_s, code, namespace)
+    except (Exception, SystemExit):
+        # SystemExit: generated code calling exit() must not end the whole run.
+        return False
+    if 'transform' not in namespace:
+        return False
+    transform_fn = namespace['transform']
+    for pair in train_pairs:
+        try:
+            inp = [row[:] for row in pair['input']]  # copy so candidates cannot mutate the task
+            result = _call_with_time_limit(transform_fn, timeout_s, inp)
+            if result is None:
+                return False
+            result_arr = np.array(result, dtype=int)
+            expected_arr = np.array(pair['output'], dtype=int)
+            # array_equal is False for differing shapes as well as differing values.
+            if not np.array_equal(result_arr, expected_arr):
+                return False
+        except (Exception, SystemExit):
+            return False
+    return True
+def apply_program(code: str, test_input: List[List[int]]) -> Optional[List[List[int]]]:
+    """Run a previously verified program on one test grid.
+    Args:
+        code: Python source defining ``transform(grid)``.
+        test_input: Input grid; a row-wise copy is handed to ``transform``.
+    Returns:
+        The result converted to nested lists of ints, or None if ``transform``
+        returns None or anything raises an Exception along the way.
+    """
+    env = {'np': np, 'numpy': np, 'Counter': Counter,
+           'collections': __import__('collections')}
+    try:
+        exec(code, env)
+        grid_copy = [row[:] for row in test_input]
+        produced = env['transform'](grid_copy)
+        if produced is None:
+            return None
+        as_ints = np.array(produced, dtype=int).tolist()
+        return [list(row) for row in as_ints]
+    except Exception:
+        # Best effort: a failing program simply yields no prediction.
+        return None
+# =============================================================================
+# 4. SYNTHESIS ENGINE
+# =============================================================================
+def synthesize_task(task: Dict, model: str = "qwen2.5-coder:32b",
+                    n_candidates: int = 8, verbose: bool = False) -> Optional[Tuple[str, str]]:
+    """Try to solve a task by sampling candidate programs from the LLM.
+    Up to ``n_candidates`` responses are requested. The first one that yields
+    code passing ``verify_program`` on all training pairs wins.
+    Args:
+        task: ARC task dict; its 'train' pairs are used for the prompt and for
+            verification.
+        model: Ollama model name.
+        n_candidates: Maximum number of LLM calls.
+        verbose: Print one line per candidate.
+    Returns:
+        ``(rule_name, code)`` on success, where ``rule_name`` encodes the
+        candidate number and the temperature actually sent to the model;
+        None if the task has no training pairs or no candidate verifies.
+    """
+    train_pairs = task.get('train', [])
+    if not train_pairs:
+        return None
+    prompt = build_prompt(task)
+    for i in range(n_candidates):
+        # First try is near-deterministic; later tries ramp up, capped at 1.0.
+        # Clamping here keeps the rule name in sync with the real temperature.
+        temp = 0.1 if i == 0 else min(0.5 + 0.1 * i, 1.0)
+        response = call_ollama(prompt, model=model, temperature=temp)
+        if response.startswith("ERROR:"):
+            if verbose:
+                print(f"    Candidate {i+1}: API error")
+            continue
+        code = extract_code(response)
+        if code is None:
+            if verbose:
+                print(f"    Candidate {i+1}: No code extracted")
+            continue
+        if verbose:
+            print(f"    Candidate {i+1}: {len(code)} chars", end="")
+        if verify_program(code, train_pairs):
+            if verbose:
+                print(" ✅")
+            return (f"llm_c{i+1}_t{temp:.1f}", code)
+        if verbose:
+            print(" ❌")
+    return None
+# =============================================================================
+# 5. MAIN RUNNER
+# =============================================================================
+def _write_results(path: str, payload: Dict) -> None:
+    """Write ``payload`` to ``path`` as indented JSON."""
+    with open(path, 'w', encoding='utf-8') as f:
+        json.dump(payload, f, indent=2)
+def main():
+    """Run LLM synthesis over all not-yet-solved ARC tasks and save the results.
+    Configuration is read from environment variables:
+      OLLAMA_MODEL    model to pull and query (default "qwen2.5-coder:32b")
+      N_CANDIDATES    LLM attempts per task (default 8)
+      ARC_DIR         directory with one ``<task_id>.json`` per task
+      ALREADY_SOLVED  JSON list of task ids to skip (optional file)
+      OUTPUT_FILE     where results are written (default "llm_results.json")
+    Progress is saved every 10 tasks and once more at the end. The Ollama
+    server started here is shut down even if the run fails part-way.
+    """
+    # --- Configuration ---
+    MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:32b")
+    # For smaller GPUs, use:
+    #   MODEL = "qwen2.5-coder:14b"   (fits T4 16GB)
+    #   MODEL = "qwen2.5-coder:7b"    (fits any GPU)
+    FALLBACK_MODEL = "qwen2.5-coder:7b"
+    N_CANDIDATES = int(os.environ.get("N_CANDIDATES", "8"))
+    ARC_DIR = os.environ.get("ARC_DIR", "arc_data/training")
+    ALREADY_SOLVED_FILE = os.environ.get("ALREADY_SOLVED", "already_solved.json")
+    OUTPUT_FILE = os.environ.get("OUTPUT_FILE", "llm_results.json")
+    print("=" * 60)
+    print("PEMF ARC-AGI — LLM Program Synthesis (Kaggle/Ollama)")
+    print("=" * 60)
+    print(f"Model: {MODEL}")
+    print(f"Candidates per task: {N_CANDIDATES}")
+    print(f"ARC data: {ARC_DIR}")
+    print()
+    # --- Install & start Ollama ---
+    try:
+        subprocess.run(["ollama", "--version"], capture_output=True, check=True)
+        print("Ollama already installed.")
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        install_ollama()
+    server = start_ollama()
+    try:
+        try:
+            pull_model(MODEL)
+        except Exception as e:
+            print(f"Failed to pull {MODEL}: {e}")
+            if MODEL == FALLBACK_MODEL:
+                raise  # retrying the same model would fail the same way
+            print("Trying smaller model...")
+            MODEL = FALLBACK_MODEL
+            pull_model(MODEL)
+        # --- Load already solved tasks ---
+        already_solved = set()
+        if os.path.exists(ALREADY_SOLVED_FILE):
+            with open(ALREADY_SOLVED_FILE, encoding='utf-8') as f:
+                already_solved = set(json.load(f))
+            print(f"Already solved (symbolic): {len(already_solved)} tasks")
+        # --- Load ARC tasks ---
+        import glob
+        task_files = sorted(glob.glob(os.path.join(ARC_DIR, "*.json")))
+        print(f"Total ARC tasks: {len(task_files)}")
+        if not task_files:
+            print(f"Warning: no task files found in {ARC_DIR}")
+        unsolved_files = []
+        for tf in task_files:
+            tid = Path(tf).stem
+            if tid not in already_solved:
+                unsolved_files.append((tid, tf))
+        print(f"Unsolved tasks to try: {len(unsolved_files)}")
+        print()
+        # --- Run synthesis ---
+        results = {}
+        solved = 0
+        total_time = 0
+        for idx, (tid, tf) in enumerate(unsolved_files):
+            with open(tf, encoding='utf-8') as f:
+                task = json.load(f)
+            print(f"[{idx+1:3d}/{len(unsolved_files)}] {tid}:", end=" ", flush=True)
+            start = time.time()
+            result = synthesize_task(task, model=MODEL, n_candidates=N_CANDIDATES, verbose=False)
+            elapsed = time.time() - start
+            total_time += elapsed
+            if result:
+                rule_name, code = result
+                solved += 1
+                # Apply to test pairs (an entry is None when the program fails on it)
+                test_outputs = [apply_program(code, test['input'])
+                                for test in task.get('test', [])]
+                results[tid] = {
+                    'status': 'solved',
+                    'rule': rule_name,
+                    'code': code,
+                    'test_outputs': test_outputs,
+                    'time_s': round(elapsed, 2),
+                }
+                print(f"✅ {rule_name} ({elapsed:.1f}s)")
+            else:
+                results[tid] = {
+                    'status': 'failed',
+                    'time_s': round(elapsed, 2),
+                }
+                print(f"❌ ({elapsed:.1f}s)")
+            # Save progress periodically
+            if (idx + 1) % 10 == 0:
+                _write_results(OUTPUT_FILE, {
+                    'model': MODEL,
+                    'n_candidates': N_CANDIDATES,
+                    'solved': solved,
+                    'attempted': idx + 1,
+                    'total_time_s': round(total_time, 1),
+                    'results': results,
+                })
+                print(f"  [Progress saved: {solved}/{idx+1} solved]")
+        # --- Final save ---
+        total_solved = len(already_solved) + solved
+        # Guard against an empty or missing ARC_DIR, which would divide by zero.
+        solve_pct = 100 * total_solved / len(task_files) if task_files else 0.0
+        _write_results(OUTPUT_FILE, {
+            'model': MODEL,
+            'n_candidates': N_CANDIDATES,
+            'solved': solved,
+            'attempted': len(unsolved_files),
+            'total_time_s': round(total_time, 1),
+            'already_solved_symbolic': len(already_solved),
+            'total_solved': total_solved,
+            'total_tasks': len(task_files),
+            'solve_rate': round(solve_pct, 2),
+            'results': results,
+        })
+        # --- Summary ---
+        print()
+        print("=" * 60)
+        print("FINAL RESULTS")
+        print("=" * 60)
+        print(f"LLM solved:        {solved}/{len(unsolved_files)} unsolved tasks")
+        print(f"Symbolic solved:   {len(already_solved)}")
+        print(f"TOTAL SOLVED:      {total_solved}/{len(task_files)} ({solve_pct:.1f}%)")
+        print(f"Total LLM time:    {total_time:.0f}s ({total_time/max(1,len(unsolved_files)):.1f}s/task)")
+        print(f"Results saved to:  {OUTPUT_FILE}")
+    finally:
+        # Cleanup: never leave the background server running, even on failure.
+        server.terminate()
+        try:
+            server.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            server.kill()
+# =============================================================================
+# 6. GENERATE already_solved.json FROM SYMBOLIC RESULTS
+# =============================================================================
+def generate_already_solved(summary_file: str, output_file: str = "already_solved.json"):
+    """Write the ids of solved tasks from a v4 summary file to ``output_file``.
+    Run this BEFORE running on Kaggle.
+    Args:
+        summary_file: JSON file whose 'results' list holds one record per task;
+            a record's 'task_id' is kept when its 'all_train_solved' is truthy.
+        output_file: Destination for the JSON list of kept task ids.
+    """
+    with open(summary_file) as src:
+        summary = json.load(src)
+    solved_ids = []
+    for record in summary['results']:
+        if record.get('all_train_solved'):
+            solved_ids.append(record['task_id'])
+    with open(output_file, 'w') as dst:
+        json.dump(solved_ids, dst)
+    print(f"Wrote {len(solved_ids)} solved task IDs to {output_file}")
+if __name__ == "__main__":
+    # "--generate-solved [SUMMARY]" builds already_solved.json; anything else runs the solver.
+    cli_args = sys.argv[1:]
+    if cli_args[:1] == ["--generate-solved"]:
+        summary_path = cli_args[1] if len(cli_args) >= 2 else "arc_results/summary_v4.json"
+        generate_already_solved(summary_path)
+    else:
+        main()