# File size: 14,996 Bytes

# feb08d1

"""
PEMF ARC-AGI — LLM Program Synthesis via Ollama (Kaggle Edition)
================================================================

Self-contained script for Kaggle GPU notebooks.
Pulls a model via Ollama, runs LLM synthesis on unsolved ARC tasks.

Usage on Kaggle:
  1. Enable GPU (T4 x2 or P100)
  2. Enable internet access
  3. Upload this file + arc_data/ + already_solved.json
  4. Run all cells

The script:
  - Installs Ollama
  - Pulls the model (qwen2.5-coder:32b or smaller)
  - Loads ARC tasks
  - For each unsolved task: generates Python transform(), verifies against training pairs
  - Saves results to llm_results.json
"""

import subprocess
import sys
import os
import json
import time
import re
import signal
import numpy as np
from typing import Dict, List, Optional, Tuple
from collections import Counter
from pathlib import Path


# =============================================================================
# 1. OLLAMA SETUP
# =============================================================================

def install_ollama():
    """Download and run the official Ollama install script.

    The installer is fetched with ``curl`` and piped into ``sh``, so a shell
    is required.  Output of the installer is captured rather than echoed.

    Raises:
        subprocess.CalledProcessError: if the shell pipeline exits non-zero.
    """
    installer_cmd = "curl -fsSL https://ollama.com/install.sh | sh"

    print("Installing Ollama...")
    subprocess.run(
        installer_cmd,
        shell=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    print("Ollama installed.")


def start_ollama(startup_timeout_s: float = 30.0):
    """Start ``ollama serve`` in the background and wait until it answers.

    Instead of sleeping for a fixed interval, the local HTTP endpoint is
    polled until it responds, so callers do not race a slow start-up (a
    failed ``ollama pull`` right after launch would otherwise look like a
    model problem).

    Args:
        startup_timeout_s: Maximum number of seconds to wait for the server
            to respond before giving up and returning anyway.

    Returns:
        The ``subprocess.Popen`` handle of the launched server process.  It
        is returned even when readiness could not be confirmed, so the caller
        can still terminate it.
    """
    import http.client
    import urllib.error
    import urllib.request

    print("Starting Ollama server...")
    proc = subprocess.Popen(
        ["ollama", "serve"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    ready = False
    deadline = time.monotonic() + startup_timeout_s
    while not ready and time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(
                "http://localhost:11434/api/version", timeout=2
            ):
                ready = True
        except urllib.error.HTTPError:
            # Any HTTP status means something is listening and answering.
            ready = True
        except (OSError, http.client.HTTPException):
            if proc.poll() is not None:
                # Our process already exited and nothing is answering on the
                # port; waiting longer cannot help.
                break
            time.sleep(0.5)

    if ready:
        print(f"Ollama server started (PID {proc.pid})")
    else:
        print(f"Warning: Ollama server (PID {proc.pid}) did not respond "
              f"within {startup_timeout_s:.0f}s")
    return proc


def pull_model(model_name: str):
    """Download *model_name* with ``ollama pull``.

    The command is given up to 1800 seconds to finish.

    Raises:
        RuntimeError: if ``ollama pull`` exits with a non-zero status.
        subprocess.TimeoutExpired: if the pull exceeds the time limit.
    """
    print(f"Pulling model {model_name}... (this may take several minutes)")

    pull_cmd = ["ollama", "pull", model_name]
    completed = subprocess.run(pull_cmd, capture_output=True, text=True, timeout=1800)

    if completed.returncode == 0:
        print(f"Model {model_name} ready.")
        return

    print(f"Pull failed: {completed.stderr}")
    raise RuntimeError(f"Failed to pull {model_name}")


def call_ollama(prompt: str, model: str = "qwen2.5-coder:32b",
                temperature: float = 0.7, timeout_s: int = 120) -> str:
    """Send a non-streaming generate request to the local Ollama server.

    Args:
        prompt: Text sent as the ``prompt`` field.
        model: Ollama model tag to generate with.
        temperature: Sampling temperature forwarded in ``options``.
        timeout_s: Timeout in seconds passed to ``urlopen``.

    Returns:
        The ``response`` field of the JSON reply (empty string if absent).
        Any failure while sending or decoding is reported in-band as a string
        of the form ``"ERROR: <exception>"`` rather than raised.
    """
    import urllib.request

    generation_options = {"temperature": temperature, "num_predict": 2048}
    body = json.dumps({
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": generation_options,
    }).encode('utf-8')

    request = urllib.request.Request(
        "http://localhost:11434/api/generate",
        data=body,
        headers={"Content-Type": "application/json"},
        method='POST',
    )

    try:
        with urllib.request.urlopen(request, timeout=timeout_s) as reply:
            raw = reply.read()
        return json.loads(raw.decode()).get('response', '')
    except Exception as e:
        return f"ERROR: {e}"


# =============================================================================
# 2. PROMPT BUILDING
# =============================================================================

def build_prompt(task: Dict) -> str:
    """Render the synthesis prompt for one ARC task.

    The prompt lists every training pair as JSON, adds a short analysis
    (whether input and output shapes match, the colors seen on each side and,
    when shapes differ, the per-pair height/width ratios) and ends with the
    opening lines of a ``transform`` function for the model to complete.

    Args:
        task: Task dict; only its ``'train'`` list of ``{'input', 'output'}``
            grids is read.

    Returns:
        The complete prompt text.
    """
    train_pairs = task.get('train', [])

    examples_str = "\n".join(
        f"Example {number}:\n"
        f"  Input:  {json.dumps(pair['input'])}\n"
        f"  Output: {json.dumps(pair['output'])}"
        for number, pair in enumerate(train_pairs, start=1)
    )

    # Basic analysis
    in_grids = [np.array(pair['input']) for pair in train_pairs]
    out_grids = [np.array(pair['output']) for pair in train_pairs]
    grid_pairs = list(zip(in_grids, out_grids))

    same_shape = all(src.shape == dst.shape for src, dst in grid_pairs)
    in_colors = sorted(set().union(*(np.unique(g).tolist() for g in in_grids)))
    out_colors = sorted(set().union(*(np.unique(g).tolist() for g in out_grids)))

    analysis_lines = [
        f"  Same input/output shape: {same_shape}",
        f"  Input colors: {in_colors}",
        f"  Output colors: {out_colors}",
    ]
    if not same_shape:
        ratios = [(dst.shape[0] / src.shape[0], dst.shape[1] / src.shape[1])
                  for src, dst in grid_pairs]
        analysis_lines.append(f"  Shape ratios (h,w): {ratios}")
    analysis = "".join(line + "\n" for line in analysis_lines)

    return f"""Solve this ARC-AGI puzzle. Write ONLY a Python function, no explanations.

{examples_str}

Analysis:
{analysis}
Write a complete Python function that transforms any input grid to its output.
The function MUST work correctly for ALL examples above.

```python
import numpy as np
from collections import Counter

def transform(grid: list[list[int]]) -> list[list[int]]:
    grid = np.array(grid)
"""


# =============================================================================
# 3. CODE EXTRACTION AND VERIFICATION
# =============================================================================

def _is_import_statement(line: str) -> bool:
    """Return True if *line* parses on its own as a Python import statement."""
    import ast

    try:
        tree = ast.parse(line.strip())
    except (SyntaxError, ValueError):
        return False
    return bool(tree.body) and all(
        isinstance(node, (ast.Import, ast.ImportFrom)) for node in tree.body
    )


def _code_start(text: str, def_idx: int) -> int:
    """Return the index where the code around ``def transform`` begins.

    Walks backwards line by line from the ``def`` and extends the start over
    the contiguous run of import statements directly above it (blank lines and
    ``#`` comments in between are tolerated).  Stops at the first line that is
    anything else, so surrounding prose is never pulled into the code.
    """
    line_start = text.rfind('\n', 0, def_idx) + 1
    if text[line_start:def_idx].strip():
        # The def shares its line with other text; nothing above belongs to it.
        return def_idx

    start = def_idx
    pos = line_start
    while pos > 0:
        prev_start = text.rfind('\n', 0, pos - 1) + 1
        line = text[prev_start:pos].strip()
        if _is_import_statement(line):
            start = prev_start
        elif line and not line.startswith('#'):
            break
        pos = prev_start
    return start


def extract_code(response: str) -> Optional[str]:
    """Extract the source of a ``transform`` function from an LLM response.

    Strategies, in order:

    1. The first fenced block (```` ```python ```` preferred, then any
       ```` ``` ```` fence) that contains ``def transform``.
    2. Unfenced text starting at ``def transform``, extended upwards over the
       import statements immediately preceding it, and cut at the next
       ```` ``` ```` if one follows.  Whole lines are inspected, so a
       ``from x import y`` line is kept intact and prose that merely contains
       the word "import" is not mistaken for code.
    3. A response that, once stripped, starts with ``import`` is returned
       as-is.

    Args:
        response: Raw text returned by the model.

    Returns:
        The extracted code with surrounding whitespace stripped, or ``None``
        when nothing resembling code was found.
    """
    # Try ```python blocks first, then bare ``` blocks.
    for pattern in (r'```python\s*(.*?)```', r'```\s*(.*?)```'):
        for block in re.findall(pattern, response, re.DOTALL):
            if 'def transform' in block:
                return block.strip()

    # Try finding def transform directly (no complete fence around it).
    def_idx = response.find('def transform')
    if def_idx >= 0:
        code = response[_code_start(response, def_idx):]
        # Trim at a trailing fence, e.g. when only the closing ``` is present.
        fence = code.find('```')
        if fence > 0:
            code = code[:fence]
        return code.strip()

    # No 'def transform' anywhere: fall back to returning a response that at
    # least looks like code because it starts with an import.
    stripped = response.strip()
    if stripped.startswith('import'):
        return stripped

    return None


class _CandidateTimeout(BaseException):
    """Raised from the SIGALRM handler when candidate code runs too long.

    Derives from ``BaseException`` so that an ``except Exception`` inside the
    generated code cannot swallow it.
    """


def _run_limited(timeout_s, fn, *args):
    """Call ``fn(*args)``, aborting it with ``_CandidateTimeout`` after *timeout_s*.

    The limit relies on ``SIGALRM``/``setitimer``, which only exist on Unix
    and can only be armed from the main thread; when unavailable (or when
    *timeout_s* is ``None`` or non-positive) the call runs without a limit.
    """
    import threading

    can_alarm = (
        timeout_s is not None
        and timeout_s > 0
        and hasattr(signal, 'SIGALRM')
        and hasattr(signal, 'setitimer')
        and threading.current_thread() is threading.main_thread()
    )
    if not can_alarm:
        return fn(*args)

    def _on_alarm(signum, frame):
        raise _CandidateTimeout()

    previous = signal.signal(signal.SIGALRM, _on_alarm)
    signal.setitimer(signal.ITIMER_REAL, timeout_s)
    try:
        return fn(*args)
    finally:
        # Disarm first so the alarm cannot fire after the handler is restored.
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM,
                      previous if previous is not None else signal.SIG_DFL)


def verify_program(code: str, train_pairs: List[Dict],
                   timeout_s: float = 10.0) -> bool:
    """Execute candidate code and check it against all training pairs.

    The code is exec'd in a namespace pre-populated with ``np``, ``numpy``,
    ``Counter`` and ``collections``; it must define ``transform``.  Each
    training input is copied, passed to ``transform`` and the result, cast to
    an integer array, must equal the expected output exactly.

    NOTE(security): this exec's model-generated code in-process with no
    sandbox.  The timeout only guards against runaway loops; it is not an
    isolation mechanism.

    Args:
        code: Python source expected to define ``transform(grid)``.
        train_pairs: Dicts with ``'input'`` and ``'output'`` grids.
        timeout_s: Time limit in seconds applied separately to defining the
            program and to each ``transform`` call.  ``None`` or a
            non-positive value disables the limit.

    Returns:
        ``True`` only if every pair is reproduced; ``False`` on any mismatch,
        exception, ``sys.exit()`` attempt or timeout in the candidate.
    """
    namespace = {'np': np, 'numpy': np, 'Counter': Counter,
                 'collections': __import__('collections')}

    try:
        _run_limited(timeout_s, exec, code, namespace)

        transform_fn = namespace.get('transform')
        if transform_fn is None:
            return False

        for pair in train_pairs:
            inp = [row[:] for row in pair['input']]  # copy so candidates cannot mutate the task
            result = _run_limited(timeout_s, transform_fn, inp)
            if result is None:
                return False
            result_arr = np.array(result, dtype=int)
            expected_arr = np.array(pair['output'], dtype=int)
            # array_equal is False for differing shapes as well as values.
            if not np.array_equal(result_arr, expected_arr):
                return False
    except (Exception, SystemExit, _CandidateTimeout):
        # SystemExit: a candidate calling exit() must not kill the runner.
        return False

    return True


def apply_program(code: str, test_input: List[List[int]]) -> Optional[List[List[int]]]:
    """Run a candidate program on one test grid.

    The code is exec'd in a fresh namespace exposing ``np``, ``numpy``,
    ``Counter`` and ``collections``; its ``transform`` is then called on a
    row-wise copy of *test_input*.

    Returns:
        The result cast to integers as a list of lists, or ``None`` when the
        program raises, defines no ``transform``, returns ``None`` or returns
        something that cannot be converted to a list of rows.
    """
    env = dict(np=np, numpy=np, Counter=Counter,
               collections=__import__('collections'))
    try:
        exec(code, env)
        grid_copy = [row[:] for row in test_input]
        predicted = env['transform'](grid_copy)
        if predicted is None:
            return None
        as_ints = np.array(predicted, dtype=int).tolist()
        return [list(row) for row in as_ints]
    except Exception:
        return None


# =============================================================================
# 4. SYNTHESIS ENGINE
# =============================================================================

def synthesize_task(task: Dict, model: str = "qwen2.5-coder:32b",
                    n_candidates: int = 8, verbose: bool = False) -> Optional[Tuple[str, str]]:
    """Try to solve a task by sampling candidate programs from the LLM.

    Up to *n_candidates* responses are requested for the same prompt.  The
    first one whose extracted code passes ``verify_program`` on all training
    pairs wins.

    Args:
        task: ARC task dict; its ``'train'`` pairs drive prompt and checking.
        model: Ollama model tag to sample from.
        n_candidates: Maximum number of responses to request.
        verbose: When true, print a one-line status per candidate.

    Returns:
        ``(rule_name, code)`` for the first verified candidate, where
        ``rule_name`` encodes the candidate number and the temperature that
        was actually sent to the model; ``None`` if the task has no training
        pairs or no candidate verified.
    """
    train_pairs = task.get('train', [])
    if not train_pairs:
        return None

    prompt = build_prompt(task)

    for i in range(n_candidates):
        # First try low temperature, then ramp up.  The cap is applied here,
        # once, so the temperature in the rule name is the one really used.
        temp = 0.1 if i == 0 else min(0.5 + 0.1 * i, 1.0)
        response = call_ollama(prompt, model=model, temperature=temp)

        if response.startswith("ERROR:"):
            if verbose:
                print(f"    Candidate {i+1}: API error")
            continue

        code = extract_code(response)
        if code is None:
            if verbose:
                print(f"    Candidate {i+1}: No code extracted")
            continue

        if verbose:
            print(f"    Candidate {i+1}: {len(code)} chars", end="")

        if verify_program(code, train_pairs):
            if verbose:
                print(" ✅")
            return (f"llm_c{i+1}_t{temp:.1f}", code)

        if verbose:
            print(" ❌")

    return None


# =============================================================================
# 5. MAIN RUNNER
# =============================================================================

def _write_results(path, payload):
    """Serialize *payload* as indented JSON to *path*, replacing the file."""
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2)


def _synthesize_unsolved(model, n_candidates, arc_dir, already_solved_file, output_file):
    """Pull the model, run synthesis on every unsolved task and save results.

    Expects the Ollama server to be running already.  Progress is written to
    *output_file* every 10 tasks and once more, with summary fields, at the
    end.
    """
    import glob

    try:
        pull_model(model)
    except Exception as e:
        print(f"Failed to pull {model}: {e}")
        print("Trying smaller model...")
        model = "qwen2.5-coder:7b"
        pull_model(model)

    # --- Load already solved tasks ---
    already_solved = set()
    if os.path.exists(already_solved_file):
        with open(already_solved_file) as f:
            already_solved = set(json.load(f))
        print(f"Already solved (symbolic): {len(already_solved)} tasks")

    # --- Load ARC tasks ---
    task_files = sorted(glob.glob(os.path.join(arc_dir, "*.json")))
    print(f"Total ARC tasks: {len(task_files)}")

    unsolved_files = []
    for tf in task_files:
        tid = os.path.basename(tf).replace('.json', '')
        if tid not in already_solved:
            unsolved_files.append((tid, tf))
    print(f"Unsolved tasks to try: {len(unsolved_files)}")
    print()

    # --- Run synthesis ---
    results = {}
    solved = 0
    total_time = 0

    for idx, (tid, tf) in enumerate(unsolved_files):
        with open(tf) as f:
            task = json.load(f)

        print(f"[{idx+1:3d}/{len(unsolved_files)}] {tid}:", end=" ", flush=True)
        start = time.time()

        result = synthesize_task(task, model=model, n_candidates=n_candidates, verbose=False)
        elapsed = time.time() - start
        total_time += elapsed

        if result:
            rule_name, code = result
            solved += 1

            # Apply the verified program to every test input.
            test_outputs = [apply_program(code, test['input'])
                            for test in task.get('test', [])]

            results[tid] = {
                'status': 'solved',
                'rule': rule_name,
                'code': code,
                'test_outputs': test_outputs,
                'time_s': round(elapsed, 2),
            }
            print(f"✅ {rule_name} ({elapsed:.1f}s)")
        else:
            results[tid] = {
                'status': 'failed',
                'time_s': round(elapsed, 2),
            }
            print(f"❌ ({elapsed:.1f}s)")

        # Save progress periodically
        if (idx + 1) % 10 == 0:
            _write_results(output_file, {
                'model': model,
                'n_candidates': n_candidates,
                'solved': solved,
                'attempted': idx + 1,
                'total_time_s': round(total_time, 1),
                'results': results,
            })
            print(f"  [Progress saved: {solved}/{idx+1} solved]")

    total_solved = len(already_solved) + solved
    # An empty or wrong ARC_DIR yields no task files; report 0% rather than
    # dividing by zero.
    solve_pct = 100 * total_solved / len(task_files) if task_files else 0.0

    # --- Final save ---
    _write_results(output_file, {
        'model': model,
        'n_candidates': n_candidates,
        'solved': solved,
        'attempted': len(unsolved_files),
        'total_time_s': round(total_time, 1),
        'already_solved_symbolic': len(already_solved),
        'total_solved': total_solved,
        'total_tasks': len(task_files),
        'solve_rate': round(solve_pct, 2),
        'results': results,
    })

    # --- Summary ---
    print()
    print("=" * 60)
    print("FINAL RESULTS")
    print("=" * 60)
    print(f"LLM solved:        {solved}/{len(unsolved_files)} unsolved tasks")
    print(f"Symbolic solved:   {len(already_solved)}")
    print(f"TOTAL SOLVED:      {total_solved}/{len(task_files)} ({solve_pct:.1f}%)")
    print(f"Total LLM time:    {total_time:.0f}s ({total_time/max(1,len(unsolved_files)):.1f}s/task)")
    print(f"Results saved to:  {output_file}")


def main():
    """Run LLM synthesis over all unsolved ARC tasks.

    Configuration comes from environment variables, each with a default:
    ``OLLAMA_MODEL``, ``N_CANDIDATES``, ``ARC_DIR``, ``ALREADY_SOLVED`` and
    ``OUTPUT_FILE``.  Ollama is installed if the ``ollama`` binary is missing,
    the server is started, and it is terminated again when the run ends,
    including when the run fails part-way.
    """
    # --- Configuration ---
    MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:32b")
    # For smaller GPUs, use:
    #   MODEL = "qwen2.5-coder:14b"   (fits T4 16GB)
    #   MODEL = "qwen2.5-coder:7b"    (fits any GPU)

    N_CANDIDATES = int(os.environ.get("N_CANDIDATES", "8"))
    ARC_DIR = os.environ.get("ARC_DIR", "arc_data/training")
    ALREADY_SOLVED_FILE = os.environ.get("ALREADY_SOLVED", "already_solved.json")
    OUTPUT_FILE = os.environ.get("OUTPUT_FILE", "llm_results.json")

    print("=" * 60)
    print("PEMF ARC-AGI — LLM Program Synthesis (Kaggle/Ollama)")
    print("=" * 60)
    print(f"Model: {MODEL}")
    print(f"Candidates per task: {N_CANDIDATES}")
    print(f"ARC data: {ARC_DIR}")
    print()

    # --- Install & start Ollama ---
    try:
        subprocess.run(["ollama", "--version"], capture_output=True, check=True)
        print("Ollama already installed.")
    except (FileNotFoundError, subprocess.CalledProcessError):
        install_ollama()

    server = start_ollama()
    try:
        _synthesize_unsolved(MODEL, N_CANDIDATES, ARC_DIR,
                             ALREADY_SOLVED_FILE, OUTPUT_FILE)
    finally:
        # Cleanup: never leave the background server running, even on error.
        server.terminate()


# =============================================================================
# 6. GENERATE already_solved.json FROM SYMBOLIC RESULTS
# =============================================================================

def generate_already_solved(summary_file: str, output_file: str = "already_solved.json"):
    """Write the IDs of symbolically solved tasks to a JSON list.

    Reads *summary_file*, keeps the ``task_id`` of every entry in its
    ``'results'`` list whose ``'all_train_solved'`` value is truthy, and dumps
    those IDs to *output_file*.  Meant to be run before the Kaggle job.
    """
    with open(summary_file) as src:
        summary = json.load(src)

    task_ids = []
    for record in summary['results']:
        if record.get('all_train_solved'):
            task_ids.append(record['task_id'])

    with open(output_file, 'w') as dst:
        json.dump(task_ids, dst)

    print(f"Wrote {len(task_ids)} solved task IDs to {output_file}")


if __name__ == "__main__":
    # `--generate-solved [SUMMARY]` only writes already_solved.json;
    # any other invocation runs the full synthesis pipeline.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == "--generate-solved":
        summary = cli_args[1] if len(cli_args) > 1 else "arc_results/summary_v4.json"
        generate_already_solved(summary)
    else:
        main()