ARC-AGI / pemf /scripts /kaggle_llm_solver.py

Roger MT

move files into pemf folder

feb08d1 5 days ago

15 kB

	"""
	PEMF ARC-AGI — LLM Program Synthesis via Ollama (Kaggle Edition)
	================================================================

	Self-contained script for Kaggle GPU notebooks.
	Pulls a model via Ollama, runs LLM synthesis on unsolved ARC tasks.

	Usage on Kaggle:
	1. Enable GPU (T4 x2 or P100)
	2. Enable internet access
	3. Upload this file + arc_data/ + already_solved.json
	4. Run all cells

	The script:
	- Installs Ollama
	- Pulls the model (qwen2.5-coder:32b or smaller)
	- Loads ARC tasks
	- For each unsolved task: generates Python transform(), verifies against training pairs
	- Saves results to llm_results.json
	"""

	import subprocess
	import sys
	import os
	import json
	import time
	import re
	import signal
	import numpy as np
	from typing import Dict, List, Optional, Tuple
	from collections import Counter
	from pathlib import Path


	# =============================================================================
	# 1. OLLAMA SETUP
	# =============================================================================

	def install_ollama():
	    """Install Ollama on Kaggle/Linux using the official install script.

	    The script is downloaded with curl and piped into ``sh``.

	    Raises:
	        subprocess.CalledProcessError: if the install pipeline exits non-zero.
	    """
	    print("Installing Ollama...")
	    try:
	        # The pipe has to be a real shell pipe: an escaped "\|" hands a literal
	        # "|" to curl as an argument and the install script never runs.
	        subprocess.run("curl -fsSL https://ollama.com/install.sh | sh",
	                       shell=True, check=True, capture_output=True)
	    except subprocess.CalledProcessError as exc:
	        # Output is captured, so show it before propagating the failure.
	        stderr = exc.stderr.decode(errors="replace") if exc.stderr else ""
	        print(f"Ollama install failed: {stderr}")
	        raise
	    print("Ollama installed.")


	def start_ollama():
	    """Start the Ollama server in the background and wait until it answers.

	    Returns:
	        subprocess.Popen: handle of the spawned ``ollama serve`` process, so the
	        caller can terminate it later.
	    """
	    import http.client
	    import urllib.request

	    print("Starting Ollama server...")
	    proc = subprocess.Popen(
	        ["ollama", "serve"],
	        stdout=subprocess.DEVNULL,
	        stderr=subprocess.DEVNULL,
	    )

	    # A fixed sleep is either too short (the next command fails to connect) or
	    # wasted time; poll the HTTP API until it responds, for at most 30 seconds.
	    deadline = time.time() + 30
	    while time.time() < deadline:
	        try:
	            with urllib.request.urlopen("http://localhost:11434/api/version", timeout=2):
	                break
	        except (OSError, http.client.HTTPException):
	            # Not listening yet (URLError is an OSError subclass).
	            time.sleep(0.5)
	    else:
	        print("Warning: Ollama server did not respond within 30s")

	    print(f"Ollama server started (PID {proc.pid})")
	    return proc


	def pull_model(model_name: str):
	    """Download ``model_name`` with ``ollama pull``.

	    The pull is allowed at most 1800 seconds. A non-zero exit status is
	    reported on stdout and converted into a ``RuntimeError``.
	    """
	    print(f"Pulling model {model_name}... (this may take several minutes)")
	    pull_cmd = ["ollama", "pull", model_name]
	    completed = subprocess.run(pull_cmd, capture_output=True, text=True, timeout=1800)

	    if completed.returncode == 0:
	        print(f"Model {model_name} ready.")
	        return

	    print(f"Pull failed: {completed.stderr}")
	    raise RuntimeError(f"Failed to pull {model_name}")


	def call_ollama(prompt: str, model: str = "qwen2.5-coder:32b",
	                temperature: float = 0.7, timeout_s: int = 120) -> str:
	    """Send ``prompt`` to the local Ollama generate endpoint.

	    Returns the ``response`` field of the JSON reply (empty string when the
	    field is absent). Any failure is reported as a string starting with
	    ``"ERROR: "`` instead of raising.
	    """
	    import urllib.request

	    generation_options = {"temperature": temperature, "num_predict": 2048}
	    body = json.dumps({
	        "model": model,
	        "prompt": prompt,
	        "stream": False,
	        "options": generation_options,
	    }).encode('utf-8')

	    request = urllib.request.Request(
	        "http://localhost:11434/api/generate",
	        data=body,
	        headers={"Content-Type": "application/json"},
	        method='POST',
	    )

	    try:
	        with urllib.request.urlopen(request, timeout=timeout_s) as reply:
	            parsed = json.loads(reply.read().decode())
	            return parsed.get('response', '')
	    except Exception as e:
	        return f"ERROR: {e}"


	# =============================================================================
	# 2. PROMPT BUILDING
	# =============================================================================

	def build_prompt(task: Dict) -> str:
	    """Build the code-generation prompt for one ARC task.

	    Args:
	        task: ARC task dict; its ``'train'`` list holds pairs with ``'input'``
	            and ``'output'`` grids.

	    Returns:
	        The prompt text: the training examples as JSON, a short shape/colour
	        analysis, and the opening of a fenced Python block containing the
	        start of ``transform`` for the model to complete.
	    """
	    train_pairs = task.get('train', [])

	    examples_str = "\n".join(
	        f"Example {i+1}:\n"
	        f" Input: {json.dumps(pair['input'])}\n"
	        f" Output: {json.dumps(pair['output'])}"
	        for i, pair in enumerate(train_pairs)
	    )

	    # Basic analysis
	    inputs = [np.array(p['input']) for p in train_pairs]
	    outputs = [np.array(p['output']) for p in train_pairs]
	    same_shape = all(i.shape == o.shape for i, o in zip(inputs, outputs))
	    in_colors = sorted({c for i in inputs for c in np.unique(i).tolist()})
	    out_colors = sorted({c for o in outputs for c in np.unique(o).tolist()})

	    analysis = f" Same input/output shape: {same_shape}\n"
	    analysis += f" Input colors: {in_colors}\n"
	    analysis += f" Output colors: {out_colors}\n"
	    if not same_shape:
	        ratios = [(o.shape[0]/i.shape[0], o.shape[1]/i.shape[1])
	                  for i, o in zip(inputs, outputs)]
	        analysis += f" Shape ratios (h,w): {ratios}\n"

	    # The prompt is assembled from explicit lines so the indentation of this
	    # source file can never leak into it, and the function stub shown to the
	    # model stays valid Python (its body line must be indented).
	    prompt_lines = [
	        "Solve this ARC-AGI puzzle. Write ONLY a Python function, no explanations.",
	        "",
	        examples_str,
	        "",
	        "Analysis:",
	        analysis,  # already ends with a newline, which leaves one blank line
	        "Write a complete Python function that transforms any input grid to its output.",
	        "The function MUST work correctly for ALL examples above.",
	        "",
	        "```python",
	        "import numpy as np",
	        "from collections import Counter",
	        "",
	        "def transform(grid: list[list[int]]) -> list[list[int]]:",
	        "    grid = np.array(grid)",
	        "",  # trailing newline
	    ]
	    return "\n".join(prompt_lines)


	# =============================================================================
	# 3. CODE EXTRACTION AND VERIFICATION
	# =============================================================================

	def _import_block_start(prefix: str) -> int:
	    """Return the offset in ``prefix`` where the run of import lines ending it begins.

	    Walks backwards over the lines directly preceding the function definition,
	    skipping blank lines and accepting ``import ...`` / ``from ... import ...``
	    lines. Returns ``len(prefix)`` when no import line directly precedes it.
	    """
	    start = len(prefix)
	    pos = len(prefix)
	    for line in reversed(prefix.splitlines(keepends=True)):
	        pos -= len(line)
	        text = line.strip()
	        if not text:
	            continue
	        is_import = text.startswith('import ') or (
	            text.startswith('from ') and ' import ' in text)
	        if not is_import:
	            break
	        start = pos
	    return start


	def extract_code(response: str) -> Optional[str]:
	    """Extract the Python source defining ``transform`` from an LLM response.

	    Tries, in order: a fenced code block containing ``def transform``; the bare
	    definition together with the import lines directly above it; a response
	    that simply starts with an import.

	    Returns:
	        The extracted source, stripped of surrounding whitespace, or ``None``
	        when nothing that looks like code is found.
	    """
	    # Fenced blocks: lazily capture everything between the fences.
	    for pattern in (r'```python\s*(.*?)```', r'```\s*(.*?)```'):
	        for match in re.findall(pattern, response, re.DOTALL):
	            if 'def transform' in match:
	                return match.strip()

	    # Bare definition, e.g. an unterminated fence or no fence at all.
	    idx = response.find('def transform')
	    if idx >= 0:
	        # Keep the whole import run above the function; starting at the last
	        # "import " substring would cut "from x import y" down to "import y".
	        code = response[_import_block_start(response[:idx]):]
	        end = code.find('```')
	        if end > 0:
	            code = code[:end]
	        return code.strip()

	    # No "def transform" anywhere: only a response opening with an import is
	    # still passed on as a candidate.
	    stripped = response.strip()
	    if stripped.startswith('import'):
	        return stripped

	    return None


	class _ProgramTimeout(BaseException):
	    """Raised by the alarm handler when generated code exceeds its time budget.

	    Derives from BaseException so that a blanket ``except Exception`` inside
	    the generated code cannot swallow it.
	    """


	def _exec_namespace() -> Dict:
	    """Return a fresh globals dict holding the names the prompt stub imports."""
	    return {'np': np, 'numpy': np, 'Counter': Counter,
	            'collections': __import__('collections')}


	def _call_with_time_limit(timeout_s, fn, *args):
	    """Call ``fn(*args)``, raising ``_ProgramTimeout`` after ``timeout_s`` seconds.

	    The limit uses SIGALRM, which only exists on Unix and can only be installed
	    from the main thread; when it is unavailable (or ``timeout_s`` is falsy)
	    the call runs without a limit.
	    """
	    import threading

	    can_limit = (
	        bool(timeout_s) and timeout_s > 0
	        and hasattr(signal, 'SIGALRM') and hasattr(signal, 'setitimer')
	        and threading.current_thread() is threading.main_thread()
	    )
	    if not can_limit:
	        return fn(*args)

	    def _on_alarm(signum, frame):
	        raise _ProgramTimeout()

	    previous = signal.signal(signal.SIGALRM, _on_alarm)
	    signal.setitimer(signal.ITIMER_REAL, timeout_s)
	    try:
	        return fn(*args)
	    finally:
	        # Always disarm the timer and restore whatever handler was installed.
	        signal.setitimer(signal.ITIMER_REAL, 0)
	        signal.signal(signal.SIGALRM, previous)


	def verify_program(code: str, train_pairs: List[Dict], timeout_s: float = 10.0) -> bool:
	    """Execute a candidate program and check it against all training pairs.

	    Args:
	        code: Python source expected to define ``transform(grid)``.
	        train_pairs: dicts with ``'input'`` and ``'output'`` grids.
	        timeout_s: time budget in seconds for defining the program and for
	            each ``transform`` call.

	    Returns:
	        True only if ``transform`` reproduces every expected output exactly;
	        False on any mismatch, exception, missing ``transform`` or timeout.
	    """
	    # NOTE(review): exec() runs model-generated code with full interpreter
	    # privileges; only run this in a disposable environment.
	    namespace = _exec_namespace()

	    try:
	        _call_with_time_limit(timeout_s, exec, code, namespace)

	        if 'transform' not in namespace:
	            return False
	        transform_fn = namespace['transform']

	        for pair in train_pairs:
	            inp = [row[:] for row in pair['input']]  # copy so the program cannot mutate the task
	            result = _call_with_time_limit(timeout_s, transform_fn, inp)
	            if result is None:
	                return False
	            result_arr = np.array(result, dtype=int)
	            expected_arr = np.array(pair['output'], dtype=int)
	            if result_arr.shape != expected_arr.shape:
	                return False
	            if not np.array_equal(result_arr, expected_arr):
	                return False
	    except (Exception, _ProgramTimeout):
	        return False

	    return True


	def apply_program(code: str, test_input: List[List[int]],
	                  timeout_s: float = 10.0) -> Optional[List[List[int]]]:
	    """Apply a verified program to a test input.

	    Args:
	        code: Python source defining ``transform(grid)``.
	        test_input: the test grid; a copy of its rows is passed to the program.
	        timeout_s: time budget in seconds for defining the program and for the
	            ``transform`` call.

	    Returns:
	        The output grid as nested lists of ints, or ``None`` if the program
	        returns ``None``, raises, or times out (deliberate best effort).
	    """
	    namespace = _exec_namespace()
	    try:
	        _call_with_time_limit(timeout_s, exec, code, namespace)
	        result = _call_with_time_limit(
	            timeout_s, namespace['transform'], [row[:] for row in test_input])
	        if result is None:
	            return None
	        return [list(row) for row in np.array(result, dtype=int).tolist()]
	    except (Exception, _ProgramTimeout):
	        # Best effort: a failing program simply yields no prediction.
	        return None


	# =============================================================================
	# 4. SYNTHESIS ENGINE
	# =============================================================================

	def synthesize_task(task: Dict, model: str = "qwen2.5-coder:32b",
	                    n_candidates: int = 8, verbose: bool = False) -> Optional[Tuple[str, str]]:
	    """Try to solve a task by sampling candidate programs from the LLM.

	    Args:
	        task: ARC task dict with a ``'train'`` list of input/output pairs.
	        model: Ollama model name.
	        n_candidates: maximum number of programs to sample.
	        verbose: print one status line per candidate.

	    Returns:
	        ``(rule_name, code)`` for the first candidate that reproduces every
	        training pair, or ``None`` if there are no training pairs or no
	        candidate verifies.
	    """
	    train_pairs = task.get('train', [])
	    if not train_pairs:
	        return None

	    prompt = build_prompt(task)

	    for i in range(n_candidates):
	        # First try at low temperature, then increase. Clamp here, once, so the
	        # value sent to the model and the value recorded in the rule name agree.
	        temp = 0.1 if i == 0 else min(0.5 + 0.1 * i, 1.0)
	        response = call_ollama(prompt, model=model, temperature=temp)

	        if response.startswith("ERROR:"):
	            if verbose:
	                print(f" Candidate {i+1}: API error")
	            continue

	        code = extract_code(response)
	        if code is None:
	            if verbose:
	                print(f" Candidate {i+1}: No code extracted")
	            continue

	        if verbose:
	            print(f" Candidate {i+1}: {len(code)} chars", end="")

	        if verify_program(code, train_pairs):
	            if verbose:
	                print(" ✅")
	            return (f"llm_c{i+1}_t{temp:.1f}", code)

	        if verbose:
	            print(" ❌")

	    return None


	# =============================================================================
	# 5. MAIN RUNNER
	# =============================================================================

	def _write_json(path: str, payload: Dict) -> None:
	    """Write ``payload`` to ``path`` as indented JSON, replacing any existing file."""
	    with open(path, 'w') as f:
	        json.dump(payload, f, indent=2)


	def main():
	    """Run LLM synthesis over every ARC task that is not already solved.

	    Configuration comes from environment variables (defaults in parentheses):
	    OLLAMA_MODEL (qwen2.5-coder:32b), N_CANDIDATES (8),
	    ARC_DIR (arc_data/training), ALREADY_SOLVED (already_solved.json),
	    OUTPUT_FILE (llm_results.json).

	    Results are written to OUTPUT_FILE every 10 tasks and once more at the end.
	    The Ollama server started here is terminated even if the run fails.
	    """
	    import glob

	    # --- Configuration ---
	    MODEL = os.environ.get("OLLAMA_MODEL", "qwen2.5-coder:32b")
	    # For smaller GPUs, use:
	    # MODEL = "qwen2.5-coder:14b" (fits T4 16GB)
	    # MODEL = "qwen2.5-coder:7b" (fits any GPU)

	    N_CANDIDATES = int(os.environ.get("N_CANDIDATES", "8"))
	    ARC_DIR = os.environ.get("ARC_DIR", "arc_data/training")
	    ALREADY_SOLVED_FILE = os.environ.get("ALREADY_SOLVED", "already_solved.json")
	    OUTPUT_FILE = os.environ.get("OUTPUT_FILE", "llm_results.json")

	    print("=" * 60)
	    print("PEMF ARC-AGI — LLM Program Synthesis (Kaggle/Ollama)")
	    print("=" * 60)
	    print(f"Model: {MODEL}")
	    print(f"Candidates per task: {N_CANDIDATES}")
	    print(f"ARC data: {ARC_DIR}")
	    print()

	    # --- Install & start Ollama ---
	    try:
	        subprocess.run(["ollama", "--version"], capture_output=True, check=True)
	        print("Ollama already installed.")
	    except (FileNotFoundError, subprocess.CalledProcessError):
	        install_ollama()

	    server = start_ollama()

	    # Everything after the server is started runs under try/finally so the
	    # background process is not left behind when a later step raises.
	    try:
	        try:
	            pull_model(MODEL)
	        except Exception as e:
	            print(f"Failed to pull {MODEL}: {e}")
	            print("Trying smaller model...")
	            MODEL = "qwen2.5-coder:7b"
	            pull_model(MODEL)

	        # --- Load already solved tasks ---
	        already_solved = set()
	        if os.path.exists(ALREADY_SOLVED_FILE):
	            with open(ALREADY_SOLVED_FILE) as f:
	                already_solved = set(json.load(f))
	            print(f"Already solved (symbolic): {len(already_solved)} tasks")

	        # --- Load ARC tasks ---
	        task_files = sorted(glob.glob(os.path.join(ARC_DIR, "*.json")))
	        print(f"Total ARC tasks: {len(task_files)}")

	        unsolved_files = []
	        for tf in task_files:
	            tid = Path(tf).stem  # task id is the file name without ".json"
	            if tid not in already_solved:
	                unsolved_files.append((tid, tf))
	        print(f"Unsolved tasks to try: {len(unsolved_files)}")
	        print()

	        # --- Run synthesis ---
	        results = {}
	        solved = 0
	        total_time = 0

	        for idx, (tid, tf) in enumerate(unsolved_files):
	            with open(tf) as f:
	                task = json.load(f)

	            print(f"[{idx+1:3d}/{len(unsolved_files)}] {tid}:", end=" ", flush=True)
	            start = time.time()

	            result = synthesize_task(task, model=MODEL, n_candidates=N_CANDIDATES, verbose=False)
	            elapsed = time.time() - start
	            total_time += elapsed

	            if result:
	                rule_name, code = result
	                solved += 1

	                # Apply to test pairs (an entry is None when the program fails)
	                test_outputs = [apply_program(code, test['input'])
	                                for test in task.get('test', [])]

	                results[tid] = {
	                    'status': 'solved',
	                    'rule': rule_name,
	                    'code': code,
	                    'test_outputs': test_outputs,
	                    'time_s': round(elapsed, 2),
	                }
	                print(f"✅ {rule_name} ({elapsed:.1f}s)")
	            else:
	                results[tid] = {
	                    'status': 'failed',
	                    'time_s': round(elapsed, 2),
	                }
	                print(f"❌ ({elapsed:.1f}s)")

	            # Save progress periodically
	            if (idx + 1) % 10 == 0:
	                _write_json(OUTPUT_FILE, {
	                    'model': MODEL,
	                    'n_candidates': N_CANDIDATES,
	                    'solved': solved,
	                    'attempted': idx + 1,
	                    'total_time_s': round(total_time, 1),
	                    'results': results,
	                })
	                print(f" [Progress saved: {solved}/{idx+1} solved]")

	        # --- Final save ---
	        total_solved = len(already_solved) + solved
	        # An empty or wrong ARC_DIR yields no task files; report 0% instead of
	        # dividing by zero.
	        solve_pct = 100 * total_solved / len(task_files) if task_files else 0.0

	        _write_json(OUTPUT_FILE, {
	            'model': MODEL,
	            'n_candidates': N_CANDIDATES,
	            'solved': solved,
	            'attempted': len(unsolved_files),
	            'total_time_s': round(total_time, 1),
	            'already_solved_symbolic': len(already_solved),
	            'total_solved': total_solved,
	            'total_tasks': len(task_files),
	            'solve_rate': round(solve_pct, 2),
	            'results': results,
	        })

	        # --- Summary ---
	        print()
	        print("=" * 60)
	        print("FINAL RESULTS")
	        print("=" * 60)
	        print(f"LLM solved: {solved}/{len(unsolved_files)} unsolved tasks")
	        print(f"Symbolic solved: {len(already_solved)}")
	        print(f"TOTAL SOLVED: {total_solved}/{len(task_files)} ({solve_pct:.1f}%)")
	        print(f"Total LLM time: {total_time:.0f}s ({total_time/max(1,len(unsolved_files)):.1f}s/task)")
	        print(f"Results saved to: {OUTPUT_FILE}")
	    finally:
	        # Cleanup
	        server.terminate()


	# =============================================================================
	# 6. GENERATE already_solved.json FROM SYMBOLIC RESULTS
	# =============================================================================

	def generate_already_solved(summary_file: str, output_file: str = "already_solved.json"):
	    """Write the IDs of symbolically solved tasks to ``output_file``.

	    Reads the JSON summary at ``summary_file``, keeps the ``task_id`` of every
	    entry in its ``results`` list whose ``all_train_solved`` value is truthy,
	    and dumps that list as JSON. Meant to be run before the Kaggle session.
	    """
	    with open(summary_file) as src:
	        summary = json.load(src)

	    solved_ids = []
	    for entry in summary['results']:
	        if entry.get('all_train_solved'):
	            solved_ids.append(entry['task_id'])

	    with open(output_file, 'w') as dst:
	        json.dump(solved_ids, dst)

	    print(f"Wrote {len(solved_ids)} solved task IDs to {output_file}")


	if __name__ == "__main__":
	    # "--generate-solved [SUMMARY]" builds already_solved.json from a summary
	    # file; any other invocation runs the full synthesis pipeline.
	    cli_args = sys.argv[1:]
	    if cli_args and cli_args[0] == "--generate-solved":
	        if len(cli_args) > 1:
	            summary = cli_args[1]
	        else:
	            summary = "arc_results/summary_v4.json"
	        generate_already_solved(summary)
	    else:
	        main()