rogermt
/

neurogolf-solver

Model card Files Files and versions

xet

Community

rogermt committed 14 days ago

Commit

0316872

verified ·

1 Parent(s): 260c341

Upload neurogolf_solver.py with huggingface_hub

Browse files

Files changed (1) hide show

neurogolf_solver.py +1919 -1

neurogolf_solver.py CHANGED Viewed

	@@ -1 +1,1919 @@
1	- ~~FILE_CONTENT_PLACEHOLDER~~

+#!/usr/bin/env python3
+"""
+ARC-AGI NeuroGolf Championship - Complete Solver v5
+Format: [1,10,30,30] one-hot input/output, opset 17, IR version 10.
+v5 CHANGES:
+  - Switched to opset 17 (Kaggle-compatible) for cheaper analytical solvers
+  - Slice-based analytical solvers: rotation, flip, transpose (near-zero cost)
+  - LOOCV Ridge tuning in _lstsq_conv with condition number check + SVD-based λ auto-tune
+  - stride_tricks speedup for patch extraction
+  - Composition detectors: rotation+color, flip+color, transpose+color
+  - Channel reduction wrapper for tasks with <8 colors
+  - ARC-GEN validation, EXCLUDED tasks skipped, submission.csv generation
+Solvers:
+  - Analytical: identity, constant, color_map, transpose, flip, rotate, tile, upscale,
+                concat, concat_enhanced, spatial_gather, varshape_spatial_gather,
+                diagonal_tile, kronecker, shift, mirror_h, mirror_v, quad_mirror,
+                fixed_crop, nonuniform_scale
+  - Composition: rotate+color_map, flip+color_map, transpose+color_map
+  - Conv (fixed shape): Slice -> Conv -> ArgMax -> Equal+Cast -> Pad
+  - Conv (variable shape): Conv(30x30) -> ArgMax -> Equal+Cast -> Mul(mask)
+  - Conv (diff shape): Slice -> Conv -> Slice(crop) -> ArgMax -> Equal+Cast -> Pad
+  - Channel reduction: Conv1x1(10->N) -> transform -> Conv1x1(N->10)
+Usage:
+  python neurogolf_solver.py --data_dir ARC-AGI/data/training/ --output_dir submission
+  python neurogolf_solver.py --data_dir ARC-AGI/data/training/ --output_dir submission --conv_budget 60 --arcgen_dir ARC-GEN-100K/
+"""
+import json, os, sys, math, time, argparse, csv, io, zipfile, warnings
+import numpy as np
+import onnx
+from onnx import helper, TensorProto, numpy_helper
+import onnxruntime as ort
+from collections import Counter
+try:
+    from neurogolf_utils import score_network as _score_network_official
+    HAS_ONNX_TOOL = True
+except ImportError:
+    HAS_ONNX_TOOL = False
+try:
+    import wandb
+except ImportError:
+    wandb = None
+# Fixed model tensor layout: batch, colour channels (one-hot), grid height, grid width.
+BATCH, CH, GH, GW = 1, 10, 30, 30
+GRID_SHAPE = [BATCH, CH, GH, GW]
+# Element type declared for the model's "input" and "output" tensors (see mk).
+DT = TensorProto.FLOAT
+# ONNX IR version stamped on every generated model (see mk).
+IR = 10
+# v5: opset 17 for cheaper Slice-based transforms
+OPSET = [helper.make_opsetid("", 17)]
+# Officially excluded tasks (score 0 regardless)
+EXCLUDED_TASKS = {21, 55, 80, 184, 202, 366}
+# Max ARC-GEN examples to use for validation (to keep runtime reasonable)
+MAX_ARCGEN_VALIDATE = 30
+# Max ARC-GEN examples for conv fitting
+MAX_ARCGEN_FIT = 0
+def get_providers():
+    """Return the onnxruntime execution providers to use (CPU only)."""
+    providers = ['CPUExecutionProvider']
+    return providers
+ORT_PROVIDERS = get_providers()
+# ============================================================
+# LOAD / VALIDATE
+# ============================================================
+def load_tasks_dir(data_dir, arcgen_dir=None):
+    """Load ARC-AGI tasks and optionally merge ARC-GEN data.
+
+    Every ``*.json`` file in ``data_dir`` is read in sorted filename order and
+    numbered from 1. When ``arcgen_dir`` contains a file of the same name whose
+    top-level JSON value is a list, that list is stored under ``'arc-gen'``.
+    Tasks without ARC-GEN data get an empty ``'arc-gen'`` list.
+
+    Returns:
+        dict mapping task number -> ``{'hex': <file stem>, 'data': <task dict>}``.
+    """
+    suffix = '.json'
+    files = sorted(f for f in os.listdir(data_dir) if f.endswith(suffix))
+    tasks = {}
+    for i, f in enumerate(files, start=1):
+        with open(os.path.join(data_dir, f), encoding='utf-8') as fh:
+            data = json.load(fh)
+        # Strip only the trailing extension; str.replace would also remove a
+        # '.json' that happens to occur inside the stem.
+        hex_id = f[:-len(suffix)]
+        if arcgen_dir:
+            arcgen_path = os.path.join(arcgen_dir, f)
+            if os.path.exists(arcgen_path):
+                with open(arcgen_path, encoding='utf-8') as fh:
+                    arcgen_examples = json.load(fh)
+                if isinstance(arcgen_examples, list):
+                    data['arc-gen'] = arcgen_examples
+        if 'arc-gen' not in data:
+            data['arc-gen'] = []
+        tasks[i] = {'hex': hex_id, 'data': data}
+    return tasks
+def load_tasks_kaggle(data_dir):
+    """Load Kaggle-format tasks named ``task001.json`` .. ``task400.json``.
+
+    Missing files are skipped. Each loaded task is guaranteed an ``'arc-gen'``
+    key (an empty list when the file does not provide one).
+    """
+    loaded = {}
+    for number in range(1, 401):
+        stem = f"task{number:03d}"
+        candidate = os.path.join(data_dir, f"{stem}.json")
+        if not os.path.exists(candidate):
+            continue
+        with open(candidate) as handle:
+            payload = json.load(handle)
+        if 'arc-gen' not in payload:
+            payload['arc-gen'] = []
+        loaded[number] = {'hex': stem, 'data': payload}
+    return loaded
+def to_onehot(grid):
+    """One-hot encode a 2-D colour grid into a (1, CH, GH, GW) float32 array.
+
+    Cells beyond the GH x GW window, and values outside ``0 <= v < CH``,
+    are ignored and leave the array at zero.
+    """
+    onehot = np.zeros((1, CH, GH, GW), dtype=np.float32)
+    for row_i, cells in enumerate(grid):
+        for col_i, color in enumerate(cells):
+            inside = row_i < GH and col_i < GW
+            if inside and 0 <= color < CH:
+                onehot[0, color, row_i, col_i] = 1.0
+    return onehot
+def validate(path, td):
+    """Validate model against ALL examples: train + test + arc-gen.
+
+    Args:
+        path: model location handed straight to ``ort.InferenceSession``.
+        td: task dict with ``'train'`` and ``'test'`` example lists and an
+            optional ``'arc-gen'`` list (only the first MAX_ARCGEN_VALIDATE
+            entries are checked).
+
+    Returns:
+        True when the thresholded output (``> 0``) equals the one-hot expected
+        grid for every example; False on any mismatch or when the session
+        cannot be created or run.
+    """
+    try:
+        opts = ort.SessionOptions()
+        opts.log_severity_level = 3
+        sess = ort.InferenceSession(path, sess_options=opts, providers=ORT_PROVIDERS)
+    except Exception:
+        # A model that cannot be loaded is simply invalid; KeyboardInterrupt
+        # and SystemExit are deliberately allowed to propagate.
+        return False
+    examples = td['train'] + td['test']
+    if 'arc-gen' in td:
+        examples = examples + td['arc-gen'][:MAX_ARCGEN_VALIDATE]
+    for ex in examples:
+        inp = to_onehot(ex['input'])
+        exp = to_onehot(ex['output'])
+        try:
+            out = sess.run(['output'], {'input': inp})[0]
+            out = (out > 0.0).astype(np.float32)
+        except Exception:
+            return False
+        if not np.array_equal(out, exp):
+            return False
+    return True
+def validate_raw(raw_bytes, td):
+    """Validate model from raw bytes against ALL examples.
+
+    The check is exactly the one ``validate`` performs: both hand their first
+    argument unchanged to ``ort.InferenceSession``, so this delegates instead
+    of repeating the same body.
+
+    Returns:
+        True when every train, test and (capped) arc-gen example matches,
+        False otherwise.
+    """
+    return validate(raw_bytes, td)
+# ============================================================
+# STATIC PROFILER (no onnx_tool dependency)
+# ============================================================
+# Operator types that make _static_profile reject a model (it returns all-None for them).
+BANNED_OPS = {'Loop', 'Scan', 'NonZero', 'Unique', 'If', 'Function'}
+# 1.44 MiB expressed in bytes; presumably the cap on a serialized model - confirm at its use site.
+MAX_FILESIZE = int(1.44 * 1024 * 1024)
+def score_network(path):
+    """Static profiler matching Kaggle scoring: cost = macs + memory + params.
+
+    Uses the official ``neurogolf_utils.score_network`` when it was importable
+    and falls back to ``_static_profile`` when it is missing or raises.
+    """
+    if HAS_ONNX_TOOL:
+        try:
+            return _score_network_official(path)
+        except Exception:
+            # Best effort: any failure of the official scorer falls through to
+            # the built-in profiler, but interpreter-exit signals still propagate.
+            pass
+    return _static_profile(path)
+def _static_profile(path):
+    """Compute cost without onnx_tool: params + nbytes + macs.
+
+    Parameters and bytes are summed over graph initializers and over the
+    tensors carried by ``Constant`` nodes. MACs are counted for ``Conv`` nodes
+    whose weight is a known 4-D tensor, and each such Conv is charged for a
+    full GH x GW output regardless of its actual spatial size.
+
+    Returns:
+        ``(macs, nbytes, params)`` as ints, or ``(None, None, None)`` when the
+        file cannot be loaded or the graph uses an op listed in BANNED_OPS.
+    """
+    try:
+        model = onnx.load(path)
+    except Exception:
+        return None, None, None
+    tensors = {}
+    params = 0
+    nbytes = 0
+    macs = 0
+    for init in model.graph.initializer:
+        a = numpy_helper.to_array(init)
+        tensors[init.name] = a
+        params += a.size
+        nbytes += a.nbytes
+    for nd in model.graph.node:
+        if nd.op_type == 'Constant':
+            for attr in nd.attribute:
+                if attr.t and attr.t.ByteSize() > 0:
+                    try:
+                        a = numpy_helper.to_array(attr.t)
+                    except Exception:
+                        # Undecodable constant tensors are left out of the cost.
+                        continue
+                    if nd.output:
+                        tensors[nd.output[0]] = a
+                    params += a.size
+                    nbytes += a.nbytes
+        if nd.op_type in BANNED_OPS:
+            return None, None, None
+        if nd.op_type == 'Conv' and len(nd.input) >= 2 and nd.input[1] in tensors:
+            w = tensors[nd.input[1]]
+            if w.ndim == 4:
+                co, ci, kh, kw = w.shape
+                macs += co * ci * kh * kw * GH * GW
+    return int(macs), int(nbytes), int(params)
+def mk(nodes, inits=None):
+    """Wrap ``nodes`` (and optional initializers) into a model with one
+    GRID_SHAPE float tensor named "input" and one named "output"."""
+    graph = helper.make_graph(
+        nodes,
+        "g",
+        [helper.make_tensor_value_info("input", DT, GRID_SHAPE)],
+        [helper.make_tensor_value_info("output", DT, GRID_SHAPE)],
+        initializer=inits or [],
+    )
+    return helper.make_model(graph, ir_version=IR, opset_imports=OPSET)
+def get_exs(td):
+    """Return the train+test examples as (input, output) pairs of int64 arrays."""
+    pairs = []
+    for example in td['train'] + td['test']:
+        grid_in = np.array(example['input'], dtype=np.int64)
+        grid_out = np.array(example['output'], dtype=np.int64)
+        pairs.append((grid_in, grid_out))
+    return pairs
+def get_exs_for_fitting(td):
+    """Collect examples for conv fitting.
+
+    Train+test pairs always come first. When all of them share one input
+    shape, up to 10 arc-gen pairs are appended whose input shape matches it
+    and whose output shape matches the first base example's output.
+    """
+    base = []
+    for ex in td['train'] + td['test']:
+        base.append((np.array(ex['input'], dtype=np.int64),
+                     np.array(ex['output'], dtype=np.int64)))
+    if not base:
+        return base
+    in_shapes = {grid_in.shape for grid_in, _ in base}
+    if len(in_shapes) != 1:
+        return base
+    want_in = next(iter(in_shapes))
+    want_out = base[0][1].shape
+    extra = []
+    for ex in td.get('arc-gen', []):
+        grid_in = np.array(ex['input'], dtype=np.int64)
+        grid_out = np.array(ex['output'], dtype=np.int64)
+        if grid_in.shape == want_in and grid_out.shape == want_out:
+            extra.append((grid_in, grid_out))
+    return base + extra[:10]
+def get_exs_for_fitting_variable(td):
+    """Collect examples for variable-shape conv fitting.
+
+    Returns all train+test pairs followed by at most 20 arc-gen pairs whose
+    input and output have the same shape and fit within 30x30.
+    """
+    base = []
+    for ex in td['train'] + td['test']:
+        base.append((np.array(ex['input'], dtype=np.int64),
+                     np.array(ex['output'], dtype=np.int64)))
+    extra = []
+    for ex in td.get('arc-gen', []):
+        grid_in = np.array(ex['input'], dtype=np.int64)
+        grid_out = np.array(ex['output'], dtype=np.int64)
+        if grid_in.shape != grid_out.shape:
+            continue
+        if grid_in.shape[0] <= 30 and grid_in.shape[1] <= 30:
+            extra.append((grid_in, grid_out))
+    return base + extra[:20]
+def fixed_shapes(td):
+    """Return the single (input_shape, output_shape) pair shared by every
+    train+test example, or None when the examples disagree (or there are none)."""
+    seen = {(inp.shape, out.shape) for inp, out in get_exs(td)}
+    if len(seen) != 1:
+        return None
+    return next(iter(seen))
+# ============================================================
+# GATHER HELPERS (opset 17 compatible)
+# ============================================================
+def _build_gather_model(OH, OW, idx):
+    """Build Gather-based spatial remapping model.
+
+    ``idx[oi, oj]`` holds the (row, col) of the input cell copied to output
+    cell (oi, oj) for the OH x OW output region; everything outside that
+    region is multiplied by zero.
+    """
+    gather_src = np.zeros((GH*GW,), dtype=np.int64)
+    keep = np.zeros((1,1,GH,GW), dtype=np.float32)
+    cells = ((oi, oj) for oi in range(OH) for oj in range(OW))
+    for oi, oj in cells:
+        gather_src[oi*GW+oj] = idx[oi,oj,0]*GW + idx[oi,oj,1]
+        keep[0,0,oi,oj] = 1.0
+    flat_shape = np.array([1,10,GH*GW], dtype=np.int64)
+    grid_shape = np.array([1,10,GH,GW], dtype=np.int64)
+    inits = [
+        numpy_helper.from_array(flat_shape, 'fs'),
+        numpy_helper.from_array(gather_src, 'idx'),
+        numpy_helper.from_array(grid_shape, 'os'),
+        numpy_helper.from_array(keep, 'mask'),
+    ]
+    # Flatten the spatial dims, pick source cells, restore the grid, mask.
+    nodes = [
+        helper.make_node('Reshape', ['input','fs'], ['flat']),
+        helper.make_node('Gather', ['flat','idx'], ['g'], axis=2),
+        helper.make_node('Reshape', ['g','os'], ['raw']),
+        helper.make_node('Mul', ['raw','mask'], ['output']),
+    ]
+    return mk(nodes, inits)
+def _build_gather_model_with_const(IH, IW, OH, OW, idx, cst):
+    """Build Gather model with constant fill for unmapped positions.
+
+    For each output cell, a non-negative ``idx[oi, oj, 0]`` means "copy from
+    input cell idx[oi, oj]"; otherwise a non-negative ``cst[oi, oj]`` is the
+    fixed colour to emit. IH and IW are accepted but not used here.
+    """
+    src_flat = np.zeros((GH*GW,), dtype=np.int64)
+    from_input = np.zeros((1,1,GH,GW), dtype=np.float32)
+    fill = np.zeros((1,10,GH,GW), dtype=np.float32)
+    for oi in range(OH):
+        for oj in range(OW):
+            if idx[oi,oj,0] >= 0:
+                src_flat[oi*GW+oj] = idx[oi,oj,0]*GW + idx[oi,oj,1]
+                from_input[0,0,oi,oj] = 1.0
+                continue
+            if cst[oi,oj] >= 0:
+                fill[0, cst[oi,oj], oi, oj] = 1.0
+    has_const = fill.any()
+    inits = [
+        numpy_helper.from_array(np.array([1,10,GH*GW], dtype=np.int64), 'fs'),
+        numpy_helper.from_array(src_flat, 'idx'),
+        numpy_helper.from_array(np.array([1,10,GH,GW], dtype=np.int64), 'os'),
+        numpy_helper.from_array(from_input, 'gmask'),
+    ]
+    # Without constants the masked gather is already the final output.
+    mul_out = 'masked' if has_const else 'output'
+    nodes = [
+        helper.make_node('Reshape', ['input','fs'], ['flat']),
+        helper.make_node('Gather', ['flat','idx'], ['g'], axis=2),
+        helper.make_node('Reshape', ['g','os'], ['raw']),
+        helper.make_node('Mul', ['raw','gmask'], [mul_out]),
+    ]
+    if has_const:
+        inits.append(numpy_helper.from_array(fill, 'cst'))
+        nodes.append(helper.make_node('Add', ['masked','cst'], ['output']))
+    return mk(nodes, inits)
+# ============================================================
+# SLICE-BASED ANALYTICAL SOLVERS (opset 17, ~0 cost)
+# ============================================================
+def _build_pad_nodes(input_name, IH, IW, output_name='output', pad_name='pads'):
+    """Build the node that grows an IH x IW tensor to GH x GW (tensor pads).
+
+    Padding is added only at the end of the two spatial axes. When neither
+    axis needs padding an Identity node is returned instead.
+    Returns (pad_inits, pad_node).
+    """
+    bottom, right = GH - IH, GW - IW
+    if not (bottom > 0 or right > 0):
+        return [], helper.make_node('Identity', [input_name], [output_name])
+    pads = np.array([0, 0, 0, 0, 0, 0, bottom, right], dtype=np.int64)
+    node = helper.make_node('Pad', [input_name, pad_name], [output_name], mode='constant')
+    return [numpy_helper.from_array(pads, pad_name)], node
+def _build_slice_flip_model(axis, IH, IW):
+    """Build a flip model from two Slice nodes plus padding.
+
+    The IH x IW content is cut out of the 30x30 input, reversed with a
+    step of -1, and padded back to 30x30.
+    axis=0 reverses rows (vertical flip); any other value reverses columns.
+    """
+    if axis == 0:
+        extent, dim = IH, 2
+    else:
+        extent, dim = IW, 3
+    named_values = [
+        ('ex_st', [0,0,0,0]),          # content crop start
+        ('ex_en', [1,10,IH,IW]),       # content crop end
+        ('sl_st', [extent-1]),         # reverse slice: start at the last index
+        ('sl_en', [-extent-1]),        # ...and run past index 0
+        ('sl_ax', [dim]),
+        ('sl_sp', [-1]),
+    ]
+    inits = [numpy_helper.from_array(np.array(vals, dtype=np.int64), name)
+             for name, vals in named_values]
+    nodes = [
+        helper.make_node('Slice', ['input','ex_st','ex_en'], ['content']),
+        helper.make_node('Slice', ['content','sl_st','sl_en','sl_ax','sl_sp'], ['flipped']),
+    ]
+    pad_inits, pad_node = _build_pad_nodes('flipped', IH, IW)
+    return mk(nodes + [pad_node], inits + pad_inits)
+def _build_slice_transpose_model(IH, IW):
+    """Build a transpose model: crop the IH x IW content, swap the two
+    spatial axes (perm=[0,1,3,2]), then pad the IW x IH result to 30x30."""
+    crop_start = numpy_helper.from_array(np.array([0,0,0,0], dtype=np.int64), 'ex_st')
+    crop_end = numpy_helper.from_array(np.array([1,10,IH,IW], dtype=np.int64), 'ex_en')
+    # Height and width trade places after the transpose, hence (IW, IH).
+    pad_inits, pad_node = _build_pad_nodes('transposed', IW, IH)
+    nodes = [
+        helper.make_node('Slice', ['input','ex_st','ex_en'], ['content']),
+        helper.make_node('Transpose', ['content'], ['transposed'], perm=[0,1,3,2]),
+        pad_node,
+    ]
+    return mk(nodes, [crop_start, crop_end] + pad_inits)
+def _build_slice_rotate_model(k, IH, IW):
+    """Build a rotation model using Transpose + Slice (opset 17).
+
+    Extracts the IH x IW content, applies the rotation, pads back to 30x30.
+    Matches np.rot90 as used by s_rotate:
+    k=1: 90° CCW = Transpose then vflip (reverse rows)
+    k=2: 180° = hflip then vflip
+    k=3: 270° CCW = Transpose then hflip (reverse cols)
+
+    Raises:
+        ValueError: if k is not 1, 2 or 3 (previously this surfaced as an
+            UnboundLocalError part-way through graph construction).
+    """
+    if k not in (1, 2, 3):
+        raise ValueError(f"k must be 1, 2 or 3, got {k!r}")
+    # Step 1: Extract content [1,10,30,30] -> [1,10,IH,IW]
+    inits = [
+        numpy_helper.from_array(np.array([0,0,0,0], dtype=np.int64), 'ex_st'),
+        numpy_helper.from_array(np.array([1,10,IH,IW], dtype=np.int64), 'ex_en'),
+    ]
+    nodes = [helper.make_node('Slice', ['input','ex_st','ex_en'], ['content'])]
+    current = 'content'
+    new_IH, new_IW = IH, IW
+    if k in (1, 3):
+        # Transpose: [1,10,IH,IW] -> [1,10,IW,IH]
+        nodes.append(helper.make_node('Transpose', [current], ['t'], perm=[0,1,3,2]))
+        current = 't'
+        new_IH, new_IW = IW, IH
+    # Each flip is (axis, extent along that axis, initializer names, output name).
+    single = ('sl_st', 'sl_en', 'sl_ax', 'sl_sp')
+    if k == 1:
+        flips = [(2, new_IH, single, 'rot')]
+    elif k == 2:
+        flips = [
+            (3, new_IW, ('st_h', 'en_h', 'ax_h', 'sp_h'), 'fh'),
+            (2, new_IH, ('st_v', 'en_v', 'ax_v', 'sp_v'), 'rot'),
+        ]
+    else:
+        flips = [(3, new_IW, single, 'rot')]
+    for axis, extent, names, dst in flips:
+        # Reverse an axis: start at the last index, step -1, end before index 0.
+        values = ([extent-1], [-extent-1], [axis], [-1])
+        inits.extend(numpy_helper.from_array(np.array(vals, dtype=np.int64), name)
+                     for name, vals in zip(names, values))
+        nodes.append(helper.make_node('Slice', [current, *names], [dst]))
+        current = dst
+    # Pad back to 30x30
+    pad_inits, pad_node = _build_pad_nodes(current, new_IH, new_IW)
+    nodes.append(pad_node)
+    return mk(nodes, inits + pad_inits)
+# ============================================================
+# ANALYTICAL SOLVERS
+# ============================================================
+def s_identity(td):
+    """Return an Identity model when every train+test output equals its input, else None."""
+    if any(ex['input'] != ex['output'] for ex in td['train']+td['test']):
+        return None
+    return mk([helper.make_node('Identity', ['input'], ['output'])])
+def _get_color_map(td):
+    """Return the input-colour -> output-colour dict that holds cell by cell
+    across all train+test examples, or None when shapes differ or one input
+    colour maps to two different output colours."""
+    mapping = {}
+    for ex in td['train']+td['test']:
+        src, dst = np.array(ex['input']), np.array(ex['output'])
+        if src.shape != dst.shape:
+            return None
+        for a, b in zip(src.flat, dst.flat):
+            a, b = int(a), int(b)
+            # setdefault records the first sighting and returns the stored value.
+            if mapping.setdefault(a, b) != b:
+                return None
+    return mapping
+def _build_color_map_model(cm, is_permutation=None):
+    """Build ONNX model for a color map.
+
+    A permutation map becomes a channel Gather; any other map becomes a 1x1
+    Conv whose weight routes each input channel to its mapped output channel
+    (colours absent from ``cm`` map to themselves). When ``is_permutation`` is
+    None it is derived as "key set equals value set".
+    """
+    if is_permutation is None:
+        is_permutation = (set(cm.keys()) == set(cm.values()))
+    if not is_permutation:
+        weight = np.zeros((10,10,1,1), dtype=np.float32)
+        for src in range(10):
+            weight[cm.get(src,src), src, 0, 0] = 1.0
+        conv = helper.make_node('Conv', ['input','W'], ['output'], kernel_shape=[1,1])
+        return mk([conv], [numpy_helper.from_array(weight, 'W')])
+    # Output channel dst reads input channel src, so the table is indexed by dst.
+    channel_of = np.arange(10, dtype=np.int32)
+    for src, dst in cm.items():
+        if 0 <= src < 10 and 0 <= dst < 10:
+            channel_of[dst] = src
+    gather = helper.make_node('Gather', ['input', 'gi'], ['output'], axis=1)
+    return mk([gather], [numpy_helper.from_array(channel_of, 'gi')])
+def s_color_map(td):
+    """Return a colour-remapping model when one consistent colour map explains
+    every train+test example, else None."""
+    mapping = _get_color_map(td)
+    if mapping is None:
+        return None
+    bijective = set(mapping) == set(mapping.values())
+    return _build_color_map_model(mapping, bijective)
+def s_transpose(td):
+    """Return a transpose model when every output is its input transposed
+    (fixed shapes only), else None."""
+    pairs = get_exs(td)
+    shapes = fixed_shapes(td)
+    if shapes is None:
+        return None
+    (IH, IW), _ = shapes
+    for inp, out in pairs:
+        if not np.array_equal(out, inp.T):
+            return None
+    return _build_slice_transpose_model(IH, IW)
+def s_flip(td):
+    """Return a flip model when every output is its input flipped vertically
+    (tried first) or horizontally; None otherwise. Requires one fixed,
+    size-preserving shape."""
+    pairs = get_exs(td)
+    shapes = fixed_shapes(td)
+    if shapes is None:
+        return None
+    in_shape, out_shape = shapes
+    if in_shape != out_shape:
+        return None
+    IH, IW = in_shape
+    for axis, flip_fn in ((0, np.flipud), (1, np.fliplr)):
+        mismatch = any(not np.array_equal(out, flip_fn(inp)) for inp, out in pairs)
+        if not mismatch:
+            return _build_slice_flip_model(axis, IH, IW)
+    return None
+def s_rotate(td):
+    """Return a rotation model for the first k in (1, 2, 3) such that every
+    output equals np.rot90(input, k); None when no k fits or shapes vary."""
+    pairs = get_exs(td)
+    shapes = fixed_shapes(td)
+    if shapes is None:
+        return None
+    (IH, IW), _ = shapes
+    turns = next(
+        (k for k in (1, 2, 3)
+         if all(np.array_equal(out, np.rot90(inp, k)) for inp, out in pairs)),
+        None,
+    )
+    if turns is None:
+        return None
+    return _build_slice_rotate_model(turns, IH, IW)
+def s_spatial_gather(td):
+    """Solve fixed-shape tasks where each output cell is either a copy of one
+    fixed input cell or a constant colour.
+
+    For every output cell the first input cell (row-major) that matches it in
+    all examples is used; failing that, the cell must hold the same colour in
+    all examples. Returns None if some cell satisfies neither.
+    """
+    shapes = fixed_shapes(td)
+    if shapes is None:
+        return None
+    (IH, IW), (OH, OW) = shapes
+    pairs = get_exs(td)
+    idx = np.full((OH,OW,2), -1, dtype=np.int64)
+    cst = np.full((OH,OW), -1, dtype=np.int64)
+    for oi in range(OH):
+        for oj in range(OW):
+            colors = {int(out[oi,oj]) for _, out in pairs}
+            if len(colors) == 1:
+                cst[oi,oj] = colors.pop()
+            source = next(
+                ((ri, rj) for ri in range(IH) for rj in range(IW)
+                 if all(int(inp[ri,rj]) == int(out[oi,oj]) for inp, out in pairs)),
+                None,
+            )
+            if source is not None:
+                idx[oi,oj] = source
+            elif cst[oi,oj] < 0:
+                return None
+    return _build_gather_model_with_const(IH, IW, OH, OW, idx, cst)
+def s_varshape_spatial_gather(td):
+    """Spatial gather for variable-shape tasks.
+
+    Only runs when the examples do NOT share one fixed shape. Every grid is
+    zero-padded into a 30x30 canvas, then each canvas cell must be a copy of
+    one fixed input cell or a constant colour across all examples.
+    """
+    if fixed_shapes(td) is not None:
+        return None
+    canvases = []
+    for inp, out in get_exs(td):
+        ih, iw = inp.shape
+        oh, ow = out.shape
+        big_in = np.zeros((30, 30), dtype=np.int64)
+        big_out = np.zeros((30, 30), dtype=np.int64)
+        big_in[:ih, :iw] = inp
+        big_out[:oh, :ow] = out
+        canvases.append((big_in, big_out))
+    idx = np.full((30, 30, 2), -1, dtype=np.int64)
+    cst = np.full((30, 30), -1, dtype=np.int64)
+    for oi in range(30):
+        for oj in range(30):
+            colors = {int(big_out[oi, oj]) for _, big_out in canvases}
+            if len(colors) == 1:
+                cst[oi, oj] = colors.pop()
+            source = next(
+                ((ri, rj) for ri in range(30) for rj in range(30)
+                 if all(int(big_in[ri, rj]) == int(big_out[oi, oj])
+                        for big_in, big_out in canvases)),
+                None,
+            )
+            if source is not None:
+                idx[oi, oj] = source
+            elif cst[oi, oj] < 0:
+                return None
+    return _build_gather_model_with_const(30, 30, 30, 30, idx, cst)
+def s_tile(td):
+    """Return a Slice -> Tile -> Pad model when every output is its input
+    repeated the same (rH, rW) times (not 1x1) and fits in 30x30; else None."""
+    pairs = get_exs(td)
+    in_shapes = {inp.shape for inp, _ in pairs}
+    if len(in_shapes) != 1:
+        return None
+    IH, IW = in_shapes.pop()
+    factors = set()
+    for _, out in pairs:
+        OH, OW = out.shape
+        if OH % IH or OW % IW:
+            return None
+        rH, rW = OH//IH, OW//IW
+        if rH < 1 or rW < 1 or (rH, rW) == (1, 1):
+            return None
+        factors.add((rH, rW))
+    if len(factors) != 1:
+        return None
+    rH, rW = factors.pop()
+    OH, OW = IH*rH, IW*rW
+    if OH > 30 or OW > 30:
+        return None
+    if any(not np.array_equal(out, np.tile(inp, (rH, rW))) for inp, out in pairs):
+        return None
+    named_values = [
+        ('st', [0,0,0,0]),
+        ('en', [1,10,IH,IW]),
+        ('rp', [1,1,rH,rW]),
+        ('tile_pads', [0, 0, 0, 0, 0, 0, 30-OH, 30-OW]),
+    ]
+    inits = [numpy_helper.from_array(np.array(vals, dtype=np.int64), name)
+             for name, vals in named_values]
+    nodes = [
+        helper.make_node('Slice', ['input','st','en'], ['cr']),
+        helper.make_node('Tile', ['cr','rp'], ['tl']),
+        helper.make_node('Pad', ['tl', 'tile_pads'], ['output'], mode='constant'),
+    ]
+    return mk(nodes, inits)
+def s_upscale(td):
+    """Return a gather model when every output is its input with each cell
+    repeated sH x sW times (both factors >= 2, result within 30x30); else None."""
+    pairs = get_exs(td)
+    in_shapes = {inp.shape for inp, _ in pairs}
+    if len(in_shapes) != 1:
+        return None
+    IH, IW = in_shapes.pop()
+    factors = set()
+    for _, out in pairs:
+        OH, OW = out.shape
+        if OH % IH or OW % IW:
+            return None
+        sH, sW = OH//IH, OW//IW
+        if sH < 2 or sW < 2:
+            return None
+        factors.add((sH, sW))
+    if len(factors) != 1:
+        return None
+    sH, sW = factors.pop()
+    OH, OW = IH*sH, IW*sW
+    if OH > 30 or OW > 30:
+        return None
+    for inp, out in pairs:
+        enlarged = np.repeat(np.repeat(inp, sH, 0), sW, 1)
+        if not np.array_equal(out, enlarged):
+            return None
+    # Output cell (r, c) reads input cell (r // sH, c // sW).
+    idx = np.zeros((OH,OW,2), dtype=np.int64)
+    idx[:, :, 0] = (np.arange(OH) // sH)[:, None]
+    idx[:, :, 1] = (np.arange(OW) // sW)[None, :]
+    return _build_gather_model(OH, OW, idx)
+def s_concat(td):
+    """Solve tasks whose output is 2-4 copies of the input laid side by side
+    (tried first) or stacked vertically, each copy being the input itself or
+    its left-right flip, up-down flip or 180° rotation.
+
+    Transform combinations are tried in itertools.product order and the first
+    one matching every example is turned into a gather model.
+    """
+    from itertools import product as iproduct
+    pairs = get_exs(td)
+    shapes = fixed_shapes(td)
+    if shapes is None:
+        return None
+    (IH, IW), (OH, OW) = shapes
+    transforms = [
+        ('id', lambda x: x), ('fliplr', lambda x: np.fliplr(x)),
+        ('flipud', lambda x: np.flipud(x)), ('rot180', lambda x: np.rot90(x, 2)),
+    ]
+
+    def source_cell(name, lr, lc):
+        # Input (row, col) that lands on local cell (lr, lc) of a transformed copy.
+        row = IH-1-lr if name in ('flipud', 'rot180') else lr
+        col = IW-1-lc if name in ('fliplr', 'rot180') else lc
+        return row, col
+
+    def search(n, axis):
+        # Try every transform combination for n copies along `axis`.
+        if not 2 <= n <= 4:
+            return None
+        for combo in iproduct(range(4), repeat=n):
+            fits = all(
+                np.array_equal(out, np.concatenate([transforms[t][1](inp) for t in combo], axis=axis))
+                for inp, out in pairs)
+            if not fits:
+                continue
+            idx = np.zeros((OH,OW,2), dtype=np.int64)
+            for oi in range(OH):
+                for oj in range(OW):
+                    if axis == 1:
+                        block, lr, lc = oj // IW, oi, oj % IW
+                    else:
+                        block, lr, lc = oi // IH, oi % IH, oj
+                    idx[oi,oj] = source_cell(transforms[combo[block]][0], lr, lc)
+            return _build_gather_model(OH, OW, idx)
+        return None
+
+    if OH == IH and OW % IW == 0 and OW > IW:
+        model = search(OW // IW, 1)
+        if model is not None:
+            return model
+    if OW == IW and OH % IH == 0 and OH > IH:
+        return search(OH // IH, 0)
+    return None
+def s_concat_enhanced(td):
+    """Enhanced concat with all 8 dihedral group transforms.
+
+    The output must be an rH x rW arrangement (2 to 16 blocks, within 30x30)
+    of input-sized blocks, each block being one dihedral transform of the
+    input. For every block the first transform matching all examples is used.
+
+    The source-index map of a block is obtained by applying the chosen
+    transform to the input's own row/column coordinate grids, so it agrees
+    with the transform by construction. (The earlier hand-written formulas had
+    the rot90 and rot270 cases swapped, which made such tasks fail the final
+    reconstruction check.)
+
+    Returns a gather model, or None when the task does not fit.
+    """
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH,IW),(OH,OW) = sp
+    if IH == OH and IW == OW: return None
+    if OH % IH != 0 or OW % IW != 0: return None
+    rH, rW = OH // IH, OW // IW
+    if rH * rW > 16 or rH * rW < 2: return None
+    if OH > 30 or OW > 30: return None
+    transforms = [
+        ('id', lambda x: x), ('fliplr', lambda x: np.fliplr(x)),
+        ('flipud', lambda x: np.flipud(x)), ('rot180', lambda x: np.rot90(x, 2)),
+        ('rot90', lambda x: np.rot90(x, 1)), ('rot270', lambda x: np.rot90(x, 3)),
+        # fliplr(x.T) is the same map as rot270, so the eighth slot holds the
+        # anti-diagonal transpose instead to really cover the dihedral group.
+        ('T', lambda x: x.T), ('anti_T', lambda x: np.rot90(x, 2).T),
+    ]
+    # Coordinate grids of the input: src_rows[r, c] == r, src_cols[r, c] == c.
+    src_rows, src_cols = np.indices((IH, IW))
+    idx = np.zeros((OH, OW, 2), dtype=np.int64)
+    for bi in range(rH):
+        for bj in range(rW):
+            rs = slice(bi*IH, (bi+1)*IH)
+            cs = slice(bj*IW, (bj+1)*IW)
+            chosen = None
+            for _, tfn in transforms:
+                # Shape check first: transposing transforms only fit square inputs.
+                if all(tfn(inp).shape == (IH, IW) and np.array_equal(out[rs, cs], tfn(inp))
+                       for inp, out in exs):
+                    chosen = tfn
+                    break
+            if chosen is None: return None
+            # Transforming the coordinate grids yields, per block cell, the
+            # input cell that the transform moves there.
+            idx[rs, cs, 0] = chosen(src_rows)
+            idx[rs, cs, 1] = chosen(src_cols)
+    # Safety net: the assembled index map must reproduce every example.
+    for inp, out in exs:
+        if not np.array_equal(inp[idx[..., 0], idx[..., 1]], out): return None
+    return _build_gather_model(OH, OW, idx)
+def s_kronecker(td):
+    """Return a gather model when every output equals np.kron(input, ones)
+    for one fixed block size (both factors >= 2, result within 30x30); else None."""
+    pairs = get_exs(td)
+    shapes = fixed_shapes(td)
+    if shapes is None:
+        return None
+    (IH, IW), (OH, OW) = shapes
+    if OH % IH != 0 or OW % IW != 0:
+        return None
+    sH, sW = OH // IH, OW // IW
+    if sH < 2 or sW < 2:
+        return None
+    if OH > 30 or OW > 30:
+        return None
+    block = np.ones((sH, sW), dtype=np.int64)
+    if any(not np.array_equal(out, np.kron(inp, block)) for inp, out in pairs):
+        return None
+    # Output cell (r, c) reads input cell (r // sH, c // sW).
+    out_rows, out_cols = np.indices((OH, OW))
+    idx = np.zeros((OH,OW,2), dtype=np.int64)
+    idx[:, :, 0] = out_rows // sH
+    idx[:, :, 1] = out_cols // sW
+    return _build_gather_model(OH, OW, idx)
+def s_diagonal_tile(td):
+    """Solve tasks whose output is a square r x r arrangement of input-sized
+    blocks (r >= 2, within 30x30) with the input on the main diagonal and
+    colour 0 in every other block."""
+    pairs = get_exs(td)
+    shapes = fixed_shapes(td)
+    if shapes is None:
+        return None
+    (IH, IW), (OH, OW) = shapes
+    if OH % IH != 0 or OW % IW != 0:
+        return None
+    rH, rW = OH // IH, OW // IW
+    if rH != rW or rH < 2:
+        return None
+    if OH > 30 or OW > 30:
+        return None
+    blocks = [(bi, bj, slice(bi*IH, (bi+1)*IH), slice(bj*IW, (bj+1)*IW))
+              for bi in range(rH) for bj in range(rW)]
+    for inp, out in pairs:
+        for bi, bj, rs, cs in blocks:
+            if bi == bj:
+                if not np.array_equal(out[rs, cs], inp):
+                    return None
+            elif np.any(out[rs, cs] != 0):
+                return None
+    local_rows, local_cols = np.indices((IH, IW))
+    idx = np.zeros((OH,OW,2), dtype=np.int64)
+    cst = np.full((OH,OW), -1, dtype=np.int64)
+    for bi, bj, rs, cs in blocks:
+        if bi == bj:
+            # Diagonal block: copy the input cell at the same local position.
+            idx[rs, cs, 0] = local_rows
+            idx[rs, cs, 1] = local_cols
+        else:
+            # Off-diagonal block: no source cell, constant colour 0.
+            idx[rs, cs] = -1
+            cst[rs, cs] = 0
+    return _build_gather_model_with_const(IH, IW, OH, OW, idx, cst)
+def s_shift(td):
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH, IW), (OH, OW) = sp
+    if (IH, IW) != (OH, OW): return None
+    for dr in range(-5, 6):
+        for dc in range(-5, 6):
+            if dr == 0 and dc == 0: continue
+            ok = True
+            for inp, out in exs:
+                shifted = np.zeros_like(inp)
+                r0, r1 = max(0, dr), min(IH, IH + dr)
+                c0, c1 = max(0, dc), min(IW, IW + dc)
+                if r1 > r0 and c1 > c0:
+                    sr0, sc0 = max(0, -dr), max(0, -dc)
+                    shifted[r0:r1, c0:c1] = inp[sr0:sr0+(r1-r0), sc0:sc0+(c1-c0)]
+                if not np.array_equal(shifted, out):
+                    ok = False; break
+            if not ok: continue
+            idx = np.zeros((OH, OW, 2), dtype=np.int64)
+            cst = np.full((OH, OW), 0, dtype=np.int64)
+            for r in range(OH):
+                for c in range(OW):
+                    sr, sc = r - dr, c - dc
+                    if 0 <= sr < IH and 0 <= sc < IW: idx[r, c] = [sr, sc]
+                    else: idx[r, c] = [-1, -1]
+            return _build_gather_model_with_const(IH, IW, OH, OW, idx, cst)
+    return None
+def s_mirror_h(td):
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH, IW), (OH, OW) = sp
+    if OH != IH or OW != 2 * IW: return None
+    if OW > 30: return None
+    for inp, out in exs:
+        expected = np.concatenate([inp, np.flip(inp, 1)], 1)
+        if not np.array_equal(expected, out): return None
+    idx = np.zeros((OH, OW, 2), dtype=np.int64)
+    for r in range(OH):
+        for c in range(OW):
+            sc = c if c < IW else 2*IW - 1 - c
+            idx[r, c] = [r, sc]
+    return _build_gather_model(OH, OW, idx)
+def s_mirror_v(td):
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH, IW), (OH, OW) = sp
+    if OW != IW or OH != 2 * IH: return None
+    if OH > 30: return None
+    for inp, out in exs:
+        expected = np.concatenate([inp, np.flip(inp, 0)], 0)
+        if not np.array_equal(expected, out): return None
+    idx = np.zeros((OH, OW, 2), dtype=np.int64)
+    for r in range(OH):
+        for c in range(OW):
+            sr = r if r < IH else 2*IH - 1 - r
+            idx[r, c] = [sr, c]
+    return _build_gather_model(OH, OW, idx)
+def s_quad_mirror(td):
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH, IW), (OH, OW) = sp
+    if OH != 2 * IH or OW != 2 * IW: return None
+    if OH > 30 or OW > 30: return None
+    for inp, out in exs:
+        expected = np.block([
+            [inp, np.flip(inp, 1)],
+            [np.flip(inp, 0), np.flip(np.flip(inp, 0), 1)]
+        ])
+        if not np.array_equal(expected, out): return None
+    idx = np.zeros((OH, OW, 2), dtype=np.int64)
+    for r in range(OH):
+        for c in range(OW):
+            sr = r if r < IH else 2*IH - 1 - r
+            sc = c if c < IW else 2*IW - 1 - c
+            idx[r, c] = [sr, sc]
+    return _build_gather_model(OH, OW, idx)
+def s_fixed_crop(td):
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH, IW), (OH, OW) = sp
+    if OH > IH or OW > IW or (OH == IH and OW == IW): return None
+    for r0 in range(IH - OH + 1):
+        for c0 in range(IW - OW + 1):
+            if all(np.array_equal(inp[r0:r0+OH, c0:c0+OW], out) for inp, out in exs):
+                idx = np.zeros((OH, OW, 2), dtype=np.int64)
+                for r in range(OH):
+                    for c in range(OW):
+                        idx[r, c] = [r0 + r, c0 + c]
+                return _build_gather_model(OH, OW, idx)
+    return None
+def s_nonuniform_scale(td):
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH, IW), (OH, OW) = sp
+    for fh, fw in [(1,2),(2,1),(1,3),(3,1),(2,3),(3,2),(1,4),(4,1),(2,4),(4,2)]:
+        if OH != IH*fh or OW != IW*fw: continue
+        if OH > 30 or OW > 30: continue
+        if all(np.array_equal(np.repeat(np.repeat(inp, fh, 0), fw, 1), out) for inp, out in exs):
+            idx = np.zeros((OH, OW, 2), dtype=np.int64)
+            for r in range(OH):
+                for c in range(OW):
+                    idx[r, c] = [r//fh, c//fw]
+            return _build_gather_model(OH, OW, idx)
+    return None
+def s_constant(td):
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    exs = get_exs(td)
+    outs = [out for _,out in exs]
+    if not all(np.array_equal(outs[0], o) for o in outs[1:]): return None
+    const = np.zeros((1,10,30,30), dtype=np.float32)
+    for r, row in enumerate(outs[0]):
+        for c, v in enumerate(row):
+            const[0, int(v), r, c] = 1.0
+    inits = [numpy_helper.from_array(np.array(0.0, dtype=np.float32), 'z'),
+             numpy_helper.from_array(const, 'c')]
+    nodes = [helper.make_node('Mul', ['input','z'], ['zd']),
+             helper.make_node('ReduceSum', ['zd'], ['s'], axes=[1,2,3], keepdims=1),
+             helper.make_node('Add', ['s','c'], ['output'])]
+    return mk(nodes, inits)
+def _attr_to_dict(attr_proto):
+    """Convert ONNX AttributeProto to Python native type."""
+    from onnx import AttributeProto
+    if attr_proto.type == AttributeProto.INT:
+        return attr_proto.i
+    elif attr_proto.type == AttributeProto.INTS:
+        return list(attr_proto.ints)
+    elif attr_proto.type == AttributeProto.FLOAT:
+        return attr_proto.f
+    elif attr_proto.type == AttributeProto.FLOATS:
+        return list(attr_proto.floats)
+    elif attr_proto.type == AttributeProto.STRING:
+        return attr_proto.s.decode('utf-8')
+    elif attr_proto.type == AttributeProto.STRINGS:
+        return [s.decode('utf-8') for s in attr_proto.strings]
+    elif attr_proto.type == AttributeProto.TENSOR:
+        return numpy_helper.to_array(attr_proto.t)
+    else:
+        return None
+# ============================================================
+# COMPOSITION DETECTORS (transform + color_map)
+# ============================================================
+def _apply_transform(inp, transform_name):
+    """Apply a named transform to a numpy array."""
+    if transform_name == 'id': return inp
+    elif transform_name == 'fliplr': return np.fliplr(inp)
+    elif transform_name == 'flipud': return np.flipud(inp)
+    elif transform_name == 'rot90': return np.rot90(inp, 1)
+    elif transform_name == 'rot180': return np.rot90(inp, 2)
+    elif transform_name == 'rot270': return np.rot90(inp, 3)
+    elif transform_name == 'T': return inp.T
+    else: return inp
+def s_composition_rotate_color(td):
+    """Detect rotation + color_map composition."""
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH,IW),(OH,OW) = sp
+    if (IH,IW) != (OH,OW): return None
+    for k in [1, 2, 3]:
+        # Try each rotation, then check if consistent color_map remains
+        cm = {}
+        valid = True
+        for inp, out in exs:
+            rotated = np.rot90(inp, k)
+            if rotated.shape != out.shape: valid = False; break
+            for iv, ov in zip(rotated.flat, out.flat):
+                iv, ov = int(iv), int(ov)
+                if iv in cm and cm[iv] != ov: valid = False; break
+                cm[iv] = ov
+            if not valid: break
+        if not valid: continue
+        # Build: rotate first (Slice-based), then color_map
+        rot_model = _build_slice_rotate_model(k, IH, IW)
+        # Extract nodes from rot_model, prepend to color_map
+        cm_model = _build_color_map_model(cm)
+        # Combine: input -> rot_nodes -> color_map -> output
+        # We need to chain the graphs
+        combined_nodes = []
+        combined_inits = []
+        # Add rotation nodes with renamed intermediates
+        for node in rot_model.graph.node:
+            if node.output[0] == 'output':
+                # Last node of rotation feeds into color map
+                new_node = helper.make_node(node.op_type, list(node.input), ['rot_out'],
+                                           **{attr.name: _attr_to_dict(attr) for attr in node.attribute})
+            else:
+                new_node = node
+            combined_nodes.append(new_node)
+        for init in rot_model.graph.initializer:
+            combined_inits.append(init)
+        # Add color map nodes with input = rot_out
+        for node in cm_model.graph.node:
+            if node.input[0] == 'input':
+                new_node = helper.make_node(node.op_type, ['rot_out'] + list(node.input[1:]), list(node.output),
+                                           **{attr.name: _attr_to_dict(attr) for attr in node.attribute})
+            else:
+                new_node = node
+            combined_nodes.append(new_node)
+        for init in cm_model.graph.initializer:
+            combined_inits.append(init)
+        return mk(combined_nodes, combined_inits)
+    return None
+def s_composition_flip_color(td):
+    """Detect flip + color_map composition."""
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH,IW),(OH,OW) = sp
+    if (IH,IW) != (OH,OW): return None
+    for axis, flip_fn in [(0, np.flipud), (1, np.fliplr)]:
+        cm = {}
+        valid = True
+        for inp, out in exs:
+            flipped = flip_fn(inp)
+            if flipped.shape != out.shape: valid = False; break
+            for iv, ov in zip(flipped.flat, out.flat):
+                iv, ov = int(iv), int(ov)
+                if iv in cm and cm[iv] != ov: valid = False; break
+                cm[iv] = ov
+            if not valid: break
+        if not valid: continue
+        flip_model = _build_slice_flip_model(axis, IH, IW)
+        cm_model = _build_color_map_model(cm)
+        combined_nodes = []
+        combined_inits = []
+        for node in flip_model.graph.node:
+            if node.output[0] == 'output':
+                new_node = helper.make_node(node.op_type, list(node.input), ['flip_out'],
+                                           **{attr.name: _attr_to_dict(attr) for attr in node.attribute})
+            else:
+                new_node = node
+            combined_nodes.append(new_node)
+        for init in flip_model.graph.initializer:
+            combined_inits.append(init)
+        for node in cm_model.graph.node:
+            if node.input[0] == 'input':
+                new_node = helper.make_node(node.op_type, ['flip_out'] + list(node.input[1:]), list(node.output),
+                                           **{attr.name: _attr_to_dict(attr) for attr in node.attribute})
+            else:
+                new_node = node
+            combined_nodes.append(new_node)
+        for init in cm_model.graph.initializer:
+            combined_inits.append(init)
+        return mk(combined_nodes, combined_inits)
+    return None
+def s_composition_transpose_color(td):
+    """Detect transpose + color_map composition."""
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH,IW),(OH,OW) = sp
+    cm = {}
+    valid = True
+    for inp, out in exs:
+        transposed = inp.T
+        if transposed.shape != out.shape: valid = False; break
+        for iv, ov in zip(transposed.flat, out.flat):
+            iv, ov = int(iv), int(ov)
+            if iv in cm and cm[iv] != ov: valid = False; break
+            cm[iv] = ov
+        if not valid: break
+    if not valid: return None
+    trans_model = _build_slice_transpose_model(IH, IW)
+    cm_model = _build_color_map_model(cm)
+    combined_nodes = []
+    combined_inits = []
+    for node in trans_model.graph.node:
+        if node.output[0] == 'output':
+            new_node = helper.make_node(node.op_type, list(node.input), ['trans_out'],
+                                       **{attr.name: _attr_to_dict(attr) for attr in node.attribute})
+        else:
+            new_node = node
+        combined_nodes.append(new_node)
+    for init in trans_model.graph.initializer:
+        combined_inits.append(init)
+    for node in cm_model.graph.node:
+        if node.input[0] == 'input':
+            new_node = helper.make_node(node.op_type, ['trans_out'] + list(node.input[1:]), list(node.output),
+                                       **{attr.name: _attr_to_dict(attr) for attr in node.attribute})
+        else:
+            new_node = node
+        combined_nodes.append(new_node)
+    for init in cm_model.graph.initializer:
+        combined_inits.append(init)
+    return mk(combined_nodes, combined_inits)
+# ============================================================
+# CHANNEL REDUCTION WRAPPER
+# ============================================================
+def _get_active_colors(td):
+    """Returns set of all colors appearing in inputs and outputs."""
+    colors = set()
+    for ex in td['train'] + td['test']:
+        for row in ex['input']:
+            colors.update(row)
+        for row in ex['output']:
+            colors.update(row)
+    return colors
+def _build_channel_reduced_model(inner_model, input_colors, output_colors):
+    """Wrap a model with channel reduction: Conv1x1(10->N) -> inner -> Conv1x1(N->10).
+    This saves MACs when N < 10."""
+    n_in = len(input_colors)
+    n_out = len(output_colors)
+    # Maps from full 10 channels to reduced set
+    in_map = sorted(input_colors)
+    out_map = sorted(output_colors)
+    # W_reduce: [n_in, 10, 1, 1] - maps 10 channels to n_in
+    W_reduce = np.zeros((n_in, 10, 1, 1), dtype=np.float32)
+    for i, c in enumerate(in_map):
+        W_reduce[i, c, 0, 0] = 1.0
+    # W_expand: [10, n_out, 1, 1] - maps n_out channels back to 10
+    W_expand = np.zeros((10, n_out, 1, 1), dtype=np.float32)
+    for i, c in enumerate(out_map):
+        W_expand[c, i, 0, 0] = 1.0
+    # Build the wrapped model
+    nodes = [
+        helper.make_node('Conv', ['input', 'W_reduce'], ['reduced'], kernel_shape=[1,1]),
+    ]
+    inits = [numpy_helper.from_array(W_reduce, 'W_reduce')]
+    # Add inner model nodes with input='reduced' and output renamed
+    for node in inner_model.graph.node:
+        if node.input[0] == 'input':
+            new_inputs = ['reduced'] + list(node.input[1:])
+        else:
+            new_inputs = list(node.input)
+        if node.output[0] == 'output':
+            new_outputs = ['inner_out']
+        else:
+            new_outputs = list(node.output)
+        new_node = helper.make_node(node.op_type, new_inputs, new_outputs,
+                                    **{attr.name: _attr_to_dict(attr) for attr in node.attribute})
+        nodes.append(new_node)
+    for init in inner_model.graph.initializer:
+        if init.name != 'W_reduce':  # avoid conflict
+            inits.append(init)
+    nodes.append(helper.make_node('Conv', ['inner_out', 'W_expand'], ['output'], kernel_shape=[1,1]))
+    inits.append(numpy_helper.from_array(W_expand, 'W_expand'))
+    return mk(nodes, inits)
+def _try_channel_reduction(solver_fn, td):
+    """Try a solver with channel reduction wrapper if it reduces cost.
+    NOTE: Currently disabled for Gather-based models (spatial_gather, etc.)
+    as they hardcode channel=10 in Reshape operations."""
+    model = solver_fn(td)
+    if model is None: return None
+    # DISABLED: Channel reduction breaks Gather-based models
+    # that reshape to [1,10,900]. Only applies to Conv-based models.
+    # colors = _get_active_colors(td)
+    # if len(colors) >= 8:
+    #     return model
+    # try:
+    #     wrapped = _build_channel_reduced_model(model, colors, colors)
+    #     return wrapped
+    # except Exception:
+    #     return model
+    return model
+# ============================================================
+# CONV SOLVERS WITH LOOCV RIDGE + STRIDE TRICKS
+# ============================================================
+def add_onehot_block(nodes, inits, am_name, oh_name):
+    """Equal + Cast one-hot encoding (replaces OneHot which lacks CUDA kernel)."""
+    classes = np.arange(10, dtype=np.int64).reshape(1, 10, 1, 1)
+    inits.append(numpy_helper.from_array(classes, 'classes'))
+    nodes.append(helper.make_node('Equal', [am_name, 'classes'], ['eq']))
+    nodes.append(helper.make_node('Cast', ['eq'], [oh_name], to=TensorProto.FLOAT))
+def _extract_patches_strided(oh_pad, ks, out_shape):
+    """Extract patches using stride_tricks for speedup.
+    oh_pad: [C, H+2p, W+2p] padded one-hot array
+    ks: kernel size
+    out_shape: (OH, OW) output shape
+    Returns: patches array [OH*OW, C*ks*ks]
+    """
+    C, Hp, Wp = oh_pad.shape
+    OH, OW = out_shape
+    # Use as_strided to create sliding window view over padded array
+    stride_c = oh_pad.strides[0]
+    stride_h = oh_pad.strides[1]
+    stride_w = oh_pad.strides[2]
+    # Ensure base covers all needed elements: up to (OH-1+ks, OW-1+ks)
+    needed_h = min(OH - 1 + ks, Hp)
+    needed_w = min(OW - 1 + ks, Wp)
+    base = oh_pad[:, :needed_h, :needed_w]
+    # Shape: [OH, OW, C, ks, ks]
+    shape = (OH, OW, C, ks, ks)
+    strides = (stride_h, stride_w, stride_c, stride_h, stride_w)
+    patches_view = np.lib.stride_tricks.as_strided(base, shape=shape, strides=strides)
+    # Reshape to [OH*OW, C*ks*ks]
+    return patches_view.reshape(OH * OW, C * ks * ks)
+def _effective_rank(P):
+    """Compute effective rank r(Σ) = Tr(Σ) / ‖Σ‖."""
+    Sigma = np.cov(P, rowvar=False)
+    evals = np.linalg.eigvalsh(Sigma)
+    evals = evals[evals > 1e-12]
+    if len(evals) == 0: return 0
+    return np.sum(evals) / np.max(evals)
+def _tune_ridge_loocv(P, T_oh, lambdas):
+    """Find best λ using efficient LOOCV via Hat Matrix diagonal (SVD shortcut).
+    Cawley & Talbot (2010), JMLR.
+    """
+    n, p = P.shape
+    try:
+        U, s, Vt = np.linalg.svd(P, full_matrices=False)
+    except Exception:
+        return None
+    best_lambda, min_err = None, float('inf')
+    for lam in lambdas:
+        d = (s**2) / (s**2 + lam)
+        y_hat = (U * d) @ (U.T @ T_oh)
+        # Ridge hat matrix diagonal: h_ii = Σ_j U_ij^2 * s_j^2 / (s_j^2 + λ)
+        h_ii = np.sum((U**2) * d[np.newaxis, :], axis=1)
+        # LOOCV shortcut: error_i = (y_i - ŷ_i) / (1 - h_ii)
+        denom = 1 - h_ii
+        denom = np.where(np.abs(denom) < 1e-10, 1e-10, denom)
+        errors = (T_oh - y_hat) / denom[:, np.newaxis]
+        mse = np.mean(errors**2)
+        if mse < min_err:
+            min_err, best_lambda = mse, lam
+    return best_lambda
+def _lstsq_conv(exs_raw, ks, use_bias, use_full_30=False, use_ridge=True):
+    """Shared lstsq conv fitting with optional LOOCV Ridge tuning.
+    Returns (Wconv, B) or None."""
+    pad = ks // 2
+    feat = 10 * ks * ks + (1 if use_bias else 0)
+    if feat > 20000: return None
+    patches_list, targets = [], []
+    for inp_g, out_g in exs_raw:
+        ih, iw = inp_g.shape
+        if use_full_30:
+            oh_full = np.zeros((10, GH, GW), dtype=np.float64)
+            for c in range(10): oh_full[c, :ih, :iw] = (inp_g == c)
+            oh_pad = np.pad(oh_full, ((0,0),(pad,pad),(pad,pad)))
+        else:
+            oh_enc = np.zeros((10, ih, iw), dtype=np.float64)
+            for c in range(10): oh_enc[c] = (inp_g == c)
+            oh_pad = np.pad(oh_enc, ((0,0),(pad,pad),(pad,pad)))
+        oh, ow = out_g.shape
+        # Try stride_tricks for speedup
+        try:
+            patches = _extract_patches_strided(oh_pad, ks, (oh, ow))
+            if use_bias:
+                bias_col = np.ones((patches.shape[0], 1), dtype=np.float64)
+                patches = np.concatenate([patches, bias_col], axis=1)
+            patches_list.append(patches)
+            targets.append(out_g.flatten())
+        except Exception:
+            # Fallback to loop-based extraction
+            for r in range(oh):
+                for c in range(ow):
+                    p = oh_pad[:, r:r+ks, c:c+ks].flatten()
+                    if use_bias: p = np.append(p, 1.0)
+                    patches_list.append(p)
+                    targets.append(int(out_g[r, c]))
+    if len(patches_list) > 0 and isinstance(patches_list[0], np.ndarray) and patches_list[0].ndim == 2:
+        P = np.concatenate(patches_list, axis=0)
+        T = np.concatenate(targets)
+    else:
+        P = np.array(patches_list, dtype=np.float64)
+        T = np.array(targets, dtype=np.int64)
+    n_patches = P.shape[0]
+    if feat > 5000 and n_patches > 2000: return None
+    T_oh = np.zeros((len(T), 10), dtype=np.float64)
+    for i, t in enumerate(T): T_oh[i, t] = 1.0
+    # Quick condition number estimate using norm ratio (cheaper than full SVD)
+    # Only skip if clearly pathological; otherwise try lstsq
+    cond_estimate = None
+    try:
+        # Use 2-norm estimate: cond ≈ ||P||_2 * ||P^+||_2 ≈ max_singular / min_singular
+        # We approximate with norm ratios for speed
+        p_norm = np.linalg.norm(P, 2)
+        if p_norm > 0:
+            # Estimate using power method approximation or just try lstsq
+            pass  # Don't waste time on condition number - lstsq will handle it
+    except Exception:
+        pass
+    if use_ridge and n_patches <= feat * 1.5:
+        # Use LOOCV Ridge tuning when system is underdetermined or near interpolation threshold
+        lambdas = np.logspace(-4, 2, 10)
+        best_lam = _tune_ridge_loocv(P, T_oh, lambdas)
+        if best_lam is not None:
+            # Ridge solve: (P^T P + λI)^-1 P^T T
+            try:
+                WT = np.linalg.solve(P.T @ P + best_lam * np.eye(P.shape[1]), P.T @ T_oh)
+            except Exception:
+                WT = np.linalg.lstsq(P, T_oh, rcond=None)[0]
+        else:
+            WT = np.linalg.lstsq(P, T_oh, rcond=None)[0]
+    else:
+        WT = np.linalg.lstsq(P, T_oh, rcond=None)[0]
+    if not np.array_equal(np.argmax(P @ WT, axis=1), T): return None
+    if use_bias:
+        Wconv = WT[:-1].T.reshape(10, 10, ks, ks).astype(np.float32)
+        B = WT[-1].astype(np.float32)
+    else:
+        Wconv = WT.T.reshape(10, 10, ks, ks).astype(np.float32)
+        B = None
+    return Wconv, B
+# ============================================================
+# CONV SOLVER WRAPPERS
+# ============================================================
+def _get_ks_for_budget(time_budget):
+    """Return kernel sizes to try based on time budget."""
+    if time_budget < 5:
+        return [1, 3, 5]
+    elif time_budget < 10:
+        return [1, 3, 5, 7, 9]
+    elif time_budget < 20:
+        return [1, 3, 5, 7, 9, 11, 13, 15, 17]
+    else:
+        return [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
+def solve_conv_fixed(td, path, time_budget=30.0):
+    """Fixed-shape conv: Slice -> Conv -> ArgMax -> Equal+Cast -> Pad."""
+    exs = get_exs(td)
+    for inp, out in exs:
+        if inp.shape != out.shape: return None
+    shapes = set(inp.shape for inp, _ in exs)
+    if len(shapes) != 1: return None
+    IH, IW = shapes.pop()
+    fit_exs = get_exs_for_fitting(td)
+    fit_exs = [(i,o) for i,o in fit_exs if i.shape == o.shape and i.shape == (IH, IW)]
+    t_start = time.time()
+    for use_bias in [False, True]:
+        for ks in _get_ks_for_budget(time_budget):
+            if time.time() - t_start > time_budget: return None
+            result = _lstsq_conv(fit_exs, ks, use_bias, use_full_30=False)
+            if result is None: continue
+            Wconv, B = result
+            pad = ks // 2
+            pad_h, pad_w = GH - IH, GW - IW
+            inits = [
+                numpy_helper.from_array(np.array([0,0,0,0], dtype=np.int64), 'sl_st'),
+                numpy_helper.from_array(np.array([1,10,IH,IW], dtype=np.int64), 'sl_en'),
+                numpy_helper.from_array(Wconv, 'W'),
+            ]
+            conv_inputs = ['grid', 'W']
+            if B is not None:
+                inits.append(numpy_helper.from_array(B, 'B'))
+                conv_inputs.append('B')
+            nodes = [
+                helper.make_node('Slice', ['input','sl_st','sl_en'], ['grid']),
+                helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
+                helper.make_node('ArgMax', ['co'], ['am'], axis=1, keepdims=1),
+            ]
+            add_onehot_block(nodes, inits, 'am', 'oh_out')
+            cf_pads = numpy_helper.from_array(np.array([0,0,0,0,0,0,pad_h,pad_w], dtype=np.int64), 'cf_pads')
+            inits.append(cf_pads)
+            nodes.append(
+                helper.make_node('Pad', ['oh_out', 'cf_pads'], ['output'], mode='constant')
+            )
+            model = mk(nodes, inits)
+            onnx.save(model, path)
+            if validate(path, td): return 'conv_fixed', model
+    return None
+def solve_conv_variable(td, path, time_budget=30.0):
+    """Variable-shape conv: Conv(30x30) -> ArgMax -> Equal+Cast -> Mul(mask)."""
+    exs = get_exs(td)
+    for inp, out in exs:
+        if inp.shape != out.shape: return None
+    fit_exs = get_exs_for_fitting_variable(td)
+    fit_exs = [(i,o) for i,o in fit_exs if i.shape == o.shape]
+    t_start = time.time()
+    for use_bias in [False, True]:
+        for ks in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]:
+            if time.time() - t_start > time_budget: return None
+            result = _lstsq_conv(fit_exs, ks, use_bias, use_full_30=True)
+            if result is None: continue
+            Wconv, B = result
+            pad = ks // 2
+            inits = [numpy_helper.from_array(Wconv, 'W')]
+            conv_inputs = ['input', 'W']
+            if B is not None:
+                inits.append(numpy_helper.from_array(B, 'B'))
+                conv_inputs.append('B')
+            nodes = [
+                helper.make_node('ReduceSum', ['input'], ['mask'], axes=[1], keepdims=1),
+                helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
+                helper.make_node('ArgMax', ['co'], ['am'], axis=1, keepdims=1),
+            ]
+            add_onehot_block(nodes, inits, 'am', 'oh_out')
+            nodes.append(helper.make_node('Mul', ['oh_out', 'mask'], ['output']))
+            model = mk(nodes, inits)
+            onnx.save(model, path)
+            if validate(path, td): return 'conv_var', model
+    return None
+def solve_conv_diffshape(td, path, time_budget=30.0):
+    """Diff-shape conv for fixed io shapes where output is smaller."""
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH, IW), (OH, OW) = sp
+    if IH == OH and IW == OW: return None
+    if OH > IH or OW > IW: return None
+    if OH > 30 or OW > 30: return None
+    exs = get_exs(td)
+    t_start = time.time()
+    for dr_off, dc_off in [(0, 0), ((IH-OH)//2, (IW-OW)//2)]:
+        for use_bias in [False, True]:
+            for ks in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]:
+                if time.time() - t_start > time_budget: return None
+                pad = ks // 2
+                feat = 10 * ks * ks + (1 if use_bias else 0)
+                if feat > 10000: continue
+                patches, targets = [], []
+                valid = True
+                for inp_g, out_g in exs:
+                    oh_enc = np.zeros((10, IH, IW), dtype=np.float64)
+                    for c in range(10): oh_enc[c] = (inp_g == c)
+                    oh_pad = np.pad(oh_enc, ((0,0),(pad,pad),(pad,pad)))
+                    for r in range(OH):
+                        for c in range(OW):
+                            sr, sc = r + dr_off, c + dc_off
+                            if sr < 0 or sr >= IH or sc < 0 or sc >= IW:
+                                valid = False; break
+                            p = oh_pad[:, sr:sr+ks, sc:sc+ks].flatten()
+                            if use_bias: p = np.append(p, 1.0)
+                            patches.append(p)
+                            targets.append(int(out_g[r, c]))
+                        if not valid: break
+                    if not valid: break
+                if not valid: continue
+                n_patches = len(patches)
+                if feat > 5000 and n_patches > 2000: continue
+                P = np.array(patches, dtype=np.float64)
+                T = np.array(targets, dtype=np.int64)
+                T_oh = np.zeros((len(T), 10), dtype=np.float64)
+                for i, t in enumerate(T): T_oh[i, t] = 1.0
+                WT = np.linalg.lstsq(P, T_oh, rcond=None)[0]
+                if not np.array_equal(np.argmax(P @ WT, axis=1), T): continue
+                if use_bias:
+                    Wconv = WT[:-1].T.reshape(10, 10, ks, ks).astype(np.float32)
+                    B = WT[-1].astype(np.float32)
+                else:
+                    Wconv = WT.T.reshape(10, 10, ks, ks).astype(np.float32)
+                    B = None
+                pad_h, pad_w = GH - OH, GW - OW
+                inits = [
+                    numpy_helper.from_array(np.array([0,0,0,0], dtype=np.int64), 'sl_st'),
+                    numpy_helper.from_array(np.array([1,10,IH,IW], dtype=np.int64), 'sl_en'),
+                    numpy_helper.from_array(Wconv, 'W'),
+                    numpy_helper.from_array(np.array([0,0,dr_off,dc_off], dtype=np.int64), 'cr_st'),
+                    numpy_helper.from_array(np.array([1,10,dr_off+OH,dc_off+OW], dtype=np.int64), 'cr_en'),
+                ]
+                conv_inputs = ['grid', 'W']
+                if B is not None:
+                    inits.append(numpy_helper.from_array(B, 'B'))
+                    conv_inputs.append('B')
+                nodes = [
+                    helper.make_node('Slice', ['input','sl_st','sl_en'], ['grid']),
+                    helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
+                    helper.make_node('Slice', ['co','cr_st','cr_en'], ['co_crop']),
+                    helper.make_node('ArgMax', ['co_crop'], ['am'], axis=1, keepdims=1),
+                ]
+                add_onehot_block(nodes, inits, 'am', 'oh_out')
+                diff_pads = numpy_helper.from_array(np.array([0,0,0,0,0,0,pad_h,pad_w], dtype=np.int64), 'diff_pads')
+                inits.append(diff_pads)
+                nodes.append(
+                    helper.make_node('Pad', ['oh_out', 'diff_pads'], ['output'], mode='constant')
+                )
+                model = mk(nodes, inits)
+                onnx.save(model, path)
+                if validate(path, td): return 'conv_diff', model
+    return None
+def solve_conv_var_diff(td, path, time_budget=30.0):
+    """Variable diff-shape conv."""
+    exs = get_exs(td)
+    t_start = time.time()
+    for use_bias in [False, True]:
+        for ks in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]:
+            if time.time() - t_start > time_budget: return None
+            pad = ks // 2
+            feat = 10 * ks * ks + (1 if use_bias else 0)
+            if feat > 20000: continue
+            patches, targets = [], []
+            for inp_g, out_g in exs:
+                ih, iw = inp_g.shape
+                oh, ow = out_g.shape
+                oh_full = np.zeros((10, GH, GW), dtype=np.float64)
+                for c in range(10): oh_full[c, :ih, :iw] = (inp_g == c)
+                oh_pad = np.pad(oh_full, ((0,0),(pad,pad),(pad,pad)))
+                for r in range(oh):
+                    for c in range(ow):
+                        p = oh_pad[:, r:r+ks, c:c+ks].flatten()
+                        if use_bias: p = np.append(p, 1.0)
+                        patches.append(p)
+                        targets.append(int(out_g[r, c]))
+            n_patches = len(patches)
+            if feat > 5000 and n_patches > 2000: continue
+            P = np.array(patches, dtype=np.float64)
+            T = np.array(targets, dtype=np.int64)
+            T_oh = np.zeros((len(T), 10), dtype=np.float64)
+            for i, t in enumerate(T): T_oh[i, t] = 1.0
+            try:
+                WT = np.linalg.lstsq(P, T_oh, rcond=None)[0]
+            except:
+                continue
+            if not np.array_equal(np.argmax(P @ WT, axis=1), T): continue
+            if use_bias:
+                Wconv = WT[:-1].T.reshape(10, 10, ks, ks).astype(np.float32)
+                B = WT[-1].astype(np.float32)
+            else:
+                Wconv = WT.T.reshape(10, 10, ks, ks).astype(np.float32)
+                B = None
+            all_output_within_input = all(
+                out_g.shape[0] <= inp_g.shape[0] and out_g.shape[1] <= inp_g.shape[1]
+                for inp_g, out_g in exs
+            )
+            if not all_output_within_input:
+                continue
+            inits = [numpy_helper.from_array(Wconv, 'W')]
+            conv_inputs = ['input', 'W']
+            if B is not None:
+                inits.append(numpy_helper.from_array(B, 'B'))
+                conv_inputs.append('B')
+            nodes = [
+                helper.make_node('ReduceSum', ['input'], ['mask'], axes=[1], keepdims=1),
+                helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
+                helper.make_node('ArgMax', ['co'], ['am'], axis=1, keepdims=1),
+            ]
+            add_onehot_block(nodes, inits, 'am', 'oh_out')
+            nodes.append(helper.make_node('Mul', ['oh_out', 'mask'], ['output']))
+            model = mk(nodes, inits)
+            onnx.save(model, path)
+            if validate(path, td): return 'conv_var_diff', model
+    return None
+# ============================================================
+# MAIN SOLVER PIPELINE
+# ============================================================
+ANALYTICAL_SOLVERS = [  # (label, solver) pairs; solve_task tries them in list order and stops at the first model that validates
+    ('identity', s_identity),
+    ('constant', s_constant),
+    ('color_map', s_color_map),
+    ('transpose', s_transpose),  # solve_task matches some labels by string (e.g. 'transpose', 'flip', 'rotate') to apply channel reduction
+    ('flip', s_flip),
+    ('rotate', s_rotate),
+    ('tile', s_tile),
+    ('upscale', s_upscale),
+    ('kronecker', s_kronecker),
+    ('nonuniform_scale', s_nonuniform_scale),
+    ('mirror_h', s_mirror_h),
+    ('mirror_v', s_mirror_v),
+    ('quad_mirror', s_quad_mirror),
+    ('concat', s_concat),
+    ('concat_enhanced', s_concat_enhanced),
+    ('diagonal_tile', s_diagonal_tile),
+    ('fixed_crop', s_fixed_crop),
+    ('spatial_gather', s_spatial_gather),
+    ('shift', s_shift),
+    ('varshape_spatial_gather', s_varshape_spatial_gather),
+]
+COMPOSITION_SOLVERS = [  # (label, solver) pairs; solve_task tries these, in order, only after every analytical solver has failed
+    ('rotate_color', s_composition_rotate_color),
+    ('flip_color', s_composition_flip_color),
+    ('transpose_color', s_composition_transpose_color),
+]
+def solve_task(tn, td, outdir, conv_budget=30.0, use_channel_reduction=True):
+    t_start = time.time()
+    os.makedirs(outdir, exist_ok=True)
+    path = os.path.join(outdir, f"task{tn:03d}.onnx")
+    if tn in EXCLUDED_TASKS:
+        return False, 'excluded', None, time.time() - t_start, path
+    # 1. Try analytical solvers (fast, tiny models)
+    for sname, sfn in ANALYTICAL_SOLVERS:
+        try:
+            if use_channel_reduction and sname in ('transpose', 'flip', 'rotate', 'mirror_h', 'mirror_v', 'quad_mirror', 'shift', 'spatial_gather', 'varshape_spatial_gather'):
+                model = _try_channel_reduction(sfn, td)
+            else:
+                model = sfn(td)
+            if model is None: continue
+            onnx.save(model, path)
+            if validate(path, td):
+                return True, sname, os.path.getsize(path), time.time() - t_start, path
+        except Exception as e:
+            pass
+    # 2. Try composition solvers
+    for sname, sfn in COMPOSITION_SOLVERS:
+        try:
+            model = sfn(td)
+            if model is None: continue
+            onnx.save(model, path)
+            if validate(path, td):
+                return True, sname, os.path.getsize(path), time.time() - t_start, path
+        except Exception:
+            pass
+    # 3. Determine task shape category and try conv solvers
+    exs = get_exs(td)
+    same_shape = all(inp.shape == out.shape for inp, out in exs)
+    shapes = set(inp.shape for inp, _ in exs)
+    fixed_in = len(shapes) == 1
+    conv_time = conv_budget
+    if same_shape:
+        if fixed_in:
+            result = solve_conv_fixed(td, path, time_budget=conv_time/2)
+            if result is not None:
+                sname, model = result
+                return True, sname, os.path.getsize(path), time.time() - t_start, path
+        result = solve_conv_variable(td, path, time_budget=conv_time)
+        if result is not None:
+            sname, model = result
+            return True, sname, os.path.getsize(path), time.time() - t_start, path
+    else:
+        sp = fixed_shapes(td)
+        if sp is not None:
+            (IH,IW),(OH,OW) = sp
+            if OH <= IH and OW <= IW:
+                result = solve_conv_diffshape(td, path, time_budget=conv_time)
+                if result is not None:
+                    sname, model = result
+                    return True, sname, os.path.getsize(path), time.time() - t_start, path
+        result = solve_conv_var_diff(td, path, time_budget=conv_time)
+        if result is not None:
+            sname, model = result
+            return True, sname, os.path.getsize(path), time.time() - t_start, path
+    return False, None, None, time.time() - t_start, path
+def run_tasks(task_nums, tasks, output_dir, conv_budget, use_wandb, use_channel_reduction=True):
+    results = {}
+    costs_dict = {}
+    total_score = 0
+    for tn in task_nums:
+        if tn not in tasks:
+            continue
+        if tn in EXCLUDED_TASKS:
+            print(f"Task {tn:3d}: EXCLUDED (officially)")
+            continue
+        td = tasks[tn]['data']
+        ok, sname, sz, t_task, model_path = solve_task(tn, td, output_dir, conv_budget, use_channel_reduction)
+        if ok:
+            macs, memory, params = score_network(model_path)
+            if macs is None:
+                macs, memory, params = 0, 0, 0
+            cost = macs + memory + params
+            score = max(1.0, 25.0 - math.log(max(1, cost)))
+            total_score += score
+            results[tn] = (sname, t_task, sz)
+            costs_dict[tn] = cost
+            print(f"Task {tn:3d}: {sname:25s} {score:7.3f} {cost:>12} {t_task:7.3f}s  ({sz:>8,} bytes)")
+        else:
+            print(f"Task {tn:3d}: UNSOLVED  {t_task:7.3f}s")
+            cost = 0
+        if use_wandb and wandb is not None:
+            wandb.log({
+                "task_id": tn,
+                "solver": sname if ok else "unsolved",
+                "onnx_bytes": sz if ok else 0,
+                "task_time_sec": t_task,
+                "cost": cost,
+                "score": score if ok else 0,
+            })
+    return results, costs_dict, total_score
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_dir', default='ARC-AGI/data/training/')
+    parser.add_argument('--arcgen_dir', default='', help='Path to ARC-GEN-100K/ directory')
+    parser.add_argument('--output_dir', default='submission')
+    parser.add_argument('--kaggle', action='store_true')
+    parser.add_argument('--conv_budget', type=float, default=30.0)
+    parser.add_argument('--tasks', type=str, default='')
+    parser.add_argument('--device', type=str, default='auto', choices=['auto','cpu','cuda'])
+    parser.add_argument('--use_wandb', action='store_true')
+    parser.add_argument('--no_channel_reduction', action='store_true', help='Disable channel reduction wrapper')
+    args = parser.parse_args()
+    global ORT_PROVIDERS
+    config = {
+        "device": args.device,
+        "conv_budget": args.conv_budget,
+        "data_dir": args.data_dir,
+        "arcgen_dir": args.arcgen_dir,
+        "tasks": args.tasks,
+    }
+    if args.device == 'cuda':
+        ORT_PROVIDERS = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+    elif args.device == 'cpu':
+        ORT_PROVIDERS = ['CPUExecutionProvider']
+    ort.set_default_logger_severity(3)
+    print(f"Using providers: {ORT_PROVIDERS}")
+    print(f"OPSET: 17 (v5)")
+    if args.kaggle:
+        tasks = load_tasks_kaggle(args.data_dir)
+    else:
+        arcgen = args.arcgen_dir if args.arcgen_dir else None
+        tasks = load_tasks_dir(args.data_dir, arcgen_dir=arcgen)
+    total_arcgen = sum(len(t['data'].get('arc-gen', [])) for t in tasks.values())
+    print(f"Loaded {len(tasks)} tasks ({total_arcgen} ARC-GEN examples)")
+    print(f"Excluded tasks: {sorted(EXCLUDED_TASKS)}")
+    task_nums = [int(t) for t in args.tasks.split(',')] if args.tasks else sorted(tasks.keys())
+    active_tasks = [t for t in task_nums if t not in EXCLUDED_TASKS]
+    print(f"Solving {len(active_tasks)} active tasks (skipping {len(task_nums) - len(active_tasks)} excluded)")
+    print(f"Conv budget: {args.conv_budget}s per task")
+    print(f"Channel reduction: {'enabled' if not args.no_channel_reduction else 'disabled'}")
+    print("=" * 70)
+    t0 = time.time()
+    use_ch_red = not args.no_channel_reduction
+    if args.use_wandb and wandb is not None:
+        with wandb.init(project="neurogolf", name="solver_run", config=config):
+            results, costs_dict, total_score = run_tasks(task_nums, tasks, args.output_dir, args.conv_budget, use_wandb=True, use_channel_reduction=use_ch_red)
+    else:
+        results, costs_dict, total_score = run_tasks(task_nums, tasks, args.output_dir, args.conv_budget, use_wandb=False, use_channel_reduction=use_ch_red)
+    elapsed = time.time() - t0
+    print(f"\n{'='*70}")
+    print(f"Solved: {len(results)}/{len(active_tasks)} active tasks in {elapsed:.0f}s")
+    solver_names = [v[0] for v in results.values()]
+    sc = Counter(solver_names)
+    for s, c in sc.most_common(): print(f"  {s}: {c}")
+    outdir = args.output_dir
+    n_files = len([f for f in os.listdir(outdir) if f.endswith('.onnx')])
+    total_size = sum(os.path.getsize(os.path.join(outdir, f))
+                     for f in os.listdir(outdir) if f.endswith('.onnx'))
+    zip_path = os.path.join(os.path.dirname(outdir) or '.', 'submission.zip')
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
+        for f in sorted(os.listdir(outdir)):
+            if f.endswith('.onnx'):
+                zf.write(os.path.join(outdir, f), f)
+    zip_bytes = buf.getvalue()
+    with open(zip_path, 'wb') as f:
+        f.write(zip_bytes)
+    zip_size = len(zip_bytes)
+    csv_path = os.path.join(os.path.dirname(outdir) or '.', 'submission.csv')
+    with open(csv_path, 'w', newline='') as f:
+        w = csv.writer(f)
+        w.writerow(['task_id', 'total_cost'])
+        for tn in sorted(costs_dict.keys()):
+            w.writerow([f'task{tn:03d}', costs_dict[tn]])
+    unsolved_count = len(active_tasks) - len(results)
+    est_lb = total_score + unsolved_count * 1.0
+    print(f"\n{n_files} ONNX files, {total_size/1024:.1f} KB uncompressed")
+    print(f"ZIP size: {zip_size/1024:.1f} KB / {MAX_FILESIZE/1024:.0f} KB limit {'OK' if zip_size <= MAX_FILESIZE else 'OVER!'}")
+    print(f"Estimated LB score: {est_lb:.1f} (solved: {total_score:.1f} + unsolved: {unsolved_count}×1.0)")
+    print(f"Written: {zip_path} | {csv_path}")
+if __name__ == '__main__':  # run the CLI only when executed as a script, not when imported
+    main()