rogermt
/

neurogolf-solver

Model card Files Files and versions

xet

Community

rogermt committed 15 days ago

Commit

6303972

verified ·

1 Parent(s): 5be5315

v3: 306/400 - Fix GatherElements->Gather (opset 10), add concat_enhanced, varshape_spatial_gather, conv_var_diff solvers

Browse files

Files changed (1) hide show

neurogolf_solver.py +447 -165

neurogolf_solver.py CHANGED Viewed

@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
 """
-ARC-AGI NeuroGolf Championship - Complete Solver v2
 Format: [1,10,30,30] one-hot input/output, opset 10, IR version 10.
 Solvers:
-  - Analytical: identity, constant, color_map, transpose, flip, rotate, tile, upscale, concat, spatial_gather
-  - Conv (fixed shape): Slice -> Conv -> ArgMax -> OneHot -> Pad
-  - Conv (variable shape): Conv(30x30) -> ArgMax -> OneHot -> Mul(mask)  [NEW]
-  - Conv (diff shape): Slice -> Conv -> Slice(crop) -> ArgMax -> OneHot -> Pad  [NEW]
-Results: 293/400 tasks solved (was 128/400 in v1)
 Usage:
   python neurogolf_solver.py --data_dir ARC-AGI/data/training/ --output_dir submission
   python neurogolf_solver.py --data_dir ARC-AGI/data/training/ --output_dir submission --conv_budget 60
@@ -19,9 +21,17 @@ import onnx
 from onnx import helper, TensorProto, numpy_helper
 import onnxruntime as ort
 from collections import Counter
-import wandb
-from neurogolf_utils import score_network
 BATCH, CH, GH, GW = 1, 10, 30, 30
 GRID_SHAPE = [BATCH, CH, GH, GW]
@@ -30,10 +40,14 @@ IR = 10
 OPSET = [helper.make_opsetid("", 10)]
 def get_providers():
-    return ['CPUExecutionProvider']  # CPU is faster for tiny 30x30 grids
 ORT_PROVIDERS = get_providers()
 def load_tasks_dir(data_dir):
     files = sorted(f for f in os.listdir(data_dir) if f.endswith('.json'))
     tasks = {}
@@ -94,6 +108,65 @@ def fixed_shapes(td):
         shapes.add((inp.shape, out.shape))
     return list(shapes)[0] if len(shapes) == 1 else None
 # ============================================================
 # ANALYTICAL SOLVERS
 # ============================================================
@@ -179,6 +252,44 @@ def s_spatial_gather(td):
             if not found and cst[oi,oj] < 0: return None
     return _build_gather_model_with_const(IH, IW, OH, OW, idx, cst)
 def s_tile(td):
     exs = get_exs(td)
     in_shapes = set(inp.shape for inp,_ in exs)
@@ -280,6 +391,174 @@ def s_concat(td):
                     return _build_gather_model(OH, OW, idx)
     return None
 def s_constant(td):
     sp = fixed_shapes(td)
     if sp is None: return None
@@ -298,22 +577,16 @@ def s_constant(td):
     return mk(nodes, inits)
 # ============================================================
-# CONV SOLVER (fixed shape) - Slice -> Conv -> ArgMax -> OneHot -> Pad
 # ============================================================
 def add_onehot_block(nodes, inits, am_name, oh_name):
-    """
-    Replace OneHot with CUDA-friendly Equal + Cast.
-    am_name: name of ArgMax output tensor, shape [1,1,H,W]
-    oh_name: desired float one-hot output name, shape [1,10,H,W]
-    """
     classes = np.arange(10, dtype=np.int64).reshape(1, 10, 1, 1)
     inits.append(numpy_helper.from_array(classes, 'classes'))
     nodes.append(helper.make_node('Equal', [am_name, 'classes'], ['eq']))
     nodes.append(helper.make_node('Cast', ['eq'], [oh_name], to=TensorProto.FLOAT))
 def _lstsq_conv(exs_raw, ks, use_bias, use_full_30=False):
     """Shared lstsq conv fitting. Returns (Wconv, B) or None."""
     pad = ks // 2
@@ -391,49 +664,19 @@ def solve_conv_fixed(td, path, time_budget=30.0):
             nodes = [
                 helper.make_node('Slice', ['input','sl_st','sl_en'], ['grid']),
                 helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
-                helper.make_node('ArgMax', ['co'], ['am'], axis=1, keepdims=1),  # [1,1,H,W]
             ]
-            # One-hot via Equal + Cast
             add_onehot_block(nodes, inits, 'am', 'oh_out')
             nodes.append(
-                helper.make_node(
-                    'Pad', ['oh_out'], ['output'],
-                    pads=[0,0,0,0,0,0,pad_h,pad_w],
-                    value=0.0
-                )
             )
             model = mk(nodes, inits)
             onnx.save(model, path)
-            if validate(path, td): return model
     return None
-# ============================================================
-# CONV SOLVER (variable shape) - Conv(30x30) -> ArgMax -> OneHot -> Mul(mask)
-# ============================================================
-def _add_onehot_equal_cast(nodes, inits, am_name, oh_name):
-    """
-    Replace OneHot with CUDA-friendly Equal + Cast.
-    am_name: name of ArgMax output tensor (shape [1,1,H,W] or [1,1,OH,OW])
-    oh_name: desired one-hot output name (shape [1,10,H,W] or [1,10,OH,OW])
-    """
-    inits.append(
-        numpy_helper.from_array(
-            np.arange(10, dtype=np.int64).reshape(1, 10, 1, 1),
-            'classes'
-        )
-    )
-    nodes.append(
-        helper.make_node('Equal', [am_name, 'classes'], ['eq'])
-    )
-    nodes.append(
-        helper.make_node('Cast', ['eq'], [oh_name], to=TensorProto.FLOAT)
-    )
 def solve_conv_variable(td, path, time_budget=30.0):
     """Variable-shape conv: Conv(30x30) -> ArgMax -> Equal+Cast -> Mul(mask)."""
     exs = get_exs(td)
@@ -449,9 +692,7 @@ def solve_conv_variable(td, path, time_budget=30.0):
             Wconv, B = result
             pad = ks // 2
-            inits = [
-                numpy_helper.from_array(Wconv, 'W'),
-            ]
             conv_inputs = ['input', 'W']
             if B is not None:
                 inits.append(numpy_helper.from_array(B, 'B'))
@@ -460,26 +701,16 @@ def solve_conv_variable(td, path, time_budget=30.0):
             nodes = [
                 helper.make_node('ReduceSum', ['input'], ['mask'], axes=[1], keepdims=1),
                 helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
-                helper.make_node('ArgMax', ['co'], ['am'], axis=1, keepdims=1),  # [1,1,H,W]
             ]
-            # One-hot via Equal + Cast
             add_onehot_block(nodes, inits, 'am', 'oh_out')
-            nodes.append(
-                helper.make_node('Mul', ['oh_out', 'mask'], ['output'])
-            )
             model = mk(nodes, inits)
             onnx.save(model, path)
-            if validate(path, td): return model
     return None
-# ============================================================
-# CONV SOLVER (diff shape, fixed) - output smaller than input
-# ============================================================
 def solve_conv_diffshape(td, path, time_budget=30.0):
     """Diff-shape conv for fixed io shapes where output is smaller."""
     sp = fixed_shapes(td)
@@ -554,81 +785,129 @@ def solve_conv_diffshape(td, path, time_budget=30.0):
                     helper.make_node('Slice', ['input','sl_st','sl_en'], ['grid']),
                     helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
                     helper.make_node('Slice', ['co','cr_st','cr_en'], ['co_crop']),
-                    helper.make_node('ArgMax', ['co_crop'], ['am'], axis=1, keepdims=1),  # [1,1,OH,OW]
                 ]
-                # One-hot via Equal + Cast
                 add_onehot_block(nodes, inits, 'am', 'oh_out')
                 nodes.append(
-                    helper.make_node(
-                        'Pad', ['oh_out'], ['output'],
-                        pads=[0,0,0,0,0,0,pad_h,pad_w],
-                        value=0.0
-                    )
                 )
                 model = mk(nodes, inits)
                 onnx.save(model, path)
-                if validate(path, td): return model
     return None
-# ============================================================
-# GATHER HELPERS
-# ============================================================
-def _build_gather_model(OH, OW, idx):
-    flat_idx = np.zeros((1,10,GH*GW), dtype=np.int64)
-    mask = np.zeros((1,1,GH,GW), dtype=np.float32)
-    for oi in range(OH):
-        for oj in range(OW):
-            flat_idx[0,:,oi*GW+oj] = idx[oi,oj,0]*GW + idx[oi,oj,1]
-            mask[0,0,oi,oj] = 1.0
-    inits = [
-        numpy_helper.from_array(np.array([1,10,GH*GW], dtype=np.int64), 'fs'),
-        numpy_helper.from_array(flat_idx, 'idx'),
-        numpy_helper.from_array(np.array([1,10,GH,GW], dtype=np.int64), 'os'),
-        numpy_helper.from_array(mask, 'mask'),
-    ]
-    nodes = [
-        helper.make_node('Reshape', ['input','fs'], ['flat']),
-        helper.make_node('GatherElements', ['flat','idx'], ['g'], axis=2),
-        helper.make_node('Reshape', ['g','os'], ['raw']),
-        helper.make_node('Mul', ['raw','mask'], ['output']),
-    ]
-    return mk(nodes, inits)
-def _build_gather_model_with_const(IH, IW, OH, OW, idx, cst):
-    flat_idx = np.zeros((1,10,GH*GW), dtype=np.int64)
-    gather_mask = np.zeros((1,1,GH,GW), dtype=np.float32)
-    const_oh = np.zeros((1,10,GH,GW), dtype=np.float32)
-    for oi in range(OH):
-        for oj in range(OW):
-            if idx[oi,oj,0] >= 0:
-                flat_idx[0,:,oi*GW+oj] = idx[oi,oj,0]*GW + idx[oi,oj,1]
-                gather_mask[0,0,oi,oj] = 1.0
-            elif cst[oi,oj] >= 0:
-                const_oh[0, cst[oi,oj], oi, oj] = 1.0
-    has_const = np.any(const_oh > 0)
-    inits = [
-        numpy_helper.from_array(np.array([1,10,GH*GW], dtype=np.int64), 'fs'),
-        numpy_helper.from_array(flat_idx, 'idx'),
-        numpy_helper.from_array(np.array([1,10,GH,GW], dtype=np.int64), 'os'),
-        numpy_helper.from_array(gather_mask, 'gmask'),
-    ]
-    nodes = [
-        helper.make_node('Reshape', ['input','fs'], ['flat']),
-        helper.make_node('GatherElements', ['flat','idx'], ['g'], axis=2),
-        helper.make_node('Reshape', ['g','os'], ['raw']),
-        helper.make_node('Mul', ['raw','gmask'], ['masked']),
-    ]
-    if has_const:
-        inits.append(numpy_helper.from_array(const_oh, 'cst'))
-        nodes.append(helper.make_node('Add', ['masked','cst'], ['output']))
-    else:
-        nodes[-1] = helper.make_node('Mul', ['raw','gmask'], ['output'])
-    return mk(nodes, inits)
 # ============================================================
 # MAIN
@@ -637,8 +916,11 @@ def _build_gather_model_with_const(IH, IW, OH, OW, idx, cst):
 ANALYTICAL_SOLVERS = [
     ('identity', s_identity), ('constant', s_constant), ('color_map', s_color_map),
     ('transpose', s_transpose), ('flip', s_flip), ('rotate', s_rotate),
-    ('tile', s_tile), ('upscale', s_upscale), ('concat', s_concat),
     ('spatial_gather', s_spatial_gather),
 ]
 def solve_task(tn, td, outdir, conv_budget=30.0):
@@ -652,32 +934,43 @@ def solve_task(tn, td, outdir, conv_budget=30.0):
             model = sfn(td)
             if model is None: continue
             onnx.save(model, path)
-            if validate(path, td): return True, sname, os.path.getsize(path), time.time() - t_start, path
         except: pass
-    # 2. Determine task shape category
     exs = get_exs(td)
     same_shape = all(inp.shape == out.shape for inp, out in exs)
     shapes = set(inp.shape for inp, _ in exs)
     fixed_in = len(shapes) == 1
     if same_shape:
         if fixed_in:
-            # Fixed same-shape: use original conv (Slice->Conv->Pad)
-            model = solve_conv_fixed(td, path, time_budget=conv_budget)
-            if model is not None: return True, sname, os.path.getsize(path), time.time() - t_start, path
-        # Always try variable-shape conv for same-shape tasks
-        model = solve_conv_variable(td, path, time_budget=conv_budget)
-        if model is not None: return True, sname, os.path.getsize(path), time.time() - t_start, path
     else:
-        # Different shapes
         sp = fixed_shapes(td)
         if sp is not None:
             (IH,IW),(OH,OW) = sp
             if OH <= IH and OW <= IW:
-                # Output smaller: try diff-shape conv
-                model = solve_conv_diffshape(td, path, time_budget=conv_budget)
-                if model is not None: return True, sname, os.path.getsize(path), time.time() - t_start, path
     return False, None, None, time.time() - t_start, path
@@ -692,18 +985,21 @@ def run_tasks(task_nums, tasks, output_dir, conv_budget, use_wandb):
         ok, sname, sz, t_task, model_path = solve_task(tn, td, output_dir, conv_budget)
         if ok:
-            macs, memory, params = score_network(model_path)
-            if macs is None:
                 macs, memory, params = 0, 0, 0
             score = macs + memory + params
             results[tn] = (sname, t_task, sz)
-            print(f"Task {tn:3d}: {sname:20s}  {score} {t_task:7.3f}s  ({sz:>8,} bytes)")
         else:
             print(f"Task {tn:3d}: UNSOLVED  {t_task:7.3f}s")
             macs, memory, params, score = 0, 0, 0, 0
-        if use_wandb:
             wandb.log({
                 "task_id": tn,
                 "solver": sname if ok else "unsolved",
@@ -750,28 +1046,15 @@ def main():
     t0 = time.time()
     results = {}
-    if args.use_wandb:
         with wandb.init(
             project="neurogolf",
             name="solver_run",
             config=config,
         ):
-            results = run_tasks(
-                task_nums,
-                tasks,
-                args.output_dir,
-                args.conv_budget,
-                use_wandb=True
-            )
     else:
-        results = run_tasks(
-            task_nums,
-            tasks,
-            args.output_dir,
-            args.conv_budget,
-            use_wandb=False
-        )
     elapsed = time.time() - t0
     print(f"\n{'='*70}")
@@ -786,4 +1069,3 @@ def main():
 if __name__ == '__main__':
     main()

 #!/usr/bin/env python3
 """
+ARC-AGI NeuroGolf Championship - Complete Solver v3
 Format: [1,10,30,30] one-hot input/output, opset 10, IR version 10.
 Solvers:
+  - Analytical: identity, constant, color_map, transpose, flip, rotate, tile, upscale,
+                concat, concat_enhanced, spatial_gather, varshape_spatial_gather,
+                input_driven_tile, diagonal_tile, kronecker
+  - Conv (fixed shape): Slice -> Conv -> ArgMax -> Equal+Cast -> Pad
+  - Conv (variable shape): Conv(30x30) -> ArgMax -> Equal+Cast -> Mul(mask)
+  - Conv (diff shape): Slice -> Conv -> Slice(crop) -> ArgMax -> Equal+Cast -> Pad
+Results: 306/400 tasks solved (was 293/400 in v2)
 Usage:
   python neurogolf_solver.py --data_dir ARC-AGI/data/training/ --output_dir submission
   python neurogolf_solver.py --data_dir ARC-AGI/data/training/ --output_dir submission --conv_budget 60
 from onnx import helper, TensorProto, numpy_helper
 import onnxruntime as ort
 from collections import Counter
+try:
+    from neurogolf_utils import score_network
+except ImportError:
+    def score_network(path):
+        return 0, 0, 0
+try:
+    import wandb
+except ImportError:
+    wandb = None
 BATCH, CH, GH, GW = 1, 10, 30, 30
 GRID_SHAPE = [BATCH, CH, GH, GW]
 OPSET = [helper.make_opsetid("", 10)]
 def get_providers():
+    """Return the execution-provider list stored in ORT_PROVIDERS (CPU only).
+
+    NOTE(review): presumably handed to onnxruntime sessions when models are
+    validated; the call site is not visible in this view.
+    """
+    return ['CPUExecutionProvider']
 ORT_PROVIDERS = get_providers()
+# ============================================================
+# LOAD / VALIDATE
+# ============================================================
 def load_tasks_dir(data_dir):
     files = sorted(f for f in os.listdir(data_dir) if f.endswith('.json'))
     tasks = {}
         shapes.add((inp.shape, out.shape))
     return list(shapes)[0] if len(shapes) == 1 else None
+# ============================================================
+# GATHER HELPERS
+# ============================================================
+def _build_gather_model(OH, OW, idx):
+    # Use Gather (opset 1) instead of GatherElements (opset 11)
+    # Flatten spatial: [1,10,900] -> Gather(axis=2, indices=[900]) -> [1,10,900]
+    flat_idx = np.zeros((GH*GW,), dtype=np.int64)
+    mask = np.zeros((1,1,GH,GW), dtype=np.float32)
+    for oi in range(OH):
+        for oj in range(OW):
+            flat_idx[oi*GW+oj] = idx[oi,oj,0]*GW + idx[oi,oj,1]
+            mask[0,0,oi,oj] = 1.0
+    inits = [
+        numpy_helper.from_array(np.array([1,10,GH*GW], dtype=np.int64), 'fs'),
+        numpy_helper.from_array(flat_idx, 'idx'),
+        numpy_helper.from_array(np.array([1,10,GH,GW], dtype=np.int64), 'os'),
+        numpy_helper.from_array(mask, 'mask'),
+    ]
+    nodes = [
+        helper.make_node('Reshape', ['input','fs'], ['flat']),
+        helper.make_node('Gather', ['flat','idx'], ['g'], axis=2),
+        helper.make_node('Reshape', ['g','os'], ['raw']),
+        helper.make_node('Mul', ['raw','mask'], ['output']),
+    ]
+    return mk(nodes, inits)
+def _build_gather_model_with_const(IH, IW, OH, OW, idx, cst):
+    # Use Gather (opset 1) instead of GatherElements (opset 11)
+    flat_idx = np.zeros((GH*GW,), dtype=np.int64)
+    gather_mask = np.zeros((1,1,GH,GW), dtype=np.float32)
+    const_oh = np.zeros((1,10,GH,GW), dtype=np.float32)
+    for oi in range(OH):
+        for oj in range(OW):
+            if idx[oi,oj,0] >= 0:
+                flat_idx[oi*GW+oj] = idx[oi,oj,0]*GW + idx[oi,oj,1]
+                gather_mask[0,0,oi,oj] = 1.0
+            elif cst[oi,oj] >= 0:
+                const_oh[0, cst[oi,oj], oi, oj] = 1.0
+    has_const = np.any(const_oh > 0)
+    inits = [
+        numpy_helper.from_array(np.array([1,10,GH*GW], dtype=np.int64), 'fs'),
+        numpy_helper.from_array(flat_idx, 'idx'),
+        numpy_helper.from_array(np.array([1,10,GH,GW], dtype=np.int64), 'os'),
+        numpy_helper.from_array(gather_mask, 'gmask'),
+    ]
+    nodes = [
+        helper.make_node('Reshape', ['input','fs'], ['flat']),
+        helper.make_node('Gather', ['flat','idx'], ['g'], axis=2),
+        helper.make_node('Reshape', ['g','os'], ['raw']),
+        helper.make_node('Mul', ['raw','gmask'], ['masked']),
+    ]
+    if has_const:
+        inits.append(numpy_helper.from_array(const_oh, 'cst'))
+        nodes.append(helper.make_node('Add', ['masked','cst'], ['output']))
+    else:
+        nodes[-1] = helper.make_node('Mul', ['raw','gmask'], ['output'])
+    return mk(nodes, inits)
 # ============================================================
 # ANALYTICAL SOLVERS
 # ============================================================
             if not found and cst[oi,oj] < 0: return None
     return _build_gather_model_with_const(IH, IW, OH, OW, idx, cst)
+def s_varshape_spatial_gather(td):
+    """Spatial gather that works for variable-shape tasks by embedding in 30x30."""
+    sp = fixed_shapes(td)
+    if sp is not None: return None  # fixed shapes handled by s_spatial_gather
+    exs = get_exs(td)
+    # Embed all examples in 30x30
+    exs_30 = []
+    for inp, out in exs:
+        ih, iw = inp.shape
+        oh, ow = out.shape
+        inp30 = np.zeros((30, 30), dtype=np.int64)
+        out30 = np.zeros((30, 30), dtype=np.int64)
+        inp30[:ih, :iw] = inp
+        out30[:oh, :ow] = out
+        exs_30.append((inp30, out30))
+    idx = np.full((30, 30, 2), -1, dtype=np.int64)
+    cst = np.full((30, 30), -1, dtype=np.int64)
+    for oi in range(30):
+        for oj in range(30):
+            vals = set(int(out30[oi, oj]) for _, out30 in exs_30)
+            if len(vals) == 1:
+                cst[oi, oj] = vals.pop()
+            found = False
+            for ri in range(30):
+                for rj in range(30):
+                    if all(int(inp30[ri, rj]) == int(out30[oi, oj]) for inp30, out30 in exs_30):
+                        idx[oi, oj] = [ri, rj]
+                        found = True
+                        break
+                if found: break
+            if not found and cst[oi, oj] < 0:
+                return None
+    return _build_gather_model_with_const(30, 30, 30, 30, idx, cst)
 def s_tile(td):
     exs = get_exs(td)
     in_shapes = set(inp.shape for inp,_ in exs)
                     return _build_gather_model(OH, OW, idx)
     return None
+def s_concat_enhanced(td):
+    """Enhanced concat with all 8 dihedral group transforms."""
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH,IW),(OH,OW) = sp
+    if IH == OH and IW == OW: return None
+    # Need block decomposition
+    if OH % IH != 0 or OW % IW != 0: return None
+    rH, rW = OH // IH, OW // IW
+    if rH * rW > 16 or rH * rW < 2: return None
+    if OH > 30 or OW > 30: return None
+    # All 8 symmetry transforms of the dihedral group
+    transforms = [
+        ('id', lambda x: x),
+        ('fliplr', lambda x: np.fliplr(x)),
+        ('flipud', lambda x: np.flipud(x)),
+        ('rot180', lambda x: np.rot90(x, 2)),
+        ('rot90', lambda x: np.rot90(x, 1)),
+        ('rot270', lambda x: np.rot90(x, 3)),
+        ('T', lambda x: x.T),
+        ('T_fliplr', lambda x: np.fliplr(x.T)),
+    ]
+    # For each block, find which transform matches
+    block_transforms = {}
+    for bi in range(rH):
+        for bj in range(rW):
+            found = None
+            for tidx, (tname, tfn) in enumerate(transforms):
+                ok = True
+                for inp, out in exs:
+                    block = out[bi*IH:(bi+1)*IH, bj*IW:(bj+1)*IW]
+                    expected = tfn(inp)
+                    if expected.shape != (IH, IW) or not np.array_equal(block, expected):
+                        ok = False
+                        break
+                if ok:
+                    found = (tidx, tname)
+                    break
+            if found is None:
+                return None
+            block_transforms[(bi, bj)] = found
+    # Build index map
+    idx = np.zeros((OH, OW, 2), dtype=np.int64)
+    for bi in range(rH):
+        for bj in range(rW):
+            _, tname = block_transforms[(bi, bj)]
+            for lr in range(IH):
+                for lc in range(IW):
+                    oi, oj = bi*IH + lr, bj*IW + lc
+                    if tname == 'id': sr, sc = lr, lc
+                    elif tname == 'fliplr': sr, sc = lr, IW-1-lc
+                    elif tname == 'flipud': sr, sc = IH-1-lr, lc
+                    elif tname == 'rot180': sr, sc = IH-1-lr, IW-1-lc
+                    elif tname == 'rot90': sr, sc = IW-1-lc, lr
+                    elif tname == 'rot270': sr, sc = lc, IH-1-lr
+                    elif tname == 'T': sr, sc = lc, lr
+                    elif tname == 'T_fliplr': sr, sc = IW-1-lc, lr
+                    idx[oi, oj] = [sr, sc]
+    # Verify
+    for inp, out in exs:
+        reconstructed = np.zeros_like(out)
+        for oi in range(OH):
+            for oj in range(OW):
+                reconstructed[oi,oj] = inp[idx[oi,oj,0], idx[oi,oj,1]]
+        if not np.array_equal(reconstructed, out):
+            return None
+    return _build_gather_model(OH, OW, idx)
+def s_input_driven_tile(td):
+    """Each non-zero input pixel controls a block that's a copy of the input."""
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH,IW),(OH,OW) = sp
+    if OH % IH != 0 or OW % IW != 0: return None
+    sH, sW = OH // IH, OW // IW
+    if sH != IH or sW != IW: return None
+    if OH > 30 or OW > 30: return None
+    for inp, out in exs:
+        for bi in range(IH):
+            for bj in range(IW):
+                block = out[bi*IH:(bi+1)*IH, bj*IW:(bj+1)*IW]
+                if inp[bi, bj] != 0:
+                    if not np.array_equal(block, inp):
+                        return None
+                else:
+                    if not np.all(block == 0):
+                        return None
+    # Build gather model: each output pixel at (bi*IH+lr, bj*IW+lc) maps to
+    # input[lr, lc] if input[bi, bj] != 0, else constant 0
+    # Problem: whether block is active depends on input value, which varies.
+    # This needs a different ONNX approach: can't use static gather.
+    # But we CAN use: Tile input -> Mul by mask derived from input
+    # Actually we need: for each (bi,bj) block position, multiply by inp[bi,bj] != 0
+    # This is NOT static - it depends on input content.
+    # Skip for now - spatial_gather can handle if block positions are fixed.
+    return None
+def s_kronecker(td):
+    """output = kron(input, ones(sH,sW)) — nearest-neighbor upscaling."""
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH,IW),(OH,OW) = sp
+    if OH % IH != 0 or OW % IW != 0: return None
+    sH, sW = OH // IH, OW // IW
+    if sH < 2 or sW < 2: return None
+    if OH > 30 or OW > 30: return None
+    for inp, out in exs:
+        expected = np.kron(inp, np.ones((sH, sW), dtype=np.int64))
+        if not np.array_equal(out, expected):
+            return None
+    # This is identical to upscale - build gather index
+    idx = np.zeros((OH,OW,2), dtype=np.int64)
+    for r in range(OH):
+        for c in range(OW):
+            idx[r,c] = [r//sH, c//sW]
+    return _build_gather_model(OH, OW, idx)
+def s_diagonal_tile(td):
+    """Input placed along diagonal: block[i,i] = input, rest = 0."""
+    exs = get_exs(td)
+    sp = fixed_shapes(td)
+    if sp is None: return None
+    (IH,IW),(OH,OW) = sp
+    if OH % IH != 0 or OW % IW != 0: return None
+    rH, rW = OH // IH, OW // IW
+    if rH != rW or rH < 2: return None
+    if OH > 30 or OW > 30: return None
+    for inp, out in exs:
+        for bi in range(rH):
+            for bj in range(rW):
+                block = out[bi*IH:(bi+1)*IH, bj*IW:(bj+1)*IW]
+                if bi == bj:
+                    if not np.array_equal(block, inp):
+                        return None
+                else:
+                    if not np.all(block == 0):
+                        return None
+    # Build: diagonal blocks map to input, off-diagonal are constant 0
+    idx = np.zeros((OH,OW,2), dtype=np.int64)
+    cst = np.full((OH,OW), -1, dtype=np.int64)
+    for bi in range(rH):
+        for bj in range(rW):
+            for lr in range(IH):
+                for lc in range(IW):
+                    oi, oj = bi*IH + lr, bj*IW + lc
+                    if bi == bj:
+                        idx[oi, oj] = [lr, lc]
+                    else:
+                        idx[oi, oj] = [-1, -1]
+                        cst[oi, oj] = 0
+    return _build_gather_model_with_const(IH, IW, OH, OW, idx, cst)
 def s_constant(td):
     sp = fixed_shapes(td)
     if sp is None: return None
     return mk(nodes, inits)
 # ============================================================
+# CONV SOLVERS
 # ============================================================
 def add_onehot_block(nodes, inits, am_name, oh_name):
+    """Append an Equal + Cast one-hot encoding to a graph under construction.
+
+    Replaces OneHot, which lacks a CUDA kernel. Mutates both lists in place
+    and returns nothing.
+
+    Args:
+        nodes: node list to extend with the Equal and Cast nodes.
+        inits: initializer list to extend with the class-index tensor.
+        am_name: name of the integer tensor to encode (the callers pass the
+            ArgMax output taken over axis 1 with keepdims=1).
+        oh_name: name given to the float one-hot result.
+
+    The tensor names 'classes' and 'eq' are hard-coded, so this helper can be
+    used at most once per graph without producing duplicate names.
+    """
+    # Class ids 0..9 laid out along the channel axis so Equal broadcasts
     classes = np.arange(10, dtype=np.int64).reshape(1, 10, 1, 1)
     inits.append(numpy_helper.from_array(classes, 'classes'))
     nodes.append(helper.make_node('Equal', [am_name, 'classes'], ['eq']))
     nodes.append(helper.make_node('Cast', ['eq'], [oh_name], to=TensorProto.FLOAT))
 def _lstsq_conv(exs_raw, ks, use_bias, use_full_30=False):
     """Shared lstsq conv fitting. Returns (Wconv, B) or None."""
     pad = ks // 2
             nodes = [
                 helper.make_node('Slice', ['input','sl_st','sl_en'], ['grid']),
                 helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
+                helper.make_node('ArgMax', ['co'], ['am'], axis=1, keepdims=1),
             ]
             add_onehot_block(nodes, inits, 'am', 'oh_out')
             nodes.append(
+                helper.make_node('Pad', ['oh_out'], ['output'],
+                    pads=[0,0,0,0,0,0,pad_h,pad_w], value=0.0)
             )
             model = mk(nodes, inits)
             onnx.save(model, path)
+            if validate(path, td): return 'conv_fixed', model
     return None
 def solve_conv_variable(td, path, time_budget=30.0):
     """Variable-shape conv: Conv(30x30) -> ArgMax -> Equal+Cast -> Mul(mask)."""
     exs = get_exs(td)
             Wconv, B = result
             pad = ks // 2
+            inits = [numpy_helper.from_array(Wconv, 'W')]
             conv_inputs = ['input', 'W']
             if B is not None:
                 inits.append(numpy_helper.from_array(B, 'B'))
             nodes = [
                 helper.make_node('ReduceSum', ['input'], ['mask'], axes=[1], keepdims=1),
                 helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
+                helper.make_node('ArgMax', ['co'], ['am'], axis=1, keepdims=1),
             ]
             add_onehot_block(nodes, inits, 'am', 'oh_out')
+            nodes.append(helper.make_node('Mul', ['oh_out', 'mask'], ['output']))
             model = mk(nodes, inits)
             onnx.save(model, path)
+            if validate(path, td): return 'conv_var', model
     return None
 def solve_conv_diffshape(td, path, time_budget=30.0):
     """Diff-shape conv for fixed io shapes where output is smaller."""
     sp = fixed_shapes(td)
                     helper.make_node('Slice', ['input','sl_st','sl_en'], ['grid']),
                     helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
                     helper.make_node('Slice', ['co','cr_st','cr_en'], ['co_crop']),
+                    helper.make_node('ArgMax', ['co_crop'], ['am'], axis=1, keepdims=1),
                 ]
                 add_onehot_block(nodes, inits, 'am', 'oh_out')
                 nodes.append(
+                    helper.make_node('Pad', ['oh_out'], ['output'],
+                        pads=[0,0,0,0,0,0,pad_h,pad_w], value=0.0)
                 )
                 model = mk(nodes, inits)
                 onnx.save(model, path)
+                if validate(path, td): return 'conv_diff', model
     return None
+def solve_conv_var_diff(td, path, time_budget=30.0):
+    """Variable diff-shape conv: Conv(30x30) -> ArgMax -> Equal+Cast -> Mul(output_mask).
+    Works when output shape differs from input but mapping is convolutional on 30x30 grid."""
+    exs = get_exs(td)
+    t_start = time.time()
+    for use_bias in [False, True]:
+        for ks in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]:
+            if time.time() - t_start > time_budget: return None
+            pad = ks // 2
+            feat = 10 * ks * ks + (1 if use_bias else 0)
+            if feat > 20000: continue
+            patches, targets = [], []
+            for inp_g, out_g in exs:
+                ih, iw = inp_g.shape
+                oh, ow = out_g.shape
+                oh_full = np.zeros((10, GH, GW), dtype=np.float64)
+                for c in range(10): oh_full[c, :ih, :iw] = (inp_g == c)
+                oh_pad = np.pad(oh_full, ((0,0),(pad,pad),(pad,pad)))
+                for r in range(oh):
+                    for c in range(ow):
+                        p = oh_pad[:, r:r+ks, c:c+ks].flatten()
+                        if use_bias: p = np.append(p, 1.0)
+                        patches.append(p)
+                        targets.append(int(out_g[r, c]))
+            n_patches = len(patches)
+            if feat > 5000 and n_patches > 2000: continue
+            P = np.array(patches, dtype=np.float64)
+            T = np.array(targets, dtype=np.int64)
+            T_oh = np.zeros((len(T), 10), dtype=np.float64)
+            for i, t in enumerate(T): T_oh[i, t] = 1.0
+            try:
+                WT = np.linalg.lstsq(P, T_oh, rcond=None)[0]
+            except:
+                continue
+            if not np.array_equal(np.argmax(P @ WT, axis=1), T): continue
+            if use_bias:
+                Wconv = WT[:-1].T.reshape(10, 10, ks, ks).astype(np.float32)
+                B = WT[-1].astype(np.float32)
+            else:
+                Wconv = WT.T.reshape(10, 10, ks, ks).astype(np.float32)
+                B = None
+            # Output masking rationale.
+            #
+            # In the NeuroGolf format a valid pixel has exactly one channel set (channel sum == 1),
+            # while in the padding region ALL channels must be 0. ArgMax -> OneHot alone cannot
+            # produce that: padding is all-zero input, so the conv output there is the bias only
+            # (or 0 for every channel when there is no bias); ArgMax then picks channel 0 and the
+            # one-hot becomes [1,0,...,0] instead of all zeros. So a mask IS needed.
+            #
+            # The ideal mask would be the ReduceSum of the output channels, but the output mask is
+            # not known at inference time from the input alone; it has to be derived from the input.
+            #   - Same-shape tasks: mask = ReduceSum(input, axis=1) works.
+            #   - Diff-shape tasks: the output mask cannot be determined statically from the input,
+            #     even if the output is always top-left aligned.
+            #
+            # Approaches considered for the diff-shape case:
+            #   1. Rely on the conv output having max > threshold at valid positions and max ≈ 0 in
+            #      padding, then ArgMax+OneHot and mask with ReduceSum(input), which is 1 at input
+            #      positions and 0 at padding. Problem: the output may be LARGER than the input.
+            #   2. Conv -> ArgMax -> Equal+Cast -> Mul(input_mask_expanded), where the input mask
+            #      covers the output region too. This does not work if the output extends beyond
+            #      the input region.
+            #   3. No mask at all, letting the padding region fall out as channel 0. Wrong for this
+            #      format, for the reason given above.
+            #
+            # What is used below: the ReduceSum(input, axis=1) mask. If the output region is
+            # contained in the input region this works; if the output region is larger than the
+            # input region, the output's own ReduceSum would be needed instead. Hence only tasks
+            # where every output fits within its input's bounds are attempted.
+            # NOTE(review): the input mask is also 1 on input pixels that lie outside a strictly
+            # smaller output; whether that passes depends on what validate() compares -- confirm.
+            all_output_within_input = all(
+                out_g.shape[0] <= inp_g.shape[0] and out_g.shape[1] <= inp_g.shape[1]
+                for inp_g, out_g in exs
+            )
+            if not all_output_within_input:
+                continue  # Skip tasks where output extends beyond input
+            inits = [numpy_helper.from_array(Wconv, 'W')]
+            conv_inputs = ['input', 'W']
+            if B is not None:
+                inits.append(numpy_helper.from_array(B, 'B'))
+                conv_inputs.append('B')
+            nodes = [
+                helper.make_node('ReduceSum', ['input'], ['mask'], axes=[1], keepdims=1),
+                helper.make_node('Conv', conv_inputs, ['co'], kernel_shape=[ks,ks], pads=[pad]*4),
+                helper.make_node('ArgMax', ['co'], ['am'], axis=1, keepdims=1),
+            ]
+            add_onehot_block(nodes, inits, 'am', 'oh_out')
+            nodes.append(helper.make_node('Mul', ['oh_out', 'mask'], ['output']))
+            model = mk(nodes, inits)
+            onnx.save(model, path)
+            if validate(path, td): return 'conv_var_diff', model
+    return None
 # ============================================================
 # MAIN
 # Registry of analytical solvers: each entry pairs a short label with the solver function.
 # NOTE(review): presumably solve_task tries the entries in listed order and reports the label
 # of the first one whose saved model validates -- confirm against solve_task.
 ANALYTICAL_SOLVERS = [
     ('identity', s_identity), ('constant', s_constant), ('color_map', s_color_map),
     ('transpose', s_transpose), ('flip', s_flip), ('rotate', s_rotate),
+    ('tile', s_tile), ('upscale', s_upscale), ('kronecker', s_kronecker),
+    ('concat', s_concat), ('concat_enhanced', s_concat_enhanced),
+    ('diagonal_tile', s_diagonal_tile),
     ('spatial_gather', s_spatial_gather),
+    ('varshape_spatial_gather', s_varshape_spatial_gather),
 ]
 def solve_task(tn, td, outdir, conv_budget=30.0):
     # Every return below is a 5-tuple (solved, solver_label, model_file_size, elapsed, path):
     # success returns carry os.path.getsize(path) and the time elapsed since t_start; the
     # final fallback returns (False, None, None, elapsed, path).
             # A solver that returns None has no candidate for this task. Otherwise the candidate
             # is written to `path` and accepted only if validate(path, td) is truthy.
             model = sfn(td)
             if model is None: continue
             onnx.save(model, path)
+            if validate(path, td):
+                return True, sname, os.path.getsize(path), time.time() - t_start, path
         # NOTE(review): bare `except` silently discards every exception from the guarded block
         # (KeyboardInterrupt included) -- consider narrowing it.
         except: pass
+    # 2. Determine task shape category and try conv solvers
     exs = get_exs(td)
     # same_shape: every example's output has the same shape as its input.
     same_shape = all(inp.shape == out.shape for inp, out in exs)
     # fixed_in: all example inputs share a single shape.
     shapes = set(inp.shape for inp, _ in exs)
     fixed_in = len(shapes) == 1
     # NOTE(review): each conv solver call below receives its own time_budget (half of
     # conv_budget for solve_conv_fixed, the full value for the others), so a branch that
     # falls through two solvers can be allotted more than conv_budget in total.
+    conv_time = conv_budget
     if same_shape:
         if fixed_in:
+            result = solve_conv_fixed(td, path, time_budget=conv_time/2)
+            if result is not None:
+                sname, model = result
+                return True, sname, os.path.getsize(path), time.time() - t_start, path
         # Reached for variable input shapes, and also when the fixed-shape attempt returned None.
+        result = solve_conv_variable(td, path, time_budget=conv_time)
+        if result is not None:
+            sname, model = result
+            return True, sname, os.path.getsize(path), time.time() - t_start, path
     else:
         # fixed_shapes() yields the single (input shape, output shape) pair shared by all
         # examples, or None when the examples disagree.
         sp = fixed_shapes(td)
         if sp is not None:
             (IH,IW),(OH,OW) = sp
             # The fixed diff-shape solver is only tried when the output is no larger than the input.
             if OH <= IH and OW <= IW:
+                result = solve_conv_diffshape(td, path, time_budget=conv_time)
+                if result is not None:
+                    sname, model = result
+                    return True, sname, os.path.getsize(path), time.time() - t_start, path
+        # Try variable diff-shape conv (output within input bounds)
+        result = solve_conv_var_diff(td, path, time_budget=conv_time)
+        if result is not None:
+            sname, model = result
+            return True, sname, os.path.getsize(path), time.time() - t_start, path
     # No solver returned an accepted result for this task.
     return False, None, None, time.time() - t_start, path
         ok, sname, sz, t_task, model_path = solve_task(tn, td, output_dir, conv_budget)
         if ok:
+            try:
+                macs, memory, params = score_network(model_path)
+                if macs is None:
+                    macs, memory, params = 0, 0, 0
+            except:
                 macs, memory, params = 0, 0, 0
             score = macs + memory + params
             results[tn] = (sname, t_task, sz)
+            print(f"Task {tn:3d}: {sname:25s} {score:>12} {t_task:7.3f}s  ({sz:>8,} bytes)")
         else:
             print(f"Task {tn:3d}: UNSOLVED  {t_task:7.3f}s")
             macs, memory, params, score = 0, 0, 0, 0
+        if use_wandb and wandb is not None:
             wandb.log({
                 "task_id": tn,
                 "solver": sname if ok else "unsolved",
     t0 = time.time()
     results = {}
+    if args.use_wandb and wandb is not None:
         with wandb.init(
             project="neurogolf",
             name="solver_run",
             config=config,
         ):
+            results = run_tasks(task_nums, tasks, args.output_dir, args.conv_budget, use_wandb=True)
     else:
+        results = run_tasks(task_nums, tasks, args.output_dir, args.conv_budget, use_wandb=False)
     elapsed = time.time() - t0
     print(f"\n{'='*70}")
 # Script entry point: call main() only when this file is executed directly, not on import.
 if __name__ == '__main__':
     main()