rogermt committed on
Commit
75bfb61
·
verified ·
1 Parent(s): 9b5600c

Add ARC-AGI bulk evaluation script

Browse files
Files changed (1) hide show
  1. scripts/run_all_arc.py +183 -0
scripts/run_all_arc.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Run the PEMF solver on all ARC-AGI tasks and report solve rates.

For each task, the solver tries every training pair. A task is "solved"
if the solver achieves σ=0 on ALL training pairs.

Usage:
    1. Download the ARC dataset into arc_data/training/:
           git clone https://github.com/fchollet/ARC-AGI.git /tmp/arc
           mkdir -p arc_data
           cp -r /tmp/arc/data/training arc_data/training
    2. Run:
           python scripts/run_all_arc.py

    The input and output directories can be overridden with the ARC_DIR
    and OUT_DIR environment variables.

Outputs:
    arc_results/summary.json — per-task results
    arc_results/report.txt — human-readable report
"""
18
+ import os, json, time, glob
19
+
20
+ import numpy as np
21
+ from itt_solver.solver_core import initialize_potential, sigma_l1
22
+ from itt_solver.beam_logging import beam_minimize_with_log
23
+ from itt_solver.experiment_driver import default_atomic_factory
24
+
25
# Dataset and output locations. Both can be overridden through the
# environment so the script can be pointed elsewhere without editing it.
ARC_DIR = os.environ.get("ARC_DIR", "arc_data/training")
OUT_DIR = os.environ.get("OUT_DIR", "arc_results")
# Created at import time so the result writers can open files in it directly.
os.makedirs(OUT_DIR, exist_ok=True)

# Solver settings. solve_pair() reads 'beam_width', 'max_depth', 'lock_coeff',
# 'max_fraction' and 'boundary_source' directly, and passes the whole mapping
# to default_atomic_factory (presumably the consumer of the 'use_*' switches —
# confirm against itt_solver.experiment_driver).
PARAMS = {
    'beam_width': 8,
    'max_depth': 2,
    'lock_coeff': 0.0,
    'max_fraction': 1.0,
    'use_symmetry': True,
    'use_gravity': True,
    'use_color_ops': True,
    'boundary_source': 'target',
}
39
+
40
def solve_pair(inp, out, params):
    """Run solver on one input→output pair. Returns (sigma, transform_name, time_s).

    Args:
        inp: Input grid; passed to ``initialize_potential``.
        out: Target grid. Must contain at least one row, because the target
            shape is read from ``len(out)`` and ``len(out[0])``.
        params: Solver settings. Must provide 'beam_width', 'max_depth',
            'lock_coeff', 'max_fraction' and 'boundary_source'; the whole
            mapping is also handed to ``default_atomic_factory``.

    Returns:
        A ``(sigma, transform_name, time_s)`` tuple: the last entry of the
        sigma trace returned by the beam search as a float (``inf`` when the
        trace is missing or empty), ``repr`` of the best transform, and the
        seconds spent inside the beam search call only.
    """
    h, w = len(out), len(out[0])
    task = {
        'name': 'pair',
        'input': inp,
        'target': out,
        'target_shape': (h, w),
    }
    atomic_lib = default_atomic_factory(params, task)
    phi_in = initialize_potential(inp)
    phi_target = initialize_potential(out)

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    T_best, _phi_best, _states, sigmas, _logs = beam_minimize_with_log(
        phi_in, phi_target, atomic_lib,
        beam_width=params['beam_width'],
        max_depth=params['max_depth'],
        lock_coeff=params['lock_coeff'],
        max_fraction=params['max_fraction'],
        allowed_symbols=list(range(10)),
        enable_layer_minus_one=False,
        boundary_source=params['boundary_source'],
    )
    elapsed = time.perf_counter() - start

    # Explicit None/len check instead of truthiness: a plain list behaves the
    # same, and an array-like trace (whose truth value would be ambiguous)
    # is handled as well.
    has_trace = sigmas is not None and len(sigmas) > 0
    final_sigma = float(sigmas[-1]) if has_trace else float('inf')
    return final_sigma, repr(T_best), elapsed
67
+
68
def _solve_indexed_pair(index, pair):
    """Solve one pair with PARAMS; return (summary record, elapsed seconds)."""
    sigma, transform, elapsed = solve_pair(pair['input'], pair['output'], PARAMS)
    record = {
        'pair': index, 'sigma': sigma,
        'transform': transform, 'time_s': round(elapsed, 4),
    }
    return record, elapsed


def _task_status(all_zero, best_sigma):
    """Classify a task as SOLVED, PARTIAL or FAILED from its training outcome."""
    if all_zero:
        return "SOLVED"
    # Any finite best sigma counts as partial progress, including the case
    # where some (but not all) training pairs reached sigma == 0.
    if best_sigma < float('inf'):
        return "PARTIAL"
    return "FAILED"


def run_all():
    """Run the solver on every ``*.json`` task in ARC_DIR and write the results.

    For each task file, every training pair is solved with ``solve_pair``;
    test pairs are solved as well when they carry an 'output'. A task is
    SOLVED when it has at least one training pair and all of them end at
    sigma == 0, PARTIAL when the best training sigma is finite, and FAILED
    otherwise. Progress and a summary are printed, and ``summary.json`` and
    ``report.txt`` are written into OUT_DIR.

    Returns:
        None. When ARC_DIR contains no task files a message is printed and
        nothing is written.
    """
    task_files = sorted(glob.glob(os.path.join(ARC_DIR, "*.json")))
    n_tasks = len(task_files)
    if n_tasks == 0:
        # Without this guard the percentage computations below would divide
        # by zero and hide the real problem (missing or empty dataset dir).
        print(f"No ARC task files (*.json) found in {ARC_DIR}; nothing to do.")
        return

    print(f"Running solver on {n_tasks} ARC training tasks...")
    print(f"Params: beam_width={PARAMS['beam_width']}, max_depth={PARAMS['max_depth']}")
    print()

    results = []
    solved_count = 0
    partial_count = 0
    total_time = 0

    for ti, tf in enumerate(task_files):
        task_id = os.path.splitext(os.path.basename(tf))[0]
        with open(tf, encoding="utf-8") as fh:
            task_data = json.load(fh)

        train_pairs = task_data.get('train', [])
        test_pairs = task_data.get('test', [])

        pair_results = []
        best_sigma = float('inf')
        best_transform = None
        for pi, pair in enumerate(train_pairs):
            record, elapsed = _solve_indexed_pair(pi, pair)
            total_time += elapsed
            pair_results.append(record)
            if record['sigma'] < best_sigma:
                best_sigma = record['sigma']
                best_transform = record['transform']
        # A task without training pairs is not solved, and a NaN sigma
        # (which compares unequal to 0) never counts as solved.
        all_zero = bool(pair_results) and all(r['sigma'] == 0 for r in pair_results)

        test_results = []
        for pi, pair in enumerate(test_pairs):
            if 'output' not in pair:
                continue
            record, elapsed = _solve_indexed_pair(pi, pair)
            total_time += elapsed
            test_results.append(record)
        # None means "no test pair had an output to compare against".
        test_solved = (
            all(r['sigma'] == 0 for r in test_results) if test_results else None
        )

        # The status label and the counters come from the same
        # classification, so they cannot disagree.
        status = _task_status(all_zero, best_sigma)
        if status == "SOLVED":
            solved_count += 1
        elif status == "PARTIAL":
            partial_count += 1

        results.append({
            'task_id': task_id, 'status': status,
            'train_pairs': len(train_pairs), 'all_train_solved': all_zero,
            'best_sigma': best_sigma, 'best_transform': best_transform,
            'pair_results': pair_results,
            'test_results': test_results, 'test_solved': test_solved,
        })

        # Report every 20th task, plus every solved task as soon as it is found.
        if (ti + 1) % 20 == 0 or all_zero:
            marker = "✅" if all_zero else " "
            print(f"[{ti+1:3d}/{n_tasks}] {task_id}: {status} (best σ={best_sigma:.1f}) {marker}")

    failed_count = n_tasks - solved_count - partial_count
    print(f"\n{'='*60}")
    print(f"RESULTS: {n_tasks} tasks")
    print(f" SOLVED (σ=0 all train pairs): {solved_count} ({100*solved_count/n_tasks:.1f}%)")
    print(f" PARTIAL (σ>0 but finite): {partial_count}")
    print(f" FAILED: {failed_count}")
    print(f" Total time: {total_time:.1f}s ({total_time/n_tasks:.2f}s/task)")

    summary = {
        'total_tasks': n_tasks, 'solved': solved_count,
        'partial': partial_count, 'failed': failed_count,
        'solve_rate': round(100 * solved_count / n_tasks, 2),
        'params': PARAMS, 'total_time_s': round(total_time, 2),
        'results': results,
    }
    # NOTE: non-finite sigmas are written as the `Infinity`/`NaN` extension
    # tokens (json.dump's default allow_nan=True); strict JSON parsers may
    # reject them.
    with open(os.path.join(OUT_DIR, 'summary.json'), 'w', encoding="utf-8") as fh:
        json.dump(summary, fh, indent=2)

    solved_tasks = [r for r in results if r['all_train_solved']]
    print(f"\nSolved tasks:")
    for r in solved_tasks:
        print(f" {r['task_id']}: {r['best_transform']}")

    partial_tasks = sorted(
        [r for r in results if not r['all_train_solved'] and r['best_sigma'] < float('inf')],
        key=lambda r: r['best_sigma']
    )
    print(f"\nTop 20 closest-to-solving:")
    for r in partial_tasks[:20]:
        print(f" {r['task_id']}: σ={r['best_sigma']:.1f} ({r['best_transform']})")

    # Explicit UTF-8: the report header contains a non-ASCII dash.
    with open(os.path.join(OUT_DIR, 'report.txt'), 'w', encoding="utf-8") as fh:
        fh.write(f"PEMF Solver — ARC-AGI Training Set Results\n{'='*60}\n")
        fh.write(f"Total tasks: {n_tasks}\n")
        fh.write(f"Solved: {solved_count} ({100*solved_count/n_tasks:.1f}%)\n")
        fh.write(f"Partial: {partial_count}\nFailed: {failed_count}\n")
        fh.write(f"Time: {total_time:.1f}s\n\n")
        fh.write(f"Params: {json.dumps(PARAMS, indent=2)}\n\n")
        fh.write(f"Solved tasks:\n")
        for r in solved_tasks:
            fh.write(f" {r['task_id']}: {r['best_transform']}\n")

    print(f"\nResults saved to {OUT_DIR}/")
181
+
182
# Run the bulk evaluation only when executed as a script, not on import.
if __name__ == '__main__':
    run_all()