Add scripts/merge_results.py
Browse files- scripts/merge_results.py +53 -0
scripts/merge_results.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Merge LLM results with symbolic results to get the final solve count.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python merge_results.py arc_results/summary_v4.json llm_results.json
|
| 6 |
+
"""
|
| 7 |
+
import json
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def merge(symbolic_file: str, llm_file: str, output_file: str = "arc_results/summary_final.json") -> dict:
    """Merge symbolic and LLM solver results and write a combined summary.

    A task counts as solved when either solver solved it. The summary is
    printed to stdout and saved as JSON.

    Args:
        symbolic_file: Path to a JSON file holding a ``total_tasks`` count and
            a ``results`` list of records with a ``task_id`` and, optionally,
            an ``all_train_solved`` flag.
        llm_file: Path to a JSON file whose ``results`` maps task id to a
            record with a ``status`` and, optionally, a ``rule``.
        output_file: Destination path for the merged JSON summary. Missing
            parent directories are created.

    Returns:
        The merged summary dict that was written to ``output_file``.
    """
    import os  # function-scope import so the file's import block is untouched

    with open(symbolic_file, encoding="utf-8") as f:
        symbolic = json.load(f)
    with open(llm_file, encoding="utf-8") as f:
        llm = json.load(f)

    total_tasks = symbolic['total_tasks']
    llm_results = llm['results']

    symbolic_solved = {r['task_id'] for r in symbolic['results'] if r.get('all_train_solved')}
    # .get(): a record without a status is treated as unsolved instead of
    # aborting the whole merge with a KeyError.
    llm_solved = {tid for tid, r in llm_results.items() if r.get('status') == 'solved'}

    total_solved = symbolic_solved | llm_solved
    new_from_llm = llm_solved - symbolic_solved

    # Guard against an empty task set so the rate is 0 rather than a crash.
    solve_pct = 100 * len(total_solved) / total_tasks if total_tasks else 0.0

    print(f"Symbolic solved: {len(symbolic_solved)}")
    print(f"LLM solved: {len(llm_solved)}")
    print(f"New from LLM: {len(new_from_llm)}")
    print(f"TOTAL SOLVED: {len(total_solved)}/{total_tasks} ({solve_pct:.1f}%)")

    print("\nNew tasks solved by LLM:")
    for tid in sorted(new_from_llm):
        rule = llm_results[tid].get('rule', '?')
        print(f" {tid}: {rule}")

    # Save merged
    merged = {
        'total_tasks': total_tasks,
        'symbolic_solved': len(symbolic_solved),
        'llm_solved': len(llm_solved),
        'new_from_llm': len(new_from_llm),
        'total_solved': len(total_solved),
        'solve_rate': round(solve_pct, 2),
        'symbolic_tasks': sorted(symbolic_solved),
        'llm_tasks': sorted(llm_solved),
        'new_llm_tasks': sorted(new_from_llm),
    }
    out_dir = os.path.dirname(output_file)
    if out_dir:
        # The default output lives in a subdirectory that may not exist yet.
        os.makedirs(out_dir, exist_ok=True)
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(merged, f, indent=2)
    print(f"\nMerged results saved to {output_file}")
    return merged
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
if __name__ == "__main__":
    # Positional CLI arguments; each falls back to its default results file
    # when omitted.
    cli_args = sys.argv[1:]
    symbolic_path = cli_args[0] if cli_args else "arc_results/summary_v4.json"
    llm_path = cli_args[1] if len(cli_args) >= 2 else "llm_results.json"
    merge(symbolic_path, llm_path)
|