File size: 6,464 Bytes
b4b2877 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | #!/usr/bin/env python3
"""Aggregate results from the three new benchmark experiments."""
import os
import json
import glob
import numpy as np
# Base directory under which the per-experiment result folders are globbed.
# ``${PULSE_ROOT}`` is substituted from the environment; Python does not expand
# it on its own, so without this call the globs would look for a directory
# literally named "${PULSE_ROOT}". If the variable is unset, expandvars leaves
# the placeholder untouched and the value equals the raw literal.
ROOT = os.path.expandvars('${PULSE_ROOT}/results/exp_new')
def load_results(pattern):
    """Load every JSON file matching a glob pattern.

    Args:
        pattern: Glob expression passed to ``glob.glob``.

    Returns:
        A list with the decoded content of each readable file, in sorted
        path order. Files that cannot be opened or decoded are reported on
        stdout and skipped, so the list may be shorter than the number of
        matches (and is empty when nothing matches).
    """
    results = []
    for path in sorted(glob.glob(pattern)):
        try:
            # The context manager closes the handle even when decoding fails;
            # JSON is UTF-8 by specification, so do not rely on the locale.
            with open(path, encoding="utf-8") as fh:
                results.append(json.load(fh))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # Best effort: one broken file must not abort the whole report.
            print(f" ERR: {path}: {e}")
    return results
def aggregate_expA():
    """Print the missing-modality robustness table (experiment A).

    For each of the ``expA_missing`` and ``expA_baseline`` result folders,
    every ``results.json`` is loaded and the F1 / accuracy of each entry in
    ``eval_configs`` are pooled by config name. One row per config is printed
    with mean±std over the pooled values (``np.std`` with its default
    settings). Rows are ordered ``full`` first, then ``drop_*`` sorted, then
    ``only_*`` sorted; configs with any other name are not printed.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXP A: Missing-modality robustness")
    print(banner)

    header = (f" {'Config':<22s} {'Active modalities':<42s} "
              f"{'F1 mean±std':<14s} {'Acc mean±std':<14s}")
    divider = ' ' + '-' * 96

    def mean_std(values):
        # (mean, std) of the per-seed values for one metric.
        return np.mean(values), np.std(values)

    for subdir in ('expA_missing', 'expA_baseline'):
        runs = load_results(f'{ROOT}/{subdir}/*/results.json')
        if not runs:
            print(f" No results yet for {subdir}")
            continue
        print(f"\n-- {subdir} (n seeds = {len(runs)}) --")

        # config name -> pooled metric lists plus the active-modality list
        # taken from the first run in which the config appears.
        per_config = {}
        for run in runs:
            if 'eval_configs' not in run:
                continue
            for cfg_name, cfg in run['eval_configs'].items():
                active_mods = cfg['active']
                if cfg_name not in per_config:
                    per_config[cfg_name] = {'f1': [], 'acc': [], 'active': active_mods}
                bucket = per_config[cfg_name]
                bucket['f1'].append(cfg['f1'])
                bucket['acc'].append(cfg['acc'])

        # Display order: full, leave-one-out, singletons.
        ordered = [n for n in per_config if n == 'full']
        ordered += sorted(n for n in per_config if n.startswith('drop_'))
        ordered += sorted(n for n in per_config if n.startswith('only_'))

        print(header)
        print(divider)
        for cfg_name in ordered:
            bucket = per_config[cfg_name]
            f1_mean, f1_std = mean_std(bucket['f1'])
            acc_mean, acc_std = mean_std(bucket['acc'])
            active_txt = ','.join(bucket['active'])
            print(f" {cfg_name:<22s} {active_txt:<42s} "
                  f"{f1_mean:.3f}±{f1_std:.3f} {acc_mean:.3f}±{acc_std:.3f}")
def aggregate_expB():
    """Print the grip-force regression table (experiment B).

    Loads every ``results.json`` under ``expB_grip``, keeps the runs that
    carry ``best_test_metrics``, and groups them by the pair
    (``backbone``, comma-joined ``modalities``). For each group the mean and
    std (``np.std`` defaults) of the per-hand and averaged metrics are
    computed across runs. Rows are printed in descending order of mean
    ``avg_pearson_r``; only the averaged MAE, averaged Pearson r and the
    per-hand R² columns are shown.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXP B: Grip force regression")
    print(banner)

    runs = load_results(f'{ROOT}/expB_grip/*/results.json')
    if not runs:
        print(" No results yet")
        return

    # (backbone, modalities) -> list of runs sharing that setup.
    by_setup = {}
    for run in runs:
        if 'best_test_metrics' in run:
            setup = (run['backbone'], ','.join(run['modalities']))
            by_setup.setdefault(setup, []).append(run)

    def pluck(members, *path):
        # Collect one metric, addressed by its key path, from every run.
        values = []
        for member in members:
            node = member['best_test_metrics']
            for key in path:
                node = node[key]
            values.append(node)
        return values

    def mean_std(values):
        return (np.mean(values), np.std(values))

    table = []
    for (backbone, mods), members in by_setup.items():
        mae_right = pluck(members, 'right_hand', 'mae_g')
        mae_left = pluck(members, 'left_hand', 'mae_g')
        r_right = pluck(members, 'right_hand', 'pearson_r')
        r_left = pluck(members, 'left_hand', 'pearson_r')
        r2_right = pluck(members, 'right_hand', 'r2')
        r2_left = pluck(members, 'left_hand', 'r2')
        mae_mean = pluck(members, 'avg_mae_g')
        r_mean = pluck(members, 'avg_pearson_r')
        table.append({
            'backbone': backbone,
            'modalities': mods,
            'n_seeds': len(members),
            'mae_R': mean_std(mae_right),
            'mae_L': mean_std(mae_left),
            'mae_avg': mean_std(mae_mean),
            'r_R': mean_std(r_right),
            'r_L': mean_std(r_left),
            'r_avg': mean_std(r_mean),
            'r2_R': mean_std(r2_right),
            'r2_L': mean_std(r2_left),
        })

    # Best average correlation first.
    table = sorted(table, key=lambda entry: entry['r_avg'][0], reverse=True)

    print(f" {'Backbone':<12s} {'Modalities':<30s} N "
          f"{'MAE(g) avg':<14s} {'Pearson r avg':<14s} {'R²(R)':<12s} {'R²(L)':<12s}")
    print(' ' + '-' * 102)
    for entry in table:
        mae_m, mae_s = entry['mae_avg']
        r_m, r_s = entry['r_avg']
        r2r_m, r2r_s = entry['r2_R']
        r2l_m, r2l_s = entry['r2_L']
        print(f" {entry['backbone']:<12s} {entry['modalities']:<30s} {entry['n_seeds']} "
              f"{mae_m:.1f}±{mae_s:.1f} "
              f"{r_m:.3f}±{r_s:.3f} "
              f"{r2r_m:.3f}±{r2r_s:.3f} "
              f"{r2l_m:.3f}±{r2l_s:.3f}")
def aggregate_expC():
    """Print the cross-modal text retrieval table (experiment C).

    Loads every ``results.json`` under ``expC_retrieval``, keeps the runs that
    carry ``final_avg_over_3_pool_seeds``, and groups them by their
    comma-joined ``modalities``. For each group the mean and std (``np.std``
    defaults) of recall@1/5/10 and median rank are computed across runs.
    ``n_test_segments`` and ``K_pool`` are read from the first run of the
    group (defaulting to 0 and 100). Rows are printed in descending order of
    mean recall@10.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("EXP C: T5 Cross-modal text retrieval")
    print(banner)

    runs = load_results(f'{ROOT}/expC_retrieval/*/results.json')
    if not runs:
        print(" No results yet")
        return

    # modalities string -> list of runs using that modality set.
    by_mods = {}
    for run in runs:
        if 'final_avg_over_3_pool_seeds' in run:
            by_mods.setdefault(','.join(run['modalities']), []).append(run)

    def pluck(members, metric):
        # One retrieval metric gathered from every run of a group.
        return [m['final_avg_over_3_pool_seeds'][metric] for m in members]

    def mean_std(values):
        return (np.mean(values), np.std(values))

    table = []
    for mods, members in by_mods.items():
        recall1 = pluck(members, 'recall@1')
        recall5 = pluck(members, 'recall@5')
        recall10 = pluck(members, 'recall@10')
        median_rank = pluck(members, 'median_rank')
        first = members[0]
        table.append({
            'modalities': mods,
            'n_seeds': len(members),
            'r1': mean_std(recall1),
            'r5': mean_std(recall5),
            'r10': mean_std(recall10),
            'medR': mean_std(median_rank),
            'n_test': first.get('n_test_segments', 0),
            'K': first.get('K_pool', 100),
        })

    # Highest recall@10 first.
    table = sorted(table, key=lambda entry: entry['r10'][0], reverse=True)

    print(f" {'Modalities':<30s} N N_test K "
          f"{'R@1':<12s} {'R@5':<12s} {'R@10':<12s} {'medR':<12s}")
    print(' ' + '-' * 100)
    for entry in table:
        r1_m, r1_s = entry['r1']
        r5_m, r5_s = entry['r5']
        r10_m, r10_s = entry['r10']
        med_m, med_s = entry['medR']
        print(f" {entry['modalities']:<30s} {entry['n_seeds']} {entry['n_test']:<6d} {entry['K']:<2d} "
              f"{r1_m:.3f}±{r1_s:.3f} "
              f"{r5_m:.3f}±{r5_s:.3f} "
              f"{r10_m:.3f}±{r10_s:.3f} "
              f"{med_m:.1f}±{med_s:.1f}")
def main():
    """Print the aggregated reports for experiments A, B and C, in that order."""
    for report in (aggregate_expA, aggregate_expB, aggregate_expC):
        report()
# Produce the reports only when run as a script, not when imported.
if __name__ == '__main__':
    main()
|