diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..588ccdf60fe455b327abbea9a315d175f647afd6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Anonymous Authors (under double-blind review for NeurIPS 2026) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..da554d9230713561e10a4333157f00acb46b374f --- /dev/null +++ b/README.md @@ -0,0 +1,152 @@ +--- +license: mit +language: + - en +library_name: pytorch +tags: + - multi-modal + - daily-activity + - wearable-sensors + - benchmark +--- + +# PULSE — Code Repository + +Reference implementation, training scripts, and benchmark baselines for the +**PULSE** dataset paper (under double-blind review at NeurIPS 2026 Evaluations & +Datasets Track). + +> **Dataset:** [`velvet-pine-22/PULSE`](https://huggingface.co/datasets/velvet-pine-22/PULSE) +> · **Sample subset (≈285 MB):** [`velvet-pine-22/PULSE-sample`](https://huggingface.co/datasets/velvet-pine-22/PULSE-sample) + +## Repository layout + +``` +PULSE-code/ +├── experiments/ +│ ├── data/ # PyTorch Dataset wrappers +│ │ ├── dataset.py # core multi-modal dataset (T1, T2) +│ │ ├── dataset_seqpred.py # T2 fine-grained action recognition +│ │ ├── dataset_grasp_state.py # T3 grasp onset anticipation +│ │ ├── dataset_forecast.py # auxiliary forecasting heads +│ │ └── dataset_signal_forecast.py # T5 tactile-driven motion forecast +│ │ +│ ├── nets/ # Model architectures +│ │ ├── models.py # backbone networks (Transformer / LSTM / 1D-CNN) +│ │ ├── models_seqpred.py # DailyActFormer (DAF) — multi-modal Transformer +│ │ ├── models_forecast.py # forecasting heads +│ │ ├── models_forecast_priv.py # privileged-tactile variants for T5 +│ │ ├── published_models.py # third-party model implementations +│ │ └── baselines_published/ # 7 published baselines (re-implementation) +│ │ ├── baselines.py # DeepConvLSTM / InceptionTime / MS-TCN / etc. +│ │ └── syncfuse.py # under-pressure-style multi-modal fusion +│ │ +│ ├── tasks/ # Training + evaluation entry points +│ │ ├── train_exp1.py # T1 — scene recognition +│ │ ├── train_seqpred.py # T2 — action recognition (DAF + ablations) +│ │ ├── train_grasp_state.py # T3 — grasp onset anticipation +│ │ ├── train_pred_cls.py # T3 alt classification head +│ │ ├── train_exp_missing.py # T4 — missing-modality robustness +│ │ ├── train_signal_forecast.py # T5 — tactile-driven motion forecasting +│ │ ├── train_signal_forecast_priv.py # T5 privileged variants +│ │ ├── train_baselines_t1.py # baselines for T1 +│ │ ├── train_exp{2,3,4}.py # ablation experiments +│ │ ├── train_exp_{anticipate,grip,pose,retrieval,zeroshot}.py # auxiliary +│ │ ├── train_pred.py / train_forecast.py +│ │ ├── eval_baselines.py / eval_combined.py +│ │ └── published_baselines.py # baseline registry +│ │ +│ ├── analysis/ # Case study, figures, data prep utilities +│ │ ├── grasp_phase_analysis.py # case study (gaze→EMG→hand→contact cascade) +│ │ ├── modality_viz.py / analysis_figures.py / data_statistics_figure.py +│ │ ├── extract_video_features.py / extract_videomae_features.py +│ │ ├── build_taxonomy.py / generate_action_labels.py / generate_coarse_annotations.py +│ │ ├── reannotate_actions.py / gen_val_comparison.py +│ │ ├── exp_per_subject.py / check_seg_lengths.py +│ │ └── aggregate_*.py # collate run results +│ │ +│ ├── slurm/ # 60+ SLURM launch scripts (one per main experiment) +│ │ └── run_*.sh +│ │ +│ ├── taxonomy.py # shared 18-primitive taxonomy +│ ├── s9_primitives.json +│ └── taxonomy_v3.json +│ +├── scripts/ # Top-level utilities (not task-specific) +│ ├── build_paper_tables.py # collates results JSONs into LaTeX tables +│ ├── eval_macrof1.py / eval_subset.py / eval_topk_v3.py +│ └── dispatch_eval.sh # batch dispatcher +│ +├── LICENSE # MIT +├── requirements.txt # Python deps +└── README.md +``` + +## Quick start + +```bash +# 1. Set up Python environment +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt + +# 2. Point at the PULSE dataset (download from HuggingFace first) +export PULSE_ROOT=/path/to/PULSE # the dataset root (not this code repo) + +# 3. Run a training entry point as a module (from the experiments/ directory) +cd experiments +python -m tasks.train_seqpred \ + --root $PULSE_ROOT \ + --modalities mocap emg eyetrack imu pressure \ + --output_dir runs/t2_daf + +# 4. Reproduce paper tables (after training all benchmarks) +cd .. +python scripts/build_paper_tables.py \ + --results_root experiments/runs/ \ + --out tables/ +``` + +> **Why `python -m tasks.train_seqpred` and not `python tasks/train_seqpred.py`?** +> The training scripts import sibling modules (`from data.dataset import …`, +> `from nets.models import …`). Running with `-m` from the `experiments/` +> directory makes Python treat `data/`, `nets/`, `tasks/`, and `analysis/` as +> top-level packages so the imports resolve cleanly. + +## Reproducing the benchmark tasks + +| Task | Entry point | Output | +|---|---|---| +| T1 — Scene recognition (8-way) | `tasks.train_exp1` | scene-classification metrics | +| T2 — Fine-grained action recognition | `tasks.train_seqpred` | verb / noun / hand top-k accuracy | +| T3 — Grasp onset anticipation | `tasks.train_grasp_state` / `tasks.train_pred_cls` | anticipation F1 / time-to-contact | +| T4 — Missing-modality robustness | `tasks.train_exp_missing` + `tasks.eval_combined` | per-modality ablation table | +| T5 — Tactile-driven grasp-state recognition | `tasks.train_signal_forecast` (+ `_priv` variants) | sub-second grasp-state metrics | +| T6 — Cross-modal pressure prediction | `tasks.train_forecast` / `tasks.train_signal_forecast` | pressure reconstruction metrics | + +The exact command lines (with hyperparameters, seeds, GPU configs) used for +every paper table are checked in under `experiments/slurm/run_*.sh`, one +SLURM script per paper experiment. Output JSON files from these runs are +collated into LaTeX tables by `scripts/build_paper_tables.py`. + +## Hardware + +Headline experiments were run on **NVIDIA A800 (80 GB)** GPUs. A single seed of +DailyActFormer T2 trains in ~6 hours on one A800. Most baselines fit on a +single 24 GB consumer GPU. + +## License & attribution + +Code is released under **MIT** (see `LICENSE`). The PULSE dataset itself is +released under **CC BY-NC 4.0** (see the dataset repository). + +## Citation + +```bibtex +@inproceedings{anonymous2026pulse, + title = {PULSE: A Synchronized Five-Modality Dataset for Multi-Modal Daily Activity Understanding}, + author = {Anonymous Authors}, + booktitle = {Submitted to NeurIPS 2026 Evaluations and Datasets Track}, + year = {2026}, + note = {Under double-blind review} +} +``` diff --git a/experiments/__init__.py b/experiments/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/experiments/analysis/__init__.py b/experiments/analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/experiments/analysis/aggregate_new_exps.py b/experiments/analysis/aggregate_new_exps.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca73af1a81fb86bd54ecb7bae468f48d5e9d94c --- /dev/null +++ b/experiments/analysis/aggregate_new_exps.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +"""Aggregate results from the three new benchmark experiments.""" +import os +import json +import glob +import numpy as np + +ROOT = '${PULSE_ROOT}/results/exp_new' + + +def load_results(pattern): + files = sorted(glob.glob(pattern)) + results = [] + for f in files: + try: + results.append(json.load(open(f))) + except Exception as e: + print(f" ERR: {f}: {e}") + return results + + +def aggregate_expA(): + """Missing modality: average across seeds per eval config.""" + print("\n" + "=" * 70) + print("EXP A: Missing-modality robustness") + print("=" * 70) + + for subdir in ['expA_missing', 'expA_baseline']: + files = load_results(f'{ROOT}/{subdir}/*/results.json') + if not files: + print(f" No results yet for {subdir}") + continue + print(f"\n-- {subdir} (n seeds = {len(files)}) --") + # Group by eval config name; accumulate F1/Acc over seeds + config_stats = {} + for r in files: + if 'eval_configs' not in r: + continue + for name, info in r['eval_configs'].items(): + config_stats.setdefault(name, {'f1': [], 'acc': [], 'active': info['active']}) + config_stats[name]['f1'].append(info['f1']) + config_stats[name]['acc'].append(info['acc']) + + # Order: full, leave-one-out, singletons + full_names = [n for n in config_stats if n == 'full'] + drop_names = sorted([n for n in config_stats if n.startswith('drop_')]) + only_names = sorted([n for n in config_stats if n.startswith('only_')]) + + print(f" {'Config':<22s} {'Active modalities':<42s} " + f"{'F1 mean±std':<14s} {'Acc mean±std':<14s}") + print(' ' + '-' * 96) + for grp in [full_names, drop_names, only_names]: + for name in grp: + d = config_stats[name] + f1_m, f1_s = np.mean(d['f1']), np.std(d['f1']) + ac_m, ac_s = np.mean(d['acc']), np.std(d['acc']) + active = ','.join(d['active']) + print(f" {name:<22s} {active:<42s} " + f"{f1_m:.3f}±{f1_s:.3f} {ac_m:.3f}±{ac_s:.3f}") + + +def aggregate_expB(): + """Grip regression: group by (backbone, mod_config), average over seeds.""" + print("\n" + "=" * 70) + print("EXP B: Grip force regression") + print("=" * 70) + files = load_results(f'{ROOT}/expB_grip/*/results.json') + if not files: + print(" No results yet") + return + + # Group + groups = {} + for r in files: + if 'best_test_metrics' not in r: + continue + key = (r['backbone'], ','.join(r['modalities'])) + groups.setdefault(key, []).append(r) + + rows = [] + for (bb, mods), rs in groups.items(): + mae_R = [r['best_test_metrics']['right_hand']['mae_g'] for r in rs] + mae_L = [r['best_test_metrics']['left_hand']['mae_g'] for r in rs] + r_R = [r['best_test_metrics']['right_hand']['pearson_r'] for r in rs] + r_L = [r['best_test_metrics']['left_hand']['pearson_r'] for r in rs] + r2_R = [r['best_test_metrics']['right_hand']['r2'] for r in rs] + r2_L = [r['best_test_metrics']['left_hand']['r2'] for r in rs] + mae_avg = [r['best_test_metrics']['avg_mae_g'] for r in rs] + r_avg = [r['best_test_metrics']['avg_pearson_r'] for r in rs] + rows.append({ + 'backbone': bb, + 'modalities': mods, + 'n_seeds': len(rs), + 'mae_R': (np.mean(mae_R), np.std(mae_R)), + 'mae_L': (np.mean(mae_L), np.std(mae_L)), + 'mae_avg': (np.mean(mae_avg), np.std(mae_avg)), + 'r_R': (np.mean(r_R), np.std(r_R)), + 'r_L': (np.mean(r_L), np.std(r_L)), + 'r_avg': (np.mean(r_avg), np.std(r_avg)), + 'r2_R': (np.mean(r2_R), np.std(r2_R)), + 'r2_L': (np.mean(r2_L), np.std(r2_L)), + }) + rows.sort(key=lambda r: r['r_avg'][0], reverse=True) + print(f" {'Backbone':<12s} {'Modalities':<30s} N " + f"{'MAE(g) avg':<14s} {'Pearson r avg':<14s} {'R²(R)':<12s} {'R²(L)':<12s}") + print(' ' + '-' * 102) + for row in rows: + print(f" {row['backbone']:<12s} {row['modalities']:<30s} {row['n_seeds']} " + f"{row['mae_avg'][0]:.1f}±{row['mae_avg'][1]:.1f} " + f"{row['r_avg'][0]:.3f}±{row['r_avg'][1]:.3f} " + f"{row['r2_R'][0]:.3f}±{row['r2_R'][1]:.3f} " + f"{row['r2_L'][0]:.3f}±{row['r2_L'][1]:.3f}") + + +def aggregate_expC(): + """T5 retrieval: group by mod config, average over seeds.""" + print("\n" + "=" * 70) + print("EXP C: T5 Cross-modal text retrieval") + print("=" * 70) + files = load_results(f'{ROOT}/expC_retrieval/*/results.json') + if not files: + print(" No results yet") + return + groups = {} + for r in files: + if 'final_avg_over_3_pool_seeds' not in r: + continue + key = ','.join(r['modalities']) + groups.setdefault(key, []).append(r) + + rows = [] + for mods, rs in groups.items(): + r1 = [r['final_avg_over_3_pool_seeds']['recall@1'] for r in rs] + r5 = [r['final_avg_over_3_pool_seeds']['recall@5'] for r in rs] + r10 = [r['final_avg_over_3_pool_seeds']['recall@10'] for r in rs] + medR = [r['final_avg_over_3_pool_seeds']['median_rank'] for r in rs] + rows.append({ + 'modalities': mods, + 'n_seeds': len(rs), + 'r1': (np.mean(r1), np.std(r1)), + 'r5': (np.mean(r5), np.std(r5)), + 'r10': (np.mean(r10), np.std(r10)), + 'medR': (np.mean(medR), np.std(medR)), + 'n_test': rs[0].get('n_test_segments', 0), + 'K': rs[0].get('K_pool', 100), + }) + rows.sort(key=lambda r: r['r10'][0], reverse=True) + print(f" {'Modalities':<30s} N N_test K " + f"{'R@1':<12s} {'R@5':<12s} {'R@10':<12s} {'medR':<12s}") + print(' ' + '-' * 100) + for row in rows: + print(f" {row['modalities']:<30s} {row['n_seeds']} {row['n_test']:<6d} {row['K']:<2d} " + f"{row['r1'][0]:.3f}±{row['r1'][1]:.3f} " + f"{row['r5'][0]:.3f}±{row['r5'][1]:.3f} " + f"{row['r10'][0]:.3f}±{row['r10'][1]:.3f} " + f"{row['medR'][0]:.1f}±{row['medR'][1]:.1f}") + + +def main(): + aggregate_expA() + aggregate_expB() + aggregate_expC() + + +if __name__ == '__main__': + main() diff --git a/experiments/analysis/aggregate_t1_extended.py b/experiments/analysis/aggregate_t1_extended.py new file mode 100644 index 0000000000000000000000000000000000000000..f55119ac43543b8f04ba0575636f491d0c057082 --- /dev/null +++ b/experiments/analysis/aggregate_t1_extended.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +"""Aggregate T1 extended benchmark results. +Prints a Markdown-style table sorted by F1 desc.""" +import os +import json +import glob +import numpy as np +from collections import defaultdict + +ROOT = '${PULSE_ROOT}/results/t1_extended' + + +def collect(pattern): + by_key = defaultdict(list) + for f in sorted(glob.glob(pattern)): + try: + r = json.load(open(f)) + except Exception as e: + print(f" ERR reading {f}: {e}") + continue + key = r.get('method', os.path.basename(os.path.dirname(f))) + # Distinguish ablations by tag + tag = r.get('args', {}).get('tag', '') + if tag: + key = f"{key}_{tag}" + by_key[key].append(r) + return by_key + + +def main(): + groups = collect(f'{ROOT}/*/results.json') + rows = [] + for key, rs in groups.items(): + f1s = [r['test_f1'] for r in rs] + accs = [r['test_acc'] for r in rs] + mods = ','.join(rs[0]['modalities']) + rows.append({ + 'method': key, + 'modalities': mods, + 'n_seeds': len(rs), + 'f1_mean': np.mean(f1s), + 'f1_std': np.std(f1s), + 'acc_mean': np.mean(accs), + 'acc_std': np.std(accs), + 'n_params': rs[0].get('n_params', 0), + }) + rows.sort(key=lambda r: r['f1_mean'], reverse=True) + + print(f"\n{'Method':<28s} {'Modalities':<32s} N {'F1 mean±std':<14s} " + f"{'Acc mean±std':<14s} Params") + print('-' * 110) + for r in rows: + print(f"{r['method']:<28s} {r['modalities']:<32s} {r['n_seeds']} " + f"{r['f1_mean']:.3f}±{r['f1_std']:.3f} " + f"{r['acc_mean']:.3f}±{r['acc_std']:.3f} " + f"{r['n_params']:,}") + + +if __name__ == '__main__': + main() diff --git a/experiments/analysis/analysis_figures.py b/experiments/analysis/analysis_figures.py new file mode 100644 index 0000000000000000000000000000000000000000..04d6535b34e5f80bfd88df74fc61f6d3ec2df4eb --- /dev/null +++ b/experiments/analysis/analysis_figures.py @@ -0,0 +1,444 @@ +#!/usr/bin/env python3 +"""Generate three showcase figures for the main paper: + 1. Eye-Hand-Contact coordination (gaze fixation + hand velocity + pressure) + 2. Pressure fingerprints per action category + 3. 3D hand trajectory colored by pressure +""" +import os, glob, json, re +import numpy as np +import pandas as pd +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +from scipy.signal import savgol_filter + +DATASET = "${PULSE_ROOT}/dataset" +OUT_DIR = "${PULSE_ROOT}/paper/figures" +os.makedirs(OUT_DIR, exist_ok=True) + +PRESSURE_THRESHOLD = 5.0 +FPS = 100 + + +# ============================================================ +# Shared data-loading helpers +# ============================================================ + +def load_pressure(scenario_dir): + """Return (T, 2) array of (right_total, left_total) pressure.""" + f = os.path.join(scenario_dir, "aligned_pressure_100hz.csv") + if not os.path.exists(f): + return None + df = pd.read_csv(f, low_memory=False) + r_cols = [c for c in df.columns if c.startswith('R') and c.endswith('(g)')] + l_cols = [c for c in df.columns if c.startswith('L') and c.endswith('(g)')] + if len(r_cols) < 20 or len(l_cols) < 20: + return None + r = df[r_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values + l = df[l_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values + return r, l # (T, 25) each + + +def load_emg(scenario_dir): + f = os.path.join(scenario_dir, "aligned_emg_100hz.csv") + if not os.path.exists(f): + return None + df = pd.read_csv(f, low_memory=False) + numeric = [c for c in df.select_dtypes(include=[np.number]).columns + if c not in ('time', 'UTC', 'Frame')] + if len(numeric) < 4: + return None + return np.nan_to_num(df[numeric].values.astype(np.float32)) + + +def load_gaze(scenario_dir): + f = os.path.join(scenario_dir, "aligned_eyetrack_100hz.csv") + if not os.path.exists(f): + return None + df = pd.read_csv(f, low_memory=False) + gx_col = [c for c in df.columns if 'Gaze X' in c and 'Scene Cam' in c] + gy_col = [c for c in df.columns if 'Gaze Y' in c and 'Scene Cam' in c] + if gx_col and gy_col: + gx = pd.to_numeric(df[gx_col[0]], errors='coerce').fillna(0).values + gy = pd.to_numeric(df[gy_col[0]], errors='coerce').fillna(0).values + return np.stack([gx, gy], axis=1) + return None + + +def load_mocap_hand(scenario_dir, vol, scenario): + """Return wrist 3D position (T,3) and tip position summary.""" + f = os.path.join(scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv") + if not os.path.exists(f): + return None, None + df = pd.read_csv(f, sep='\t', low_memory=False) + # Right hand wrist (try several naming patterns) + candidates = [ + ['RightHand_X','RightHand_Y','RightHand_Z'], + ['R_Hand_X','R_Hand_Y','R_Hand_Z'], + ['Q_RWristIn_X','Q_RWristIn_Y','Q_RWristIn_Z'], + ] + r_wrist = None + for cs in candidates: + if all(c in df.columns for c in cs): + r_wrist = df[cs].apply(pd.to_numeric, errors='coerce').fillna(0).values + break + l_wrist = None + for cs_l in [['LeftHand_X','LeftHand_Y','LeftHand_Z'], + ['L_Hand_X','L_Hand_Y','L_Hand_Z'], + ['Q_LWristIn_X','Q_LWristIn_Y','Q_LWristIn_Z']]: + if all(c in df.columns for c in cs_l): + l_wrist = df[cs_l].apply(pd.to_numeric, errors='coerce').fillna(0).values + break + return r_wrist, l_wrist + + +def compute_velocity(position, window=5): + """Magnitude of velocity (after smoothing).""" + vel = np.zeros_like(position) + vel[1:] = position[1:] - position[:-1] + mag = np.linalg.norm(vel, axis=1) + try: + mag = savgol_filter(mag, window_length=min(window*2+1, len(mag)-1 if len(mag)%2==0 else len(mag)), polyorder=2) + except: + pass + return mag + + +def detect_grasp_events(hand_pressure, threshold=PRESSURE_THRESHOLD, min_gap=50): + """Detect pressure onset events.""" + total = hand_pressure.sum(axis=1) if hand_pressure.ndim == 2 else hand_pressure + above = total > threshold + onsets = [] + last_state = False + for i, a in enumerate(above): + if a and not last_state: + if i + 10 < len(above) and np.mean(above[i:i+10]) > 0.7: + if not onsets or i - onsets[-1] > min_gap: + onsets.append(i) + last_state = True + elif not a and last_state: + if i + 5 < len(above) and np.mean(above[i:i+5]) < 0.3: + last_state = False + return onsets + + +def emg_envelope(emg, window=20): + rect = np.abs(emg - np.mean(emg, axis=0)) + kernel = np.ones(window) / window + env = np.stack([np.convolve(rect[:, c], kernel, mode='same') for c in range(rect.shape[1])], axis=1) + return env.sum(axis=1) + + +def gaze_velocity(gaze_xy, window=5): + """Magnitude of gaze velocity — high = saccade, low = fixation.""" + v = np.zeros_like(gaze_xy) + v[1:] = gaze_xy[1:] - gaze_xy[:-1] + mag = np.linalg.norm(v, axis=1) + try: + mag = savgol_filter(mag, window_length=min(window*2+1, 15), polyorder=2) + except: + pass + return mag + + +# ============================================================ +# FIGURE 1: Eye-Hand-Contact coordination +# ============================================================ +def make_eye_hand_contact_figure(): + print("=== Figure 1: Eye-Hand-Contact coordination ===") + context = 200 # 2s before + 0.5s after + after = 50 + events = [] # list of dicts: gaze_vel, hand_vel, pressure, all shape (context+after,) + + for vol_dir in sorted(glob.glob(f"{DATASET}/v*")): + vol = os.path.basename(vol_dir) + for sd in sorted(glob.glob(f"{vol_dir}/s*")): + scenario = os.path.basename(sd) + meta_path = os.path.join(sd, "alignment_metadata.json") + if not os.path.exists(meta_path): + continue + meta = json.load(open(meta_path)) + if not {'pressure', 'eyetrack', 'mocap'}.issubset(set(meta['modalities'])): + continue + + p = load_pressure(sd) + g = load_gaze(sd) + r_wrist, _ = load_mocap_hand(sd, vol, scenario) + if p is None or g is None or r_wrist is None: + continue + r_p, _ = p + min_len = min(len(r_p), len(g), len(r_wrist)) + r_p, g, r_wrist = r_p[:min_len], g[:min_len], r_wrist[:min_len] + + hand_vel = compute_velocity(r_wrist) + gvel = gaze_velocity(g) + total_p = r_p.sum(axis=1) + + onsets = detect_grasp_events(r_p) + for o in onsets: + if o < context or o + after >= min_len: + continue + # Require quiescent pre-grasp + rest_window = gvel[o-150:o-100] + vel_rest = hand_vel[o-150:o-100] + if np.mean(vel_rest) > hand_vel[o-50:o].mean() * 0.5: + continue + gv_seg = gvel[o-context:o+after] + hv_seg = hand_vel[o-context:o+after] + pr_seg = total_p[o-context:o+after] + if len(gv_seg) != context+after or np.isnan(gv_seg).any(): + continue + events.append({'gv': gv_seg, 'hv': hv_seg, 'p': pr_seg}) + if len(events) > 400: + break + if len(events) > 400: + break + + print(f" Collected {len(events)} events") + if len(events) < 50: + print(" Not enough events, skipping") + return + + # Gaze: fixation = low gaze velocity, so use "1 - normalized gaze velocity" + # This represents "gaze fixation stability" + def norm01(arr): + arr = np.array(arr) + arr = arr - arr.min(axis=1, keepdims=True) + mx = arr.max(axis=1, keepdims=True) + return arr / (mx + 1e-8) + + gv_stack = norm01([e['gv'] for e in events]) + hv_stack = norm01([e['hv'] for e in events]) + p_stack = norm01([e['p'] for e in events]) + + # Smooth gaze to show fixation trend + # Gaze fixation = low velocity. Plot (1 - gaze_velocity) -> rises as gaze fixates + gaze_fix = 1 - gv_stack # high = fixating + # Normalize each event's fix to [0,1] for display + gaze_fix_plot = norm01(gaze_fix) + + time_axis = np.arange(-context, after) * 10 # ms + + fig, ax = plt.subplots(figsize=(9, 4.5)) + + for stack, color, label in [ + (gaze_fix_plot, '#8E44AD', 'Gaze fixation'), + (hv_stack, '#3498DB', 'Hand velocity'), + (p_stack, '#27AE60', 'Pressure (contact)'), + ]: + mean = stack.mean(axis=0) + std = stack.std(axis=0) + ax.plot(time_axis, mean, color=color, linewidth=2.5, label=label) + ax.fill_between(time_axis, mean - std*0.4, mean + std*0.4, color=color, alpha=0.15) + + ax.axvline(0, color='black', linestyle='--', linewidth=1.2, alpha=0.7) + ax.set_xlabel('Time relative to contact onset (ms)', fontsize=12) + ax.set_ylabel('Normalized amplitude', fontsize=12) + ax.set_title(f'Gaze → Hand → Contact coordination ({len(events)} events)', + fontsize=13, fontweight='bold') + ax.set_xlim(-2000, 500) + ax.legend(loc='upper left', fontsize=10, frameon=True) + ax.grid(True, alpha=0.3) + ax.set_ylim(-0.05, 1.1) + + plt.tight_layout() + out_path = os.path.join(OUT_DIR, 'eye_hand_contact.pdf') + plt.savefig(out_path, dpi=150, bbox_inches='tight') + plt.savefig(out_path.replace('.pdf', '.png'), dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved {out_path}") + + +# ============================================================ +# FIGURE 2: Pressure fingerprints per action category +# ============================================================ +def make_pressure_fingerprints(): + print("\n=== Figure 2: Pressure fingerprints ===") + import sys + sys.path.insert(0, '${PULSE_ROOT}') + from experiments.train_exp2 import load_annotations + + # For each action class, accumulate mean pressure profile (50 channels) + action_r_sum = {} # action -> (sum 25 channels, count) + action_l_sum = {} + + for vol_dir in sorted(glob.glob(f"{DATASET}/v*")): + vol = os.path.basename(vol_dir) + for sd in sorted(glob.glob(f"{vol_dir}/s*")): + scenario = os.path.basename(sd) + meta_path = os.path.join(sd, "alignment_metadata.json") + if not os.path.exists(meta_path): + continue + meta = json.load(open(meta_path)) + if 'pressure' not in set(meta['modalities']): + continue + p = load_pressure(sd) + if p is None: + continue + r_p, l_p = p + labels = load_annotations(vol, scenario, len(r_p), sampling_rate=100, use_coarse=False) + if labels is None: + continue + labels = labels[:len(r_p)] + from experiments.train_exp2 import ACTION_NAMES + for a_id, a_name in ACTION_NAMES.items(): + if a_name == 'Idle': + continue + mask = labels == a_id + if mask.sum() < 10: + continue + r_mean = r_p[mask].mean(axis=0) + l_mean = l_p[mask].mean(axis=0) + if a_name not in action_r_sum: + action_r_sum[a_name] = [np.zeros(25), 0] + action_l_sum[a_name] = [np.zeros(25), 0] + action_r_sum[a_name][0] += r_mean * mask.sum() + action_r_sum[a_name][1] += mask.sum() + action_l_sum[a_name][0] += l_mean * mask.sum() + action_l_sum[a_name][1] += mask.sum() + + # Compute mean for each action + results = {} + for a_name in action_r_sum: + r_cnt = action_r_sum[a_name][1] + l_cnt = action_l_sum[a_name][1] + if r_cnt == 0 or l_cnt == 0: + continue + results[a_name] = { + 'r': action_r_sum[a_name][0] / r_cnt, + 'l': action_l_sum[a_name][0] / l_cnt, + } + print(f" Action categories: {list(results.keys())}") + + if not results: + print(" No data") + return + + # Pick top 6 by frequency (they have most data) + # Sort by right-hand count + sorted_actions = sorted(results.keys(), + key=lambda a: action_r_sum[a][1], reverse=True)[:6] + + # Plot as 2-row grid: top row = right hand, bottom row = left hand (or combine as single image) + # Use 25 points arranged as a 5x5 grid (stylized hand layout) + # Actual finger layout is complex; for visualization use simple grid + # Layout (rough hand analogy): arrange as fingertips at top, palm base at bottom + # Index mapping — 25 points, organized heuristically: + # row 0 (fingertips): 1-5 + # row 1-2: finger segments + # row 3-4: palm area + def point_to_xy(idx): + """Map channel index (0-24) to 2D hand position (stylized).""" + # Simple 5x5 grid + row = idx // 5 + col = idx % 5 + return col, 4 - row # flip y so fingertips at top + + n = len(sorted_actions) + fig, axes = plt.subplots(2, n, figsize=(2.0 * n, 4.8), squeeze=False) + vmax = max(max(results[a]['r'].max(), results[a]['l'].max()) for a in sorted_actions) + + for i, a in enumerate(sorted_actions): + for row, (hand, title) in enumerate([('r', 'Right'), ('l', 'Left')]): + ax = axes[row][i] + data = results[a][hand] + grid = np.zeros((5, 5)) + for idx, v in enumerate(data): + x, y = point_to_xy(idx) + grid[4-y, x] = v + im = ax.imshow(grid, cmap='hot', vmin=0, vmax=vmax, aspect='equal') + ax.set_xticks([]); ax.set_yticks([]) + if row == 0: + ax.set_title(a, fontsize=11, fontweight='bold') + if i == 0: + ax.set_ylabel(title, fontsize=10) + + fig.suptitle('Per-action fingertip pressure signatures (mean across events)', + fontsize=12, fontweight='bold', y=0.98) + cbar = fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.7, pad=0.02) + cbar.set_label('Pressure (g)', fontsize=10) + plt.savefig(os.path.join(OUT_DIR, 'pressure_fingerprints.pdf'), bbox_inches='tight') + plt.savefig(os.path.join(OUT_DIR, 'pressure_fingerprints.png'), dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved pressure_fingerprints.pdf") + + +# ============================================================ +# FIGURE 3: 3D hand trajectory colored by pressure +# ============================================================ +def make_3d_trajectory(): + print("\n=== Figure 3: 3D hand trajectory + pressure coloring ===") + from mpl_toolkits.mplot3d import Axes3D + # Pick a few illustrative recordings with rich grasping — use v1 s3 (kitchen) or similar + candidates = [('v1', 's3'), ('v2', 's4'), ('v1', 's5'), ('v1', 's7')] + picked = [] + + for vol, scn in candidates: + sd = f"{DATASET}/{vol}/{scn}" + if not os.path.isdir(sd): + continue + p = load_pressure(sd) + r_wrist, _ = load_mocap_hand(sd, vol, scn) + if p is None or r_wrist is None: + continue + r_p, _ = p + min_len = min(len(r_p), len(r_wrist)) + total_p = r_p[:min_len].sum(axis=1) + r_wrist = r_wrist[:min_len] + # Take a window that contains a grasp + onsets = detect_grasp_events(r_p[:min_len]) + if not onsets: + continue + # Take ~3s centred on first onset + o = onsets[0] + start = max(0, o - 150) + end = min(min_len, o + 150) + traj = r_wrist[start:end] + pressure = total_p[start:end] + picked.append((vol, scn, traj, pressure)) + if len(picked) >= 3: + break + + if not picked: + print(" No valid recordings found") + return + + fig = plt.figure(figsize=(3.5 * len(picked), 4)) + for i, (vol, scn, traj, pr) in enumerate(picked): + ax = fig.add_subplot(1, len(picked), i+1, projection='3d') + # Normalize pressure for coloring + pr_norm = pr / (pr.max() + 1e-6) + # Plot as colored line segments + for j in range(len(traj) - 1): + x = traj[j:j+2, 0] + y = traj[j:j+2, 1] + z = traj[j:j+2, 2] + c = plt.cm.coolwarm(pr_norm[j]) + ax.plot(x, y, z, color=c, linewidth=2.5, alpha=0.85) + # Mark contact point + contact_idx = np.argmax(pr) + ax.scatter(traj[contact_idx, 0], traj[contact_idx, 1], traj[contact_idx, 2], + color='red', s=50, marker='*', zorder=5, label='Peak contact') + ax.set_title(f'{vol}/{scn}', fontsize=10) + ax.set_xlabel('X', fontsize=8); ax.set_ylabel('Y', fontsize=8); ax.set_zlabel('Z', fontsize=8) + ax.tick_params(labelsize=7) + + # Colorbar + sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=matplotlib.colors.Normalize(vmin=0, vmax=1)) + sm.set_array([]) + cbar = fig.colorbar(sm, ax=fig.axes, shrink=0.6, pad=0.02) + cbar.set_label('Normalised pressure', fontsize=10) + + fig.suptitle('Right-hand wrist 3D trajectory coloured by fingertip pressure', + fontsize=12, fontweight='bold', y=1.02) + plt.savefig(os.path.join(OUT_DIR, 'hand_trajectory_3d.pdf'), bbox_inches='tight') + plt.savefig(os.path.join(OUT_DIR, 'hand_trajectory_3d.png'), dpi=150, bbox_inches='tight') + plt.close() + print(f" Saved hand_trajectory_3d.pdf") + + +if __name__ == '__main__': + make_eye_hand_contact_figure() + make_pressure_fingerprints() + make_3d_trajectory() + print("\nAll figures generated in", OUT_DIR) diff --git a/experiments/analysis/build_taxonomy.py b/experiments/analysis/build_taxonomy.py new file mode 100644 index 0000000000000000000000000000000000000000..18c0167958a6939b7dec041a5dfed9ee10b30de3 --- /dev/null +++ b/experiments/analysis/build_taxonomy.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +Rebuild the frozen taxonomy JSON from the current annotations_v3/ state. + +Run this *once* after annotation is complete to lock the 28+ noun list. Later +experiments load the frozen list via taxonomy.py, so class indices don't +drift if more annotations are ever added. + +Usage: + python3 experiments/build_taxonomy.py + python3 experiments/build_taxonomy.py --threshold 50 --out experiments/taxonomy_v3.json +""" + +import argparse +import glob +import json +import os +from collections import Counter +from pathlib import Path + +REPO = Path(__file__).resolve().parents[1] + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument( + "--annotations_dir", + default=str(REPO / "annotations_v3"), + help="Directory containing v*/s*.json annotation files", + ) + ap.add_argument("--threshold", type=int, default=50, + help="Minimum noun frequency to keep (Strategy A drops the rest)") + ap.add_argument( + "--out", + default=str(REPO / "experiments" / "taxonomy_v3.json"), + help="Output frozen taxonomy JSON", + ) + args = ap.parse_args() + + # Late import so building the list doesn't depend on the frozen file + # being present yet. + import sys + sys.path.insert(0, str(REPO)) + from experiments.taxonomy import ( + VERB_FINE, VERB_COMPOSITE, HAND, NOUN_CANONICAL, canonical_noun, + ) + + paths = sorted(glob.glob(os.path.join(args.annotations_dir, "v*", "s*.json"))) + if not paths: + raise SystemExit(f"No json files under {args.annotations_dir}") + + verbs, nouns, hands = Counter(), Counter(), Counter() + total = 0 + dropped_unknown_verb = 0 + dropped_unknown_hand = 0 + for p in paths: + try: + with open(p) as f: + d = json.load(f) + except Exception as e: + print(f" WARN: could not parse {p}: {e}") + continue + for s in d.get("segments", []): + a = s.get("action_annotation", {}) + v = a.get("action_name") + n = a.get("object_name") + h = a.get("hand_type") + if not (v and n and h): + continue + total += 1 + if v not in VERB_FINE: + dropped_unknown_verb += 1 + continue + if h not in HAND: + dropped_unknown_hand += 1 + continue + verbs[v] += 1 + nouns[canonical_noun(n)] += 1 + hands[h] += 1 + + kept = [n for n, c in nouns.most_common() if c >= args.threshold] + + # Stable alphabetical ordering within kept-set, so re-runs that swap two + # near-tie classes don't flip indices. + kept = sorted(kept, key=lambda n: (-nouns[n], n)) + + surviving_segs = 0 + for p in paths: + with open(p) as f: + d = json.load(f) + for s in d.get("segments", []): + a = s.get("action_annotation", {}) + v = a.get("action_name") + n = a.get("object_name") + h = a.get("hand_type") + if not (v and n and h): + continue + if v not in VERB_FINE or h not in HAND: + continue + if canonical_noun(n) not in kept: + continue + surviving_segs += 1 + + out = { + "threshold": args.threshold, + "annotation_file_count": len(paths), + "total_segments": total, + "dropped_unknown_verb": dropped_unknown_verb, + "dropped_unknown_hand": dropped_unknown_hand, + "surviving_segments": surviving_segs, + "verbs": VERB_FINE, + "verb_composite": VERB_COMPOSITE, + "hand": HAND, + "nouns": kept, + "noun_counts": {n: nouns[n] for n in kept}, + "verb_counts": dict(verbs), + "hand_counts": dict(hands), + } + Path(args.out).parent.mkdir(parents=True, exist_ok=True) + with open(args.out, "w") as f: + json.dump(out, f, ensure_ascii=False, indent=2) + + print(f"Scanned {len(paths)} files, {total} segments") + print(f"Dropped (unknown verb / hand): {dropped_unknown_verb} / " + f"{dropped_unknown_hand}") + print(f"Kept {len(kept)} nouns (>= {args.threshold}):") + for n in kept: + print(f" {n}: {nouns[n]}") + print(f"Surviving segments (Strategy A): " + f"{surviving_segs} / {total} " + f"({100 * surviving_segs / max(1, total):.1f}%)") + print(f"Wrote {args.out}") + + +if __name__ == "__main__": + main() diff --git a/experiments/analysis/check_seg_lengths.py b/experiments/analysis/check_seg_lengths.py new file mode 100644 index 0000000000000000000000000000000000000000..25a07b38b315cb71a71934563a18229e046cd9d2 --- /dev/null +++ b/experiments/analysis/check_seg_lengths.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python3 +""" +Analyze segment lengths in the recognition dataset. + +For each annotation file, computes segment lengths in: +- Raw frames (at 100Hz sampling rate) +- Downsampled frames (downsample=5 -> 20Hz effective) + +Reports statistics and distribution relative to window_frames used in training. +""" + +import os +import sys +import json +import re +import numpy as np +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import DATASET_DIR, TRAIN_VOLS, VAL_VOLS, TEST_VOLS + +ANNOTATION_DIR = "${PULSE_ROOT}" +SAMPLING_RATE = 100 # Hz +DOWNSAMPLE = 5 + + +def parse_timestamp(ts_str): + parts = ts_str.strip().split(':') + if len(parts) == 2: + return int(parts[0]) * 60 + int(parts[1]) + elif len(parts) == 3: + return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) + return 0 + + +def main(): + all_vols = TRAIN_VOLS + VAL_VOLS + TEST_VOLS + + # Collect segment lengths + raw_lengths_sec = [] # in seconds + raw_lengths_frames = [] # in raw 100Hz frames + ds_lengths_frames = [] # in downsampled frames (100/5 = 20Hz) + + split_stats = defaultdict(list) # split -> list of ds_lengths + + total_scenarios = 0 + total_segments = 0 + skipped_segments = 0 + + for vol in sorted(all_vols): + # Determine split + if vol in TRAIN_VOLS: + split = 'train' + elif vol in VAL_VOLS: + split = 'val' + else: + split = 'test' + + ann_vol_dir = os.path.join(ANNOTATION_DIR, vol) + if not os.path.isdir(ann_vol_dir): + print(f"WARNING: No annotation dir for {vol}") + continue + + for ann_file in sorted(os.listdir(ann_vol_dir)): + if not ann_file.endswith('.json'): + continue + scenario = ann_file.replace('.json', '') + ann_path = os.path.join(ann_vol_dir, ann_file) + + # Also check that corresponding dataset dir exists + scenario_dir = os.path.join(DATASET_DIR, vol, scenario) + if not os.path.isdir(scenario_dir): + continue + + with open(ann_path) as f: + ann = json.load(f) + + total_scenarios += 1 + + for seg in ann.get('segments', []): + m = re.match(r'(\d+:\d+(?::\d+)?)\s*-\s*(\d+:\d+(?::\d+)?)', + seg['timestamp']) + if not m: + skipped_segments += 1 + continue + + start_sec = parse_timestamp(m.group(1)) + end_sec = parse_timestamp(m.group(2)) + + if end_sec <= start_sec: + skipped_segments += 1 + continue + + duration_sec = end_sec - start_sec + raw_frames = duration_sec * SAMPLING_RATE + ds_frames = int(end_sec * SAMPLING_RATE / DOWNSAMPLE) - int(start_sec * SAMPLING_RATE / DOWNSAMPLE) + + raw_lengths_sec.append(duration_sec) + raw_lengths_frames.append(raw_frames) + ds_lengths_frames.append(ds_frames) + split_stats[split].append(ds_frames) + total_segments += 1 + + # Convert to numpy + raw_sec = np.array(raw_lengths_sec) + raw_fr = np.array(raw_lengths_frames) + ds_fr = np.array(ds_lengths_frames) + + print("=" * 70) + print("SEGMENT LENGTH ANALYSIS FOR RECOGNITION DATASET") + print("=" * 70) + print(f"\nTotal scenarios: {total_scenarios}") + print(f"Total valid segments: {total_segments}") + print(f"Skipped segments (bad timestamp): {skipped_segments}") + print(f"Sampling rate: {SAMPLING_RATE} Hz") + print(f"Downsample factor: {DOWNSAMPLE}") + print(f"Effective rate after downsample: {SAMPLING_RATE / DOWNSAMPLE} Hz") + + # --- Raw seconds --- + print("\n" + "-" * 70) + print("SEGMENT DURATION (seconds)") + print("-" * 70) + print(f" Min: {raw_sec.min():.1f}s") + print(f" Max: {raw_sec.max():.1f}s") + print(f" Mean: {raw_sec.mean():.2f}s") + print(f" Median: {np.median(raw_sec):.1f}s") + print(f" Std: {raw_sec.std():.2f}s") + + # Percentiles + for p in [5, 10, 25, 50, 75, 90, 95]: + print(f" P{p:2d}: {np.percentile(raw_sec, p):.1f}s") + + # --- Raw frames (100Hz) --- + print("\n" + "-" * 70) + print("SEGMENT LENGTH (raw frames @ 100Hz)") + print("-" * 70) + print(f" Min: {raw_fr.min()}") + print(f" Max: {raw_fr.max()}") + print(f" Mean: {raw_fr.mean():.1f}") + print(f" Median: {np.median(raw_fr):.0f}") + + # --- Downsampled frames --- + print("\n" + "-" * 70) + print(f"SEGMENT LENGTH (downsampled frames @ {SAMPLING_RATE/DOWNSAMPLE:.0f}Hz)") + print("-" * 70) + print(f" Min: {ds_fr.min()}") + print(f" Max: {ds_fr.max()}") + print(f" Mean: {ds_fr.mean():.1f}") + print(f" Median: {np.median(ds_fr):.0f}") + print(f" Std: {ds_fr.std():.1f}") + + for p in [5, 10, 25, 50, 75, 90, 95]: + print(f" P{p:2d}: {np.percentile(ds_fr, p):.0f}") + + # --- Comparison with window_frames --- + print("\n" + "-" * 70) + print("COMPARISON WITH window_frames SETTINGS") + print("-" * 70) + + # Common window_sec values and their corresponding window_frames + for window_sec in [5.0, 10.0, 15.0, 20.0, 30.0]: + wf = int(window_sec * SAMPLING_RATE / DOWNSAMPLE) + shorter = (ds_fr < wf).sum() + equal_or_longer = (ds_fr >= wf).sum() + longer = (ds_fr > wf).sum() + pct_shorter = 100.0 * shorter / len(ds_fr) + pct_longer = 100.0 * longer / len(ds_fr) + print(f"\n window_sec={window_sec:5.1f}s -> window_frames={wf}") + print(f" Segments SHORTER than window: {shorter:4d} ({pct_shorter:5.1f}%) -> will be PADDED") + print(f" Segments LONGER than window: {longer:4d} ({pct_longer:5.1f}%) -> will be CENTER-CROPPED") + + # --- Thresholds in downsampled frames --- + print("\n" + "-" * 70) + print("PERCENTAGE SHORTER THAN THRESHOLDS (downsampled frames)") + print("-" * 70) + for thresh in [20, 40, 60, 100, 200, 300, 400, 500, 1000, 2000]: + pct = 100.0 * (ds_fr < thresh).sum() / len(ds_fr) + print(f" < {thresh:5d} frames ({thresh * DOWNSAMPLE / SAMPLING_RATE:6.1f}s): {pct:5.1f}%") + + # --- Per-split stats --- + print("\n" + "-" * 70) + print("PER-SPLIT STATISTICS (downsampled frames)") + print("-" * 70) + for split in ['train', 'val', 'test']: + arr = np.array(split_stats[split]) + if len(arr) == 0: + print(f" {split}: no segments") + continue + print(f"\n {split.upper()} ({len(arr)} segments):") + print(f" Min={arr.min()}, Max={arr.max()}, Mean={arr.mean():.1f}, Median={np.median(arr):.0f}") + + # --- Histogram (text-based) --- + print("\n" + "-" * 70) + print("HISTOGRAM OF SEGMENT DURATIONS (seconds)") + print("-" * 70) + bins = [0, 1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 60, 120, 300, 600] + for i in range(len(bins) - 1): + count = ((raw_sec >= bins[i]) & (raw_sec < bins[i + 1])).sum() + pct = 100.0 * count / len(raw_sec) + bar = '#' * int(pct / 2) + print(f" [{bins[i]:4d}-{bins[i+1]:4d})s: {count:5d} ({pct:5.1f}%) {bar}") + # Last bin: >= 600 + count = (raw_sec >= bins[-1]).sum() + pct = 100.0 * count / len(raw_sec) + bar = '#' * int(pct / 2) + print(f" [{bins[-1]:4d}+ )s: {count:5d} ({pct:5.1f}%) {bar}") + + # --- Key insight --- + print("\n" + "=" * 70) + print("KEY INSIGHTS") + print("=" * 70) + median_sec = np.median(raw_sec) + mean_sec = raw_sec.mean() + print(f" Median segment duration: {median_sec:.1f}s ({median_sec * SAMPLING_RATE / DOWNSAMPLE:.0f} ds-frames)") + print(f" Mean segment duration: {mean_sec:.1f}s ({mean_sec * SAMPLING_RATE / DOWNSAMPLE:.0f} ds-frames)") + print() + # Suggest optimal window + p95_sec = np.percentile(raw_sec, 95) + print(f" 95th percentile duration: {p95_sec:.1f}s") + print(f" -> A window of {p95_sec:.0f}s would cover 95% of segments without cropping") + print(f" -> Current default window_sec=15.0 -> window_frames={int(15.0 * SAMPLING_RATE / DOWNSAMPLE)}") + wf15 = int(15.0 * SAMPLING_RATE / DOWNSAMPLE) + pct_crop = 100.0 * (ds_fr > wf15).sum() / len(ds_fr) + pct_pad = 100.0 * (ds_fr < wf15).sum() / len(ds_fr) + print(f" {pct_pad:.1f}% segments padded, {pct_crop:.1f}% center-cropped") + + +if __name__ == '__main__': + main() diff --git a/experiments/analysis/data_statistics_figure.py b/experiments/analysis/data_statistics_figure.py new file mode 100644 index 0000000000000000000000000000000000000000..e3f33b3dfe63874ca9ceb2b293ffbc5087fad75e --- /dev/null +++ b/experiments/analysis/data_statistics_figure.py @@ -0,0 +1,126 @@ +"""Generate dataset statistics figure from the currently-available annotations. + +Panels (3): + (a) Recording duration distribution per scene (boxplot) + (b) Segment length distribution (histogram) + (c) Top-20 manipulated objects by segment count + +Note: panel for motor-primitive frequency is deferred until the 18-primitive +annotation pipeline (anno.py) is rerun across all recordings. +""" +import json, re +from pathlib import Path +from collections import Counter +import numpy as np +import matplotlib.pyplot as plt + +ANNO_DIR = Path("${PULSE_ROOT}/annotations_by_scene") +OUT = Path("${PULSE_ROOT}/paper/figures/dataset_stats.pdf") + +# Chinese -> English object name mapping (from anno.py OBJECT_TRANSLATIONS) +OBJ_EN = { + "笔记本电脑": "laptop", "有线鼠标": "wired mouse", "有线键盘": "wired keyboard", + "马克笔": "marker", "胶带": "tape", "笔记本电源": "laptop power", "折叠伞": "umbrella", + "剪刀": "scissors", "钱包": "wallet", "纸": "paper", "订书机": "stapler", + "纸箱": "box", "文件": "document", "架子": "rack", "桌布": "tablecloth", "罐子": "jar", + "调料瓶": "seasoning bottle", "密封罐": "sealed jar", "厨房纸巾": "kitchen paper", + "抹布": "cloth", "茶包": "tea bag", "饭碗": "rice bowl", "菜盘": "plate", + "菜锅": "pot", "勺子": "spoon", "水杯": "water cup", "茶杯": "tea cup", + "茶壶": "teapot", "食物残渣": "food residue", "垃圾桶": "trash bin", + "纸巾": "tissue", "餐垫": "placemat", "托盘": "tray", "清洁喷雾": "spray", + "食物": "food", "电源": "power adapter", "移动硬盘": "HDD", "鼠标": "mouse", + "笔记本充电器": "laptop charger", "转换插头": "plug adapter", "插线板": "power strip", + "线材收纳包": "cable organizer", "衬衫": "shirt", "裤子": "pants", + "牙膏": "toothpaste", "牙刷": "toothbrush", "牙刷盒": "toothbrush case", + "剃须刀": "razor", "毛巾": "towel", "皮鞋": "shoes", "鞋袋": "shoe bag", + "耳机": "headphones", "护照套": "passport holder", "证件夹": "ID holder", + "纸巾包": "tissue pack", "行李箱": "suitcase", "马克杯": "mug", + "调料罐": "seasoning jar", "茶罐": "tea canister", "外套": "coat", + "围巾": "scarf", "衣架": "hanger", +} + + +def parse_t(ts: str) -> float: + parts = ts.split(":") + if len(parts) == 2: # MM:SS + m, s = parts + return int(m) * 60 + int(s) + h, m, s = parts + return int(h) * 3600 + int(m) * 60 + int(s) + + +durations = {f"S{i}": [] for i in range(1, 9)} +seg_lengths = [] +objects = Counter() + +for v_dir in sorted(ANNO_DIR.glob("v*")): + for jf in sorted(v_dir.glob("s*.json")): + scene = jf.stem.upper() + try: + data = json.loads(jf.read_text()) + except Exception: + continue + segs = data.get("segments", []) + if not segs: + continue + max_end = 0 + for seg in segs: + ts = seg.get("timestamp", "") + if "-" not in ts: + continue + try: + start, end = ts.split("-") + s_sec, e_sec = parse_t(start), parse_t(end) + seg_lengths.append(e_sec - s_sec) + max_end = max(max_end, e_sec) + for o in seg.get("objects", []) or []: + nm = o.get("name") if isinstance(o, dict) else o + if nm: + objects[OBJ_EN.get(nm, nm)] += 1 + except Exception: + continue + if max_end > 0 and scene in durations: + durations[scene].append(max_end / 60.0) + +print(f"Per-scene durations: { {s: len(v) for s, v in durations.items()} }") +print(f"Total segments: {len(seg_lengths)}") +print(f"Unique objects: {len(objects)}") +top_obj = objects.most_common(5) +print(f"Top objects: {top_obj}") + +fig, axes = plt.subplots(1, 3, figsize=(12, 3.5)) + +# (a) Duration boxplot per scene +ax = axes[0] +scene_order = [f"S{i}" for i in range(1, 9)] +data = [durations[s] for s in scene_order] +ax.boxplot(data, tick_labels=scene_order, showfliers=False, patch_artist=True, + boxprops=dict(facecolor="#b3cde3")) +ax.set_ylabel("Recording duration (min)") +ax.set_title("(a) Recording duration per scene") +ax.grid(axis="y", alpha=0.3) + +# (b) Segment length histogram +ax = axes[1] +seg_arr = np.array(seg_lengths) +seg_arr = seg_arr[seg_arr <= 10] +ax.hist(seg_arr, bins=np.arange(0, 11) - 0.5, color="#8c96c6", edgecolor="black") +ax.set_xlabel("Segment length (s)") +ax.set_ylabel("Segment count") +ax.set_title(f"(b) Segment length (n={len(seg_lengths)})") +ax.set_xticks(range(0, 11)) +ax.grid(axis="y", alpha=0.3) + +# (c) Top-20 objects +ax = axes[2] +objs, ocounts = zip(*objects.most_common(20)) +ax.barh(objs[::-1], ocounts[::-1], color="#74c476") +ax.set_xlabel("Segment count") +ax.set_title("(c) Top-20 manipulated objects") +ax.tick_params(axis="y", labelsize=8) +ax.grid(axis="x", alpha=0.3) + +fig.tight_layout() +fig.savefig(OUT, bbox_inches="tight") +fig.savefig(str(OUT).replace(".pdf", ".png"), dpi=140, bbox_inches="tight") +print(f"Saved: {OUT}") diff --git a/experiments/analysis/exp_per_subject.py b/experiments/analysis/exp_per_subject.py new file mode 100644 index 0000000000000000000000000000000000000000..0cf8397764089baeec05478baf7dcabafb7fcc5a --- /dev/null +++ b/experiments/analysis/exp_per_subject.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +""" +Experiment G: Per-subject diagnostic analysis. + +Load the best scene-recognition checkpoint(s) from previous T1 runs and +produce a per-test-volunteer breakdown of F1 and Accuracy. Reveals whether +aggregate metrics are driven by one or two outlier subjects, as reviewers +often ask. + +Runs CPU-side; no training. +""" + +import os +import sys +import json +import glob +import argparse +import numpy as np +import torch +from sklearn.metrics import accuracy_score, f1_score + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + MultimodalSceneDataset, TEST_VOLS, SCENE_LABELS, NUM_CLASSES, + get_dataloaders, +) +from nets.models import build_model + + +def per_subject_eval(model, device, modalities, stats, downsample): + """Evaluate one model across each test volunteer separately.""" + breakdown = {} + for vol in TEST_VOLS: + ds = MultimodalSceneDataset([vol], modalities, downsample=downsample, + stats=stats) + if len(ds) == 0: + breakdown[vol] = {'n': 0} + continue + preds, ys = [], [] + model.eval() + with torch.no_grad(): + for i in range(len(ds)): + x, y = ds[i] + x = x.to(device).unsqueeze(0) + mask = torch.ones(1, x.size(1), dtype=torch.bool).to(device) + logits = model(x, mask) + preds.append(logits.argmax(dim=1).cpu().item()) + ys.append(y) + breakdown[vol] = { + 'n': len(ds), + 'acc': float(accuracy_score(ys, preds)), + 'f1': float(f1_score(ys, preds, average='macro', zero_division=0)), + 'preds': preds, + 'labels': ys, + 'samples': ds.sample_info, + } + return breakdown + + +def run_on_checkpoint(ckpt_path, args_json_path, output_dir): + ckpt_args = json.load(open(args_json_path))['args'] + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + modalities = ckpt_args['modalities'] if isinstance(ckpt_args['modalities'], list) \ + else ckpt_args['modalities'].split(',') + downsample = ckpt_args.get('downsample', 5) + # Get train stats + _, _, _, info = get_dataloaders(modalities, + batch_size=ckpt_args.get('batch_size', 16), + downsample=downsample) + # Need the actual stats object -- re-load train set to compute + tr_ds = MultimodalSceneDataset( + __import__('experiments.dataset', fromlist=['TRAIN_VOLS']).TRAIN_VOLS, + modalities, downsample=downsample) + stats = tr_ds.get_stats() + + model = build_model( + ckpt_args.get('model', 'transformer'), + ckpt_args.get('fusion', 'late'), + info['feat_dim'], info['modality_dims'], NUM_CLASSES, + hidden_dim=ckpt_args.get('hidden_dim', 128), + proj_dim=ckpt_args.get('proj_dim', 0), + late_agg=ckpt_args.get('late_agg', 'mean'), + ).to(device) + try: + sd = torch.load(ckpt_path, weights_only=True, map_location=device) + except Exception: + sd = torch.load(ckpt_path, map_location=device) + model.load_state_dict(sd, strict=False) + + breakdown = per_subject_eval(model, device, modalities, stats, downsample) + + # Overall F1 + all_preds, all_ys = [], [] + for v, info_v in breakdown.items(): + if info_v.get('n', 0) > 0: + all_preds.extend(info_v['preds']) + all_ys.extend(info_v['labels']) + overall_f1 = float(f1_score(all_ys, all_preds, average='macro', zero_division=0)) + overall_acc = float(accuracy_score(all_ys, all_preds)) + + # Per-subject summary + summary = { + 'ckpt': ckpt_path, + 'modalities': modalities, + 'overall': {'acc': overall_acc, 'f1': overall_f1, + 'n': len(all_preds)}, + 'per_subject': { + v: {'n': b.get('n'), 'acc': b.get('acc'), 'f1': b.get('f1')} + for v, b in breakdown.items() + }, + 'detail': breakdown, + } + os.makedirs(output_dir, exist_ok=True) + out_path = os.path.join(output_dir, os.path.basename( + os.path.dirname(ckpt_path)) + '_per_subject.json') + with open(out_path, 'w') as f: + json.dump(summary, f, indent=2) + print(f"Per-subject breakdown saved: {out_path}") + print(f"Overall F1: {overall_f1:.4f} Acc: {overall_acc:.4f}") + for v, b in summary['per_subject'].items(): + print(f" {v}: n={b['n']} acc={b.get('acc'):.3f} f1={b.get('f1'):.3f}" + if b.get('n') else f" {v}: (empty)") + return summary + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--exp_root', type=str, required=True, + help='Directory containing run subdirs with model_best.pt and results.json') + p.add_argument('--output_dir', type=str, required=True) + args = p.parse_args() + + runs = [] + for sub in sorted(os.listdir(args.exp_root)): + if sub == 'slurm_logs': + continue + ckpt = os.path.join(args.exp_root, sub, 'model_best.pt') + res = os.path.join(args.exp_root, sub, 'results.json') + if os.path.exists(ckpt) and os.path.exists(res): + runs.append((ckpt, res)) + print(f"Found {len(runs)} runs with checkpoints.") + for ckpt, res in runs: + try: + run_on_checkpoint(ckpt, res, args.output_dir) + except Exception as e: + print(f" FAIL {ckpt}: {e}") + + +if __name__ == '__main__': + main() diff --git a/experiments/analysis/extract_video_features.py b/experiments/analysis/extract_video_features.py new file mode 100644 index 0000000000000000000000000000000000000000..f2f7c1970e34e78b3bad0776cb49a0b633d8fbae --- /dev/null +++ b/experiments/analysis/extract_video_features.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +""" +Extract video features from Scene Camera videos using a pretrained backbone. +Uses CLIP (ViT-B/16) which is lightweight and doesn't need video-specific pretraining. + +Output: per-frame feature vectors saved as .npy files, aligned to 100Hz sensor data. +""" + +import os +import sys +import json +import glob +import argparse +import numpy as np +import cv2 +import torch +import torch.nn as nn +from torchvision import transforms + +DATASET_DIR = "${PULSE_ROOT}/dataset" + + +class CLIPFeatureExtractor: + """Extract features using CLIP ViT-B/16 (via torchvision).""" + + def __init__(self, device='cpu'): + self.device = device + # Use torchvision's pretrained ViT + from torchvision.models import vit_b_16, ViT_B_16_Weights + weights = ViT_B_16_Weights.IMAGENET1K_V1 + model = vit_b_16(weights=weights) + # Remove classification head, keep feature extractor + model.heads = nn.Identity() + model.eval() + self.model = model.to(device) + self.transform = weights.transforms() + self.feat_dim = 768 # ViT-B/16 feature dimension + + @torch.no_grad() + def extract_batch(self, frames): + """Extract features from a batch of frames. + + Args: + frames: list of numpy arrays (H, W, 3) in BGR format + Returns: + features: numpy array (N, feat_dim) + """ + tensors = [] + for frame in frames: + # BGR -> RGB -> PIL-like tensor + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + tensor = torch.from_numpy(rgb).permute(2, 0, 1).float() / 255.0 + tensor = self.transform(tensor) + tensors.append(tensor) + + batch = torch.stack(tensors).to(self.device) + features = self.model(batch) + return features.cpu().numpy() + + +def find_scene_video(scenario_dir, vol, scenario): + """Find the Scene Camera video file.""" + pattern = os.path.join(scenario_dir, f"trimmed_{vol}{scenario}*Scene Cam.mp4") + matches = glob.glob(pattern) + return matches[0] if matches else None + + +def extract_features_for_video(extractor, video_path, target_fps=100, + batch_size=32, sample_fps=2): + """Extract features from a video file. + + Args: + extractor: feature extractor + video_path: path to video file + target_fps: target frame rate to align with sensor data (100Hz) + batch_size: batch size for feature extraction + sample_fps: extract features at this rate (e.g., 2 = every 0.5s) + Features are then interpolated to target_fps. + Returns: + features: numpy array (T_target, feat_dim) aligned to target_fps + """ + cap = cv2.VideoCapture(video_path) + video_fps = cap.get(cv2.CAP_PROP_FPS) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + duration = total_frames / video_fps + + # Sample frames at sample_fps + sample_interval = int(video_fps / sample_fps) + sample_indices = list(range(0, total_frames, sample_interval)) + + print(f" Video: {total_frames} frames @ {video_fps:.1f}fps = {duration:.1f}s") + print(f" Sampling {len(sample_indices)} frames @ {sample_fps}fps") + + # Extract features in batches + all_features = [] + batch_frames = [] + batch_indices = [] + + for idx in sample_indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ret, frame = cap.read() + if not ret: + break + batch_frames.append(frame) + batch_indices.append(idx) + + if len(batch_frames) >= batch_size: + feats = extractor.extract_batch(batch_frames) + all_features.append(feats) + batch_frames = [] + if len(all_features) % 10 == 0: + print(f" Processed {len(all_features) * batch_size} frames...") + + if batch_frames: + feats = extractor.extract_batch(batch_frames) + all_features.append(feats) + + cap.release() + + if not all_features: + return None + + features = np.concatenate(all_features, axis=0) # (N_samples, feat_dim) + sample_times = np.array(batch_indices[:features.shape[0]]) / video_fps # seconds + + # Interpolate to target_fps (100Hz) + target_times = np.arange(0, duration, 1.0 / target_fps) + n_target = len(target_times) + + # Linear interpolation per feature dimension + from scipy.interpolate import interp1d + if len(sample_times) < 2: + # Not enough samples, repeat + interpolated = np.tile(features[0], (n_target, 1)) + else: + interp_func = interp1d( + sample_times, features, axis=0, + kind='linear', fill_value='extrapolate' + ) + interpolated = interp_func(target_times).astype(np.float32) + + print(f" Output: {interpolated.shape} @ {target_fps}Hz") + return interpolated + + +def main(): + parser = argparse.ArgumentParser(description='Extract video features') + parser.add_argument('--sample_fps', type=int, default=2, + help='Sample rate for feature extraction (default: 2fps)') + parser.add_argument('--batch_size', type=int, default=16, + help='Batch size for feature extraction') + parser.add_argument('--device', type=str, default='cuda', + help='Device (cuda or cpu)') + args = parser.parse_args() + + device = args.device if torch.cuda.is_available() and args.device == 'cuda' else 'cpu' + print(f"Device: {device}") + + print("Loading ViT-B/16 feature extractor...") + extractor = CLIPFeatureExtractor(device=device) + print(f"Feature dim: {extractor.feat_dim}") + + # Process all volunteers and scenarios + processed = 0 + skipped = 0 + + for vol_dir in sorted(glob.glob(f"{DATASET_DIR}/v*")): + vol = os.path.basename(vol_dir) + for scenario_dir in sorted(glob.glob(f"{vol_dir}/s*")): + scenario = os.path.basename(scenario_dir) + output_path = os.path.join(scenario_dir, "video_features_100hz.npy") + + # Skip if already extracted + if os.path.exists(output_path): + print(f"[{vol}/{scenario}] Already exists, skipping") + skipped += 1 + continue + + # Find video + video_path = find_scene_video(scenario_dir, vol, scenario) + if video_path is None: + print(f"[{vol}/{scenario}] No Scene Camera video found, skipping") + skipped += 1 + continue + + print(f"\n[{vol}/{scenario}]") + print(f" Video: {os.path.basename(video_path)}") + + features = extract_features_for_video( + extractor, video_path, + batch_size=args.batch_size, + sample_fps=args.sample_fps, + ) + + if features is not None: + np.save(output_path, features) + print(f" Saved: {output_path} ({features.shape})") + processed += 1 + else: + print(f" FAILED: Could not extract features") + + print(f"\n{'='*60}") + print(f"Done! Processed: {processed}, Skipped: {skipped}") + print(f"Feature files: {DATASET_DIR}/*/*/video_features_100hz.npy") + + +if __name__ == '__main__': + main() diff --git a/experiments/analysis/extract_videomae_features.py b/experiments/analysis/extract_videomae_features.py new file mode 100644 index 0000000000000000000000000000000000000000..061143f60f3459cb839de879e8994629d36ec3c8 --- /dev/null +++ b/experiments/analysis/extract_videomae_features.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +""" +Extract video features using VideoMAE (pretrained on Kinetics-400). +Process 16-frame video clips to capture temporal dynamics. + +Output: per-frame feature vectors aligned to 100Hz sensor data. +""" + +import os +import sys +import json +import glob +import argparse +import numpy as np +import cv2 +import torch + +DATASET_DIR = "${PULSE_ROOT}/dataset" +MODEL_NAME = "${PULSE_ROOT}/models/videomae-base-kinetics" + + +class VideoMAEFeatureExtractor: + """Extract features using VideoMAE-Base (16-frame clips). Multi-GPU enabled.""" + + def __init__(self, device='cpu'): + from transformers import VideoMAEModel, VideoMAEImageProcessor + import torch.nn as nn + self.device = device + self.processor = VideoMAEImageProcessor.from_pretrained(MODEL_NAME) + model = VideoMAEModel.from_pretrained(MODEL_NAME).to(device) + model.eval() + # Wrap with DataParallel if multiple GPUs available + if torch.cuda.is_available() and torch.cuda.device_count() > 1: + self.n_gpus = torch.cuda.device_count() + print(f" Using DataParallel across {self.n_gpus} GPUs") + self.model = nn.DataParallel(model) + self.num_frames = model.config.num_frames + self.feat_dim = model.config.hidden_size + else: + self.n_gpus = 1 + self.model = model + self.num_frames = model.config.num_frames + self.feat_dim = model.config.hidden_size + + @torch.no_grad() + def extract_clip(self, frames): + """Extract feature from a single 16-frame clip. + + Args: + frames: list of 16 RGB numpy arrays (H, W, 3) + Returns: + feature: numpy array (feat_dim,) - mean-pooled patch tokens + """ + # Pad/truncate to exactly num_frames + if len(frames) < self.num_frames: + frames = frames + [frames[-1]] * (self.num_frames - len(frames)) + elif len(frames) > self.num_frames: + # uniform sampling + indices = np.linspace(0, len(frames) - 1, self.num_frames, dtype=int) + frames = [frames[i] for i in indices] + + inputs = self.processor(frames, return_tensors="pt") + pixel_values = inputs["pixel_values"].to(self.device) + outputs = self.model(pixel_values) + # Average pool over all patch tokens + feature = outputs.last_hidden_state.mean(dim=1).squeeze(0) # (768,) + return feature.cpu().numpy() + + @torch.no_grad() + def extract_clip_batch(self, clips): + """Extract features from a batch of clips. + + Args: + clips: list of clips, each is a list of 16 RGB frames + Returns: + features: numpy array (B, feat_dim) + """ + # Process each clip + all_pixel_values = [] + for frames in clips: + if len(frames) < self.num_frames: + frames = frames + [frames[-1]] * (self.num_frames - len(frames)) + elif len(frames) > self.num_frames: + indices = np.linspace(0, len(frames) - 1, self.num_frames, dtype=int) + frames = [frames[i] for i in indices] + inputs = self.processor(frames, return_tensors="pt") + all_pixel_values.append(inputs["pixel_values"]) + + batch = torch.cat(all_pixel_values, dim=0).to(self.device) + outputs = self.model(batch) + features = outputs.last_hidden_state.mean(dim=1) # (B, 768) + return features.cpu().numpy() + + +def find_scene_video(scenario_dir, vol, scenario): + pattern = os.path.join(scenario_dir, f"trimmed_{vol}{scenario}*Scene Cam.mp4") + matches = glob.glob(pattern) + return matches[0] if matches else None + + +def extract_features_for_video(extractor, video_path, target_fps=100, + clip_stride_sec=0.5, batch_size=4): + """Extract VideoMAE features from a video. + + Strategy (fast): + - Sequentially decode video ONCE, downsample to 8fps and store frames in RAM + - Build clips by indexing into the in-memory frame array (no random seeks) + """ + import time + t0 = time.time() + cap = cv2.VideoCapture(video_path) + video_fps = cap.get(cv2.CAP_PROP_FPS) + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + duration = total_frames / video_fps + + # Read all frames sequentially, downsample to ~16fps (every video_fps/16 frame) + decode_fps = 16 # we sample frames at this rate from the video + decode_stride = max(1, int(round(video_fps / decode_fps))) + print(f" Video: {total_frames} frames @ {video_fps:.1f}fps = {duration:.1f}s") + print(f" Decoding sequentially with stride {decode_stride} (~{video_fps/decode_stride:.1f}fps)...") + + # Pre-resize to model input size during decoding to save memory + # VideoMAE expects 224x224 + target_size = 224 + + decoded_frames = [] # list of (H, W, 3) uint8 RGB arrays + decoded_times = [] # corresponding timestamps in seconds + frame_idx = 0 + while True: + ret, frame = cap.read() + if not ret: + break + if frame_idx % decode_stride == 0: + # Resize early to save memory + resized = cv2.resize(frame, (target_size, target_size), interpolation=cv2.INTER_AREA) + rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) + decoded_frames.append(rgb) + decoded_times.append(frame_idx / video_fps) + frame_idx += 1 + cap.release() + + decoded_frames = np.array(decoded_frames) # (N, 224, 224, 3) + decoded_times = np.array(decoded_times) + decode_time = time.time() - t0 + print(f" Decoded {len(decoded_frames)} frames in {decode_time:.1f}s") + + # Build clips: each clip = 16 frames spanning ~1 second + # Sample 16 consecutive frames from in-memory array + frames_per_clip = 16 + n_decoded = len(decoded_frames) + if n_decoded < 4: + return None + + # Each clip occupies 16 frames at ~16fps = 1 second + clip_centers_sec = np.arange(0.5, duration - 0.5, clip_stride_sec) + n_clips = len(clip_centers_sec) + print(f" Building {n_clips} clips (stride={clip_stride_sec}s, {frames_per_clip} frames each)") + + all_features = [] + clip_times = [] + batch_clips = [] + batch_times = [] + + t1 = time.time() + for center_sec in clip_centers_sec: + # Find decoded frames within ±0.5s window + center_idx = np.searchsorted(decoded_times, center_sec) + half = frames_per_clip // 2 + start = max(0, center_idx - half) + end = min(n_decoded, start + frames_per_clip) + start = max(0, end - frames_per_clip) + + if end - start < 4: + continue + + clip = list(decoded_frames[start:end]) + # Pad if needed + if len(clip) < frames_per_clip: + clip = clip + [clip[-1]] * (frames_per_clip - len(clip)) + + batch_clips.append(clip) + batch_times.append(center_sec) + + if len(batch_clips) >= batch_size: + feats = extractor.extract_clip_batch(batch_clips) + all_features.append(feats) + clip_times.extend(batch_times) + batch_clips = [] + batch_times = [] + + if batch_clips: + feats = extractor.extract_clip_batch(batch_clips) + all_features.append(feats) + clip_times.extend(batch_times) + inference_time = time.time() - t1 + print(f" Inference time: {inference_time:.1f}s ({len(clip_times)} clips)") + + if not all_features: + return None + + features = np.concatenate(all_features, axis=0) # (N_clips, 768) + clip_times = np.array(clip_times[:features.shape[0]]) + + # Interpolate to target_fps (100Hz) + target_times = np.arange(0, duration, 1.0 / target_fps) + n_target = len(target_times) + + from scipy.interpolate import interp1d + if len(clip_times) < 2: + interpolated = np.tile(features[0], (n_target, 1)) + else: + interp_func = interp1d( + clip_times, features, axis=0, + kind='linear', fill_value='extrapolate' + ) + interpolated = interp_func(target_times).astype(np.float32) + + print(f" Output: {interpolated.shape} @ {target_fps}Hz") + return interpolated + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--clip_stride', type=float, default=0.5, + help='Clip extraction stride in seconds (default: 0.5)') + parser.add_argument('--batch_size', type=int, default=4) + parser.add_argument('--device', type=str, default='cuda') + parser.add_argument('--output_name', type=str, default='video_features_videomae_100hz.npy') + args = parser.parse_args() + + device = args.device if torch.cuda.is_available() and args.device == 'cuda' else 'cpu' + print(f"Device: {device}") + + print(f"Loading VideoMAE from {MODEL_NAME}...") + extractor = VideoMAEFeatureExtractor(device=device) + print(f"Feature dim: {extractor.feat_dim}, num frames per clip: {extractor.num_frames}") + + processed = 0 + skipped = 0 + + for vol_dir in sorted(glob.glob(f"{DATASET_DIR}/v*")): + vol = os.path.basename(vol_dir) + for scenario_dir in sorted(glob.glob(f"{vol_dir}/s*")): + scenario = os.path.basename(scenario_dir) + output_path = os.path.join(scenario_dir, args.output_name) + + if os.path.exists(output_path): + print(f"[{vol}/{scenario}] exists, skip") + skipped += 1 + continue + + video_path = find_scene_video(scenario_dir, vol, scenario) + if video_path is None: + print(f"[{vol}/{scenario}] no video, skip") + skipped += 1 + continue + + print(f"\n[{vol}/{scenario}]") + features = extract_features_for_video( + extractor, video_path, + clip_stride_sec=args.clip_stride, + batch_size=args.batch_size, + ) + + if features is not None: + np.save(output_path, features) + print(f" Saved: {output_path} ({features.shape})") + processed += 1 + else: + print(f" FAILED") + + print(f"\nDone! Processed: {processed}, Skipped: {skipped}") + + +if __name__ == '__main__': + main() diff --git a/experiments/analysis/gen_val_comparison.py b/experiments/analysis/gen_val_comparison.py new file mode 100644 index 0000000000000000000000000000000000000000..f72cf05eeac7bbd011fbe3abb6b005e3a07174dd --- /dev/null +++ b/experiments/analysis/gen_val_comparison.py @@ -0,0 +1,74 @@ +import os, sys, json, torch +sys.path.insert(0, '${PULSE_ROOT}') +os.environ['HF_HUB_OFFLINE'] = '1' +os.environ['TRANSFORMERS_OFFLINE'] = '1' + +from tasks.train_pred import ( + TextPredictionDataset, SensorToTextModel, apply_lora, set_seed +) +from data.dataset import TRAIN_VOLS, VAL_VOLS, TEST_VOLS + +set_seed(42) +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# Load tokenizer & LLM +from transformers import AutoTokenizer, AutoModelForCausalLM +llm_path = '${PULSE_ROOT}/models/qwen2.5-0.5b' +tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True, local_files_only=True) +if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + +llm = AutoModelForCausalLM.from_pretrained( + llm_path, trust_remote_code=True, torch_dtype=torch.float32, local_files_only=True +).to(device) +llm.config.pad_token_id = tokenizer.pad_token_id +for p in llm.parameters(): + p.requires_grad = False +lora_params = apply_lora(llm, r=8, alpha=16) + +modalities = ['mocap', 'emg', 'imu'] + +# Build datasets +train_ds = TextPredictionDataset(TRAIN_VOLS, modalities, tokenizer, window_sec=15.0, downsample=5) +stats = train_ds.get_stats() +val_ds = TextPredictionDataset(VAL_VOLS, modalities, tokenizer, window_sec=15.0, downsample=5, stats=stats) +test_ds = TextPredictionDataset(TEST_VOLS, modalities, tokenizer, window_sec=15.0, downsample=5, stats=stats) + +# Build model & load weights +model = SensorToTextModel(train_ds.feat_dim, llm, tokenizer, n_sensor_tokens=8, d_model=64) +model.to(device) + +ckpt_path = '${PULSE_ROOT}/results/pred_llm2/pred_llm_mocap-emg-imu/model_best.pt' +sd = torch.load(ckpt_path, weights_only=True, map_location=device) +model.load_state_dict(sd, strict=False) +model.eval() + +out_path = '${PULSE_ROOT}/docs/pred_llm2_val_comparison.txt' + +from torch.utils.data import DataLoader + +with open(out_path, 'w') as f: + for split_name, ds in [('Validation', val_ds), ('Test', test_ds)]: + loader = DataLoader(ds, batch_size=8, shuffle=False) + f.write(f"{'='*70}\n") + f.write(f"{split_name} Set — mocap,emg,imu (best charF1=0.0324)\n") + f.write(f"Samples: {len(ds)}\n") + f.write(f"{'='*70}\n\n") + + idx = 0 + for batch in loader: + sensor = batch['sensor'].to(device) + preds = model.generate_text(sensor, tokenizer, max_new_tokens=20) + refs = [ds.texts[idx + i] for i in range(len(preds))] + for p, r in zip(preds, refs): + match = "OK" if p.strip() == r.strip() else "XX" + f.write(f"[{match}] #{idx+1}\n") + f.write(f" Pred: {p.strip()}\n") + f.write(f" Ref: {r.strip()}\n\n") + idx += 1 + + # Stats + f.write(f"\n--- {split_name} Summary ---\n") + f.write(f"Total: {idx}\n\n") + +print(f"Written to {out_path}") diff --git a/experiments/analysis/generate_action_labels.py b/experiments/analysis/generate_action_labels.py new file mode 100644 index 0000000000000000000000000000000000000000..c841b2d3c09e5a1ca526cea0a9e1fcaffe5ae0cd --- /dev/null +++ b/experiments/analysis/generate_action_labels.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +Generate action labels by clustering task descriptions using text embeddings. +No manual rules — uses sentence-transformers + K-Means clustering. +""" + +import os +import json +import glob +import argparse +import numpy as np +from collections import Counter +from sklearn.cluster import KMeans +from sklearn.metrics import silhouette_score + +ANNOTATION_DIR = "${PULSE_ROOT}" + + +def collect_tasks(): + """Collect all task descriptions from all annotation files.""" + tasks = [] + for path in sorted(glob.glob(os.path.join(ANNOTATION_DIR, 'v*/s*.json'))): + with open(path) as f: + data = json.load(f) + for seg in data.get('segments', []): + tasks.append(seg['task']) + return tasks + + +def embed_texts(texts): + """Encode texts using sentence-transformers (multilingual model).""" + try: + from sentence_transformers import SentenceTransformer + model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2') + embeddings = model.encode(texts, show_progress_bar=True, batch_size=128) + print(f"Encoded {len(texts)} texts with sentence-transformers, dim={embeddings.shape[1]}") + return embeddings + except Exception as e: + print(f"sentence-transformers failed ({e}), falling back to TF-IDF") + from sklearn.feature_extraction.text import TfidfVectorizer + vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=3000) + X = vec.fit_transform(texts).toarray() + print(f"Encoded {len(texts)} texts with TF-IDF char n-grams, dim={X.shape[1]}") + return X + + +def cluster_tasks(tasks, k_range=(10, 30)): + unique_tasks = sorted(set(tasks)) + print(f"Total segments: {len(tasks)}, Unique task texts: {len(unique_tasks)}") + + X = embed_texts(unique_tasks) + + # Find optimal K via silhouette score + best_k, best_score = k_range[0], -1 + scores = {} + for k in range(k_range[0], k_range[1] + 1): + km = KMeans(n_clusters=k, random_state=42, n_init=10) + labels = km.fit_predict(X) + score = silhouette_score(X, labels, sample_size=min(2000, len(unique_tasks))) + scores[k] = score + if score > best_score: + best_score = score + best_k = k + print(f" K={k}: silhouette={score:.4f}" + (" *" if k == best_k else "")) + + print(f"\nBest K={best_k} (silhouette={best_score:.4f})") + + # Final clustering + km = KMeans(n_clusters=best_k, random_state=42, n_init=10) + labels = km.fit_predict(X) + + task_to_cluster = {task: int(labels[i]) for i, task in enumerate(unique_tasks)} + + # Representative task per cluster (closest to centroid) + cluster_representatives = {} + cluster_members = {} + for cid in range(best_k): + member_idx = [i for i, l in enumerate(labels) if l == cid] + members = [unique_tasks[i] for i in member_idx] + cluster_members[cid] = members + centroid = km.cluster_centers_[cid] + dists = np.linalg.norm(X[member_idx] - centroid, axis=1) + closest = member_idx[np.argmin(dists)] + cluster_representatives[cid] = unique_tasks[closest] + + return task_to_cluster, cluster_representatives, cluster_members, best_k, scores + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--output_dir', type=str, + default='${PULSE_ROOT}/results/pred') + parser.add_argument('--k_min', type=int, default=10) + parser.add_argument('--k_max', type=int, default=30) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + tasks = collect_tasks() + task_to_cluster, representatives, members, K, scores = cluster_tasks( + tasks, k_range=(args.k_min, args.k_max) + ) + + # Print summary + segment_counts = Counter(task_to_cluster[t] for t in tasks) + print(f"\n{'='*60}") + print(f"Clusters (K={K}):") + for cid in range(K): + rep = representatives[cid] + n_unique = len(members[cid]) + n_segs = segment_counts.get(cid, 0) + examples = [m for m in members[cid] if m != rep][:3] + print(f"\n [{cid:2d}] ({n_segs:4d} segs, {n_unique:3d} unique) \"{rep}\"") + for ex in examples: + print(f" - {ex}") + + # Save + output = { + 'num_classes': K, + 'task_to_cluster': task_to_cluster, + 'cluster_representatives': {str(k): v for k, v in representatives.items()}, + 'cluster_sizes_unique': {str(k): len(v) for k, v in members.items()}, + 'cluster_sizes_segments': {str(k): v for k, v in segment_counts.items()}, + 'silhouette_scores': {str(k): v for k, v in scores.items()}, + } + out_path = os.path.join(args.output_dir, 'action_labels.json') + with open(out_path, 'w') as f: + json.dump(output, f, indent=2, ensure_ascii=False) + print(f"\nSaved to {out_path}") + + +if __name__ == '__main__': + main() diff --git a/experiments/analysis/generate_coarse_annotations.py b/experiments/analysis/generate_coarse_annotations.py new file mode 100644 index 0000000000000000000000000000000000000000..c6a4559b14852533186c6e77bf99b4216c7d390d --- /dev/null +++ b/experiments/analysis/generate_coarse_annotations.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +""" +Generate coarse-grained annotations by merging consecutive fine-grained segments +into composite actions (8-15s duration) using LLM. + +Input: annotations_v2/ (fine-grained, ~2-3s segments, 11 classes) +Output: annotations_coarse/ (coarse-grained, ~8-15s segments, ~6 classes) + +Does NOT modify annotations_v2/. +""" + +import os +import json +import re +import time +import glob +import urllib.request +from collections import Counter + +INPUT_DIR = "${PULSE_ROOT}/annotations_v2" +OUTPUT_DIR = "${PULSE_ROOT}/annotations_coarse" + +API_URL = "https://api.chatanywhere.tech/v1/chat/completions" +API_KEYS = [ + "sk-MN5n1uEETyaky96fLJdHqZobXF1f7KmOrZHzwD3lt585asFQ", + "sk-YnYrtPdAXwlE12hRpi6dYqlE1RRVR3LDVBka6wKaefU4iQRY", + "sk-jOZtodDv6OxUOMu3NuJ8lzffjwBlshn9OHY5KSmqmPTtc9qs", + "sk-qAaKTKYIRF24btu1oQWgubWG4UdA92bILNtzOkHNEPAcCxdB", + "sk-MgCBBonblMrCFnSXd6fJZaBLTCfCJ5FjYZfSe2e46bgmyktk", + "sk-79e30kYRgduuf2fSU0Lsc814YjNkClXXzQqIbx0iLS40IOEH", + "sk-h9Tej4tW6AQC6fT0njfzrPKXEk6fBwpiSvvQd0aJAhw4UwLz", + "sk-k2QNHt5wAH26Fw8hZuPWuVXw8Psd1jX09qusiA6PdBj5Vzuu", + "sk-w7EkTblciNI44cwosHXi0PGZNUf1hnJmpzOQ85va9VPdAKbz", + "sk-Dexs5ZF7OjFCq7CZW45wJ8EKoGtIswv6rsLUMzUXXkWBDBBJ", +] + +SCENE_DESCRIPTIONS = { + "s1": "办公桌面整理与工作准备", + "s2": "快递打包发送", + "s3": "厨房调料整理", + "s4": "清理餐后桌面", + "s5": "餐前桌面布置", + "s6": "商务旅行行李箱打包", + "s7": "冲泡咖啡/饮品", + "s8": "晾衣架整理与衣物收纳", +} + +COARSE_CATEGORIES = """粗粒度动作类别(共6类): + +1. Manipulate - 操作物体(抓取、调整、放置某个物体的完整过程,包含拿起→操作→放下的组合) +2. CleanOrganize - 清洁/整理(擦桌子、理线、整理桌面、叠衣服等持续性整理活动) +3. Transfer - 搬运/传递(将物体从一个位置搬到另一个位置的过程) +4. Assemble - 组装/连接/包装(封箱、贴胶带、盖盖子、插电源、拧瓶盖等需要精细对准的操作) +5. FoodPrep - 食物/饮品准备(倒水、倒调料、搅拌、冲泡等与食物饮品相关的操作) +6. Idle - 空闲/过渡(无明确操作的间隔) +""" + +current_key_idx = 0 +call_count = 0 + + +def call_llm(prompt, max_tokens=1500, retries=3): + global current_key_idx, call_count + for attempt in range(retries * len(API_KEYS)): + key = API_KEYS[current_key_idx] + try: + data = json.dumps({ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": prompt}], + "max_tokens": max_tokens, + "temperature": 0.1, + }).encode() + req = urllib.request.Request( + API_URL, data=data, + headers={"Content-Type": "application/json", "Authorization": f"Bearer {key}"} + ) + resp = urllib.request.urlopen(req, timeout=30) + result = json.loads(resp.read()) + call_count += 1 + return result["choices"][0]["message"]["content"] + except Exception as e: + err = str(e) + if any(k in err for k in ["429", "quota", "limit", "402", "403"]): + current_key_idx = (current_key_idx + 1) % len(API_KEYS) + else: + time.sleep(0.5) + current_key_idx = (current_key_idx + 1) % len(API_KEYS) + return None + + +def parse_ts(ts_str): + """Parse 'MM:SS' to seconds.""" + m = re.match(r'(\d+):(\d+)', ts_str.strip()) + if m: + return int(m.group(1)) * 60 + int(m.group(2)) + return 0 + + +def format_ts(sec): + """Format seconds to 'MM:SS'.""" + return f"{sec//60:02d}:{sec%60:02d}" + + +def merge_segments_with_llm(segments, scene_id): + """Use LLM to merge fine-grained segments into coarse composite actions.""" + scene_desc = SCENE_DESCRIPTIONS.get(scene_id, "日常活动") + + # Build segment list + seg_lines = [] + for i, seg in enumerate(segments): + label = seg.get("action_label", "Idle") + seg_lines.append(f"{i+1}. [{seg['timestamp']}] {label}: {seg['task']}") + seg_text = "\n".join(seg_lines) + + prompt = f"""你是一个动作标注专家。以下是一段"{scene_desc}"录制中的细粒度动作序列(每个2-3秒)。 +请将相关的连续动作合并为粗粒度复合动作,每个复合动作持续5-15秒。 + +合并规则: +- 围绕同一个物体的连续操作合并为一个(如"抓取杯子→调整→放下"合并为一个Manipulate) +- 连续的整理/清洁动作合并 +- 合并后的时间范围 = 第一个子动作的开始时间 到 最后一个子动作的结束时间 +- 如果中间有短暂Idle(≤3秒),可以包含进去 +- 每个复合动作必须从6个类别中选一个 + +{COARSE_CATEGORIES} + +细粒度动作序列: +{seg_text} + +请严格按以下JSON格式返回,不要添加任何额外文字: +[{{"timestamp": "MM:SS-MM:SS", "coarse_action": "类别名", "description": "简要描述这段复合动作", "fine_segments": [子动作编号列表]}}]""" + + response = call_llm(prompt, max_tokens=2000) + if response is None: + return None + + try: + match = re.search(r'\[.*\]', response, re.DOTALL) + if match: + results = json.loads(match.group()) + valid = [] + for r in results: + if all(k in r for k in ["timestamp", "coarse_action", "description"]): + # Validate category + if r["coarse_action"] in {"Manipulate", "CleanOrganize", "Transfer", + "Assemble", "FoodPrep", "Idle"}: + valid.append(r) + return valid + except (json.JSONDecodeError, KeyError) as e: + print(f" Parse error: {e}") + return None + + +def process_file(input_path, vol, scenario): + """Process one annotation file.""" + data = json.load(open(input_path)) + segments = data["segments"] + + if not segments: + return {"fine_segments": segments, "coarse_segments": []}, 0 + + print(f" Merging {len(segments)} fine segments...") + coarse = merge_segments_with_llm(segments, scenario) + + if coarse is None: + # Fallback: simple time-based merging without LLM + print(f" LLM failed, using fallback merge") + coarse = fallback_merge(segments) + + result = { + "fine_segments": segments, + "coarse_segments": coarse, + } + return result, len(coarse) + + +def fallback_merge(segments): + """Simple rule-based merging as fallback.""" + if not segments: + return [] + + coarse = [] + group = [segments[0]] + + for seg in segments[1:]: + # Parse timestamps + prev_ts = group[-1]["timestamp"] + curr_ts = seg["timestamp"] + m1 = re.match(r'(\d+:\d+)\s*-\s*(\d+:\d+)', prev_ts) + m2 = re.match(r'(\d+:\d+)\s*-\s*(\d+:\d+)', curr_ts) + if not m1 or not m2: + group.append(seg) + continue + + prev_end = parse_ts(m1.group(2)) + curr_start = parse_ts(m2.group(1)) + gap = curr_start - prev_end + + # Merge if gap ≤ 3s and group duration < 15s + group_start = parse_ts(re.match(r'(\d+:\d+)', group[0]["timestamp"]).group(1)) + curr_end = parse_ts(m2.group(2)) + group_duration = curr_end - group_start + + if gap <= 3 and group_duration <= 15: + group.append(seg) + else: + # Emit current group + coarse.append(_emit_group(group)) + group = [seg] + + if group: + coarse.append(_emit_group(group)) + + return coarse + + +def _emit_group(group): + """Create a coarse segment from a group of fine segments.""" + m_start = re.match(r'(\d+:\d+)', group[0]["timestamp"]) + m_end = re.match(r'\d+:\d+\s*-\s*(\d+:\d+)', group[-1]["timestamp"]) + start = m_start.group(1) if m_start else "00:00" + end = m_end.group(1) if m_end else "00:00" + + labels = [seg.get("action_label", "Idle") for seg in group] + label_counts = Counter(labels) + dominant = label_counts.most_common(1)[0][0] + + # Map fine label to coarse + label_map = { + "Grasp": "Manipulate", "Place": "Manipulate", "Arrange": "CleanOrganize", + "Wipe": "CleanOrganize", "Fold": "CleanOrganize", "Transport": "Transfer", + "OpenClose": "Assemble", "TearCut": "Assemble", + "Pour": "FoodPrep", "Stir": "FoodPrep", "Idle": "Idle", + } + coarse_label = label_map.get(dominant, "Manipulate") + + tasks = [seg["task"] for seg in group] + desc = tasks[0] if len(tasks) == 1 else f"{tasks[0]}...{tasks[-1]}" + + return { + "timestamp": f"{start}-{end}", + "coarse_action": coarse_label, + "description": desc[:80], + "fine_segments": list(range(1, len(group) + 1)), + } + + +def main(): + os.makedirs(OUTPUT_DIR, exist_ok=True) + + total_fine = 0 + total_coarse = 0 + total_files = 0 + coarse_labels = Counter() + + for vol_dir in sorted(glob.glob(f"{INPUT_DIR}/v*")): + vol = os.path.basename(vol_dir) + out_dir = os.path.join(OUTPUT_DIR, vol) + os.makedirs(out_dir, exist_ok=True) + + for ann_file in sorted(glob.glob(f"{vol_dir}/s*.json")): + scenario = os.path.basename(ann_file).replace(".json", "") + print(f"[{vol}/{scenario}]", flush=True) + + result, n_coarse = process_file(ann_file, vol, scenario) + + out_path = os.path.join(out_dir, f"{scenario}.json") + with open(out_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + n_fine = len(result["fine_segments"]) + total_fine += n_fine + total_coarse += n_coarse + total_files += 1 + + for seg in result["coarse_segments"]: + coarse_labels[seg["coarse_action"]] += 1 + + print(f" {n_fine} fine → {n_coarse} coarse segments", flush=True) + + print(f"\n{'='*60}") + print(f"Total: {total_files} files") + print(f" Fine segments: {total_fine}") + print(f" Coarse segments: {total_coarse}") + print(f" Compression: {total_fine/max(total_coarse,1):.1f}x") + print(f" API calls: {call_count}") + + print(f"\n Coarse label distribution:") + for label, count in coarse_labels.most_common(): + print(f" {label:<20} {count:>5} ({count/max(total_coarse,1)*100:.1f}%)") + + print(f"\n Output: {OUTPUT_DIR}") + + +if __name__ == "__main__": + main() diff --git a/experiments/analysis/grasp_phase_analysis.py b/experiments/analysis/grasp_phase_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..e19ddd3c09a29343fe5f8bd67c855b1f0e6d3348 --- /dev/null +++ b/experiments/analysis/grasp_phase_analysis.py @@ -0,0 +1,442 @@ +#!/usr/bin/env python3 +""" +Grasp Phase Timing Analysis — Flagship visualization for the paper. + +Classic neuroscience finding: + Eye gaze → EMG activation → Hand motion → Pressure contact + +This script: +1. Detects grasp events (pressure onset: 0 → >5g) +2. Looks back in time to find: + - EMG envelope activation onset + - Hand velocity peak (from MoCap) + - Eye gaze fixation (if available) +3. Computes statistics over all grasp events +4. Produces the canonical "grasp phase" timing figure +""" + +import os +import glob +import json +import numpy as np +import pandas as pd +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +from scipy import signal as scisig +from collections import defaultdict + +DATASET_DIR = "${PULSE_ROOT}/dataset" +OUTPUT_DIR = "${PULSE_ROOT}/results/grasp_phase" +SAMPLING_RATE = 100 # Hz +PRESSURE_THRESHOLD = 5.0 # grams +CONTEXT_WINDOW_SEC = 2.0 # look back 2s before contact +CONTEXT_FRAMES = int(CONTEXT_WINDOW_SEC * SAMPLING_RATE) + +os.makedirs(OUTPUT_DIR, exist_ok=True) + + +def load_pressure(scenario_dir): + """Load pressure data and return (T, 2) array: [right_total, left_total].""" + f = os.path.join(scenario_dir, "aligned_pressure_100hz.csv") + if not os.path.exists(f): + return None + df = pd.read_csv(f, low_memory=False) + r_cols = [c for c in df.columns if c.startswith('R') and c.endswith('(g)')] + l_cols = [c for c in df.columns if c.startswith('L') and c.endswith('(g)')] + if not r_cols or not l_cols: + return None + r = df[r_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values.sum(axis=1) + l = df[l_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values.sum(axis=1) + return np.stack([r, l], axis=1) # (T, 2) + + +def load_emg(scenario_dir): + """Load EMG data: (T, 8) array.""" + f = os.path.join(scenario_dir, "aligned_emg_100hz.csv") + if not os.path.exists(f): + return None + df = pd.read_csv(f, low_memory=False) + # Find EMG channel columns (e.g., EMG1...EMG8 or channels) + numeric_cols = df.select_dtypes(include=[np.number]).columns + numeric_cols = [c for c in numeric_cols if c not in ('Frame', 'Time', 'time', 'UTC')] + if len(numeric_cols) < 4: + return None + arr = df[numeric_cols].values.astype(np.float32) + arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0) + return arr + + +def load_mocap(scenario_dir, vol, scenario): + """Load MoCap hand position, return (T, 3) right hand velocity magnitude, (T, 3) left hand.""" + f = os.path.join(scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv") + if not os.path.exists(f): + return None, None + df = pd.read_csv(f, sep='\t', low_memory=False) + # Find right/left hand position columns + # Try common naming patterns + r_cols = [c for c in df.columns if 'RightHand' in c and (c.endswith('_X') or c.endswith('_Y') or c.endswith('_Z'))] + l_cols = [c for c in df.columns if 'LeftHand' in c and (c.endswith('_X') or c.endswith('_Y') or c.endswith('_Z'))] + if not r_cols or not l_cols: + # Try alternative naming + r_cols = [c for c in df.columns if 'R_Hand' in c or 'RHand' in c][:3] + l_cols = [c for c in df.columns if 'L_Hand' in c or 'LHand' in c][:3] + if not r_cols or not l_cols: + return None, None + + r_pos = df[r_cols[:3]].apply(pd.to_numeric, errors='coerce').fillna(0).values + l_pos = df[l_cols[:3]].apply(pd.to_numeric, errors='coerce').fillna(0).values + return r_pos, l_pos + + +def compute_emg_envelope(emg, window_size=20): + """Rectify and low-pass filter EMG to get envelope.""" + # Rectify + rectified = np.abs(emg - np.mean(emg, axis=0)) + # Moving average + kernel = np.ones(window_size) / window_size + envelope = np.zeros_like(rectified) + for ch in range(rectified.shape[1]): + envelope[:, ch] = np.convolve(rectified[:, ch], kernel, mode='same') + # Sum across channels and normalize + total = envelope.sum(axis=1) + if total.max() > total.min(): + total = (total - total.min()) / (total.max() - total.min() + 1e-8) + return total # (T,) + + +def compute_velocity(position, window=3): + """Compute velocity magnitude from 3D position.""" + vel = np.zeros_like(position) + vel[1:] = position[1:] - position[:-1] + vel_mag = np.linalg.norm(vel, axis=1) + # Smooth + kernel = np.ones(window) / window + vel_mag = np.convolve(vel_mag, kernel, mode='same') + return vel_mag # (T,) + + +def detect_grasp_events(pressure_1d, threshold=5.0, min_duration=10, min_gap=50): + """Detect pressure onset events (0 → >threshold). + + Returns list of onset frame indices. + """ + above = pressure_1d > threshold + # Hysteresis smoothing: require persistence + onsets = [] + last_state = False + stable_counter = 0 + for i, a in enumerate(above): + if a and not last_state: + # Candidate onset, check persistence + if i + min_duration < len(above) and np.mean(above[i:i+min_duration]) > 0.7: + if not onsets or i - onsets[-1] > min_gap: + onsets.append(i) + last_state = True + elif not a and last_state: + # Check if really released + if i + 5 < len(above) and np.mean(above[i:i+5]) < 0.3: + last_state = False + return onsets + + +def find_signal_onset(signal, ref_idx, window_frames, threshold_ratio=0.3): + """Find the LATEST pre-contact onset of signal activation. + + Strategy: walk backward from ref_idx. Look for the last sample that's + still 'active' (> baseline + threshold_ratio * (peak-baseline)). + The first 'inactive' sample going backward marks the onset. + + Returns: frame index of onset relative to ref_idx (negative = before). + """ + start = max(0, ref_idx - window_frames) + segment = signal[start:ref_idx + 1] # pre-contact window + if len(segment) < 10: + return None + + # Baseline: lower quartile of the pre-contact window (robust to activation) + # Only use the earliest 30% as baseline estimate + early_part = segment[:max(10, int(len(segment) * 0.3))] + baseline = np.percentile(early_part, 25) + + # Peak of the pre-contact activation + peak = np.max(segment) + if peak - baseline < 1e-4: + return None + + threshold = baseline + (peak - baseline) * threshold_ratio + + # Walk BACKWARD from ref_idx: find the last consecutive 'active' region + # ending at ref_idx, then the onset is where that region starts + above = segment > threshold + if not above[-1]: + # Not active at contact - use threshold crossing pattern + # Find the rising edge closest to ref_idx + rising = np.where(np.diff(above.astype(int)) == 1)[0] + if len(rising) == 0: + return None + onset_local = rising[-1] + 1 # first active frame + else: + # Active at contact - walk back to find onset + onset_local = len(segment) - 1 + while onset_local > 0 and above[onset_local - 1]: + onset_local -= 1 + + onset_global = start + onset_local + return onset_global - ref_idx # negative = before contact + + +def is_clean_grasp(emg_env, velocity, pressure_trace, onset, look_back=150, rest_window=50): + """Check if this is a CLEAN grasp starting from rest. + + Requires: EMG and velocity are both low in the REST window (onset-150 ~ onset-100). + """ + rest_start = onset - look_back + rest_end = onset - (look_back - rest_window) + if rest_start < 0: + return False + + # Quiescent rest period: EMG and velocity both low + emg_rest = emg_env[rest_start:rest_end].mean() + vel_rest = velocity[rest_start:rest_end].mean() + + # Compare to the entire pre-contact activation + emg_pre = emg_env[rest_end:onset] + vel_pre = velocity[rest_end:onset] + + if len(emg_pre) < 10: + return False + + # The rest period should be significantly lower than the activation period + emg_active = np.percentile(emg_pre, 75) + vel_active = np.percentile(vel_pre, 75) + + emg_increase = emg_active - emg_rest + vel_increase = vel_active - vel_rest + + # Require meaningful increase from rest to activation + emg_dyn = emg_env.max() - emg_env.min() + vel_dyn = velocity.max() - velocity.min() + + if emg_dyn < 1e-6 or vel_dyn < 1e-6: + return False + + return (emg_increase / emg_dyn > 0.1) and (vel_increase / vel_dyn > 0.1) + + +def analyze_one_scenario(vol, scenario): + """Analyze clean grasp events starting from rest.""" + scenario_dir = os.path.join(DATASET_DIR, vol, scenario) + + pressure = load_pressure(scenario_dir) + emg = load_emg(scenario_dir) + mocap_r, mocap_l = load_mocap(scenario_dir, vol, scenario) + + if pressure is None or emg is None or mocap_r is None: + return None + + min_len = min(pressure.shape[0], emg.shape[0], mocap_r.shape[0]) + pressure = pressure[:min_len] + emg = emg[:min_len] + mocap_r = mocap_r[:min_len] + mocap_l = mocap_l[:min_len] + + emg_env = compute_emg_envelope(emg) + vel_r = compute_velocity(mocap_r) + vel_l = compute_velocity(mocap_l) + + events = [] + + for hand_name, hand_pressure, hand_vel in [ + ('right', pressure[:, 0], vel_r), + ('left', pressure[:, 1], vel_l), + ]: + onsets = detect_grasp_events(hand_pressure, threshold=PRESSURE_THRESHOLD) + for onset in onsets: + if onset < CONTEXT_FRAMES: + continue + + # Filter: only clean grasps starting from rest + if not is_clean_grasp(emg_env, hand_vel, hand_pressure, onset): + continue + + # Find EMG onset: look for sustained activation rising from rest + emg_delay = find_signal_onset(emg_env, onset, CONTEXT_FRAMES, threshold_ratio=0.3) + motion_delay = find_signal_onset(hand_vel, onset, CONTEXT_FRAMES, threshold_ratio=0.3) + if emg_delay is None or motion_delay is None: + continue + + # Sanity check: delays should be within [-1500, 0] ms + if emg_delay * 10 < -1500 or emg_delay * 10 > 0: + continue + if motion_delay * 10 < -1500 or motion_delay * 10 > 0: + continue + + start = onset - CONTEXT_FRAMES + end = onset + 50 + events.append({ + 'pressure': hand_pressure[start:end], + 'emg': emg_env[start:end], + 'velocity': hand_vel[start:end], + 'hand': hand_name, + 'onset_idx': onset, + 'emg_delay_ms': emg_delay * 10, + 'motion_delay_ms': motion_delay * 10, + }) + + return events + + +def main(): + all_events = [] + stats = defaultdict(int) + + for vol_dir in sorted(glob.glob(f"{DATASET_DIR}/v*")): + vol = os.path.basename(vol_dir) + for scenario_dir in sorted(glob.glob(f"{vol_dir}/s*")): + scenario = os.path.basename(scenario_dir) + meta_path = os.path.join(scenario_dir, 'alignment_metadata.json') + if not os.path.exists(meta_path): + continue + meta = json.load(open(meta_path)) + # Need all 3 modalities + if not {'pressure', 'emg', 'mocap'}.issubset(set(meta['modalities'])): + stats['no_modality'] += 1 + continue + + events = analyze_one_scenario(vol, scenario) + if events is None: + stats['load_error'] += 1 + continue + all_events.extend(events) + stats['scenarios'] += 1 + stats['events'] += len(events) + print(f"[{vol}/{scenario}] {len(events)} grasp events", flush=True) + + print(f"\n=== Summary ===") + print(f"Scenarios processed: {stats['scenarios']}") + print(f"Total grasp events: {stats['events']}") + print(f"Loading errors: {stats['load_error']}") + print(f"Missing modality: {stats['no_modality']}") + + if not all_events: + print("No events found!") + return + + # Extract delays + emg_delays = np.array([e['emg_delay_ms'] for e in all_events]) + motion_delays = np.array([e['motion_delay_ms'] for e in all_events]) + + print(f"\n=== Timing Statistics (ms, negative = before contact) ===") + print(f"EMG onset delay: mean={emg_delays.mean():.1f} median={np.median(emg_delays):.1f} std={emg_delays.std():.1f}") + print(f"Motion peak delay: mean={motion_delays.mean():.1f} median={np.median(motion_delays):.1f} std={motion_delays.std():.1f}") + + # Save statistics + stats_dict = { + 'n_events': len(all_events), + 'emg_delay_ms': {'mean': float(emg_delays.mean()), 'median': float(np.median(emg_delays)), + 'std': float(emg_delays.std()), 'p25': float(np.percentile(emg_delays, 25)), + 'p75': float(np.percentile(emg_delays, 75))}, + 'motion_delay_ms': {'mean': float(motion_delays.mean()), 'median': float(np.median(motion_delays)), + 'std': float(motion_delays.std()), 'p25': float(np.percentile(motion_delays, 25)), + 'p75': float(np.percentile(motion_delays, 75))}, + } + with open(os.path.join(OUTPUT_DIR, 'timing_stats.json'), 'w') as f: + json.dump(stats_dict, f, indent=2) + + # ============ Figure 1: Aligned signal traces (averaged) ============ + # Filter to events that have sufficient context + valid = [e for e in all_events if len(e['pressure']) == CONTEXT_FRAMES + 50] + print(f"\nEvents with full context: {len(valid)} / {len(all_events)}") + + if len(valid) < 10: + print("Not enough events for plotting") + return + + # Normalize signals (per-event max) + def normalize(sigs): + sigs = np.stack(sigs) + # Normalize each to [0, 1] + sigs = sigs - sigs.min(axis=1, keepdims=True) + maxs = sigs.max(axis=1, keepdims=True) + sigs = sigs / (maxs + 1e-8) + return sigs + + pressure_stack = normalize([e['pressure'] for e in valid]) + emg_stack = normalize([e['emg'] for e in valid]) + vel_stack = normalize([e['velocity'] for e in valid]) + + time_axis = np.arange(-CONTEXT_FRAMES, 50) * 10 # ms + + fig, ax = plt.subplots(figsize=(9, 5)) + + # Plot mean ± std + for sigs, color, label in [ + (emg_stack, '#E74C3C', 'EMG envelope'), + (vel_stack, '#3498DB', 'Hand velocity'), + (pressure_stack, '#27AE60', 'Pressure (contact)'), + ]: + mean = sigs.mean(axis=0) + std = sigs.std(axis=0) + ax.plot(time_axis, mean, color=color, linewidth=2.5, label=label) + ax.fill_between(time_axis, mean - std * 0.5, mean + std * 0.5, color=color, alpha=0.15) + + ax.axvline(0, color='black', linestyle='--', linewidth=1.2, alpha=0.7, label='Contact onset') + ax.axvline(emg_delays.mean(), color='#E74C3C', linestyle=':', alpha=0.8) + ax.axvline(motion_delays.mean(), color='#3498DB', linestyle=':', alpha=0.8) + + # Annotations + ax.annotate(f'EMG\n{emg_delays.mean():.0f}ms', + xy=(emg_delays.mean(), 0.85), ha='center', fontsize=10, color='#C0392B', + bbox=dict(boxstyle="round,pad=0.3", fc='#FADBD8', ec='#E74C3C', alpha=0.9)) + ax.annotate(f'Motion\n{motion_delays.mean():.0f}ms', + xy=(motion_delays.mean(), 0.65), ha='center', fontsize=10, color='#1F618D', + bbox=dict(boxstyle="round,pad=0.3", fc='#D6EAF8', ec='#3498DB', alpha=0.9)) + + ax.set_xlabel('Time relative to contact onset (ms)', fontsize=12) + ax.set_ylabel('Normalized amplitude', fontsize=12) + ax.set_title(f'Grasp Phase Timing ({len(valid)} events, {stats["scenarios"]} recordings)', + fontsize=13, fontweight='bold') + ax.set_xlim(-CONTEXT_WINDOW_SEC * 1000, 500) + ax.legend(loc='upper left', frameon=True, fontsize=10) + ax.grid(True, alpha=0.3) + ax.set_ylim(-0.05, 1.1) + + plt.tight_layout() + fig_path = os.path.join(OUTPUT_DIR, 'grasp_phase_timing.png') + plt.savefig(fig_path, dpi=150, bbox_inches='tight') + plt.savefig(fig_path.replace('.png', '.pdf'), bbox_inches='tight') + print(f"Saved figure: {fig_path}") + + # ============ Figure 2: Delay distributions ============ + fig, axes = plt.subplots(1, 2, figsize=(11, 4)) + + axes[0].hist(emg_delays, bins=30, color='#E74C3C', alpha=0.7, edgecolor='black') + axes[0].axvline(emg_delays.mean(), color='black', linestyle='--', linewidth=2, label=f'Mean: {emg_delays.mean():.0f}ms') + axes[0].axvline(np.median(emg_delays), color='grey', linestyle=':', linewidth=2, label=f'Median: {np.median(emg_delays):.0f}ms') + axes[0].set_xlabel('EMG onset - Contact onset (ms)', fontsize=11) + axes[0].set_ylabel('Count', fontsize=11) + axes[0].set_title('EMG → Contact Delay', fontsize=12, fontweight='bold') + axes[0].legend(fontsize=10) + axes[0].grid(True, alpha=0.3) + + axes[1].hist(motion_delays, bins=30, color='#3498DB', alpha=0.7, edgecolor='black') + axes[1].axvline(motion_delays.mean(), color='black', linestyle='--', linewidth=2, label=f'Mean: {motion_delays.mean():.0f}ms') + axes[1].axvline(np.median(motion_delays), color='grey', linestyle=':', linewidth=2, label=f'Median: {np.median(motion_delays):.0f}ms') + axes[1].set_xlabel('Motion onset - Contact onset (ms)', fontsize=11) + axes[1].set_ylabel('Count', fontsize=11) + axes[1].set_title('Hand Motion → Contact Delay', fontsize=12, fontweight='bold') + axes[1].legend(fontsize=10) + axes[1].grid(True, alpha=0.3) + + plt.tight_layout() + fig2_path = os.path.join(OUTPUT_DIR, 'delay_distributions.png') + plt.savefig(fig2_path, dpi=150, bbox_inches='tight') + plt.savefig(fig2_path.replace('.png', '.pdf'), bbox_inches='tight') + print(f"Saved figure: {fig2_path}") + + print(f"\nAll outputs saved to: {OUTPUT_DIR}") + + +if __name__ == '__main__': + main() diff --git a/experiments/analysis/modality_viz.py b/experiments/analysis/modality_viz.py new file mode 100644 index 0000000000000000000000000000000000000000..89646957d87a7507f8962c420b6f0b78b756675e --- /dev/null +++ b/experiments/analysis/modality_viz.py @@ -0,0 +1,145 @@ +"""Visualize mocap skeleton frames, IMU waveforms, EMG waveforms.""" +import os, numpy as np, pandas as pd, matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D # noqa + +REC = "${PULSE_ROOT}/dataset/v1/s1" +OUT = "${PULSE_ROOT}/paper/figures" +os.makedirs(OUT, exist_ok=True) + +# ---- Skeleton bone definition (marker pairs) ---- +BONES = [ + # torso + ("HeadTop","HeadFront"),("HeadL","HeadR"),("HeadFront","SpineTop"), + ("SpineTop","Chest"),("Chest","WaistLFront"),("Chest","WaistRFront"), + ("WaistLFront","WaistLBack"),("WaistRFront","WaistRBack"), + ("WaistLBack","BackL"),("WaistRBack","BackR"),("BackL","BackR"), + ("SpineTop","LShoulderTop"),("SpineTop","RShoulderTop"), + ("LShoulderTop","LShoulderBack"),("RShoulderTop","RShoulderBack"), + # left arm + ("LShoulderTop","LArm"),("LArm","LElbowOut"),("LElbowOut","LElbowBack"), + ("LElbowOut","LForearmRoll"),("LForearmRoll","LWristOut"), + ("LWristOut","LWristIn"),("LWristOut","LHandOut"),("LWristIn","LHandIn"), + ("LHandOut","LIndex2"),("LIndex2","LIndexTip"), + ("LHandOut","LMiddle2"),("LMiddle2","LMiddleTip"), + ("LHandIn","LRing2"),("LRing2","LRingTip"), + ("LHandIn","LPinky2"),("LPinky2","LPinkyTip"), + ("LWristIn","LThumb1"),("LThumb1","LThumbTip"), + # right arm + ("RShoulderTop","RArm"),("RArm","RElbowOut"),("RElbowOut","RElbowBack"), + ("RElbowOut","RForearmRoll"),("RForearmRoll","RWristOut"), + ("RWristOut","RWristIn"),("RWristOut","RHandOut"),("RWristIn","RHandIn"), + ("RHandOut","RIndex2"),("RIndex2","RIndexTip"), + ("RHandOut","RMiddle2"),("RMiddle2","RMiddleTip"), + ("RHandIn","RRing2"),("RRing2","RRingTip"), + ("RHandIn","RPinky2"),("RPinky2","RPinkyTip"), + ("RWristIn","RThumb1"),("RThumb1","RThumbTip"), +] + + +def load_mocap(path): + df = pd.read_csv(path) + # Extract x,y,z for each marker ignoring Type cols + markers = {} + for col in df.columns: + if col.startswith("Q_") and col.endswith(" X"): + name = col[2:-2] + xs = df[f"Q_{name} X"].to_numpy() + ys = df[f"Q_{name} Y"].to_numpy() + zs = df[f"Q_{name} Z"].to_numpy() + markers[name] = np.stack([xs, ys, zs], axis=-1) + return df["Time"].to_numpy(), markers + + +def plot_skeletons(): + t, mk = load_mocap(os.path.join(REC, "aligned_mocap_100hz.csv")) + N = len(t) + # pick 4 time frames well spread through the recording with valid data + candidate = np.linspace(int(0.1*N), int(0.9*N), 4).astype(int) + + fig = plt.figure(figsize=(12, 3.2)) + for i, fr in enumerate(candidate): + ax = fig.add_subplot(1, 4, i+1, projection='3d') + # gather all points at this frame + pts = np.array([mk[n][fr] for n in mk]) + pts = pts[~np.isnan(pts).any(axis=1)] + if len(pts) == 0: + continue + # draw bones + for a, b in BONES: + if a in mk and b in mk: + pa, pb = mk[a][fr], mk[b][fr] + if np.isnan(pa).any() or np.isnan(pb).any(): + continue + ax.plot([pa[0], pb[0]], [pa[1], pb[1]], [pa[2], pb[2]], + color='#2266aa', lw=1.2) + ax.scatter(pts[:, 0], pts[:, 1], pts[:, 2], s=4, c='#cc3333', alpha=0.8) + # equal aspect + c = pts.mean(0) + r = np.ptp(pts, axis=0).max() / 2 + ax.set_xlim(c[0]-r, c[0]+r); ax.set_ylim(c[1]-r, c[1]+r); ax.set_zlim(c[2]-r, c[2]+r) + ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([]) + ax.set_title(f"t={t[fr]:.1f}s", fontsize=9) + ax.view_init(elev=12, azim=-75) + fig.suptitle("MoCap skeleton frames (56-marker Qualisys, v1/s1)", fontsize=11) + fig.tight_layout() + out = os.path.join(OUT, "mocap_skeleton.pdf") + fig.savefig(out, bbox_inches='tight'); fig.savefig(out.replace('.pdf', '.png'), dpi=150, bbox_inches='tight') + plt.close(fig) + print("Saved", out) + + +def plot_imu(): + df = pd.read_csv(os.path.join(REC, "aligned_imu_100hz.csv")) + t = df["time"].to_numpy(); t = t - t[0] + # pick 5 body locations (WT0..WT9 order roughly: wrists, forearms, upper arms, shins, thighs, torso) + sites = [("WT0", "Wrist R"), ("WT2", "Forearm R"), + ("WT4", "Upper arm R"), ("WT6", "Shin R"), ("WT9", "Torso")] + fig, axes = plt.subplots(len(sites), 1, figsize=(9, 6), sharex=True) + # crop to 20s window mid-recording + mid = len(t)//2 + sl = slice(max(0, mid-1000), min(len(t), mid+1000)) + for ax, (sid, lbl) in zip(axes, sites): + for comp, col in zip(["x", "y", "z"], ["#d62728", "#2ca02c", "#1f77b4"]): + ax.plot(t[sl], df[f"{sid}_acc_{comp}"].to_numpy()[sl], color=col, lw=0.8, label=f"acc_{comp}") + ax.set_ylabel(lbl, fontsize=9) + ax.grid(alpha=0.3) + axes[0].legend(loc="upper right", ncol=3, fontsize=8) + axes[-1].set_xlabel("Time (s)") + fig.suptitle("IMU 3-axis acceleration across 5 body sites (v1/s1, 20s window)", fontsize=11) + fig.tight_layout() + out = os.path.join(OUT, "imu_waveforms.pdf") + fig.savefig(out, bbox_inches='tight'); fig.savefig(out.replace('.pdf', '.png'), dpi=150, bbox_inches='tight') + plt.close(fig) + print("Saved", out) + + +def plot_emg(): + df = pd.read_csv(os.path.join(REC, "aligned_emg_100hz.csv")) + t = df["time"].to_numpy(); t = t - t[0] + ch = [f"emg_{i}" for i in range(1, 9)] + # 20s window mid-recording + mid = len(t)//2 + sl = slice(max(0, mid-1000), min(len(t), mid+1000)) + fig, axes = plt.subplots(8, 1, figsize=(9, 7), sharex=True) + for ax, c in zip(axes, ch): + sig = df[c].to_numpy()[sl] + ax.plot(t[sl], sig, color="#555", lw=0.5) + # envelope overlay + env = pd.Series(np.abs(sig)).rolling(20, min_periods=1).mean().to_numpy() + ax.plot(t[sl], env, color="#d62728", lw=0.9) + ax.set_ylabel(c, fontsize=8) + ax.grid(alpha=0.3) + axes[-1].set_xlabel("Time (s)") + fig.suptitle("Surface EMG 8-channel raw (grey) with rectified envelope (red), v1/s1, 20s window", + fontsize=11) + fig.tight_layout() + out = os.path.join(OUT, "emg_waveforms.pdf") + fig.savefig(out, bbox_inches='tight'); fig.savefig(out.replace('.pdf', '.png'), dpi=150, bbox_inches='tight') + plt.close(fig) + print("Saved", out) + + +if __name__ == "__main__": + plot_skeletons() + plot_imu() + plot_emg() diff --git a/experiments/analysis/reannotate_actions.py b/experiments/analysis/reannotate_actions.py new file mode 100644 index 0000000000000000000000000000000000000000..d65c8d64a7f3d3adcb85c95c0de7a5742218e3f3 --- /dev/null +++ b/experiments/analysis/reannotate_actions.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +""" +Re-annotate action segments using LLM (GPT-4o-mini). +1. Re-classify existing segments with better accuracy +2. Infer actions in unlabeled gaps based on context (scene, surrounding actions) +3. Output improved annotations with higher coverage +""" + +import os +import sys +import json +import re +import time +import copy +import glob +import urllib.request +from collections import Counter + +ANN_DIR = "${PULSE_ROOT}/annotations_by_scene" +OUTPUT_DIR = "${PULSE_ROOT}/annotations_v2" +DATASET_DIR = "${PULSE_ROOT}/dataset" + +API_URL = "https://api.chatanywhere.tech/v1/chat/completions" +API_KEYS = [ + "sk-MN5n1uEETyaky96fLJdHqZobXF1f7KmOrZHzwD3lt585asFQ", + "sk-YnYrtPdAXwlE12hRpi6dYqlE1RRVR3LDVBka6wKaefU4iQRY", + "sk-jOZtodDv6OxUOMu3NuJ8lzffjwBlshn9OHY5KSmqmPTtc9qs", + "sk-qAaKTKYIRF24btu1oQWgubWG4UdA92bILNtzOkHNEPAcCxdB", + "sk-MgCBBonblMrCFnSXd6fJZaBLTCfCJ5FjYZfSe2e46bgmyktk", + "sk-79e30kYRgduuf2fSU0Lsc814YjNkClXXzQqIbx0iLS40IOEH", + "sk-h9Tej4tW6AQC6fT0njfzrPKXEk6fBwpiSvvQd0aJAhw4UwLz", + "sk-k2QNHt5wAH26Fw8hZuPWuVXw8Psd1jX09qusiA6PdBj5Vzuu", + "sk-w7EkTblciNI44cwosHXi0PGZNUf1hnJmpzOQ85va9VPdAKbz", + "sk-Dexs5ZF7OjFCq7CZW45wJ8EKoGtIswv6rsLUMzUXXkWBDBBJ", +] + +SCENE_DESCRIPTIONS = { + "s1": "办公桌面整理与工作准备(整理文件、电源线、鼠标、笔记本电脑等)", + "s2": "快递打包发送(折叠纸箱、放入物品、封箱、贴标签等)", + "s3": "厨房调料整理(拿取调料瓶、倒调料、拧瓶盖、擦拭等)", + "s4": "清理餐后桌面(收碗碟、擦桌子、整理餐具、倒残渣等)", + "s5": "餐前桌面布置(铺桌布、摆放餐具碗碟、放杯子等)", + "s6": "商务旅行行李箱打包(折叠衣物、放入行李箱、整理物品等)", + "s7": "冲泡咖啡/饮品(取杯子、放咖啡粉/茶包、倒热水、搅拌等)", + "s8": "晾衣架整理与衣物收纳(取衣架、挂衣服、折叠衣物等)", +} + +ACTION_CATEGORIES = """动作类别定义(共11类): + +1. Grasp - 抓取/拿起物体(手从无接触到接触并握住物体) +2. Place - 放置/放下物体(将物体放到某个位置并释放) +3. Pour - 倾倒/注入液体或颗粒(倒水、倒调料、倒咖啡粉等) +4. Wipe - 擦拭/清洁表面(用抹布或手擦桌面、瓶身等) +5. Fold - 折叠/卷起(折衣服、折桌布、折纸箱等) +6. OpenClose - 打开/关闭/旋开/旋紧(开盒子、拧瓶盖、拉拉链、合箱盖等) +7. Stir - 搅拌(搅拌咖啡、搅拌饮品等) +8. TearCut - 撕/剪/粘贴(撕胶带、剪快递单、贴标签等) +9. Arrange - 整理/摆放/调整位置(摆餐具、整理文件、调整物品位置、理线等) +10. Transport - 搬运/移动物体到较远位置(把包裹搬到架子、把碗端到水槽等) +11. Idle - 空闲/过渡/无明确操作(双手无目的性动作、等待、观察等) + +注意: +- 只有真正没有任何手部操作时才标Idle +- "调整姿态"、"检查物体"等属于Arrange +- "插入"、"装入"等属于Place +- "提起并移动"如果距离短属于Grasp,距离远属于Transport +""" + +current_key_idx = 0 +call_count = 0 + + +def call_llm(prompt, max_tokens=1000, retries=3): + """Call LLM API with automatic key rotation.""" + global current_key_idx, call_count + + for attempt in range(retries * len(API_KEYS)): + key = API_KEYS[current_key_idx] + try: + data = json.dumps({ + "model": "gpt-4o-mini", + "messages": [{"role": "user", "content": prompt}], + "max_tokens": max_tokens, + "temperature": 0.1, + }).encode() + req = urllib.request.Request( + API_URL, data=data, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {key}", + } + ) + resp = urllib.request.urlopen(req, timeout=30) + result = json.loads(resp.read()) + call_count += 1 + return result["choices"][0]["message"]["content"] + except Exception as e: + err = str(e) + if "429" in err or "quota" in err or "limit" in err or "402" in err: + # Key exhausted, rotate + print(f" Key {current_key_idx+1} exhausted, rotating...") + current_key_idx = (current_key_idx + 1) % len(API_KEYS) + elif "timeout" in err.lower(): + time.sleep(1) + else: + print(f" API error: {err[:100]}") + current_key_idx = (current_key_idx + 1) % len(API_KEYS) + time.sleep(0.5) + + print(" WARNING: All API keys failed!") + return None + + +def reclassify_segments(segments, scene_id): + """Use LLM to reclassify all segments in a recording.""" + scene_desc = SCENE_DESCRIPTIONS.get(scene_id, "日常活动") + + # Build segment list for prompt + seg_list = [] + for i, seg in enumerate(segments): + seg_list.append(f"{i+1}. [{seg['timestamp']}] {seg['task']}") + seg_text = "\n".join(seg_list) + + prompt = f"""你是一个人体动作标注专家。请为以下每个动作片段分配一个动作类别。 + +场景:{scene_desc} + +{ACTION_CATEGORIES} + +动作片段列表: +{seg_text} + +请严格按以下JSON格式返回,不要添加任何额外文字: +[{{"id": 1, "action": "类别名"}}, {{"id": 2, "action": "类别名"}}, ...] + +每个action必须是以下之一:Grasp, Place, Pour, Wipe, Fold, OpenClose, Stir, TearCut, Arrange, Transport, Idle""" + + response = call_llm(prompt, max_tokens=len(segments) * 40) + if response is None: + return None + + # Parse response + try: + # Extract JSON from response + match = re.search(r'\[.*\]', response, re.DOTALL) + if match: + results = json.loads(match.group()) + return {r["id"]: r["action"] for r in results} + except (json.JSONDecodeError, KeyError) as e: + print(f" Parse error: {e}, response: {response[:200]}") + return None + + +def infer_gap_actions(scene_id, before_seg, after_seg, gap_start, gap_end): + """Use LLM to infer what actions likely happened in an unlabeled gap.""" + scene_desc = SCENE_DESCRIPTIONS.get(scene_id, "日常活动") + gap_duration = gap_end - gap_start + + before_text = f"[{before_seg['timestamp']}] {before_seg['task']}" if before_seg else "(录制开始)" + after_text = f"[{after_seg['timestamp']}] {after_seg['task']}" if after_seg else "(录制结束)" + + prompt = f"""你是一个人体动作标注专家。在一段日常活动录制中,有一段时间没有被标注。请根据场景和前后动作推断这段时间内最可能发生的动作。 + +场景:{scene_desc} +未标注时间段:{gap_start//60:02d}:{gap_start%60:02d} - {gap_end//60:02d}:{gap_end%60:02d}(共{gap_duration}秒) +前一个标注动作:{before_text} +后一个标注动作:{after_text} + +{ACTION_CATEGORIES} + +请推断这段时间内可能发生的动作序列。每个动作段落2-4秒,时间用MM:SS格式。 +如果确实是空闲等待,标注为Idle。 + +严格按以下JSON格式返回,不要添加任何额外文字: +[{{"timestamp": "MM:SS-MM:SS", "task": "动作描述", "action": "类别名"}}] + +每个action必须是以下之一:Grasp, Place, Pour, Wipe, Fold, OpenClose, Stir, TearCut, Arrange, Transport, Idle""" + + response = call_llm(prompt, max_tokens=500) + if response is None: + return [] + + try: + match = re.search(r'\[.*\]', response, re.DOTALL) + if match: + results = json.loads(match.group()) + # Validate timestamps + valid = [] + for r in results: + if "timestamp" in r and "action" in r and "task" in r: + ts_match = re.match(r'(\d+):(\d+)\s*-\s*(\d+):(\d+)', r["timestamp"]) + if ts_match: + s = int(ts_match.group(1))*60 + int(ts_match.group(2)) + e = int(ts_match.group(3))*60 + int(ts_match.group(4)) + if gap_start <= s < e <= gap_end: + valid.append(r) + return valid + except (json.JSONDecodeError, KeyError) as e: + print(f" Parse error: {e}") + return [] + + +def get_recording_duration(vol, scenario): + """Get total recording duration in seconds.""" + meta_path = os.path.join(DATASET_DIR, vol, scenario, "alignment_metadata.json") + if os.path.exists(meta_path): + meta = json.load(open(meta_path)) + if "aligned_length_sec" in meta: + return meta["aligned_length_sec"] + if "aligned_length_frames" in meta: + return meta["aligned_length_frames"] / 100.0 + return None + + +def process_one_file(ann_path, vol, scenario): + """Process one annotation file: reclassify + fill gaps.""" + data = json.load(open(ann_path)) + segments = data["segments"] + + if not segments: + return data, {"reclassified": 0, "gaps_filled": 0} + + # Step 1: Reclassify existing segments + print(f" Reclassifying {len(segments)} segments...") + classifications = reclassify_segments(segments, scenario) + + if classifications: + for i, seg in enumerate(segments): + action = classifications.get(i + 1) + if action and action in {"Grasp", "Place", "Pour", "Wipe", "Fold", + "OpenClose", "Stir", "TearCut", "Arrange", + "Transport", "Idle"}: + seg["action_label"] = action + else: + seg["action_label"] = "Idle" + else: + # Fallback: keep without label + for seg in segments: + seg["action_label"] = "Idle" + + reclassified = sum(1 for s in segments if "action_label" in s) + + # Step 2: Find and fill gaps ≥ 3 seconds + # Parse all timestamps + parsed = [] + for seg in segments: + m = re.match(r'(\d+):(\d+)\s*-\s*(\d+):(\d+)', seg["timestamp"]) + if m: + s = int(m.group(1))*60 + int(m.group(2)) + e = int(m.group(3))*60 + int(m.group(4)) + parsed.append((s, e, seg)) + parsed.sort() + + total_dur = get_recording_duration(vol, scenario) + + new_segments = [] + gaps_filled = 0 + + for i in range(len(parsed)): + new_segments.append(parsed[i][2]) + + # Check gap after this segment + if i < len(parsed) - 1: + gap_start = parsed[i][1] + gap_end = parsed[i + 1][0] + elif total_dur: + gap_start = parsed[i][1] + gap_end = int(total_dur) + else: + continue + + gap_duration = gap_end - gap_start + if gap_duration >= 3: + before_seg = parsed[i][2] + after_seg = parsed[i + 1][2] if i < len(parsed) - 1 else None + + print(f" Filling gap {gap_start}s-{gap_end}s ({gap_duration}s)...") + inferred = infer_gap_actions(scenario, before_seg, after_seg, gap_start, gap_end) + + for inf in inferred: + new_seg = { + "timestamp": inf["timestamp"], + "task": inf["task"], + "action_label": inf["action"], + "source": "llm_inferred", + "left_hand": "", + "right_hand": "", + "bimanual_interaction": "", + "objects": [], + } + new_segments.append(new_seg) + gaps_filled += 1 + + # Also check gap at the beginning + if parsed and parsed[0][0] >= 3: + print(f" Filling start gap 0s-{parsed[0][0]}s...") + inferred = infer_gap_actions(scenario, None, parsed[0][2], 0, parsed[0][0]) + for inf in inferred: + new_seg = { + "timestamp": inf["timestamp"], + "task": inf["task"], + "action_label": inf["action"], + "source": "llm_inferred", + "left_hand": "", + "right_hand": "", + "bimanual_interaction": "", + "objects": [], + } + new_segments.insert(0, new_seg) + gaps_filled += 1 + + # Sort by timestamp + def sort_key(seg): + m = re.match(r'(\d+):(\d+)', seg["timestamp"]) + return int(m.group(1))*60 + int(m.group(2)) if m else 0 + new_segments.sort(key=sort_key) + + result = copy.deepcopy(data) + result["segments"] = new_segments + + return result, {"reclassified": reclassified, "gaps_filled": gaps_filled} + + +def main(): + os.makedirs(OUTPUT_DIR, exist_ok=True) + + total_reclassified = 0 + total_gaps_filled = 0 + total_files = 0 + + for vol_dir in sorted(glob.glob(f"{ANN_DIR}/v*")): + vol = os.path.basename(vol_dir) + out_vol_dir = os.path.join(OUTPUT_DIR, vol) + os.makedirs(out_vol_dir, exist_ok=True) + + for ann_file in sorted(glob.glob(f"{vol_dir}/s*.json")): + scenario = os.path.basename(ann_file).replace(".json", "") + print(f"\n[{vol}/{scenario}]", flush=True) + + result, stats = process_one_file(ann_file, vol, scenario) + + # Save + out_path = os.path.join(out_vol_dir, f"{scenario}.json") + with open(out_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + total_reclassified += stats["reclassified"] + total_gaps_filled += stats["gaps_filled"] + total_files += 1 + + print(f" Done: {stats['reclassified']} reclassified, {stats['gaps_filled']} gaps filled", + flush=True) + + print(f"\n{'='*60}") + print(f"Total: {total_files} files processed") + print(f" Reclassified: {total_reclassified} segments") + print(f" Gap-filled: {total_gaps_filled} new segments") + print(f" API calls: {call_count}") + print(f" Output: {OUTPUT_DIR}") + + +if __name__ == "__main__": + main() diff --git a/experiments/data/__init__.py b/experiments/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/experiments/data/__pycache__/dataset.cpython-312.pyc b/experiments/data/__pycache__/dataset.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0da15532a86a4dc18d317c8e15c4b6a6c1ed280 Binary files /dev/null and b/experiments/data/__pycache__/dataset.cpython-312.pyc differ diff --git a/experiments/data/dataset.py b/experiments/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9f9ecca26b979772e1f456c0ab3843632756d8db --- /dev/null +++ b/experiments/data/dataset.py @@ -0,0 +1,332 @@ +""" +Multimodal scene dataset for Experiment 1: Activity Recognition. +Loads aligned 100Hz multi-modal data, supports modality selection, +subject-independent splits, and variable-length sequence handling. +""" + +import os +import json +import numpy as np +import pandas as pd +import torch +from torch.utils.data import Dataset, DataLoader +from torch.nn.utils.rnn import pad_sequence + +DATASET_DIR = "${PULSE_ROOT}/dataset" + +MODALITY_FILES = { + 'mocap': None, # Special: uses aligned_{vol}{scene}_s_Q.tsv (skeleton data) + 'emg': 'aligned_emg_100hz.csv', + 'eyetrack': 'aligned_eyetrack_100hz.csv', + 'imu': 'aligned_imu_100hz.csv', + 'pressure': 'aligned_pressure_100hz.csv', + 'video': 'video_features_100hz.npy', # ViT-B/16 (ImageNet) + 'videomae': 'video_features_videomae_100hz.npy', # VideoMAE (Kinetics-400) +} + + +def get_modality_filepath(scenario_dir, modality, vol=None, scenario=None): + """Return the file path for a given modality. + + Mocap uses a special naming pattern: aligned_{vol}{scene}_s_Q.tsv + All other modalities use MODALITY_FILES directly. + """ + if modality == 'mocap': + if vol is None or scenario is None: + raise ValueError("vol and scenario required for mocap modality") + return os.path.join(scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv") + return os.path.join(scenario_dir, MODALITY_FILES[modality]) + +SKIP_COLS = {'Frame', 'Time', 'time', 'UTC'} +SKIP_COL_SUFFIXES = (' Type',) + +# Eyetrack exports sometimes include volunteer-specific marker/ICA columns. +# Benchmark inputs use the fixed 24 core gaze columns below; recordings missing +# any core column are skipped instead of truncating the full dataset. +EYETRACK_SKIP_PATTERNS = ('Index Of Cognitive Activity', 'Marker Coordinates', 'Markers_') +EYETRACK_CORE_COLS = [ + 'Dikablis Glasses 3_Eye Data_Original_Pupil X', + 'Dikablis Glasses 3_Eye Data_Original_Pupil Y', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil X', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil Y', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil Area', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil Height', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil Width', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Fixations_Fixations', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Fixations_Fixations Duration', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Saccades_Saccades', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Saccades_Saccades Duration', + 'Dikablis Glasses 3_Eye Data_Original_Left Eye_Saccades_Saccades Angle', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil X', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil Y', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil Area', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil Height', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil Width', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Fixations_Fixations', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Fixations_Fixations Duration', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Saccades_Saccades', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Saccades_Saccades Duration', + 'Dikablis Glasses 3_Eye Data_Original_Right Eye_Saccades_Saccades Angle', + 'Dikablis Glasses 3_Field Data_Scene Cam_Original_Gaze_Gaze X', + 'Dikablis Glasses 3_Field Data_Scene Cam_Original_Gaze_Gaze Y', +] +EYETRACK_EXCLUDED_RECORDINGS = {('v1', 's1'), ('v14', 's8')} + +SCENE_LABELS = {f's{i}': i - 1 for i in range(1, 9)} +NUM_CLASSES = 8 + +TRAIN_VOLS = ['v1', 'v2', 'v11', 'v12', 'v13', 'v15', 'v16', 'v17', 'v19', 'v20', 'v21', 'v22', 'v23', 'v24'] +VAL_VOLS = [] # No separate val set; use train for early stopping or cross-val +TEST_VOLS = ['v25', 'v26', 'v27', 'v3'] + + +def _preprocess_mocap_skeleton(arr, feat_cols): + """Convert absolute skeleton coords to hip-relative positions + velocity. + + Input: (T, F) with absolute XYZ + quaternions + Output: (T, F + N_pos) where N_pos = number of XYZ position features + [hip-relative features, XYZ velocity] + """ + col_to_idx = {c: i for i, c in enumerate(feat_cols)} + + # Find hip position for subtraction + hip_x_idx = col_to_idx.get('Hips_X') + hip_y_idx = col_to_idx.get('Hips_Y') + hip_z_idx = col_to_idx.get('Hips_Z') + if hip_x_idx is None: + return arr # No hip joint found, skip preprocessing + + # Identify all position columns (_X, _Y, _Z) + x_indices = [i for i, c in enumerate(feat_cols) if c.endswith('_X')] + y_indices = [i for i, c in enumerate(feat_cols) if c.endswith('_Y')] + z_indices = [i for i, c in enumerate(feat_cols) if c.endswith('_Z')] + all_pos_indices = sorted(x_indices + y_indices + z_indices) + + # 1. Make XYZ positions hip-relative + arr_rel = arr.copy() + hip_xyz = arr[:, [hip_x_idx, hip_y_idx, hip_z_idx]] # (T, 3) + for idx in x_indices: + arr_rel[:, idx] -= hip_xyz[:, 0] + for idx in y_indices: + arr_rel[:, idx] -= hip_xyz[:, 1] + for idx in z_indices: + arr_rel[:, idx] -= hip_xyz[:, 2] + + # 2. Compute velocity of position features only + pos_data = arr_rel[:, all_pos_indices] # (T, N_pos) + velocity = np.zeros_like(pos_data) + velocity[1:] = pos_data[1:] - pos_data[:-1] + + # 3. Concatenate: [hip-relative features (pos+quat), position velocity] + return np.concatenate([arr_rel, velocity], axis=1) + + +def load_modality_array(filepath, modality): + """Load a modality CSV/TSV/NPY and return numpy_array. + Returns None if data is corrupted (extreme values or mostly zeros).""" + # Video features stored as .npy + if filepath.endswith('.npy'): + if not os.path.exists(filepath): + return None + arr = np.load(filepath).astype(np.float32) + arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0) + return arr + # Mocap uses TSV with tab separator + sep = '\t' if filepath.endswith('.tsv') else ',' + df = pd.read_csv(filepath, sep=sep, low_memory=False) + df.columns = [str(c).strip() for c in df.columns] + if modality == 'eyetrack': + parts = os.path.normpath(filepath).split(os.sep) + if len(parts) >= 3 and (parts[-3], parts[-2]) in EYETRACK_EXCLUDED_RECORDINGS: + return None + feat_cols = [c for c in df.columns + if c not in SKIP_COLS + and not any(c.endswith(s) for s in SKIP_COL_SUFFIXES)] + if modality == 'eyetrack': + feat_cols = [c for c in EYETRACK_CORE_COLS if c in feat_cols] + if len(feat_cols) != len(EYETRACK_CORE_COLS): + return None + sub = df[feat_cols] + # Coerce non-numeric columns + obj_cols = sub.select_dtypes(include=['object']).columns + if len(obj_cols) > 0: + sub = sub.copy() + sub[obj_cols] = sub[obj_cols].apply(pd.to_numeric, errors='coerce') + arr = sub.values.astype(np.float64) + arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0) + # Quality check: reject samples with extreme values (corrupted data) + max_abs = np.max(np.abs(arr)) + if max_abs > 1e6: + return None # Corrupted + # Quality check: reject samples that are mostly zeros (sensor dropout). + # Pressure and EMG are legitimately zero for long periods (rest, no grip) + # so we only apply the strict near-total-loss check to the modalities + # where a flat-zero stream is a clear dropout signal. + if modality not in ("pressure", "emg"): + zero_ratio = np.mean(arr == 0.0) + if zero_ratio > 0.9: + return None # Near-total data loss + # Mocap skeleton: convert to hip-relative + velocity + if modality == 'mocap' and filepath.endswith('.tsv'): + arr = _preprocess_mocap_skeleton(arr, feat_cols) + arr = arr.astype(np.float32) + return arr + + +class MultimodalSceneDataset(Dataset): + """Dataset for scene-level classification from multimodal time series.""" + + def __init__(self, volunteers, modalities, downsample=5, stats=None): + self.modalities = modalities + self.downsample = downsample + self.data = [] + self.labels = [] + self.sample_info = [] + self._modality_dims = {} + + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir) or scenario not in SCENE_LABELS: + continue + meta_path = os.path.join(scenario_dir, 'alignment_metadata.json') + if not os.path.exists(meta_path): + continue + with open(meta_path) as f: + meta = json.load(f) + available = set(meta['modalities']) + if not set(modalities).issubset(available): + continue + + parts = [] + skip = False + for mod in modalities: + if mod == 'mocap': + # Skeleton data: aligned_{vol}{scene}_s_Q.tsv + tsv_name = f"aligned_{vol}{scenario}_s_Q.tsv" + filepath = os.path.join(scenario_dir, tsv_name) + else: + filepath = os.path.join(scenario_dir, MODALITY_FILES[mod]) + if not os.path.exists(filepath): + skip = True + break + arr = load_modality_array(filepath, mod) + if arr is None: + print(f" SKIP {vol}/{scenario} {mod}: corrupted data", flush=True) + skip = True + break + # Validate dimension consistency + if mod in self._modality_dims and arr.shape[1] != self._modality_dims[mod]: + print(f" WARNING: {vol}/{scenario} {mod} dim {arr.shape[1]} " + f"!= expected {self._modality_dims[mod]}, padding/truncating", + flush=True) + expected = self._modality_dims[mod] + if arr.shape[1] < expected: + pad = np.zeros((arr.shape[0], expected - arr.shape[1]), dtype=np.float32) + arr = np.concatenate([arr, pad], axis=1) + else: + arr = arr[:, :expected] + if mod not in self._modality_dims: + self._modality_dims[mod] = arr.shape[1] + parts.append(arr) + + if skip: + continue + + min_len = min(p.shape[0] for p in parts) + parts = [p[:min_len] for p in parts] + combined = np.concatenate(parts, axis=1) + combined = combined[::downsample] + + self.data.append(combined) + self.labels.append(SCENE_LABELS[scenario]) + self.sample_info.append(f"{vol}/{scenario}") + + print(f" Loaded {len(self.data)} samples, modality dims: {self._modality_dims}, " + f"total feat dim: {sum(self._modality_dims.values())}", flush=True) + + # Normalization (compute in float64 to avoid overflow) + if stats is not None: + self.mean, self.std = stats + else: + self._compute_stats() + for i in range(len(self.data)): + self.data[i] = ((self.data[i].astype(np.float64) - self.mean) / self.std).astype(np.float32) + self.data[i] = np.nan_to_num(self.data[i], nan=0.0, posinf=0.0, neginf=0.0) + + def _compute_stats(self): + # Use float64 for accumulation to prevent overflow + all_frames = np.concatenate(self.data, axis=0).astype(np.float64) + self.mean = np.mean(all_frames, axis=0, keepdims=True) + self.std = np.std(all_frames, axis=0, keepdims=True) + self.std[self.std < 1e-8] = 1.0 + + def get_stats(self): + return (self.mean, self.std) + + @property + def feat_dim(self): + return sum(self._modality_dims.values()) + + @property + def modality_dims(self): + return dict(self._modality_dims) + + def get_class_weights(self): + counts = np.bincount(self.labels, minlength=NUM_CLASSES).astype(np.float32) + counts[counts == 0] = 1.0 + weights = 1.0 / counts + weights = weights / weights.sum() * NUM_CLASSES + return torch.FloatTensor(weights) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return torch.from_numpy(self.data[idx]), self.labels[idx] + + +def collate_fn(batch): + """Pad variable-length sequences and create masks.""" + sequences, labels = zip(*batch) + lengths = torch.LongTensor([s.shape[0] for s in sequences]) + padded = pad_sequence(sequences, batch_first=True, padding_value=0.0) + max_len = padded.shape[1] + mask = torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1) + labels = torch.LongTensor(labels) + return padded, labels, mask, lengths + + +def get_dataloaders(modalities, batch_size=16, downsample=5, num_workers=0): + """Create train/val/test DataLoaders with proper normalization.""" + print("Loading training data...", flush=True) + train_ds = MultimodalSceneDataset(TRAIN_VOLS, modalities, downsample) + stats = train_ds.get_stats() + + print("Loading validation data...", flush=True) + val_ds = MultimodalSceneDataset(VAL_VOLS, modalities, downsample, stats=stats) + + print("Loading test data...", flush=True) + test_ds = MultimodalSceneDataset(TEST_VOLS, modalities, downsample, stats=stats) + + train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, + collate_fn=collate_fn, num_workers=num_workers, + drop_last=False) + val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, + collate_fn=collate_fn, num_workers=num_workers) + test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, + collate_fn=collate_fn, num_workers=num_workers) + + info = { + 'feat_dim': train_ds.feat_dim, + 'modality_dims': train_ds.modality_dims, + 'num_classes': NUM_CLASSES, + 'train_size': len(train_ds), + 'val_size': len(val_ds), + 'test_size': len(test_ds), + 'class_weights': train_ds.get_class_weights(), + } + return train_loader, val_loader, test_loader, info diff --git a/experiments/data/dataset_forecast.py b/experiments/data/dataset_forecast.py new file mode 100644 index 0000000000000000000000000000000000000000..db9d7b642a1c117d56900e0caa1923d0f954fadd --- /dev/null +++ b/experiments/data/dataset_forecast.py @@ -0,0 +1,319 @@ +"""Frame-level future motor-primitive forecasting dataset. + +Task definition +--------------- +At a sampled anchor time t in a recording: + past = sensor frames over [t - T_obs, t] ← input + future = per-frame verb_fine labels over (t, t + T_fut] ← target + +We use NUM_VERB_FINE (= 17) as a sentinel "idle / no segment" class for +frames not covered by any annotated segment, so every future frame has a +valid label (output cardinality = NUM_VERB_FINE + 1 = 18). + +Anchors are sampled at fixed stride within each recording so the model +sees both intra-segment future (mostly stationary) and across-boundary +future (where the next-action label changes — the interesting cases). +""" +from __future__ import annotations + +import os +import sys +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +import numpy as np +import torch +from torch.utils.data import Dataset + +THIS = Path(__file__).resolve() +sys.path.insert(0, str(THIS.parent)) +sys.path.insert(0, str(THIS.parents[1])) + +try: + from experiments.dataset_seqpred import ( + SAMPLING_RATE_HZ, _load_recording_sensors, _load_annotations, + parse_ts_range, TRAIN_VOLS_V3, TEST_VOLS_V3, + DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR, + ) + from experiments.taxonomy import ( + classify_segment, NUM_VERB_FINE, + ) +except ModuleNotFoundError: + from dataset_seqpred import ( + SAMPLING_RATE_HZ, _load_recording_sensors, _load_annotations, + parse_ts_range, TRAIN_VOLS_V3, TEST_VOLS_V3, + DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR, + ) + from taxonomy import classify_segment, NUM_VERB_FINE + + +IDLE_LABEL = NUM_VERB_FINE # = 17, sentinel for "no segment covers this frame" +NUM_FORECAST_CLASSES = NUM_VERB_FINE + 1 # = 18 + + +class ForecastDataset(Dataset): + """Forecast next T_fut seconds of per-frame verb_fine given past T_obs.""" + + def __init__( + self, + volunteers: Sequence[str], + modalities: Sequence[str], + t_obs_sec: float = 1.5, + t_fut_sec: float = 0.5, + anchor_stride_sec: float = 0.25, + downsample: int = 5, + dataset_dir: Path = DEFAULT_DATASET_DIR, + annot_dir: Path = DEFAULT_ANNOT_DIR, + stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None, + expected_dims: Optional[Dict[str, int]] = None, + contact_only: bool = False, + contact_threshold_g: float = 5.0, + log: bool = True, + ): + super().__init__() + self.modalities = list(modalities) + self.t_obs_sec = float(t_obs_sec) + self.t_fut_sec = float(t_fut_sec) + self.anchor_stride_sec = float(anchor_stride_sec) + self.downsample = int(downsample) + self.sr = SAMPLING_RATE_HZ // self.downsample + self.dataset_dir = Path(dataset_dir) + self.annot_dir = Path(annot_dir) + self.contact_only = bool(contact_only) + self.contact_threshold_g = float(contact_threshold_g) + + # Output time-step counts (after downsample) + self.T_obs = int(round(self.t_obs_sec * self.sr)) + self.T_fut = int(round(self.t_fut_sec * self.sr)) + + self._items: List[dict] = [] + # Pre-seed modality dims if caller (e.g. test set) provides them + self._modality_dims: Dict[str, int] = dict(expected_dims) if expected_dims else {} + + for vol in volunteers: + vol_dir = self.dataset_dir / vol + if not vol_dir.is_dir(): + continue + for scenario_dir in sorted(vol_dir.glob("s*")): + if not scenario_dir.is_dir(): + continue + scene = scenario_dir.name + annot_path = self.annot_dir / vol / f"{scene}.json" + if not annot_path.exists(): + continue + + # Always include pressure for the filter, even if model + # doesn't see it as input. We separate "filter sensors" + # (load_mods) from "model input sensors" (self.modalities). + load_mods = list(dict.fromkeys(list(self.modalities) + ["pressure"])) + try: + sensors_all = _load_recording_sensors( + scenario_dir, vol, scene, load_mods + ) + except Exception: + continue + if sensors_all is None or any(a is None for a in sensors_all.values()): + continue + pressure_full = sensors_all.get("pressure") # (T, 50) + # Subset to model-input modalities for everything downstream + sensors = {m: sensors_all[m] for m in self.modalities} + + # Track modality dim consistency + for m, arr in sensors.items(): + if m in self._modality_dims: + target = self._modality_dims[m] + if arr.shape[1] != target: + if arr.shape[1] < target: + pad = np.zeros((arr.shape[0], target - arr.shape[1]), + dtype=np.float32) + sensors[m] = np.concatenate([arr, pad], axis=1) + else: + sensors[m] = arr[:, :target] + else: + self._modality_dims[m] = arr.shape[1] + + T_avail = min(a.shape[0] for a in sensors.values()) + if T_avail < (self.T_obs + self.T_fut) * self.downsample: + continue + + # Build per-frame verb_fine timeline at full 100 Hz + timeline = np.full(T_avail, IDLE_LABEL, dtype=np.int64) + segs = _load_annotations(annot_path) + for seg in segs: + a = seg.get("action_annotation", {}) + labels = classify_segment(a) + if labels is None: + continue + start_sec, end_sec = parse_ts_range(seg.get("timestamp", "")) + s = int(round(start_sec * SAMPLING_RATE_HZ)) + e = int(round(end_sec * SAMPLING_RATE_HZ)) + s = max(0, s); e = min(T_avail, e) + if e > s: + timeline[s:e] = labels["verb_fine"] + + # Downsample timeline to 20 Hz + timeline_ds = timeline[::self.downsample] + T_ds = len(timeline_ds) + + # Downsample sensors to 20 Hz (kept as full record; + # we'll slice windows below) + sensors_ds = {m: arr[::self.downsample] for m, arr in sensors.items()} + + # Build contact mask at 20 Hz (per-frame): is pressure-sum > thr? + # Pressure is 50 channels; we follow the T2 contact convention + # (sum across all fingertips and threshold at 5 g). + if pressure_full is not None: + pressure_ds = pressure_full[::self.downsample] + contact_ds = pressure_ds.sum(axis=1) > self.contact_threshold_g + else: + contact_ds = np.zeros(T_ds, dtype=bool) + + # Sample anchors at fixed stride (in 20 Hz frames) + stride = max(1, int(round(self.anchor_stride_sec * self.sr))) + first_anchor = self.T_obs + last_anchor = T_ds - self.T_fut + if last_anchor <= first_anchor: + continue + + for anchor in range(first_anchor, last_anchor + 1, stride): + # contact-rich filter: any contact frame in past or future window? + if self.contact_only: + win = contact_ds[max(0, anchor - self.T_obs): + min(T_ds, anchor + self.T_fut)] + if not win.any(): + continue + past_slice = {m: arr[anchor - self.T_obs:anchor] + for m, arr in sensors_ds.items()} + fut_labels = timeline_ds[anchor:anchor + self.T_fut].copy() + # length sanity + if any(w.shape[0] != self.T_obs for w in past_slice.values()): + continue + if fut_labels.shape[0] != self.T_fut: + continue + self._items.append({ + "x": past_slice, # dict[mod] -> (T_obs, F_mod) + "y_seq": fut_labels, # (T_fut,) int in [0..17] + "meta": {"vol": vol, "scene": scene, "anchor_idx": int(anchor)}, + }) + + if not self._items: + raise RuntimeError("ForecastDataset: collected 0 anchors. Check annot_dir / modalities.") + + # Per-modality z-score using training stats + if stats is None: + stats = self._compute_stats() + self._stats = stats + self._apply_stats(stats) + + if log: + print(f"[ForecastDataset] vols={len(volunteers)} " + f"anchors={len(self._items)} " + f"T_obs={self.T_obs} T_fut={self.T_fut} " + f"contact_only={self.contact_only} " + f"modality_dims={self._modality_dims} " + f"sr={self.sr}Hz", flush=True) + + # ----- Stats / normalization ----- + def _compute_stats(self) -> Dict[str, Tuple[np.ndarray, np.ndarray]]: + accs = {m: [] for m in self._modality_dims} + for it in self._items: + for m, w in it["x"].items(): + accs[m].append(w) + out = {} + for m, ws in accs.items(): + cat = np.concatenate(ws, axis=0) + mu = cat.mean(axis=0) + sd = cat.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd) + out[m] = (mu.astype(np.float32), sd.astype(np.float32)) + return out + + def _apply_stats(self, stats): + for it in self._items: + for m, w in it["x"].items(): + if m in stats: + mu, sd = stats[m] + it["x"][m] = ((w - mu) / sd).astype(np.float32) + + # ----- Dataset protocol ----- + def __len__(self): + return len(self._items) + + def __getitem__(self, idx): + it = self._items[idx] + x = {m: torch.from_numpy(np.ascontiguousarray(w)) for m, w in it["x"].items()} + y_seq = torch.from_numpy(np.ascontiguousarray(it["y_seq"])) # (T_fut,) + return x, y_seq, it["meta"] + + @property + def modality_dims(self): + return dict(self._modality_dims) + + def class_freq(self) -> np.ndarray: + c = np.zeros(NUM_FORECAST_CLASSES, dtype=np.int64) + for it in self._items: + for v in it["y_seq"]: + c[int(v)] += 1 + return c + + +def collate_forecast(batch): + """Stack (x_dict, y_seq, meta) -> batched tensors. All samples share T_obs/T_fut.""" + xs, ys, metas = zip(*batch) + B = len(batch) + mods = list(xs[0].keys()) + x_out: Dict[str, torch.Tensor] = {} + for m in mods: + x_out[m] = torch.stack([x[m] for x in xs], dim=0) # (B, T_obs, F_mod) + y_out = torch.stack(ys, dim=0) # (B, T_fut) + return x_out, y_out, list(metas) + + +def build_train_test( + modalities: Sequence[str], + t_obs_sec: float = 1.5, + t_fut_sec: float = 0.5, + anchor_stride_sec: float = 0.25, + downsample: int = 5, + dataset_dir: Path = DEFAULT_DATASET_DIR, + annot_dir: Path = DEFAULT_ANNOT_DIR, + contact_only: bool = False, + contact_threshold_g: float = 5.0, +): + train = ForecastDataset( + TRAIN_VOLS_V3, modalities=modalities, + t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, + anchor_stride_sec=anchor_stride_sec, downsample=downsample, + dataset_dir=dataset_dir, annot_dir=annot_dir, + contact_only=contact_only, contact_threshold_g=contact_threshold_g, + stats=None, log=True, + ) + test = ForecastDataset( + TEST_VOLS_V3, modalities=modalities, + t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, + anchor_stride_sec=anchor_stride_sec, downsample=downsample, + dataset_dir=dataset_dir, annot_dir=annot_dir, + contact_only=contact_only, contact_threshold_g=contact_threshold_g, + stats=train._stats, expected_dims=train._modality_dims, log=True, + ) + return train, test + + +if __name__ == "__main__": + import argparse + ap = argparse.ArgumentParser() + ap.add_argument("--modalities", type=str, default="imu,emg,eyetrack,mocap,pressure") + ap.add_argument("--t_obs", type=float, default=1.5) + ap.add_argument("--t_fut", type=float, default=0.5) + ap.add_argument("--stride", type=float, default=0.25) + args = ap.parse_args() + mods = args.modalities.split(",") + tr, te = build_train_test( + modalities=mods, + t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + anchor_stride_sec=args.stride, + ) + print(f"\nTrain={len(tr)} Test={len(te)} T_obs={tr.T_obs} T_fut={tr.T_fut}") + print(f"Train class freq:\n{tr.class_freq()}") + print(f"Test class freq:\n{te.class_freq()}") + x, y, meta = tr[0] + print(f"Sample: x={ {m: tuple(v.shape) for m,v in x.items()} } y_seq={tuple(y.shape)}") diff --git a/experiments/data/dataset_grasp_state.py b/experiments/data/dataset_grasp_state.py new file mode 100644 index 0000000000000000000000000000000000000000..4030f3771309ac5f1169a49da3a97ff9bbbdb429 --- /dev/null +++ b/experiments/data/dataset_grasp_state.py @@ -0,0 +1,571 @@ +"""Anchor-based binary "is_grasping" classification dataset (T5 v3 / TGSR). + +At each sampled anchor t in a recording: + past = sensor frames over [t - T_obs, t] ← input + label = majority vote of grasp-annotation mask over (t, t+T_fut] ← binary class + +Ground-truth source: annotations_v3 verb segments. A frame is marked +"is_grasp" if it falls inside a segment whose action_name belongs to +GRASP_VERBS (set below). The label is annotation-derived, completely +independent of pressure — so adding/removing pressure as input does +NOT leak the label. + +This is the cleanest test of "does pressure improve recognition of +object-interaction state when human-annotated grasp segments are GT?" +""" +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +import numpy as np +import torch +from torch.utils.data import Dataset + +THIS = Path(__file__).resolve() +sys.path.insert(0, str(THIS.parent)) +sys.path.insert(0, str(THIS.parents[1])) + +try: + from experiments.dataset_seqpred import ( + SAMPLING_RATE_HZ, _load_recording_sensors, + TRAIN_VOLS_V3, TEST_VOLS_V3, + DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR, + ) +except ModuleNotFoundError: + from dataset_seqpred import ( + SAMPLING_RATE_HZ, _load_recording_sensors, + TRAIN_VOLS_V3, TEST_VOLS_V3, + DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR, + ) + + +GRASP_VERBS = { + "grasp", "hold", "pick_up", "move", "place", "put_down", + "pull", "rotate", "insert", "remove", +} +# User-specified subset of action verbs that mean "the object has been lifted +# off its resting surface and held in hand" (used as Class 2 stricter definition). +LIFT_VERBS = {"grasp", "open", "move", "pick_up", "hold"} + +# Multi-class verb taxonomy (annotations_v3 verb_fine universe). +# Verb 0 = background (anchor outside any segment). +VERB_LIST = [ + "background", + "grasp", "move", "place", "adjust", "pick_up", + "close", "put_down", "pull", "hold", "open", + "rotate", "release", "push", "insert", "remove", + "align", "stabilize", +] +VERB_TO_IDX = {v: i for i, v in enumerate(VERB_LIST)} + +# Top-15 most common object categories with non-zero coverage in the +# pressure-bearing test set (annotations_v3 survey of TRAIN+TEST_VOLS_V3). +# Index 0 = "_other": anchor outside any segment OR object not in top-15. +# Note: "coat" excluded because it appears only in v14, which has no +# pressure-aligned sessions and is silently dropped by the loader. +OBJECT_TOP_LIST = [ + "_other", + "sealed jar", "towel", "tablecloth", "box", "pot", + "rice bowl", "tape", "pants", "spoon", "plate", + "marker", "cloth", "laptop", "toothbrush case", "tea canister", +] +OBJECT_TO_IDX = {o: i for i, o in enumerate(OBJECT_TOP_LIST)} +EVENT_NAMES = {0: "non-contact", 1: "pre-contact", 2: "steady-grip", 3: "release"} +CLASS_NAMES_BINARY = {0: "non-grasp", 1: "grasp"} +CLASS_NAMES_THREE = {0: "no-grasp", 1: "attempted", 2: "sustained"} +# Back-compat default (used by binary code paths) +CLASS_NAMES = CLASS_NAMES_BINARY + + +def _parse_one(x: str, fmt_mode: str) -> float: + p = x.split(":") + if len(p) == 2: + return int(p[0]) * 60 + int(p[1]) + if fmt_mode == "hhmmss": + return int(p[0]) * 3600 + int(p[1]) * 60 + int(p[2]) + return int(p[0]) * 60 + int(p[1]) + int(p[2]) / 30.0 # mmssff @ 30fps + + +def _detect_fmt(segments, rec_sec: float) -> str: + for s in segments: + b = s["timestamp"].split("-")[1] + p = b.split(":") + if len(p) == 3: + hh = int(p[0]) * 3600 + int(p[1]) * 60 + int(p[2]) + if hh > rec_sec * 1.05: + return "mmssff" + return "hhmmss" + + +def build_object_label(annot_path: Path, n_frames: int, + sr: int = SAMPLING_RATE_HZ) -> np.ndarray: + """Per-frame object index (top-15 + '_other' fallback as class 0).""" + label = np.zeros(n_frames, dtype=np.int8) + if not annot_path.exists(): + return label + try: + ann = json.load(open(annot_path)) + except Exception: + return label + segments = ann.get("segments", []) + if not segments: + return label + rec_sec = n_frames / sr + fmt = _detect_fmt(segments, rec_sec) + for s in segments: + obj = s.get("action_annotation", {}).get("object_name") + idx = OBJECT_TO_IDX.get(obj, 0) + if idx == 0: + continue # leave as 0 ("_other"/background) + try: + a, b = s["timestamp"].split("-") + t0 = _parse_one(a, fmt); t1 = _parse_one(b, fmt) + except Exception: + continue + if t1 <= t0 or t1 > rec_sec * 1.10: + continue + i0 = max(0, int(round(t0 * sr))) + i1 = min(n_frames, int(round(t1 * sr))) + label[i0:i1] = idx + return label + + +def build_lift_eligible_mask(annot_path: Path, n_frames: int, + sr: int = SAMPLING_RATE_HZ) -> np.ndarray: + """Per-frame bool: True if frame is inside a segment that meets the + lifted-grasp criterion: verb ∈ LIFT_VERBS OR hand_type == 'both'. + Used by 3-class label_mode when require_lift_for_sustained=True.""" + mask = np.zeros(n_frames, dtype=bool) + if not annot_path.exists(): + return mask + try: + ann = json.load(open(annot_path)) + except Exception: + return mask + segments = ann.get("segments", []) + if not segments: + return mask + rec_sec = n_frames / sr + fmt = _detect_fmt(segments, rec_sec) + for s in segments: + a = s.get("action_annotation", {}) + verb = a.get("action_name") + hand = a.get("hand_type", "") + is_lift = (verb in LIFT_VERBS) or (hand == "both") + if not is_lift: + continue + try: + ts0, ts1 = s["timestamp"].split("-") + t0 = _parse_one(ts0, fmt); t1 = _parse_one(ts1, fmt) + except Exception: + continue + if t1 <= t0 or t1 > rec_sec * 1.10: + continue + i0 = max(0, int(round(t0 * sr))) + i1 = min(n_frames, int(round(t1 * sr))) + mask[i0:i1] = True + return mask + + +def build_verb_label(annot_path: Path, n_frames: int, + sr: int = SAMPLING_RATE_HZ) -> np.ndarray: + """Per-frame verb index (int8). Default (no segment) = 0 (background).""" + label = np.zeros(n_frames, dtype=np.int8) + if not annot_path.exists(): + return label + try: + ann = json.load(open(annot_path)) + except Exception: + return label + segments = ann.get("segments", []) + if not segments: + return label + rec_sec = n_frames / sr + fmt = _detect_fmt(segments, rec_sec) + for s in segments: + verb = s.get("action_annotation", {}).get("action_name") + v_idx = VERB_TO_IDX.get(verb, 0) # unknown verb → background + if v_idx == 0: + continue + try: + a, b = s["timestamp"].split("-") + t0 = _parse_one(a, fmt); t1 = _parse_one(b, fmt) + except Exception: + continue + if t1 <= t0 or t1 > rec_sec * 1.10: + continue + i0 = max(0, int(round(t0 * sr))) + i1 = min(n_frames, int(round(t1 * sr))) + label[i0:i1] = v_idx + return label + + +def build_grasp_mask(annot_path: Path, n_frames: int, + sr: int = SAMPLING_RATE_HZ) -> np.ndarray: + """Return bool array of shape (n_frames,).""" + mask = np.zeros(n_frames, dtype=bool) + if not annot_path.exists(): + return mask + try: + ann = json.load(open(annot_path)) + except Exception: + return mask + segments = ann.get("segments", []) + if not segments: + return mask + rec_sec = n_frames / sr + fmt = _detect_fmt(segments, rec_sec) + for s in segments: + verb = s.get("action_annotation", {}).get("action_name") + if verb not in GRASP_VERBS: + continue + try: + a, b = s["timestamp"].split("-") + t0 = _parse_one(a, fmt); t1 = _parse_one(b, fmt) + except Exception: + continue + if t1 <= t0 or t1 > rec_sec * 1.10: + continue + i0 = max(0, int(round(t0 * sr))) + i1 = min(n_frames, int(round(t1 * sr))) + mask[i0:i1] = True + return mask + + +class GraspStateDataset(Dataset): + """Predict binary 'is_grasping' label over future window from past sensor signals.""" + + def __init__( + self, + volunteers: Sequence[str], + input_modalities: Sequence[str], + t_obs_sec: float = 1.0, + t_fut_sec: float = 0.5, + anchor_stride_sec: float = 0.25, + downsample: int = 5, + dataset_dir: Path = DEFAULT_DATASET_DIR, + annot_dir: Path = DEFAULT_ANNOT_DIR, + contact_threshold_g: float = 5.0, # legacy sum-threshold (kept for back-compat, unused if use_per_cell_contact=True) + per_cell_threshold_g: float = 10.0, # per-cell threshold to declare a sensor cell "active" + min_active_cells: int = 3, # need ≥ this many active cells to declare contact + use_per_cell_contact: bool = True, # NEW default: use per-cell active-count for event_type + label_mode: str = "binary", # "binary", "three_class", or "verb" + sustained_threshold_sec: float = 0.3, # (3-class only) min contiguous contact for "Sustained" + require_lift_for_sustained: bool = False, # (3-class only) Class 2 also requires verb ∈ LIFT_VERBS + per_class_max: Optional[int] = None, + input_stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None, + expected_input_dims: Optional[Dict[str, int]] = None, + majority_threshold: float = 0.5, + rng_seed: int = 0, + log: bool = True, + ): + super().__init__() + self.input_modalities = list(input_modalities) + self.t_obs_sec = float(t_obs_sec) + self.t_fut_sec = float(t_fut_sec) + self.anchor_stride_sec = float(anchor_stride_sec) + self.downsample = int(downsample) + self.sr = SAMPLING_RATE_HZ // self.downsample + self.dataset_dir = Path(dataset_dir) + self.annot_dir = Path(annot_dir) + self.contact_threshold_g = float(contact_threshold_g) + self.per_cell_threshold_g = float(per_cell_threshold_g) + self.min_active_cells = int(min_active_cells) + self.use_per_cell_contact = bool(use_per_cell_contact) + self.label_mode = str(label_mode) + if self.label_mode not in ("binary", "three_class", "verb", "object"): + raise ValueError(f"label_mode must be binary|three_class|verb|object, got {label_mode}") + if self.label_mode == "binary": + self.num_classes = 2 + elif self.label_mode == "three_class": + self.num_classes = 3 + elif self.label_mode == "verb": + self.num_classes = len(VERB_LIST) + else: # object + self.num_classes = len(OBJECT_TOP_LIST) + self.sustained_threshold_sec = float(sustained_threshold_sec) + self.require_lift_for_sustained = bool(require_lift_for_sustained) + self.per_class_max = per_class_max + self.majority_threshold = float(majority_threshold) + self.T_obs = int(round(self.t_obs_sec * self.sr)) + self.T_fut = int(round(self.t_fut_sec * self.sr)) + + self._items: List[dict] = [] + self._modality_dims: Dict[str, int] = dict(expected_input_dims) if expected_input_dims else {} + rng = np.random.default_rng(rng_seed) + + # Load pressure even if not in inputs, for event_type stratification. + load_mods = list(dict.fromkeys(list(self.input_modalities) + ["pressure"])) + + # Per-class anchor pool + pools: Dict[int, List[dict]] = {c: [] for c in range(self.num_classes)} + sustained_thresh_frames = int(round(self.sustained_threshold_sec * self.sr)) + + for vol in volunteers: + vol_dir = self.dataset_dir / vol + if not vol_dir.is_dir(): + continue + for scenario_dir in sorted(vol_dir.glob("s*")): + if not scenario_dir.is_dir(): + continue + scene = scenario_dir.name + annot_path = self.annot_dir / vol / f"{scene}.json" + if not annot_path.exists(): + continue + try: + sensors_all = _load_recording_sensors( + scenario_dir, vol, scene, load_mods + ) + except Exception: + continue + if sensors_all is None or any(a is None for a in sensors_all.values()): + continue + + pressure_full = sensors_all["pressure"] # (T, 50) + input_arrs = {m: sensors_all[m] for m in self.input_modalities} + for m, arr in input_arrs.items(): + self._enforce_dim(input_arrs, m, arr, self._modality_dims) + + T_avail = min(a.shape[0] for a in input_arrs.values()) + T_avail = min(T_avail, pressure_full.shape[0]) + if T_avail < (self.T_obs + self.T_fut) * self.downsample: + continue + + # Build grasp mask at 100 Hz, then downsample. + mask_full = build_grasp_mask(annot_path, T_avail, + sr=SAMPLING_RATE_HZ) + if self.label_mode == "verb": + verb_full = build_verb_label(annot_path, T_avail, sr=SAMPLING_RATE_HZ) + verb_ds = verb_full[:T_avail:self.downsample] + else: + verb_ds = None + if self.label_mode == "object": + obj_full = build_object_label(annot_path, T_avail, sr=SAMPLING_RATE_HZ) + obj_ds = obj_full[:T_avail:self.downsample] + else: + obj_ds = None + if self.label_mode == "three_class" and self.require_lift_for_sustained: + lift_full = build_lift_eligible_mask(annot_path, T_avail, sr=SAMPLING_RATE_HZ) + lift_eligible_ds = lift_full[:T_avail:self.downsample] + else: + lift_eligible_ds = None + input_ds = {m: arr[:T_avail:self.downsample] for m, arr in input_arrs.items()} + pressure_ds = pressure_full[:T_avail:self.downsample] + mask_ds = mask_full[:T_avail:self.downsample] + T_ds = mask_ds.shape[0] + if self.use_per_cell_contact: + # n_active per frame: count cells with value > per_cell_threshold_g + n_active = (pressure_ds > self.per_cell_threshold_g).sum(axis=1) + contact_frame = n_active >= self.min_active_cells + else: + pressure_sum = pressure_ds.sum(axis=1) + contact_frame = pressure_sum > self.contact_threshold_g + + stride = max(1, int(round(self.anchor_stride_sec * self.sr))) + first_anchor = self.T_obs + last_anchor = T_ds - self.T_fut + if last_anchor <= first_anchor: + continue + + for anchor in range(first_anchor, last_anchor + 1, stride): + fut_mask = mask_ds[anchor:anchor + self.T_fut] + if fut_mask.shape[0] != self.T_fut: + continue + annotation_is_grasp = fut_mask.mean() >= self.majority_threshold + + if self.label_mode == "binary": + label = int(annotation_is_grasp) + elif self.label_mode == "three_class": + if not annotation_is_grasp: + label = 0 # NoGrasp + else: + # longest contiguous run of contact in future window + fut_contact = contact_frame[anchor:anchor + self.T_fut] + longest = 0; cur = 0 + for v in fut_contact: + if v: cur += 1; longest = max(longest, cur) + else: cur = 0 + is_sustained = longest >= sustained_thresh_frames + if is_sustained and self.require_lift_for_sustained: + # Demote to Class 1 unless majority of future window is in + # a "lift-eligible" segment (verb ∈ LIFT_VERBS or hand=both). + fut_lift = lift_eligible_ds[anchor:anchor + self.T_fut] + if fut_lift.mean() < 0.5: + is_sustained = False + label = 2 if is_sustained else 1 + elif self.label_mode == "verb": + fut_v = verb_ds[anchor:anchor + self.T_fut] + counts = np.bincount(fut_v, minlength=self.num_classes) + label = int(np.argmax(counts)) + else: # object — majority object in future window + fut_o = obj_ds[anchor:anchor + self.T_fut] + counts = np.bincount(fut_o, minlength=self.num_classes) + label = int(np.argmax(counts)) + + # event_type for stratification (4-class transition taxonomy) + past_high = contact_frame[anchor - self.T_obs:anchor].mean() > 0.5 + fut_high = contact_frame[anchor:anchor + self.T_fut].mean() > 0.5 + if not past_high and not fut_high: et = 0 + elif not past_high and fut_high: et = 1 + elif past_high and fut_high: et = 2 + else: et = 3 + + past_slice = {m: arr[anchor - self.T_obs:anchor] + for m, arr in input_ds.items()} + if any(w.shape[0] != self.T_obs for w in past_slice.values()): + continue + + item = { + "x": past_slice, + "label": label, + "event_type": et, + "meta": {"vol": vol, "scene": scene, "anchor_idx": int(anchor)}, + } + pools[label].append(item) + + # Balance classes if requested (cap larger pool to per_class_max) + if self.per_class_max is not None: + for c, pool in pools.items(): + if len(pool) > self.per_class_max: + idx = rng.choice(len(pool), size=self.per_class_max, replace=False) + pools[c] = [pool[i] for i in sorted(idx)] + self._items = [it for c in range(self.num_classes) for it in pools[c]] + + if not self._items: + raise RuntimeError("GraspStateDataset: collected 0 anchors.") + + # Z-score inputs + if input_stats is None: + input_stats = self._compute_input_stats() + self._input_stats = input_stats + self._apply_input_stats(input_stats) + + if log: + if self.label_mode == "binary": + class_names = CLASS_NAMES_BINARY + elif self.label_mode == "three_class": + class_names = CLASS_NAMES_THREE + elif self.label_mode == "verb": + class_names = {i: v for i, v in enumerate(VERB_LIST)} + else: # object + class_names = {i: v for i, v in enumerate(OBJECT_TOP_LIST)} + counts_class = {class_names[c]: sum(1 for it in self._items if it["label"] == c) + for c in range(self.num_classes)} + counts_event = {EVENT_NAMES[k]: sum(1 for it in self._items if it["event_type"] == k) + for k in (0, 1, 2, 3)} + print(f"[GraspStateDataset] vols={len(volunteers)} " + f"inputs={self.input_modalities} " + f"anchors={len(self._items)} class={counts_class} " + f"event={counts_event} " + f"T_obs={self.T_obs} T_fut={self.T_fut} sr={self.sr}Hz " + f"input_dims={self._modality_dims}", flush=True) + + @staticmethod + def _enforce_dim(arrs, m, arr, dim_dict): + if m in dim_dict: + tgt = dim_dict[m] + if arr.shape[1] != tgt: + if arr.shape[1] < tgt: + pad = np.zeros((arr.shape[0], tgt - arr.shape[1]), dtype=np.float32) + arrs[m] = np.concatenate([arr, pad], axis=1) + else: + arrs[m] = arr[:, :tgt] + else: + dim_dict[m] = arr.shape[1] + + def _compute_input_stats(self): + accs = {m: [] for m in self._modality_dims} + for it in self._items: + for m, w in it["x"].items(): + accs[m].append(w) + out = {} + for m, ws in accs.items(): + cat = np.concatenate(ws, axis=0) + mu = cat.mean(axis=0).astype(np.float32) + sd = cat.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd) + out[m] = (mu, sd.astype(np.float32)) + return out + + def _apply_input_stats(self, stats): + for it in self._items: + for m, w in it["x"].items(): + if m in stats: + mu, sd = stats[m] + it["x"][m] = ((w - mu) / sd).astype(np.float32) + + def __len__(self): return len(self._items) + + def __getitem__(self, idx): + it = self._items[idx] + x = {m: torch.from_numpy(np.ascontiguousarray(w)) for m, w in it["x"].items()} + label = int(it["label"]) + et = int(it["event_type"]) + return x, label, et, it["meta"] + + @property + def modality_dims(self): return dict(self._modality_dims) + + +def collate_grasp_state(batch): + xs, labels, ets, metas = zip(*batch) + mods = list(xs[0].keys()) + x_out = {m: torch.stack([x[m] for x in xs], dim=0) for m in mods} + y_out = torch.tensor(labels, dtype=torch.long) + et_out = torch.tensor(ets, dtype=torch.long) + return x_out, y_out, et_out, list(metas) + + +def build_grasp_train_test( + input_modalities, + t_obs_sec=1.0, t_fut_sec=0.5, anchor_stride_sec=0.25, + downsample=5, + dataset_dir=DEFAULT_DATASET_DIR, annot_dir=DEFAULT_ANNOT_DIR, + contact_threshold_g=5.0, per_class_max=None, + label_mode="binary", sustained_threshold_sec=0.3, + require_lift_for_sustained=False, + rng_seed=0, + train_vols=None, test_vols=None, +): + if train_vols is None: train_vols = TRAIN_VOLS_V3 + if test_vols is None: test_vols = TEST_VOLS_V3 + train = GraspStateDataset( + train_vols, input_modalities=input_modalities, + t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, + anchor_stride_sec=anchor_stride_sec, downsample=downsample, + dataset_dir=dataset_dir, annot_dir=annot_dir, + contact_threshold_g=contact_threshold_g, per_class_max=per_class_max, + label_mode=label_mode, sustained_threshold_sec=sustained_threshold_sec, + require_lift_for_sustained=require_lift_for_sustained, + rng_seed=rng_seed, log=True, + ) + test = GraspStateDataset( + test_vols, input_modalities=input_modalities, + t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, + anchor_stride_sec=anchor_stride_sec, downsample=downsample, + dataset_dir=dataset_dir, annot_dir=annot_dir, + contact_threshold_g=contact_threshold_g, per_class_max=None, # don't cap test + label_mode=label_mode, sustained_threshold_sec=sustained_threshold_sec, + require_lift_for_sustained=require_lift_for_sustained, + input_stats=train._input_stats, + expected_input_dims=train._modality_dims, + rng_seed=rng_seed + 1, log=True, + ) + return train, test + + +if __name__ == "__main__": + import argparse + ap = argparse.ArgumentParser() + ap.add_argument("--input_modalities", default="emg,imu,mocap") + ap.add_argument("--t_obs", type=float, default=1.0) + ap.add_argument("--t_fut", type=float, default=0.5) + args = ap.parse_args() + tr, te = build_grasp_train_test( + input_modalities=args.input_modalities.split(","), + t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + ) + x, y, et, meta = tr[0] + print(f"sample: x={ {m: tuple(v.shape) for m,v in x.items()} } y={y} et={et}") diff --git a/experiments/data/dataset_seqpred.py b/experiments/data/dataset_seqpred.py new file mode 100644 index 0000000000000000000000000000000000000000..77668492579ea72b3505677b9ca13ec313b32b54 --- /dev/null +++ b/experiments/data/dataset_seqpred.py @@ -0,0 +1,533 @@ +""" +Segment-to-Next-Segment Triplet Prediction dataset (T10). + +For every annotated action segment k in every recording: + anchor_t = start_time(segment_k) - T_fut (seconds) + observation = sensor frames in [anchor_t - T_obs, anchor_t] + target = triplet labels of segment_k: (verb_fine, verb_composite, + noun, hand) + +Segments whose observation window would spill before t=0 of the recording +are skipped (no left-padding), so we never mix noise with real sensor data. + +Strategy A is enforced in taxonomy.classify_segment(): segments whose noun is +not in the kept set (<50 occurrences) are dropped entirely. + +Per-modality tensors are returned as a dict so downstream models can either +concat them (single-flow baselines) or keep them separate (our cross-modal +fusion model). A float mask is returned alongside the sensor tensor so +variable-length obs windows can be padded within a batch. +""" + +from __future__ import annotations + +# pandas must be imported BEFORE torch/numpy to avoid a GLIBCXX load-order bug +# on this cluster. +import pandas as pd + +import json +import os +import sys +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +import numpy as np +import torch +from torch.utils.data import Dataset + +# Make sibling modules importable from either (a) the neurips26 root, or +# (b) the frozen row/code/ folder (populated by setup_row.sh). +_THIS = Path(__file__).resolve() +sys.path.insert(0, str(_THIS.parent)) # code/ itself +sys.path.insert(0, str(_THIS.parent.parent)) # neurips26/ + +try: + from data.dataset import ( # noqa: E402 + MODALITY_FILES, load_modality_array, + ) + from experiments.taxonomy import ( # noqa: E402 + classify_segment, NOUN, NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, + NUM_HAND, + ) +except ModuleNotFoundError: + from dataset import ( # noqa: E402 + MODALITY_FILES, load_modality_array, + ) + from taxonomy import ( # noqa: E402 + classify_segment, NOUN, NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, + NUM_HAND, + ) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +# Hard-code the dataset and annotation paths. The frozen row/code/ folders sit +# at arbitrary depths under the repo, so relative-to-__file__ discovery is +# unreliable. An env override is available for e.g. running on a mirror. +REPO = Path(os.environ.get( + "DAILYACT_REPO", "${PULSE_ROOT}" +)) +DEFAULT_DATASET_DIR = REPO / "aligned_gy" +DEFAULT_ANNOT_DIR = REPO / "annotations_v3" + +SAMPLING_RATE_HZ = 100 +# 5x downsample -> 20 Hz. Matches the existing pipeline in dataset.py. +DEFAULT_DOWNSAMPLE = 5 + +VALID_MODALITIES = ("mocap", "emg", "eyetrack", "imu", "pressure") + +# Fixed subject-independent split. Hand-picked 5 test volunteers with full +# 8-scene coverage, spread across the ID range. Any volunteer not listed +# below but annotated in v3 is assumed to be train data (so the lists stay +# stable as more volunteers get annotated). +TEST_VOLS_V3 = ["v14", "v30", "v34", "v38", "v41"] +TRAIN_VOLS_V3 = [ + "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v31", "v32", "v33", "v35", "v36", "v37", "v39", "v40", +] +assert set(TRAIN_VOLS_V3).isdisjoint(TEST_VOLS_V3), "Split must be disjoint" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _parse_ts(ts: str) -> float: + """Parse 'HH:MM:SS' or 'MM:SS' (or 'M:S') into seconds.""" + parts = ts.strip().split(":") + try: + if len(parts) == 2: + return float(parts[0]) * 60 + float(parts[1]) + if len(parts) == 3: + return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2]) + except ValueError: + return 0.0 + return 0.0 + + +def parse_ts_range(ts_range: str) -> Tuple[float, float]: + """Parse 'MM:SS-MM:SS' or 'HH:MM:SS-HH:MM:SS' into (start_sec, end_sec).""" + if "-" not in ts_range: + return 0.0, 0.0 + a, b = ts_range.split("-", 1) + return _parse_ts(a), _parse_ts(b) + + +def _load_recording_sensors( + scenario_dir: Path, vol: str, scenario: str, + modalities: Sequence[str], +) -> Optional[Dict[str, np.ndarray]]: + """Load each requested modality as a (T, F_mod) float32 array at 100 Hz. + + Returns None if any requested modality is missing or corrupted.""" + out: Dict[str, np.ndarray] = {} + for mod in modalities: + if mod == "mocap": + fp = scenario_dir / f"aligned_{vol}{scenario}_s_Q.tsv" + else: + fp = scenario_dir / MODALITY_FILES[mod] + if not fp.exists(): + return None + arr = load_modality_array(str(fp), mod) + if arr is None: + return None + out[mod] = arr.astype(np.float32) + # Align lengths across modalities (take min); all start at sensor t=0. + T = min(a.shape[0] for a in out.values()) + for m in out: + out[m] = out[m][:T] + return out + + +def _load_annotations(annot_path: Path) -> List[dict]: + with open(annot_path) as f: + d = json.load(f) + return d.get("segments", []) + + +# --------------------------------------------------------------------------- +# Dataset +# --------------------------------------------------------------------------- + +class TripletSeqPredDataset(Dataset): + """One sample per (annotated segment, recording) pair. + + Sample schema returned by __getitem__: + x: dict {mod_name: FloatTensor(T_frames, F_mod)} + y: dict {'verb_fine': int, 'verb_composite': int, + 'noun': int, 'hand': int} + meta: dict {'vol', 'scene', 'seg_idx', 'anchor_sec'} + """ + + def __init__( + self, + volunteers: Sequence[str], + modalities: Sequence[str] = ("imu", "mocap", "emg", "eyetrack", "pressure"), + t_obs_sec: float = 8.0, + t_fut_sec: float = 2.0, + downsample: int = DEFAULT_DOWNSAMPLE, + dataset_dir: Path = DEFAULT_DATASET_DIR, + annot_dir: Path = DEFAULT_ANNOT_DIR, + stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None, + min_seg_duration_sec: float = 0.4, + log: bool = True, + mode: str = "recognition", + ): + for m in modalities: + if m not in VALID_MODALITIES: + raise ValueError(f"Unknown modality: {m}") + if mode not in ("recognition", "anticipation"): + raise ValueError(f"mode must be 'recognition' or 'anticipation', got {mode!r}") + + self.modalities = tuple(modalities) + self.t_obs_sec = float(t_obs_sec) + self.t_fut_sec = float(t_fut_sec) + self.downsample = int(downsample) + self.dataset_dir = Path(dataset_dir) + self.annot_dir = Path(annot_dir) + self.mode = mode + + # Effective obs-window length in frames at the post-downsample rate. + sr = SAMPLING_RATE_HZ // self.downsample # 20 Hz + self.T_frames = int(round(self.t_obs_sec * sr)) # used only for anticipation + self._sr_down = sr + + self._items: List[dict] = [] + self._modality_dims: Dict[str, int] = {} + + # If re-using training-set stats, force each modality's feature + # layout to match so we never apply a (14,)-mean to (24,)-data. + if stats is not None: + for m, (mu, _) in stats.items(): + self._modality_dims[m] = mu.shape[1] + + stats_counts = { + "recordings_scanned": 0, + "recordings_used": 0, + "segments_seen": 0, + "seg_dropped_label": 0, # Strategy A + invalid verb/hand + "seg_dropped_too_early": 0, # obs window before t=0 + "seg_dropped_short": 0, + "seg_kept": 0, + } + + for vol in volunteers: + vol_dir = self.dataset_dir / vol + if not vol_dir.is_dir(): + continue + for scenario_dir in sorted(vol_dir.glob("s*")): + if not scenario_dir.is_dir(): + continue + scene = scenario_dir.name + if scene not in {f"s{i}" for i in range(1, 9)}: + continue + + annot_path = self.annot_dir / vol / f"{scene}.json" + if not annot_path.exists(): + continue + + stats_counts["recordings_scanned"] += 1 + + sensors = _load_recording_sensors(scenario_dir, vol, scene, + self.modalities) + if sensors is None: + continue + + # Store / validate per-modality dim + for m, arr in sensors.items(): + if m in self._modality_dims: + if arr.shape[1] != self._modality_dims[m]: + # Pad or truncate to match the first seen dim. + target = self._modality_dims[m] + if arr.shape[1] < target: + pad = np.zeros((arr.shape[0], target - arr.shape[1]), + dtype=np.float32) + sensors[m] = np.concatenate([arr, pad], axis=1) + else: + sensors[m] = arr[:, :target] + else: + self._modality_dims[m] = arr.shape[1] + + segs = _load_annotations(annot_path) + rec_used = False + # BOS index for first segment in a recording (or after dropped segs). + BOS_VC = NUM_VERB_COMPOSITE # = 6 + BOS_N = NUM_NOUN # = 34 + prev_vc, prev_n = BOS_VC, BOS_N + for seg_idx, seg in enumerate(segs): + stats_counts["segments_seen"] += 1 + a = seg.get("action_annotation", {}) + labels = classify_segment(a) + if labels is None: + stats_counts["seg_dropped_label"] += 1 + # do not advance prev (skipped segment doesn't update context) + continue + + start_sec, end_sec = parse_ts_range(seg.get("timestamp", "")) + if end_sec - start_sec < min_seg_duration_sec: + stats_counts["seg_dropped_short"] += 1 + continue + + if self.mode == "anticipation": + anchor_sec = start_sec - self.t_fut_sec + obs_start_sec = anchor_sec - self.t_obs_sec + if obs_start_sec < 0: + stats_counts["seg_dropped_too_early"] += 1 + continue + i0 = int(round(obs_start_sec * SAMPLING_RATE_HZ)) + i1 = int(round(anchor_sec * SAMPLING_RATE_HZ)) + meta_extra = {"anchor_sec": anchor_sec} + else: # recognition + # Use the segment's own [start, end] as the input window. + i0 = int(round(start_sec * SAMPLING_RATE_HZ)) + i1 = int(round(end_sec * SAMPLING_RATE_HZ)) + meta_extra = {"start_sec": start_sec, "end_sec": end_sec} + + T_avail = min(a.shape[0] for a in sensors.values()) + if i1 > T_avail: + stats_counts["seg_dropped_too_early"] += 1 + continue + if i0 < 0: + i0 = 0 # safety; recognition mode shouldn't hit this + + window: Dict[str, np.ndarray] = {} + for m, arr in sensors.items(): + w = arr[i0:i1] + # Downsample: decimate every `downsample`-th frame. + w = w[::self.downsample] + window[m] = w + + # Must have at least 4 post-downsample frames to be useful. + min_T = min(w.shape[0] for w in window.values()) + if min_T < 4: + stats_counts["seg_dropped_short"] += 1 + continue + + self._items.append({ + "x": window, + "y": labels, + "prev": {"verb_composite": prev_vc, "noun": prev_n}, + "meta": { + "vol": vol, "scene": scene, + "seg_idx": seg_idx, **meta_extra, + }, + }) + stats_counts["seg_kept"] += 1 + # Update context for next kept segment in this recording. + prev_vc = labels["verb_composite"] + prev_n = labels["noun"] + rec_used = True + + if rec_used: + stats_counts["recordings_used"] += 1 + + if len(self._items) == 0: + raise RuntimeError( + "No samples collected. Check annot_dir, modalities, t_obs, t_fut." + ) + + # Per-modality z-score normalization using training-set stats. + if stats is None: + stats = self._compute_stats() + self._stats = stats + self._apply_stats(stats) + + if log: + print(f"[TripletSeqPredDataset:{self.mode}] " + f"vols={len(volunteers)} " + f"recs_scan={stats_counts['recordings_scanned']} " + f"recs_used={stats_counts['recordings_used']} " + f"segs_seen={stats_counts['segments_seen']} " + f"kept={stats_counts['seg_kept']} " + f"drop_label={stats_counts['seg_dropped_label']} " + f"drop_early={stats_counts['seg_dropped_too_early']} " + f"drop_short={stats_counts['seg_dropped_short']}", + flush=True) + print(f" modality_dims={self._modality_dims} " + f"T_frames={self.T_frames} sr_down={sr}Hz", + flush=True) + self.stats_counts = stats_counts + + # ----- stats (per-modality mean/std on training split) ----- + def _compute_stats(self) -> Dict[str, Tuple[np.ndarray, np.ndarray]]: + acc: Dict[str, List[np.ndarray]] = {m: [] for m in self.modalities} + for it in self._items: + for m, w in it["x"].items(): + acc[m].append(w.astype(np.float64)) + out: Dict[str, Tuple[np.ndarray, np.ndarray]] = {} + for m, arrs in acc.items(): + cat = np.concatenate(arrs, axis=0) + mu = cat.mean(axis=0, keepdims=True) + sd = cat.std(axis=0, keepdims=True) + sd[sd < 1e-8] = 1.0 + out[m] = (mu.astype(np.float32), sd.astype(np.float32)) + return out + + def _apply_stats(self, stats: Dict[str, Tuple[np.ndarray, np.ndarray]]) -> None: + for it in self._items: + for m, w in it["x"].items(): + mu, sd = stats[m] + z = (w.astype(np.float32) - mu) / sd + z = np.nan_to_num(z, nan=0.0, posinf=0.0, neginf=0.0) + it["x"][m] = z.astype(np.float32) + + def get_stats(self) -> Dict[str, Tuple[np.ndarray, np.ndarray]]: + return self._stats + + # ----- Dataset protocol ----- + def __len__(self) -> int: + return len(self._items) + + def __getitem__(self, idx: int): + it = self._items[idx] + x = {m: torch.from_numpy(w) for m, w in it["x"].items()} + y = it["y"] + meta = it["meta"] + prev = it.get("prev", {"verb_composite": NUM_VERB_COMPOSITE, "noun": NUM_NOUN}) + return x, y, meta, prev + + # ----- convenience ----- + @property + def modality_dims(self) -> Dict[str, int]: + return dict(self._modality_dims) + + @property + def total_feat_dim(self) -> int: + return sum(self._modality_dims.values()) + + def class_counts(self) -> Dict[str, np.ndarray]: + vf = np.zeros(NUM_VERB_FINE, dtype=np.int64) + vc = np.zeros(NUM_VERB_COMPOSITE, dtype=np.int64) + n = np.zeros(NUM_NOUN, dtype=np.int64) + h = np.zeros(NUM_HAND, dtype=np.int64) + for it in self._items: + y = it["y"] + vf[y["verb_fine"]] += 1 + vc[y["verb_composite"]] += 1 + n[y["noun"]] += 1 + h[y["hand"]] += 1 + return {"verb_fine": vf, "verb_composite": vc, "noun": n, "hand": h} + + +# --------------------------------------------------------------------------- +# Collate: pad each modality to the max T_frames in the batch +# --------------------------------------------------------------------------- + +def collate_triplet(batch): + """Stack samples into batched tensors. Backward-compatible: accepts + samples of either (x, y, meta) or (x, y, meta, prev) form. + + Returned: + x: dict[mod] -> FloatTensor (B, T_max, F_mod) + mask: BoolTensor (B, T_max) + lens: LongTensor (B,) + y: dict (each -> LongTensor (B,)) + meta: list of dicts + prev: dict {'verb_composite': LongTensor (B,), 'noun': LongTensor (B,)} + values are class indices, with NUM_VERB_COMPOSITE / NUM_NOUN + used as a BOS sentinel for the first segment in a recording. + """ + has_prev = len(batch[0]) >= 4 + if has_prev: + xs, ys, metas, prevs = zip(*batch) + else: + xs, ys, metas = zip(*batch) + prevs = [{"verb_composite": NUM_VERB_COMPOSITE, "noun": NUM_NOUN} for _ in batch] + B = len(batch) + mods = list(xs[0].keys()) + lens = torch.tensor([x[mods[0]].shape[0] for x in xs], dtype=torch.long) + T_max = int(lens.max().item()) + + x_out: Dict[str, torch.Tensor] = {} + for m in mods: + F = xs[0][m].shape[1] + padded = torch.zeros(B, T_max, F, dtype=torch.float32) + for i, x in enumerate(xs): + w = x[m] + padded[i, :w.shape[0]] = w + x_out[m] = padded + + ar = torch.arange(T_max).unsqueeze(0) + mask = ar < lens.unsqueeze(1) + + y_out = { + k: torch.tensor([y[k] for y in ys], dtype=torch.long) + for k in ("verb_fine", "verb_composite", "noun", "hand") + } + prev_out = { + "verb_composite": torch.tensor([p["verb_composite"] for p in prevs], dtype=torch.long), + "noun": torch.tensor([p["noun"] for p in prevs], dtype=torch.long), + } + return x_out, mask, lens, y_out, list(metas), prev_out + + +# --------------------------------------------------------------------------- +# Convenience: build paired train/test datasets with shared normalization +# --------------------------------------------------------------------------- + +def build_train_test( + modalities: Sequence[str] = ("imu", "mocap", "emg", "eyetrack", "pressure"), + t_obs_sec: float = 8.0, + t_fut_sec: float = 2.0, + downsample: int = DEFAULT_DOWNSAMPLE, + dataset_dir: Path = DEFAULT_DATASET_DIR, + annot_dir: Path = DEFAULT_ANNOT_DIR, + mode: str = "recognition", +) -> Tuple["TripletSeqPredDataset", "TripletSeqPredDataset"]: + train = TripletSeqPredDataset( + TRAIN_VOLS_V3, modalities=modalities, + t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, downsample=downsample, + dataset_dir=dataset_dir, annot_dir=annot_dir, mode=mode, + ) + test = TripletSeqPredDataset( + TEST_VOLS_V3, modalities=modalities, + t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, downsample=downsample, + dataset_dir=dataset_dir, annot_dir=annot_dir, + stats=train.get_stats(), mode=mode, + ) + return train, test + + +# --------------------------------------------------------------------------- +# CLI: quick sanity check +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + import argparse + + ap = argparse.ArgumentParser() + ap.add_argument("--modalities", type=str, default="imu,emg,eyetrack") + ap.add_argument("--t_obs", type=float, default=8.0) + ap.add_argument("--t_fut", type=float, default=2.0) + ap.add_argument("--smoke_n", type=int, default=3, + help="Inspect first N samples per split") + args = ap.parse_args() + + mods = args.modalities.split(",") + print(f"Building train/test with modalities={mods} " + f"t_obs={args.t_obs}s t_fut={args.t_fut}s ...") + train, test = build_train_test( + modalities=mods, + t_obs_sec=args.t_obs, + t_fut_sec=args.t_fut, + ) + print(f"train: {len(train)} samples | test: {len(test)} samples") + + for name, ds in [("train", train), ("test", test)]: + counts = ds.class_counts() + print(f"\n[{name}] class counts:") + print(" verb_fine:", counts["verb_fine"].tolist()) + print(" verb_composite:", counts["verb_composite"].tolist()) + print(" noun (sum):", int(counts["noun"].sum()), + "nonzero:", int((counts["noun"] > 0).sum())) + print(" hand:", counts["hand"].tolist()) + + print(f"\n[{name}] first {args.smoke_n} samples:") + for i in range(min(args.smoke_n, len(ds))): + x, y, meta = ds[i] + shape_str = " ".join(f"{m}:{tuple(x[m].shape)}" for m in x) + print(f" {i:3d} {meta['vol']}/{meta['scene']}#{meta['seg_idx']:3d} " + f"anchor={meta['anchor_sec']:.2f}s y={y} {shape_str}") diff --git a/experiments/data/dataset_signal_forecast.py b/experiments/data/dataset_signal_forecast.py new file mode 100644 index 0000000000000000000000000000000000000000..f1be791c05f46de1382a258171f987b7486f35a9 --- /dev/null +++ b/experiments/data/dataset_signal_forecast.py @@ -0,0 +1,391 @@ +"""Frame-level future *signal* forecasting dataset (T8 v2). + +Task definition +--------------- +At a sampled anchor t in a recording: + past = sensor frames over [t - T_obs, t] ← input + future = target-modality frames over (t, t + T_fut] ← regression target + +Unlike the v1 ForecastDataset (which targets per-frame verb-fine class), this +predicts the raw *signal* values of one chosen target modality. This directly +tests the Johansson 1984 / monzee 2003 hypothesis that cutaneous force +feedback drives sub-second motor planning at the *signal* level (motor +commands / kinematics), not at the level of slow-changing semantic verbs. + +Anchor stratification (4 event types based on contact transitions) +------------------------------------------------------------------ +For each candidate anchor, we compute pressure_sum on past and future windows +and label it by the (past_majority_contact, future_majority_contact) pair: + + type 0 = non-contact (past low, future low) — control: pressure ~ 0 + type 1 = pre-contact (past low, future high) — pressure foretells onset + type 2 = steady-grip (past high, future high) — sustained contact dynamics + type 3 = release (past high, future low) — letting-go dynamics + +Per-event-type counts are reported and (optionally) capped to balance. +Evaluation is broken down per event type so we can see WHERE pressure helps. +""" +from __future__ import annotations + +import sys +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple + +import numpy as np +import torch +from torch.utils.data import Dataset + +THIS = Path(__file__).resolve() +sys.path.insert(0, str(THIS.parent)) +sys.path.insert(0, str(THIS.parents[1])) + +try: + from experiments.dataset_seqpred import ( + SAMPLING_RATE_HZ, _load_recording_sensors, + TRAIN_VOLS_V3, TEST_VOLS_V3, + DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR, + ) +except ModuleNotFoundError: + from dataset_seqpred import ( + SAMPLING_RATE_HZ, _load_recording_sensors, + TRAIN_VOLS_V3, TEST_VOLS_V3, + DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR, + ) + + +EVENT_NAMES = {0: "non-contact", 1: "pre-contact", 2: "steady-grip", 3: "release"} + + +class SignalForecastDataset(Dataset): + """Predict future T_fut frames of `target_modality` from past T_obs of `input_modalities`.""" + + def __init__( + self, + volunteers: Sequence[str], + input_modalities: Sequence[str], + target_modality: str, + t_obs_sec: float = 1.5, + t_fut_sec: float = 0.5, + anchor_stride_sec: float = 0.25, + downsample: int = 5, + dataset_dir: Path = DEFAULT_DATASET_DIR, + annot_dir: Path = DEFAULT_ANNOT_DIR, + contact_threshold_g: float = 5.0, + per_event_max: Optional[int] = None, + input_stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None, + target_stats: Optional[Tuple[np.ndarray, np.ndarray]] = None, + future_pressure_stats: Optional[Tuple[np.ndarray, np.ndarray]] = None, + expected_input_dims: Optional[Dict[str, int]] = None, + expected_target_dim: Optional[int] = None, + include_future_pressure: bool = False, + rng_seed: int = 0, + log: bool = True, + ): + super().__init__() + self.input_modalities = list(input_modalities) + self.target_modality = str(target_modality) + self.t_obs_sec = float(t_obs_sec) + self.t_fut_sec = float(t_fut_sec) + self.anchor_stride_sec = float(anchor_stride_sec) + self.downsample = int(downsample) + self.sr = SAMPLING_RATE_HZ // self.downsample + self.dataset_dir = Path(dataset_dir) + self.annot_dir = Path(annot_dir) + self.contact_threshold_g = float(contact_threshold_g) + self.per_event_max = per_event_max + self.include_future_pressure = bool(include_future_pressure) + self.T_obs = int(round(self.t_obs_sec * self.sr)) + self.T_fut = int(round(self.t_fut_sec * self.sr)) + + self._items: List[dict] = [] + self._modality_dims: Dict[str, int] = dict(expected_input_dims) if expected_input_dims else {} + self._target_dim: int = int(expected_target_dim) if expected_target_dim else -1 + rng = np.random.default_rng(rng_seed) + + # Modalities to load: union of inputs + target + pressure (for filter) + load_mods = list(dict.fromkeys( + list(self.input_modalities) + [self.target_modality, "pressure"] + )) + + # Per-event-type pool of candidate anchor records + pools: Dict[int, List[dict]] = {0: [], 1: [], 2: [], 3: []} + + for vol in volunteers: + vol_dir = self.dataset_dir / vol + if not vol_dir.is_dir(): + continue + for scenario_dir in sorted(vol_dir.glob("s*")): + if not scenario_dir.is_dir(): + continue + scene = scenario_dir.name + annot_path = self.annot_dir / vol / f"{scene}.json" + if not annot_path.exists(): + continue + try: + sensors_all = _load_recording_sensors( + scenario_dir, vol, scene, load_mods + ) + except Exception: + continue + if sensors_all is None or any(a is None for a in sensors_all.values()): + continue + + pressure_full = sensors_all["pressure"] # (T, 50) + target_full = sensors_all[self.target_modality] + input_arrs = {m: sensors_all[m] for m in self.input_modalities} + + # Track input modality dims + for m, arr in input_arrs.items(): + self._enforce_dim(input_arrs, m, arr, self._modality_dims) + # Track target dim + if self._target_dim < 0: + self._target_dim = target_full.shape[1] + elif target_full.shape[1] != self._target_dim: + if target_full.shape[1] < self._target_dim: + pad = np.zeros((target_full.shape[0], self._target_dim - target_full.shape[1]), + dtype=np.float32) + target_full = np.concatenate([target_full, pad], axis=1) + else: + target_full = target_full[:, :self._target_dim] + + T_avail = min(a.shape[0] for a in input_arrs.values()) + T_avail = min(T_avail, target_full.shape[0], pressure_full.shape[0]) + if T_avail < (self.T_obs + self.T_fut) * self.downsample: + continue + + # Downsample to 20 Hz + input_ds = {m: arr[:T_avail:self.downsample] for m, arr in input_arrs.items()} + target_ds = target_full[:T_avail:self.downsample] + pressure_ds = pressure_full[:T_avail:self.downsample] + T_ds = target_ds.shape[0] + pressure_sum = pressure_ds.sum(axis=1) # (T_ds,) + + stride = max(1, int(round(self.anchor_stride_sec * self.sr))) + first_anchor = self.T_obs + last_anchor = T_ds - self.T_fut + if last_anchor <= first_anchor: + continue + + for anchor in range(first_anchor, last_anchor + 1, stride): + past_p = pressure_sum[anchor - self.T_obs:anchor] + fut_p = pressure_sum[anchor:anchor + self.T_fut] + past_high = (past_p > self.contact_threshold_g).mean() > 0.5 + fut_high = (fut_p > self.contact_threshold_g).mean() > 0.5 + if not past_high and not fut_high: + et = 0 + elif not past_high and fut_high: + et = 1 + elif past_high and fut_high: + et = 2 + else: + et = 3 + + past_slice = {m: arr[anchor - self.T_obs:anchor] + for m, arr in input_ds.items()} + past_target_last = target_ds[anchor - 1].copy() # (target_dim,) + fut_target = target_ds[anchor:anchor + self.T_fut].copy() + if any(w.shape[0] != self.T_obs for w in past_slice.values()): + continue + if fut_target.shape[0] != self.T_fut: + continue + + item = { + "x": past_slice, + "y": fut_target, + "y_last": past_target_last, # for persistence + "event_type": int(et), + "meta": {"vol": vol, "scene": scene, "anchor_idx": int(anchor)}, + } + if self.include_future_pressure: + fut_press = pressure_ds[anchor:anchor + self.T_fut].copy() + if fut_press.shape[0] != self.T_fut: + continue + item["fp"] = fut_press # (T_fut, 50) + pools[et].append(item) + + # Cap per-event count if requested (uniform downsample for balance) + for et, pool in pools.items(): + if self.per_event_max is not None and len(pool) > self.per_event_max: + idx = rng.choice(len(pool), size=self.per_event_max, replace=False) + pools[et] = [pool[i] for i in sorted(idx)] + self._items = [it for et in (0, 1, 2, 3) for it in pools[et]] + + if not self._items: + raise RuntimeError("SignalForecastDataset: collected 0 anchors.") + + # Z-score inputs and target separately + if input_stats is None: + input_stats = self._compute_input_stats() + self._input_stats = input_stats + self._apply_input_stats(input_stats) + if target_stats is None: + target_stats = self._compute_target_stats() + self._target_stats = target_stats + self._apply_target_stats(target_stats) + if self.include_future_pressure: + if future_pressure_stats is None: + future_pressure_stats = self._compute_fp_stats() + self._fp_stats = future_pressure_stats + self._apply_fp_stats(future_pressure_stats) + else: + self._fp_stats = None + + if log: + counts = {EVENT_NAMES[k]: sum(1 for it in self._items if it["event_type"] == k) + for k in (0, 1, 2, 3)} + print(f"[SignalForecastDataset] vols={len(volunteers)} " + f"target={self.target_modality} inputs={self.input_modalities} " + f"anchors={len(self._items)} {counts} " + f"T_obs={self.T_obs} T_fut={self.T_fut} sr={self.sr}Hz " + f"input_dims={self._modality_dims} target_dim={self._target_dim}", + flush=True) + + @staticmethod + def _enforce_dim(arrs, m, arr, dim_dict): + if m in dim_dict: + target = dim_dict[m] + if arr.shape[1] != target: + if arr.shape[1] < target: + pad = np.zeros((arr.shape[0], target - arr.shape[1]), dtype=np.float32) + arrs[m] = np.concatenate([arr, pad], axis=1) + else: + arrs[m] = arr[:, :target] + else: + dim_dict[m] = arr.shape[1] + + def _compute_input_stats(self): + accs = {m: [] for m in self._modality_dims} + for it in self._items: + for m, w in it["x"].items(): + accs[m].append(w) + out = {} + for m, ws in accs.items(): + cat = np.concatenate(ws, axis=0) + mu = cat.mean(axis=0).astype(np.float32) + sd = cat.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd) + out[m] = (mu, sd.astype(np.float32)) + return out + + def _apply_input_stats(self, stats): + for it in self._items: + for m, w in it["x"].items(): + if m in stats: + mu, sd = stats[m] + it["x"][m] = ((w - mu) / sd).astype(np.float32) + + def _compute_target_stats(self): + ys = np.concatenate([it["y"] for it in self._items], axis=0) + mu = ys.mean(axis=0).astype(np.float32) + sd = ys.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd) + return (mu, sd.astype(np.float32)) + + def _apply_target_stats(self, stats): + mu, sd = stats + for it in self._items: + it["y"] = ((it["y"] - mu) / sd).astype(np.float32) + it["y_last"] = ((it["y_last"] - mu) / sd).astype(np.float32) + + def _compute_fp_stats(self): + fps = np.concatenate([it["fp"] for it in self._items], axis=0) + mu = fps.mean(axis=0).astype(np.float32) + sd = fps.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd) + return (mu, sd.astype(np.float32)) + + def _apply_fp_stats(self, stats): + mu, sd = stats + for it in self._items: + it["fp"] = ((it["fp"] - mu) / sd).astype(np.float32) + + def __len__(self): + return len(self._items) + + def __getitem__(self, idx): + it = self._items[idx] + x = {m: torch.from_numpy(np.ascontiguousarray(w)) for m, w in it["x"].items()} + y = torch.from_numpy(np.ascontiguousarray(it["y"])) # (T_fut, target_dim) + y_last = torch.from_numpy(np.ascontiguousarray(it["y_last"])) # (target_dim,) + et = int(it["event_type"]) + if self.include_future_pressure: + fp = torch.from_numpy(np.ascontiguousarray(it["fp"])) # (T_fut, 50) + return x, y, y_last, fp, et, it["meta"] + return x, y, y_last, et, it["meta"] + + @property + def modality_dims(self): + return dict(self._modality_dims) + + @property + def target_dim(self): + return self._target_dim + + +def collate_signal_forecast(batch): + if len(batch[0]) == 6: # has future pressure + xs, ys, ylasts, fps, ets, metas = zip(*batch) + mods = list(xs[0].keys()) + x_out = {m: torch.stack([x[m] for x in xs], dim=0) for m in mods} + y_out = torch.stack(ys, dim=0) + yl_out = torch.stack(ylasts, dim=0) + fp_out = torch.stack(fps, dim=0) # (B, T_fut, 50) + et_out = torch.tensor(ets, dtype=torch.long) + return x_out, y_out, yl_out, fp_out, et_out, list(metas) + xs, ys, ylasts, ets, metas = zip(*batch) + mods = list(xs[0].keys()) + x_out = {m: torch.stack([x[m] for x in xs], dim=0) for m in mods} + y_out = torch.stack(ys, dim=0) + yl_out = torch.stack(ylasts, dim=0) + et_out = torch.tensor(ets, dtype=torch.long) + return x_out, y_out, yl_out, et_out, list(metas) + + +def build_signal_train_test( + input_modalities, target_modality, + t_obs_sec=1.5, t_fut_sec=0.5, anchor_stride_sec=0.25, + downsample=5, + dataset_dir=DEFAULT_DATASET_DIR, annot_dir=DEFAULT_ANNOT_DIR, + contact_threshold_g=5.0, per_event_max=None, + include_future_pressure=False, + rng_seed=0, +): + train = SignalForecastDataset( + TRAIN_VOLS_V3, input_modalities=input_modalities, + target_modality=target_modality, + t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, + anchor_stride_sec=anchor_stride_sec, downsample=downsample, + dataset_dir=dataset_dir, annot_dir=annot_dir, + contact_threshold_g=contact_threshold_g, per_event_max=per_event_max, + include_future_pressure=include_future_pressure, + rng_seed=rng_seed, log=True, + ) + test = SignalForecastDataset( + TEST_VOLS_V3, input_modalities=input_modalities, + target_modality=target_modality, + t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, + anchor_stride_sec=anchor_stride_sec, downsample=downsample, + dataset_dir=dataset_dir, annot_dir=annot_dir, + contact_threshold_g=contact_threshold_g, per_event_max=per_event_max, + input_stats=train._input_stats, target_stats=train._target_stats, + future_pressure_stats=train._fp_stats, + expected_input_dims=train._modality_dims, + expected_target_dim=train._target_dim, + include_future_pressure=include_future_pressure, + rng_seed=rng_seed + 1, log=True, + ) + return train, test + + +if __name__ == "__main__": + import argparse + ap = argparse.ArgumentParser() + ap.add_argument("--input_modalities", default="imu") + ap.add_argument("--target_modality", default="imu") + ap.add_argument("--t_obs", type=float, default=1.5) + ap.add_argument("--t_fut", type=float, default=0.5) + args = ap.parse_args() + tr, te = build_signal_train_test( + input_modalities=args.input_modalities.split(","), + target_modality=args.target_modality, + t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + ) + x, y, y_last, et, meta = tr[0] + print(f"Sample: x={ {m: tuple(v.shape) for m,v in x.items()} } y={tuple(y.shape)} y_last={tuple(y_last.shape)} event_type={et}") diff --git a/experiments/nets/__init__.py b/experiments/nets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/experiments/nets/__pycache__/models_seqpred.cpython-312.pyc b/experiments/nets/__pycache__/models_seqpred.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a93d10beed63a4ad2bae85a948c8571aa4767796 Binary files /dev/null and b/experiments/nets/__pycache__/models_seqpred.cpython-312.pyc differ diff --git a/experiments/nets/baselines_published/__init__.py b/experiments/nets/baselines_published/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/experiments/nets/baselines_published/baselines.py b/experiments/nets/baselines_published/baselines.py new file mode 100644 index 0000000000000000000000000000000000000000..68274ded21f4330c81103190a5eea912961c205f --- /dev/null +++ b/experiments/nets/baselines_published/baselines.py @@ -0,0 +1,488 @@ +""" +Published baselines for T1 Scene Recognition, reproduced on DailyAct-5M. + +Each method accepts a concatenated feature tensor (B, T, F_total) where F_total +is the sum of the active modality dims; the per-modality slices are recorded in +the `modality_dims` dict. Each method then uses the subset of modalities its +original paper intended. + +All methods output an (B, num_classes) logit tensor. +""" +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def _slice(x, mod_dims, wanted): + """Slice the concatenated feature tensor to keep only `wanted` modalities, + in the order given. mod_dims is an ordered dict. Returns + {name: tensor(B,T,d_name)} plus the concat.""" + parts = {} + offset = 0 + for name, d in mod_dims.items(): + if name in wanted: + parts[name] = x[..., offset:offset + d] + offset += d + assert len(parts) > 0, f"None of {wanted} in {list(mod_dims.keys())}" + return parts + + +# --------------------------------------------------------------------------- +# 1) ST-GCN (Yan et al., AAAI 2018) +# Spatio-temporal graph CNN for skeleton action recognition. +# We treat the 56-joint MoCap skeleton as the graph. +# --------------------------------------------------------------------------- + +class STGCNBlock(nn.Module): + def __init__(self, in_ch, out_ch, n_joints, stride=1, dropout=0.2): + super().__init__() + # Spatial graph conv: learnable adjacency (fully learned, no handcrafted A) + self.A = nn.Parameter(torch.eye(n_joints) + 0.1 * torch.randn(n_joints, n_joints)) + self.spatial = nn.Conv2d(in_ch, out_ch, kernel_size=(1, 1), bias=False) + self.spatial_bn = nn.BatchNorm2d(out_ch) + self.temporal = nn.Conv2d(out_ch, out_ch, kernel_size=(9, 1), + padding=(4, 0), stride=(stride, 1)) + self.temporal_bn = nn.BatchNorm2d(out_ch) + self.dropout = nn.Dropout(dropout) + if in_ch != out_ch or stride != 1: + self.res = nn.Conv2d(in_ch, out_ch, kernel_size=1, + stride=(stride, 1)) + else: + self.res = nn.Identity() + + def forward(self, x): + # x: (B, C, T, V) + res = self.res(x) + # spatial: aggregate along joints via A + h = self.spatial(x) + h = torch.einsum('bctv,vw->bctw', h, F.softmax(self.A, dim=-1)) + h = self.spatial_bn(h) + h = F.relu(h) + # temporal + h = self.temporal(h) + h = self.temporal_bn(h) + h = self.dropout(h) + return F.relu(h + res) + + +class STGCN(nn.Module): + """ST-GCN on MoCap skeleton. We assume the MoCap modality is 620-dim + (hip-relative + velocity) and reshape to ~56 joints.""" + def __init__(self, feat_dim_mocap, num_classes, hidden=64, n_joints=52): + super().__init__() + self.n_joints = n_joints + # MoCap feat is (T, 620). 52 joints × 4 (xyz+quat_type), or we take per-joint xyz-only = 156. + # In this repo, 620 = 52 markers * 4 cols + velocity features. We'll + # reshape by slicing to 3*52=156 "primary" coords, padded if needed. + self.coord_dim = 3 # we'll treat each joint as having 3 coords (XYZ) + self.proj_in = nn.Linear(feat_dim_mocap, n_joints * self.coord_dim) + + self.blocks = nn.ModuleList([ + STGCNBlock(self.coord_dim, hidden, n_joints), + STGCNBlock(hidden, hidden, n_joints), + STGCNBlock(hidden, hidden * 2, n_joints, stride=2), + STGCNBlock(hidden * 2, hidden * 2, n_joints), + STGCNBlock(hidden * 2, hidden * 4, n_joints, stride=2), + STGCNBlock(hidden * 4, hidden * 4, n_joints), + ]) + self.head = nn.Sequential( + nn.Dropout(0.3), + nn.Linear(hidden * 4, num_classes), + ) + + def forward(self, x_mocap, mask=None): + # x_mocap: (B, T, feat_dim_mocap) + B, T, _ = x_mocap.shape + h = self.proj_in(x_mocap) # (B, T, n_joints * 3) + h = h.reshape(B, T, self.n_joints, self.coord_dim).permute(0, 3, 1, 2) # (B, C, T, V) + for blk in self.blocks: + h = blk(h) + # Global mean pool over time & joints (with mask if provided) + if mask is not None: + # mask: (B, T), h: (B, C, T', V) where T' may be < T due to stride + T_ = h.shape[2] + m = mask[:, :T_].float().unsqueeze(1).unsqueeze(-1) # (B, 1, T', 1) + h = (h * m).sum(dim=(2, 3)) / (m.sum(dim=(2, 3)) * h.shape[3] + 1e-8) + else: + h = h.mean(dim=(2, 3)) + return self.head(h) + + +# --------------------------------------------------------------------------- +# 2) CTR-GCN (Chen et al., ICCV 2021) +# Channel-wise Topology Refinement GCN — learns a separate adjacency +# matrix per channel group, known as SOTA for skeleton action recognition. +# --------------------------------------------------------------------------- + +class CTRGC(nn.Module): + """Simplified CTR-GC block: learnable per-channel topology refinement.""" + def __init__(self, in_ch, out_ch, n_joints, rel_reduction=4): + super().__init__() + self.n_joints = n_joints + self.conv1 = nn.Conv2d(in_ch, out_ch // rel_reduction, 1) + self.conv2 = nn.Conv2d(in_ch, out_ch // rel_reduction, 1) + self.conv3 = nn.Conv2d(in_ch, out_ch, 1) + self.alpha = nn.Parameter(torch.zeros(1)) + self.A = nn.Parameter(torch.eye(n_joints) + 0.1 * torch.randn(n_joints, n_joints)) + + def forward(self, x): + # x: (B, C, T, V) + q = self.conv1(x).mean(dim=2) # (B, C', V) + k = self.conv2(x).mean(dim=2) # (B, C', V) + v = self.conv3(x) # (B, C_out, T, V) + # Channel-specific topology refinement + topology = F.softmax(torch.tanh(q.unsqueeze(-1) - k.unsqueeze(-2)), dim=-1) + # topology: (B, C', V, V); we average across channels to get a shared (B, V, V) + topology = topology.mean(dim=1) + A = self.A.unsqueeze(0) + self.alpha * topology + # apply A to v + out = torch.einsum('bctv,bvw->bctw', v, A) + return out + + +class CTRGCNBlock(nn.Module): + def __init__(self, in_ch, out_ch, n_joints, stride=1): + super().__init__() + self.gc = CTRGC(in_ch, out_ch, n_joints) + self.bn = nn.BatchNorm2d(out_ch) + self.tcn = nn.Sequential( + nn.Conv2d(out_ch, out_ch, (9, 1), padding=(4, 0), stride=(stride, 1)), + nn.BatchNorm2d(out_ch), + ) + if in_ch != out_ch or stride != 1: + self.res = nn.Conv2d(in_ch, out_ch, 1, stride=(stride, 1)) + else: + self.res = nn.Identity() + + def forward(self, x): + res = self.res(x) + h = self.gc(x) + h = self.bn(h) + h = F.relu(h) + h = self.tcn(h) + return F.relu(h + res) + + +class CTRGCN(nn.Module): + def __init__(self, feat_dim_mocap, num_classes, hidden=64, n_joints=52): + super().__init__() + self.n_joints = n_joints + self.coord_dim = 3 + self.proj_in = nn.Linear(feat_dim_mocap, n_joints * self.coord_dim) + self.blocks = nn.ModuleList([ + CTRGCNBlock(self.coord_dim, hidden, n_joints), + CTRGCNBlock(hidden, hidden, n_joints), + CTRGCNBlock(hidden, hidden * 2, n_joints, stride=2), + CTRGCNBlock(hidden * 2, hidden * 4, n_joints, stride=2), + ]) + self.head = nn.Sequential( + nn.Dropout(0.3), + nn.Linear(hidden * 4, num_classes), + ) + + def forward(self, x_mocap, mask=None): + B, T, _ = x_mocap.shape + h = self.proj_in(x_mocap) + h = h.reshape(B, T, self.n_joints, self.coord_dim).permute(0, 3, 1, 2) + for blk in self.blocks: + h = blk(h) + h = h.mean(dim=(2, 3)) + return self.head(h) + + +# --------------------------------------------------------------------------- +# 3) LIMU-BERT (Xu et al., SenSys 2021) +# IMU self-supervised pretraining via masked reconstruction + fine-tune. +# We implement a simpler variant: BERT-style encoder with optional +# pretraining head. +# --------------------------------------------------------------------------- + +class LIMUBertEncoder(nn.Module): + def __init__(self, feat_dim_imu, hidden=128, n_layers=4, n_heads=4, dropout=0.1): + super().__init__() + self.in_proj = nn.Linear(feat_dim_imu, hidden) + self.pos = nn.Parameter(torch.zeros(1, 4096, hidden)) + nn.init.trunc_normal_(self.pos, std=0.02) + layer = nn.TransformerEncoderLayer( + d_model=hidden, nhead=n_heads, dim_feedforward=4 * hidden, + dropout=dropout, batch_first=True, activation='gelu', + ) + self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers) + + def forward(self, x, mask): + T = x.size(1) + h = self.in_proj(x) + self.pos[:, :T, :] + h = self.encoder(h, src_key_padding_mask=~mask) + return h + + +class LIMUBert(nn.Module): + """Supervised-only variant: encoder + classifier head. Paper's + pretraining is a masked-recon objective; for simplicity we report the + supervised-only baseline here.""" + def __init__(self, feat_dim_imu, num_classes, hidden=128, n_layers=4, + n_heads=4, dropout=0.1): + super().__init__() + self.encoder = LIMUBertEncoder(feat_dim_imu, hidden, n_layers, n_heads, dropout) + self.head = nn.Sequential( + nn.LayerNorm(hidden), + nn.Dropout(dropout), + nn.Linear(hidden, num_classes), + ) + + def forward(self, x_imu, mask): + h = self.encoder(x_imu, mask) + m = mask.unsqueeze(-1).float() + pooled = (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# 4) EMG-CNN (standard 1D CNN baseline from sEMG classification literature) +# E.g. Atzori et al. — multi-layer CNN with moving-window input. +# --------------------------------------------------------------------------- + +class EMGCNN(nn.Module): + def __init__(self, feat_dim_emg, num_classes, hidden=64): + super().__init__() + self.cnn = nn.Sequential( + nn.Conv1d(feat_dim_emg, hidden, 7, padding=3), + nn.BatchNorm1d(hidden), nn.ReLU(), nn.Dropout(0.3), + nn.Conv1d(hidden, hidden * 2, 5, padding=2), + nn.BatchNorm1d(hidden * 2), nn.ReLU(), nn.Dropout(0.3), + nn.Conv1d(hidden * 2, hidden * 4, 3, padding=1), + nn.BatchNorm1d(hidden * 4), nn.ReLU(), + ) + self.head = nn.Linear(hidden * 4, num_classes) + + def forward(self, x_emg, mask): + # (B, T, 8) -> (B, 8, T) for conv1d + h = self.cnn(x_emg.transpose(1, 2)) + # Masked pool + m = mask.unsqueeze(1).float() + T_ = h.size(2) + if m.size(2) != T_: + m = F.adaptive_avg_pool1d(m, T_) + m = (m > 0.5).float() + pooled = (h * m).sum(dim=2) / m.sum(dim=2).clamp(min=1.0) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# 5) ActionSense baseline (DelPreto et al., NeurIPS '22) +# Simple 3-layer MLP per modality + shared LSTM + classifier. +# --------------------------------------------------------------------------- + +class ActionSenseLSTM(nn.Module): + def __init__(self, modality_dims: dict, num_classes, hidden=128): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = modality_dims + self.per_mod = nn.ModuleDict({ + name: nn.Sequential( + nn.Linear(d, hidden), nn.ReLU(), nn.Dropout(0.2), + nn.Linear(hidden, hidden), nn.ReLU(), + ) for name, d in modality_dims.items() + }) + concat_dim = hidden * len(modality_dims) + self.lstm = nn.LSTM(concat_dim, hidden, num_layers=2, + batch_first=True, bidirectional=True, dropout=0.2) + self.head = nn.Linear(hidden * 2, num_classes) + + def forward(self, x, mask): + # x: (B, T, F_total), slice by modality + offset = 0 + feats = [] + for name in self.mod_names: + d = self.mod_dims[name] + x_m = x[..., offset:offset + d] + offset += d + feats.append(self.per_mod[name](x_m)) + h = torch.cat(feats, dim=-1) # (B, T, hidden * M) + h, _ = self.lstm(h) + m = mask.unsqueeze(-1).float() + pooled = (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# 6) MulT (Multimodal Transformer, Tsai et al., ACL 2019) +# Core idea: cross-modal attention between every pair of modalities. +# For a 3-modality input (A, B, C), produce +# {A->B, A->C, B->A, B->C, C->A, C->B} via directed cross-attention. +# --------------------------------------------------------------------------- + +class CrossModalTransformer(nn.Module): + def __init__(self, d_model, n_heads=4, n_layers=2, dropout=0.1): + super().__init__() + self.layers = nn.ModuleList([ + nn.TransformerDecoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation='gelu', + ) for _ in range(n_layers) + ]) + + def forward(self, q, kv, q_mask, kv_mask): + # q: (B, T_q, D), kv: (B, T_kv, D) + h = q + for layer in self.layers: + h = layer(h, kv, + tgt_key_padding_mask=~q_mask, + memory_key_padding_mask=~kv_mask) + return h + + +class MulT(nn.Module): + """Multimodal Transformer. Uses MoCap + EMG + IMU as 3 modalities + (EyeTrack/Pressure omitted to match original 3-mod paper design).""" + def __init__(self, modality_dims: dict, num_classes, d_model=128, + n_layers=2, n_heads=4, dropout=0.1): + super().__init__() + self.mod_names = [m for m in ['mocap', 'emg', 'imu'] if m in modality_dims] + if len(self.mod_names) < 2: + self.mod_names = list(modality_dims.keys())[:3] + self.mod_dims = {m: modality_dims[m] for m in self.mod_names} + self.in_proj = nn.ModuleDict({ + m: nn.Linear(d, d_model) for m, d in self.mod_dims.items() + }) + # Pairwise cross-attention + self.cross = nn.ModuleDict({ + f"{a}_to_{b}": CrossModalTransformer(d_model, n_heads, n_layers, dropout) + for a in self.mod_names for b in self.mod_names if a != b + }) + # Self-attention after cross + self.self_tx = nn.ModuleDict({ + m: nn.TransformerEncoder( + nn.TransformerEncoderLayer( + d_model=d_model, nhead=n_heads, + dim_feedforward=4 * d_model, dropout=dropout, + batch_first=True, activation='gelu', + ), num_layers=1, + ) for m in self.mod_names + }) + total_dim = d_model * len(self.mod_names) * len(self.mod_names) + self.head = nn.Sequential( + nn.LayerNorm(total_dim), + nn.Dropout(dropout), + nn.Linear(total_dim, num_classes), + ) + + def forward(self, x, mask): + # Slice modalities from x + offset = 0 + projs = {} + # Walk through all known mod_dims to find offsets + # We need the FULL modality_dims order, which we don't have here; + # expect caller to already supply x with exactly mod_names in order. + # Workaround: assume caller passes mod_names order matching projection. + for m in self.mod_names: + d = self.mod_dims[m] + projs[m] = self.in_proj[m](x[..., offset:offset + d]) + offset += d + + # Cross-attention: each modality attends to each other + fused = {m: [] for m in self.mod_names} + for a in self.mod_names: + for b in self.mod_names: + if a == b: + fused[a].append(projs[a]) + else: + out = self.cross[f"{a}_to_{b}"](projs[a], projs[b], mask, mask) + fused[a].append(out) + + # Self-attention + pool per modality + pooled = [] + for a in self.mod_names: + # Concat all attended-to representations along feature dim + cat = torch.cat(fused[a], dim=-1) # (B, T, D * M) + # Actually re-project back to D per stream, then self-attn on stacked + # Simplified: self-attention over concatenated, pool, flatten + # Here we just pool each separately + for i, rep in enumerate(fused[a]): + rep = self.self_tx[a](rep) + m = mask.unsqueeze(-1).float() + p = (rep * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0) + pooled.append(p) + + h = torch.cat(pooled, dim=-1) + return self.head(h) + + +# --------------------------------------------------------------------------- +# 7) Perceiver IO (Jaegle et al., ICML 2021) +# Cross-attention from a fixed-size latent query set to all input tokens, +# repeated for a few iterations. +# --------------------------------------------------------------------------- + +class PerceiverBlock(nn.Module): + def __init__(self, latent_dim, n_heads, dropout): + super().__init__() + self.ca = nn.MultiheadAttention( + latent_dim, n_heads, dropout=dropout, batch_first=True, + ) + self.norm1 = nn.LayerNorm(latent_dim) + self.sa = nn.TransformerEncoderLayer( + d_model=latent_dim, nhead=n_heads, + dim_feedforward=4 * latent_dim, dropout=dropout, + batch_first=True, activation='gelu', + ) + + def forward(self, latents, inputs, input_kpm): + # Cross-attn: latents attend to inputs + h, _ = self.ca(latents, inputs, inputs, key_padding_mask=input_kpm) + latents = self.norm1(latents + h) + # Self-attn on latents + latents = self.sa(latents) + return latents + + +class PerceiverIO(nn.Module): + """Perceiver with N learnable latent queries; supports any modality mix.""" + def __init__(self, modality_dims: dict, num_classes, + latent_dim=128, n_latents=32, n_layers=3, n_heads=4, dropout=0.1): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = modality_dims + # Per-modality input projection to latent_dim, with modality-id embedding + self.in_proj = nn.ModuleDict({ + m: nn.Linear(d, latent_dim) for m, d in modality_dims.items() + }) + self.mod_emb = nn.Parameter(torch.randn(len(self.mod_names), latent_dim) * 0.02) + # Positional encoding (shared) + self.pos = nn.Parameter(torch.zeros(1, 4096, latent_dim)) + nn.init.trunc_normal_(self.pos, std=0.02) + # Learnable latents + self.latents = nn.Parameter(torch.randn(n_latents, latent_dim) * 0.02) + self.blocks = nn.ModuleList([ + PerceiverBlock(latent_dim, n_heads, dropout) for _ in range(n_layers) + ]) + self.head = nn.Sequential( + nn.LayerNorm(latent_dim), + nn.Linear(latent_dim, num_classes), + ) + + def forward(self, x, mask): + B, T, _ = x.shape + # Project each modality + add modality embedding + offset = 0 + tokens = [] + for i, m in enumerate(self.mod_names): + d = self.mod_dims[m] + tok = self.in_proj[m](x[..., offset:offset + d]) # (B, T, D) + tok = tok + self.mod_emb[i] + offset += d + tokens.append(tok) + # Concatenate along TIME dim, add shared pos enc per-modality + # Each modality gets its own time sequence concatenated + # Simpler: sum across modalities (like early fusion in latent space) + pos + h = torch.stack(tokens, dim=2).mean(dim=2) # (B, T, D) + h = h + self.pos[:, :T, :] + input_kpm = ~mask # (B, T), True = ignore + # Iterative cross-attention + latents = self.latents.unsqueeze(0).expand(B, -1, -1) # (B, N, D) + for blk in self.blocks: + latents = blk(latents, h, input_kpm) + # Mean-pool latents + pooled = latents.mean(dim=1) + return self.head(pooled) diff --git a/experiments/nets/baselines_published/syncfuse.py b/experiments/nets/baselines_published/syncfuse.py new file mode 100644 index 0000000000000000000000000000000000000000..cdb7476df8e267cf5983a47a20fbb19ad7fbff73 --- /dev/null +++ b/experiments/nets/baselines_published/syncfuse.py @@ -0,0 +1,270 @@ +""" +SyncFuse — our proposed method for T1 scene recognition. + +Four components (all toggleable via args for ablation): + + (1) Modality dropout: per-sample independent Bernoulli(p=0.3) drop on each + modality during training; at test time all modalities + are active. Keeps at least 1 modality. + (2) Pretrained transfer: each per-modality backbone is optionally loaded from + an independently pretrained single-modality + checkpoint and frozen during fine-tuning. + (3) Cross-modal temporal-shift attention: + a late cross-attention block where EMG queries + attend to MoCap keys/values at a LEARNED temporal + offset Δ (Gumbel-softmax over {-10,...,+10} bins at + 20 Hz = ±500 ms). Motivated by the paper's case-study + finding (EMG leads motion by ~20 ms sub-frame). + (4) Learnable late fusion: + per-modality classifier logits are combined with a + learnable softmax-weighted average (temperature is + also learned). Equivalent to `late_agg='learned'` + in the repo's existing LateFusionModel. +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +import random + + +def masked_mean(x, mask): + m = mask.unsqueeze(-1).float() + return (x * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0) + + +# --------------------------------------------------------------------------- +# Per-modality Transformer branch (same as repo's TransformerBackbone) +# --------------------------------------------------------------------------- + +class ModTransformer(nn.Module): + def __init__(self, feat_dim, hidden=128, n_layers=2, n_heads=4, dropout=0.1): + super().__init__() + self.in_proj = nn.Linear(feat_dim, hidden) + self.pos = nn.Parameter(torch.zeros(1, 4096, hidden)) + nn.init.trunc_normal_(self.pos, std=0.02) + layer = nn.TransformerEncoderLayer( + d_model=hidden, nhead=n_heads, dim_feedforward=4 * hidden, + dropout=dropout, batch_first=True, activation='gelu', + ) + self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers) + self.output_dim = hidden + + def forward(self, x, mask): + # x: (B, T, feat_dim) + T = x.size(1) + h = self.in_proj(x) + self.pos[:, :T, :] + h = self.encoder(h, src_key_padding_mask=~mask) + return h # (B, T, hidden) — token-level, NOT pooled + + +# --------------------------------------------------------------------------- +# (3) Cross-modal temporal-shift attention +# --------------------------------------------------------------------------- + +class TemporalShiftAttention(nn.Module): + """Multi-head attention where queries are temporally shifted by a learned + offset Δ from the keys. Δ is drawn from a discrete set {-3,...,+3} via + straight-through Gumbel-softmax: we sample ONE shift per forward pass, + but the softmax weights flow gradient back through shift_logits. + + At 20 Hz bins, ±3 ≈ ±150 ms, which brackets the paper's ~20 ms EMG-motion + lead. Memory cost is ~1 attention pass (not 7).""" + def __init__(self, d_model, n_heads=4, dropout=0.1, max_shift=3, + gumbel_tau=1.0): + super().__init__() + self.max_shift = max_shift + self.shifts = list(range(-max_shift, max_shift + 1)) + self.shift_logits = nn.Parameter(torch.zeros(len(self.shifts))) + self.tau = gumbel_tau + self.attn = nn.MultiheadAttention( + d_model, n_heads, dropout=dropout, batch_first=True, + ) + self.norm = nn.LayerNorm(d_model) + + def _shift_tensor(self, x, shift, mask): + if shift == 0: + return x, mask + B, T, D = x.shape + if shift > 0: + pad = torch.zeros(B, shift, D, device=x.device, dtype=x.dtype) + x_s = torch.cat([x[:, shift:, :], pad], dim=1) + m_s = torch.cat([mask[:, shift:], + torch.zeros(B, shift, device=mask.device, dtype=torch.bool)], + dim=1) + else: + s = -shift + pad = torch.zeros(B, s, D, device=x.device, dtype=x.dtype) + x_s = torch.cat([pad, x[:, :-s, :]], dim=1) + m_s = torch.cat([torch.zeros(B, s, device=mask.device, dtype=torch.bool), + mask[:, :-s]], dim=1) + return x_s, m_s + + def forward(self, q_tokens, kv_tokens, q_mask, kv_mask, hard=False): + if hard or not self.training: + # Eval: take the argmax shift + with torch.no_grad(): + idx = self.shift_logits.argmax().item() + shift = self.shifts[idx] + shifted_kv, shifted_mask = self._shift_tensor(kv_tokens, shift, kv_mask) + out, _ = self.attn(q_tokens, shifted_kv, shifted_kv, + key_padding_mask=~shifted_mask) + return self.norm(q_tokens + out) + + # Training: straight-through Gumbel-softmax to sample 1 shift, + # with gradient flowing via softmax weights. + one_hot = F.gumbel_softmax(self.shift_logits, tau=self.tau, hard=True) + # pick the sampled shift (argmax of the hard one-hot) + idx = int(one_hot.argmax().item()) + shift = self.shifts[idx] + shifted_kv, shifted_mask = self._shift_tensor(kv_tokens, shift, kv_mask) + out, _ = self.attn(q_tokens, shifted_kv, shifted_kv, + key_padding_mask=~shifted_mask) + # scale out by the corresponding soft weight to let gradient flow + out = out * one_hot[idx] + return self.norm(q_tokens + out) + + +# --------------------------------------------------------------------------- +# SyncFuse main model +# --------------------------------------------------------------------------- + +class SyncFuse(nn.Module): + def __init__(self, modality_dims: dict, num_classes, hidden=128, n_heads=4, + n_layers=2, dropout=0.1, + use_xmod_shift=True, use_learned_late=True): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = modality_dims + self.use_xmod_shift = use_xmod_shift + self.use_learned_late = use_learned_late + + self.branches = nn.ModuleDict({ + m: ModTransformer(d, hidden, n_layers, n_heads, dropout) + for m, d in modality_dims.items() + }) + self.classifiers = nn.ModuleDict({ + m: nn.Sequential(nn.LayerNorm(hidden), nn.Dropout(dropout), + nn.Linear(hidden, num_classes)) + for m in self.mod_names + }) + + # Cross-modal temporal-shift: apply to EMG branch attending to MoCap + # (and symmetrically MoCap->EMG), only when both modalities are present. + if use_xmod_shift and 'emg' in self.mod_names and 'mocap' in self.mod_names: + self.xmod_emg2mocap = TemporalShiftAttention(hidden, n_heads, dropout) + self.xmod_mocap2emg = TemporalShiftAttention(hidden, n_heads, dropout) + else: + self.xmod_emg2mocap = None + self.xmod_mocap2emg = None + + if use_learned_late: + self.late_logits = nn.Parameter(torch.zeros(len(self.mod_names))) + self.late_temperature = nn.Parameter(torch.ones(1)) + + def load_pretrained(self, pretrain_paths: dict, freeze=True): + """Load pretrained single-modality checkpoints into branches. + pretrain_paths: {modality_name: path_to_checkpoint_state_dict}.""" + import torch as _torch + for m, path in pretrain_paths.items(): + if m not in self.branches: + continue + try: + sd = _torch.load(path, weights_only=True, map_location='cpu') + except TypeError: + sd = _torch.load(path, map_location='cpu') + # Map SingleModel keys ("backbone.X.*") -> branch keys + mapped = {} + for k, v in sd.items(): + if k.startswith('backbone.'): + new_k = k.replace('backbone.', '') + if new_k in self.branches[m].state_dict(): + mapped[new_k] = v + if mapped: + self.branches[m].load_state_dict(mapped, strict=False) + if freeze: + for p in self.branches[m].parameters(): + p.requires_grad = False + print(f" [SyncFuse] loaded {len(mapped)} tensors into branch '{m}' (frozen={freeze})") + + def forward(self, x, mask, mod_dropout_p=0.0, training_time=True): + """ + x: (B, T, F_total) concatenated features + mask: (B, T) + mod_dropout_p: probability of dropping each modality (training only) + """ + B, T, _ = x.shape + + # Slice modality features + offset = 0 + feats = {} + for m in self.mod_names: + d = self.mod_dims[m] + feats[m] = x[..., offset:offset + d] + offset += d + + # (1) Modality dropout — per sample, independent per modality + active = {m: torch.ones(B, dtype=torch.bool, device=x.device) for m in self.mod_names} + if training_time and self.training and mod_dropout_p > 0: + drop_map = {m: (torch.rand(B, device=x.device) < mod_dropout_p) + for m in self.mod_names} + all_dropped = torch.stack([drop_map[m] for m in self.mod_names], dim=0).all(dim=0) # (B,) + if all_dropped.any(): + # for all-dropped samples, un-drop one random modality + rescue_idx = torch.randint(0, len(self.mod_names), + (all_dropped.sum().item(),), + device=x.device) + mod_name_tensor = self.mod_names # python list + j = 0 + for b in range(B): + if all_dropped[b]: + r = mod_name_tensor[rescue_idx[j].item()] + drop_map[r][b] = False + j += 1 + for m in self.mod_names: + active[m] = ~drop_map[m] + # zero out dropped features for that branch + feats[m] = feats[m] * active[m].view(B, 1, 1).float() + + # Per-modality encoding + tokens = {} + for m in self.mod_names: + tokens[m] = self.branches[m](feats[m], mask) # (B, T, hidden) + + # (3) Cross-modal temporal-shift (bidirectional EMG <-> MoCap) + if self.xmod_emg2mocap is not None: + tokens['emg'] = self.xmod_emg2mocap( + tokens['emg'], tokens['mocap'], mask, mask, + hard=not self.training, + ) + tokens['mocap'] = self.xmod_mocap2emg( + tokens['mocap'], tokens['emg'], mask, mask, + hard=not self.training, + ) + + # Pool and classify per modality + logits_per = [] + for m in self.mod_names: + pooled = masked_mean(tokens[m], mask) + logits_per.append(self.classifiers[m](pooled)) + stacked = torch.stack(logits_per, dim=0) # (M, B, C) + + # Mask out logits from dropped modalities (so they don't dominate) + if training_time and self.training and mod_dropout_p > 0: + act_mask = torch.stack([active[m].float() for m in self.mod_names], dim=0) # (M, B) + # Re-normalize weights across active modalities + if self.use_learned_late: + w = F.softmax(self.late_logits / self.late_temperature.clamp(min=0.1), dim=0) + w = w.view(-1, 1) * act_mask # (M, B) + w = w / w.sum(dim=0, keepdim=True).clamp(min=1e-6) + out = (stacked * w.unsqueeze(-1)).sum(dim=0) + else: + w = act_mask / act_mask.sum(dim=0, keepdim=True).clamp(min=1e-6) + out = (stacked * w.unsqueeze(-1)).sum(dim=0) + else: + # (4) Learnable late fusion (or simple mean) + if self.use_learned_late: + w = F.softmax(self.late_logits / self.late_temperature.clamp(min=0.1), dim=0) + out = (stacked * w.view(-1, 1, 1)).sum(dim=0) + else: + out = stacked.mean(dim=0) + return out diff --git a/experiments/nets/models.py b/experiments/nets/models.py new file mode 100644 index 0000000000000000000000000000000000000000..2e723f4350971c74264db70fff958f592aa41eb5 --- /dev/null +++ b/experiments/nets/models.py @@ -0,0 +1,648 @@ +""" +Model definitions for Experiment 1: Scene Recognition. +Backbones: CNN1D, BiLSTM, Transformer +Fusion: Early (default), Late, Attention, WeightedLate, GatedLate, Stacking, Product, MoE + +Supports optional per-modality projection via proj_dim parameter: + proj_dim > 0: project each modality to proj_dim before backbone + proj_dim = 0: no projection, use raw features (original behavior) +""" + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# ============================================================ +# Per-modality projection +# ============================================================ + +class ModalityProjector(nn.Module): + """Project each modality from its raw dimension to proj_dim.""" + + def __init__(self, modality_dims, proj_dim): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + self.proj_dim = proj_dim + self.projectors = nn.ModuleList() + for dim in self.mod_dims: + self.projectors.append(nn.Sequential( + nn.Linear(dim, proj_dim), + nn.LayerNorm(proj_dim), + nn.ReLU(), + )) + + @property + def output_dim(self): + return self.proj_dim * len(self.mod_dims) + + def forward(self, x): + """x: (B, T, total_raw_dim) -> (B, T, proj_dim * M)""" + parts = [] + offset = 0 + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + parts.append(self.projectors[i](x_mod)) + return torch.cat(parts, dim=-1) + + +# ============================================================ +# Per-modality hidden dim scaling (used when proj_dim=0) +# ============================================================ + +def _compute_per_modality_hidden(mod_dim, base_hidden_dim): + if mod_dim >= 128: + return max(base_hidden_dim, 48) + elif mod_dim >= 32: + return base_hidden_dim + else: + return max(16, base_hidden_dim // 2) + + +# ============================================================ +# Backbones +# ============================================================ + +class CNN1DBackbone(nn.Module): + def __init__(self, input_dim, hidden_dim=128): + super().__init__() + self.conv1 = nn.Sequential( + nn.Conv1d(input_dim, 64, kernel_size=7, padding=3), + nn.BatchNorm1d(64), nn.ReLU(), nn.Dropout(0.1), + ) + self.conv2 = nn.Sequential( + nn.Conv1d(64, 128, kernel_size=5, padding=2), + nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.1), + ) + self.conv3 = nn.Sequential( + nn.Conv1d(128, hidden_dim, kernel_size=3, padding=1), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), + ) + self.output_dim = hidden_dim + + def forward(self, x, mask=None): + x = x.permute(0, 2, 1) + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + if mask is not None: + x = (x * mask.unsqueeze(1).float()).sum(2) / mask.sum(1, keepdim=True).float().clamp(min=1) + else: + x = x.mean(2) + return x + + +class LSTMBackbone(nn.Module): + def __init__(self, input_dim, hidden_dim=128, num_layers=2, dropout=0.2): + super().__init__() + self.lstm = nn.LSTM( + input_dim, hidden_dim, num_layers=num_layers, + batch_first=True, bidirectional=True, + dropout=dropout if num_layers > 1 else 0, + ) + self.attn = nn.Linear(hidden_dim * 2, 1) + self.output_dim = hidden_dim * 2 + + def forward(self, x, mask=None): + out, _ = self.lstm(x) + scores = self.attn(out).squeeze(-1) + if mask is not None: + scores = scores.masked_fill(~mask, float('-inf')) + weights = torch.softmax(scores, dim=1) + out = (out * weights.unsqueeze(-1)).sum(dim=1) + return out + + +class TinyHARBackbone(nn.Module): + """TinyHAR backbone (Zhou et al., ISWC 2022 Best Paper). + + Lightweight model for human activity recognition from wearable sensors. + Uses multi-scale temporal convolutions + cross-channel interaction + temporal pooling. + + Input: (B, T, C) with optional mask + Output: (B, hidden_dim) + """ + + def __init__(self, input_dim, hidden_dim=128, num_scales=4): + super().__init__() + scale_dim = max(4, hidden_dim // num_scales) + actual_hidden = scale_dim * num_scales + + # Multi-scale temporal convolution feature extraction + self.convs = nn.ModuleList() + for i in range(num_scales): + ks = 2 * (i + 1) + 1 # kernel sizes: 3, 5, 7, 9 + self.convs.append(nn.Sequential( + nn.Conv1d(input_dim, scale_dim, kernel_size=ks, padding=ks // 2), + nn.BatchNorm1d(scale_dim), + nn.ReLU(), + )) + + # Cross-channel interaction via multi-head self-attention + nhead = max(1, min(4, actual_hidden // 8)) + # Ensure actual_hidden is divisible by nhead + while actual_hidden % nhead != 0 and nhead > 1: + nhead -= 1 + self.channel_attn = nn.MultiheadAttention( + actual_hidden, num_heads=nhead, batch_first=True, dropout=0.1, + ) + self.channel_norm = nn.LayerNorm(actual_hidden) + self.channel_ff = nn.Sequential( + nn.Linear(actual_hidden, actual_hidden), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(actual_hidden, actual_hidden), + ) + self.ff_norm = nn.LayerNorm(actual_hidden) + + # Temporal attention pooling + self.temporal_query = nn.Parameter(torch.randn(1, 1, actual_hidden) * 0.02) + self.temporal_attn = nn.MultiheadAttention( + actual_hidden, num_heads=1, batch_first=True, dropout=0.1, + ) + + self.output_dim = actual_hidden + + def forward(self, x, mask=None): + # x: (B, T, C) + B, T, C = x.shape + x_t = x.permute(0, 2, 1) # (B, C, T) + + # Multi-scale feature extraction + scale_features = [conv(x_t) for conv in self.convs] + x = torch.cat(scale_features, dim=1) # (B, actual_hidden, T) + x = x.permute(0, 2, 1) # (B, T, actual_hidden) + + # Cross-channel interaction + key_padding_mask = ~mask if mask is not None else None + attn_out, _ = self.channel_attn(x, x, x, key_padding_mask=key_padding_mask) + x = self.channel_norm(x + attn_out) + x = self.ff_norm(x + self.channel_ff(x)) + + # Temporal attention pooling + query = self.temporal_query.expand(B, -1, -1) # (B, 1, actual_hidden) + pooled, _ = self.temporal_attn(query, x, x, key_padding_mask=key_padding_mask) + return pooled.squeeze(1) # (B, actual_hidden) + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, dropout=0.1, max_len=5000): + super().__init__() + self.dropout = nn.Dropout(p=dropout) + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:, :x.size(1)] + return self.dropout(x) + + +class TransformerBackbone(nn.Module): + def __init__(self, input_dim, d_model=128, nhead=4, num_layers=2, dropout=0.1): + super().__init__() + self.input_proj = nn.Linear(input_dim, d_model) + self.pos_enc = PositionalEncoding(d_model, dropout=dropout) + encoder_layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=nhead, dim_feedforward=d_model * 4, + dropout=dropout, batch_first=True, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) + self.output_dim = d_model + + def forward(self, x, mask=None): + x = self.input_proj(x) + x = self.pos_enc(x) + src_key_padding_mask = ~mask if mask is not None else None + x = self.encoder(x, src_key_padding_mask=src_key_padding_mask) + if mask is not None: + x = (x * mask.unsqueeze(-1).float()).sum(1) / mask.sum(1, keepdim=True).float().clamp(min=1) + else: + x = x.mean(1) + return x + + +# ============================================================ +# Full models +# ============================================================ + +def get_backbone(name, input_dim, hidden_dim=128): + if name == 'cnn': + return CNN1DBackbone(input_dim, hidden_dim) + elif name == 'lstm': + return LSTMBackbone(input_dim, hidden_dim) + elif name == 'transformer': + return TransformerBackbone(input_dim, hidden_dim) + elif name == 'tinyhar': + return TinyHARBackbone(input_dim, hidden_dim) + elif name == 'deepconvlstm': + from experiments.published_models import DeepConvLSTMBackbone + return DeepConvLSTMBackbone(input_dim, hidden_dim) + elif name == 'inceptiontime': + from experiments.published_models import InceptionTimeBackbone + return InceptionTimeBackbone(input_dim, hidden_dim) + else: + raise ValueError(f"Unknown backbone: {name}") + + +def _make_branch(backbone_name, raw_dim, hidden_dim, proj_dim): + """Create optional projector + backbone for one modality branch.""" + if proj_dim > 0: + proj = nn.Sequential( + nn.Linear(raw_dim, proj_dim), + nn.LayerNorm(proj_dim), + nn.ReLU(), + ) + bb_input = proj_dim + bb_hidden = hidden_dim + else: + proj = None + bb_input = raw_dim + bb_hidden = _compute_per_modality_hidden(raw_dim, hidden_dim) + bb = get_backbone(backbone_name, bb_input, bb_hidden) + return proj, bb + + +class SingleModel(nn.Module): + """Single backbone + classifier (early fusion or single-modality).""" + + def __init__(self, backbone_name, input_dim, num_classes, hidden_dim=128, + modality_dims=None, proj_dim=0): + super().__init__() + self.projector = None + if proj_dim > 0 and modality_dims: + self.projector = ModalityProjector(modality_dims, proj_dim) + actual_input_dim = self.projector.output_dim + else: + actual_input_dim = input_dim + self.backbone = get_backbone(backbone_name, actual_input_dim, hidden_dim) + self.classifier = nn.Sequential( + nn.Dropout(0.5), + nn.Linear(self.backbone.output_dim, num_classes), + ) + + def forward(self, x, mask=None): + if self.projector is not None: + x = self.projector(x) + feat = self.backbone(x, mask) + return self.classifier(feat) + + +class LateFusionModel(nn.Module): + """Late fusion: separate backbone per modality, configurable logit aggregation. + + late_agg='mean': simple average (original) + late_agg='confidence': entropy-based confidence weighting (0 extra params) + late_agg='learned': temperature-scaled learned weights (M+1 extra params) + """ + + def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, + proj_dim=0, late_agg='mean'): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + self.late_agg = late_agg + self.projectors = nn.ModuleList() + self.backbones = nn.ModuleList() + self.classifiers = nn.ModuleList() + for dim in self.mod_dims: + proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim) + self.projectors.append(proj if proj else nn.Identity()) + self.backbones.append(bb) + self.classifiers.append(nn.Sequential( + nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes), + )) + self._has_proj = proj_dim > 0 + + M = len(self.mod_dims) + if late_agg == 'learned': + self.modality_logits = nn.Parameter(torch.zeros(M)) + self.temperature = nn.Parameter(torch.ones(1)) + + def forward(self, x, mask=None): + offset = 0 + all_logits = [] + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + if self._has_proj: + x_mod = self.projectors[i](x_mod) + feat = self.backbones[i](x_mod, mask) + all_logits.append(self.classifiers[i](feat)) + + stacked = torch.stack(all_logits, dim=0) # (M, B, C) + + if self.late_agg == 'confidence': + # Weight by confidence: low entropy → high weight + probs = F.softmax(stacked, dim=-1) # (M, B, C) + entropy = -(probs * (probs + 1e-8).log()).sum(dim=-1) # (M, B) + weights = F.softmax(-entropy, dim=0).unsqueeze(-1) # (M, B, 1) + return (stacked * weights).sum(dim=0) + elif self.late_agg == 'learned': + weights = F.softmax(self.modality_logits / self.temperature, dim=0) + return (stacked * weights.view(-1, 1, 1)).sum(dim=0) + else: # 'mean' + return stacked.mean(dim=0) + + +class AttentionFusionModel(nn.Module): + """Attention fusion: separate encoder per modality -> cross-modal attention -> classifier.""" + + def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + unified_dim = hidden_dim + self.projectors = nn.ModuleList() + self.backbones = nn.ModuleList() + self.feat_projections = nn.ModuleList() + for dim in self.mod_dims: + proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim) + self.projectors.append(proj if proj else nn.Identity()) + self.backbones.append(bb) + if bb.output_dim != unified_dim: + self.feat_projections.append(nn.Linear(bb.output_dim, unified_dim)) + else: + self.feat_projections.append(nn.Identity()) + self._has_proj = proj_dim > 0 + nhead = 4 if unified_dim % 4 == 0 else (2 if unified_dim % 2 == 0 else 1) + self.cross_attn = nn.TransformerEncoderLayer( + d_model=unified_dim, nhead=nhead, dim_feedforward=unified_dim * 2, + dropout=0.1, batch_first=True, + ) + self.classifier = nn.Sequential( + nn.Dropout(0.5), nn.Linear(unified_dim, num_classes), + ) + + def forward(self, x, mask=None): + offset = 0 + mod_features = [] + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + if self._has_proj: + x_mod = self.projectors[i](x_mod) + feat = self.backbones[i](x_mod, mask) + feat = self.feat_projections[i](feat) + mod_features.append(feat) + tokens = torch.stack(mod_features, dim=1) + tokens = self.cross_attn(tokens) + pooled = tokens.mean(dim=1) + return self.classifier(pooled) + + +class WeightedLateFusionModel(nn.Module): + def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + self.projectors = nn.ModuleList() + self.backbones = nn.ModuleList() + self.classifiers = nn.ModuleList() + for dim in self.mod_dims: + proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim) + self.projectors.append(proj if proj else nn.Identity()) + self.backbones.append(bb) + self.classifiers.append(nn.Sequential( + nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes), + )) + self._has_proj = proj_dim > 0 + self.modality_weights = nn.Parameter(torch.ones(len(self.mod_dims))) + + def forward(self, x, mask=None): + offset = 0 + all_logits = [] + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + if self._has_proj: + x_mod = self.projectors[i](x_mod) + feat = self.backbones[i](x_mod, mask) + all_logits.append(self.classifiers[i](feat)) + weights = F.softmax(self.modality_weights, dim=0) + stacked = torch.stack(all_logits, dim=0) + return (stacked * weights.view(-1, 1, 1)).sum(dim=0) + + +class GatedLateFusionModel(nn.Module): + def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + M = len(self.mod_dims) + self.projectors = nn.ModuleList() + self.backbones = nn.ModuleList() + self.classifiers = nn.ModuleList() + total_feat_dim = 0 + for dim in self.mod_dims: + proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim) + self.projectors.append(proj if proj else nn.Identity()) + self.backbones.append(bb) + total_feat_dim += bb.output_dim + self.classifiers.append(nn.Sequential( + nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes), + )) + self._has_proj = proj_dim > 0 + self.gate = nn.Sequential( + nn.Linear(total_feat_dim, 32), nn.ReLU(), nn.Linear(32, M), + ) + + def forward(self, x, mask=None): + offset = 0 + all_feats, all_logits = [], [] + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + if self._has_proj: + x_mod = self.projectors[i](x_mod) + feat = self.backbones[i](x_mod, mask) + all_feats.append(feat) + all_logits.append(self.classifiers[i](feat)) + cat_feats = torch.cat(all_feats, dim=1) + gate_weights = F.softmax(self.gate(cat_feats), dim=1) + stacked = torch.stack(all_logits, dim=1) + return (stacked * gate_weights.unsqueeze(-1)).sum(dim=1) + + +class StackingFusionModel(nn.Module): + def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + M = len(self.mod_dims) + self.projectors = nn.ModuleList() + self.backbones = nn.ModuleList() + self.classifiers = nn.ModuleList() + for dim in self.mod_dims: + proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim) + self.projectors.append(proj if proj else nn.Identity()) + self.backbones.append(bb) + self.classifiers.append(nn.Sequential( + nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes), + )) + self._has_proj = proj_dim > 0 + self.meta_learner = nn.Sequential( + nn.Linear(M * num_classes, 32), nn.ReLU(), + nn.Dropout(0.5), nn.Linear(32, num_classes), + ) + + def forward(self, x, mask=None): + offset = 0 + all_logits = [] + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + if self._has_proj: + x_mod = self.projectors[i](x_mod) + feat = self.backbones[i](x_mod, mask) + all_logits.append(self.classifiers[i](feat)) + cat_logits = torch.cat(all_logits, dim=1) + return self.meta_learner(cat_logits) + + +class ProductOfExpertsModel(nn.Module): + def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + self.projectors = nn.ModuleList() + self.backbones = nn.ModuleList() + self.classifiers = nn.ModuleList() + for dim in self.mod_dims: + proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim) + self.projectors.append(proj if proj else nn.Identity()) + self.backbones.append(bb) + self.classifiers.append(nn.Sequential( + nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes), + )) + self._has_proj = proj_dim > 0 + + def forward(self, x, mask=None): + offset = 0 + log_probs_sum = None + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + if self._has_proj: + x_mod = self.projectors[i](x_mod) + feat = self.backbones[i](x_mod, mask) + logits = self.classifiers[i](feat) + log_p = F.log_softmax(logits, dim=1) + log_probs_sum = log_p if log_probs_sum is None else log_probs_sum + log_p + return log_probs_sum + + +class MoEFusionModel(nn.Module): + def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + M = len(self.mod_dims) + self.top_k = min(2, M) + self.projectors = nn.ModuleList() + self.backbones = nn.ModuleList() + self.classifiers = nn.ModuleList() + total_feat_dim = 0 + for dim in self.mod_dims: + proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim) + self.projectors.append(proj if proj else nn.Identity()) + self.backbones.append(bb) + total_feat_dim += bb.output_dim + self.classifiers.append(nn.Sequential( + nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes), + )) + self._has_proj = proj_dim > 0 + self.router = nn.Linear(total_feat_dim, M) + + def forward(self, x, mask=None): + offset = 0 + all_feats, all_logits = [], [] + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + if self._has_proj: + x_mod = self.projectors[i](x_mod) + feat = self.backbones[i](x_mod, mask) + all_feats.append(feat) + all_logits.append(self.classifiers[i](feat)) + cat_feats = torch.cat(all_feats, dim=1) + router_logits = self.router(cat_feats) + top_vals, top_idx = router_logits.topk(self.top_k, dim=1) + top_weights = F.softmax(top_vals, dim=1) + stacked = torch.stack(all_logits, dim=1) + top_idx_exp = top_idx.unsqueeze(-1).expand(-1, -1, stacked.size(-1)) + selected = stacked.gather(1, top_idx_exp) + return (selected * top_weights.unsqueeze(-1)).sum(dim=1) + + +class FeatureConcatFusionModel(nn.Module): + """Feature-level late fusion: separate backbones, concatenate features, joint classifier.""" + + def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0): + super().__init__() + self.mod_names = list(modality_dims.keys()) + self.mod_dims = list(modality_dims.values()) + self.projectors = nn.ModuleList() + self.backbones = nn.ModuleList() + total_feat_dim = 0 + for dim in self.mod_dims: + proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim) + self.projectors.append(proj if proj else nn.Identity()) + self.backbones.append(bb) + total_feat_dim += bb.output_dim + self._has_proj = proj_dim > 0 + self.classifier = nn.Sequential( + nn.LayerNorm(total_feat_dim), + nn.Dropout(0.5), + nn.Linear(total_feat_dim, hidden_dim), + nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(hidden_dim, num_classes), + ) + + def forward(self, x, mask=None): + offset = 0 + all_feats = [] + for i, dim in enumerate(self.mod_dims): + x_mod = x[:, :, offset:offset + dim] + offset += dim + if self._has_proj: + x_mod = self.projectors[i](x_mod) + feat = self.backbones[i](x_mod, mask) + all_feats.append(feat) + cat_feats = torch.cat(all_feats, dim=1) + return self.classifier(cat_feats) + + +def build_model(backbone_name, fusion, input_dim, modality_dims, num_classes, + hidden_dim=128, proj_dim=0, late_agg='mean'): + """Factory function. proj_dim=0 means no projection (raw features).""" + if fusion == 'early': + return SingleModel(backbone_name, input_dim, num_classes, hidden_dim, + modality_dims=modality_dims, proj_dim=proj_dim) + elif fusion == 'late': + return LateFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, + proj_dim, late_agg=late_agg) + elif fusion == 'attention': + return AttentionFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim) + elif fusion == 'weighted_late': + return WeightedLateFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim) + elif fusion == 'gated_late': + return GatedLateFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim) + elif fusion == 'stacking': + return StackingFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim) + elif fusion == 'product': + return ProductOfExpertsModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim) + elif fusion == 'moe': + return MoEFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim) + elif fusion == 'feat_concat': + return FeatureConcatFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim) + else: + raise ValueError(f"Unknown fusion: {fusion}") diff --git a/experiments/nets/models_forecast.py b/experiments/nets/models_forecast.py new file mode 100644 index 0000000000000000000000000000000000000000..ac8a2a3053bc65accd38fdb96dc20a25e6ce5d25 --- /dev/null +++ b/experiments/nets/models_forecast.py @@ -0,0 +1,269 @@ +"""Frame-level future forecasting models. + +Three baselines (all sharing the same forecast head signature): + - TransformerForecast (our DAF-style) + - FUTRForecast (Transformer encoder + parallel query decoder) + - DeepConvLSTMForecast (Ordoñez & Roggen 2016 wearable HAR backbone) + +All take a dict {mod: (B, T_obs, F_mod)} and output (B, T_fut, num_classes). +""" +from __future__ import annotations +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# --------------------------------------------------------------------------- +# Shared per-modality projection: each modality -> hidden dim d_model +# --------------------------------------------------------------------------- + +class _PerModalityProj(nn.Module): + def __init__(self, modality_dims: Dict[str, int], d_model: int): + super().__init__() + self.proj = nn.ModuleDict({ + m: nn.Linear(d, d_model) for m, d in modality_dims.items() + }) + self.mod_emb = nn.Parameter(torch.zeros(len(modality_dims), d_model)) + nn.init.trunc_normal_(self.mod_emb, std=0.02) + self.mods = list(modality_dims.keys()) + + def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: + # Concatenate per-modality projections along time? Or sum? + # We sum modality-projected features per time step (with modality + # embedding broadcast). Equivalent to early-fusion at the d_model + # space and is what a "modality-aware Transformer" typically uses. + out = None + for i, m in enumerate(self.mods): + h = self.proj[m](x[m]) + self.mod_emb[i] + out = h if out is None else out + h + return out / len(self.mods) # (B, T_obs, d_model) + + +# --------------------------------------------------------------------------- +# 1. Transformer (DAF-style) forecast model +# --------------------------------------------------------------------------- + +class TransformerForecast(nn.Module): + def __init__(self, modality_dims: Dict[str, int], num_classes: int, + t_obs: int, t_fut: int, d_model: int = 128, + n_heads: int = 4, n_layers: int = 2, dropout: float = 0.1): + super().__init__() + self.t_obs = t_obs + self.t_fut = t_fut + self.num_classes = num_classes + self.embed = _PerModalityProj(modality_dims, d_model) + self.pos = nn.Parameter(torch.zeros(1, t_obs, d_model)) + nn.init.trunc_normal_(self.pos, std=0.02) + layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation="gelu", + ) + self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers) + self.queries = nn.Parameter(torch.zeros(1, t_fut, d_model)) + nn.init.trunc_normal_(self.queries, std=0.02) + self.cross_attn = nn.MultiheadAttention( + d_model, n_heads, dropout=dropout, batch_first=True + ) + self.norm = nn.LayerNorm(d_model) + self.head = nn.Linear(d_model, num_classes) + + def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: + h = self.embed(x) + self.pos + h = self.encoder(h) # (B, T_obs, D) + q = self.queries.expand(h.size(0), -1, -1) # (B, T_fut, D) + out, _ = self.cross_attn(q, h, h, need_weights=False) + out = self.norm(out) + return self.head(out) # (B, T_fut, C) + + +# --------------------------------------------------------------------------- +# 2. FUTR-style forecast (Future Transformer, Gong et al. CVPR 2022) +# Same encoder + parallel query decoder. We add a small Transformer +# decoder so it's not literally identical to TransformerForecast. +# --------------------------------------------------------------------------- + +class FUTRForecast(nn.Module): + def __init__(self, modality_dims: Dict[str, int], num_classes: int, + t_obs: int, t_fut: int, d_model: int = 128, + n_heads: int = 4, n_enc: int = 2, n_dec: int = 1, + dropout: float = 0.1): + super().__init__() + self.t_obs = t_obs + self.t_fut = t_fut + self.num_classes = num_classes + self.embed = _PerModalityProj(modality_dims, d_model) + self.pos = nn.Parameter(torch.zeros(1, t_obs, d_model)) + nn.init.trunc_normal_(self.pos, std=0.02) + enc_layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation="gelu", + ) + self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_enc) + dec_layer = nn.TransformerDecoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation="gelu", + ) + self.decoder = nn.TransformerDecoder(dec_layer, num_layers=n_dec) + self.queries = nn.Parameter(torch.zeros(1, t_fut, d_model)) + nn.init.trunc_normal_(self.queries, std=0.02) + self.head = nn.Linear(d_model, num_classes) + + def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: + memory = self.encoder(self.embed(x) + self.pos) # (B, T_obs, D) + q = self.queries.expand(memory.size(0), -1, -1) # (B, T_fut, D) + out = self.decoder(q, memory) + return self.head(out) # (B, T_fut, C) + + +# --------------------------------------------------------------------------- +# 3. DeepConvLSTM-style forecast +# --------------------------------------------------------------------------- + +class DeepConvLSTMForecast(nn.Module): + def __init__(self, modality_dims: Dict[str, int], num_classes: int, + t_obs: int, t_fut: int, conv_filters: int = 64, + lstm_hidden: int = 128, n_lstm_layers: int = 2, + dropout: float = 0.1): + super().__init__() + self.t_obs = t_obs + self.t_fut = t_fut + self.num_classes = num_classes + self.mods = list(modality_dims.keys()) + in_ch = sum(modality_dims.values()) + # Same 4-layer conv stack as the original DeepConvLSTM + layers = [] + ch = in_ch + for i in range(4): + layers.append(nn.Sequential( + nn.Conv1d(ch, conv_filters, kernel_size=5, padding=2), + nn.BatchNorm1d(conv_filters), + nn.ReLU(), + nn.Dropout(dropout if i < 3 else 0.2), + )) + ch = conv_filters + self.convs = nn.ModuleList(layers) + self.lstm = nn.LSTM( + conv_filters, lstm_hidden, num_layers=n_lstm_layers, + batch_first=True, dropout=dropout if n_lstm_layers > 1 else 0, + ) + self.head = nn.Linear(lstm_hidden, t_fut * num_classes) + + def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: + h = torch.cat([x[m] for m in self.mods], dim=-1) # (B, T_obs, F_total) + h = h.permute(0, 2, 1) # (B, F, T_obs) + for c in self.convs: + h = c(h) + h = h.permute(0, 2, 1) # (B, T_obs, conv_filters) + out, (h_n, _) = self.lstm(h) + feat = h_n[-1] # (B, lstm_hidden) + logits = self.head(feat).view(-1, self.t_fut, self.num_classes) + return logits + + +# --------------------------------------------------------------------------- +# 4. RU-LSTM (Furnari et al. RAL 2019, "Rolling-Unrolling LSTM for action +# anticipation"). Two-phase LSTM: a "rolling" phase encodes past, an +# "unrolling" phase autoregressively decodes future tokens. +# --------------------------------------------------------------------------- + +class RULSTMForecast(nn.Module): + def __init__(self, modality_dims: Dict[str, int], num_classes: int, + t_obs: int, t_fut: int, d_model: int = 128, + n_lstm_layers: int = 2, dropout: float = 0.1): + super().__init__() + self.t_obs = t_obs + self.t_fut = t_fut + self.num_classes = num_classes + self.embed = _PerModalityProj(modality_dims, d_model) + self.rolling = nn.LSTM( + d_model, d_model, num_layers=n_lstm_layers, + batch_first=True, dropout=dropout if n_lstm_layers > 1 else 0, + ) + self.unrolling = nn.LSTM( + d_model, d_model, num_layers=n_lstm_layers, + batch_first=True, dropout=dropout if n_lstm_layers > 1 else 0, + ) + self.fut_init = nn.Parameter(torch.zeros(1, 1, d_model)) + nn.init.trunc_normal_(self.fut_init, std=0.02) + self.head = nn.Linear(d_model, num_classes) + + def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: + h_past = self.embed(x) # (B, T_obs, D) + _, (h_n, c_n) = self.rolling(h_past) + B = h_past.size(0) + # Use a learned initial future token, repeated T_fut times + fut_input = self.fut_init.expand(B, self.t_fut, -1) + out, _ = self.unrolling(fut_input, (h_n, c_n)) + return self.head(out) # (B, T_fut, C) + + +# --------------------------------------------------------------------------- +# 5. AVT (Girdhar & Grauman ICCV 2021, "Anticipative Video Transformer"). +# Causal Transformer over the concatenation of past + future tokens. +# --------------------------------------------------------------------------- + +class AVTForecast(nn.Module): + def __init__(self, modality_dims: Dict[str, int], num_classes: int, + t_obs: int, t_fut: int, d_model: int = 128, + n_heads: int = 4, n_layers: int = 2, dropout: float = 0.1): + super().__init__() + self.t_obs = t_obs + self.t_fut = t_fut + self.num_classes = num_classes + self.embed = _PerModalityProj(modality_dims, d_model) + seq_len = t_obs + t_fut + self.pos = nn.Parameter(torch.zeros(1, seq_len, d_model)) + nn.init.trunc_normal_(self.pos, std=0.02) + layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation="gelu", + ) + self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers) + self.fut_tokens = nn.Parameter(torch.zeros(1, t_fut, d_model)) + nn.init.trunc_normal_(self.fut_tokens, std=0.02) + self.head = nn.Linear(d_model, num_classes) + # Causal mask over concatenated [past | future] sequence + mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool() + self.register_buffer("causal_mask", mask) + + def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor: + h_past = self.embed(x) # (B, T_obs, D) + B = h_past.size(0) + h_fut = self.fut_tokens.expand(B, -1, -1) # (B, T_fut, D) + seq = torch.cat([h_past, h_fut], dim=1) + self.pos + out = self.encoder(seq, mask=self.causal_mask) + out_fut = out[:, self.t_obs:, :] + return self.head(out_fut) # (B, T_fut, C) + + +# --------------------------------------------------------------------------- +# Builder +# --------------------------------------------------------------------------- + +def build_forecast_model(name: str, modality_dims: Dict[str, int], + num_classes: int, t_obs: int, t_fut: int, + d_model: int = 128, dropout: float = 0.1) -> nn.Module: + name = name.lower() + if name in ("daf", "transformer"): + return TransformerForecast(modality_dims, num_classes, + t_obs=t_obs, t_fut=t_fut, + d_model=d_model, dropout=dropout) + if name == "futr": + return FUTRForecast(modality_dims, num_classes, + t_obs=t_obs, t_fut=t_fut, + d_model=d_model, dropout=dropout) + if name == "deepconvlstm": + return DeepConvLSTMForecast(modality_dims, num_classes, + t_obs=t_obs, t_fut=t_fut, + dropout=dropout) + if name in ("rulstm", "ru-lstm", "ru_lstm"): + return RULSTMForecast(modality_dims, num_classes, + t_obs=t_obs, t_fut=t_fut, + d_model=d_model, dropout=dropout) + if name == "avt": + return AVTForecast(modality_dims, num_classes, + t_obs=t_obs, t_fut=t_fut, + d_model=d_model, dropout=dropout) + raise ValueError(f"Unknown forecast model: {name!r}") diff --git a/experiments/nets/models_forecast_priv.py b/experiments/nets/models_forecast_priv.py new file mode 100644 index 0000000000000000000000000000000000000000..d86bb9d437e381f58e2a92701d515626b75dc90b --- /dev/null +++ b/experiments/nets/models_forecast_priv.py @@ -0,0 +1,76 @@ +"""Models for T8 v3 — privileged future-pressure conditioning. + +Wraps the existing TransformerForecast (DAF) to accept future pressure as +side-channel context. The future pressure trajectory is encoded into T_fut +tokens that get appended to the past memory; future queries cross-attend +over the union (past sensors + future pressure). This is privileged +information (oracle) — at test time we'd not have future pressure — so +this is a hypothesis-test setup, not a deployable forecaster. +""" +from __future__ import annotations +from typing import Dict + +import torch +import torch.nn as nn + + +class _PerModalityProj(nn.Module): + def __init__(self, modality_dims, d_model): + super().__init__() + self.proj = nn.ModuleDict({ + m: nn.Linear(d, d_model) for m, d in modality_dims.items() + }) + self.mod_emb = nn.Parameter(torch.zeros(len(modality_dims), d_model)) + nn.init.trunc_normal_(self.mod_emb, std=0.02) + self.mods = list(modality_dims.keys()) + + def forward(self, x): + out = None + for i, m in enumerate(self.mods): + h = self.proj[m](x[m]) + self.mod_emb[i] + out = h if out is None else out + h + return out / len(self.mods) + + +class DAFFuturePressure(nn.Module): + """DAF backbone + future-pressure conditioning.""" + + def __init__(self, modality_dims: Dict[str, int], target_dim: int, + t_obs: int, t_fut: int, future_pressure_dim: int = 50, + d_model: int = 128, n_heads: int = 4, n_layers: int = 2, + dropout: float = 0.1): + super().__init__() + self.t_obs = t_obs + self.t_fut = t_fut + self.embed = _PerModalityProj(modality_dims, d_model) + self.pos = nn.Parameter(torch.zeros(1, t_obs, d_model)) + nn.init.trunc_normal_(self.pos, std=0.02) + layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation="gelu", + ) + self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers) + # future-pressure encoder + self.fp_proj = nn.Linear(future_pressure_dim, d_model) + self.fp_pos = nn.Parameter(torch.zeros(1, t_fut, d_model)) + nn.init.trunc_normal_(self.fp_pos, std=0.02) + self.fp_seg = nn.Parameter(torch.zeros(1, 1, d_model)) # segment id + nn.init.trunc_normal_(self.fp_seg, std=0.02) + # decoder side + self.queries = nn.Parameter(torch.zeros(1, t_fut, d_model)) + nn.init.trunc_normal_(self.queries, std=0.02) + self.cross_attn = nn.MultiheadAttention( + d_model, n_heads, dropout=dropout, batch_first=True + ) + self.norm = nn.LayerNorm(d_model) + self.head = nn.Linear(d_model, target_dim) + + def forward(self, x: Dict[str, torch.Tensor], + future_pressure: torch.Tensor) -> torch.Tensor: + h_past = self.encoder(self.embed(x) + self.pos) # (B, T_obs, D) + h_fp = self.fp_proj(future_pressure) + self.fp_pos + self.fp_seg + memory = torch.cat([h_past, h_fp], dim=1) # (B, T_obs+T_fut, D) + q = self.queries.expand(memory.size(0), -1, -1) # (B, T_fut, D) + out, _ = self.cross_attn(q, memory, memory, need_weights=False) + out = self.norm(out) + return self.head(out) # (B, T_fut, target_dim) diff --git a/experiments/nets/models_seqpred.py b/experiments/nets/models_seqpred.py new file mode 100644 index 0000000000000000000000000000000000000000..239e5078678f05bb80e722844abfdd5b277aea17 --- /dev/null +++ b/experiments/nets/models_seqpred.py @@ -0,0 +1,806 @@ +""" +Models for T10 Triplet Next-Action Prediction. + +Two classes live here: + + * TripletHead — shared head module producing (verb_fine, verb_composite, + noun, hand) logits from a pooled feature vector. + * DeepConvLSTMTriplet — single-flow CNN+LSTM baseline (concatenates all + available modalities along the feature axis). + * DailyActFormer — our full-modality cross-modal Transformer that keeps + each modality in its own stem, fuses via a modality + token, and runs a causal temporal Transformer. Supports + the anticipatory auxiliary loss mentioned in the paper + plan (currently as a stub; enabled later in training). + +All models take: + x: dict[mod_name -> (B, T, F_mod)] + mask: BoolTensor (B, T) +and return a dict: + {'verb_fine': (B, NUM_VERB_FINE), + 'verb_composite': (B, NUM_VERB_COMPOSITE), + 'noun': (B, NUM_NOUN), + 'hand': (B, NUM_HAND)} +""" + +from __future__ import annotations + +import math +import sys +from pathlib import Path +from typing import Dict, List, Optional, Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Importable from either (a) neurips26 root, or (b) frozen row/code/ folder. +_THIS = Path(__file__).resolve() +sys.path.insert(0, str(_THIS.parent)) +sys.path.insert(0, str(_THIS.parent.parent)) + +try: + from experiments.taxonomy import ( + NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, NUM_HAND, + ) +except ModuleNotFoundError: + from taxonomy import ( + NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, NUM_HAND, + ) + +# --------------------------------------------------------------------------- +# Shared triplet head +# --------------------------------------------------------------------------- + +class _PrevActionConcat(nn.Module): + """Embeds the previous-segment (verb_composite, noun) ground-truth labels + and concatenates them to a pooled feature vector. Used by every model + when `use_prev_action=True`. The +1 vocab slot is the BOS / no-prev + sentinel emitted by the dataset for the first kept segment of each + recording. Output dim added to pooled = 2 * prev_emb_dim.""" + + def __init__(self, prev_emb_dim: int = 32): + super().__init__() + from taxonomy import NUM_VERB_COMPOSITE as _NVC, NUM_NOUN as _NN # noqa + self.vc_emb = nn.Embedding(_NVC + 1, prev_emb_dim) + self.n_emb = nn.Embedding(_NN + 1, prev_emb_dim) + self.out_dim = 2 * prev_emb_dim + + def forward(self, pooled: torch.Tensor, + prev_v_comp: Optional[torch.Tensor] = None, + prev_noun: Optional[torch.Tensor] = None) -> torch.Tensor: + if prev_v_comp is None or prev_noun is None: + B = pooled.size(0) + prev_v_comp = torch.full((B,), self.vc_emb.num_embeddings - 1, + dtype=torch.long, device=pooled.device) + prev_noun = torch.full((B,), self.n_emb.num_embeddings - 1, + dtype=torch.long, device=pooled.device) + pe = torch.cat([self.vc_emb(prev_v_comp), self.n_emb(prev_noun)], dim=-1) + return torch.cat([pooled, pe], dim=-1) + + +class TripletHead(nn.Module): + def __init__(self, feat_dim: int, hidden: int = 256, dropout: float = 0.2): + super().__init__() + self.norm = nn.LayerNorm(feat_dim) + self.trunk = nn.Sequential( + nn.Linear(feat_dim, hidden), + nn.GELU(), + nn.Dropout(dropout), + ) + self.verb_fine = nn.Linear(hidden, NUM_VERB_FINE) + self.verb_composite = nn.Linear(hidden, NUM_VERB_COMPOSITE) + self.noun = nn.Linear(hidden, NUM_NOUN) + self.hand = nn.Linear(hidden, NUM_HAND) + + def forward(self, feat: torch.Tensor) -> Dict[str, torch.Tensor]: + h = self.trunk(self.norm(feat)) + return { + "verb_fine": self.verb_fine(h), + "verb_composite": self.verb_composite(h), + "noun": self.noun(h), + "hand": self.hand(h), + } + + +def _masked_mean_pool(h: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Mean over the time axis of `h` (B, T, D) using a boolean mask (B, T).""" + m = mask.to(h.dtype).unsqueeze(-1) + return (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0) + + +# --------------------------------------------------------------------------- +# Baseline: DeepConvLSTM (Ordonez & Roggen 2016) adapted for triplet prediction +# --------------------------------------------------------------------------- + +class DeepConvLSTMTriplet(nn.Module): + """Single-flow CNN+LSTM. Concatenates per-modality features on F axis.""" + + def __init__( + self, + modality_dims: Dict[str, int], + conv_filters: int = 64, + conv_kernel: int = 5, + num_conv_layers: int = 4, + lstm_hidden: int = 128, + num_lstm_layers: int = 2, + dropout: float = 0.2, + head_hidden: int = 256, + use_prev_action: bool = False, + prev_emb_dim: int = 32, + ): + super().__init__() + self.modality_dims = dict(modality_dims) + self.use_prev_action = use_prev_action + in_ch = sum(modality_dims.values()) + + convs: List[nn.Module] = [] + c = in_ch + for i in range(num_conv_layers): + convs.append(nn.Sequential( + nn.Conv1d(c, conv_filters, conv_kernel, padding=conv_kernel // 2), + nn.BatchNorm1d(conv_filters), + nn.ReLU(), + nn.Dropout(dropout if i < num_conv_layers - 1 else dropout + 0.1), + )) + c = conv_filters + self.convs = nn.Sequential(*convs) + + self.lstm = nn.LSTM( + conv_filters, lstm_hidden, num_layers=num_lstm_layers, + batch_first=True, bidirectional=False, + dropout=dropout if num_lstm_layers > 1 else 0.0, + ) + head_in = lstm_hidden + if use_prev_action: + self.prev_concat = _PrevActionConcat(prev_emb_dim) + head_in += self.prev_concat.out_dim + else: + self.prev_concat = None + self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout) + + def forward( + self, x: Dict[str, torch.Tensor], mask: torch.Tensor, + prev_v_comp: Optional[torch.Tensor] = None, + prev_noun: Optional[torch.Tensor] = None, + ) -> Dict[str, torch.Tensor]: + feats = torch.cat([x[m] for m in x], dim=-1).transpose(1, 2) + feats = self.convs(feats).transpose(1, 2) + out, (h_n, _) = self.lstm(feats) + pooled = h_n[-1] + if self.use_prev_action: + pooled = self.prev_concat(pooled, prev_v_comp, prev_noun) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# Our model: DailyActFormer +# --------------------------------------------------------------------------- + +class _ModalityStem(nn.Module): + """Multi-scale 1-D conv stem (kernels 3, 5, 9) per modality. + + Borrowed from HandFormer (the top-1 baseline on T10 recognition): three + parallel convolutions capture fast (k=3, ~0.15s @ 20Hz), medium (k=5), + and slow (k=9, ~0.45s) temporal patterns. Output is a 1×1 fusion of + the three branches, projected back to d_model. + """ + + def __init__(self, in_dim: int, d_model: int, kernels=(3, 5, 9), + dropout: float = 0.1): + super().__init__() + self.kernels = kernels + self.branches = nn.ModuleList([ + nn.Conv1d(in_dim, d_model, k, padding=k // 2) for k in kernels + ]) + self.merge = nn.Sequential( + nn.GELU(), + nn.Conv1d(d_model * len(kernels), d_model, 1), + ) + self.norm = nn.LayerNorm(d_model) + self.drop = nn.Dropout(dropout) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x: (B, T, F_in) -> (B, F_in, T) for conv1d + z = x.transpose(1, 2) + multi = [c(z) for c in self.branches] # each (B, D, T) + h = self.merge(torch.cat(multi, dim=1)).transpose(1, 2) # (B, T, D) + return self.drop(self.norm(h)) + + +class _QueryPool(nn.Module): + """Learnable-query cross-attention pooling (replaces mean pool). + + Inspired by FUTR (the top-5 baseline winner): a single learnable query + cross-attends to the entire encoder output, producing one summary vector. + Compared to a plain mean pool this lets the model weight informative + frames more heavily. + """ + + def __init__(self, d_model: int, n_heads: int = 4, dropout: float = 0.1): + super().__init__() + self.q = nn.Parameter(torch.zeros(1, 1, d_model)) + nn.init.trunc_normal_(self.q, std=0.02) + self.attn = nn.MultiheadAttention( + d_model, n_heads, dropout=dropout, batch_first=True, + ) + self.norm = nn.LayerNorm(d_model) + + def forward(self, h: torch.Tensor, key_padding_mask: Optional[torch.Tensor]): + # h: (B, T, D); key_padding_mask: (B, T) where True = pad-to-mask-out + B = h.size(0) + q = self.q.expand(B, -1, -1) + out, _ = self.attn(q, h, h, key_padding_mask=key_padding_mask, + need_weights=False) + return self.norm(out.squeeze(1)) + + +class _CrossModalTemporalShift(nn.Module): + """Cross-modal temporal-shift attention between two modalities. + + Motivation (paper case study, §sec:grasp-phase-main): EMG activation leads + motion onset by a sub-frame ~20ms in our 100Hz recordings. After the 5x + downsample to 20Hz, that lag is ~0.4 frames, but per-subject variability + plus slack in our segment annotations introduces a few frames of drift + that a fixed alignment cannot capture. + + We learn a discrete temporal shift Δ ∈ {-max_shift, …, +max_shift} frames + applied to one of the two modalities (EMG by default), so the shifted + tokens align with the other branch (MoCap) before cross-modal fusion. The + shift is sampled via straight-through Gumbel-softmax during training; at + inference we take the argmax (deterministic). + + Inputs are per-modality token sequences (B, T, D). Outputs the same shape. + Only the `shift_modality` branch is shifted; other modalities pass through. + """ + + def __init__(self, max_shift: int = 3, tau: float = 1.0): + super().__init__() + self.max_shift = max_shift + self.tau = tau + # Logits over 2*max_shift+1 categorical shift candidates. + self.shift_logits = nn.Parameter(torch.zeros(2 * max_shift + 1)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x: (B, T, D); produce a shifted version that's a soft-blend over + # the shift dimension. Hard at inference, gumbel-softmax at training. + if self.training: + w = F.gumbel_softmax(self.shift_logits, tau=self.tau, hard=True, dim=-1) + else: + w = F.one_hot(self.shift_logits.argmax(), + num_classes=2 * self.max_shift + 1).float() + shifted = [] + for i, s in enumerate(range(-self.max_shift, self.max_shift + 1)): + shifted.append(w[i] * torch.roll(x, shifts=s, dims=1)) + return torch.stack(shifted, dim=0).sum(dim=0) + + +class _CausalTransformerBlock(nn.Module): + """Standard Transformer encoder block with a strictly causal attention mask.""" + + def __init__(self, d_model: int, n_heads: int, mlp_ratio: float = 4.0, + dropout: float = 0.1): + super().__init__() + self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, + batch_first=True) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + mlp_dim = int(d_model * mlp_ratio) + self.mlp = nn.Sequential( + nn.Linear(d_model, mlp_dim), nn.GELU(), nn.Dropout(dropout), + nn.Linear(mlp_dim, d_model), nn.Dropout(dropout), + ) + + def forward(self, x: torch.Tensor, attn_mask: torch.Tensor, + key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor: + h = self.norm1(x) + h, _ = self.attn(h, h, h, attn_mask=attn_mask, + key_padding_mask=key_padding_mask, need_weights=False) + x = x + h + x = x + self.mlp(self.norm2(x)) + return x + + +class DailyActFormer(nn.Module): + """Cross-modal Transformer that uses every available modality. + + Architecture outline: + per-modality stem → learnable modality embedding → + concat across time (each frame -> M modality tokens) → + 1 fusion-layer cross-modal attention (compress M→1 per frame) → + temporal Transformer (bidirectional by default; causal when + `causal=True` for anticipation-style next-action prediction) + → pooled → TripletHead + + For simplicity the fusion step is an attention pooling with learnable + queries, rather than a full cross-modal block. This keeps the parameter + count modest (2–4 M range with d_model=128). + """ + + def __init__( + self, + modality_dims: Dict[str, int], + d_model: int = 128, + n_layers: int = 4, + n_heads: int = 4, + dropout: float = 0.1, + head_hidden: int = 256, + max_T: int = 256, + causal: bool = False, + xshift_modality: Optional[str] = "emg", + xshift_max: int = 3, + use_prev_action: bool = False, + prev_emb_dim: int = 32, + ): + super().__init__() + self.modalities = list(modality_dims.keys()) + self.causal = causal + self.use_prev_action = use_prev_action + + # Prev-action concat (shared helper) + if use_prev_action: + self.prev_concat = _PrevActionConcat(prev_emb_dim) + self._prev_extra_dim = self.prev_concat.out_dim + else: + self.prev_concat = None + self._prev_extra_dim = 0 + + # 0) Cross-modal temporal-shift block on one branch (EMG by default). + # Disabled if `xshift_modality` is None or not present. + if xshift_modality is not None and xshift_modality in modality_dims: + self.xshift_modality = xshift_modality + self.xshift = _CrossModalTemporalShift(max_shift=xshift_max) + else: + self.xshift_modality = None + self.xshift = None + + # 1) per-modality 1-D conv stems (each produces d_model features/frame) + self.stems = nn.ModuleDict({ + m: _ModalityStem(F, d_model, dropout=dropout) + for m, F in modality_dims.items() + }) + + # 2) modality embedding (broadcast-add to per-modality tokens) + self.modality_embed = nn.Parameter( + torch.zeros(len(self.modalities), d_model) + ) + nn.init.trunc_normal_(self.modality_embed, std=0.02) + + # 3) per-frame cross-modal fusion: use a single learnable query token + self.fusion_q = nn.Parameter(torch.zeros(1, 1, d_model)) + self.fusion_kv = nn.LayerNorm(d_model) + self.fusion_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True) + + # 4) positional embedding along time (post-fusion) + self.pos_embed = nn.Parameter(torch.zeros(1, max_T, d_model)) + nn.init.trunc_normal_(self.pos_embed, std=0.02) + self.max_T = max_T + + # 5) causal temporal Transformer + self.temporal_norm = nn.LayerNorm(d_model) + self.temporal = nn.ModuleList([ + _CausalTransformerBlock(d_model, n_heads, dropout=dropout) + for _ in range(n_layers) + ]) + + # 6) Pool: learnable-query cross-attention (replaces mean pool, FUTR-style) + self.pool = _QueryPool(d_model, n_heads=n_heads, dropout=dropout) + + # 7) triplet head: input dim = d_model + (optional prev-action embed) + head_in = d_model + self._prev_extra_dim + self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout) + + nn.init.trunc_normal_(self.fusion_q, std=0.02) + + # ---- helpers ---- + def _causal_mask(self, T: int, device) -> torch.Tensor: + # MultiheadAttention wants additive mask with -inf above diag. + m = torch.full((T, T), float("-inf"), device=device) + m.triu_(diagonal=1) + return m + + # ---- forward ---- + def forward( + self, x: Dict[str, torch.Tensor], mask: torch.Tensor, + prev_v_comp: Optional[torch.Tensor] = None, + prev_noun: Optional[torch.Tensor] = None, + return_features: bool = False, + ) -> Dict[str, torch.Tensor]: + # Stems: per-modality token streams + stem_tokens: List[torch.Tensor] = [] + mods_in = [m for m in self.modalities if m in x] + if not mods_in: + raise ValueError("No modality from the model signature was provided.") + for i, m in enumerate(mods_in): + h = self.stems[m](x[m]) # (B, T, D) + # Cross-modal temporal shift: apply to one branch (e.g. EMG) so it + # aligns with the others before fusion. Implements paper SyncFuse's + # main novelty (sub-frame anticipatory coupling between EMG/MoCap). + if self.xshift is not None and m == self.xshift_modality: + h = self.xshift(h) + h = h + self.modality_embed[self.modalities.index(m)] + stem_tokens.append(h) + + # Cross-modal fusion: per-frame, attend learnable query over the M stacked + # modality tokens. Output is (B, T, D). + B, T, D = stem_tokens[0].shape + # stack -> (B, T, M, D) -> reshape as (B*T, M, D) + stacked = torch.stack(stem_tokens, dim=2) # (B, T, M, D) + M = stacked.size(2) + stacked = stacked.reshape(B * T, M, D) + kv = self.fusion_kv(stacked) + q = self.fusion_q.expand(B * T, -1, -1) + fused, _ = self.fusion_attn(q, kv, kv, need_weights=False) + fused = fused.reshape(B, T, D) # (B, T, D) + + # Positional embedding + causal temporal Transformer + if T > self.max_T: + raise ValueError(f"T={T} exceeds max_T={self.max_T}") + h = fused + self.pos_embed[:, :T, :] + h = self.temporal_norm(h) + + attn_mask = self._causal_mask(T, h.device) if self.causal else None + key_padding = ~mask if mask is not None else None + for block in self.temporal: + h = block(h, attn_mask=attn_mask, key_padding_mask=key_padding) + + # Pool: learnable-query cross-attention (FUTR-style) over valid frames + pooled = self.pool(h, key_padding_mask=key_padding) + + # Optional: condition on previous segment's labels + if self.use_prev_action: + pooled = self.prev_concat(pooled, prev_v_comp, prev_noun) + + logits = self.head(pooled) + if return_features: + logits["_pooled"] = pooled + return logits + + +# =========================================================================== +# Published baselines, sensor-adapted. Each keeps the original paper's key +# idea (rolling+unrolling LSTM for RULSTM, causal encoder–decoder for FUTR, +# early modality-token fusion for AFFT, etc.) but swaps the RGB/feature input +# for our multimodal sensor streams, and the classification head for our +# shared TripletHead. +# =========================================================================== + + +# --------------------------------------------------------------------------- +# RULSTM (Furnari & Farinella, TPAMI 2020) — sensor-adapted +# Per-modality rolling LSTM summarises the past, a second unrolling LSTM +# takes R-LSTM state and walks `future_steps` steps forward to mimic +# anticipation without needing future sensor data. Fusion is late: each +# modality produces logits, we average them. +# --------------------------------------------------------------------------- + +class _RULSTMBranch(nn.Module): + def __init__(self, in_dim: int, hidden: int, future_steps: int, + dropout: float = 0.2): + super().__init__() + self.future_steps = future_steps + self.rolling = nn.LSTM(in_dim, hidden, batch_first=True) + self.unrolling = nn.LSTMCell(hidden, hidden) + self.drop = nn.Dropout(dropout) + self.out_dim = hidden + + def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + # x: (B, T, F_in), mask: (B, T) + # Pack-free: LSTM on padded sequences is fine since we pool from h_n. + _, (h_n, c_n) = self.rolling(x) # (1, B, H) + h = h_n.squeeze(0); c = c_n.squeeze(0) + inp = h + for _ in range(self.future_steps): + h, c = self.unrolling(inp, (h, c)) + inp = h + return self.drop(h) + + +class RULSTMTriplet(nn.Module): + def __init__(self, modality_dims: Dict[str, int], hidden: int = 128, + future_steps: int = 8, dropout: float = 0.2, + head_hidden: int = 256, + use_prev_action: bool = False, prev_emb_dim: int = 32): + super().__init__() + self.use_prev_action = use_prev_action + self.branches = nn.ModuleDict({ + m: _RULSTMBranch(F, hidden, future_steps, dropout) + for m, F in modality_dims.items() + }) + head_in = hidden + if use_prev_action: + self.prev_concat = _PrevActionConcat(prev_emb_dim) + head_in += self.prev_concat.out_dim + else: + self.prev_concat = None + self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout) + + def forward(self, x, mask, prev_v_comp=None, prev_noun=None): + feats = [] + for m in x: + feats.append(self.branches[m](x[m], mask)) + fused = torch.stack(feats, dim=0).mean(dim=0) + if self.use_prev_action: + fused = self.prev_concat(fused, prev_v_comp, prev_noun) + return self.head(fused) + + +# --------------------------------------------------------------------------- +# FUTR (Gong et al., CVPR 2022) — sensor-adapted +# Transformer encoder over observation frames (with per-frame feature from +# concat(modalities)). A decoder query attends over the encoder memory to +# produce a single future-action embedding which is fed into the triplet +# head. No autoregressive decoding — we only predict 1 target segment. +# --------------------------------------------------------------------------- + +class FUTRTriplet(nn.Module): + def __init__(self, modality_dims: Dict[str, int], d_model: int = 128, + n_heads: int = 4, n_layers: int = 3, dropout: float = 0.1, + head_hidden: int = 256, max_T: int = 256, + use_prev_action: bool = False, prev_emb_dim: int = 32): + super().__init__() + self.use_prev_action = use_prev_action + in_dim = sum(modality_dims.values()) + self.in_proj = nn.Linear(in_dim, d_model) + self.pos = nn.Parameter(torch.zeros(1, max_T, d_model)) + nn.init.trunc_normal_(self.pos, std=0.02) + self.max_T = max_T + + enc_layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation="gelu", + ) + self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers) + + self.future_q = nn.Parameter(torch.zeros(1, 1, d_model)) + nn.init.trunc_normal_(self.future_q, std=0.02) + self.cross_attn = nn.MultiheadAttention( + d_model, n_heads, dropout=dropout, batch_first=True, + ) + head_in = d_model + if use_prev_action: + self.prev_concat = _PrevActionConcat(prev_emb_dim) + head_in += self.prev_concat.out_dim + else: + self.prev_concat = None + self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout) + + def forward(self, x, mask, prev_v_comp=None, prev_noun=None): + feats = torch.cat([x[m] for m in x], dim=-1) + B, T, _ = feats.shape + if T > self.max_T: + raise ValueError(f"T={T} exceeds FUTR max_T={self.max_T}") + h = self.in_proj(feats) + self.pos[:, :T, :] + h = self.encoder(h, src_key_padding_mask=~mask) + q = self.future_q.expand(B, -1, -1) + out, _ = self.cross_attn(q, h, h, key_padding_mask=~mask, + need_weights=False) + pooled = out.squeeze(1) + if self.use_prev_action: + pooled = self.prev_concat(pooled, prev_v_comp, prev_noun) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# AFFT (Zhong et al., WACV 2023) — sensor-adapted +# Per-modality tokens (one per frame per modality) are concatenated into a +# long token sequence of length T*M and passed through an encoder with +# causal temporal attention so the model must anticipate strictly from the +# past. Fusion happens "anticipatively" inside the attention. +# --------------------------------------------------------------------------- + +class AFFTTriplet(nn.Module): + def __init__(self, modality_dims: Dict[str, int], d_model: int = 96, + n_heads: int = 4, n_layers: int = 3, dropout: float = 0.1, + head_hidden: int = 256, max_T: int = 256, + use_prev_action: bool = False, prev_emb_dim: int = 32): + super().__init__() + self.use_prev_action = use_prev_action + self.modalities = list(modality_dims.keys()) + self.stems = nn.ModuleDict({ + m: nn.Linear(F, d_model) for m, F in modality_dims.items() + }) + self.mod_embed = nn.Parameter( + torch.zeros(len(self.modalities), d_model) + ) + nn.init.trunc_normal_(self.mod_embed, std=0.02) + self.pos = nn.Parameter(torch.zeros(1, max_T, d_model)) + nn.init.trunc_normal_(self.pos, std=0.02) + self.max_T = max_T + self.d_model = d_model + + self.blocks = nn.ModuleList([ + _CausalTransformerBlock(d_model, n_heads, dropout=dropout) + for _ in range(n_layers) + ]) + head_in = d_model + if use_prev_action: + self.prev_concat = _PrevActionConcat(prev_emb_dim) + head_in += self.prev_concat.out_dim + else: + self.prev_concat = None + self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout) + + def _expand_causal_mask(self, T: int, M: int, device) -> torch.Tensor: + # Token layout: [m0_t0, m1_t0, ..., mM_t0, m0_t1, ..., mM_t(T-1)] + # Token at (m, t) can attend to all (m', t') with t' <= t. + ts = torch.arange(T, device=device).unsqueeze(1).expand(-1, M).reshape(-1) + return ts[:, None] < ts[None, :] # True where future (mask out) + + def forward(self, x, mask, prev_v_comp=None, prev_noun=None): + # Build per-frame token streams. + mods = [m for m in self.modalities if m in x] + per_mod_tokens = [] + B, T, _ = x[mods[0]].shape + for i, m in enumerate(mods): + h = self.stems[m](x[m]) + self.mod_embed[self.modalities.index(m)] + per_mod_tokens.append(h) + stacked = torch.stack(per_mod_tokens, dim=2) + M = stacked.size(2) + tokens = stacked.reshape(B, T * M, self.d_model) + if T > self.max_T: + raise ValueError(f"T={T} exceeds AFFT max_T={self.max_T}") + pos_per_frame = self.pos[:, :T, :].unsqueeze(2).expand(-1, -1, M, -1) + tokens = tokens + pos_per_frame.reshape(1, T * M, self.d_model) + attn_mask = self._expand_causal_mask(T, M, tokens.device) + attn_mask = torch.where(attn_mask, torch.tensor(float("-inf"), + device=tokens.device), + torch.tensor(0.0, device=tokens.device)) + kp = (~mask).unsqueeze(2).expand(-1, -1, M).reshape(B, T * M) + for blk in self.blocks: + tokens = blk(tokens, attn_mask=attn_mask, key_padding_mask=kp) + last_slice = tokens[:, -M:, :] + pooled = last_slice.mean(dim=1) + if self.use_prev_action: + pooled = self.prev_concat(pooled, prev_v_comp, prev_noun) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# HandFormer (Shamil et al., ECCV 2024) — sensor-adapted +# Originally on 3D hand poses. We feed it only the MoCap modality (which +# contains 10 fingertip joints). Multi-scale 1-D conv over time, followed +# by a Transformer. If MoCap is not in `modalities`, falls back to whatever +# is provided (but then it's no longer the paper's "pose-only" setup). +# --------------------------------------------------------------------------- + +class HandFormerTriplet(nn.Module): + def __init__(self, modality_dims: Dict[str, int], d_model: int = 128, + n_heads: int = 4, n_layers: int = 3, kernels=(3, 5, 9), + dropout: float = 0.1, head_hidden: int = 256, max_T: int = 256, + use_prev_action: bool = False, prev_emb_dim: int = 32): + super().__init__() + self.use_prev_action = use_prev_action + in_dim = sum(modality_dims.values()) + self.multi_conv = nn.ModuleList([ + nn.Conv1d(in_dim, d_model, k, padding=k // 2) for k in kernels + ]) + self.conv_merge = nn.Conv1d(d_model * len(kernels), d_model, 1) + + self.pos = nn.Parameter(torch.zeros(1, max_T, d_model)) + nn.init.trunc_normal_(self.pos, std=0.02) + self.max_T = max_T + + enc_layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation="gelu", + ) + self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers) + head_in = d_model + if use_prev_action: + self.prev_concat = _PrevActionConcat(prev_emb_dim) + head_in += self.prev_concat.out_dim + else: + self.prev_concat = None + self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout) + + def forward(self, x, mask, prev_v_comp=None, prev_noun=None): + feats = torch.cat([x[m] for m in x], dim=-1).transpose(1, 2) + multi = [c(feats) for c in self.multi_conv] + h = self.conv_merge(torch.cat(multi, dim=1)) + h = h.transpose(1, 2) + T = h.size(1) + if T > self.max_T: + raise ValueError(f"T={T} exceeds HandFormer max_T={self.max_T}") + h = h + self.pos[:, :T, :] + h = self.encoder(h, src_key_padding_mask=~mask) + pooled = _masked_mean_pool(h, mask) + if self.use_prev_action: + pooled = self.prev_concat(pooled, prev_v_comp, prev_noun) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# Placeholder ActionLLM — a conv-stem sensor encoder + a 2-layer Transformer +# trained from scratch as a surrogate. The *full* LoRA+Qwen version lives in +# `train_pred.py` and can be wired in later if the surrogate is too weak. +# --------------------------------------------------------------------------- + +class ActionLLMSurrogate(nn.Module): + def __init__(self, modality_dims: Dict[str, int], d_model: int = 192, + n_heads: int = 6, n_layers: int = 2, dropout: float = 0.1, + head_hidden: int = 256, max_T: int = 256, + use_prev_action: bool = False, prev_emb_dim: int = 32): + super().__init__() + self.use_prev_action = use_prev_action + in_dim = sum(modality_dims.values()) + self.stem = nn.Sequential( + nn.Conv1d(in_dim, d_model, 5, padding=2), + nn.GELU(), + nn.Conv1d(d_model, d_model, 5, padding=2), + ) + self.pos = nn.Parameter(torch.zeros(1, max_T, d_model)) + nn.init.trunc_normal_(self.pos, std=0.02) + self.max_T = max_T + enc_layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model, + dropout=dropout, batch_first=True, activation="gelu", + ) + self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers) + head_in = d_model + if use_prev_action: + self.prev_concat = _PrevActionConcat(prev_emb_dim) + head_in += self.prev_concat.out_dim + else: + self.prev_concat = None + self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout) + + def forward(self, x, mask, prev_v_comp=None, prev_noun=None): + feats = torch.cat([x[m] for m in x], dim=-1).transpose(1, 2) + h = self.stem(feats).transpose(1, 2) + T = h.size(1) + if T > self.max_T: + raise ValueError(f"T={T} exceeds ActionLLM max_T={self.max_T}") + h = h + self.pos[:, :T, :] + h = self.encoder(h, src_key_padding_mask=~mask) + pooled = _masked_mean_pool(h, mask) + if self.use_prev_action: + pooled = self.prev_concat(pooled, prev_v_comp, prev_noun) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# Factory +# --------------------------------------------------------------------------- + +def build_model( + name: str, modality_dims: Dict[str, int], **kwargs, +) -> nn.Module: + name = name.lower() + if name in ("deepconvlstm", "dcl"): + return DeepConvLSTMTriplet(modality_dims, **kwargs) + if name in ("dailyactformer", "ours", "daf"): + return DailyActFormer(modality_dims, **kwargs) + if name in ("rulstm",): + return RULSTMTriplet(modality_dims, **kwargs) + if name in ("futr",): + return FUTRTriplet(modality_dims, **kwargs) + if name in ("afft",): + return AFFTTriplet(modality_dims, **kwargs) + if name in ("handformer",): + return HandFormerTriplet(modality_dims, **kwargs) + if name in ("actionllm",): + return ActionLLMSurrogate(modality_dims, **kwargs) + raise ValueError(f"Unknown model: {name}") + + +# --------------------------------------------------------------------------- +# Smoke-test: build each model, run a random batch, check output shapes. +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + B, T = 2, 160 + dims = {"imu": 180, "emg": 8, "eyetrack": 24} + x = {m: torch.randn(B, T, d) for m, d in dims.items()} + mask = torch.ones(B, T, dtype=torch.bool) + + for name in ("deepconvlstm", "dailyactformer", "rulstm", "futr", "afft", + "handformer", "actionllm"): + model = build_model(name, dims) + n_params = sum(p.numel() for p in model.parameters()) + out = model(x, mask) + print(f"{name:16s} params={n_params:>10,} shapes=" + f"vf={tuple(out['verb_fine'].shape)} " + f"vc={tuple(out['verb_composite'].shape)} " + f"n={tuple(out['noun'].shape)} " + f"h={tuple(out['hand'].shape)}") diff --git a/experiments/nets/published_models.py b/experiments/nets/published_models.py new file mode 100644 index 0000000000000000000000000000000000000000..9e933e2f66fc21365b1a15ad397bedf6c718236c --- /dev/null +++ b/experiments/nets/published_models.py @@ -0,0 +1,699 @@ +""" +Published baseline models for NeurIPS 2026 benchmark experiments. + +Contains faithful implementations of 6 published models: + 1. DeepConvLSTM (Ordonez & Roggen, Sensors 2016) - Exp1/Exp3 + 2. InceptionTime (Fawaz et al., DMKD 2020) - Exp1/Exp3 + 3. MS-TCN++ (Li et al., TPAMI 2020) - Exp2 + 4. DiffAct (Liu et al., ICCV 2023) - Exp2 + 5. UnderPressure (Mourot et al., SCA/CGF 2022) - Exp3/Exp4a + 6. emg2pose (Meta, NeurIPS 2024 D&B) - Exp4b +""" + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +# ============================================================ +# 1. DeepConvLSTM (Ordonez & Roggen, Sensors 2016) +# "Deep Convolutional and LSTM Recurrent Neural Networks +# for Multimodal Wearable Activity Recognition" +# 4 Conv layers -> 2 LSTM layers -> pooling/per-frame output +# ============================================================ + +class DeepConvLSTMBackbone(nn.Module): + """DeepConvLSTM backbone for sequence-level classification (Exp1). + + Input: (B, T, C), optional mask + Output: (B, output_dim) + """ + + def __init__(self, input_dim, hidden_dim=128, num_conv_layers=4, + conv_filters=64, conv_kernel=5, num_lstm_layers=2): + super().__init__() + conv_layers = [] + in_ch = input_dim + for i in range(num_conv_layers): + out_ch = conv_filters + conv_layers.append(nn.Sequential( + nn.Conv1d(in_ch, out_ch, conv_kernel, padding=conv_kernel // 2), + nn.BatchNorm1d(out_ch), + nn.ReLU(), + nn.Dropout(0.1 if i < num_conv_layers - 1 else 0.2), + )) + in_ch = out_ch + self.convs = nn.ModuleList(conv_layers) + + self.lstm = nn.LSTM( + conv_filters, hidden_dim, num_layers=num_lstm_layers, + batch_first=True, bidirectional=False, + dropout=0.2 if num_lstm_layers > 1 else 0, + ) + self.output_dim = hidden_dim + + def forward(self, x, mask=None): + # x: (B, T, C) -> Conv expects (B, C, T) + x = x.permute(0, 2, 1) + for conv in self.convs: + x = conv(x) + x = x.permute(0, 2, 1) # (B, T, conv_filters) + + out, (h_n, _) = self.lstm(x) + # Use last hidden state + feat = h_n[-1] # (B, hidden_dim) + return feat + + +class DeepConvLSTMContact(nn.Module): + """DeepConvLSTM for frame-level contact detection (Exp3). + + Input: (B, T, C) + Output: (B, T, 2) + """ + + def __init__(self, input_dim, hidden_dim=64, num_conv_layers=4, + conv_filters=64, conv_kernel=5): + super().__init__() + conv_layers = [] + in_ch = input_dim + for i in range(num_conv_layers): + conv_layers.append(nn.Sequential( + nn.Conv1d(in_ch, conv_filters, conv_kernel, padding=conv_kernel // 2), + nn.BatchNorm1d(conv_filters), + nn.ReLU(), + nn.Dropout(0.1), + )) + in_ch = conv_filters + self.convs = nn.ModuleList(conv_layers) + self.lstm = nn.LSTM(conv_filters, hidden_dim, num_layers=2, + batch_first=True, bidirectional=True, dropout=0.2) + self.head = nn.Linear(hidden_dim * 2, 2) + + def forward(self, x): + x = x.permute(0, 2, 1) + for conv in self.convs: + x = conv(x) + x = x.permute(0, 2, 1) + out, _ = self.lstm(x) + return self.head(out) + + +# ============================================================ +# 2. InceptionTime (Fawaz et al., DMKD 2020) +# "InceptionTime: Finding AlexNet for Time Series Classification" +# Inception modules with multi-scale convolutions + residual +# ============================================================ + +class InceptionModule(nn.Module): + """Single Inception module for time series.""" + + def __init__(self, in_channels, n_filters=32, kernel_sizes=(9, 19, 39), + bottleneck_channels=32): + super().__init__() + # Bottleneck + self.bottleneck = nn.Conv1d(in_channels, bottleneck_channels, 1, bias=False) + + # Parallel convolutions with different kernel sizes (odd kernels for symmetric padding) + self.convs = nn.ModuleList() + for ks in kernel_sizes: + self.convs.append( + nn.Conv1d(bottleneck_channels, n_filters, ks, + padding=(ks - 1) // 2, bias=False) + ) + + # MaxPool branch + self.maxpool_conv = nn.Sequential( + nn.MaxPool1d(3, stride=1, padding=1), + nn.Conv1d(in_channels, n_filters, 1, bias=False), + ) + + self.bn = nn.BatchNorm1d(n_filters * (len(kernel_sizes) + 1)) + self.relu = nn.ReLU() + + def forward(self, x): + # x: (B, C, T) + x_bottleneck = self.bottleneck(x) + conv_outputs = [conv(x_bottleneck) for conv in self.convs] + conv_outputs.append(self.maxpool_conv(x)) + out = torch.cat(conv_outputs, dim=1) + return self.relu(self.bn(out)) + + +class InceptionBlock(nn.Module): + """Stack of Inception modules with a residual connection.""" + + def __init__(self, in_channels, n_filters=32, depth=3): + super().__init__() + n_out = n_filters * 4 # 3 conv branches + 1 maxpool branch + modules = [] + for i in range(depth): + inc = in_channels if i == 0 else n_out + modules.append(InceptionModule(inc, n_filters)) + self.modules_list = nn.ModuleList(modules) + + # Residual connection + self.use_residual = (in_channels != n_out) + if self.use_residual: + self.residual = nn.Sequential( + nn.Conv1d(in_channels, n_out, 1, bias=False), + nn.BatchNorm1d(n_out), + ) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + for mod in self.modules_list: + x = mod(x) + if self.use_residual: + residual = self.residual(residual) + return self.relu(x + residual) + + +class InceptionTimeBackbone(nn.Module): + """InceptionTime backbone for sequence-level classification (Exp1). + + Input: (B, T, C), optional mask + Output: (B, output_dim) + """ + + def __init__(self, input_dim, hidden_dim=128, n_filters=32, num_blocks=2, depth=3): + super().__init__() + blocks = [] + in_ch = input_dim + for i in range(num_blocks): + blocks.append(InceptionBlock(in_ch, n_filters, depth)) + in_ch = n_filters * 4 + self.blocks = nn.ModuleList(blocks) + self.output_dim = n_filters * 4 + + def forward(self, x, mask=None): + # x: (B, T, C) -> (B, C, T) + x = x.permute(0, 2, 1) + for block in self.blocks: + x = block(x) + # Global average pooling with mask + if mask is not None: + x = (x * mask.unsqueeze(1).float()).sum(2) / mask.sum(1, keepdim=True).float().clamp(min=1) + else: + x = x.mean(2) + return x # (B, n_filters*4) + + +class InceptionTimeContact(nn.Module): + """InceptionTime for frame-level contact detection (Exp3). + + Input: (B, T, C) + Output: (B, T, 2) + """ + + def __init__(self, input_dim, hidden_dim=64, n_filters=32, num_blocks=2, depth=3): + super().__init__() + blocks = [] + in_ch = input_dim + for i in range(num_blocks): + blocks.append(InceptionBlock(in_ch, n_filters, depth)) + in_ch = n_filters * 4 + self.blocks = nn.ModuleList(blocks) + self.head = nn.Conv1d(n_filters * 4, 2, 1) + + def forward(self, x): + x = x.permute(0, 2, 1) + for block in self.blocks: + x = block(x) + out = self.head(x) + return out.permute(0, 2, 1) # (B, T, 2) + + +# ============================================================ +# 3. MS-TCN++ (Li et al., TPAMI 2020) +# "MS-TCN++: Multi-Stage Temporal Convolutional Network +# for Action Segmentation" +# Key improvement: dual dilated layers in each residual block +# ============================================================ + +class DualDilatedResBlock(nn.Module): + """Dual dilated residual block (MS-TCN++ key contribution). + + Uses two parallel dilated convolutions with different dilation rates + to capture both short-range and long-range temporal patterns. + """ + + def __init__(self, channels, dilation1, dilation2): + super().__init__() + # Branch 1: smaller dilation + self.conv1_dilated = nn.Conv1d( + channels, channels, 3, + padding=dilation1, dilation=dilation1 + ) + # Branch 2: larger dilation + self.conv2_dilated = nn.Conv1d( + channels, channels, 3, + padding=dilation2, dilation=dilation2 + ) + self.conv_fusion = nn.Conv1d(channels, channels, 1) + self.bn = nn.BatchNorm1d(channels) + self.dropout = nn.Dropout(0.3) + + def forward(self, x): + residual = x + out1 = F.relu(self.conv1_dilated(x)) + out2 = F.relu(self.conv2_dilated(x)) + out = out1 + out2 + out = self.dropout(F.relu(self.bn(self.conv_fusion(out)))) + return out + residual + + +class MSTCNPPStage(nn.Module): + """Single stage of MS-TCN++ with dual dilated layers.""" + + def __init__(self, in_channels, hidden_channels, num_classes, num_layers=10): + super().__init__() + self.input_conv = nn.Conv1d(in_channels, hidden_channels, 1) + self.layers = nn.ModuleList() + for i in range(num_layers): + dilation1 = 2 ** i + dilation2 = 2 ** (i + 1) if i < num_layers - 1 else 2 ** i + self.layers.append(DualDilatedResBlock(hidden_channels, dilation1, dilation2)) + self.output_conv = nn.Conv1d(hidden_channels, num_classes, 1) + + def forward(self, x): + x = self.input_conv(x) + for layer in self.layers: + x = layer(x) + return self.output_conv(x) + + +class MSTCNPP(nn.Module): + """MS-TCN++ for temporal action segmentation (Exp2). + + Input: (B, T, C) + Output: list of (B, T, num_classes) per stage + """ + + def __init__(self, input_dim, num_classes, hidden_dim=64, num_stages=4, num_layers=10): + super().__init__() + self.stages = nn.ModuleList() + # First stage: input features -> predictions + self.stages.append(MSTCNPPStage(input_dim, hidden_dim, num_classes, num_layers)) + # Refinement stages: predictions -> refined predictions + for _ in range(num_stages - 1): + self.stages.append(MSTCNPPStage(num_classes, hidden_dim, num_classes, num_layers)) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B, C, T) + outputs = [] + for stage in self.stages: + x = stage(x) + outputs.append(x.permute(0, 2, 1)) # (B, T, num_classes) + # Feed softmax of predictions to next stage + if stage != self.stages[-1]: + x = F.softmax(x, dim=1) + return outputs + + +# ============================================================ +# 4. DiffAct (Liu et al., ICCV 2023) +# "Diffusion Action Segmentation" +# Denoising diffusion model for iterative action refinement. +# Simplified but faithful implementation. +# ============================================================ + +class ConditionalLayerNorm(nn.Module): + """Layer norm conditioned on diffusion timestep.""" + + def __init__(self, channels): + super().__init__() + self.norm = nn.GroupNorm(1, channels) # equivalent to LayerNorm for 1D + + def forward(self, x): + return self.norm(x) + + +class DiffActBlock(nn.Module): + """Residual block for DiffAct denoising network.""" + + def __init__(self, channels, dilation, time_emb_dim): + super().__init__() + self.conv1 = nn.Conv1d(channels, channels, 3, padding=dilation, dilation=dilation) + self.conv2 = nn.Conv1d(channels, channels, 1) + self.norm1 = ConditionalLayerNorm(channels) + self.norm2 = ConditionalLayerNorm(channels) + self.time_proj = nn.Linear(time_emb_dim, channels) + self.dropout = nn.Dropout(0.1) + + def forward(self, x, time_emb): + residual = x + x = self.norm1(x) + x = F.relu(self.conv1(x)) + # Add time embedding + t = self.time_proj(time_emb).unsqueeze(-1) # (B, C, 1) + x = x + t + x = self.norm2(x) + x = self.dropout(F.relu(self.conv2(x))) + return x + residual + + +class DiffActConditionEncoder(nn.Module): + """Temporal feature encoder for conditioning the denoising network.""" + + def __init__(self, input_dim, hidden_dim, num_layers=6): + super().__init__() + self.input_conv = nn.Conv1d(input_dim, hidden_dim, 1) + self.layers = nn.ModuleList() + for i in range(num_layers): + dilation = 2 ** (i % 5) + self.layers.append(nn.Sequential( + nn.Conv1d(hidden_dim, hidden_dim, 3, padding=dilation, dilation=dilation), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + nn.Dropout(0.1), + )) + + def forward(self, x): + x = self.input_conv(x) + for layer in self.layers: + x = layer(x) + x # residual + return x + + +class SinusoidalTimeEmbedding(nn.Module): + """Sinusoidal positional embedding for diffusion timestep.""" + + def __init__(self, dim): + super().__init__() + self.dim = dim + self.mlp = nn.Sequential( + nn.Linear(dim, dim * 4), + nn.GELU(), + nn.Linear(dim * 4, dim), + ) + + def forward(self, t): + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, device=t.device) * -emb) + emb = t.unsqueeze(-1).float() * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + return self.mlp(emb) + + +class DiffAct(nn.Module): + """DiffAct: Diffusion Action Segmentation (Exp2). + + During training: noises ground-truth action probabilities and denoises. + During inference: iteratively denoises from pure noise. + + Input: (B, T, C) + Output: list of (B, T, num_classes) [final denoised prediction] + """ + + def __init__(self, input_dim, num_classes, hidden_dim=64, + num_encoder_layers=6, num_denoise_layers=6, + num_diffusion_steps=10): + super().__init__() + self.num_classes = num_classes + self.num_steps = num_diffusion_steps + + # Condition encoder: extract temporal features from input + self.condition_encoder = DiffActConditionEncoder(input_dim, hidden_dim, num_encoder_layers) + + # Initial prediction head (non-diffusion baseline) + self.initial_head = nn.Conv1d(hidden_dim, num_classes, 1) + + # Time embedding + self.time_emb = SinusoidalTimeEmbedding(hidden_dim) + + # Denoising network + self.denoise_input = nn.Conv1d(num_classes + hidden_dim, hidden_dim, 1) + self.denoise_blocks = nn.ModuleList() + for i in range(num_denoise_layers): + dilation = 2 ** (i % 5) + self.denoise_blocks.append(DiffActBlock(hidden_dim, dilation, hidden_dim)) + self.denoise_output = nn.Conv1d(hidden_dim, num_classes, 1) + + # Noise schedule (cosine) + self._setup_noise_schedule() + + def _setup_noise_schedule(self): + steps = self.num_steps + s = 0.008 + t = torch.linspace(0, steps, steps + 1) + alphas_cumprod = torch.cos(((t / steps) + s) / (1 + s) * math.pi * 0.5) ** 2 + alphas_cumprod = alphas_cumprod / alphas_cumprod[0] + betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1]) + betas = torch.clamp(betas, 0.0001, 0.999) + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + self.register_buffer('betas', betas) + self.register_buffer('alphas_cumprod', alphas_cumprod) + self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod)) + self.register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1 - alphas_cumprod)) + + def _add_noise(self, x_start, t, noise=None): + """Add noise to x_start at timestep t.""" + if noise is None: + noise = torch.randn_like(x_start) + sqrt_alpha = self.sqrt_alphas_cumprod[t].view(-1, 1, 1) + sqrt_one_minus = self.sqrt_one_minus_alphas_cumprod[t].view(-1, 1, 1) + return sqrt_alpha * x_start + sqrt_one_minus * noise + + def _denoise_step(self, x_noisy, cond_features, time_emb): + """Single denoising step.""" + x = torch.cat([x_noisy, cond_features], dim=1) # (B, C+hidden, T) + x = self.denoise_input(x) + for block in self.denoise_blocks: + x = block(x, time_emb) + return self.denoise_output(x) + + def forward(self, x): + """ + Training: returns [initial_pred, denoised_pred] + Inference: returns [initial_pred, iteratively_denoised_pred] + """ + x_in = x.permute(0, 2, 1) # (B, C, T) + B, _, T = x_in.shape + + # Encode condition features + cond = self.condition_encoder(x_in) # (B, hidden, T) + initial_logits = self.initial_head(cond).permute(0, 2, 1) # (B, T, num_classes) + + if self.training: + # Training: noise the initial prediction and denoise (end-to-end) + x_start = F.softmax(initial_logits, dim=-1).permute(0, 2, 1) # (B, C, T) + t = torch.randint(0, self.num_steps, (B,), device=x.device) + noise = torch.randn_like(x_start) + x_noisy = self._add_noise(x_start.detach(), t, noise) + time_emb = self.time_emb(t) + denoised = self._denoise_step(x_noisy, cond, time_emb) + return [initial_logits, denoised.permute(0, 2, 1)] + else: + # Inference: iterative denoising from noise + x_t = torch.randn(B, self.num_classes, T, device=x.device) + for step in reversed(range(self.num_steps)): + t = torch.full((B,), step, device=x.device, dtype=torch.long) + time_emb = self.time_emb(t) + pred_noise = self._denoise_step(x_t, cond, time_emb) + # Simplified DDPM update + alpha = self.alphas_cumprod[step] + alpha_prev = self.alphas_cumprod[step - 1] if step > 0 else torch.tensor(1.0) + beta = self.betas[step] + x_t = (1 / torch.sqrt(1 - beta)) * ( + x_t - beta / self.sqrt_one_minus_alphas_cumprod[step] * pred_noise + ) + if step > 0: + x_t = x_t + torch.sqrt(beta) * torch.randn_like(x_t) * 0.5 + return [initial_logits, x_t.permute(0, 2, 1)] + + +# ============================================================ +# 5. UnderPressure (Mourot et al., SCA/CGF 2022) +# "UnderPressure: Deep Learning for Foot Contact Detection, +# Ground Reaction Force Estimation and Footskate Cleanup" +# GRU-based architecture for contact detection + force regression. +# Adapted for hand contact detection and MoCap->Pressure prediction. +# ============================================================ + +class UnderPressureContact(nn.Module): + """UnderPressure model adapted for hand contact detection (Exp3). + + Architecture: Conv feature extractor -> BiGRU -> contact prediction head + Input: (B, T, C) + Output: (B, T, 2) [right_contact, left_contact] + """ + + def __init__(self, input_dim, hidden_dim=64, num_gru_layers=2): + super().__init__() + # Feature extractor (conv layers for local temporal patterns) + self.feature_extractor = nn.Sequential( + nn.Conv1d(input_dim, hidden_dim, 7, padding=3), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + ) + # BiGRU for temporal modeling + self.gru = nn.GRU( + hidden_dim, hidden_dim, num_layers=num_gru_layers, + batch_first=True, bidirectional=True, + dropout=0.2 if num_gru_layers > 1 else 0, + ) + # Contact prediction head + self.contact_head = nn.Sequential( + nn.Linear(hidden_dim * 2, hidden_dim), + nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(hidden_dim, 2), + ) + + def forward(self, x): + # x: (B, T, C) -> (B, C, T) + feat = self.feature_extractor(x.permute(0, 2, 1)) + feat = feat.permute(0, 2, 1) # (B, T, hidden) + gru_out, _ = self.gru(feat) + return self.contact_head(gru_out) # (B, T, 2) + + +class UnderPressureRegressor(nn.Module): + """UnderPressure model adapted for MoCap -> Pressure regression (Exp4a). + + Architecture: Conv feature extractor -> BiGRU -> pressure regression head + Input: (B, T, input_dim) + Output: (B, T, output_dim) + """ + + def __init__(self, input_dim, output_dim, hidden_dim=128, num_gru_layers=2): + super().__init__() + self.feature_extractor = nn.Sequential( + nn.Conv1d(input_dim, hidden_dim, 7, padding=3), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + nn.Conv1d(hidden_dim, hidden_dim, 3, padding=1), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + ) + self.gru = nn.GRU( + hidden_dim, hidden_dim, num_layers=num_gru_layers, + batch_first=True, bidirectional=True, + dropout=0.2 if num_gru_layers > 1 else 0, + ) + self.regression_head = nn.Sequential( + nn.Linear(hidden_dim * 2, hidden_dim), + nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(hidden_dim, output_dim), + ) + + def forward(self, x): + feat = self.feature_extractor(x.permute(0, 2, 1)) + feat = feat.permute(0, 2, 1) + gru_out, _ = self.gru(feat) + return self.regression_head(gru_out) + + +# ============================================================ +# 6. emg2pose (Meta/Facebook Research, NeurIPS 2024 D&B) +# "emg2pose: A Large and Diverse Benchmark for +# Surface Electromyographic Hand Pose Estimation" +# CNN feature extractor + Transformer encoder, +# with optional velocity-based integration (vemg2pose). +# ============================================================ + +class EMG2PoseEncoder(nn.Module): + """CNN + Transformer encoder from emg2pose.""" + + def __init__(self, input_dim, hidden_dim=128, num_transformer_layers=4, nhead=4): + super().__init__() + # Multi-scale CNN feature extractor + self.conv_small = nn.Sequential( + nn.Conv1d(input_dim, hidden_dim // 2, 3, padding=1), + nn.BatchNorm1d(hidden_dim // 2), + nn.ReLU(), + ) + self.conv_medium = nn.Sequential( + nn.Conv1d(input_dim, hidden_dim // 4, 7, padding=3), + nn.BatchNorm1d(hidden_dim // 4), + nn.ReLU(), + ) + self.conv_large = nn.Sequential( + nn.Conv1d(input_dim, hidden_dim // 4, 15, padding=7), + nn.BatchNorm1d(hidden_dim // 4), + nn.ReLU(), + ) + # Projection to hidden_dim + self.proj = nn.Sequential( + nn.Conv1d(hidden_dim, hidden_dim, 1), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + ) + # Transformer encoder for temporal modeling + encoder_layer = nn.TransformerEncoderLayer( + d_model=hidden_dim, nhead=nhead, + dim_feedforward=hidden_dim * 4, + dropout=0.1, batch_first=True, + ) + self.transformer = nn.TransformerEncoder(encoder_layer, num_transformer_layers) + + def forward(self, x): + # x: (B, T, C) -> (B, C, T) + x_t = x.permute(0, 2, 1) + f_small = self.conv_small(x_t) + f_medium = self.conv_medium(x_t) + f_large = self.conv_large(x_t) + feat = torch.cat([f_small, f_medium, f_large], dim=1) + feat = self.proj(feat).permute(0, 2, 1) # (B, T, hidden) + return self.transformer(feat) + + +class EMG2Pose(nn.Module): + """emg2pose model for EMG -> Hand Pose regression (Exp4b). + + Predicts per-frame hand joint positions from EMG signals. + Uses velocity-based integration (vemg2pose variant): + predict velocity -> integrate to get positions. + + Input: (B, T, input_dim) [EMG channels] + Output: (B, T, output_dim) [hand joint positions] + """ + + def __init__(self, input_dim, output_dim, hidden_dim=128, + num_transformer_layers=4, use_velocity=True): + super().__init__() + self.use_velocity = use_velocity + self.encoder = EMG2PoseEncoder(input_dim, hidden_dim, num_transformer_layers) + + if use_velocity: + # Predict velocity, then integrate + self.velocity_head = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_dim // 2, output_dim), + ) + # Learnable initial position + self.initial_pos = nn.Parameter(torch.zeros(1, 1, output_dim)) + else: + # Direct position prediction + self.position_head = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_dim // 2, output_dim), + ) + + def forward(self, x): + features = self.encoder(x) # (B, T, hidden) + + if self.use_velocity: + velocity = self.velocity_head(features) # (B, T, output_dim) + # Cumulative sum to integrate velocity -> position + positions = torch.cumsum(velocity, dim=1) + self.initial_pos + return positions + else: + return self.position_head(features) diff --git a/experiments/s9_primitives.json b/experiments/s9_primitives.json new file mode 100644 index 0000000000000000000000000000000000000000..85130c953ff3ca41c7ce6cc5767b102dd4056444 --- /dev/null +++ b/experiments/s9_primitives.json @@ -0,0 +1,76 @@ +{ + "version": "s9_docx_2025_12_05", + "source": "${PULSE_ROOT}", + "categories": ["hand", "arm", "body", "fine", "composite"], + "primitives": [ + {"id": 0, "category": "hand", "zh": "伸手", "en": "reach", "note": "forward/up/down/side"}, + {"id": 1, "category": "hand", "zh": "抓握", "en": "grasp", "note": "pinch / hold / clamp"}, + {"id": 2, "category": "hand", "zh": "松开", "en": "release", "note": "release object"}, + {"id": 3, "category": "hand", "zh": "旋转手腕", "en": "rotate_wrist", "note": "twist / turn"}, + {"id": 4, "category": "hand", "zh": "按压", "en": "press", "note": "downward force"}, + {"id": 5, "category": "hand", "zh": "拉动", "en": "pull", "note": "toward self"}, + {"id": 6, "category": "hand", "zh": "推动", "en": "push", "note": "outward force"}, + {"id": 7, "category": "hand", "zh": "滑动", "en": "slide", "note": "translation motion"}, + {"id": 8, "category": "hand", "zh": "捏合", "en": "pinch", "note": "two/multi finger pinch"}, + {"id": 9, "category": "hand", "zh": "展开", "en": "spread_fingers", "note": "fingers open"}, + + {"id": 10, "category": "arm", "zh": "抬起", "en": "raise_arm", "note": "arm up"}, + {"id": 11, "category": "arm", "zh": "放下", "en": "lower_arm", "note": "arm down"}, + {"id": 12, "category": "arm", "zh": "伸展", "en": "extend_arm", "note": "arm straight"}, + {"id": 13, "category": "arm", "zh": "弯曲", "en": "bend_elbow", "note": "elbow bend"}, + {"id": 14, "category": "arm", "zh": "摆动", "en": "swing_arm", "note": "left-right / forward-back"}, + {"id": 15, "category": "arm", "zh": "环绕", "en": "circle_arm", "note": "circular motion"}, + + {"id": 16, "category": "body", "zh": "弯腰", "en": "bend_torso", "note": "lean forward"}, + {"id": 17, "category": "body", "zh": "直立", "en": "stand_upright", "note": "return to standing"}, + {"id": 18, "category": "body", "zh": "蹲下", "en": "squat_down", "note": "lower center of mass"}, + {"id": 19, "category": "body", "zh": "站起", "en": "stand_up", "note": "return to height"}, + {"id": 20, "category": "body", "zh": "转身", "en": "turn_body", "note": "torso rotate"}, + {"id": 21, "category": "body", "zh": "侧身", "en": "lean_side", "note": "torso tilt"}, + {"id": 22, "category": "body", "zh": "迈步", "en": "step", "note": "shift position"}, + + {"id": 23, "category": "fine", "zh": "插入", "en": "insert", "note": "object enters"}, + {"id": 24, "category": "fine", "zh": "拔出", "en": "extract", "note": "object exits"}, + {"id": 25, "category": "fine", "zh": "折叠", "en": "fold", "note": "change shape"}, + {"id": 26, "category": "fine", "zh": "撕扯", "en": "tear", "note": "separate"}, + {"id": 27, "category": "fine", "zh": "擦拭", "en": "wipe", "note": "back-and-forth"}, + + {"id": 28, "category": "composite", "zh": "拿起物品", "en": "pick_up_object", "note": "reach -> grasp -> raise"}, + {"id": 29, "category": "composite", "zh": "放下物品", "en": "put_down_object", "note": "move -> release -> retract"}, + {"id": 30, "category": "composite", "zh": "移动物品", "en": "move_object", "note": "pick_up -> move -> put_down"}, + {"id": 31, "category": "composite", "zh": "交换手持物", "en": "transfer_between_hands","note": "one hand grasp -> other hand take -> first release"}, + {"id": 32, "category": "composite", "zh": "打开盖子", "en": "open_lid", "note": "grasp -> rotate/lift"}, + {"id": 33, "category": "composite", "zh": "关闭盖子", "en": "close_lid", "note": "align -> press/rotate"}, + {"id": 34, "category": "composite", "zh": "倒入液体", "en": "pour_liquid", "note": "lift -> tilt -> control flow -> reset"}, + {"id": 35, "category": "composite", "zh": "舀取", "en": "scoop", "note": "insert -> raise -> move"}, + {"id": 36, "category": "composite", "zh": "打开柜门", "en": "open_cabinet_door", "note": "grasp handle -> pull"}, + {"id": 37, "category": "composite", "zh": "关闭柜门", "en": "close_cabinet_door", "note": "push -> confirm"}, + {"id": 38, "category": "composite", "zh": "打开抽屉", "en": "open_drawer", "note": "grasp -> pull out"}, + {"id": 39, "category": "composite", "zh": "按下开关", "en": "press_switch", "note": "reach -> press"}, + {"id": 40, "category": "composite", "zh": "折叠衣物", "en": "fold_clothing", "note": "spread -> fold -> flatten"}, + {"id": 41, "category": "composite", "zh": "叠放物品", "en": "stack_objects", "note": "pick_up -> align -> place gently"}, + {"id": 42, "category": "composite", "zh": "排列物品", "en": "arrange_objects", "note": "move -> adjust spacing -> align"}, + {"id": 43, "category": "composite", "zh": "分类收纳", "en": "sort_and_store", "note": "identify -> group -> place"}, + {"id": 44, "category": "composite", "zh": "擦拭表面", "en": "wipe_surface", "note": "take cloth -> press -> back-and-forth"}, + {"id": 45, "category": "composite", "zh": "扫除垃圾", "en": "sweep_debris", "note": "broom -> gather -> dustpan"}, + {"id": 46, "category": "composite", "zh": "倾倒垃圾", "en": "dump_trash", "note": "lift container -> align -> tilt -> pour"}, + {"id": 47, "category": "composite", "zh": "喷洒液体", "en": "spray_liquid", "note": "press nozzle -> move -> release"}, + {"id": 48, "category": "composite", "zh": "撕胶带", "en": "tear_tape", "note": "pull -> tear off"}, + {"id": 49, "category": "composite", "zh": "贴标签", "en": "stick_label", "note": "peel -> align -> press"}, + {"id": 50, "category": "composite", "zh": "包裹物品", "en": "wrap_object", "note": "spread wrap -> place item -> fold -> seal"}, + {"id": 51, "category": "composite", "zh": "系绳打结", "en": "tie_knot", "note": "cross -> through -> tighten"}, + {"id": 52, "category": "composite", "zh": "拿起笔", "en": "pick_up_pen", "note": "pinch -> adjust grip"}, + {"id": 53, "category": "composite", "zh": "写字", "en": "write", "note": "controlled motion -> apply pressure"}, + {"id": 54, "category": "composite", "zh": "翻页", "en": "turn_page", "note": "pinch corner -> flip"}, + {"id": 55, "category": "composite", "zh": "插入电源", "en": "plug_in_power", "note": "align -> push in"}, + {"id": 56, "category": "composite", "zh": "连接线缆", "en": "connect_cable", "note": "align connector -> insert -> confirm"}, + {"id": 57, "category": "composite", "zh": "组装部件", "en": "assemble_parts", "note": "align -> snap/screw"}, + {"id": 58, "category": "composite", "zh": "称重", "en": "weigh", "note": "place item -> read scale"}, + {"id": 59, "category": "composite", "zh": "量取", "en": "measure_volume", "note": "pour -> read marking -> adjust"}, + {"id": 60, "category": "composite", "zh": "计数", "en": "count", "note": "move one by one -> tally"}, + {"id": 61, "category": "composite", "zh": "挂衣服", "en": "hang_clothing", "note": "take hanger -> insert garment -> hang"}, + {"id": 62, "category": "composite", "zh": "铲猫砂", "en": "scoop_litter", "note": "insert -> raise -> sift -> pour"}, + {"id": 63, "category": "composite", "zh": "搅拌", "en": "stir", "note": "insert spoon -> circular motion"}, + {"id": 64, "category": "composite", "zh": "剪切", "en": "cut", "note": "hold scissors -> align -> close"} + ] +} diff --git a/experiments/slurm/freeze_all_rows.sh b/experiments/slurm/freeze_all_rows.sh new file mode 100644 index 0000000000000000000000000000000000000000..6c0ecb0f0185b87fd6b7ea37ff083983ee8ea2df --- /dev/null +++ b/experiments/slurm/freeze_all_rows.sh @@ -0,0 +1,179 @@ +#!/bin/bash +# Create folder structure for ALL rows across Tables 1, 3, 4, 5, 7 and +# freeze the current experiments/ code into each one. After this you can +# cd into any // and run ./run.sh to submit 5 SLURM seeds. +# +# Re-running this script is safe: it will re-freeze the code (overwrite the +# snapshot), but won't clobber any existing seeds/ outputs. +set -euo pipefail + +BASEDIR=${BASEDIR:-${PULSE_ROOT}} +EXP=${BASEDIR}/experiments +SETUP="${EXP}/setup_row.sh" + +COMMON="--epochs 40 --batch_size 32 --lr 3e-4 --weight_decay 1e-4 \ +--patience 12 --label_smoothing 0.05 --use_class_weights \ +--num_workers 2" + +ALL5="imu,emg,eyetrack,mocap,pressure" + +row () { + # $1=table $2=row $3=desc $4=cli + bash "${SETUP}" --table "$1" --row "$2" --desc "$3" --cli "$4 ${COMMON}" +} + +# ============================================================ +# Table 1: Main comparison at T_fut=2s +# ============================================================ +T1=table1_main_comparison +cat > "${BASEDIR}/${T1}/README.md" <<'EOF' +# Table 1: Main Comparison (Next-Action Prediction, T_fut = 2 s) + +Each baseline is run on its most favourable modality subset; our model +(DailyActFormer) uses all 5 synchronised modalities. 5 seeds per row; +report mean ± std of Verb fine Top-1/5, Noun Top-1/5, Hand Top-1, Action +Top-1 (= verb ∧ noun ∧ hand). Action Top-1 is the headline metric. + +| Row | Method | Family | Modalities | +|-----|-------------------|-----------------|---------------------| +| 01 | DailyActFormer | cross-modal Trf | imu+emg+eye+mocap+P | +| 02 | DeepConvLSTM | CNN+LSTM (IMU) | imu | +| 03 | DeepConvLSTM 3mod | CNN+LSTM | imu+mocap+emg | +| 04 | RULSTM | rolling LSTM | imu+mocap | +| 05 | FUTR | long-term Trf | mocap+imu+emg | +| 06 | AFFT | multimodal Trf | imu+emg+eye+mocap | +| 07 | HandFormer | hand-pose Trf | mocap (fingers) | +| 08 | ActionLLM (LoRA) | LLM-based | imu+emg+eye | +EOF + +mkdir -p "${BASEDIR}/${T1}" +row ${T1} row01_ours_dailyactformer_all5 \ + "Our model, all 5 modalities (headline row)" \ + "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2" + +row ${T1} row02_deepconvlstm_imu \ + "DeepConvLSTM on IMU only (classic HAR baseline)" \ + "--model deepconvlstm --modalities imu --t_obs 8 --t_fut 2" + +row ${T1} row03_deepconvlstm_3mod \ + "DeepConvLSTM on IMU+MoCap+EMG (best 3-modality concat)" \ + "--model deepconvlstm --modalities imu,mocap,emg --t_obs 8 --t_fut 2" + +row ${T1} row04_rulstm_imu_mocap \ + "RULSTM, rolling-unrolling LSTM (IMU + MoCap late fusion)" \ + "--model rulstm --modalities imu,mocap --t_obs 8 --t_fut 2" + +row ${T1} row05_futr_3mod \ + "FUTR (causal transformer) on MoCap+IMU+EMG" \ + "--model futr --modalities mocap,imu,emg --t_obs 8 --t_fut 2" + +row ${T1} row06_afft_4mod \ + "AFFT (anticipative feature fusion transformer) on 4 modalities" \ + "--model afft --modalities imu,emg,eyetrack,mocap --t_obs 8 --t_fut 2" + +row ${T1} row07_handformer_mocap \ + "HandFormer (skeleton-only ECCV'24) on MoCap finger joints" \ + "--model handformer --modalities mocap --t_obs 8 --t_fut 2" + +row ${T1} row08_actionllm_3mod \ + "ActionLLM (Qwen2.5-0.5B + LoRA) on IMU+EMG+EyeTrack" \ + "--model actionllm --modalities imu,emg,eyetrack --t_obs 8 --t_fut 2" + +# ============================================================ +# Table 3: Horizon curve (DailyActFormer) +# ============================================================ +T3=table3_horizon_curve +mkdir -p "${BASEDIR}/${T3}" +cat > "${BASEDIR}/${T3}/README.md" <<'EOF' +# Table 3: Prediction Horizon Curve (DailyActFormer, all 5 modalities) + +Same model, varying T_fut. Expect monotonic drop in Action Top-1 as +horizon grows; plot line graph in the paper alongside this table. +EOF +HORIZONS=(1 2 5 10 15) +for i in "${!HORIZONS[@]}"; do + tfut="${HORIZONS[$i]}" + idx=$(printf "%02d" $((i+1))) + row ${T3} row${idx}_ours_tfut${tfut}s \ + "Our model at T_fut=${tfut}s" \ + "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut ${tfut}" +done + +# ============================================================ +# Table 4: Modality ablation on DailyActFormer (T_fut=2s) +# ============================================================ +T4=table4_modality_ablation +mkdir -p "${BASEDIR}/${T4}" +cat > "${BASEDIR}/${T4}/README.md" <<'EOF' +# Table 4: Modality Ablation (DailyActFormer, T_fut = 2 s) + +Same model, progressively remove modalities. Each row trained from scratch. +EOF +row ${T4} row01_full_5mod "Full 5-modality (reference)" "--model dailyactformer --modalities imu,emg,eyetrack,mocap,pressure --t_obs 8 --t_fut 2" +row ${T4} row02_no_pressure "Drop pressure" "--model dailyactformer --modalities imu,emg,eyetrack,mocap --t_obs 8 --t_fut 2" +row ${T4} row03_no_eyetrack "Drop eye-tracking" "--model dailyactformer --modalities imu,emg,mocap,pressure --t_obs 8 --t_fut 2" +row ${T4} row04_no_emg "Drop EMG" "--model dailyactformer --modalities imu,eyetrack,mocap,pressure --t_obs 8 --t_fut 2" +row ${T4} row05_no_imu "Drop IMU" "--model dailyactformer --modalities emg,eyetrack,mocap,pressure --t_obs 8 --t_fut 2" +row ${T4} row06_no_mocap "Drop MoCap" "--model dailyactformer --modalities imu,emg,eyetrack,pressure --t_obs 8 --t_fut 2" +row ${T4} row07_imu_emg_only "Only IMU + EMG (physiology-light)" "--model dailyactformer --modalities imu,emg --t_obs 8 --t_fut 2" +row ${T4} row08_mocap_only "Only MoCap (skeleton-only)" "--model dailyactformer --modalities mocap --t_obs 8 --t_fut 2" + +# ============================================================ +# Table 5: Component ablation (DailyActFormer switches) +# ============================================================ +T5=table5_component_ablation +mkdir -p "${BASEDIR}/${T5}" +cat > "${BASEDIR}/${T5}/README.md" <<'EOF' +# Table 5: Component Ablation (DailyActFormer, T_fut = 2 s) + +Each row toggles one architectural/training component of our model. +Component flags are implemented as CLI switches on train_seqpred.py; +see models_seqpred.py for the corresponding model options. +EOF +row ${T5} row01_full \ + "Full model (reference)" \ + "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2" +row ${T5} row02_no_composite_head \ + "Drop the auxiliary verb-composite head (lambda=0)" \ + "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --lambda_verb_composite 0.0" +row ${T5} row03_equal_lambda \ + "Equal-weight all 4 heads (no prior on verb>hand)" \ + "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --lambda_verb_composite 1.0 --lambda_hand 1.0" +row ${T5} row04_no_class_weight \ + "No inverse-frequency class weighting" \ + "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --lambda_verb_composite 0.5" +# row04 re-exposes the default; the variable-off is the absence of --use_class_weights +# We patch this manually — strip the flag out of COMMON. +ROW_DIR="${BASEDIR}/${T5}/row04_no_class_weight/run.sh" +if [[ -e "${ROW_DIR}" ]]; then + sed -i 's/--use_class_weights //g' "${ROW_DIR}" +fi + +row ${T5} row05_no_label_smoothing \ + "Label smoothing off" \ + "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --label_smoothing 0.0" + +# ============================================================ +# Table 7: Missing-modality robustness (train once, eval 6 ways) +# ============================================================ +T7=table7_missing_modality +mkdir -p "${BASEDIR}/${T7}" +cat > "${BASEDIR}/${T7}/README.md" <<'EOF' +# Table 7: Missing-Modality Robustness (T_fut = 2 s) + +Train DailyActFormer with random per-modality dropout (p=0.3). At test time, +evaluate under 6 configurations: full / drop one modality each. Only the +training job has its own folder; eval uses the trained checkpoint to fill +multiple rows of the final table. +EOF +row ${T7} row01_train_with_modality_dropout \ + "DailyActFormer trained with --modality_dropout 0.3" \ + "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --modality_dropout 0.3" +# The 6 test-time configurations (full / no_P / no_E / no_emg / no_imu / +# no_mocap) will be produced by a separate eval script that loads the +# checkpoint from row01 and runs evaluate() with modality subsets. See +# experiments/tasks/eval_missing_modality.py (TBD). + +echo "" +echo "[ok] Froze rows under:" +echo " ${BASEDIR}/{${T1},${T3},${T4},${T5},${T7}}/" diff --git a/experiments/slurm/run_ablation_fix.sh b/experiments/slurm/run_ablation_fix.sh new file mode 100644 index 0000000000000000000000000000000000000000..6746868d0e981229140e2513eee995b6753c5d1f --- /dev/null +++ b/experiments/slurm/run_ablation_fix.sh @@ -0,0 +1,33 @@ +#!/bin/bash +#SBATCH --job-name=ablation_fix +#SBATCH --partition=gpuA800 +#SBATCH --gres=gpu:1 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --mem=32G +#SBATCH --time=1:00:00 +#SBATCH --output=${PULSE_ROOT}/results/ablation_fix_%j.log + +# Fix: mocap+emg late+pretrained — pretrain MOCAP branch (idx=0) instead of emg +set -e +export PYTHONUNBUFFERED=1 + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/modality_ablation +COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --proj_dim 0 --output_dir $OUTDIR" +SEEDS=(42 123 456 789 2024) + +PT_MOCAP=${BASEDIR}/results/exp1_v8/transformer_mocap_early/model_best.pt + +echo "=== Fix: mocap+emg / late+pretrained(mocap, idx=0) ===" +for seed in "${SEEDS[@]}"; do + echo " mocap+emg seed=$seed" + $PYTHON $SCRIPT --modalities mocap,emg --fusion late --seed $seed \ + --pretrained_backbone $PT_MOCAP --freeze_backbone_idx 0 \ + --tag ablation_pt_s${seed} $COMMON 2>&1 | tail -5 +done + +echo "=== Done ===" diff --git a/experiments/slurm/run_ablation_fusion.sh b/experiments/slurm/run_ablation_fusion.sh new file mode 100644 index 0000000000000000000000000000000000000000..6b74c6e940ae969cf64a98c4d9bf5151170499c4 --- /dev/null +++ b/experiments/slurm/run_ablation_fusion.sh @@ -0,0 +1,174 @@ +#!/bin/bash +#SBATCH --job-name=ablation_fuse +#SBATCH --partition=gpuA800 +#SBATCH --gres=gpu:2 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH --time=4:00:00 +#SBATCH --output=${PULSE_ROOT}/results/ablation_fusion_%j.log + +# Test confidence-weighted and learned-weight fusion on all multi-modal combos +# Compare against existing mean fusion results + +set -e +export PYTHONUNBUFFERED=1 + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/modality_ablation +COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --proj_dim 0 --output_dir $OUTDIR" +SEEDS=(42 123 456 789 2024) + +PT_IMU=${BASEDIR}/results/exp1_v7/transformer_imu_early/model_best.pt +PT_MOCAP=${BASEDIR}/results/exp1_v8/transformer_mocap_early/model_best.pt + +echo "=== Ablation: Confidence & Learned Fusion ===" + +# ============================================================ +# GPU 0: confidence-weighted fusion +# ============================================================ +( +export CUDA_VISIBLE_DEVICES=0 + +# mocap+imu / confidence / pretrained imu (idx=1) +echo "--- GPU0: mocap+imu / confidence ---" +for seed in "${SEEDS[@]}"; do + echo " mocap+imu confidence seed=$seed" + $PYTHON $SCRIPT --modalities mocap,imu --fusion late --late_agg confidence \ + --seed $seed --pretrained_backbone $PT_IMU --freeze_backbone_idx 1 \ + --tag ablation_conf_s${seed} $COMMON 2>&1 | tail -3 +done + +# emg+imu / confidence / pretrained imu (idx=1) +echo "--- GPU0: emg+imu / confidence ---" +for seed in "${SEEDS[@]}"; do + echo " emg+imu confidence seed=$seed" + $PYTHON $SCRIPT --modalities emg,imu --fusion late --late_agg confidence \ + --seed $seed --pretrained_backbone $PT_IMU --freeze_backbone_idx 1 \ + --tag ablation_conf_s${seed} $COMMON 2>&1 | tail -3 +done + +# mocap+emg / confidence / pretrained mocap (idx=0) +echo "--- GPU0: mocap+emg / confidence ---" +for seed in "${SEEDS[@]}"; do + echo " mocap+emg confidence seed=$seed" + $PYTHON $SCRIPT --modalities mocap,emg --fusion late --late_agg confidence \ + --seed $seed --pretrained_backbone $PT_MOCAP --freeze_backbone_idx 0 \ + --tag ablation_conf_s${seed} $COMMON 2>&1 | tail -3 +done + +# mocap+emg+imu / confidence / pretrained imu (idx=2, modalities=mocap,emg,imu) +echo "--- GPU0: mocap+emg+imu / confidence ---" +for seed in "${SEEDS[@]}"; do + echo " mocap+emg+imu confidence seed=$seed" + $PYTHON $SCRIPT --modalities imu,mocap,emg --fusion late --late_agg confidence \ + --seed $seed --pretrained_backbone $PT_IMU --freeze_backbone_idx 0 \ + --tag ablation_conf_s${seed} $COMMON 2>&1 | tail -3 +done + +echo "--- GPU0 Done ---" +) & +PID0=$! + +# ============================================================ +# GPU 1: learned-weight fusion +# ============================================================ +( +export CUDA_VISIBLE_DEVICES=1 + +# mocap+imu / learned / pretrained imu (idx=1) +echo "--- GPU1: mocap+imu / learned ---" +for seed in "${SEEDS[@]}"; do + echo " mocap+imu learned seed=$seed" + $PYTHON $SCRIPT --modalities mocap,imu --fusion late --late_agg learned \ + --seed $seed --pretrained_backbone $PT_IMU --freeze_backbone_idx 1 \ + --tag ablation_lrn_s${seed} $COMMON 2>&1 | tail -3 +done + +# emg+imu / learned / pretrained imu (idx=1) +echo "--- GPU1: emg+imu / learned ---" +for seed in "${SEEDS[@]}"; do + echo " emg+imu learned seed=$seed" + $PYTHON $SCRIPT --modalities emg,imu --fusion late --late_agg learned \ + --seed $seed --pretrained_backbone $PT_IMU --freeze_backbone_idx 1 \ + --tag ablation_lrn_s${seed} $COMMON 2>&1 | tail -3 +done + +# mocap+emg / learned / pretrained mocap (idx=0) +echo "--- GPU1: mocap+emg / learned ---" +for seed in "${SEEDS[@]}"; do + echo " mocap+emg learned seed=$seed" + $PYTHON $SCRIPT --modalities mocap,emg --fusion late --late_agg learned \ + --seed $seed --pretrained_backbone $PT_MOCAP --freeze_backbone_idx 0 \ + --tag ablation_lrn_s${seed} $COMMON 2>&1 | tail -3 +done + +# mocap+emg+imu / learned / pretrained imu (idx=0, modalities=imu,mocap,emg) +echo "--- GPU1: mocap+emg+imu / learned ---" +for seed in "${SEEDS[@]}"; do + echo " mocap+emg+imu learned seed=$seed" + $PYTHON $SCRIPT --modalities imu,mocap,emg --fusion late --late_agg learned \ + --seed $seed --pretrained_backbone $PT_IMU --freeze_backbone_idx 0 \ + --tag ablation_lrn_s${seed} $COMMON 2>&1 | tail -3 +done + +echo "--- GPU1 Done ---" +) & +PID1=$! + +wait $PID0 $PID1 + +# ============================================================ +# Collect results +# ============================================================ +echo "" +echo "=== Fusion Comparison ===" +$PYTHON -c " +import json, os, numpy as np + +base = '$OUTDIR' +v8_base = '${BASEDIR}/results/exp1_v8_multiseed' +v9_base = '${BASEDIR}/results/exp1_v9' +seeds = [42, 123, 456, 789, 2024] + +configs = [ + # (label, pattern_template) + # mean (from previous ablation run) + ('mocap+imu / mean', base + '/transformer_mocap-imu_late_ablation_pt_s{}/results.json'), + ('mocap+imu / confidence', base + '/transformer_mocap-imu_late_ablation_conf_s{}/results.json'), + ('mocap+imu / learned', base + '/transformer_mocap-imu_late_ablation_lrn_s{}/results.json'), + ('emg+imu / mean', base + '/transformer_emg-imu_late_ablation_pt_s{}/results.json'), + ('emg+imu / confidence', base + '/transformer_emg-imu_late_ablation_conf_s{}/results.json'), + ('emg+imu / learned', base + '/transformer_emg-imu_late_ablation_lrn_s{}/results.json'), + ('mocap+emg / mean', base + '/transformer_mocap-emg_late_ablation_pt_s{}/results.json'), + ('mocap+emg / confidence', base + '/transformer_mocap-emg_late_ablation_conf_s{}/results.json'), + ('mocap+emg / learned', base + '/transformer_mocap-emg_late_ablation_lrn_s{}/results.json'), + ('3mod / mean', v9_base + '/transformer_imu-mocap-emg_late_pt_s{}/results.json'), + ('3mod / confidence', base + '/transformer_imu-mocap-emg_late_ablation_conf_s{}/results.json'), + ('3mod / learned', base + '/transformer_imu-mocap-emg_late_ablation_lrn_s{}/results.json'), +] + +print(f'{\"Config\":<30} {\"F1 (mean±std)\":<20} {\"Acc (mean±std)\":<20} N') +print('-' * 75) +for label, pat in configs: + f1s, accs = [], [] + for s in seeds: + path = pat.format(s) + if os.path.exists(path): + with open(path) as f: + d = json.load(f) + f1s.append(d['test_macro_f1']) + accs.append(d['test_accuracy']) + if f1s: + f1 = np.array(f1s) + acc = np.array(accs) + print(f'{label:<30} {f1.mean():.3f}±{f1.std():.3f} {acc.mean():.3f}±{acc.std():.3f} {len(f1s)}') + else: + print(f'{label:<30} (no results)') +" + +echo "" +echo "=== All done ===" diff --git a/experiments/slurm/run_asformer_exp3.sh b/experiments/slurm/run_asformer_exp3.sh new file mode 100644 index 0000000000000000000000000000000000000000..5c3a5974e67c5b37daa895318e477e4c6f6fea98 --- /dev/null +++ b/experiments/slurm/run_asformer_exp3.sh @@ -0,0 +1,44 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=4:00:00 +#SBATCH --job-name=ASF_exp3 +#SBATCH --output=${PULSE_ROOT}/results/asformer_exp3_%j.log + +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd $PROJECT + +EXP3_OUT=$PROJECT/results/published_baselines/exp3_asformer +mkdir -p $EXP3_OUT + +echo "=== ASFormer Contact Detection ===" + +for MOD in mocap emg imu "mocap,emg" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu"; do + echo "--- ASFormer / ${MOD} ---" + $PYTHON experiments/train_exp3.py \ + --model asformer --modalities $MOD \ + --hidden_dim 64 --epochs 50 --batch_size 32 \ + --lr 1e-3 --weight_decay 1e-4 --downsample 2 \ + --seed 42 --output_dir $EXP3_OUT 2>&1 | tail -8 +done + +echo "" +echo "=== Results ===" +for f in $EXP3_OUT/*/results.json; do + if [ -f "$f" ]; then + $PYTHON -c " +import json +with open('$f') as fp: + r = json.load(fp) +mods = ','.join(r.get('input_modalities', [])) +m = r.get('test_metrics', {}) +print(f' ASFormer | {mods:<30} | R_F1={m.get(\"right_f1\",0):.4f} L_F1={m.get(\"left_f1\",0):.4f} Avg_F1={m.get(\"avg_f1\",0):.4f}') +" + fi +done diff --git a/experiments/slurm/run_exp1.sh b/experiments/slurm/run_exp1.sh new file mode 100644 index 0000000000000000000000000000000000000000..7ab6db60e12a8a369bcb6eb567f53828425a2d28 --- /dev/null +++ b/experiments/slurm/run_exp1.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH -J exp1_scene +#SBATCH -p gpuA800 +#SBATCH --gres=gpu:1 +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH -t 12:00:00 +#SBATCH -o ${PULSE_ROOT}/results/exp1/slurm_%j.out +#SBATCH -e ${PULSE_ROOT}/results/exp1/slurm_%j.err + +export PYTHONUNBUFFERED=1 + +echo "=== Job Info ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $SLURM_NODELIST" +echo "Start time: $(date)" +nvidia-smi --query-gpu=name,memory.total --format=csv,noheader +echo "================" + +PYTHON=python +SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py +OUTDIR=${PULSE_ROOT}/results/exp1 + +cd ${PULSE_ROOT} + +$PYTHON $SCRIPT --run_all \ + --epochs 100 \ + --batch_size 16 \ + --lr 1e-3 \ + --weight_decay 1e-4 \ + --hidden_dim 128 \ + --downsample 5 \ + --patience 15 \ + --seed 42 \ + --output_dir $OUTDIR + +echo "=== Done ===" +echo "End time: $(date)" diff --git a/experiments/slurm/run_exp1_fusion.sh b/experiments/slurm/run_exp1_fusion.sh new file mode 100644 index 0000000000000000000000000000000000000000..cbb7d9fb3f445f3f0587d64cbab5faa3afc272d8 --- /dev/null +++ b/experiments/slurm/run_exp1_fusion.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Submit all fusion experiments as individual 1-GPU SLURM jobs +# SLURM scheduler will automatically place them on any available GPU + +PYTHON=python +SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py +OUTDIR=${PULSE_ROOT}/results/exp1 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON_ARGS="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR" + +FUSIONS=(weighted_late gated_late stacking product moe late attention) +MODALITIES=("mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure") + +for fusion in "${FUSIONS[@]}"; do + for mods in "${MODALITIES[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + job_name="f_${fusion}_${mod_tag}" + sbatch \ + -J "$job_name" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 3:00:00 \ + -o "${LOGDIR}/${job_name}_%j.out" \ + -e "${LOGDIR}/${job_name}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion $fusion --modalities $mods $COMMON_ARGS" + echo "Submitted: $job_name" + done +done + +echo "All 14 fusion experiments submitted!" diff --git a/experiments/slurm/run_exp1_parallel.sh b/experiments/slurm/run_exp1_parallel.sh new file mode 100644 index 0000000000000000000000000000000000000000..042e24259d699fdea49b79b09e952dcca6a967e7 --- /dev/null +++ b/experiments/slurm/run_exp1_parallel.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Scene Recognition (Exp1) - Parallelized version +# Part 1: 9 modality combos × 3 backbones = 27 jobs (early fusion) +# Part 2: 7 fusion methods × transformer × (3-core + all-5) = 14 jobs +# Total: 41 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v2 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR" + +MODS=("mocap" "emg" "eyetrack" "imu" "pressure" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,pressure" "mocap,emg,eyetrack,imu,pressure") +MODELS=("cnn" "lstm" "transformer") + +# Part 1: Modality ablation × 3 backbones +echo "=== Part 1: Modality Ablation (27 jobs) ===" +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + for model in "${MODELS[@]}"; do + sbatch \ + -J "exp1_${model}_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mod_tag}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mod_tag}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " Submitted: $model / $mods / early" + done +done + +# Part 2: Fusion methods × transformer +FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe") +FUSION_MODS=("mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure") + +echo "" +echo "=== Part 2: Fusion Ablation (14 jobs) ===" +for fmods in "${FUSION_MODS[@]}"; do + fmod_tag=$(echo $fmods | tr ',' '-') + for fusion in "${FUSIONS[@]}"; do + sbatch \ + -J "exp1_tf_${fusion}_${fmod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/transformer_${fmod_tag}_${fusion}_%j.out" \ + -e "${LOGDIR}/transformer_${fmod_tag}_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities $fmods --fusion $fusion $COMMON" + echo " Submitted: transformer / $fmods / $fusion" + done +done + +echo "" +echo "Total: 41 jobs | Scene Recognition | Updated IMU data" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp1_small.sh b/experiments/slurm/run_exp1_small.sh new file mode 100644 index 0000000000000000000000000000000000000000..479114bdec10a96a3e71c10704ab3240cb6a8560 --- /dev/null +++ b/experiments/slurm/run_exp1_small.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Exp1 small model: hidden_dim=32, dropout=0.5, weight_decay=1e-3 +# 3 modalities: mocap, emg, imu (exclude pressure & eyetrack) +# Output: results/exp1_small + +PYTHON=python +SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py +OUTDIR=${PULSE_ROOT}/results/exp1_small +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-3 --hidden_dim 32 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR" + +# ============================================================ +# Part 1: Single modality (early fusion = single backbone) +# ============================================================ +for mod in mocap emg imu; do + job_name="s_${mod}" + sbatch \ + -J "$job_name" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/${job_name}_%j.out" \ + -e "${LOGDIR}/${job_name}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion early --modalities $mod $COMMON" + echo "Submitted: $job_name" +done + +# ============================================================ +# Part 2: Multi-modality early fusion (4 combos) +# ============================================================ +EARLY_COMBOS=("mocap,emg" "mocap,imu" "emg,imu" "mocap,emg,imu") +for mods in "${EARLY_COMBOS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + job_name="e_${mod_tag}" + sbatch \ + -J "$job_name" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/${job_name}_%j.out" \ + -e "${LOGDIR}/${job_name}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion early --modalities $mods $COMMON" + echo "Submitted: $job_name" +done + +# ============================================================ +# Part 3: Fusion methods x modality sets +# ============================================================ +FUSIONS=(late attention weighted_late gated_late stacking product moe) +FUSION_MODS=("mocap,emg,imu" "mocap,imu") + +for fusion in "${FUSIONS[@]}"; do + for mods in "${FUSION_MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + job_name="f_${fusion}_${mod_tag}" + sbatch \ + -J "$job_name" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/${job_name}_%j.out" \ + -e "${LOGDIR}/${job_name}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion $fusion --modalities $mods $COMMON" + echo "Submitted: $job_name" + done +done + +echo "" +echo "Total: 3 single + 4 early + 14 fusion = 21 jobs submitted!" +echo "Results will be saved to: $OUTDIR" diff --git a/experiments/slurm/run_exp1_small2.sh b/experiments/slurm/run_exp1_small2.sh new file mode 100644 index 0000000000000000000000000000000000000000..f550102ff2dd20156d4f6b9a4f145146eedf1363 --- /dev/null +++ b/experiments/slurm/run_exp1_small2.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Exp1 small2: per-modality hidden_dim + missing emg+imu fusion experiments +# hidden_dim=32 base, scaled per modality: mocap(211)->48, imu(161)->48, emg(9)->16 +# Output: results/exp1_small2 + +PYTHON=python +SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py +OUTDIR=${PULSE_ROOT}/results/exp1_small2 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-3 --hidden_dim 32 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR" + +# ============================================================ +# Part 1: Single modality baselines (3 jobs) +# ============================================================ +for mod in mocap emg imu; do + job_name="s2_${mod}" + sbatch \ + -J "$job_name" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/${job_name}_%j.out" \ + -e "${LOGDIR}/${job_name}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion early --modalities $mod $COMMON" + echo "Submitted: $job_name" +done + +# ============================================================ +# Part 2: Early fusion baselines (3 combos) +# ============================================================ +EARLY_COMBOS=("emg,imu" "mocap,imu" "mocap,emg,imu") +for mods in "${EARLY_COMBOS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + job_name="s2_e_${mod_tag}" + sbatch \ + -J "$job_name" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/${job_name}_%j.out" \ + -e "${LOGDIR}/${job_name}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion early --modalities $mods $COMMON" + echo "Submitted: $job_name" +done + +# ============================================================ +# Part 3: Fusion methods x modality combos (7 methods x 3 combos = 21 jobs) +# Key addition: emg,imu fusion (was missing in round 1) +# ============================================================ +FUSIONS=(late attention weighted_late gated_late stacking product moe) +FUSION_MODS=("emg,imu" "mocap,imu" "mocap,emg,imu") + +for fusion in "${FUSIONS[@]}"; do + for mods in "${FUSION_MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + job_name="s2_${fusion}_${mod_tag}" + sbatch \ + -J "$job_name" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/${job_name}_%j.out" \ + -e "${LOGDIR}/${job_name}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion $fusion --modalities $mods $COMMON" + echo "Submitted: $job_name" + done +done + +echo "" +echo "Total: 3 single + 3 early + 21 fusion = 27 jobs submitted!" +echo "Results will be saved to: $OUTDIR" diff --git a/experiments/slurm/run_exp1_small3.sh b/experiments/slurm/run_exp1_small3.sh new file mode 100644 index 0000000000000000000000000000000000000000..88680fc0bfc7f299da9fa15ff0957ae4aeaab135 --- /dev/null +++ b/experiments/slurm/run_exp1_small3.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# Exp1 small3: Data augmentation + Frozen pretrained IMU + Label smoothing +# Goal: Break the IMU-alone F1=0.771 ceiling with emg+imu fusion +# Phase 0: pretrain IMU with hidden_dim=48 (matches fusion branch) +# Baselines: IMU+aug+ls, emg+imu early+aug+ls +# Group A: 7 fusion + aug + ls (no freeze) +# Group B: 7 fusion + frozen IMU + ls (no aug) [dep: phase0] +# Group C: 7 fusion + frozen IMU + aug + ls [dep: phase0] +# Total: 1 + 2 + 7 + 7 + 7 = 24 jobs + +PYTHON=python +SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py +OUTDIR=${PULSE_ROOT}/results/exp1_small3 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-3 --hidden_dim 32 --downsample 5 --patience 15 --seed 42" +FUSIONS=(late attention weighted_late gated_late stacking product moe) + +# ============================================================ +# Phase 0: Pretrain IMU with hidden_dim=48 (matches fusion branch) +# ============================================================ +PHASE0_JOB=$(sbatch --parsable \ + -J "s3_phase0_imu48" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/phase0_imu48_%j.out" \ + -e "${LOGDIR}/phase0_imu48_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --model transformer --fusion early --modalities imu --hidden_dim 48 --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-3 --downsample 5 --patience 15 --seed 42 --output_dir ${OUTDIR}/phase0") +echo "Phase 0 (IMU h48): job $PHASE0_JOB" + +PRETRAINED="${OUTDIR}/phase0/transformer_imu_early/model_best.pt" + +# ============================================================ +# Baselines (no dependency) +# ============================================================ + +# Baseline 1: IMU alone + augment + label_smoothing +sbatch \ + -J "s3_bl_imu_aug" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/bl_imu_aug_%j.out" \ + -e "${LOGDIR}/bl_imu_aug_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion early --modalities imu $COMMON --augment --label_smoothing 0.1 --tag bl_aug --output_dir $OUTDIR" +echo "Submitted: baseline IMU+aug+ls" + +# Baseline 2: emg,imu early + augment + label_smoothing +sbatch \ + -J "s3_bl_ei_aug" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/bl_ei_aug_%j.out" \ + -e "${LOGDIR}/bl_ei_aug_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion early --modalities emg,imu $COMMON --augment --label_smoothing 0.1 --tag bl_aug --output_dir $OUTDIR" +echo "Submitted: baseline emg+imu early+aug+ls" + +# ============================================================ +# Group A: emg+imu x 7 fusion + augment + label_smoothing (no freeze) +# ============================================================ +for fusion in "${FUSIONS[@]}"; do + sbatch \ + -J "s3_A_${fusion}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/grpA_${fusion}_%j.out" \ + -e "${LOGDIR}/grpA_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion $fusion --modalities emg,imu $COMMON --augment --label_smoothing 0.1 --tag grpA --output_dir $OUTDIR" + echo "Submitted: Group A $fusion" +done + +# ============================================================ +# Group B: emg+imu x 7 fusion + frozen IMU + label_smoothing (no augment) +# Depends on Phase 0 +# ============================================================ +for fusion in "${FUSIONS[@]}"; do + sbatch \ + --dependency=afterok:${PHASE0_JOB} \ + -J "s3_B_${fusion}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/grpB_${fusion}_%j.out" \ + -e "${LOGDIR}/grpB_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion $fusion --modalities emg,imu $COMMON --label_smoothing 0.1 --pretrained_backbone $PRETRAINED --freeze_backbone_idx 1 --tag grpB --output_dir $OUTDIR" + echo "Submitted: Group B $fusion (dep: $PHASE0_JOB)" +done + +# ============================================================ +# Group C: emg+imu x 7 fusion + frozen IMU + augment + label_smoothing +# Depends on Phase 0 +# ============================================================ +for fusion in "${FUSIONS[@]}"; do + sbatch \ + --dependency=afterok:${PHASE0_JOB} \ + -J "s3_C_${fusion}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 1:00:00 \ + -o "${LOGDIR}/grpC_${fusion}_%j.out" \ + -e "${LOGDIR}/grpC_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion $fusion --modalities emg,imu $COMMON --augment --label_smoothing 0.1 --pretrained_backbone $PRETRAINED --freeze_backbone_idx 1 --tag grpC --output_dir $OUTDIR" + echo "Submitted: Group C $fusion (dep: $PHASE0_JOB)" +done + +echo "" +echo "Total: 1 phase0 + 2 baselines + 7 grpA + 7 grpB + 7 grpC = 24 jobs" +echo "Results: $OUTDIR" +echo "Phase 0 job ID: $PHASE0_JOB (Groups B & C depend on it)" diff --git a/experiments/slurm/run_exp1_v3.sh b/experiments/slurm/run_exp1_v3.sh new file mode 100644 index 0000000000000000000000000000000000000000..10c0c7df85bf1c731a6eaf69677590eac3564a4f --- /dev/null +++ b/experiments/slurm/run_exp1_v3.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Scene Recognition (Exp1 v3) - Train 14 vols / Test 4 vols (no val) +# v23,v24 moved from val to train; v3 stays in test +# Part 1: 9 modality combos × 3 backbones = 27 jobs (early fusion) +# Part 2: 7 fusion methods × transformer × (3-core + all-5) = 14 jobs +# Total: 41 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v3 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR" + +MODS=("mocap" "emg" "eyetrack" "imu" "pressure" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,pressure" "mocap,emg,eyetrack,imu,pressure") +MODELS=("cnn" "lstm" "transformer") + +# Part 1: Modality ablation × 3 backbones +echo "=== Part 1: Modality Ablation (27 jobs) ===" +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + for model in "${MODELS[@]}"; do + sbatch \ + -J "e1v3_${model}_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mod_tag}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mod_tag}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 2: Fusion methods × transformer +FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe") +FUSION_MODS=("mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure") + +echo "" +echo "=== Part 2: Fusion Ablation (14 jobs) ===" +for fmods in "${FUSION_MODS[@]}"; do + fmod_tag=$(echo $fmods | tr ',' '-') + for fusion in "${FUSIONS[@]}"; do + sbatch \ + -J "e1v3_tf_${fusion}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/transformer_${fmod_tag}_${fusion}_%j.out" \ + -e "${LOGDIR}/transformer_${fmod_tag}_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities $fmods --fusion $fusion $COMMON" + echo " transformer / $fmods / $fusion" + done +done + +echo "" +echo "Total: 41 jobs | Scene Recognition v3 | Train=14vols, Test=4vols" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp1_v4.sh b/experiments/slurm/run_exp1_v4.sh new file mode 100644 index 0000000000000000000000000000000000000000..94d512248552f9a8b86d3c58775213b0319576c9 --- /dev/null +++ b/experiments/slurm/run_exp1_v4.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Scene Recognition (Exp1 v4) - Per-modality projection to 50 dims +# All modalities projected to 50d via FC before backbone processing +# Train 14 vols / Test 4 vols (no val) +# Part 1: 9 modality combos × 3 backbones = 27 jobs (early fusion) +# Part 2: 7 fusion methods × transformer × (3-core + all-5) = 14 jobs +# Total: 41 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v4 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR" + +MODS=("mocap" "emg" "eyetrack" "imu" "pressure" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,pressure" "mocap,emg,eyetrack,imu,pressure") +MODELS=("cnn" "lstm" "transformer") + +# Part 1: Modality ablation × 3 backbones +echo "=== Part 1: Modality Ablation (27 jobs) ===" +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + for model in "${MODELS[@]}"; do + sbatch \ + -J "e1v4_${model}_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mod_tag}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mod_tag}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 2: Fusion methods × transformer +FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe") +FUSION_MODS=("mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure") + +echo "" +echo "=== Part 2: Fusion Ablation (14 jobs) ===" +for fmods in "${FUSION_MODS[@]}"; do + fmod_tag=$(echo $fmods | tr ',' '-') + for fusion in "${FUSIONS[@]}"; do + sbatch \ + -J "e1v4_tf_${fusion}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/transformer_${fmod_tag}_${fusion}_%j.out" \ + -e "${LOGDIR}/transformer_${fmod_tag}_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities $fmods --fusion $fusion $COMMON" + echo " transformer / $fmods / $fusion" + done +done + +echo "" +echo "Total: 41 jobs | Scene Recognition v4 | Proj50d | Train=14vols, Test=4vols" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp1_v5.sh b/experiments/slurm/run_exp1_v5.sh new file mode 100644 index 0000000000000000000000000000000000000000..f4d0a09b32c38c5489287e8cd8c036f3ff6b3b61 --- /dev/null +++ b/experiments/slurm/run_exp1_v5.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Scene Recognition (Exp1 v5) - Only imu, mocap, emg +# Per-modality projection to 50d +# Train 14 vols / Test 4 vols + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v5 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR" +MODELS=("cnn" "lstm" "transformer") + +# Part 1: Single modality (3 mods × 3 backbones = 9 jobs) +echo "=== Part 1: Single Modality (9 jobs) ===" +for mods in "imu" "mocap" "emg"; do + for model in "${MODELS[@]}"; do + sbatch -J "e1v5_${model}_${mods}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mods}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mods}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 2: Multi-modality early fusion (4 combos × 3 backbones = 12 jobs) +echo "" +echo "=== Part 2: Multi-Modality Early Fusion (12 jobs) ===" +for mods in "imu,mocap" "imu,emg" "mocap,emg" "imu,mocap,emg"; do + mod_tag=$(echo $mods | tr ',' '-') + for model in "${MODELS[@]}"; do + sbatch -J "e1v5_${model}_${mod_tag}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mod_tag}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mod_tag}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 3: Fusion ablation with imu+mocap+emg × transformer (7 jobs) +FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe") +echo "" +echo "=== Part 3: Fusion Ablation - transformer × imu+mocap+emg (7 jobs) ===" +for fusion in "${FUSIONS[@]}"; do + sbatch -J "e1v5_tf_${fusion}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/transformer_imu-mocap-emg_${fusion}_%j.out" \ + -e "${LOGDIR}/transformer_imu-mocap-emg_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities imu,mocap,emg --fusion $fusion $COMMON" + echo " transformer / imu,mocap,emg / $fusion" +done + +echo "" +echo "Total: 28 jobs | 3 modalities: imu(160d→50d), mocap(156d→50d), emg(8d→50d)" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp1_v6.sh b/experiments/slurm/run_exp1_v6.sh new file mode 100644 index 0000000000000000000000000000000000000000..2e69508cd41c0d8e3240dbdeb26df490aa27ba33 --- /dev/null +++ b/experiments/slurm/run_exp1_v6.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Scene Recognition (Exp1 v6) - Fixed mocap: skeleton TSV (422d) instead of marker CSV (156d) +# Per-modality projection to 50d, only imu/mocap/emg + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v6 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR" +MODELS=("cnn" "lstm" "transformer") + +# Part 1: Single modality (3 mods × 3 backbones = 9 jobs) +echo "=== Part 1: Single Modality (9 jobs) ===" +for mods in "imu" "mocap" "emg"; do + for model in "${MODELS[@]}"; do + sbatch -J "e1v6_${model}_${mods}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mods}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mods}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 2: Multi-modality early fusion (4 combos × 3 backbones = 12 jobs) +echo "" +echo "=== Part 2: Multi-Modality Early Fusion (12 jobs) ===" +for mods in "imu,mocap" "imu,emg" "mocap,emg" "imu,mocap,emg"; do + mod_tag=$(echo $mods | tr ',' '-') + for model in "${MODELS[@]}"; do + sbatch -J "e1v6_${model}_${mod_tag}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mod_tag}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mod_tag}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 3: Fusion ablation with imu+mocap+emg × transformer (7 jobs) +FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe") +echo "" +echo "=== Part 3: Fusion Ablation - transformer × imu+mocap+emg (7 jobs) ===" +for fusion in "${FUSIONS[@]}"; do + sbatch -J "e1v6_tf_${fusion}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/transformer_imu-mocap-emg_${fusion}_%j.out" \ + -e "${LOGDIR}/transformer_imu-mocap-emg_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities imu,mocap,emg --fusion $fusion $COMMON" + echo " transformer / imu,mocap,emg / $fusion" +done + +echo "" +echo "Total: 28 jobs | mocap=422d(skeleton TSV), imu=160d, emg=8d → all proj 50d" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp1_v7.sh b/experiments/slurm/run_exp1_v7.sh new file mode 100644 index 0000000000000000000000000000000000000000..bb90796733aa8a33de133e82f8063d1b8c71443e --- /dev/null +++ b/experiments/slurm/run_exp1_v7.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Scene Recognition (Exp1 v7) - NO projection, corrected mocap (skeleton TSV 422d) +# Compare with v6 (proj_dim=50) to isolate projection effect + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v7 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --proj_dim 0 --output_dir $OUTDIR" +MODELS=("cnn" "lstm" "transformer") + +# Part 1: Single modality (3 × 3 = 9 jobs) +echo "=== Part 1: Single Modality (9 jobs) ===" +for mods in "imu" "mocap" "emg"; do + for model in "${MODELS[@]}"; do + sbatch -J "e1v7_${model}_${mods}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mods}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mods}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 2: Multi-modality early fusion (4 × 3 = 12 jobs) +echo "" +echo "=== Part 2: Multi-Modality Early Fusion (12 jobs) ===" +for mods in "imu,mocap" "imu,emg" "mocap,emg" "imu,mocap,emg"; do + mod_tag=$(echo $mods | tr ',' '-') + for model in "${MODELS[@]}"; do + sbatch -J "e1v7_${model}_${mod_tag}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mod_tag}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mod_tag}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 3: Fusion ablation × transformer × 3-modality (7 jobs) +FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe") +echo "" +echo "=== Part 3: Fusion Ablation - transformer × imu+mocap+emg (7 jobs) ===" +for fusion in "${FUSIONS[@]}"; do + sbatch -J "e1v7_tf_${fusion}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/transformer_imu-mocap-emg_${fusion}_%j.out" \ + -e "${LOGDIR}/transformer_imu-mocap-emg_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities imu,mocap,emg --fusion $fusion $COMMON" + echo " transformer / imu,mocap,emg / $fusion" +done + +echo "" +echo "Total: 28 jobs | NO projection | mocap=422d(skeleton), imu=160d, emg=8d" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp1_v8.sh b/experiments/slurm/run_exp1_v8.sh new file mode 100644 index 0000000000000000000000000000000000000000..7985d6eb2680a598829046c2fa37dae9c35405c9 --- /dev/null +++ b/experiments/slurm/run_exp1_v8.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Scene Recognition (Exp1 v8) - Mocap with hip-relative + velocity (620d) +# No projection, compare with v7 (raw mocap 422d) + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v8 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --proj_dim 0 --output_dir $OUTDIR" +MODELS=("cnn" "lstm" "transformer") + +# Part 1: Single modality (3 × 3 = 9 jobs, but only mocap changed; imu/emg same as v7) +# Only run mocap single + all combos involving mocap + fusion +echo "=== Part 1: Mocap single modality (3 jobs) ===" +for model in "${MODELS[@]}"; do + sbatch -J "e1v8_${model}_mocap" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/${model}_mocap_early_%j.out" \ + -e "${LOGDIR}/${model}_mocap_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities mocap --fusion early $COMMON" + echo " $model / mocap / early" +done + +# Part 2: All combos involving mocap (6 combos × relevant backbones) +echo "" +echo "=== Part 2: Multi-modal with mocap (12 jobs) ===" +for mods in "imu,mocap" "mocap,emg" "imu,mocap,emg"; do + mod_tag=$(echo $mods | tr ',' '-') + for model in "${MODELS[@]}"; do + sbatch -J "e1v8_${model}_${mod_tag}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/${model}_${mod_tag}_early_%j.out" \ + -e "${LOGDIR}/${model}_${mod_tag}_early_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion early $COMMON" + echo " $model / $mods / early" + done +done + +# Part 3: Fusion ablation × transformer × 3-modality (7 jobs) +FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe") +echo "" +echo "=== Part 3: Fusion Ablation - transformer × imu+mocap+emg (7 jobs) ===" +for fusion in "${FUSIONS[@]}"; do + sbatch -J "e1v8_tf_${fusion}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/transformer_imu-mocap-emg_${fusion}_%j.out" \ + -e "${LOGDIR}/transformer_imu-mocap-emg_${fusion}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities imu,mocap,emg --fusion $fusion $COMMON" + echo " transformer / imu,mocap,emg / $fusion" +done + +echo "" +echo "Total: 22 jobs | mocap=620d (hip-relative+velocity) | No projection" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp1_v8_multiseed.sh b/experiments/slurm/run_exp1_v8_multiseed.sh new file mode 100644 index 0000000000000000000000000000000000000000..a8b439ca926f451db73711172baeb85ea468dfdf --- /dev/null +++ b/experiments/slurm/run_exp1_v8_multiseed.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Exp1 v8 Multi-seed: Top configs × 5 seeds to measure variance +# Configs: (1) transformer+imu early, (2) transformer+3mod late, (3) transformer+3mod stacking +# Seeds: 42, 123, 456, 789, 2024 + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v8_multiseed +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --proj_dim 0 --output_dir $OUTDIR" +SEEDS=(42 123 456 789 2024) + +# Config 1: Transformer + imu (single, early) +echo "=== Transformer + imu (5 seeds) ===" +for seed in "${SEEDS[@]}"; do + sbatch -J "ms_tf_imu_s${seed}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=32G -t 2:00:00 \ + -o "${LOGDIR}/tf_imu_early_s${seed}_%j.out" \ + -e "${LOGDIR}/tf_imu_early_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities imu --fusion early --seed $seed --tag s${seed} $COMMON" + echo " seed=$seed" +done + +# Config 2: Transformer + imu,mocap,emg late fusion +echo "" +echo "=== Transformer + 3mod late (5 seeds) ===" +for seed in "${SEEDS[@]}"; do + sbatch -J "ms_tf_3m_late_s${seed}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/tf_3mod_late_s${seed}_%j.out" \ + -e "${LOGDIR}/tf_3mod_late_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities imu,mocap,emg --fusion late --seed $seed --tag s${seed} $COMMON" + echo " seed=$seed" +done + +# Config 3: Transformer + imu,mocap,emg stacking fusion +echo "" +echo "=== Transformer + 3mod stacking (5 seeds) ===" +for seed in "${SEEDS[@]}"; do + sbatch -J "ms_tf_3m_stack_s${seed}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/tf_3mod_stacking_s${seed}_%j.out" \ + -e "${LOGDIR}/tf_3mod_stacking_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model transformer --modalities imu,mocap,emg --fusion stacking --seed $seed --tag s${seed} $COMMON" + echo " seed=$seed" +done + +echo "" +echo "Total: 15 jobs | 3 configs × 5 seeds" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp1_v9.sh b/experiments/slurm/run_exp1_v9.sh new file mode 100644 index 0000000000000000000000000000000000000000..34aa41250a4ee7157dabaebfb5f9df67a14973fd --- /dev/null +++ b/experiments/slurm/run_exp1_v9.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# Scene Recognition (Exp1 v9) - Improvements over v8 +# Changes: (A) augmentation, (B) feat_concat fusion, (C) pretrained branches +# All use transformer, imu+mocap+emg, no projection, 5 seeds + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/exp1_v9 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +BASE="--model transformer --modalities imu,mocap,emg --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --proj_dim 0 --output_dir $OUTDIR" +SEEDS=(42 123 456 789 2024) + +# Pretrained single-modality models (modality order: imu=0, mocap=1, emg=2) +PT_IMU=${PULSE_ROOT}/results/exp1_v7/transformer_imu_early/model_best.pt +PT_MOCAP=${PULSE_ROOT}/results/exp1_v8/transformer_mocap_early/model_best.pt +PT_EMG=${PULSE_ROOT}/results/exp1_v7/transformer_emg_early/model_best.pt + +# Group A: late fusion + augmentation (5 seeds) +echo "=== A: late + augment (5 seeds) ===" +for seed in "${SEEDS[@]}"; do + sbatch -J "v9_late_aug_s${seed}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/late_aug_s${seed}_%j.out" \ + -e "${LOGDIR}/late_aug_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --fusion late --augment --seed $seed --tag aug_s${seed} $BASE" + echo " late+aug seed=$seed" +done + +# Group B: feat_concat fusion (5 seeds) +echo "" +echo "=== B: feat_concat (5 seeds) ===" +for seed in "${SEEDS[@]}"; do + sbatch -J "v9_fc_s${seed}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/feat_concat_s${seed}_%j.out" \ + -e "${LOGDIR}/feat_concat_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --fusion feat_concat --seed $seed --tag s${seed} $BASE" + echo " feat_concat seed=$seed" +done + +# Group C: feat_concat + augmentation (5 seeds) +echo "" +echo "=== C: feat_concat + augment (5 seeds) ===" +for seed in "${SEEDS[@]}"; do + sbatch -J "v9_fc_aug_s${seed}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/feat_concat_aug_s${seed}_%j.out" \ + -e "${LOGDIR}/feat_concat_aug_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --fusion feat_concat --augment --seed $seed --tag aug_s${seed} $BASE" + echo " feat_concat+aug seed=$seed" +done + +# Group D: late + pretrained IMU branch (freeze_idx=0) (5 seeds) +echo "" +echo "=== D: late + pretrained IMU (5 seeds) ===" +for seed in "${SEEDS[@]}"; do + sbatch -J "v9_late_pt_s${seed}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/late_pretrained_s${seed}_%j.out" \ + -e "${LOGDIR}/late_pretrained_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --fusion late --pretrained_backbone $PT_IMU --freeze_backbone_idx 0 --seed $seed --tag pt_s${seed} $BASE" + echo " late+pretrained seed=$seed" +done + +# Group E: late + augment + pretrained IMU (5 seeds) +echo "" +echo "=== E: late + augment + pretrained IMU (5 seeds) ===" +for seed in "${SEEDS[@]}"; do + sbatch -J "v9_late_aug_pt_s${seed}" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem=64G -t 2:00:00 \ + -o "${LOGDIR}/late_aug_pt_s${seed}_%j.out" \ + -e "${LOGDIR}/late_aug_pt_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --fusion late --augment --pretrained_backbone $PT_IMU --freeze_backbone_idx 0 --seed $seed --tag aug_pt_s${seed} $BASE" + echo " late+aug+pretrained seed=$seed" +done + +echo "" +echo "Total: 25 jobs | 5 groups × 5 seeds" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp2.sh b/experiments/slurm/run_exp2.sh new file mode 100644 index 0000000000000000000000000000000000000000..bc8a06af56f69bb3c5f12a3ace5f469c1c3e0801 --- /dev/null +++ b/experiments/slurm/run_exp2.sh @@ -0,0 +1,35 @@ +#!/bin/bash +#SBATCH -J exp2_seg +#SBATCH -p gpuA800 +#SBATCH --gres=gpu:1 +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH -t 12:00:00 +#SBATCH -o ${PULSE_ROOT}/results/exp2/slurm_%j.out +#SBATCH -e ${PULSE_ROOT}/results/exp2/slurm_%j.err + +export PYTHONUNBUFFERED=1 + +echo "=== Job Info ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $SLURM_NODELIST" +echo "Start: $(date)" +nvidia-smi --query-gpu=name,memory.total --format=csv,noheader +echo "================" + +PYTHON=python +cd ${PULSE_ROOT} + +$PYTHON experiments/train_exp2.py --run_all \ + --epochs 80 \ + --batch_size 16 \ + --lr 5e-4 \ + --hidden_dim 64 \ + --downsample 2 \ + --patience 15 \ + --seed 42 \ + --output_dir ${PULSE_ROOT}/results/exp2 + +echo "=== Done: $(date) ===" diff --git a/experiments/slurm/run_exp2_combos.sh b/experiments/slurm/run_exp2_combos.sh new file mode 100644 index 0000000000000000000000000000000000000000..47ed24fa76d48213a2e6b988761887f797e290a4 --- /dev/null +++ b/experiments/slurm/run_exp2_combos.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Exp2 Action Segmentation: run all modality combos from Exp1 +# Already done: mocap, emg, mocap+emg+eyetrack, mocap+emg+eyetrack+imu, all 5 +# Missing: imu, pressure, eyetrack, emg+imu, mocap+imu, mocap+emg+imu, +# mocap+emg+eyetrack+pressure, mocap+emg +# = 8 combos x 3 models = 24 jobs + +PYTHON=python +SCRIPT=${PULSE_ROOT}/experiments/train_exp2.py +OUTDIR=${PULSE_ROOT}/results/exp2 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 80 --batch_size 16 --lr 5e-4 --weight_decay 1e-4 --hidden_dim 64 --downsample 2 --patience 15 --seed 42 --output_dir $OUTDIR" +MODELS=(tcn mstcn lstm) +MISSING_MODS=("imu" "pressure" "eyetrack" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack,pressure" "mocap,emg") + +COUNT=0 +for mods in "${MISSING_MODS[@]}"; do + for model in "${MODELS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + job_name="e2_${model}_${mod_tag}" + sbatch \ + -J "$job_name" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${job_name}_%j.out" \ + -e "${LOGDIR}/${job_name}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --model $model --modalities $mods $COMMON" + echo "Submitted: $job_name" + COUNT=$((COUNT + 1)) + done +done + +echo "" +echo "Total: $COUNT jobs submitted" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_exp2_fix.sh b/experiments/slurm/run_exp2_fix.sh new file mode 100644 index 0000000000000000000000000000000000000000..75658eb247b889e57d19a7ba9aa783f197c3cbe0 --- /dev/null +++ b/experiments/slurm/run_exp2_fix.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH -J exp2_fix +#SBATCH -p gpuA800 +#SBATCH --gres=gpu:1 +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH -t 4:00:00 +#SBATCH -o ${PULSE_ROOT}/results/exp2/slurm_fix_%j.out +#SBATCH -e ${PULSE_ROOT}/results/exp2/slurm_fix_%j.err + +export PYTHONUNBUFFERED=1 + +echo "=== Job Info ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $SLURM_NODELIST" +echo "Start: $(date)" +nvidia-smi --query-gpu=name,memory.total --format=csv,noheader +echo "================" + +PYTHON=python +cd ${PULSE_ROOT} + +# Run the 3 missing experiments: 3-core combo (mocap,emg,eyetrack) × 3 models +for MODEL in tcn mstcn lstm; do + $PYTHON experiments/train_exp2.py \ + --model $MODEL \ + --modalities mocap,emg,eyetrack \ + --epochs 80 \ + --batch_size 16 \ + --lr 5e-4 \ + --hidden_dim 64 \ + --downsample 2 \ + --patience 15 \ + --seed 42 \ + --output_dir ${PULSE_ROOT}/results/exp2 +done + +echo "=== Done: $(date) ===" diff --git a/experiments/slurm/run_exp3.sh b/experiments/slurm/run_exp3.sh new file mode 100644 index 0000000000000000000000000000000000000000..c8267c7bc7ab8eeaed7a50880105dfac08d4d274 --- /dev/null +++ b/experiments/slurm/run_exp3.sh @@ -0,0 +1,35 @@ +#!/bin/bash +#SBATCH -J exp3_contact +#SBATCH -p gpuA800 +#SBATCH --gres=gpu:1 +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH -t 12:00:00 +#SBATCH -o ${PULSE_ROOT}/results/exp3/slurm_%j.out +#SBATCH -e ${PULSE_ROOT}/results/exp3/slurm_%j.err + +export PYTHONUNBUFFERED=1 + +echo "=== Job Info ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $SLURM_NODELIST" +echo "Start: $(date)" +nvidia-smi --query-gpu=name,memory.total --format=csv,noheader +echo "================" + +PYTHON=python +cd ${PULSE_ROOT} + +$PYTHON experiments/train_exp3.py --run_all \ + --epochs 50 \ + --batch_size 32 \ + --lr 1e-3 \ + --hidden_dim 64 \ + --downsample 2 \ + --patience 10 \ + --seed 42 \ + --output_dir ${PULSE_ROOT}/results/exp3 + +echo "=== Done: $(date) ===" diff --git a/experiments/slurm/run_exp4.sh b/experiments/slurm/run_exp4.sh new file mode 100644 index 0000000000000000000000000000000000000000..ee967a0f3cb1b21fd70f4cb537d46af385fedade --- /dev/null +++ b/experiments/slurm/run_exp4.sh @@ -0,0 +1,35 @@ +#!/bin/bash +#SBATCH -J exp4_cross +#SBATCH -p gpuA800 +#SBATCH --gres=gpu:1 +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH -t 12:00:00 +#SBATCH -o ${PULSE_ROOT}/results/exp4/slurm_%j.out +#SBATCH -e ${PULSE_ROOT}/results/exp4/slurm_%j.err + +export PYTHONUNBUFFERED=1 + +echo "=== Job Info ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $SLURM_NODELIST" +echo "Start: $(date)" +nvidia-smi --query-gpu=name,memory.total --format=csv,noheader +echo "================" + +PYTHON=python +cd ${PULSE_ROOT} + +$PYTHON experiments/train_exp4.py --run_all \ + --epochs 50 \ + --batch_size 32 \ + --lr 5e-4 \ + --hidden_dim 128 \ + --downsample 2 \ + --patience 10 \ + --seed 42 \ + --output_dir ${PULSE_ROOT}/results/exp4 + +echo "=== Done: $(date) ===" diff --git a/experiments/slurm/run_modality_ablation.sh b/experiments/slurm/run_modality_ablation.sh new file mode 100644 index 0000000000000000000000000000000000000000..a77dcd78ec9d7128bff0bfe0c3927c16948d4375 --- /dev/null +++ b/experiments/slurm/run_modality_ablation.sh @@ -0,0 +1,154 @@ +#!/bin/bash +#SBATCH --job-name=mod_ablation +#SBATCH --partition=gpuA800 +#SBATCH --gres=gpu:2 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=64G +#SBATCH --time=4:00:00 +#SBATCH --output=${PULSE_ROOT}/results/modality_ablation_%j.log + +# Modality Ablation Matrix for Scene Recognition (Exp1) +# 7 configs: 3 single + 3 two-modal + 1 three-modal (already done) +# All use Transformer backbone, hidden_dim=128, 5 seeds +# Single modality: early fusion +# Multi modality: late fusion + pretrained strongest branch + +set -e +export PYTHONUNBUFFERED=1 + +PYTHON=python +BASEDIR=${PULSE_ROOT} +SCRIPT=${BASEDIR}/experiments/train_exp1.py +OUTDIR=${BASEDIR}/results/modality_ablation +mkdir -p $OUTDIR + +COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --proj_dim 0 --output_dir $OUTDIR" +SEEDS=(42 123 456 789 2024) + +# Pretrained single-modality backbones (seed=42, from v7/v8) +PT_IMU=${BASEDIR}/results/exp1_v7/transformer_imu_early/model_best.pt +PT_MOCAP=${BASEDIR}/results/exp1_v8/transformer_mocap_early/model_best.pt +PT_EMG=${BASEDIR}/results/exp1_v7/transformer_emg_early/model_best.pt + +echo "=== Modality Ablation Matrix ===" +echo "Output: $OUTDIR" + +# ============================================================ +# GPU 0: Single modality (mocap, emg) + two-modal (mocap+emg) +# ============================================================ +( +export CUDA_VISIBLE_DEVICES=0 + +# --- Phase 0: Single modality × 5 seeds --- +echo "--- GPU0: Single modality mocap ---" +for seed in "${SEEDS[@]}"; do + echo " mocap seed=$seed" + $PYTHON $SCRIPT --modalities mocap --fusion early --seed $seed \ + --tag ablation_s${seed} $COMMON 2>&1 | tail -5 +done + +echo "--- GPU0: Single modality emg ---" +for seed in "${SEEDS[@]}"; do + echo " emg seed=$seed" + $PYTHON $SCRIPT --modalities emg --fusion early --seed $seed \ + --tag ablation_s${seed} $COMMON 2>&1 | tail -5 +done + +# --- Phase 1: Two-modal mocap+emg / late+pretrained(emg) --- +# modalities=mocap,emg → idx0=mocap, idx1=emg → pretrain emg (idx=1) +echo "--- GPU0: mocap+emg late+pretrained ---" +for seed in "${SEEDS[@]}"; do + echo " mocap+emg seed=$seed" + $PYTHON $SCRIPT --modalities mocap,emg --fusion late --seed $seed \ + --pretrained_backbone $PT_EMG --freeze_backbone_idx 1 \ + --tag ablation_pt_s${seed} $COMMON 2>&1 | tail -5 +done + +echo "--- GPU0 Done ---" +) & +PID0=$! + +# ============================================================ +# GPU 1: Two-modal (mocap+imu, emg+imu) +# ============================================================ +( +export CUDA_VISIBLE_DEVICES=1 + +# --- mocap+imu / late+pretrained(imu) --- +# modalities=mocap,imu → idx0=mocap, idx1=imu → pretrain imu (idx=1) +echo "--- GPU1: mocap+imu late+pretrained ---" +for seed in "${SEEDS[@]}"; do + echo " mocap+imu seed=$seed" + $PYTHON $SCRIPT --modalities mocap,imu --fusion late --seed $seed \ + --pretrained_backbone $PT_IMU --freeze_backbone_idx 1 \ + --tag ablation_pt_s${seed} $COMMON 2>&1 | tail -5 +done + +# --- emg+imu / late+pretrained(imu) --- +# modalities=emg,imu → idx0=emg, idx1=imu → pretrain imu (idx=1) +echo "--- GPU1: emg+imu late+pretrained ---" +for seed in "${SEEDS[@]}"; do + echo " emg+imu seed=$seed" + $PYTHON $SCRIPT --modalities emg,imu --fusion late --seed $seed \ + --pretrained_backbone $PT_IMU --freeze_backbone_idx 1 \ + --tag ablation_pt_s${seed} $COMMON 2>&1 | tail -5 +done + +echo "--- GPU1 Done ---" +) & +PID1=$! + +wait $PID0 $PID1 + +# ============================================================ +# Collect results +# ============================================================ +echo "" +echo "=== Results Summary ===" +$PYTHON -c " +import json, os, numpy as np + +base = '$OUTDIR' +configs = [ + ('mocap / early', 'transformer_mocap_early_ablation_s{}'), + ('emg / early', 'transformer_emg_early_ablation_s{}'), + ('imu / early', None), # from v8_multiseed + ('mocap+emg / late+pt', 'transformer_mocap-emg_late_ablation_pt_s{}'), + ('mocap+imu / late+pt', 'transformer_mocap-imu_late_ablation_pt_s{}'), + ('emg+imu / late+pt', 'transformer_emg-imu_late_ablation_pt_s{}'), + ('mocap+emg+imu / late+pt', None), # from v9 +] + +seeds = [42, 123, 456, 789, 2024] +v8_base = '${BASEDIR}/results/exp1_v8_multiseed' +v9_base = '${BASEDIR}/results/exp1_v9' + +print(f'{\"Config\":<30} {\"F1 (mean±std)\":<20} {\"Acc (mean±std)\":<20} N') +print('-' * 75) + +for label, pattern in configs: + f1s, accs = [], [] + for s in seeds: + if label == 'imu / early': + path = os.path.join(v8_base, f'transformer_imu_early_s{s}', 'results.json') + elif label == 'mocap+emg+imu / late+pt': + path = os.path.join(v9_base, f'transformer_imu-mocap-emg_late_pt_s{s}', 'results.json') + else: + path = os.path.join(base, pattern.format(s), 'results.json') + if os.path.exists(path): + with open(path) as f: + d = json.load(f) + f1s.append(d['test_macro_f1']) + accs.append(d['test_accuracy']) + if f1s: + f1 = np.array(f1s) + acc = np.array(accs) + print(f'{label:<30} {f1.mean():.3f}±{f1.std():.3f} {acc.mean():.3f}±{acc.std():.3f} {len(f1s)}') + else: + print(f'{label:<30} (no results)') +" + +echo "" +echo "=== All done ===" diff --git a/experiments/slurm/run_new_exps.sh b/experiments/slurm/run_new_exps.sh new file mode 100644 index 0000000000000000000000000000000000000000..9ee7d78e83fe21322a5036a16a1bb03b7543789e --- /dev/null +++ b/experiments/slurm/run_new_exps.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# Submit all 3 new benchmark experiments (A: missing modality, B: grip force +# regression, C: T5 text retrieval) in parallel to the gpuA800 partition. +# Each single-GPU job is sbatched independently. + +set -u +PYTHON=python +BASEDIR=${PULSE_ROOT} +OUTROOT=${BASEDIR}/results/exp_new +mkdir -p ${OUTROOT}/slurm_logs + +SUBMIT() { + # args: job_name time_hrs cmd... + local jname=$1; shift + local hrs=$1; shift + sbatch \ + -J "${jname}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t "${hrs}:00:00" \ + -o "${OUTROOT}/slurm_logs/${jname}_%j.out" \ + -e "${OUTROOT}/slurm_logs/${jname}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $*" +} + +# --------------------------------------------------------------------------- +# Experiment A: Missing-modality robustness +# Train late-fusion Transformer with random modality dropout at p=0.3 +# 5 seeds, all 5 modalities +# --------------------------------------------------------------------------- +echo "=== Exp A: Missing-modality robustness (5 jobs) ===" +for seed in 42 123 456 789 2024; do + SUBMIT "expA_missing_seed${seed}" 2 \ + "$PYTHON experiments/tasks/train_exp_missing.py \ + --model transformer --fusion late \ + --modalities mocap,emg,eyetrack,imu,pressure \ + --mod_dropout_p 0.3 \ + --epochs 100 --batch_size 16 --lr 1e-3 --hidden_dim 128 \ + --patience 15 --augment \ + --seed ${seed} \ + --output_dir ${OUTROOT}/expA_missing \ + --tag ''" + echo " Submitted: expA_missing_seed${seed}" +done + +# Baseline (no dropout) for comparison, same seeds +for seed in 42 123 456; do + SUBMIT "expA_baseline_seed${seed}" 2 \ + "$PYTHON experiments/tasks/train_exp_missing.py \ + --model transformer --fusion late \ + --modalities mocap,emg,eyetrack,imu,pressure \ + --mod_dropout_p 0.0 \ + --epochs 100 --batch_size 16 --lr 1e-3 --hidden_dim 128 \ + --patience 15 --augment \ + --seed ${seed} \ + --output_dir ${OUTROOT}/expA_baseline \ + --tag ''" + echo " Submitted: expA_baseline_seed${seed}" +done + +# --------------------------------------------------------------------------- +# Experiment B: Grip force regression (T4') +# 3 backbones x 3 modality configs x 3 seeds +# --------------------------------------------------------------------------- +echo "" +echo "=== Exp B: Grip force regression ===" +BACKBONES=("transformer" "lstm") +MOD_CONFIGS=( + "emg" + "mocap" + "emg,imu" + "mocap,emg,imu,eyetrack" +) +for bb in "${BACKBONES[@]}"; do + for mods in "${MOD_CONFIGS[@]}"; do + for seed in 42 123 456; do + mod_tag=$(echo $mods | tr ',' '-') + SUBMIT "expB_grip_${bb}_${mod_tag}_s${seed}" 1 \ + "$PYTHON experiments/tasks/train_exp_grip.py \ + --backbone ${bb} --modalities ${mods} \ + --epochs 60 --batch_size 8 --lr 1e-3 \ + --hidden_dim 128 --patience 12 \ + --seed ${seed} \ + --output_dir ${OUTROOT}/expB_grip \ + --tag ''" + echo " Submitted: expB_grip_${bb}_${mod_tag}_s${seed}" + done + done +done + +# --------------------------------------------------------------------------- +# Experiment C: T5 text retrieval +# 2 modality configs x 3 seeds +# --------------------------------------------------------------------------- +echo "" +echo "=== Exp C: T5 text retrieval ===" +for mods in "mocap,emg,eyetrack,imu" "emg,imu" "mocap"; do + for seed in 42 123 456; do + mod_tag=$(echo $mods | tr ',' '-') + SUBMIT "expC_retrieval_${mod_tag}_s${seed}" 1 \ + "$PYTHON experiments/tasks/train_exp_retrieval.py \ + --modalities ${mods} \ + --epochs 60 --batch_size 64 --lr 5e-4 \ + --hidden_dim 128 --emb_dim 128 \ + --seed ${seed} \ + --output_dir ${OUTROOT}/expC_retrieval \ + --tag ''" + echo " Submitted: expC_retrieval_${mod_tag}_s${seed}" + done +done + +echo "" +echo "All jobs submitted. Monitor with: squeue -u \$USER" +echo "Results in: ${OUTROOT}/" diff --git a/experiments/slurm/run_pred.sh b/experiments/slurm/run_pred.sh new file mode 100644 index 0000000000000000000000000000000000000000..3d12be1c527a49b3b6f4d6da090f38d93e306c4e --- /dev/null +++ b/experiments/slurm/run_pred.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Sensor-to-text with LoRA-tuned Qwen2.5-0.5B +# LoRA on q_proj/v_proj + instruction prefix + max 20 tokens +# Total: 9 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred.py +OUTDIR=${BASEDIR}/results/pred_llm2 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +LLM="${BASEDIR}/models/qwen2.5-0.5b" +COMMON="--epochs 50 --batch_size 8 --lr 5e-4 --weight_decay 1e-4 --hidden_dim 64 --n_sensor_tokens 8 --downsample 5 --patience 15 --seed 42 --lora_r 8 --lora_alpha 16 --output_dir $OUTDIR --llm_name $LLM --window_sec 15.0" + +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure") + +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "pllm2_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=8 \ + --mem=64G \ + -t 4:00:00 \ + -o "${LOGDIR}/${mod_tag}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; export HF_HUB_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON" + echo "Submitted: $mods" +done + +echo "" +echo "Total: 9 jobs" +echo "LLM: $LLM (LoRA r=8 alpha=16)" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_pred_cls.sh b/experiments/slurm/run_pred_cls.sh new file mode 100644 index 0000000000000000000000000000000000000000..54e46e048138e1643bf7d581d0bd8787fe2874ba --- /dev/null +++ b/experiments/slurm/run_pred_cls.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Action Prediction via Verb-Category Classification (20 classes) +# Transformer classifier + data augmentation + label smoothing + class weights +# Total: 9 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/pred_cls +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 64 --downsample 5 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --output_dir $OUTDIR --window_sec 15.0" + +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure") + +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "pcls_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON" + echo "Submitted: $mods" +done + +echo "" +echo "Total: 9 jobs" +echo "Classes: 20 verb categories" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_pred_cls2.sh b/experiments/slurm/run_pred_cls2.sh new file mode 100644 index 0000000000000000000000000000000000000000..311bbcbd7b8f6c2c936d15ba521081e0a0873f85 --- /dev/null +++ b/experiments/slurm/run_pred_cls2.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Action Prediction Round 2: 8 coarse classes + hidden_dim=128 +# Total: 9 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/pred_cls2 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--coarse --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --output_dir $OUTDIR --window_sec 15.0" + +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure") + +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "pcls2_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON" + echo "Submitted: $mods" +done + +echo "" +echo "Total: 9 jobs | 8 coarse classes | hidden_dim=128" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_pred_cls3.sh b/experiments/slurm/run_pred_cls3.sh new file mode 100644 index 0000000000000000000000000000000000000000..c8f51454918da5a82f5755b2f3022dd08e179b5f --- /dev/null +++ b/experiments/slurm/run_pred_cls3.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Action Prediction Round 3: 8 coarse classes + prev action label + hidden_dim=128 +# Transition baseline: acc=0.31 F1w=0.25 — target: beat this with sensor+prev_action +# Total: 9 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/pred_cls3 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--coarse --use_prev_action --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --output_dir $OUTDIR --window_sec 15.0" + +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure") + +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "pcls3_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON" + echo "Submitted: $mods" +done + +echo "" +echo "Total: 9 jobs | 8 coarse + prev_action | hidden_dim=128" +echo "Baseline to beat: majority transition F1w=0.25" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_pred_cls4.sh b/experiments/slurm/run_pred_cls4.sh new file mode 100644 index 0000000000000000000000000000000000000000..f54c16a05680a8dd0a6c1590a705634304e4af44 --- /dev/null +++ b/experiments/slurm/run_pred_cls4.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Round 4: Anti-overfit — smaller model + higher dropout + lower lr + stronger augment +# Focus on top 6 modalities (skip eyetrack-only combos which are toxic) +# Also add a prev_action-only baseline (for ablation) +# Total: 7 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/pred_cls4 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +# Smaller model, stronger regularization +COMMON="--coarse --use_prev_action --epochs 100 --batch_size 32 --lr 3e-4 --weight_decay 5e-4 --hidden_dim 64 --downsample 5 --patience 25 --seed 42 --augment --noise_std 0.2 --time_mask_ratio 0.15 --label_smoothing 0.15 --output_dir $OUTDIR --window_sec 15.0" + +# Top modalities only (no eyetrack-only combos) +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu") + +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "pcls4_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON" + echo "Submitted: $mods" +done + +# Ablation: sensor-only (no prev_action) for best combo emg,imu +COMMON_NOPREV="--coarse --epochs 100 --batch_size 32 --lr 3e-4 --weight_decay 5e-4 --hidden_dim 64 --downsample 5 --patience 25 --seed 42 --augment --noise_std 0.2 --time_mask_ratio 0.15 --label_smoothing 0.15 --output_dir $OUTDIR --window_sec 15.0" +sbatch \ + -J "pcls4_emg-imu_noprev" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/emg-imu_noprev_%j.out" \ + -e "${LOGDIR}/emg-imu_noprev_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities emg,imu $COMMON_NOPREV" +echo "Submitted: emg,imu (no prev_action ablation)" + +echo "" +echo "Total: 7 jobs | anti-overfit: hidden=64, lr=3e-4, wd=5e-4, dropout, noise=0.2" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_pred_cls5.sh b/experiments/slurm/run_pred_cls5.sh new file mode 100644 index 0000000000000000000000000000000000000000..d6c29a333fe8d4073edd0ec2e51acee9184e2e44 --- /dev/null +++ b/experiments/slurm/run_pred_cls5.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Round 5: h=128 (keep capacity) + moderate regularization + multiple seeds +# Best of R3 capacity + some anti-overfit from R4 +# Also: 3 seeds for the best config to get confidence intervals + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/pred_cls5 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +# h=128, lr=5e-4, wd=3e-4, dropout=0.3, moderate augment +COMMON="--coarse --use_prev_action --epochs 80 --batch_size 32 --lr 5e-4 --weight_decay 3e-4 --hidden_dim 128 --dropout 0.3 --downsample 5 --patience 20 --augment --noise_std 0.15 --time_mask_ratio 0.12 --label_smoothing 0.1 --output_dir $OUTDIR --window_sec 15.0" + +# Top 6 modality combos +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu") + +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "pcls5_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_s42_%j.out" \ + -e "${LOGDIR}/${mod_tag}_s42_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods --seed 42 $COMMON" + echo "Submitted: $mods seed=42" +done + +# 2 extra seeds for emg,imu (best combo) for confidence intervals +for seed in 123 456; do + sbatch \ + -J "pcls5_emg-imu_s${seed}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/emg-imu_s${seed}_%j.out" \ + -e "${LOGDIR}/emg-imu_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities emg,imu --seed $seed $COMMON" + echo "Submitted: emg,imu seed=$seed" +done + +echo "" +echo "Total: 8 jobs | h=128, lr=5e-4, dropout=0.3, wd=3e-4" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_pred_multiseed.sh b/experiments/slurm/run_pred_multiseed.sh new file mode 100644 index 0000000000000000000000000000000000000000..8682ecf19555e8ad9dea948a065775a56d12fa44 --- /dev/null +++ b/experiments/slurm/run_pred_multiseed.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Action Prediction multi-seed: 5 seeds × top 3 modalities +# Best settings from pred_cls3: 8 coarse + prev_action + ds=5 + window=10s +# Total: 15 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/pred_multiseed +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +BASE="--mode prediction --coarse --use_prev_action --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --dropout 0.2 --downsample 5 --patience 20 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --window_sec 10.0 --output_dir $OUTDIR" + +# Top 3 from pred_cls3: emg,imu (F1w=0.306), mocap,emg,eyetrack,imu (0.277), mocap,emg,imu (0.272) +TOP_MODS=("emg,imu" "mocap,emg,eyetrack,imu" "mocap,emg,imu") +SEEDS=(42 123 456 789 1024) + +for mods in "${TOP_MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + for seed in "${SEEDS[@]}"; do + sbatch \ + -J "pred_ms_${mod_tag}_s${seed}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_s${seed}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods --seed $seed --tag s${seed} $BASE" + echo "Submitted: $mods seed=$seed" + done +done + +echo "" +echo "Total: 15 jobs | Prediction Multi-seed" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_pub_extra.sh b/experiments/slurm/run_pub_extra.sh new file mode 100644 index 0000000000000000000000000000000000000000..2359ce7ac1eb946df67afdd3ae45727fd7291fbf --- /dev/null +++ b/experiments/slurm/run_pub_extra.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=6:00:00 +#SBATCH --job-name=PubExtra +#SBATCH --output=${PULSE_ROOT}/results/pub_extra_%j.log + +# Extra published baseline experiments: +# 1. TinyHAR with more modality combos & fusion for scene recognition +# 2. TinyHAR for all 5 modalities +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd $PROJECT +OUT1=$PROJECT/results/published_baselines/exp1_tinyhar_extra +mkdir -p $OUT1 + +echo "=== TinyHAR Extra Experiments ===" + +# More fusion strategies for emg+imu +for FUSION in attention gated_late stacking product moe; do + echo "--- TinyHAR / emg,imu / ${FUSION} ---" + $PYTHON experiments/train_exp1.py \ + --model tinyhar --modalities emg,imu --fusion $FUSION \ + --hidden_dim 32 --epochs 100 --batch_size 16 \ + --lr 1e-3 --weight_decay 1e-3 --downsample 5 \ + --seed 42 --output_dir $OUT1 \ + --tag extra 2>&1 | tail -3 +done + +# More modality combos with late fusion +for MOD in "mocap,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure"; do + echo "--- TinyHAR / ${MOD} / late ---" + $PYTHON experiments/train_exp1.py \ + --model tinyhar --modalities $MOD --fusion late \ + --hidden_dim 32 --epochs 100 --batch_size 16 \ + --lr 1e-3 --weight_decay 1e-3 --downsample 5 \ + --seed 42 --output_dir $OUT1 \ + --tag extra 2>&1 | tail -3 +done + +echo "=== Done ===" diff --git a/experiments/slurm/run_pub_multiseed_exp1.sh b/experiments/slurm/run_pub_multiseed_exp1.sh new file mode 100644 index 0000000000000000000000000000000000000000..8fd0fdc1f1cc1a3f299d21ea227e5af7f8d9387a --- /dev/null +++ b/experiments/slurm/run_pub_multiseed_exp1.sh @@ -0,0 +1,37 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=6:00:00 +#SBATCH --job-name=TinyHAR_ms +#SBATCH --output=${PULSE_ROOT}/results/pub_multiseed_exp1_%j.log + +# TinyHAR multi-seed scene recognition (5 seeds for best configs) +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd $PROJECT +OUT=$PROJECT/results/published_baselines/exp1_tinyhar_multiseed +mkdir -p $OUT + +echo "=== TinyHAR Multi-Seed Scene Recognition ===" + +for SEED in 42 123 456 789 2024; do + for MOD in imu "emg,imu"; do + for FUSION in early late; do + # Skip emg,imu+early with non-42 seeds if already done + echo "--- seed=$SEED / ${MOD} / ${FUSION} ---" + $PYTHON experiments/train_exp1.py \ + --model tinyhar --modalities $MOD --fusion $FUSION \ + --hidden_dim 32 --epochs 100 --batch_size 16 \ + --lr 1e-3 --weight_decay 1e-3 --downsample 5 \ + --seed $SEED --output_dir $OUT \ + --tag "s${SEED}" 2>&1 | tail -3 + done + done +done + +echo "=== Done ===" diff --git a/experiments/slurm/run_pub_multiseed_exp2.sh b/experiments/slurm/run_pub_multiseed_exp2.sh new file mode 100644 index 0000000000000000000000000000000000000000..7e6d039cf1c7c5a08e368353c7c5f7992a19d2a8 --- /dev/null +++ b/experiments/slurm/run_pub_multiseed_exp2.sh @@ -0,0 +1,33 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=8:00:00 +#SBATCH --job-name=ASF_seg_ms +#SBATCH --output=${PULSE_ROOT}/results/pub_multiseed_exp2_%j.log + +# ASFormer multi-seed temporal segmentation +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd $PROJECT +OUT=$PROJECT/results/published_baselines/exp2_asformer_multiseed +mkdir -p $OUT + +echo "=== ASFormer Multi-Seed Temporal Segmentation ===" + +for SEED in 42 123 456 789 2024; do + for MOD in mocap "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu"; do + echo "--- seed=$SEED / ${MOD} ---" + $PYTHON experiments/train_exp2.py \ + --model asformer --modalities $MOD \ + --hidden_dim 64 --epochs 80 --batch_size 16 \ + --lr 5e-4 --weight_decay 1e-4 --downsample 2 \ + --seed $SEED --output_dir $OUT 2>&1 | tail -3 + done +done + +echo "=== Done ===" diff --git a/experiments/slurm/run_pub_multiseed_exp3.sh b/experiments/slurm/run_pub_multiseed_exp3.sh new file mode 100644 index 0000000000000000000000000000000000000000..c7500ba6d11ee1d50bcef157672e0a24f2e000db --- /dev/null +++ b/experiments/slurm/run_pub_multiseed_exp3.sh @@ -0,0 +1,33 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=6:00:00 +#SBATCH --job-name=ASF_ct_ms +#SBATCH --output=${PULSE_ROOT}/results/pub_multiseed_exp3_%j.log + +# ASFormer multi-seed contact detection +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd $PROJECT +OUT=$PROJECT/results/published_baselines/exp3_asformer_multiseed +mkdir -p $OUT + +echo "=== ASFormer Multi-Seed Contact Detection ===" + +for SEED in 42 123 456 789 2024; do + for MOD in emg imu mocap "mocap,emg"; do + echo "--- seed=$SEED / ${MOD} ---" + $PYTHON experiments/train_exp3.py \ + --model asformer --modalities $MOD \ + --hidden_dim 64 --epochs 50 --batch_size 32 \ + --lr 1e-3 --weight_decay 1e-4 --downsample 2 \ + --seed $SEED --output_dir $OUT 2>&1 | tail -3 + done +done + +echo "=== Done ===" diff --git a/experiments/slurm/run_published_baselines.sh b/experiments/slurm/run_published_baselines.sh new file mode 100644 index 0000000000000000000000000000000000000000..7be76b2932951fe1e5de7eabc4b7a58919408959 --- /dev/null +++ b/experiments/slurm/run_published_baselines.sh @@ -0,0 +1,175 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:2 +#SBATCH --mem=64G +#SBATCH --time=12:00:00 +#SBATCH --job-name=PubBaselines +#SBATCH --output=${PULSE_ROOT}/results/published_baselines_%j.log + +# Published Baselines for DailyAct-5M +# ASFormer (Yi et al., BMVC 2021) - Temporal Segmentation & Contact Detection +# TinyHAR (Zhou et al., ISWC 2022 Best Paper) - Scene Recognition + +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd $PROJECT + +EXP1_OUT=$PROJECT/results/published_baselines/exp1_tinyhar +EXP2_OUT=$PROJECT/results/published_baselines/exp2_asformer +EXP3_OUT=$PROJECT/results/published_baselines/exp3_asformer +mkdir -p $EXP1_OUT $EXP2_OUT $EXP3_OUT + +echo "==========================================" +echo "Published Baselines - $(date)" +echo "==========================================" + +# ============================================================ +# Group 1: TinyHAR for Scene Recognition (Exp 1) +# Run on GPU 0 +# ============================================================ +( +export CUDA_VISIBLE_DEVICES=0 + +echo "" +echo "=== [GPU0] Exp1: TinyHAR Scene Recognition ===" + +# Single modalities +for MOD in imu mocap emg eyetrack pressure; do + echo "--- TinyHAR / ${MOD} / early ---" + $PYTHON experiments/train_exp1.py \ + --model tinyhar --modalities $MOD --fusion early \ + --hidden_dim 32 --epochs 100 --batch_size 16 \ + --lr 1e-3 --weight_decay 1e-3 --downsample 5 \ + --seed 42 --output_dir $EXP1_OUT \ + --tag published 2>&1 | tail -5 +done + +# Best multi-modal combos +for MOD in "emg,imu" "mocap,emg,imu" "mocap,emg,eyetrack,imu"; do + echo "--- TinyHAR / ${MOD} / early ---" + $PYTHON experiments/train_exp1.py \ + --model tinyhar --modalities $MOD --fusion early \ + --hidden_dim 32 --epochs 100 --batch_size 16 \ + --lr 1e-3 --weight_decay 1e-3 --downsample 5 \ + --seed 42 --output_dir $EXP1_OUT \ + --tag published 2>&1 | tail -5 +done + +# TinyHAR with late fusion (emg + imu) +for FUSION in late weighted_late feat_concat; do + echo "--- TinyHAR / emg,imu / ${FUSION} ---" + $PYTHON experiments/train_exp1.py \ + --model tinyhar --modalities emg,imu --fusion $FUSION \ + --hidden_dim 32 --epochs 100 --batch_size 16 \ + --lr 1e-3 --weight_decay 1e-3 --downsample 5 \ + --seed 42 --output_dir $EXP1_OUT \ + --tag published 2>&1 | tail -5 +done + +echo "[GPU0] TinyHAR experiments complete." +) & +PID_GPU0=$! + + +# ============================================================ +# Group 2: ASFormer for Segmentation (Exp 2) + Contact (Exp 3) +# Run on GPU 1 +# ============================================================ +( +export CUDA_VISIBLE_DEVICES=1 + +echo "" +echo "=== [GPU1] Exp2: ASFormer Temporal Segmentation ===" + +# Key modality combinations +for MOD in mocap emg "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure"; do + echo "--- ASFormer / ${MOD} ---" + $PYTHON experiments/train_exp2.py \ + --model asformer --modalities $MOD \ + --hidden_dim 64 --epochs 80 --batch_size 16 \ + --lr 5e-4 --weight_decay 1e-4 --downsample 2 \ + --seed 42 --output_dir $EXP2_OUT 2>&1 | tail -5 +done + +echo "" +echo "=== [GPU1] Exp3: ASFormer Contact Detection ===" + +# Key modality combinations +for MOD in mocap emg imu "mocap,emg" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu"; do + echo "--- ASFormer / ${MOD} ---" + $PYTHON experiments/train_exp3.py \ + --model asformer --modalities $MOD \ + --hidden_dim 64 --epochs 50 --batch_size 32 \ + --lr 1e-3 --weight_decay 1e-4 --downsample 2 \ + --seed 42 --output_dir $EXP3_OUT 2>&1 | tail -5 +done + +echo "[GPU1] ASFormer experiments complete." +) & +PID_GPU1=$! + +# Wait for both GPU groups +wait $PID_GPU0 +wait $PID_GPU1 + +echo "" +echo "==========================================" +echo "All published baseline experiments complete - $(date)" +echo "==========================================" + +# ============================================================ +# Collect results summary +# ============================================================ +echo "" +echo "=== Results Summary ===" + +echo "" +echo "--- Exp1: TinyHAR Scene Recognition ---" +for f in $EXP1_OUT/*/results.json; do + if [ -f "$f" ]; then + $PYTHON -c " +import json +with open('$f') as fp: + r = json.load(fp) +mods = ','.join(r.get('modalities', [])) +fus = r.get('fusion', 'early') +f1 = r.get('test_macro_f1', 0) +acc = r.get('test_accuracy', 0) +print(f' TinyHAR | {mods:<30} | {fus:<12} | F1={f1:.4f} Acc={acc:.4f}') +" + fi +done + +echo "" +echo "--- Exp2: ASFormer Temporal Segmentation ---" +for f in $EXP2_OUT/*/results.json; do + if [ -f "$f" ]; then + $PYTHON -c " +import json +with open('$f') as fp: + r = json.load(fp) +mods = ','.join(r.get('modalities', [])) +m = r.get('test_metrics', {}) +print(f' ASFormer | {mods:<35} | Acc={m.get(\"frame_acc\",0):.4f} F1={m.get(\"frame_f1\",0):.4f} Seg@50={m.get(\"seg_f1@50\",0):.4f}') +" + fi +done + +echo "" +echo "--- Exp3: ASFormer Contact Detection ---" +for f in $EXP3_OUT/*/results.json; do + if [ -f "$f" ]; then + $PYTHON -c " +import json +with open('$f') as fp: + r = json.load(fp) +mods = ','.join(r.get('input_modalities', [])) +m = r.get('test_metrics', {}) +print(f' ASFormer | {mods:<30} | R_F1={m.get(\"right_f1\",0):.4f} L_F1={m.get(\"left_f1\",0):.4f} Avg_F1={m.get(\"avg_f1\",0):.4f}') +" + fi +done diff --git a/experiments/slurm/run_published_baselines_v2.sh b/experiments/slurm/run_published_baselines_v2.sh new file mode 100644 index 0000000000000000000000000000000000000000..280b4f41498bfcaea53eb972d37dc50e4be5063e --- /dev/null +++ b/experiments/slurm/run_published_baselines_v2.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# ============================================================ +# Run all 6 published baseline models across 4 experiments +# Submit to SLURM gpuA800 partition +# ============================================================ + +PYTHON=python3 +BASEDIR=${PULSE_ROOT} +OUTBASE=${BASEDIR}/results/published_baselines_v2 + +SEED=42 +ENV_SETUP="export PYTHONUNBUFFERED=1; export LD_LIBRARY_PATH=${PULSE_ROOT} cd ${BASEDIR}" + +submit() { + # $1=job_name $2=time $3=mem $4=command + local LOGDIR="${OUTBASE}/slurm_logs" + mkdir -p "$LOGDIR" + sbatch -J "$1" -p gpuA800 --gres=gpu:1 -N1 -n1 \ + --cpus-per-task=4 --mem="$3" -t "$2" \ + -o "${LOGDIR}/${1}_%j.out" \ + -e "${LOGDIR}/${1}_%j.err" \ + --export=ALL \ + --wrap="${ENV_SETUP}; $4" + echo " Submitted: $1" +} + +# ============================================================ +# Exp1: Scene Recognition - DeepConvLSTM + InceptionTime +# ============================================================ +echo "=== Exp1: Scene Recognition ===" +OUTDIR_E1=${OUTBASE}/exp1 +EXP1_COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 32 --downsample 5 --patience 15 --seed $SEED --output_dir $OUTDIR_E1" + +for model in deepconvlstm inceptiontime; do + # Single modality + for mod in imu mocap emg; do + submit "e1_${model}_${mod}" "2:00:00" "32G" \ + "$PYTHON experiments/train_exp1.py --model $model --modalities $mod --fusion early $EXP1_COMMON" + done + # Multi-modal early + late + submit "e1_${model}_ime_early" "2:00:00" "32G" \ + "$PYTHON experiments/train_exp1.py --model $model --modalities imu,mocap,emg --fusion early $EXP1_COMMON" + submit "e1_${model}_ime_late" "2:00:00" "32G" \ + "$PYTHON experiments/train_exp1.py --model $model --modalities imu,mocap,emg --fusion late $EXP1_COMMON" +done +# Total Exp1: 2 models × (3 single + 2 multi) = 10 jobs + +# ============================================================ +# Exp2: Action Segmentation - MS-TCN++ + DiffAct +# ============================================================ +echo "" +echo "=== Exp2: Action Segmentation ===" +OUTDIR_E2=${OUTBASE}/exp2 +EXP2_COMMON="--epochs 80 --batch_size 16 --lr 5e-4 --weight_decay 1e-4 --hidden_dim 64 --downsample 2 --patience 15 --seed $SEED --output_dir $OUTDIR_E2" + +for model in mstcnpp diffact; do + for mods in mocap mocap,emg,eyetrack mocap,emg,eyetrack,imu mocap,emg,eyetrack,imu,pressure; do + mod_tag=${mods//,/-} + submit "e2_${model}_${mod_tag}" "6:00:00" "64G" \ + "$PYTHON experiments/train_exp2.py --model $model --modalities $mods $EXP2_COMMON" + done +done +# Total Exp2: 2 models × 4 modality combos = 8 jobs + +# ============================================================ +# Exp3: Contact Detection - DeepConvLSTM + InceptionTime + UnderPressure +# ============================================================ +echo "" +echo "=== Exp3: Contact Detection ===" +OUTDIR_E3=${OUTBASE}/exp3 +EXP3_COMMON="--epochs 50 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 64 --downsample 2 --patience 10 --seed $SEED --output_dir $OUTDIR_E3" + +for model in deepconvlstm inceptiontime underpressure; do + for mods in mocap emg imu mocap,emg mocap,emg,eyetrack,imu; do + mod_tag=${mods//,/-} + submit "e3_${model}_${mod_tag}" "4:00:00" "32G" \ + "$PYTHON experiments/train_exp3.py --model $model --modalities $mods $EXP3_COMMON" + done +done +# Total Exp3: 3 models × 5 modality combos = 15 jobs + +# ============================================================ +# Exp4: Cross-Modal Prediction - UnderPressure (4a) + emg2pose (4b) +# ============================================================ +echo "" +echo "=== Exp4: Cross-Modal Prediction ===" +OUTDIR_E4=${OUTBASE}/exp4 +EXP4_COMMON="--epochs 50 --batch_size 32 --lr 5e-4 --weight_decay 1e-4 --hidden_dim 128 --downsample 2 --patience 10 --seed $SEED --output_dir $OUTDIR_E4" + +# 4a: MoCap -> Pressure (UnderPressure) +submit "e4_4a_underpressure" "4:00:00" "32G" \ + "$PYTHON experiments/train_exp4.py --subtask 4a --model underpressure $EXP4_COMMON" + +# 4b: EMG -> Hand Pose (emg2pose velocity + direct) +submit "e4_4b_emg2pose" "4:00:00" "32G" \ + "$PYTHON experiments/train_exp4.py --subtask 4b --model emg2pose $EXP4_COMMON" +submit "e4_4b_emg2pose_direct" "4:00:00" "32G" \ + "$PYTHON experiments/train_exp4.py --subtask 4b --model emg2pose_direct $EXP4_COMMON" +# Total Exp4: 3 jobs + +echo "" +echo "=== Total: 36 jobs submitted ===" +echo " Exp1: 10 jobs (DeepConvLSTM + InceptionTime)" +echo " Exp2: 8 jobs (MS-TCN++ + DiffAct)" +echo " Exp3: 15 jobs (DeepConvLSTM + InceptionTime + UnderPressure)" +echo " Exp4: 3 jobs (UnderPressure + emg2pose)" +echo "" +echo "Monitor: squeue -u \$(whoami)" +echo "Results: ${OUTBASE}/" diff --git a/experiments/slurm/run_recog.sh b/experiments/slurm/run_recog.sh new file mode 100644 index 0000000000000000000000000000000000000000..8a96d6f670d777b18f654468172c3d5650366e48 --- /dev/null +++ b/experiments/slurm/run_recog.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Action Recognition: classify current action from within-segment sensor data +# 20 fine verb classes, no prev_action needed +# Total: 9 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/recog +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +# 20 fine classes, recognition mode, window=10s +COMMON="--mode recognition --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --dropout 0.2 --downsample 5 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --output_dir $OUTDIR --window_sec 10.0" + +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure") + +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "recog_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON" + echo "Submitted: $mods" +done + +echo "" +echo "Total: 9 jobs | Action Recognition | 20 fine classes | window=10s" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_recog2.sh b/experiments/slurm/run_recog2.sh new file mode 100644 index 0000000000000000000000000000000000000000..dac9629fd566ce82e46a626a02be068683f53493 --- /dev/null +++ b/experiments/slurm/run_recog2.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Action Recognition Round 2: Fix over-padding + add prev_action +# Key insight: segments are 1-6s (median 2s), window_sec=10 was 80% padding +# Group A: window=4s, 8 coarse (9 jobs) +# Group B: window=4s, 8 coarse + prev_action (9 jobs) +# Total: 18 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR_A=${BASEDIR}/results/recog2a +OUTDIR_B=${BASEDIR}/results/recog2b +LOGDIR_A=${OUTDIR_A}/slurm_logs +LOGDIR_B=${OUTDIR_B}/slurm_logs +mkdir -p $LOGDIR_A $LOGDIR_B + +COMMON_A="--mode recognition --coarse --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --dropout 0.2 --downsample 5 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --window_sec 4.0" +COMMON_B="$COMMON_A --use_prev_action" + +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure") + +echo "=== Group A: window=4s, no prev_action ===" +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "rec2a_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR_A}/${mod_tag}_%j.out" \ + -e "${LOGDIR_A}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON_A --output_dir $OUTDIR_A" + echo "Submitted A: $mods" +done + +echo "" +echo "=== Group B: window=4s + prev_action ===" +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "rec2b_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR_B}/${mod_tag}_%j.out" \ + -e "${LOGDIR_B}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON_B --output_dir $OUTDIR_B" + echo "Submitted B: $mods" +done + +echo "" +echo "Total: 18 jobs | Recognition Round 2" +echo "Group A (window=4s): $OUTDIR_A" +echo "Group B (window=4s+prev): $OUTDIR_B" diff --git a/experiments/slurm/run_recog3.sh b/experiments/slurm/run_recog3.sh new file mode 100644 index 0000000000000000000000000000000000000000..d5725e93567ea1663f39ca005ee8cbf040e594e7 --- /dev/null +++ b/experiments/slurm/run_recog3.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Action Recognition Round 3: +# Group A: 8 coarse + prev + window=4s + downsample=2 (more frames) — 9 jobs +# Group B: 20 fine + prev + window=4s — 9 jobs +# Group C: 8 coarse + prev + window=4s + smaller model h=64 — 3 best modalities +# Total: 21 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR_A=${BASEDIR}/results/recog3a +OUTDIR_B=${BASEDIR}/results/recog3b +OUTDIR_C=${BASEDIR}/results/recog3c +mkdir -p ${OUTDIR_A}/slurm_logs ${OUTDIR_B}/slurm_logs ${OUTDIR_C}/slurm_logs + +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure") + +# Group A: 8 coarse + prev + downsample=2 (gives ~100 frames for 2s segments at 100Hz) +COMMON_A="--mode recognition --coarse --use_prev_action --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --dropout 0.2 --downsample 2 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --window_sec 4.0" + +echo "=== Group A: 8 coarse + prev + ds=2 ===" +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "rec3a_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${OUTDIR_A}/slurm_logs/${mod_tag}_%j.out" \ + -e "${OUTDIR_A}/slurm_logs/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON_A --output_dir $OUTDIR_A" + echo "Submitted A: $mods" +done + +# Group B: 20 fine + prev + window=4s + ds=5 +COMMON_B="--mode recognition --use_prev_action --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --dropout 0.2 --downsample 5 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --window_sec 4.0" + +echo "" +echo "=== Group B: 20 fine + prev ===" +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "rec3b_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${OUTDIR_B}/slurm_logs/${mod_tag}_%j.out" \ + -e "${OUTDIR_B}/slurm_logs/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON_B --output_dir $OUTDIR_B" + echo "Submitted B: $mods" +done + +# Group C: 8 coarse + prev + h=64 (less overfit) — top 3 from Group B round 2 +COMMON_C="--mode recognition --coarse --use_prev_action --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 64 --dropout 0.3 --downsample 5 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --window_sec 4.0" +TOP_MODS=("mocap,emg,eyetrack" "mocap,emg,imu" "imu") + +echo "" +echo "=== Group C: 8 coarse + prev + h=64 ===" +for mods in "${TOP_MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "rec3c_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${OUTDIR_C}/slurm_logs/${mod_tag}_%j.out" \ + -e "${OUTDIR_C}/slurm_logs/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON_C --output_dir $OUTDIR_C" + echo "Submitted C: $mods" +done + +echo "" +echo "Total: 21 jobs | Recognition Round 3" +echo "A (ds=2): $OUTDIR_A | B (20fine+prev): $OUTDIR_B | C (h=64): $OUTDIR_C" diff --git a/experiments/slurm/run_recog4.sh b/experiments/slurm/run_recog4.sh new file mode 100644 index 0000000000000000000000000000000000000000..ec1d4a6c947171d664042f18209eeda9f8110f8f --- /dev/null +++ b/experiments/slurm/run_recog4.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Action Recognition Round 4: Fix epoch-1 overfit with lower LR + warmup +# Test top 3 modality combos from recog3a with LR sweep +# Total: 9 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/recog4 +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +# Best settings from recog3a: ds=2, window=4s, coarse, prev_action +BASE="--mode recognition --coarse --use_prev_action --epochs 80 --batch_size 32 --weight_decay 1e-4 --hidden_dim 128 --dropout 0.2 --downsample 2 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --window_sec 4.0 --output_dir $OUTDIR" + +# Top 3 modality combos +TOP_MODS=("mocap,emg,eyetrack" "mocap,imu" "mocap,emg,imu") +LRS=("3e-4" "1e-4" "5e-5") + +for mods in "${TOP_MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + for lr in "${LRS[@]}"; do + lr_tag=$(echo $lr | tr '-' 'n') + sbatch \ + -J "rec4_${mod_tag}_${lr_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_lr${lr_tag}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_lr${lr_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods --lr $lr --tag lr${lr_tag} $BASE" + echo "Submitted: $mods lr=$lr" + done +done + +echo "" +echo "Total: 9 jobs | Recognition Round 4 | LR sweep" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_recog_coarse.sh b/experiments/slurm/run_recog_coarse.sh new file mode 100644 index 0000000000000000000000000000000000000000..18d1711e52c9c446422ae9fa5b677343aa4396ee --- /dev/null +++ b/experiments/slurm/run_recog_coarse.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Action Recognition with 8 coarse classes (compare with 20 fine) +# Total: 9 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/recog_coarse +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +COMMON="--mode recognition --coarse --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --dropout 0.2 --downsample 5 --patience 20 --seed 42 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --output_dir $OUTDIR --window_sec 10.0" + +MODS=("imu" "emg" "mocap" "emg,imu" "mocap,imu" "mocap,emg,imu" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,imu,pressure") + +for mods in "${MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + sbatch \ + -J "recogC_${mod_tag}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods $COMMON" + echo "Submitted: $mods" +done + +echo "" +echo "Total: 9 jobs | Recognition | 8 coarse classes | window=10s" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_recog_ensemble.sh b/experiments/slurm/run_recog_ensemble.sh new file mode 100644 index 0000000000000000000000000000000000000000..7dd67b1be40056107168926d44eef4877c336a51 --- /dev/null +++ b/experiments/slurm/run_recog_ensemble.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Action Recognition Ensemble: 5 seeds × top 3 modality combos +# Then evaluate ensemble via majority voting +# Total: 15 jobs + +PYTHON=python +BASEDIR=${PULSE_ROOT} +TRAIN_SCRIPT=${BASEDIR}/experiments/tasks/train_pred_cls.py +OUTDIR=${BASEDIR}/results/recog_ens +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p $LOGDIR + +BASE="--mode recognition --coarse --use_prev_action --epochs 80 --batch_size 32 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --dropout 0.2 --downsample 2 --patience 20 --augment --noise_std 0.1 --time_mask_ratio 0.1 --label_smoothing 0.1 --window_sec 4.0 --output_dir $OUTDIR" + +TOP_MODS=("mocap,emg,eyetrack" "mocap,imu" "mocap,emg,imu") +SEEDS=(42 123 456 789 1024) + +for mods in "${TOP_MODS[@]}"; do + mod_tag=$(echo $mods | tr ',' '-') + for seed in "${SEEDS[@]}"; do + sbatch \ + -J "ens_${mod_tag}_s${seed}" \ + -p gpuA800 \ + --gres=gpu:1 \ + -N 1 -n 1 \ + --cpus-per-task=4 \ + --mem=32G \ + -t 2:00:00 \ + -o "${LOGDIR}/${mod_tag}_s${seed}_%j.out" \ + -e "${LOGDIR}/${mod_tag}_s${seed}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $TRAIN_SCRIPT --modalities $mods --seed $seed --tag s${seed} $BASE" + echo "Submitted: $mods seed=$seed" + done +done + +echo "" +echo "Total: 15 jobs | Ensemble seeds" +echo "Results: $OUTDIR" diff --git a/experiments/slurm/run_seqpred_all.sh b/experiments/slurm/run_seqpred_all.sh new file mode 100644 index 0000000000000000000000000000000000000000..fcd977563f2a3c710b3f02f2c2966bc6bcc60d34 --- /dev/null +++ b/experiments/slurm/run_seqpred_all.sh @@ -0,0 +1,161 @@ +#!/bin/bash +# SLURM launcher for T10 Triplet Next-Action Prediction experiments. +# +# Produces all five tables from the paper plan: +# Table 1: main comparison (T_fut=2s) — 1 model × 5 seeds +# Table 3: horizon curve — 5 horizons × 5 seeds (same model) +# Table 4: modality ablation — 6 configs × 5 seeds (ours only) +# Table 5: component ablation — 5 variants × 5 seeds (ours only) +# Table 7: missing-modality robustness — trained once w/ modality dropout, +# evaluated under 6 test-time drops +# +# ~140 jobs in total. Uses `gpuHygonZ100` (2 idle nodes); change PARTITION to +# `gpuA800` if larger slots are available. +# +# Usage: +# bash experiments/run_seqpred_all.sh +# bash experiments/run_seqpred_all.sh --dry # print what would submit +# +# Outputs: results/seqpred/_/{config.json, results.json, +# model_best.pt} +# Aggregate into tables with experiments/analysis/aggregate_seqpred.py (TBD). + +set -euo pipefail + +DRY=${1:-} +PYTHON=${PYTHON:-python3} +BASEDIR=${BASEDIR:-${PULSE_ROOT}} +TRAIN=${BASEDIR}/experiments/tasks/train_seqpred.py +OUTDIR=${BASEDIR}/results/seqpred +LOGDIR=${OUTDIR}/slurm_logs +mkdir -p "${LOGDIR}" + +PARTITION=${PARTITION:-gpuHygonZ100} +GPU_GRES=${GPU_GRES:-gpu:1} +CPUS=${CPUS:-4} +MEM=${MEM:-48G} +TIME=${TIME:-6:00:00} + +BASE_ARGS="--epochs 40 --batch_size 32 --lr 3e-4 --weight_decay 1e-4 \ + --dropout 0.2 --patience 12 --label_smoothing 0.05 \ + --use_class_weights --num_workers 2" + +ALL_MODS="imu,emg,eyetrack,mocap,pressure" + +submit() { + local JOB_NAME=$1 + local OUT_SUB=$2 + shift 2 + local CMD="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; \ + ${PYTHON} ${TRAIN} $* --output_dir ${OUTDIR}/${OUT_SUB}" + if [[ "${DRY}" == "--dry" ]]; then + echo "--- ${JOB_NAME} ---" + echo " out: ${OUTDIR}/${OUT_SUB}" + echo " $*" + return + fi + sbatch \ + -J "sp_${JOB_NAME}" \ + -p "${PARTITION}" \ + --gres="${GPU_GRES}" \ + -N 1 -n 1 \ + --cpus-per-task=${CPUS} \ + --mem=${MEM} \ + -t "${TIME}" \ + -o "${LOGDIR}/${JOB_NAME}_%j.out" \ + -e "${LOGDIR}/${JOB_NAME}_%j.err" \ + --export=ALL \ + --wrap="${CMD}" + echo "submitted: ${JOB_NAME} -> ${OUT_SUB}" +} + +SEEDS=(42 123 456 789 1024) + +# --------------------------------------------------------------------- +# Table 1: main comparison at T_fut=2s +# Baselines (B1..B8) run on their preferred modality subsets; +# DailyActFormer runs on ALL 5 modalities. +# --------------------------------------------------------------------- +echo "=== Table 1: main comparison ===" + +for seed in "${SEEDS[@]}"; do + # --- our model, full 5-modality --- + submit "t1_ours_all5_s${seed}" "t1_ours_all5/seed${seed}" \ + --model dailyactformer --modalities ${ALL_MODS} \ + --t_obs 8 --t_fut 2 --seed ${seed} ${BASE_ARGS} + + # --- DeepConvLSTM (IMU only) --- + submit "t1_dcl_imu_s${seed}" "t1_dcl_imu/seed${seed}" \ + --model deepconvlstm --modalities imu \ + --t_obs 8 --t_fut 2 --seed ${seed} ${BASE_ARGS} + + # --- DeepConvLSTM (IMU+MoCap+EMG, best 3-modality for baselines) --- + submit "t1_dcl_3mod_s${seed}" "t1_dcl_3mod/seed${seed}" \ + --model deepconvlstm --modalities imu,mocap,emg \ + --t_obs 8 --t_fut 2 --seed ${seed} ${BASE_ARGS} +done + +# --------------------------------------------------------------------- +# Table 3: horizon curve (our model only, 5 horizons × 5 seeds = 25 jobs) +# --------------------------------------------------------------------- +echo "" +echo "=== Table 3: horizon curve ===" +for tfut in 1 2 5 10 15; do + for seed in "${SEEDS[@]}"; do + submit "t3_ours_tfut${tfut}_s${seed}" \ + "t3_ours_tfut${tfut}/seed${seed}" \ + --model dailyactformer --modalities ${ALL_MODS} \ + --t_obs 8 --t_fut ${tfut} --seed ${seed} ${BASE_ARGS} + done +done + +# --------------------------------------------------------------------- +# Table 4: modality ablation on our model (remove one modality at a time) +# --------------------------------------------------------------------- +echo "" +echo "=== Table 4: modality ablation ===" +declare -A ABLATIONS +ABLATIONS["noPressure"]="imu,emg,eyetrack,mocap" +ABLATIONS["noEyeTrack"]="imu,emg,mocap,pressure" +ABLATIONS["noEMG"]="imu,eyetrack,mocap,pressure" +ABLATIONS["noIMU"]="emg,eyetrack,mocap,pressure" +ABLATIONS["noMoCap"]="imu,emg,eyetrack,pressure" +ABLATIONS["onlyIMU_EMG"]="imu,emg" +ABLATIONS["onlyMoCap"]="mocap" +ABLATIONS["onlyEMG"]="emg" +for tag in "${!ABLATIONS[@]}"; do + mods="${ABLATIONS[$tag]}" + for seed in "${SEEDS[@]}"; do + submit "t4_${tag}_s${seed}" "t4_${tag}/seed${seed}" \ + --model dailyactformer --modalities ${mods} \ + --t_obs 8 --t_fut 2 --seed ${seed} ${BASE_ARGS} + done +done + +# --------------------------------------------------------------------- +# Table 5: component ablation on our model +# (ablation switches TBD — parameter hooks need to be added to the model +# first. For now submit a placeholder using lambda weights.) +# --------------------------------------------------------------------- +echo "" +echo "=== Table 5: component ablation (placeholders) ===" +# 5a: no aux verb_composite head (set lambda to 0) +for seed in "${SEEDS[@]}"; do + submit "t5_noComp_s${seed}" "t5_noComp/seed${seed}" \ + --model dailyactformer --modalities ${ALL_MODS} \ + --t_obs 8 --t_fut 2 --seed ${seed} ${BASE_ARGS} \ + --lambda_verb_composite 0.0 +done +# 5b: equal-weight heads (remove our lambda prior) +for seed in "${SEEDS[@]}"; do + submit "t5_equalLambda_s${seed}" "t5_equalLambda/seed${seed}" \ + --model dailyactformer --modalities ${ALL_MODS} \ + --t_obs 8 --t_fut 2 --seed ${seed} ${BASE_ARGS} \ + --lambda_verb_composite 1.0 --lambda_hand 1.0 +done + +# 5c/5d/5e (modality-stem / fusion / causal-mask toggles) require model +# plumbing — we'll add CLI flags later. + +echo "" +echo "All done. Inspect with: squeue -u \$USER | head" diff --git a/experiments/slurm/run_t1_all.sh b/experiments/slurm/run_t1_all.sh new file mode 100644 index 0000000000000000000000000000000000000000..aeaf3b5d66b6630fe2f5004da3a66e7450cfb4a0 --- /dev/null +++ b/experiments/slurm/run_t1_all.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Submit all T1 scene recognition baselines + SyncFuse. +# 8 methods x 3 seeds = 24 jobs, each on 1 A800 GPU. + +set -u +PYTHON=python +BASEDIR=${PULSE_ROOT} +OUTDIR=${BASEDIR}/results/t1_extended +LOGDIR=${OUTDIR}/slurm_logs +PRETRAIN_DIR=${BASEDIR}/results/exp1_v2 +mkdir -p ${LOGDIR} + +COMMON="--epochs 80 --batch_size 8 --lr 1e-3 --hidden_dim 128 \ + --downsample 5 --patience 15 --output_dir ${OUTDIR}" + +SUBMIT() { + local jname=$1 hrs=$2; shift 2 + sbatch -J "${jname}" -p gpuA800 --gres=gpu:1 -N 1 -n 1 \ + --cpus-per-task=4 --mem=32G -t "${hrs}:00:00" \ + -o "${LOGDIR}/${jname}_%j.out" \ + -e "${LOGDIR}/${jname}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $*" +} + +METHODS=(stgcn ctrgcn limu_bert emg_cnn actionsense mult perceiver) +SEEDS=(42 123 456) + +echo "=== 7 published baselines x 3 seeds = 21 jobs ===" +for m in "${METHODS[@]}"; do + for s in "${SEEDS[@]}"; do + SUBMIT "t1_${m}_s${s}" 2 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method ${m} --seed ${s} ${COMMON}" + echo " submitted ${m}_s${s}" + done +done + +echo "" +echo "=== SyncFuse full (all 4 components) x 3 seeds = 3 jobs ===" +for s in "${SEEDS[@]}"; do + SUBMIT "t1_syncfuse_s${s}" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method syncfuse --seed ${s} \ + --mod_dropout_p 0.3 --use_xmod_shift --use_learned_late \ + --pretrained_dir ${PRETRAIN_DIR} ${COMMON}" + echo " submitted syncfuse_s${s}" +done + +echo "" +echo "=== SyncFuse ablations x 1 seed (42) = 4 jobs ===" +# Ablate each component +# - no modality dropout +SUBMIT "t1_syncfuse_abl_noDrop" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method syncfuse --seed 42 --tag noDrop \ + --mod_dropout_p 0.0 --use_xmod_shift --use_learned_late \ + --pretrained_dir ${PRETRAIN_DIR} ${COMMON}" +# - no pretrained transfer +SUBMIT "t1_syncfuse_abl_noPre" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method syncfuse --seed 42 --tag noPre \ + --mod_dropout_p 0.3 --use_xmod_shift --use_learned_late ${COMMON}" +# - no cross-modal shift +SUBMIT "t1_syncfuse_abl_noShift" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method syncfuse --seed 42 --tag noShift \ + --mod_dropout_p 0.3 --use_learned_late \ + --pretrained_dir ${PRETRAIN_DIR} ${COMMON}" +# - no learnable late fusion +SUBMIT "t1_syncfuse_abl_noLearn" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method syncfuse --seed 42 --tag noLearn \ + --mod_dropout_p 0.3 --use_xmod_shift \ + --pretrained_dir ${PRETRAIN_DIR} ${COMMON}" + +echo "" +echo "All jobs submitted. squeue -u \$USER" diff --git a/experiments/slurm/run_t1_pretrain_unified.sh b/experiments/slurm/run_t1_pretrain_unified.sh new file mode 100644 index 0000000000000000000000000000000000000000..6d20ea7625c1b9334085879dcf9dd013d503a356 --- /dev/null +++ b/experiments/slurm/run_t1_pretrain_unified.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# T1 unified-protocol pretrained-backbone experiments. +# +# Goal: directly compare SyncFuse and a plain Transformer+Late head under +# matched pretraining conditions, on BOTH the 4-mod and the 3-mod IME +# subsets, so that table tab:scene-published (3-mod IME) and +# tab:scene-published-ext (4-mod) can be reconciled. +# +# 4 methods x 3 seeds = 12 jobs. +# syncfuse 4-mod (mocap+emg+eye+imu), pretrained, unfrozen +# syncfuse_ime 3-mod IME (mocap+emg+imu), pretrained, unfrozen +# transformer_late 4-mod, pretrained, unfrozen +# transformer_late_ime 3-mod IME, pretrained, unfrozen + +set -u +PYTHON=python +BASEDIR=${PULSE_ROOT} +OUTDIR=${BASEDIR}/results/t1_unified_pretrain +LOGDIR=${OUTDIR}/slurm_logs +PRETRAIN_DIR=${BASEDIR}/results/exp1_v2 +mkdir -p ${LOGDIR} + +COMMON="--epochs 80 --batch_size 8 --lr 1e-3 --hidden_dim 128 \ + --downsample 5 --patience 15 --output_dir ${OUTDIR} \ + --pretrained_dir ${PRETRAIN_DIR}" +# Note: we do NOT pass --freeze_pretrained, so pretrained backbones are +# fine-tuned along with the rest of the model. + +SUBMIT() { + local jname=$1 hrs=$2; shift 2 + sbatch -J "${jname}" -p gpuA800 --gres=gpu:1 -N 1 -n 1 \ + --cpus-per-task=4 --mem=32G -t "${hrs}:00:00" \ + -o "${LOGDIR}/${jname}_%j.out" \ + -e "${LOGDIR}/${jname}_%j.err" \ + --export=ALL \ + --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $*" +} + +SEEDS=(42 123 456) + +# --- SyncFuse 4-mod + pretrain (unfrozen) --- +for s in "${SEEDS[@]}"; do + SUBMIT "t1pt_syncfuse_4mod_s${s}" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method syncfuse --seed ${s} \ + --mod_dropout_p 0.3 --use_xmod_shift --use_learned_late \ + ${COMMON}" +done + +# --- SyncFuse 3-mod IME + pretrain (unfrozen) --- +for s in "${SEEDS[@]}"; do + SUBMIT "t1pt_syncfuse_ime_s${s}" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method syncfuse_ime --seed ${s} \ + --mod_dropout_p 0.3 --use_xmod_shift --use_learned_late \ + ${COMMON}" +done + +# --- Transformer+Late 4-mod + pretrain (unfrozen) --- +for s in "${SEEDS[@]}"; do + SUBMIT "t1pt_tlate_4mod_s${s}" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method transformer_late --seed ${s} \ + ${COMMON}" +done + +# --- Transformer+Late 3-mod IME + pretrain (unfrozen) --- +for s in "${SEEDS[@]}"; do + SUBMIT "t1pt_tlate_ime_s${s}" 3 \ + "$PYTHON experiments/train_baselines_t1.py \ + --method transformer_late_ime --seed ${s} \ + ${COMMON}" +done + +echo +echo "Submitted 4 methods x 3 seeds = 12 jobs to gpuA800." +echo "Tail logs: squeue -u \$USER ; ls ${LOGDIR}" diff --git a/experiments/slurm/run_t5_3cls_emgonly.sh b/experiments/slurm/run_t5_3cls_emgonly.sh new file mode 100644 index 0000000000000000000000000000000000000000..388a1a68b349d75aa6c66a975adda29a8a563317 --- /dev/null +++ b/experiments/slurm/run_t5_3cls_emgonly.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=1:30:00 +#SBATCH --job-name=t5_emg +#SBATCH --output=${PULSE_ROOT}/results/t5_3class_emgonly/slurm_logs/%x_%j.out +#SBATCH --error=${PULSE_ROOT}/results/t5_3class_emgonly/slurm_logs/%x_%j.err + +# T5 3-class with EMG-only kinematic baseline. +# Hypothesis: with MoCap dropped from baseline, pressure's contribution +# to "Sustained-vs-Attempted" recognition is no longer compressed by +# kinematic position info. Predicted lift: +0.20 ~ +0.30 macro F1 +# (vs +0.074 with full kinematics). +# +# Args: BACKBONE COND +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd "$PROJECT" + +BACKBONE="$1"; COND="$2" +case "$COND" in + no_pressure) INPUTS="emg" ;; + with_pressure) INPUTS="emg,pressure" ;; + pressureonly) INPUTS="pressure" ;; + *) echo "bad cond $COND"; exit 1 ;; +esac + +OUT_DIR="$PROJECT/results/t5_3class_emgonly/${BACKBONE}_${COND}" +mkdir -p "$OUT_DIR" + +echo "=== T5 3cls-EMGonly: backbone=$BACKBONE cond=$COND inputs=$INPUTS ===" +$PYTHON experiments/tasks/train_grasp_state.py \ + --model "$BACKBONE" \ + --input_modalities "$INPUTS" \ + --t_obs 1.0 --t_fut 0.5 --anchor_stride 0.25 \ + --per_class_max 10000 \ + --label_mode three_class --sustained_threshold_sec 0.3 \ + --epochs 30 --batch_size 64 --lr 3e-4 --weight_decay 1e-3 \ + --d_model 64 --dropout 0.3 \ + --num_workers 2 --seed 42 --patience 6 \ + --output_dir "$OUT_DIR" diff --git a/experiments/slurm/run_t5_3cls_emgonly_cv.sh b/experiments/slurm/run_t5_3cls_emgonly_cv.sh new file mode 100644 index 0000000000000000000000000000000000000000..c49182a40dc53e5ee591101af1dd2795ab1d736d --- /dev/null +++ b/experiments/slurm/run_t5_3cls_emgonly_cv.sh @@ -0,0 +1,62 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=1:30:00 +#SBATCH --job-name=t5_emg_cv +#SBATCH --output=${PULSE_ROOT}/results/t5_3class_emgonly_cv/slurm_logs/%x_%j.out +#SBATCH --error=${PULSE_ROOT}/results/t5_3class_emgonly_cv/slurm_logs/%x_%j.err + +# Volunteer-stratified 5-fold CV for the EMG-only 3-class headline result. +# Args: BACKBONE COND FOLD +# Train/Test vols come from ${PULSE_ROOT}/results/t5_3class_emgonly_cv/cv_folds.json (FOLD k → test = folds[k]). +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd "$PROJECT" + +BACKBONE="$1"; COND="$2"; FOLD="$3" +case "$COND" in + no_pressure) INPUTS="emg" ;; + with_pressure) INPUTS="emg,pressure" ;; + pressureonly) INPUTS="pressure" ;; + *) echo "bad cond $COND"; exit 1 ;; +esac + +# DCL needs lr=1e-4 + 50 epochs (see project_t5_v3_tgsr.md memory) +if [ "$BACKBONE" = "deepconvlstm" ]; then + LR=1e-4; EPOCHS=50; PATIENCE=12 +else + LR=3e-4; EPOCHS=30; PATIENCE=6 +fi + +# Pull train/test vol lists for fold $FOLD +read TRAIN_VOLS TEST_VOLS < <($PYTHON - < 10g). +# +# Args: BACKBONE COND +# BACKBONE ∈ {daf, futr, deepconvlstm} +# COND ∈ {no_pressure, with_pressure, pressureonly} + +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd "$PROJECT" + +BACKBONE="$1"; COND="$2" +case "$COND" in + no_pressure) INPUTS="emg,imu,mocap" ;; + with_pressure) INPUTS="emg,imu,mocap,pressure" ;; + pressureonly) INPUTS="pressure" ;; + *) echo "bad cond $COND"; exit 1 ;; +esac + +OUT_DIR="$PROJECT/results/t5_grasp_state_v2/${BACKBONE}_${COND}" +mkdir -p "$OUT_DIR" + +echo "=== T5v3p (proper contact) backbone=$BACKBONE cond=$COND inputs=$INPUTS ===" +$PYTHON experiments/tasks/train_grasp_state.py \ + --model "$BACKBONE" \ + --input_modalities "$INPUTS" \ + --t_obs 1.0 --t_fut 0.5 --anchor_stride 0.25 \ + --per_class_max 15000 \ + --epochs 30 --batch_size 64 --lr 3e-4 --weight_decay 1e-3 \ + --d_model 64 --dropout 0.3 \ + --num_workers 2 --seed 42 --patience 6 \ + --output_dir "$OUT_DIR" diff --git a/experiments/slurm/run_t8v2_sanity.sh b/experiments/slurm/run_t8v2_sanity.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0693786021c4ea37134bc6a905aa9598cfbdb78 --- /dev/null +++ b/experiments/slurm/run_t8v2_sanity.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=1:00:00 +#SBATCH --job-name=t8v2_sanity +#SBATCH --output=${PULSE_ROOT}/results/t8_signal_v2/slurm_logs/%x_%j.out +#SBATCH --error=${PULSE_ROOT}/results/t8_signal_v2/slurm_logs/%x_%j.err + +# Sanity cell for revised T8 design (cross-modal baseline, with vs without pressure). +# Two arms in one job: no_pressure and with_pressure; target=mocap; DAF; T_fut=0.5s. +# Cross-modal input: target=mocap -> input = [emg, imu] (+pressure for treatment). + +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd "$PROJECT" + +OUT_BASE="$PROJECT/results/t8_signal_v2" +COND="$1" # "no_pressure" or "with_pressure" +if [ "$COND" = "no_pressure" ]; then + INPUT_MODS="emg,imu" +elif [ "$COND" = "with_pressure" ]; then + INPUT_MODS="emg,imu,pressure" +else + echo "usage: sbatch run_t8v2_sanity.sh {no_pressure|with_pressure}" + exit 1 +fi + +OUT_DIR="$OUT_BASE/_sanity_mocap_h050_daf_${COND}" +mkdir -p "$OUT_DIR" + +echo "=== sanity ${COND}: target=mocap input=${INPUT_MODS} T_fut=0.5s DAF ===" +$PYTHON experiments/tasks/train_signal_forecast.py \ + --model daf \ + --input_modalities "$INPUT_MODS" \ + --target_modality mocap \ + --t_obs 1.5 --t_fut 0.5 --anchor_stride 0.25 \ + --per_event_max 8000 \ + --epochs 25 --batch_size 64 --lr 3e-4 --weight_decay 1e-4 \ + --d_model 128 --dropout 0.1 \ + --num_workers 2 --seed 42 --patience 5 \ + --output_dir "$OUT_DIR" diff --git a/experiments/slurm/run_t8v2_sweep.sh b/experiments/slurm/run_t8v2_sweep.sh new file mode 100644 index 0000000000000000000000000000000000000000..ff98f36840998ecfddcfb3e9c3f631f534e5f9c0 --- /dev/null +++ b/experiments/slurm/run_t8v2_sweep.sh @@ -0,0 +1,62 @@ +#!/bin/bash +#SBATCH --partition=gpuA800 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=1:30:00 +#SBATCH --job-name=t8v2 +#SBATCH --output=${PULSE_ROOT}/results/t8_signal_v2/slurm_logs/%x_%j.out +#SBATCH --error=${PULSE_ROOT}/results/t8_signal_v2/slurm_logs/%x_%j.err + +# Sweep cell for revised T8 design. +# Args: TARGET DESIGN COND +# TARGET ∈ {mocap, imu, emg} +# DESIGN ∈ {A, B} +# A = short horizon : T_fut=0.2 d_model=128 epochs=25 patience=5 +# B = bigger model : T_fut=0.5 d_model=256 epochs=50 patience=10 +# COND ∈ {no_pressure, with_pressure} + +set -e +PYTHON=python +PROJECT=${PULSE_ROOT} +cd "$PROJECT" + +TARGET="$1"; DESIGN="$2"; COND="$3" + +# Cross-modal "other kinematics" baseline +case "$TARGET" in + mocap) BASE_INPUTS="emg,imu" ;; + imu) BASE_INPUTS="emg,mocap" ;; + emg) BASE_INPUTS="imu,mocap" ;; + *) echo "bad target $TARGET"; exit 1 ;; +esac +if [ "$COND" = "with_pressure" ]; then + INPUTS="${BASE_INPUTS},pressure" +elif [ "$COND" = "no_pressure" ]; then + INPUTS="${BASE_INPUTS}" +else + echo "bad cond $COND"; exit 1 +fi + +case "$DESIGN" in + A) TFUT=0.2; DMODEL=128; EPOCHS=25; PAT=5 ;; + B) TFUT=0.5; DMODEL=256; EPOCHS=50; PAT=10 ;; + *) echo "bad design $DESIGN"; exit 1 ;; +esac + +OUT_DIR="$PROJECT/results/t8_signal_v2/${DESIGN}_${TARGET}_tfut${TFUT}_daf_${COND}" +mkdir -p "$OUT_DIR" + +echo "=== design=$DESIGN target=$TARGET cond=$COND inputs=$INPUTS T_fut=$TFUT d_model=$DMODEL epochs=$EPOCHS ===" +$PYTHON experiments/tasks/train_signal_forecast.py \ + --model daf \ + --input_modalities "$INPUTS" \ + --target_modality "$TARGET" \ + --t_obs 1.5 --t_fut "$TFUT" --anchor_stride 0.25 \ + --per_event_max 8000 \ + --epochs "$EPOCHS" --batch_size 64 --lr 3e-4 --weight_decay 1e-4 \ + --d_model "$DMODEL" --dropout 0.1 \ + --num_workers 2 --seed 42 --patience "$PAT" \ + --output_dir "$OUT_DIR" diff --git a/experiments/slurm/setup_row.sh b/experiments/slurm/setup_row.sh new file mode 100644 index 0000000000000000000000000000000000000000..200affe4da461e1ee26d71b88dff9e70e2bf84ce --- /dev/null +++ b/experiments/slurm/setup_row.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Freeze the current experiments/ code into a row folder and emit a ready-to- +# submit run.sh. Each row becomes a self-contained, reproducible bundle. +# +# Usage: +# bash experiments/setup_row.sh \ +# --table table1_main_comparison \ +# --row row01_ours_dailyactformer_all5 \ +# --desc "Our model, all 5 modalities, T_fut=2s (headline row)" \ +# --cli "--model dailyactformer --modalities imu,emg,eyetrack,mocap,pressure \ +# --t_obs 8 --t_fut 2 --epochs 40 --batch_size 32 \ +# --lr 3e-4 --use_class_weights" + +set -euo pipefail + +BASEDIR=${BASEDIR:-${PULSE_ROOT}} +EXP=${BASEDIR}/experiments + +TABLE="" +ROW="" +DESC="" +CLI="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --table) TABLE="$2"; shift 2 ;; + --row) ROW="$2"; shift 2 ;; + --desc) DESC="$2"; shift 2 ;; + --cli) CLI="$2"; shift 2 ;; + *) echo "unknown arg: $1"; exit 1 ;; + esac +done +if [[ -z "${TABLE}" || -z "${ROW}" || -z "${CLI}" ]]; then + echo "usage: setup_row.sh --table T --row R [--desc D] --cli CLI" + exit 1 +fi + +ROW_DIR="${BASEDIR}/${TABLE}/${ROW}" +mkdir -p "${ROW_DIR}/code" "${ROW_DIR}/seeds" + +# 1. Snapshot code files. Only copy those that affect this experiment. +# dataset.py is included because dataset_seqpred.py imports +# load_modality_array / MODALITY_FILES from it. +for f in taxonomy.py taxonomy_v3.json dataset.py dataset_seqpred.py \ + models_seqpred.py train_seqpred.py; do + if [[ -e "${EXP}/${f}" ]]; then + cp "${EXP}/${f}" "${ROW_DIR}/code/" + fi +done + +# 2. Write a config.md describing this row. +cat > "${ROW_DIR}/config.md" < --output_dir +\`\`\` + +Each seed produces \`seeds/seed/{config.json, results.json, model_best.pt, train.log}\`. +EOF + +# 3. Write run.sh which submits 5 seeds under SLURM, each writing to +# seeds/seed/. This script is checked in with the frozen code, so re- +# running it in the future uses the exact same code. +cat > "${ROW_DIR}/run.sh" < 0 else scene_segs[i] + current = scene_segs[i] + segments.append((prev, current)) + + return segments, classes + + +def compute_transition_matrix(segments, num_classes): + """Compute P(next|prev) from training segments.""" + counts = np.zeros((num_classes, num_classes)) + for prev, current in segments: + counts[prev, current] += 1 + # Normalize rows + row_sums = counts.sum(axis=1, keepdims=True) + row_sums[row_sums == 0] = 1 + trans_matrix = counts / row_sums + return trans_matrix + + +def main(): + for coarse in [True, False]: + tag = "8 coarse" if coarse else "20 fine" + print(f"\n{'='*60}") + print(f"Baselines — {tag} classes") + print(f"{'='*60}") + + train_segs, classes = load_annotations(TRAIN_VOLS, coarse=coarse) + test_segs, _ = load_annotations(TEST_VOLS, coarse=coarse) + + num_classes = len(classes) + + # Extract test labels + test_prev = [s[0] for s in test_segs] + test_true = [s[1] for s in test_segs] + train_labels = [s[1] for s in train_segs] + + print(f"Train segments: {len(train_segs)}") + print(f"Test segments: {len(test_segs)}") + + # 1. Majority class baseline + label_counts = Counter(train_labels) + majority_class = label_counts.most_common(1)[0][0] + majority_preds = [majority_class] * len(test_true) + maj_acc = accuracy_score(test_true, majority_preds) + maj_f1w = f1_score(test_true, majority_preds, average='weighted', zero_division=0) + maj_f1m = f1_score(test_true, majority_preds, average='macro', zero_division=0) + print(f"\n1. Majority class baseline (always predict '{classes[majority_class]}'):") + print(f" acc={maj_acc:.3f} f1w={maj_f1w:.3f} f1m={maj_f1m:.3f}") + + # 2. Class frequency baseline (predict based on train distribution) + freq = np.zeros(num_classes) + for l in train_labels: + freq[l] += 1 + freq = freq / freq.sum() + np.random.seed(42) + freq_preds = np.random.choice(num_classes, size=len(test_true), p=freq) + freq_acc = accuracy_score(test_true, freq_preds) + freq_f1w = f1_score(test_true, freq_preds, average='weighted', zero_division=0) + freq_f1m = f1_score(test_true, freq_preds, average='macro', zero_division=0) + print(f"\n2. Random (train distribution) baseline:") + print(f" acc={freq_acc:.3f} f1w={freq_f1w:.3f} f1m={freq_f1m:.3f}") + + # 3. Transition matrix baseline + trans_matrix = compute_transition_matrix(train_segs, num_classes) + trans_preds = [] + for prev in test_prev: + # Predict most likely next given prev + trans_preds.append(np.argmax(trans_matrix[prev])) + trans_acc = accuracy_score(test_true, trans_preds) + trans_f1w = f1_score(test_true, trans_preds, average='weighted', zero_division=0) + trans_f1m = f1_score(test_true, trans_preds, average='macro', zero_division=0) + print(f"\n3. Transition matrix baseline (argmax P(next|prev)):") + print(f" acc={trans_acc:.3f} f1w={trans_f1w:.3f} f1m={trans_f1m:.3f}") + + # Print transition matrix + print(f"\n Transition matrix (rows=prev, cols=next):") + header = " " + "".join(f"{c[:2]:>6}" for c in classes) + print(header) + for i, row in enumerate(trans_matrix): + vals = "".join(f"{v:6.2f}" for v in row) + print(f" {classes[i][:2]}{vals}") + + # 4. Transition + sampling (sample from P(next|prev) instead of argmax) + np.random.seed(42) + trans_sample_preds = [] + for prev in test_prev: + p = trans_matrix[prev] + if p.sum() == 0: + trans_sample_preds.append(majority_class) + else: + trans_sample_preds.append(np.random.choice(num_classes, p=p)) + ts_acc = accuracy_score(test_true, trans_sample_preds) + ts_f1w = f1_score(test_true, trans_sample_preds, average='weighted', zero_division=0) + ts_f1m = f1_score(test_true, trans_sample_preds, average='macro', zero_division=0) + print(f"\n4. Transition matrix + sampling baseline:") + print(f" acc={ts_acc:.3f} f1w={ts_f1w:.3f} f1m={ts_f1m:.3f}") + + # Per-class report for transition argmax + print(f"\n Per-class report (transition argmax):") + report = classification_report(test_true, trans_preds, + target_names=classes, zero_division=0) + print(report) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/eval_combined.py b/experiments/tasks/eval_combined.py new file mode 100644 index 0000000000000000000000000000000000000000..5308bf8311a882354d393aa837da45eea0a5bc5d --- /dev/null +++ b/experiments/tasks/eval_combined.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python3 +""" +Combine sensor-only NN predictions with transition matrix at inference time. +P(y|x,prev) ∝ P_nn(y|x)^α × P_trans(y|prev)^β +Tune α,β on validation set. +""" + +import os +import sys +import json +import re +import numpy as np +import torch +import torch.nn as nn +from collections import Counter +from sklearn.metrics import accuracy_score, f1_score + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import DATASET_DIR, TRAIN_VOLS, VAL_VOLS, TEST_VOLS +from tasks.train_pred_cls import ( + ActionPredDataset, TransformerClassifier, + ACTION_CLASSES_COARSE, init_classes +) +# Initialize global classes +init_classes(coarse=True) +COARSE_CLASSES = ACTION_CLASSES_COARSE + +ANNOTATION_DIR = "${PULSE_ROOT}" + + +def get_predictions(model, dataset, device): + """Get softmax predictions from model.""" + model.eval() + loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False) + all_probs = [] + all_labels = [] + all_prev = [] + with torch.no_grad(): + for batch in loader: + features = batch['features'].to(device) + mask = batch['mask'].to(device) + logits = model(features, mask) # no prev_action + probs = torch.softmax(logits, dim=1).cpu().numpy() + all_probs.append(probs) + all_labels.extend(batch['label']) + all_prev.extend(batch['prev_label']) + return np.concatenate(all_probs), np.array(all_labels), np.array(all_prev) + + +def compute_transition_matrix(dataset, num_classes): + """Compute P(current|prev) from dataset.""" + counts = np.zeros((num_classes, num_classes)) + for i in range(len(dataset)): + sample = dataset[i] + prev = sample['prev_label'] + curr = sample['label'] + counts[prev, curr] += 1 + row_sums = counts.sum(axis=1, keepdims=True) + row_sums[row_sums == 0] = 1 + return counts / row_sums + + +def combined_predict(nn_probs, trans_matrix, prev_labels, alpha, beta): + """Combine NN and transition predictions.""" + N, C = nn_probs.shape + combined = np.zeros_like(nn_probs) + for i in range(N): + trans_prob = trans_matrix[prev_labels[i]] + # Multiplicative combination with temperature + p = (nn_probs[i] ** alpha) * (trans_prob ** beta) + p_sum = p.sum() + if p_sum > 0: + combined[i] = p / p_sum + else: + combined[i] = trans_prob + return np.argmax(combined, axis=1) + + +def main(): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + # Models to evaluate (sensor-only, no prev_action) + models_info = [ + # (results_dir, modalities, description) + ('recog2a', 'imu', 'Recog: IMU'), + ('recog2a', 'mocap,emg,eyetrack', 'Recog: MEE'), + ('recog2a', 'mocap,emg,imu', 'Recog: MEI'), + ('recog_coarse', 'imu', 'Recog10s: IMU'), + ('recog_coarse', 'mocap,emg,imu', 'Recog10s: MEI'), + ] + + base_dir = '${PULSE_ROOT}/results' + + for results_dir, modalities, desc in models_info: + mod_str = modalities.replace(',', '-') + + # Find the model directory + result_base = os.path.join(base_dir, results_dir) + # Pattern: recog_cls_coarse_{mod_str} + model_dir = os.path.join(result_base, f'recog_cls_coarse_{mod_str}') + if not os.path.exists(model_dir): + print(f" Skip {desc}: {model_dir} not found") + continue + + results_file = os.path.join(model_dir, 'results.json') + if not os.path.exists(results_file): + continue + + r = json.load(open(results_file)) + args_dict = r['args'] + + # Recreate datasets + mods = modalities.split(',') + window_sec = args_dict['window_sec'] + downsample = args_dict['downsample'] + + train_ds = ActionPredDataset( + TRAIN_VOLS, mods, window_sec=window_sec, + downsample=downsample, coarse=True, mode='recognition') + stats = train_ds.get_stats() + val_ds = ActionPredDataset( + VAL_VOLS, mods, window_sec=window_sec, + downsample=downsample, stats=stats, coarse=True, mode='recognition') + test_ds = ActionPredDataset( + TEST_VOLS, mods, window_sec=window_sec, + downsample=downsample, stats=stats, coarse=True, mode='recognition') + + num_classes = len(COARSE_CLASSES) + + # Build and load model (without prev_action) + model = TransformerClassifier( + train_ds.feat_dim, num_classes, + d_model=args_dict['hidden_dim'], nhead=4, num_layers=2, + dropout=args_dict['dropout'], use_prev_action=False + ).to(device) + ckpt = torch.load(os.path.join(model_dir, 'model_best.pt'), + map_location=device, weights_only=True) + model.load_state_dict(ckpt) + + # Get predictions + val_probs, val_labels, val_prev = get_predictions(model, val_ds, device) + test_probs, test_labels, test_prev = get_predictions(model, test_ds, device) + + # Compute transition matrix from train + trans_matrix = compute_transition_matrix(train_ds, num_classes) + + # Baseline: NN only + nn_preds = np.argmax(test_probs, axis=1) + nn_f1w = f1_score(test_labels, nn_preds, average='weighted', zero_division=0) + + # Baseline: Transition only + trans_preds = np.array([np.argmax(trans_matrix[p]) for p in test_prev]) + trans_f1w = f1_score(test_labels, trans_preds, average='weighted', zero_division=0) + + # Grid search α, β on validation + best_val_f1 = -1 + best_params = (1.0, 1.0) + for alpha in [0.0, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0]: + for beta in [0.0, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0]: + if alpha == 0 and beta == 0: + continue + preds = combined_predict(val_probs, trans_matrix, val_prev, alpha, beta) + f1w = f1_score(val_labels, preds, average='weighted', zero_division=0) + if f1w > best_val_f1: + best_val_f1 = f1w + best_params = (alpha, beta) + + # Evaluate on test with best params + alpha, beta = best_params + combined_preds = combined_predict(test_probs, trans_matrix, test_prev, alpha, beta) + comb_f1w = f1_score(test_labels, combined_preds, average='weighted', zero_division=0) + comb_acc = accuracy_score(test_labels, combined_preds) + + # Also try simple additive combination + best_val_f1_add = -1 + best_w = 0.5 + for w in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: + preds_add = [] + for i in range(len(val_probs)): + p = w * val_probs[i] + (1 - w) * trans_matrix[val_prev[i]] + preds_add.append(np.argmax(p)) + f1w = f1_score(val_labels, preds_add, average='weighted', zero_division=0) + if f1w > best_val_f1_add: + best_val_f1_add = f1w + best_w = w + + # Test with best w + preds_add = [] + for i in range(len(test_probs)): + p = best_w * test_probs[i] + (1 - best_w) * trans_matrix[test_prev[i]] + preds_add.append(np.argmax(p)) + add_f1w = f1_score(test_labels, preds_add, average='weighted', zero_division=0) + + print(f"\n{desc} ({mod_str}):") + print(f" NN only: F1w={nn_f1w:.3f}") + print(f" Trans only: F1w={trans_f1w:.3f}") + print(f" Multiplicative (α={alpha:.1f}, β={beta:.1f}): F1w={comb_f1w:.3f}") + print(f" Additive (w={best_w:.1f}): F1w={add_f1w:.3f}") + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/published_baselines.py b/experiments/tasks/published_baselines.py new file mode 100644 index 0000000000000000000000000000000000000000..2d89454af678ee4cc01ed1864b080571f2ab7138 --- /dev/null +++ b/experiments/tasks/published_baselines.py @@ -0,0 +1,295 @@ +""" +Published baseline models for DailyAct-5M benchmark. + +ASFormer: Transformer for Action Segmentation (Yi et al., BMVC 2021) + - Multi-stage encoder-decoder transformer with dilated attention + - For temporal action segmentation (Exp 2) and contact detection (Exp 3) + +TinyHAR: Lightweight Deep Learning Model for HAR (Zhou et al., ISWC 2022 Best Paper) + - Multi-scale temporal convolution + cross-channel attention + temporal pooling + - Implemented as backbone in models.py for scene recognition (Exp 1) +""" + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# ============================================================ +# Positional Encoding (shared) +# ============================================================ + +class PositionalEncoding1D(nn.Module): + """Sinusoidal positional encoding.""" + + def __init__(self, d_model, dropout=0.1, max_len=10000): + super().__init__() + self.dropout = nn.Dropout(p=dropout) + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + if d_model % 2 == 1: + pe[:, 1::2] = torch.cos(position * div_term[:-1]) + else: + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) # (1, max_len, d_model) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:, :x.size(1)] + return self.dropout(x) + + +# ============================================================ +# ASFormer (Yi et al., BMVC 2021) +# ============================================================ + +class ConvFeedForward(nn.Module): + """Position-wise convolution feed-forward used in ASFormer.""" + + def __init__(self, d_model, kernel_size=3, dropout=0.1): + super().__init__() + self.norm = nn.LayerNorm(d_model) + self.conv1 = nn.Conv1d(d_model, d_model * 2, kernel_size, padding=kernel_size // 2) + self.conv2 = nn.Conv1d(d_model * 2, d_model, 1) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + # x: (B, T, D) + residual = x + x = self.norm(x) + x = x.permute(0, 2, 1) # (B, D, T) + x = self.dropout(F.relu(self.conv1(x))) + x = self.dropout(self.conv2(x)) + x = x.permute(0, 2, 1) # (B, T, D) + return residual + x + + +class DilatedAttention(nn.Module): + """Multi-head self-attention with dilated temporal mask. + + At dilation d and window w, position t attends to positions + {t + k*d : k in [-w, w]}, creating a hierarchical receptive field. + """ + + def __init__(self, d_model, dilation, num_heads=1, dropout=0.1, window_size=5): + super().__init__() + self.d_model = d_model + self.dilation = dilation + self.window_size = window_size + self.num_heads = num_heads + self.head_dim = d_model // num_heads + + self.norm = nn.LayerNorm(d_model) + self.qkv = nn.Linear(d_model, 3 * d_model) + self.out_proj = nn.Linear(d_model, d_model) + self.dropout = nn.Dropout(dropout) + + # Cache for dilated masks + self._mask_cache = {} + + def _get_dilated_mask(self, T, device): + """Create or retrieve cached dilated attention mask.""" + key = (T, self.dilation, self.window_size, device) + if key not in self._mask_cache: + positions = torch.arange(T, device=device) + diff = positions.unsqueeze(1) - positions.unsqueeze(0) # (T, T) + mask = torch.zeros(T, T, dtype=torch.bool, device=device) + for w in range(-self.window_size, self.window_size + 1): + mask |= (diff == w * self.dilation) + self._mask_cache[key] = mask + return self._mask_cache[key] + + def forward(self, x, cross_kv=None): + # x: (B, T, D) + B, T, D = x.shape + residual = x + x = self.norm(x) + + if cross_kv is not None: + q = self.qkv(x)[:, :, :D] # only use Q from x + kv = self.qkv(cross_kv)[:, :, D:] # K, V from cross_kv + q = q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2) + k = kv[:, :, :D].view(B, T, self.num_heads, self.head_dim).transpose(1, 2) + v = kv[:, :, D:].view(B, T, self.num_heads, self.head_dim).transpose(1, 2) + else: + qkv = self.qkv(x).view(B, T, 3, self.num_heads, self.head_dim) + qkv = qkv.permute(2, 0, 3, 1, 4) # (3, B, H, T, head_dim) + q, k, v = qkv[0], qkv[1], qkv[2] + + scale = self.head_dim ** -0.5 + attn = (q @ k.transpose(-2, -1)) * scale # (B, H, T, T) + + # Apply dilated attention mask + dilated_mask = self._get_dilated_mask(T, x.device) # (T, T) + attn = attn.masked_fill(~dilated_mask.unsqueeze(0).unsqueeze(0), float('-inf')) + + attn = F.softmax(attn, dim=-1) + attn = self.dropout(attn) + + out = (attn @ v).transpose(1, 2).reshape(B, T, D) + out = self.out_proj(out) + return residual + self.dropout(out) + + +class ASFormerEncoderBlock(nn.Module): + """Single encoder block: dilated self-attention + conv feed-forward.""" + + def __init__(self, d_model, dilation, num_heads=1, kernel_size=3, + dropout=0.1, window_size=5): + super().__init__() + self.self_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size) + self.ffn = ConvFeedForward(d_model, kernel_size, dropout) + + def forward(self, x): + x = self.self_attn(x) + x = self.ffn(x) + return x + + +class ASFormerDecoderBlock(nn.Module): + """Single decoder block: self-attention + cross-attention + conv feed-forward.""" + + def __init__(self, d_model, dilation, num_heads=1, kernel_size=3, + dropout=0.1, window_size=5): + super().__init__() + self.self_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size) + self.cross_attn = DilatedAttention(d_model, dilation, num_heads, dropout, window_size) + self.ffn = ConvFeedForward(d_model, kernel_size, dropout) + + def forward(self, x, enc_features): + x = self.self_attn(x) + x = self.cross_attn(x, cross_kv=enc_features) + x = self.ffn(x) + return x + + +class ASFormerEncoder(nn.Module): + """ASFormer encoder: projection + N dilated attention layers + output head.""" + + def __init__(self, input_dim, d_model, num_classes, num_layers=5, + num_heads=1, kernel_size=3, dropout=0.1, window_size=5): + super().__init__() + self.input_proj = nn.Conv1d(input_dim, d_model, 1) + self.pos_enc = PositionalEncoding1D(d_model, dropout) + self.layers = nn.ModuleList([ + ASFormerEncoderBlock(d_model, 2 ** i, num_heads, kernel_size, dropout, window_size) + for i in range(num_layers) + ]) + self.output_proj = nn.Conv1d(d_model, num_classes, 1) + + def forward(self, x): + # x: (B, T, C) + x = x.permute(0, 2, 1) # (B, C, T) + x = self.input_proj(x) # (B, d_model, T) + x = x.permute(0, 2, 1) # (B, T, d_model) + x = self.pos_enc(x) + + for layer in self.layers: + x = layer(x) + + features = x + logits = self.output_proj(x.permute(0, 2, 1)).permute(0, 2, 1) # (B, T, num_classes) + return features, logits + + +class ASFormerDecoder(nn.Module): + """ASFormer decoder: refinement stage with cross-attention to encoder.""" + + def __init__(self, input_dim, d_model, num_classes, num_layers=5, + num_heads=1, kernel_size=3, dropout=0.1, window_size=5): + super().__init__() + self.input_proj = nn.Conv1d(input_dim, d_model, 1) + self.pos_enc = PositionalEncoding1D(d_model, dropout) + self.layers = nn.ModuleList([ + ASFormerDecoderBlock(d_model, 2 ** i, num_heads, kernel_size, dropout, window_size) + for i in range(num_layers) + ]) + self.output_proj = nn.Conv1d(d_model, num_classes, 1) + + def forward(self, dec_input, enc_features): + # dec_input: (B, T, input_dim), enc_features: (B, T, d_model) + x = dec_input.permute(0, 2, 1) + x = self.input_proj(x) + x = x.permute(0, 2, 1) + x = self.pos_enc(x) + + for layer in self.layers: + x = layer(x, enc_features) + + logits = self.output_proj(x.permute(0, 2, 1)).permute(0, 2, 1) + return x, logits + + +class ASFormer(nn.Module): + """ASFormer: Transformer for Action Segmentation (Yi et al., BMVC 2021). + + Multi-stage encoder-decoder transformer for frame-level action segmentation. + Returns a list of per-stage logits for multi-stage training (same interface as MSTCN). + + Args: + input_dim: Input feature dimension + num_classes: Number of action classes + hidden_dim: Hidden dimension (d_model) + num_layers: Number of attention layers per stage (dilation 1, 2, ..., 2^(num_layers-1)) + num_decoders: Number of decoder (refinement) stages + num_heads: Number of attention heads + kernel_size: Feed-forward convolution kernel size + dropout: Dropout rate + window_size: Dilated attention window size + """ + + def __init__(self, input_dim, num_classes, hidden_dim=64, num_layers=5, + num_decoders=3, num_heads=1, kernel_size=3, dropout=0.1, + window_size=5): + super().__init__() + self.encoder = ASFormerEncoder( + input_dim, hidden_dim, num_classes, num_layers, + num_heads, kernel_size, dropout, window_size + ) + self.decoders = nn.ModuleList([ + ASFormerDecoder( + num_classes, hidden_dim, num_classes, num_layers, + num_heads, kernel_size, dropout, window_size + ) for _ in range(num_decoders) + ]) + + def forward(self, x): + # x: (B, T, C) + outputs = [] + enc_features, enc_logits = self.encoder(x) + outputs.append(enc_logits) + + for decoder in self.decoders: + dec_input = F.softmax(outputs[-1], dim=-1).detach() + _, dec_logits = decoder(dec_input, enc_features) + outputs.append(dec_logits) + + return outputs # list of (B, T, num_classes), compatible with MSTCN interface + + +class ASFormerContact(nn.Module): + """ASFormer adapted for binary contact detection (Exp 3). + + Wraps ASFormer to return only the final stage output (B, T, 2), + compatible with the exp3 training loop. + Uses multi-stage training internally but returns single output. + """ + + def __init__(self, input_dim, hidden_dim=64, num_layers=5, num_decoders=2, + num_heads=1, dropout=0.1): + super().__init__() + self.asformer = ASFormer( + input_dim, num_classes=2, hidden_dim=hidden_dim, + num_layers=num_layers, num_decoders=num_decoders, + num_heads=num_heads, dropout=dropout + ) + + def forward(self, x): + # x: (B, T, C) -> (B, T, 2) + outputs = self.asformer(x) + return outputs[-1] # Return final stage only diff --git a/experiments/tasks/train_baselines_t1.py b/experiments/tasks/train_baselines_t1.py new file mode 100644 index 0000000000000000000000000000000000000000..aa49283a853c6d10d570be9415517357d54e5907 --- /dev/null +++ b/experiments/tasks/train_baselines_t1.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Unified T1 scene recognition training script. +Supports 8 methods: 7 published baselines + SyncFuse. + +Usage: + python3 train_baselines_t1.py --method stgcn --seed 42 + python3 train_baselines_t1.py --method ctrgcn --seed 42 + python3 train_baselines_t1.py --method limu_bert --seed 42 + python3 train_baselines_t1.py --method emg_cnn --seed 42 + python3 train_baselines_t1.py --method actionsense --seed 42 + python3 train_baselines_t1.py --method mult --seed 42 + python3 train_baselines_t1.py --method perceiver --seed 42 + python3 train_baselines_t1.py --method syncfuse --seed 42 \ + --mod_dropout_p 0.3 --use_xmod_shift --use_learned_late \ + --pretrained_dir /path/to/pretrained +""" +import os +import sys +import json +import time +import random +import argparse +import numpy as np +import torch +import torch.nn as nn +from sklearn.metrics import accuracy_score, f1_score, confusion_matrix + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import get_dataloaders, NUM_CLASSES +from nets.baselines_published.baselines import ( + STGCN, CTRGCN, LIMUBert, EMGCNN, ActionSenseLSTM, MulT, PerceiverIO, +) +from nets.baselines_published.syncfuse import SyncFuse + + +# --------------------------------------------------------------------------- +# Modality configurations per method +# --------------------------------------------------------------------------- + +METHOD_MODALITIES = { + # Single-modality baselines + 'stgcn': ['mocap'], + 'ctrgcn': ['mocap'], + 'limu_bert': ['imu'], + 'emg_cnn': ['emg'], + # Multi-modality baselines + 'actionsense': ['mocap', 'emg', 'eyetrack', 'imu'], # drop pressure due to sparse coverage + 'mult': ['mocap', 'emg', 'imu'], # MulT is 3-modal + 'perceiver': ['mocap', 'emg', 'eyetrack', 'imu'], + # Our method (4-mod) + 'syncfuse': ['mocap', 'emg', 'eyetrack', 'imu'], + # Our method, 3-mod IME variant for direct comparison with tab:scene-published + 'syncfuse_ime': ['mocap', 'emg', 'imu'], + # Plain Transformer+Late head (matches tab:scene-published setup) under + # both 3-mod (IME) and 4-mod protocols, for fair re-evaluation + 'transformer_late': ['mocap', 'emg', 'eyetrack', 'imu'], # 4-mod + 'transformer_late_ime': ['mocap', 'emg', 'imu'], # 3-mod IME + # Single-modality IMU-only Transformer (diagnostic) + 'transformer_imu': ['imu'], +} + + +def set_seed(seed): + random.seed(seed); np.random.seed(seed) + torch.manual_seed(seed); torch.cuda.manual_seed_all(seed) + + +def build_model(method, modality_dims, num_classes, args): + """Construct the requested baseline or SyncFuse.""" + if method == 'stgcn': + return STGCN(modality_dims['mocap'], num_classes, + hidden=args.hidden_dim, n_joints=args.n_joints) + if method == 'ctrgcn': + return CTRGCN(modality_dims['mocap'], num_classes, + hidden=args.hidden_dim, n_joints=args.n_joints) + if method == 'limu_bert': + return LIMUBert(modality_dims['imu'], num_classes, + hidden=args.hidden_dim, n_layers=4, n_heads=4) + if method == 'emg_cnn': + return EMGCNN(modality_dims['emg'], num_classes, hidden=64) + if method == 'actionsense': + return ActionSenseLSTM(modality_dims, num_classes, hidden=args.hidden_dim) + if method == 'mult': + return MulT(modality_dims, num_classes, d_model=args.hidden_dim, + n_layers=2, n_heads=4) + if method == 'perceiver': + return PerceiverIO(modality_dims, num_classes, + latent_dim=args.hidden_dim, n_latents=32, + n_layers=3, n_heads=4) + if method in ('syncfuse', 'syncfuse_ime'): + m = SyncFuse(modality_dims, num_classes, hidden=args.hidden_dim, + n_heads=4, n_layers=2, + use_xmod_shift=args.use_xmod_shift, + use_learned_late=args.use_learned_late) + if args.pretrained_dir: + pt_paths = {} + for m_name in modality_dims: + p = os.path.join(args.pretrained_dir, + f'transformer_{m_name}_early/model_best.pt') + if os.path.exists(p): + pt_paths[m_name] = p + if pt_paths: + m.load_pretrained(pt_paths, freeze=args.freeze_pretrained) + return m + if method == 'transformer_imu': + # SyncFuse with single IMU branch + no extras + no pretrain = matches + # the "Transformer (ours) IMU early" row in tab:scene-published. + m = SyncFuse(modality_dims, num_classes, hidden=args.hidden_dim, + n_heads=4, n_layers=2, + use_xmod_shift=False, + use_learned_late=False) + return m + if method in ('transformer_late', 'transformer_late_ime'): + # Reuse SyncFuse class with all extras OFF == per-modality Transformer + # branches + simple late mean fusion + optional pretrained init. + m = SyncFuse(modality_dims, num_classes, hidden=args.hidden_dim, + n_heads=4, n_layers=2, + use_xmod_shift=False, + use_learned_late=False) + if args.pretrained_dir: + pt_paths = {} + for m_name in modality_dims: + p = os.path.join(args.pretrained_dir, + f'transformer_{m_name}_early/model_best.pt') + if os.path.exists(p): + pt_paths[m_name] = p + if pt_paths: + m.load_pretrained(pt_paths, freeze=args.freeze_pretrained) + return m + raise ValueError(f"Unknown method: {method}") + + +# --------------------------------------------------------------------------- +# Train / Eval loop +# --------------------------------------------------------------------------- + +def train_one_epoch(model, loader, criterion, optimizer, device, args): + model.train() + total_loss, n, all_preds, all_labels = 0., 0, [], [] + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + optimizer.zero_grad() + if args.method in ('syncfuse', 'syncfuse_ime'): + logits = model(x, mask, mod_dropout_p=args.mod_dropout_p, + training_time=True) + elif args.method in ('transformer_late', 'transformer_late_ime', + 'transformer_imu'): + logits = model(x, mask, mod_dropout_p=0.0, training_time=False) + elif args.method in ('stgcn', 'ctrgcn'): + logits = model(x, mask) # these take only MoCap slice == all of x + elif args.method == 'limu_bert': + logits = model(x, mask) # IMU only + elif args.method == 'emg_cnn': + logits = model(x, mask) + else: + logits = model(x, mask) + loss = criterion(logits, y) + loss.backward() + trainable = [p for p in model.parameters() if p.requires_grad] + if trainable: + torch.nn.utils.clip_grad_norm_(trainable, 1.0) + optimizer.step() + total_loss += loss.item() * y.size(0); n += y.size(0) + all_preds.extend(logits.argmax(dim=1).cpu().numpy()) + all_labels.extend(y.cpu().numpy()) + return total_loss / max(n, 1), accuracy_score(all_labels, all_preds) + + +@torch.no_grad() +def evaluate(model, loader, criterion, device, args): + model.eval() + total_loss, n, all_preds, all_labels = 0., 0, [], [] + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + if args.method in ('syncfuse', 'syncfuse_ime', + 'transformer_late', 'transformer_late_ime', + 'transformer_imu'): + logits = model(x, mask, training_time=False) + else: + logits = model(x, mask) + loss = criterion(logits, y) + total_loss += loss.item() * y.size(0); n += y.size(0) + all_preds.extend(logits.argmax(dim=1).cpu().numpy()) + all_labels.extend(y.cpu().numpy()) + if n == 0: + return 0., 0., 0., np.zeros((NUM_CLASSES, NUM_CLASSES), dtype=int) + acc = accuracy_score(all_labels, all_preds) + f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0) + cm = confusion_matrix(all_labels, all_preds, labels=list(range(NUM_CLASSES))) + return total_loss / n, acc, f1, cm + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def run(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Device: {device}") + modalities = METHOD_MODALITIES[args.method] + print(f"Method: {args.method} | Modalities: {modalities} | Seed: {args.seed}") + + train_loader, val_loader, test_loader, info = get_dataloaders( + modalities, batch_size=args.batch_size, downsample=args.downsample, + ) + if info['val_size'] == 0: + val_loader = test_loader + print(f"Train={info['train_size']} Test={info['test_size']} " + f"feat_dim={info['feat_dim']} mod_dims={info['modality_dims']}") + + model = build_model(args.method, info['modality_dims'], info['num_classes'], + args).to(device) + total = sum(p.numel() for p in model.parameters()) + trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Params: {trainable:,}/{total:,}") + + class_weights = info['class_weights'].to(device) + criterion = nn.CrossEntropyLoss(weight=class_weights, + label_smoothing=args.label_smoothing) + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=args.lr, weight_decay=args.weight_decay, + ) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode='min', factor=0.5, patience=7, min_lr=1e-6, + ) + + exp_name = f"{args.method}_seed{args.seed}" + if args.tag: + exp_name += f"_{args.tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + # Select model by MAX val F1 (more robust than min val_loss when val == 25-sample test). + best_val_f1, best_val_loss, best_epoch, patience_counter = -1.0, float('inf'), 0, 0 + best_cm = None + for epoch in range(1, args.epochs + 1): + t0 = time.time() + tr_loss, tr_acc = train_one_epoch(model, train_loader, criterion, + optimizer, device, args) + va_loss, va_acc, va_f1, va_cm = evaluate(model, val_loader, criterion, + device, args) + scheduler.step(va_loss) + print(f" E{epoch:3d} | tr {tr_loss:.4f}/{tr_acc:.3f} | " + f"va {va_loss:.4f}/{va_acc:.3f} f1 {va_f1:.3f} | " + f"{time.time()-t0:.1f}s") + if va_f1 > best_val_f1: + best_val_f1 = va_f1; best_val_loss = va_loss + best_epoch = epoch; patience_counter = 0 + best_cm = va_cm + torch.save(model.state_dict(), os.path.join(out_dir, 'model_best.pt')) + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" Early stop at epoch {epoch} (best {best_epoch})") + break + best_f1 = best_val_f1 + + # Final test eval on best + model.load_state_dict(torch.load(os.path.join(out_dir, 'model_best.pt'), + weights_only=True)) + te_loss, te_acc, te_f1, te_cm = evaluate(model, test_loader, criterion, + device, args) + print(f"\n== Test == loss {te_loss:.4f} acc {te_acc:.3f} f1 {te_f1:.3f}") + + results = { + 'method': args.method, + 'modalities': modalities, + 'seed': args.seed, + 'best_epoch': best_epoch, + 'best_val_f1': float(best_f1), + 'test_acc': float(te_acc), + 'test_f1': float(te_f1), + 'n_params': trainable, + 'n_params_total': total, + 'confusion_matrix': te_cm.tolist(), + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + print(f"Saved: {out_dir}/results.json") + return results + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--method', type=str, required=True, + choices=list(METHOD_MODALITIES.keys())) + p.add_argument('--epochs', type=int, default=80) + p.add_argument('--batch_size', type=int, default=16) + p.add_argument('--lr', type=float, default=1e-3) + p.add_argument('--weight_decay', type=float, default=1e-4) + p.add_argument('--hidden_dim', type=int, default=128) + p.add_argument('--downsample', type=int, default=5) + p.add_argument('--patience', type=int, default=15) + p.add_argument('--label_smoothing', type=float, default=0.1) + p.add_argument('--seed', type=int, default=42) + p.add_argument('--output_dir', type=str, required=True) + p.add_argument('--tag', type=str, default='') + # Method-specific + p.add_argument('--n_joints', type=int, default=52) + # SyncFuse specific + p.add_argument('--mod_dropout_p', type=float, default=0.3) + p.add_argument('--use_xmod_shift', action='store_true') + p.add_argument('--use_learned_late', action='store_true') + p.add_argument('--pretrained_dir', type=str, default='') + p.add_argument('--freeze_pretrained', action='store_true', + help='Freeze loaded pretrained backbones (default: fine-tune them)') + args = p.parse_args() + run(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp1.py b/experiments/tasks/train_exp1.py new file mode 100644 index 0000000000000000000000000000000000000000..212ceda369b33ef12ede91553a36e9364b20e757 --- /dev/null +++ b/experiments/tasks/train_exp1.py @@ -0,0 +1,437 @@ +#!/usr/bin/env python3 +""" +Experiment 1: Daily Activity Scene Recognition +Train and evaluate models with different modality combinations and fusion strategies. +""" + +import os +import sys +import json +import time +import random +import argparse +import numpy as np +import torch +import torch.nn as nn +from sklearn.metrics import ( + accuracy_score, f1_score, confusion_matrix, classification_report +) + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import get_dataloaders, NUM_CLASSES, SCENE_LABELS +from nets.models import build_model + +SCENE_NAMES = ['s1_office', 's2_package', 's3_kitchen', 's4_cleaning', + 's5_table_set', 's6_luggage', 's7_coffee', 's8_clothes'] + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + + +def apply_augmentation(x, mask, noise_std=0.1, time_mask_ratio=0.1): + """Apply data augmentation on GPU tensors: Gaussian noise + time masking.""" + if noise_std > 0: + noise = torch.randn_like(x) * noise_std + x = x + noise * mask.unsqueeze(-1).float() + if time_mask_ratio > 0: + B, T, C = x.shape + mask_len = int(T * time_mask_ratio) + if mask_len > 0: + for i in range(B): + valid_len = mask[i].sum().int().item() + if valid_len > mask_len: + start = random.randint(0, valid_len - mask_len) + x[i, start:start + mask_len, :] = 0.0 + return x + + +def _load_and_freeze_backbone(model, pretrained_path, freeze_idx, fusion_type): + """Load pretrained SingleModel weights into a fusion model branch and freeze it.""" + if fusion_type == 'early': + print("WARNING: Early fusion has a shared backbone — cannot freeze single modality. Skipping.") + return + + pretrained_sd = torch.load(pretrained_path, weights_only=True) + + # Map SingleModel keys -> fusion model keys + new_sd = {} + for k, v in pretrained_sd.items(): + if k.startswith('backbone.'): + new_key = k.replace('backbone.', f'backbones.{freeze_idx}.') + new_sd[new_key] = v + elif k.startswith('classifier.') and fusion_type != 'attention': + new_key = k.replace('classifier.', f'classifiers.{freeze_idx}.') + new_sd[new_key] = v + + model_sd = model.state_dict() + model_sd.update(new_sd) + model.load_state_dict(model_sd) + print(f" Loaded {len(new_sd)} tensors from {pretrained_path} into branch {freeze_idx}") + + # Freeze backbone (and classifier for non-attention models) + for name, param in model.named_parameters(): + if name.startswith(f'backbones.{freeze_idx}.'): + param.requires_grad = False + if fusion_type != 'attention' and name.startswith(f'classifiers.{freeze_idx}.'): + param.requires_grad = False + + frozen_count = sum(not p.requires_grad for p in model.parameters()) + total_count = sum(1 for _ in model.parameters()) + print(f" Frozen: {frozen_count}/{total_count} parameter tensors") + + +def train_one_epoch(model, loader, criterion, optimizer, device, + augment=False, noise_std=0.1, time_mask_ratio=0.1): + model.train() + total_loss = 0 + all_preds, all_labels = [], [] + for x, y, mask, lengths in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + if augment: + x = apply_augmentation(x, mask, noise_std, time_mask_ratio) + optimizer.zero_grad() + logits = model(x, mask) + loss = criterion(logits, y) + loss.backward() + trainable_params = [p for p in model.parameters() if p.requires_grad] + torch.nn.utils.clip_grad_norm_(trainable_params, 1.0) + optimizer.step() + total_loss += loss.item() * y.size(0) + all_preds.extend(logits.argmax(dim=1).cpu().numpy()) + all_labels.extend(y.cpu().numpy()) + n = len(all_labels) + return total_loss / n, accuracy_score(all_labels, all_preds) + + +@torch.no_grad() +def evaluate(model, loader, criterion, device): + model.eval() + total_loss = 0 + all_preds, all_labels = [], [] + for x, y, mask, lengths in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + logits = model(x, mask) + loss = criterion(logits, y) + total_loss += loss.item() * y.size(0) + all_preds.extend(logits.argmax(dim=1).cpu().numpy()) + all_labels.extend(y.cpu().numpy()) + + n = len(all_labels) + acc = accuracy_score(all_labels, all_preds) + f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0) + cm = confusion_matrix(all_labels, all_preds, labels=list(range(NUM_CLASSES))) + return total_loss / n, acc, f1, cm, np.array(all_preds), np.array(all_labels) + + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Device: {device}") + + modalities = args.modalities.split(',') + print(f"\n{'='*60}") + print(f"Model: {args.model} | Modalities: {modalities} | Fusion: {args.fusion}") + print(f"{'='*60}") + + # Load data + train_loader, val_loader, test_loader, info = get_dataloaders( + modalities, batch_size=args.batch_size, downsample=args.downsample + ) + # If no val set, use test set for early stopping / model selection + if info['val_size'] == 0: + val_loader = test_loader + print(f"Train: {info['train_size']}, Val: (using test), Test: {info['test_size']}") + else: + print(f"Train: {info['train_size']}, Val: {info['val_size']}, Test: {info['test_size']}") + print(f"Feature dim: {info['feat_dim']}, Modality dims: {info['modality_dims']}") + + # Build model + late_agg = getattr(args, 'late_agg', 'mean') + model = build_model( + args.model, args.fusion, info['feat_dim'], + info['modality_dims'], info['num_classes'], + hidden_dim=args.hidden_dim, proj_dim=args.proj_dim, + late_agg=late_agg, + ).to(device) + + # Load pretrained backbone and freeze if specified + if args.pretrained_backbone and args.freeze_backbone_idx is not None: + _load_and_freeze_backbone(model, args.pretrained_backbone, + args.freeze_backbone_idx, args.fusion) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Parameters: {trainable_params:,} trainable / {total_params:,} total") + + # Loss with class weights + label smoothing + class_weights = info['class_weights'].to(device) + criterion = nn.CrossEntropyLoss(weight=class_weights, + label_smoothing=args.label_smoothing) + + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=args.lr, weight_decay=args.weight_decay, + ) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode='min', factor=0.5, patience=7, min_lr=1e-6 + ) + + # Training loop with early stopping + best_val_loss = float('inf') + best_val_f1 = 0 + best_epoch = 0 + patience_counter = 0 + + # Output directory + mod_str = '-'.join(modalities) + exp_name = f"{args.model}_{mod_str}_{args.fusion}" + if args.tag: + exp_name += f"_{args.tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + train_loss, train_acc = train_one_epoch( + model, train_loader, criterion, optimizer, device, + augment=args.augment, noise_std=args.noise_std, + time_mask_ratio=args.time_mask_ratio, + ) + val_loss, val_acc, val_f1, _, _, _ = evaluate(model, val_loader, criterion, device) + scheduler.step(val_loss) + + elapsed = time.time() - t0 + lr = optimizer.param_groups[0]['lr'] + print(f" Epoch {epoch:3d} | " + f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | " + f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} F1: {val_f1:.4f} | " + f"LR: {lr:.2e} | {elapsed:.1f}s") + + if val_loss < best_val_loss: + best_val_loss = val_loss + best_val_f1 = val_f1 + best_epoch = epoch + patience_counter = 0 + torch.save(model.state_dict(), os.path.join(out_dir, 'model_best.pt')) + else: + patience_counter += 1 + + if patience_counter >= args.patience: + print(f" Early stopping at epoch {epoch} (best: {best_epoch})") + break + + # Test evaluation + print(f"\nBest epoch: {best_epoch} (val_loss: {best_val_loss:.4f}, val_f1: {best_val_f1:.4f})") + model.load_state_dict(torch.load(os.path.join(out_dir, 'model_best.pt'), weights_only=True)) + test_loss, test_acc, test_f1, test_cm, test_preds, test_labels = evaluate( + model, test_loader, criterion, device + ) + + # Per-class accuracy + per_class_acc = {} + for i in range(NUM_CLASSES): + mask = test_labels == i + if mask.sum() > 0: + per_class_acc[SCENE_NAMES[i]] = float((test_preds[mask] == i).mean()) + else: + per_class_acc[SCENE_NAMES[i]] = None + + print(f"\n--- Test Results ---") + print(f" Accuracy: {test_acc:.4f}") + print(f" Macro F1: {test_f1:.4f}") + print(f" Per-class: {per_class_acc}") + print(f" Confusion Matrix:\n{test_cm}") + + # Save results + results = { + 'experiment': exp_name, + 'model': args.model, + 'modalities': modalities, + 'fusion': args.fusion, + 'best_epoch': best_epoch, + 'best_val_loss': float(best_val_loss), + 'best_val_f1': float(best_val_f1), + 'test_accuracy': float(test_acc), + 'test_macro_f1': float(test_f1), + 'test_per_class_accuracy': per_class_acc, + 'confusion_matrix': test_cm.tolist(), + 'n_params': trainable_params, + 'n_params_total': total_params, + 'train_size': info['train_size'], + 'val_size': info['val_size'], + 'test_size': info['test_size'], + 'feat_dim': info['feat_dim'], + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + np.save(os.path.join(out_dir, 'confusion_matrix.npy'), test_cm) + print(f" Results saved to {out_dir}") + return results + + +def run_all_experiments(args): + """Run all modality ablation + fusion experiments.""" + modality_combos = [ + 'mocap', + 'emg', + 'eyetrack', + 'imu', + 'pressure', + 'mocap,emg,eyetrack', + 'mocap,emg,eyetrack,imu', + 'mocap,emg,eyetrack,pressure', + 'mocap,emg,eyetrack,imu,pressure', + ] + models = ['cnn', 'lstm', 'transformer'] + + all_results = [] + + # Part 1: Modality ablation with all backbone models + if not args.skip_ablation: + for mod_combo in modality_combos: + for model_name in models: + args.modalities = mod_combo + args.model = model_name + args.fusion = 'early' + try: + result = run_experiment(args) + all_results.append(result) + except Exception as e: + print(f"FAILED: {model_name} / {mod_combo} / early: {e}") + all_results.append({ + 'experiment': f"{model_name}_{mod_combo.replace(',', '-')}_early", + 'error': str(e), + }) + + # Part 2: Fusion ablation with 3-core modalities and best backbone + if args.skip_ablation: + best_backbone = args.best_backbone + print(f"\nSkipping ablation. Using specified backbone: {best_backbone}") + else: + # Find best backbone from 3-core early fusion results + core_results = [r for r in all_results + if r.get('modalities') == ['mocap', 'emg', 'eyetrack'] + and 'error' not in r] + if core_results: + best_backbone = max(core_results, key=lambda r: r['test_macro_f1'])['model'] + else: + best_backbone = 'cnn' + print(f"\nBest backbone for fusion experiments: {best_backbone}") + + fusion_methods = ['late', 'attention', 'weighted_late', 'gated_late', 'stacking', 'product', 'moe'] + + for fusion in fusion_methods: + args.modalities = 'mocap,emg,eyetrack' + args.model = best_backbone + args.fusion = fusion + try: + result = run_experiment(args) + all_results.append(result) + except Exception as e: + print(f"FAILED: {best_backbone} / 3-core / {fusion}: {e}") + all_results.append({ + 'experiment': f"{best_backbone}_mocap-emg-eyetrack_{fusion}", + 'error': str(e), + }) + + # Also run fusion with all 5 modalities + for fusion in fusion_methods: + args.modalities = 'mocap,emg,eyetrack,imu,pressure' + args.model = best_backbone + args.fusion = fusion + try: + result = run_experiment(args) + all_results.append(result) + except Exception as e: + print(f"FAILED: {best_backbone} / all / {fusion}: {e}") + all_results.append({ + 'experiment': f"{best_backbone}_all_{fusion}", + 'error': str(e), + }) + + # Save summary + summary_path = os.path.join(args.output_dir, 'exp1_summary.json') + with open(summary_path, 'w') as f: + json.dump(all_results, f, indent=2, ensure_ascii=False) + print(f"\n{'='*60}") + print(f"All experiments completed! Summary saved to {summary_path}") + + # Print results table + print(f"\n{'Model':<15} {'Modalities':<40} {'Fusion':<10} {'Acc':<8} {'F1':<8}") + print('-' * 85) + for r in all_results: + if 'error' in r: + print(f"{r['experiment']:<65} FAILED: {r['error'][:20]}") + else: + mod_str = ','.join(r['modalities']) + print(f"{r['model']:<15} {mod_str:<40} {r['fusion']:<10} " + f"{r['test_accuracy']:.4f} {r['test_macro_f1']:.4f}") + + +def main(): + parser = argparse.ArgumentParser(description='Exp1: Scene Recognition') + parser.add_argument('--model', type=str, default='cnn', + choices=['cnn', 'lstm', 'transformer', 'tinyhar', + 'deepconvlstm', 'inceptiontime']) + parser.add_argument('--modalities', type=str, default='mocap,emg,eyetrack', + help='Comma-separated modality names') + parser.add_argument('--fusion', type=str, default='early', + choices=['early', 'late', 'attention', + 'weighted_late', 'gated_late', 'stacking', + 'product', 'moe', 'feat_concat']) + parser.add_argument('--epochs', type=int, default=100) + parser.add_argument('--batch_size', type=int, default=16) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--weight_decay', type=float, default=1e-3) + parser.add_argument('--hidden_dim', type=int, default=32) + parser.add_argument('--proj_dim', type=int, default=0, + help='Per-modality projection dim (0 = no projection)') + parser.add_argument('--downsample', type=int, default=5, + help='Downsample factor from 100Hz (5 = 20Hz)') + parser.add_argument('--patience', type=int, default=15) + parser.add_argument('--augment', action='store_true', + help='Enable data augmentation (noise + time mask)') + parser.add_argument('--noise_std', type=float, default=0.1, + help='Gaussian noise std for augmentation') + parser.add_argument('--time_mask_ratio', type=float, default=0.1, + help='Fraction of timesteps to mask') + parser.add_argument('--label_smoothing', type=float, default=0.0, + help='Label smoothing for CrossEntropyLoss') + parser.add_argument('--pretrained_backbone', type=str, default=None, + help='Path to pretrained SingleModel weights') + parser.add_argument('--freeze_backbone_idx', type=int, default=None, + help='Index of modality branch to freeze') + parser.add_argument('--late_agg', type=str, default='mean', + choices=['mean', 'confidence', 'learned'], + help='Late fusion aggregation: mean/confidence/learned') + parser.add_argument('--tag', type=str, default='', + help='Experiment name suffix for output dir') + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--output_dir', type=str, + default='${PULSE_ROOT}/results/exp1') + parser.add_argument('--run_all', action='store_true', + help='Run all modality ablation + fusion experiments') + parser.add_argument('--skip_ablation', action='store_true', + help='Skip Part 1 (modality ablation), run fusion experiments only with --best_backbone') + parser.add_argument('--best_backbone', type=str, default='transformer', + choices=['cnn', 'lstm', 'transformer', 'tinyhar', + 'deepconvlstm', 'inceptiontime'], + help='Backbone to use when --skip_ablation (default: transformer)') + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + if args.run_all: + run_all_experiments(args) + else: + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp2.py b/experiments/tasks/train_exp2.py new file mode 100644 index 0000000000000000000000000000000000000000..661ff759103a2417666f434136d136f9cc37c5ed --- /dev/null +++ b/experiments/tasks/train_exp2.py @@ -0,0 +1,675 @@ +#!/usr/bin/env python3 +""" +Experiment 2: Temporal Action Segmentation +Per-frame action classification using multi-modal time series. +Uses annotations from annotations_by_scene/ to create frame-level labels. +""" + +import os +import sys +import json +import time +import re +import random +import argparse +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.metrics import f1_score, accuracy_score +from torch.utils.data import Dataset, DataLoader + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, SKIP_COLS, SKIP_COL_SUFFIXES, + TRAIN_VOLS, VAL_VOLS, TEST_VOLS, load_modality_array, get_modality_filepath +) + +ANNOTATION_DIR = "${PULSE_ROOT}/annotations_v2" +ANNOTATION_DIR_FALLBACK = "${PULSE_ROOT}/annotations_by_scene" +ANNOTATION_DIR_COARSE = "${PULSE_ROOT}/annotations_coarse" + +# Fine-grained action categories (11 classes) +FINE_ACTION_LABELS = { + 'Idle': 0, + 'Grasp': 1, + 'Place': 2, + 'Pour': 3, + 'Wipe': 4, + 'Fold': 5, + 'OpenClose': 6, + 'Stir': 7, + 'TearCut': 8, + 'Arrange': 9, + 'Transport': 10, +} + +# Coarse-grained action categories (6 classes) +COARSE_ACTION_LABELS = { + 'Idle': 0, + 'Manipulate': 1, + 'CleanOrganize': 2, + 'Transfer': 3, + 'Assemble': 4, + 'FoodPrep': 5, +} + +# Default to fine-grained (overridden by --coarse_labels flag) +ACTION_LABELS = FINE_ACTION_LABELS +NUM_ACTIONS = len(ACTION_LABELS) +ACTION_NAMES = {v: k for k, v in ACTION_LABELS.items()} + +WINDOW_SIZE = 512 # ~5s at 100Hz +WINDOW_STRIDE = 256 + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def classify_action(task_text): + """Map Chinese task description to coarse action category.""" + t = task_text + if any(k in t for k in ['抓取', '拿起', '拿取', '取出', '掀开', '取下', '搬起']): + return 'Grasp' + elif any(k in t for k in ['放置', '放回', '放入', '放下', '放到', '释放', '移开', '松开']): + return 'Place' + elif any(k in t for k in ['倾倒', '倒入', '倒出', '注水', '倒水', '倒置', '倾斜', '转移']): + return 'Pour' + elif any(k in t for k in ['擦拭', '抹布', '清洁', '擦干', '擦除']): + return 'Wipe' + elif any(k in t for k in ['折叠', '对折', '折好', '卷', '缠绕']): + return 'Fold' + elif any(k in t for k in ['打开', '关闭', '开启', '合上', '旋开', '旋紧', '拉链', + '拧开', '拧紧', '盖上', '拔开']): + return 'OpenClose' + elif any(k in t for k in ['搅拌', '搅动']): + return 'Stir' + elif any(k in t for k in ['撕', '剪', '切', '粘贴', '胶带', '封箱']): + return 'TearCut' + elif any(k in t for k in ['整理', '调整', '摆放', '对齐', '铺', '展开', '抚平', + '理顺', '排列', '码放', '微调', '压实']): + return 'Arrange' + elif any(k in t for k in ['搬运', '移动', '移至', '运送', '搬到', '提起', '抬起', + '携带', '移回', '将菜锅移']): + return 'Transport' + else: + return 'Idle' # unclassifiable → treat as idle + + +def parse_timestamp(ts_str): + """Parse 'MM:SS' to seconds.""" + parts = ts_str.strip().split(':') + if len(parts) == 2: + return int(parts[0]) * 60 + int(parts[1]) + return 0 + + +def load_annotations(vol, scenario, n_frames, sampling_rate=100, use_coarse=False): + """Load annotations and create per-frame labels. + + use_coarse=False: fine-grained (11 classes) from annotations_v2 + use_coarse=True: coarse-grained (6 classes) from annotations_coarse + """ + if use_coarse: + ann_path = os.path.join(ANNOTATION_DIR_COARSE, vol, f"{scenario}.json") + if not os.path.exists(ann_path): + return None + with open(ann_path) as f: + data = json.load(f) + labels = np.zeros(n_frames, dtype=np.int64) + for seg in data.get('coarse_segments', []): + ts = seg['timestamp'] + match = re.match(r'(\d+:\d+)\s*-\s*(\d+:\d+)', ts) + if not match: + continue + start_sec = parse_timestamp(match.group(1)) + end_sec = parse_timestamp(match.group(2)) + start_frame = min(int(start_sec * sampling_rate), n_frames) + end_frame = min(int(end_sec * sampling_rate), n_frames) + action = seg.get('coarse_action', 'Idle') + if action in ACTION_LABELS: + labels[start_frame:end_frame] = ACTION_LABELS[action] + return labels + else: + # Fine-grained: try v2 annotations first, fallback to original + ann_path = os.path.join(ANNOTATION_DIR, vol, f"{scenario}.json") + if not os.path.exists(ann_path): + ann_path = os.path.join(ANNOTATION_DIR_FALLBACK, vol, f"{scenario}.json") + if not os.path.exists(ann_path): + return None + with open(ann_path) as f: + data = json.load(f) + labels = np.zeros(n_frames, dtype=np.int64) + for seg in data['segments']: + ts = seg['timestamp'] + match = re.match(r'(\d+:\d+)\s*-\s*(\d+:\d+)', ts) + if not match: + continue + start_sec = parse_timestamp(match.group(1)) + end_sec = parse_timestamp(match.group(2)) + start_frame = min(int(start_sec * sampling_rate), n_frames) + end_frame = min(int(end_sec * sampling_rate), n_frames) + if 'action_label' in seg: + action = seg['action_label'] + else: + action = classify_action(seg['task']) + if action in ACTION_LABELS: + labels[start_frame:end_frame] = ACTION_LABELS[action] + return labels + + +class ActionSegmentationDataset(Dataset): + """Sliding window dataset for action segmentation.""" + + def __init__(self, volunteers, modalities, window_size=WINDOW_SIZE, + stride=WINDOW_STRIDE, downsample=2, stats=None, use_coarse=False): + self.windows = [] + self._feat_dim = None + all_features = [] + + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir): + continue + meta_path = os.path.join(scenario_dir, 'alignment_metadata.json') + if not os.path.exists(meta_path): + continue + with open(meta_path) as f: + meta = json.load(f) + + available = set(meta['modalities']) + # Check for video features files (not in metadata) + if os.path.exists(os.path.join(scenario_dir, 'video_features_100hz.npy')): + available.add('video') + if os.path.exists(os.path.join(scenario_dir, 'video_features_videomae_100hz.npy')): + available.add('videomae') + if not set(modalities).issubset(available): + continue + + # Load features + parts = [] + skip = False + for mod in modalities: + filepath = get_modality_filepath(scenario_dir, mod, vol, scenario) + arr = load_modality_array(filepath, mod) + if arr is None: + skip = True + break + parts.append(arr) + if skip: + continue + + min_len = min(p.shape[0] for p in parts) + features = np.concatenate([p[:min_len] for p in parts], axis=1) + + # Load annotations + labels = load_annotations(vol, scenario, min_len, use_coarse=use_coarse) + if labels is None: + continue + + # Downsample + features = features[::downsample] + labels = labels[::downsample] + + if self._feat_dim is None: + self._feat_dim = features.shape[1] + + all_features.append(features) + + # Extract sliding windows + T = features.shape[0] + for start in range(0, T - window_size + 1, stride): + end = start + window_size + self.windows.append((features[start:end], labels[start:end])) + + # Normalization + if stats is not None: + self.mean, self.std = stats + else: + if all_features: + all_data = np.concatenate(all_features, axis=0).astype(np.float64) + self.mean = np.mean(all_data, axis=0, keepdims=True) + self.std = np.std(all_data, axis=0, keepdims=True) + self.std[self.std < 1e-8] = 1.0 + else: + d = self._feat_dim or 1 + self.mean = np.zeros((1, d), dtype=np.float64) + self.std = np.ones((1, d), dtype=np.float64) + + self.windows = [ + (((w[0].astype(np.float64) - self.mean) / self.std).astype(np.float32), w[1]) + for w in self.windows + ] + + # Stats + if self.windows: + all_labels = np.concatenate([w[1] for w in self.windows]) + print(f" Windows: {len(self.windows)}, feat_dim: {self._feat_dim}", flush=True) + for i in range(NUM_ACTIONS): + count = (all_labels == i).sum() + if count > 0: + print(f" {ACTION_NAMES[i]}: {count} frames ({100*count/len(all_labels):.1f}%)", + flush=True) + + def get_stats(self): + return (self.mean, self.std) + + @property + def feat_dim(self): + return self._feat_dim + + def get_class_weights(self): + all_labels = np.concatenate([w[1] for w in self.windows]) + counts = np.bincount(all_labels, minlength=NUM_ACTIONS).astype(np.float32) + counts[counts == 0] = 1.0 + weights = 1.0 / counts + weights = weights / weights.sum() * NUM_ACTIONS + return torch.FloatTensor(weights) + + def __len__(self): + return len(self.windows) + + def __getitem__(self, idx): + features, labels = self.windows[idx] + return torch.from_numpy(features), torch.from_numpy(labels) + + +# ============================================================ +# Models: MS-TCN-like architecture for action segmentation +# ============================================================ + +class DilatedResBlock(nn.Module): + def __init__(self, channels, dilation): + super().__init__() + self.conv1 = nn.Conv1d(channels, channels, 3, padding=dilation, dilation=dilation) + self.conv2 = nn.Conv1d(channels, channels, 1) + self.bn1 = nn.BatchNorm1d(channels) + self.bn2 = nn.BatchNorm1d(channels) + self.dropout = nn.Dropout(0.1) + + def forward(self, x): + residual = x + x = self.dropout(torch.relu(self.bn1(self.conv1(x)))) + x = self.dropout(torch.relu(self.bn2(self.conv2(x)))) + return x + residual + + +class TCNStage(nn.Module): + """Single stage of MS-TCN.""" + def __init__(self, in_channels, hidden_channels, num_classes, num_layers=8): + super().__init__() + self.input_conv = nn.Conv1d(in_channels, hidden_channels, 1) + self.layers = nn.ModuleList([ + DilatedResBlock(hidden_channels, 2 ** i) for i in range(num_layers) + ]) + self.output_conv = nn.Conv1d(hidden_channels, num_classes, 1) + + def forward(self, x): + x = self.input_conv(x) + for layer in self.layers: + x = layer(x) + return self.output_conv(x) + + +class MSTCN(nn.Module): + """Multi-Stage TCN (MS-TCN++) for action segmentation.""" + def __init__(self, input_dim, num_classes, hidden_dim=64, num_stages=2, num_layers=8): + super().__init__() + self.stages = nn.ModuleList() + self.stages.append(TCNStage(input_dim, hidden_dim, num_classes, num_layers)) + for _ in range(num_stages - 1): + self.stages.append(TCNStage(num_classes, hidden_dim, num_classes, num_layers)) + + def forward(self, x): + # x: (B, T, C) -> (B, C, T) + x = x.permute(0, 2, 1) + outputs = [] + for stage in self.stages: + x = stage(x) + outputs.append(x.permute(0, 2, 1)) # (B, T, num_classes) + return outputs # list of per-stage outputs + + +class SimpleTCN(nn.Module): + """Single-stage TCN baseline.""" + def __init__(self, input_dim, num_classes, hidden_dim=64, num_layers=8): + super().__init__() + self.stage = TCNStage(input_dim, hidden_dim, num_classes, num_layers) + + def forward(self, x): + x = x.permute(0, 2, 1) + out = self.stage(x) + return [out.permute(0, 2, 1)] + + +class BiLSTMSeg(nn.Module): + """Bi-LSTM for action segmentation.""" + def __init__(self, input_dim, num_classes, hidden_dim=64): + super().__init__() + self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2, + batch_first=True, bidirectional=True, dropout=0.2) + self.head = nn.Linear(hidden_dim * 2, num_classes) + + def forward(self, x): + out, _ = self.lstm(x) + return [self.head(out)] + + +def build_seg_model(name, input_dim, num_classes, hidden_dim=64): + if name == 'mstcn': + return MSTCN(input_dim, num_classes, hidden_dim, num_stages=2) + elif name == 'tcn': + return SimpleTCN(input_dim, num_classes, hidden_dim) + elif name == 'lstm': + return BiLSTMSeg(input_dim, num_classes, hidden_dim) + elif name == 'asformer': + from experiments.published_baselines import ASFormer + return ASFormer(input_dim, num_classes, hidden_dim, + num_layers=5, num_decoders=3) + elif name == 'mstcnpp': + from experiments.published_models import MSTCNPP + return MSTCNPP(input_dim, num_classes, hidden_dim, num_stages=4, num_layers=10) + elif name == 'diffact': + from experiments.published_models import DiffAct + return DiffAct(input_dim, num_classes, hidden_dim, + num_encoder_layers=6, num_denoise_layers=6, + num_diffusion_steps=10) + else: + raise ValueError(f"Unknown model: {name}") + + +# ============================================================ +# Metrics: Segmental F1 @ IoU thresholds +# ============================================================ + +def compute_segmental_f1(pred, gt, iou_threshold=0.5): + """Compute segmental F1 score at a given IoU threshold.""" + def get_segments(seq): + segments = [] + if len(seq) == 0: + return segments + start = 0 + for i in range(1, len(seq)): + if seq[i] != seq[i - 1]: + segments.append((seq[start], start, i)) + start = i + segments.append((seq[start], start, len(seq))) + return segments + + pred_segs = get_segments(pred) + gt_segs = get_segments(gt) + + tp = 0 + matched_gt = set() + for p_label, p_start, p_end in pred_segs: + if p_label == 0: # skip Idle + continue + best_iou = 0 + best_idx = -1 + for idx, (g_label, g_start, g_end) in enumerate(gt_segs): + if g_label != p_label or idx in matched_gt: + continue + inter_start = max(p_start, g_start) + inter_end = min(p_end, g_end) + inter = max(0, inter_end - inter_start) + union = (p_end - p_start) + (g_end - g_start) - inter + iou = inter / union if union > 0 else 0 + if iou > best_iou: + best_iou = iou + best_idx = idx + if best_iou >= iou_threshold: + tp += 1 + matched_gt.add(best_idx) + + pred_count = sum(1 for l, _, _ in pred_segs if l != 0) + gt_count = sum(1 for l, _, _ in gt_segs if l != 0) + precision = tp / pred_count if pred_count > 0 else 0 + recall = tp / gt_count if gt_count > 0 else 0 + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0 + return f1 + + +# ============================================================ +# Training +# ============================================================ + +def train_one_epoch(model, loader, criterion, optimizer, device): + model.train() + total_loss = 0 + n = 0 + for x, y in loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + outputs = model(x) # list of (B, T, C) + loss = sum(criterion(out.reshape(-1, out.shape[-1]), y.reshape(-1)) for out in outputs) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total_loss += loss.item() * x.size(0) + n += x.size(0) + return total_loss / n + + +@torch.no_grad() +def evaluate(model, loader, criterion, device): + model.eval() + total_loss = 0 + n = 0 + all_preds, all_labels = [], [] + + for x, y in loader: + x, y = x.to(device), y.to(device) + outputs = model(x) + loss = criterion(outputs[-1].reshape(-1, outputs[-1].shape[-1]), y.reshape(-1)) + total_loss += loss.item() * x.size(0) + n += x.size(0) + + pred = outputs[-1].argmax(dim=-1).cpu().numpy() + all_preds.append(pred.flatten()) + all_labels.append(y.cpu().numpy().flatten()) + + avg_loss = total_loss / n + preds = np.concatenate(all_preds) + labels = np.concatenate(all_labels) + + frame_acc = accuracy_score(labels, preds) + frame_f1 = f1_score(labels, preds, average='macro', zero_division=0) + + # Segmental F1 at different IoU thresholds + seg_f1_10 = compute_segmental_f1(preds, labels, 0.1) + seg_f1_25 = compute_segmental_f1(preds, labels, 0.25) + seg_f1_50 = compute_segmental_f1(preds, labels, 0.5) + + metrics = { + 'loss': avg_loss, + 'frame_acc': frame_acc, + 'frame_f1': frame_f1, + 'seg_f1@10': seg_f1_10, + 'seg_f1@25': seg_f1_25, + 'seg_f1@50': seg_f1_50, + } + return metrics + + +def run_experiment(args): + global ACTION_LABELS, NUM_ACTIONS, ACTION_NAMES + + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + modalities = args.modalities.split(',') + use_coarse = getattr(args, 'coarse_labels', False) + + # Switch label configuration + if use_coarse: + ACTION_LABELS = COARSE_ACTION_LABELS + NUM_ACTIONS = len(ACTION_LABELS) + ACTION_NAMES = {v: k for k, v in ACTION_LABELS.items()} + print(f"\n{'='*60}", flush=True) + print(f"Exp2 Action Seg (COARSE 6-class) | Model: {args.model} | Mods: {modalities}", flush=True) + else: + ACTION_LABELS = FINE_ACTION_LABELS + NUM_ACTIONS = len(ACTION_LABELS) + ACTION_NAMES = {v: k for k, v in ACTION_LABELS.items()} + print(f"\n{'='*60}", flush=True) + print(f"Exp2 Action Seg | Model: {args.model} | Mods: {modalities}", flush=True) + print(f"{'='*60}", flush=True) + + train_ds = ActionSegmentationDataset(TRAIN_VOLS, modalities, downsample=args.downsample, use_coarse=use_coarse) + stats = train_ds.get_stats() + val_ds = ActionSegmentationDataset(VAL_VOLS, modalities, downsample=args.downsample, stats=stats, use_coarse=use_coarse) + test_ds = ActionSegmentationDataset(TEST_VOLS, modalities, downsample=args.downsample, stats=stats, use_coarse=use_coarse) + + if len(train_ds) == 0: + print("No training data!") + return None + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False) + # Use test set for validation when val set is empty (no dedicated val volunteers) + if len(val_ds) > 0: + val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False) + else: + val_loader = test_loader + print(" No val data, using test set for early stopping.", flush=True) + + model = build_seg_model(args.model, train_ds.feat_dim, NUM_ACTIONS, args.hidden_dim).to(device) + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Params: {n_params:,}", flush=True) + + class_weights = train_ds.get_class_weights().to(device) + criterion = nn.CrossEntropyLoss(weight=class_weights) + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=7, factor=0.5) + + mod_str = '-'.join(modalities) + exp_name = f"exp2_{args.model}_{mod_str}_s{args.seed}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_val_f1 = 0 + best_epoch = 0 + patience_counter = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device) + val_metrics = evaluate(model, val_loader, criterion, device) + scheduler.step(val_metrics['loss']) + elapsed = time.time() - t0 + + print(f" Epoch {epoch:3d} | Train: {train_loss:.4f} | " + f"Val: acc={val_metrics['frame_acc']:.4f} f1={val_metrics['frame_f1']:.4f} " + f"seg@50={val_metrics['seg_f1@50']:.4f} | {elapsed:.1f}s", flush=True) + + if val_metrics['frame_f1'] > best_val_f1: + best_val_f1 = val_metrics['frame_f1'] + best_epoch = epoch + patience_counter = 0 + torch.save(model.state_dict(), os.path.join(out_dir, 'model_best.pt')) + else: + patience_counter += 1 + + if patience_counter >= args.patience: + print(f" Early stopping at epoch {epoch}", flush=True) + break + + # Test + model.load_state_dict(torch.load(os.path.join(out_dir, 'model_best.pt'), weights_only=True)) + test_metrics = evaluate(model, test_loader, criterion, device) + + print(f"\n--- Test Results (epoch {best_epoch}) ---", flush=True) + for k, v in test_metrics.items(): + print(f" {k}: {v:.4f}", flush=True) + + results = { + 'experiment': exp_name, + 'model': args.model, + 'modalities': modalities, + 'best_epoch': best_epoch, + 'test_metrics': {k: float(v) for k, v in test_metrics.items()}, + 'n_params': n_params, + 'train_windows': len(train_ds), + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + return results + + +def run_all(args): + modality_combos = [ + 'mocap', + 'emg', + 'mocap,emg,eyetrack', + 'mocap,emg,eyetrack,imu', + 'mocap,emg,eyetrack,imu,pressure', + ] + models = ['tcn', 'mstcn', 'lstm'] + all_results = [] + + for mod_combo in modality_combos: + for model_name in models: + args.modalities = mod_combo + args.model = model_name + try: + result = run_experiment(args) + if result: + all_results.append(result) + except Exception as e: + import traceback; traceback.print_exc() + print(f"FAILED: {model_name}/{mod_combo}: {e}", flush=True) + all_results.append({'experiment': f"exp2_{model_name}_{mod_combo}", 'error': str(e)}) + + summary_path = os.path.join(args.output_dir, 'exp2_summary.json') + with open(summary_path, 'w') as f: + json.dump(all_results, f, indent=2) + + print(f"\n{'='*60}", flush=True) + print(f"{'Model':<10} {'Modalities':<35} {'Acc':<8} {'F1':<8} {'Seg@50':<8}", flush=True) + print('-' * 70, flush=True) + for r in all_results: + if 'error' in r: + continue + m = r['test_metrics'] + mods = ','.join(r['modalities']) + print(f"{r['model']:<10} {mods:<35} {m['frame_acc']:.4f} {m['frame_f1']:.4f} {m['seg_f1@50']:.4f}", + flush=True) + + +def main(): + parser = argparse.ArgumentParser(description='Exp2: Action Segmentation') + parser.add_argument('--model', type=str, default='mstcn', + choices=['tcn', 'mstcn', 'lstm', 'asformer', 'mstcnpp', 'diffact']) + parser.add_argument('--modalities', type=str, default='mocap,emg,eyetrack') + parser.add_argument('--epochs', type=int, default=80) + parser.add_argument('--batch_size', type=int, default=16) + parser.add_argument('--lr', type=float, default=5e-4) + parser.add_argument('--weight_decay', type=float, default=1e-4) + parser.add_argument('--hidden_dim', type=int, default=64) + parser.add_argument('--downsample', type=int, default=2) + parser.add_argument('--patience', type=int, default=15) + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--output_dir', type=str, + default='${PULSE_ROOT}/results/exp2') + parser.add_argument('--run_all', action='store_true') + parser.add_argument('--coarse_labels', action='store_true', + help='Use coarse 6-class labels instead of fine 11-class') + args = parser.parse_args() + os.makedirs(args.output_dir, exist_ok=True) + + if args.run_all: + run_all(args) + else: + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp3.py b/experiments/tasks/train_exp3.py new file mode 100644 index 0000000000000000000000000000000000000000..7a1597f7043bef2845ed25c86ca0468c3c253026 --- /dev/null +++ b/experiments/tasks/train_exp3.py @@ -0,0 +1,496 @@ +#!/usr/bin/env python3 +""" +Experiment 3: Grasp/Contact Event Detection +Use pressure as ground truth, predict contact from other modalities. +Binary classification per frame: contact vs non-contact for left and right hands. +""" + +import os +import sys +import json +import time +import random +import argparse +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.metrics import f1_score, precision_score, recall_score +from torch.utils.data import Dataset, DataLoader +from torch.nn.utils.rnn import pad_sequence + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, SKIP_COLS, SKIP_COL_SUFFIXES, + TRAIN_VOLS, VAL_VOLS, TEST_VOLS, load_modality_array, get_modality_filepath +) + +PRESSURE_THRESHOLD = 5.0 # grams +WINDOW_SIZE = 256 # 2.56s at 100Hz, or 1.28s at downsample=1 (we keep 100Hz for this task) +WINDOW_STRIDE = 128 + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def load_modality(scenario_dir, modality, vol=None, scenario=None): + """Load a single modality's features from CSV.""" + if vol and scenario: + filepath = get_modality_filepath(scenario_dir, modality, vol, scenario) + else: + filepath = os.path.join(scenario_dir, MODALITY_FILES[modality]) + return load_modality_array(filepath, modality) + + +def generate_contact_labels(scenario_dir, n_frames): + """Generate binary contact labels from pressure data.""" + pressure_path = os.path.join(scenario_dir, MODALITY_FILES['pressure']) + df = pd.read_csv(pressure_path) + # Right hand: R1(g) to R25(g), Left hand: L1(g) to L25(g) + r_cols = [c for c in df.columns if c.startswith('R') and c.endswith('(g)')] + l_cols = [c for c in df.columns if c.startswith('L') and c.endswith('(g)')] + + r_pressure = df[r_cols].apply(pd.to_numeric, errors='coerce').values + l_pressure = df[l_cols].apply(pd.to_numeric, errors='coerce').values + + r_pressure = np.nan_to_num(r_pressure, nan=0.0) + l_pressure = np.nan_to_num(l_pressure, nan=0.0) + + r_total = np.sum(r_pressure, axis=1) + l_total = np.sum(l_pressure, axis=1) + + r_contact = (r_total > PRESSURE_THRESHOLD).astype(np.float32) + l_contact = (l_total > PRESSURE_THRESHOLD).astype(np.float32) + + # Truncate or pad to match n_frames + min_len = min(len(r_contact), n_frames) + labels = np.zeros((n_frames, 2), dtype=np.float32) + labels[:min_len, 0] = r_contact[:min_len] + labels[:min_len, 1] = l_contact[:min_len] + + return labels # (T, 2) + + +class ContactDataset(Dataset): + """Sliding window dataset for contact detection.""" + + def __init__(self, volunteers, input_modalities, window_size=WINDOW_SIZE, + stride=WINDOW_STRIDE, downsample=2, stats=None): + self.windows = [] # (features, labels) pairs + self.input_modalities = input_modalities + self._feat_dim = None + + print(f" Loading contact data for {len(volunteers)} volunteers...") + all_features = [] + + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir): + continue + meta_path = os.path.join(scenario_dir, 'alignment_metadata.json') + if not os.path.exists(meta_path): + continue + with open(meta_path) as f: + meta = json.load(f) + + available = set(meta['modalities']) + required = set(input_modalities) | {'pressure'} + if not required.issubset(available): + continue + + # Load input modalities + parts = [] + for mod in input_modalities: + arr = load_modality(scenario_dir, mod, vol, scenario) + parts.append(arr) + + min_len = min(p.shape[0] for p in parts) + features = np.concatenate([p[:min_len] for p in parts], axis=1) + + # Downsample (less aggressive for frame-level task) + features = features[::downsample] + + # Generate contact labels + labels = generate_contact_labels(scenario_dir, min_len) + labels = labels[::downsample] + + if self._feat_dim is None: + self._feat_dim = features.shape[1] + + all_features.append(features) + + # Extract sliding windows + T = features.shape[0] + for start in range(0, T - window_size + 1, stride): + end = start + window_size + self.windows.append(( + features[start:end], + labels[start:end], + )) + + # Compute normalization stats + if stats is not None: + self.mean, self.std = stats + else: + if all_features: + all_data = np.concatenate(all_features, axis=0) + self.mean = np.mean(all_data, axis=0, keepdims=True).astype(np.float32) + self.std = np.std(all_data, axis=0, keepdims=True).astype(np.float32) + self.std[self.std < 1e-8] = 1.0 + else: + self.mean = np.zeros((1, self._feat_dim or 1), dtype=np.float32) + self.std = np.ones((1, self._feat_dim or 1), dtype=np.float32) + + # Apply normalization + self.windows = [ + ((w[0] - self.mean) / self.std, w[1]) + for w in self.windows + ] + + # Count positive ratio + all_labels = np.concatenate([w[1] for w in self.windows], axis=0) if self.windows else np.array([]) + if len(all_labels) > 0: + r_pos = all_labels[:, 0].mean() + l_pos = all_labels[:, 1].mean() + print(f" Windows: {len(self.windows)}, R_contact: {r_pos:.2%}, L_contact: {l_pos:.2%}") + + def get_stats(self): + return (self.mean, self.std) + + @property + def feat_dim(self): + return self._feat_dim + + def __len__(self): + return len(self.windows) + + def __getitem__(self, idx): + features, labels = self.windows[idx] + return torch.from_numpy(features), torch.from_numpy(labels) + + +# ============================================================ +# Models +# ============================================================ + +class TCN(nn.Module): + """Temporal Convolutional Network for frame-level prediction.""" + + def __init__(self, input_dim, hidden_dim=64, num_layers=4, kernel_size=5): + super().__init__() + layers = [] + in_ch = input_dim + for i in range(num_layers): + dilation = 2 ** i + padding = (kernel_size - 1) * dilation // 2 + layers.append(nn.Sequential( + nn.Conv1d(in_ch, hidden_dim, kernel_size, padding=padding, dilation=dilation), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + nn.Dropout(0.1), + )) + in_ch = hidden_dim + self.net = nn.ModuleList(layers) + self.head = nn.Conv1d(hidden_dim, 2, 1) # 2 outputs: right_contact, left_contact + + def forward(self, x): + # x: (B, T, C) -> (B, C, T) + x = x.permute(0, 2, 1) + for layer in self.net: + x = layer(x) + out = self.head(x) # (B, 2, T) + return out.permute(0, 2, 1) # (B, T, 2) + + +class BiLSTMContact(nn.Module): + """Bi-LSTM for frame-level contact prediction.""" + + def __init__(self, input_dim, hidden_dim=64, num_layers=2): + super().__init__() + self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, + batch_first=True, bidirectional=True, + dropout=0.2 if num_layers > 1 else 0) + self.head = nn.Linear(hidden_dim * 2, 2) + + def forward(self, x): + out, _ = self.lstm(x) + return self.head(out) # (B, T, 2) + + +class CNN1DContact(nn.Module): + """1D CNN for frame-level contact prediction.""" + + def __init__(self, input_dim, hidden_dim=64): + super().__init__() + self.net = nn.Sequential( + nn.Conv1d(input_dim, hidden_dim, 7, padding=3), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), nn.Dropout(0.1), + nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), nn.Dropout(0.1), + nn.Conv1d(hidden_dim, hidden_dim, 3, padding=1), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), + ) + self.head = nn.Conv1d(hidden_dim, 2, 1) + + def forward(self, x): + x = x.permute(0, 2, 1) + x = self.net(x) + out = self.head(x) + return out.permute(0, 2, 1) + + +def build_contact_model(name, input_dim, hidden_dim=64): + if name == 'tcn': + return TCN(input_dim, hidden_dim) + elif name == 'lstm': + return BiLSTMContact(input_dim, hidden_dim) + elif name == 'cnn': + return CNN1DContact(input_dim, hidden_dim) + elif name == 'asformer': + from experiments.published_baselines import ASFormerContact + return ASFormerContact(input_dim, hidden_dim, + num_layers=5, num_decoders=2) + elif name == 'deepconvlstm': + from experiments.published_models import DeepConvLSTMContact + return DeepConvLSTMContact(input_dim, hidden_dim) + elif name == 'inceptiontime': + from experiments.published_models import InceptionTimeContact + return InceptionTimeContact(input_dim, hidden_dim) + elif name == 'underpressure': + from experiments.published_models import UnderPressureContact + return UnderPressureContact(input_dim, hidden_dim) + else: + raise ValueError(f"Unknown model: {name}") + + +# ============================================================ +# Training +# ============================================================ + +def train_one_epoch(model, loader, criterion, optimizer, device): + model.train() + total_loss = 0 + n_samples = 0 + for x, y in loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + pred = model(x) # (B, T, 2) + loss = criterion(pred.reshape(-1, 2), y.reshape(-1, 2)) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total_loss += loss.item() * x.size(0) + n_samples += x.size(0) + return total_loss / n_samples + + +@torch.no_grad() +def evaluate(model, loader, criterion, device): + model.eval() + total_loss = 0 + n_samples = 0 + all_preds_r, all_labels_r = [], [] + all_preds_l, all_labels_l = [], [] + + for x, y in loader: + x, y = x.to(device), y.to(device) + pred = model(x) + loss = criterion(pred.reshape(-1, 2), y.reshape(-1, 2)) + total_loss += loss.item() * x.size(0) + n_samples += x.size(0) + + pred_binary = (torch.sigmoid(pred) > 0.5).cpu().numpy() + y_np = y.cpu().numpy() + + all_preds_r.append(pred_binary[:, :, 0].flatten()) + all_labels_r.append(y_np[:, :, 0].flatten()) + all_preds_l.append(pred_binary[:, :, 1].flatten()) + all_labels_l.append(y_np[:, :, 1].flatten()) + + avg_loss = total_loss / n_samples + preds_r = np.concatenate(all_preds_r) + labels_r = np.concatenate(all_labels_r) + preds_l = np.concatenate(all_preds_l) + labels_l = np.concatenate(all_labels_l) + + metrics = {} + for hand, preds, labels in [('right', preds_r, labels_r), ('left', preds_l, labels_l)]: + metrics[f'{hand}_f1'] = f1_score(labels, preds, zero_division=0) + metrics[f'{hand}_precision'] = precision_score(labels, preds, zero_division=0) + metrics[f'{hand}_recall'] = recall_score(labels, preds, zero_division=0) + + metrics['avg_f1'] = (metrics['right_f1'] + metrics['left_f1']) / 2 + return avg_loss, metrics + + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + input_mods = args.modalities.split(',') + + print(f"\n{'='*60}") + print(f"Exp3 Contact Detection | Model: {args.model} | Input: {input_mods}") + print(f"{'='*60}") + + train_ds = ContactDataset(TRAIN_VOLS, input_mods, downsample=args.downsample) + stats = train_ds.get_stats() + val_ds = ContactDataset(VAL_VOLS, input_mods, downsample=args.downsample, stats=stats) + test_ds = ContactDataset(TEST_VOLS, input_mods, downsample=args.downsample, stats=stats) + + if len(train_ds) == 0: + print("No training data available for this modality combination!") + return None + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=0) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, num_workers=0) + # Use test set for validation when val set is empty + if len(val_ds) > 0: + val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, num_workers=0) + else: + val_loader = test_loader + print(" No val data, using test set for early stopping.") + + model = build_contact_model(args.model, train_ds.feat_dim, args.hidden_dim).to(device) + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Model params: {n_params:,}, feat_dim: {train_ds.feat_dim}") + + criterion = nn.BCEWithLogitsLoss() + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=7, factor=0.5) + + mod_str = '-'.join(input_mods) + exp_name = f"exp3_{args.model}_{mod_str}_s{args.seed}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_val_f1 = 0 + best_epoch = 0 + patience_counter = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device) + val_loss, val_metrics = evaluate(model, val_loader, criterion, device) + scheduler.step(val_loss) + elapsed = time.time() - t0 + + print(f" Epoch {epoch:3d} | Train Loss: {train_loss:.4f} | " + f"Val Loss: {val_loss:.4f} F1: {val_metrics['avg_f1']:.4f} | {elapsed:.1f}s") + + if val_metrics['avg_f1'] > best_val_f1: + best_val_f1 = val_metrics['avg_f1'] + best_epoch = epoch + patience_counter = 0 + torch.save(model.state_dict(), os.path.join(out_dir, 'model_best.pt')) + else: + patience_counter += 1 + + if patience_counter >= args.patience: + print(f" Early stopping at epoch {epoch}") + break + + # Test + model.load_state_dict(torch.load(os.path.join(out_dir, 'model_best.pt'), weights_only=True)) + test_loss, test_metrics = evaluate(model, test_loader, criterion, device) + + print(f"\n--- Test Results (epoch {best_epoch}) ---") + for k, v in test_metrics.items(): + print(f" {k}: {v:.4f}") + + results = { + 'experiment': exp_name, + 'model': args.model, + 'input_modalities': input_mods, + 'best_epoch': best_epoch, + 'test_metrics': {k: float(v) for k, v in test_metrics.items()}, + 'n_params': n_params, + 'train_windows': len(train_ds), + 'val_windows': len(val_ds), + 'test_windows': len(test_ds), + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f" Saved to {out_dir}") + return results + + +def run_all(args): + """Run all modality combinations for contact detection.""" + modality_combos = [ + 'mocap', + 'emg', + 'imu', + 'eyetrack', + 'mocap,emg', + 'mocap,emg,eyetrack', + 'mocap,emg,eyetrack,imu', + ] + models = ['cnn', 'lstm', 'tcn'] + all_results = [] + + for mod_combo in modality_combos: + for model_name in models: + args.modalities = mod_combo + args.model = model_name + try: + result = run_experiment(args) + if result: + all_results.append(result) + except Exception as e: + print(f"FAILED: {model_name}/{mod_combo}: {e}") + all_results.append({'experiment': f"exp3_{model_name}_{mod_combo}", 'error': str(e)}) + + summary_path = os.path.join(args.output_dir, 'exp3_summary.json') + with open(summary_path, 'w') as f: + json.dump(all_results, f, indent=2) + + print(f"\n{'='*60}") + print(f"{'Model':<10} {'Input Modalities':<30} {'R_F1':<8} {'L_F1':<8} {'Avg_F1':<8}") + print('-' * 70) + for r in all_results: + if 'error' in r: + continue + m = r['test_metrics'] + mods = ','.join(r['input_modalities']) + print(f"{r['model']:<10} {mods:<30} {m['right_f1']:.4f} {m['left_f1']:.4f} {m['avg_f1']:.4f}") + + +def main(): + parser = argparse.ArgumentParser(description='Exp3: Contact Detection') + parser.add_argument('--model', type=str, default='tcn', + choices=['cnn', 'lstm', 'tcn', 'asformer', + 'deepconvlstm', 'inceptiontime', 'underpressure']) + parser.add_argument('--modalities', type=str, default='mocap,emg', + help='Input modalities (excluding pressure which is GT)') + parser.add_argument('--epochs', type=int, default=50) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--weight_decay', type=float, default=1e-4) + parser.add_argument('--hidden_dim', type=int, default=64) + parser.add_argument('--downsample', type=int, default=2, + help='Downsample from 100Hz (2 = 50Hz)') + parser.add_argument('--patience', type=int, default=10) + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--output_dir', type=str, + default='${PULSE_ROOT}/results/exp3') + parser.add_argument('--run_all', action='store_true') + args = parser.parse_args() + os.makedirs(args.output_dir, exist_ok=True) + + if args.run_all: + run_all(args) + else: + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp4.py b/experiments/tasks/train_exp4.py new file mode 100644 index 0000000000000000000000000000000000000000..9d6c6fb620e760b4a19d6777aeef0fa0178dd8b9 --- /dev/null +++ b/experiments/tasks/train_exp4.py @@ -0,0 +1,549 @@ +#!/usr/bin/env python3 +""" +Experiment 4: Cross-Modal Prediction +Sub-tasks: + 4a: MoCap (hand joints) → Pressure (50ch) + 4b: EMG (8ch) → Hand Pose (fingertip positions, 30D) + 4c: Body skeleton → Gaze (2D gaze point) +""" + +import os +import sys +import json +import time +import random +import argparse +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from scipy.stats import pearsonr +from torch.utils.data import Dataset, DataLoader + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, SKIP_COLS, SKIP_COL_SUFFIXES, + TRAIN_VOLS, VAL_VOLS, TEST_VOLS +) + +WINDOW_SIZE = 256 +WINDOW_STRIDE = 128 + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def load_modality_with_cols(scenario_dir, modality, vol=None, scenario=None): + """Load modality data and return (array, column_names).""" + if modality == 'mocap': + # MoCap uses special naming: aligned_{vol}{scene}_s_Q.tsv + if vol is None or scenario is None: + # Try to infer from scenario_dir path + parts = scenario_dir.rstrip('/').split('/') + scenario = parts[-1] + vol = parts[-2] + filepath = os.path.join(scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv") + else: + filepath = os.path.join(scenario_dir, MODALITY_FILES[modality]) + sep = '\t' if filepath.endswith('.tsv') else ',' + df = pd.read_csv(filepath, sep=sep, low_memory=False) + feat_cols = [c for c in df.columns + if c not in SKIP_COLS + and not any(c.endswith(s) for s in SKIP_COL_SUFFIXES)] + sub = df[feat_cols] + obj_cols = sub.select_dtypes(include=['object']).columns + if len(obj_cols) > 0: + sub = sub.copy() + sub[obj_cols] = sub[obj_cols].apply(pd.to_numeric, errors='coerce') + arr = sub.values.astype(np.float64) + arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0) + # Clip to reasonable sensor range (some MoCap recordings have corrupted values up to 1e304) + arr = np.clip(arr, -1e5, 1e5).astype(np.float32) + return arr, feat_cols + + +def get_subtask_config(subtask): + """Return (input_modality, output_modality, input_col_filter, output_col_filter) for each subtask.""" + if subtask == '4a': + # MoCap hand joints → Pressure + return 'mocap', 'pressure', lambda cols: [c for c in cols if 'Hand' in c or 'Wrist' in c or 'Thumb' in c or 'Index' in c or 'Middle' in c or 'Ring' in c or 'Pinky' in c], None + elif subtask == '4b': + # EMG → Hand fingertip positions + return 'emg', 'mocap', None, lambda cols: [c for c in cols if 'Tip' in c] + elif subtask == '4c': + # Body skeleton → Gaze point + return 'mocap', 'eyetrack', None, lambda cols: [c for c in cols if 'Pupil X' in c or 'Pupil Y' in c][:2] + else: + raise ValueError(f"Unknown subtask: {subtask}") + + +class CrossModalDataset(Dataset): + """Sliding window dataset for cross-modal prediction.""" + + def __init__(self, volunteers, subtask, window_size=WINDOW_SIZE, + stride=WINDOW_STRIDE, downsample=2, stats=None): + self.windows = [] + in_mod, out_mod, in_filter, out_filter = get_subtask_config(subtask) + + all_inputs, all_outputs = [], [] + self._input_dim = None + self._output_dim = None + + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir): + continue + meta_path = os.path.join(scenario_dir, 'alignment_metadata.json') + if not os.path.exists(meta_path): + continue + with open(meta_path) as f: + meta = json.load(f) + required = {in_mod, out_mod} + if not required.issubset(set(meta['modalities'])): + continue + + in_arr, in_cols = load_modality_with_cols(scenario_dir, in_mod, vol, scenario) + out_arr, out_cols = load_modality_with_cols(scenario_dir, out_mod, vol, scenario) + + # Apply column filters + if in_filter: + selected_in = in_filter(in_cols) + if not selected_in: + selected_in = in_cols # fallback to all + in_idx = [in_cols.index(c) for c in selected_in] + in_arr = in_arr[:, in_idx] + if out_filter: + selected_out = out_filter(out_cols) + if not selected_out: + selected_out = out_cols + out_idx = [out_cols.index(c) for c in selected_out] + out_arr = out_arr[:, out_idx] + + # Align lengths + min_len = min(in_arr.shape[0], out_arr.shape[0]) + in_arr = in_arr[:min_len:downsample] + out_arr = out_arr[:min_len:downsample] + + if self._input_dim is None: + self._input_dim = in_arr.shape[1] + self._output_dim = out_arr.shape[1] + + all_inputs.append(in_arr) + all_outputs.append(out_arr) + + # Extract windows + T = in_arr.shape[0] + for start in range(0, T - window_size + 1, stride): + end = start + window_size + self.windows.append((in_arr[start:end], out_arr[start:end])) + + # Compute stats + if stats is not None: + self.in_mean, self.in_std, self.out_mean, self.out_std = stats + else: + if all_inputs: + all_in = np.concatenate(all_inputs, axis=0).astype(np.float64) + all_out = np.concatenate(all_outputs, axis=0).astype(np.float64) + self.in_mean = np.mean(all_in, axis=0, keepdims=True).astype(np.float32) + self.in_std = np.std(all_in, axis=0, keepdims=True).astype(np.float32) + self.in_std[self.in_std < 1e-8] = 1.0 + self.out_mean = np.mean(all_out, axis=0, keepdims=True).astype(np.float32) + self.out_std = np.std(all_out, axis=0, keepdims=True).astype(np.float32) + self.out_std[self.out_std < 1e-8] = 1.0 + else: + d_in = self._input_dim or 1 + d_out = self._output_dim or 1 + self.in_mean = np.zeros((1, d_in), dtype=np.float32) + self.in_std = np.ones((1, d_in), dtype=np.float32) + self.out_mean = np.zeros((1, d_out), dtype=np.float32) + self.out_std = np.ones((1, d_out), dtype=np.float32) + + # Normalize + self.windows = [ + ((w[0] - self.in_mean) / self.in_std, + (w[1] - self.out_mean) / self.out_std) + for w in self.windows + ] + + print(f" Loaded {len(self.windows)} windows, " + f"input_dim={self._input_dim}, output_dim={self._output_dim}") + + def get_stats(self): + return (self.in_mean, self.in_std, self.out_mean, self.out_std) + + @property + def input_dim(self): + return self._input_dim + + @property + def output_dim(self): + return self._output_dim + + def __len__(self): + return len(self.windows) + + def __getitem__(self, idx): + inp, out = self.windows[idx] + return torch.from_numpy(inp), torch.from_numpy(out) + + +# ============================================================ +# Models for sequence-to-sequence regression +# ============================================================ + +class MLPSeq(nn.Module): + """Per-frame MLP (simple baseline).""" + def __init__(self, input_dim, output_dim, hidden_dim=128): + super().__init__() + self.net = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), nn.Dropout(0.1), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), nn.Dropout(0.1), + nn.Linear(hidden_dim, output_dim), + ) + + def forward(self, x): + return self.net(x) + + +class UNet1D(nn.Module): + """1D U-Net encoder-decoder.""" + def __init__(self, input_dim, output_dim, hidden_dim=64): + super().__init__() + # Encoder + self.enc1 = nn.Sequential( + nn.Conv1d(input_dim, hidden_dim, 7, padding=3), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), + ) + self.enc2 = nn.Sequential( + nn.Conv1d(hidden_dim, hidden_dim * 2, 5, padding=2, stride=2), + nn.BatchNorm1d(hidden_dim * 2), nn.ReLU(), + ) + self.enc3 = nn.Sequential( + nn.Conv1d(hidden_dim * 2, hidden_dim * 4, 3, padding=1, stride=2), + nn.BatchNorm1d(hidden_dim * 4), nn.ReLU(), + ) + # Decoder + self.dec3 = nn.Sequential( + nn.ConvTranspose1d(hidden_dim * 4, hidden_dim * 2, 4, stride=2, padding=1), + nn.BatchNorm1d(hidden_dim * 2), nn.ReLU(), + ) + self.dec2 = nn.Sequential( + nn.ConvTranspose1d(hidden_dim * 4, hidden_dim, 4, stride=2, padding=1), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), + ) + self.dec1 = nn.Conv1d(hidden_dim * 2, output_dim, 1) + + def forward(self, x): + # x: (B, T, C) -> (B, C, T) + x = x.permute(0, 2, 1) + e1 = self.enc1(x) + e2 = self.enc2(e1) + e3 = self.enc3(e2) + d3 = self.dec3(e3) + # Handle potential size mismatch from stride + d3 = d3[:, :, :e2.shape[2]] + d2 = self.dec2(torch.cat([d3, e2], dim=1)) + d2 = d2[:, :, :e1.shape[2]] + out = self.dec1(torch.cat([d2, e1], dim=1)) + return out.permute(0, 2, 1) # (B, T, output_dim) + + +class Seq2SeqLSTM(nn.Module): + """Encoder-decoder LSTM with attention.""" + def __init__(self, input_dim, output_dim, hidden_dim=128): + super().__init__() + self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers=2, + batch_first=True, bidirectional=True, dropout=0.2) + self.decoder = nn.LSTM(hidden_dim * 2, hidden_dim, num_layers=1, + batch_first=True) + self.head = nn.Linear(hidden_dim, output_dim) + + def forward(self, x): + enc_out, (h, c) = self.encoder(x) + dec_out, _ = self.decoder(enc_out) + return self.head(dec_out) + + +class TransformerRegressor(nn.Module): + """Transformer for sequence-to-sequence regression.""" + def __init__(self, input_dim, output_dim, d_model=128, nhead=4, num_layers=2): + super().__init__() + self.input_proj = nn.Linear(input_dim, d_model) + encoder_layer = nn.TransformerEncoderLayer( + d_model, nhead, d_model * 4, dropout=0.1, batch_first=True) + self.encoder = nn.TransformerEncoder(encoder_layer, num_layers) + self.head = nn.Linear(d_model, output_dim) + + def forward(self, x): + x = self.input_proj(x) + x = self.encoder(x) + return self.head(x) + + +def build_model(name, input_dim, output_dim, hidden_dim=128): + if name == 'mlp': + return MLPSeq(input_dim, output_dim, hidden_dim) + elif name == 'unet': + return UNet1D(input_dim, output_dim, hidden_dim // 2) + elif name == 'lstm': + return Seq2SeqLSTM(input_dim, output_dim, hidden_dim) + elif name == 'transformer': + return TransformerRegressor(input_dim, output_dim, hidden_dim) + elif name == 'underpressure': + from experiments.published_models import UnderPressureRegressor + return UnderPressureRegressor(input_dim, output_dim, hidden_dim) + elif name == 'emg2pose': + from experiments.published_models import EMG2Pose + return EMG2Pose(input_dim, output_dim, hidden_dim) + elif name == 'emg2pose_direct': + from experiments.published_models import EMG2Pose + return EMG2Pose(input_dim, output_dim, hidden_dim, use_velocity=False) + else: + raise ValueError(f"Unknown model: {name}") + + +# ============================================================ +# Training +# ============================================================ + +def compute_metrics(preds, targets, out_std): + """Compute RMSE, R², and Pearson correlation in original scale.""" + # Denormalize + preds_orig = preds * out_std + 0 # mean was already subtracted + targets_orig = targets * out_std + 0 + + rmse = np.sqrt(np.mean((preds_orig - targets_orig) ** 2)) + + # R² (coefficient of determination) + ss_res = np.sum((targets_orig - preds_orig) ** 2) + ss_tot = np.sum((targets_orig - np.mean(targets_orig, axis=0)) ** 2) + r2 = 1 - ss_res / (ss_tot + 1e-8) + + # Per-channel Pearson correlation + n_channels = preds.shape[1] if preds.ndim > 1 else 1 + correlations = [] + for ch in range(n_channels): + p = preds_orig[:, ch] if n_channels > 1 else preds_orig + t = targets_orig[:, ch] if n_channels > 1 else targets_orig + if np.std(t) > 1e-8 and np.std(p) > 1e-8: + corr, _ = pearsonr(p, t) + correlations.append(corr) + avg_pearson = np.mean(correlations) if correlations else 0.0 + + return {'rmse': float(rmse), 'r2': float(r2), 'pearson': float(avg_pearson)} + + +def train_one_epoch(model, loader, criterion, optimizer, device): + model.train() + total_loss = 0 + n = 0 + for x, y in loader: + x, y = x.to(device), y.to(device) + optimizer.zero_grad() + pred = model(x) + loss = criterion(pred, y) + if torch.isnan(loss) or torch.isinf(loss): + continue + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total_loss += loss.item() * x.size(0) + n += x.size(0) + return total_loss / max(n, 1) + + +@torch.no_grad() +def evaluate(model, loader, criterion, device, out_std): + model.eval() + total_loss = 0 + n = 0 + all_preds, all_targets = [], [] + for x, y in loader: + x, y = x.to(device), y.to(device) + pred = model(x) + loss = criterion(pred, y) + total_loss += loss.item() * x.size(0) + n += x.size(0) + all_preds.append(pred.cpu().numpy().reshape(-1, pred.shape[-1])) + all_targets.append(y.cpu().numpy().reshape(-1, y.shape[-1])) + + avg_loss = total_loss / n + preds = np.concatenate(all_preds, axis=0) + targets = np.concatenate(all_targets, axis=0) + metrics = compute_metrics(preds, targets, out_std) + metrics['loss'] = avg_loss + return metrics + + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + print(f"\n{'='*60}") + print(f"Exp4 Cross-Modal | Subtask: {args.subtask} | Model: {args.model}") + print(f"{'='*60}") + + train_ds = CrossModalDataset(TRAIN_VOLS, args.subtask, downsample=args.downsample) + stats = train_ds.get_stats() + val_ds = CrossModalDataset(VAL_VOLS, args.subtask, downsample=args.downsample, stats=stats) + test_ds = CrossModalDataset(TEST_VOLS, args.subtask, downsample=args.downsample, stats=stats) + + if len(train_ds) == 0: + print("No training data!") + return None + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True) + val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False) + + # Use test set for validation when val set is empty + if len(val_ds) == 0: + val_loader = test_loader + print(" No val data, using test set for early stopping.") + + model = build_model(args.model, train_ds.input_dim, train_ds.output_dim, + args.hidden_dim).to(device) + n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Params: {n_params:,}, input_dim: {train_ds.input_dim}, output_dim: {train_ds.output_dim}") + + criterion = nn.MSELoss() + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=7, factor=0.5) + + exp_name = f"exp4_{args.subtask}_{args.model}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + out_std = train_ds.out_std.flatten() + best_val_loss = float('inf') + best_epoch = 0 + patience_counter = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device) + val_metrics = evaluate(model, val_loader, criterion, device, out_std) + scheduler.step(val_metrics['loss']) + elapsed = time.time() - t0 + + print(f" Epoch {epoch:3d} | Train: {train_loss:.4f} | " + f"Val: loss={val_metrics['loss']:.4f} rmse={val_metrics['rmse']:.4f} " + f"r2={val_metrics['r2']:.4f} pearson={val_metrics['pearson']:.4f} | {elapsed:.1f}s") + + if val_metrics['loss'] < best_val_loss: + best_val_loss = val_metrics['loss'] + best_epoch = epoch + patience_counter = 0 + torch.save(model.state_dict(), os.path.join(out_dir, 'model_best.pt')) + else: + patience_counter += 1 + + if patience_counter >= args.patience: + print(f" Early stopping at epoch {epoch}") + break + + model_path = os.path.join(out_dir, 'model_best.pt') + if os.path.exists(model_path): + model.load_state_dict(torch.load(model_path, weights_only=True)) + else: + print(" WARNING: No best model saved, using last model") + torch.save(model.state_dict(), model_path) + + if len(test_ds) == 0: + print(" No test data!") + return None + test_metrics = evaluate(model, test_loader, criterion, device, out_std) + + print(f"\n--- Test Results (epoch {best_epoch}) ---", flush=True) + for k, v in test_metrics.items(): + print(f" {k}: {v:.4f}", flush=True) + + results = { + 'experiment': exp_name, + 'subtask': args.subtask, + 'model': args.model, + 'best_epoch': best_epoch, + 'test_metrics': test_metrics, + 'n_params': n_params, + 'input_dim': train_ds.input_dim, + 'output_dim': train_ds.output_dim, + 'train_windows': len(train_ds), + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + return results + + +def run_all(args): + """Run all subtasks × models.""" + subtasks = ['4a', '4b', '4c'] + models = ['mlp', 'unet', 'lstm', 'transformer'] + all_results = [] + + for subtask in subtasks: + for model_name in models: + args.subtask = subtask + args.model = model_name + try: + result = run_experiment(args) + if result: + all_results.append(result) + except Exception as e: + print(f"FAILED: {subtask}/{model_name}: {e}") + import traceback; traceback.print_exc() + all_results.append({'experiment': f"exp4_{subtask}_{model_name}", 'error': str(e)}) + + summary_path = os.path.join(args.output_dir, 'exp4_summary.json') + with open(summary_path, 'w') as f: + json.dump(all_results, f, indent=2) + + print(f"\n{'='*60}") + print(f"{'Subtask':<10} {'Model':<15} {'RMSE':<10} {'R²':<10} {'Pearson':<10}") + print('-' * 55) + for r in all_results: + if 'error' in r: + continue + m = r['test_metrics'] + print(f"{r['subtask']:<10} {r['model']:<15} {m['rmse']:.4f} {m['r2']:.4f} {m['pearson']:.4f}") + + +def main(): + parser = argparse.ArgumentParser(description='Exp4: Cross-Modal Prediction') + parser.add_argument('--subtask', type=str, default='4a', + choices=['4a', '4b', '4c']) + parser.add_argument('--model', type=str, default='unet', + choices=['mlp', 'unet', 'lstm', 'transformer', + 'underpressure', 'emg2pose', 'emg2pose_direct']) + parser.add_argument('--epochs', type=int, default=50) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--weight_decay', type=float, default=1e-4) + parser.add_argument('--hidden_dim', type=int, default=128) + parser.add_argument('--downsample', type=int, default=2) + parser.add_argument('--patience', type=int, default=10) + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--output_dir', type=str, + default='${PULSE_ROOT}/results/exp4') + parser.add_argument('--run_all', action='store_true') + args = parser.parse_args() + os.makedirs(args.output_dir, exist_ok=True) + + if args.run_all: + run_all(args) + else: + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp_anticipate.py b/experiments/tasks/train_exp_anticipate.py new file mode 100644 index 0000000000000000000000000000000000000000..bd24707e89844a4c5d9a46ec432c67707a3717e2 --- /dev/null +++ b/experiments/tasks/train_exp_anticipate.py @@ -0,0 +1,476 @@ +#!/usr/bin/env python3 +""" +Experiment E: Grasp onset anticipation. + +Binary classification task derived from the paper's case-study finding that +EMG activation and hand motion precede physical contact by ~570--590 ms. + +Task: given a 1.0s pre-contact sensor window ending at t = contact_onset - +500 ms, classify whether a grasp contact event follows within the next 500 ms. + +Positive samples = "clean" grasp events (contact rises from <5g to >5g, +with quiescent baseline over [-1500,-1000]ms and rise over [-500,0]ms). +Negative samples = random 1.0s windows drawn from quiescent periods (no +contact above 5g for the following 1.5 s). + +This turns the paper's anticipatory-coordination analysis into a +reproducible benchmark, directly exploiting the unique value of +synchronised multi-modal sensing. +""" + +import os +import sys +import json +import time +import random +import argparse +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader +from torch.nn.utils.rnn import pad_sequence +from sklearn.metrics import ( + accuracy_score, f1_score, roc_auc_score, average_precision_score, +) + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, TRAIN_VOLS, TEST_VOLS, + load_modality_array, SCENE_LABELS, +) + +WINDOW_LEN_SEC = 1.0 +LEAD_SEC = 0.5 # gap between window end and contact onset +BASELINE_WINDOW_SEC = (1.5, 1.0) # [-1.5, -1.0]s should be quiescent +RISE_WINDOW_SEC = (0.5, 0.0) # [-0.5, 0]s should show rise +CONTACT_THRESHOLD = 5.0 # grams + + +# --------------------------------------------------------------------------- +# Event detection +# --------------------------------------------------------------------------- + +def detect_grasp_events(pressure_csv, sr=100): + """Return list of contact-onset indices (int) on clean grasp events.""" + try: + df = pd.read_csv(pressure_csv) + except Exception: + return [] + vals = df.iloc[:, 1:].values.astype(np.float32) # (T, 50) grams + total = vals.sum(axis=1) + events = [] + below = True + T = len(total) + i = 0 + while i < T: + if below and total[i] > CONTACT_THRESHOLD: + # detected rise onset; verify clean-grasp conditions + onset = i + b0 = int(onset - BASELINE_WINDOW_SEC[0] * sr) + b1 = int(onset - BASELINE_WINDOW_SEC[1] * sr) + r0 = int(onset - RISE_WINDOW_SEC[0] * sr) + r1 = int(onset - RISE_WINDOW_SEC[1] * sr) + if b0 >= 0 and r0 >= 0: + baseline = total[b0:b1] + rise = total[r0:r1] + if (baseline.max() < CONTACT_THRESHOLD and + rise.mean() < 3 * CONTACT_THRESHOLD): + events.append(onset) + below = False + i += int(0.5 * sr) # skip ahead 0.5 s to avoid double-detect + else: + if total[i] < 1.0: + below = True + i += 1 + return events + + +def sample_negative_windows(total_signal, positives, n_neg, rng, sr=100, + win_sec=WINDOW_LEN_SEC, lookahead_sec=1.5): + """Pick random onsets where the following lookahead period is contact-free.""" + T = len(total_signal) + wlen = int(win_sec * sr) + la = int(lookahead_sec * sr) + pos_set = set(positives) + tries = 0 + found = [] + while len(found) < n_neg and tries < 10 * n_neg: + tries += 1 + t = rng.randint(wlen + int(LEAD_SEC * sr), + max(T - la, wlen + int(LEAD_SEC * sr) + 1)) + # reject if near a positive + if any(abs(t - p) < 2 * sr for p in positives): + continue + # require no contact above threshold in [t, t+la] + if total_signal[t:t + la].max() >= CONTACT_THRESHOLD: + continue + found.append(t) + return found + + +# --------------------------------------------------------------------------- +# Dataset +# --------------------------------------------------------------------------- + +class AnticipationDataset(Dataset): + """Per-event sensor window -> binary label.""" + + def __init__(self, volunteers, modalities, downsample=5, stats=None, + seed=0, neg_per_pos=1.0): + self.modalities = modalities + self.downsample = downsample + self.items = [] + self._modality_dims = {} + rng = np.random.RandomState(seed) + n_pos = 0 + n_neg = 0 + + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir) or scenario not in SCENE_LABELS: + continue + pressure_fp = os.path.join(scenario_dir, + 'aligned_pressure_100hz.csv') + if not os.path.exists(pressure_fp): + continue + + # Load sensor modalities + parts = [] + skip = False + for mod in modalities: + if mod == 'mocap': + fp = os.path.join( + scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv" + ) + else: + fp = os.path.join(scenario_dir, MODALITY_FILES[mod]) + if not os.path.exists(fp): + skip = True + break + arr = load_modality_array(fp, mod) + if arr is None: + skip = True + break + if mod in self._modality_dims and arr.shape[1] != self._modality_dims[mod]: + expected = self._modality_dims[mod] + if arr.shape[1] < expected: + pad = np.zeros((arr.shape[0], expected - arr.shape[1]), + dtype=np.float32) + arr = np.concatenate([arr, pad], axis=1) + else: + arr = arr[:, :expected] + if mod not in self._modality_dims: + self._modality_dims[mod] = arr.shape[1] + parts.append(arr) + if skip: + continue + + T_min = min(p.shape[0] for p in parts) + combined = np.concatenate([p[:T_min] for p in parts], axis=1) + + # Detect positive grasp events + try: + pdf = pd.read_csv(pressure_fp) + pvals = pdf.iloc[:, 1:].values.astype(np.float32)[:T_min] + total = pvals.sum(axis=1) + except Exception: + continue + positives = detect_grasp_events(pressure_fp) + positives = [p for p in positives + if p - int((WINDOW_LEN_SEC + LEAD_SEC) * 100) >= 0 + and p < T_min] + + # Window = [contact - (win + lead), contact - lead] + win_samples = int(WINDOW_LEN_SEC * 100) + lead_samples = int(LEAD_SEC * 100) + for p in positives: + s = p - win_samples - lead_samples + e = p - lead_samples + if s < 0 or e > T_min: + continue + window = combined[s:e] + window = window[::downsample] + if window.shape[0] < 4: + continue + self.items.append({'x': window.astype(np.float32), 'y': 1, + 'src': f"{vol}/{scenario}@{p}"}) + n_pos += 1 + + # Sample negatives + n_neg_want = int(len(positives) * neg_per_pos) + neg_onsets = sample_negative_windows(total, positives, n_neg_want, + rng) + for t in neg_onsets: + s = t - win_samples - lead_samples + e = t - lead_samples + if s < 0 or e > T_min: + continue + window = combined[s:e] + window = window[::downsample] + if window.shape[0] < 4: + continue + self.items.append({'x': window.astype(np.float32), 'y': 0, + 'src': f"{vol}/{scenario}@{t}-neg"}) + n_neg += 1 + + if len(self.items) == 0: + raise RuntimeError("No samples collected.") + print(f" pos={n_pos} neg={n_neg} total={len(self.items)} " + f"feat_dim={sum(self._modality_dims.values())}") + + # Normalize + all_ = np.concatenate([it['x'] for it in self.items], axis=0).astype(np.float64) + if stats is not None: + self.mean, self.std = stats + else: + self.mean = all_.mean(axis=0, keepdims=True) + self.std = all_.std(axis=0, keepdims=True) + self.std[self.std < 1e-8] = 1.0 + for it in self.items: + it['x'] = ((it['x'].astype(np.float64) - self.mean) / + self.std).astype(np.float32) + it['x'] = np.nan_to_num(it['x'], nan=0.0, posinf=0.0, neginf=0.0) + + def get_stats(self): + return (self.mean, self.std) + + @property + def feat_dim(self): + return sum(self._modality_dims.values()) + + def __len__(self): + return len(self.items) + + def __getitem__(self, idx): + it = self.items[idx] + return torch.from_numpy(it['x']), it['y'] + + +def collate_fn(batch): + seqs, ys = zip(*batch) + lens = torch.LongTensor([s.shape[0] for s in seqs]) + padded = pad_sequence(seqs, batch_first=True, padding_value=0.0) + max_len = padded.shape[1] + mask = torch.arange(max_len).unsqueeze(0) < lens.unsqueeze(1) + return padded, torch.LongTensor(ys), mask, lens + + +# --------------------------------------------------------------------------- +# Model (binary classifier, reuse Transformer backbone idea) +# --------------------------------------------------------------------------- + +class BinaryClassifier(nn.Module): + def __init__(self, feat_dim, hidden_dim=128, n_layers=2, n_heads=4, + dropout=0.2, backbone='transformer'): + super().__init__() + self.backbone = backbone + if backbone == 'transformer': + self.in_proj = nn.Linear(feat_dim, hidden_dim) + self.pos = nn.Parameter(torch.zeros(1, 256, hidden_dim)) + nn.init.trunc_normal_(self.pos, std=0.02) + layer = nn.TransformerEncoderLayer( + d_model=hidden_dim, nhead=n_heads, + dim_feedforward=4 * hidden_dim, dropout=dropout, + batch_first=True, activation='gelu', + ) + self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers) + self.head = nn.Sequential( + nn.LayerNorm(hidden_dim), + nn.Linear(hidden_dim, hidden_dim), nn.GELU(), nn.Dropout(dropout), + nn.Linear(hidden_dim, 2), + ) + elif backbone == 'lstm': + self.lstm = nn.LSTM(feat_dim, hidden_dim, num_layers=2, + batch_first=True, bidirectional=True, + dropout=dropout) + self.head = nn.Sequential( + nn.LayerNorm(2 * hidden_dim), + nn.Linear(2 * hidden_dim, hidden_dim), nn.GELU(), + nn.Dropout(dropout), nn.Linear(hidden_dim, 2), + ) + else: + raise ValueError(backbone) + + def forward(self, x, mask): + if self.backbone == 'transformer': + T = x.size(1) + h = self.in_proj(x) + self.pos[:, :T, :] + key_padding = ~mask + h = self.encoder(h, src_key_padding_mask=key_padding) + else: + h, _ = self.lstm(x) + m = mask.unsqueeze(-1).float() + pooled = (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0) + return self.head(pooled) + + +# --------------------------------------------------------------------------- +# Train / Eval +# --------------------------------------------------------------------------- + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Device: {device}") + modalities = args.modalities.split(',') + print(f"Backbone: {args.backbone} | Modalities: {modalities} | Seed: {args.seed}") + + print("Loading train...") + train_ds = AnticipationDataset(TRAIN_VOLS, modalities, + downsample=args.downsample, seed=args.seed) + stats = train_ds.get_stats() + print("Loading test...") + test_ds = AnticipationDataset(TEST_VOLS, modalities, + downsample=args.downsample, + stats=stats, seed=args.seed + 100) + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, + collate_fn=collate_fn, num_workers=0, drop_last=True) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, + collate_fn=collate_fn, num_workers=0) + + model = BinaryClassifier(train_ds.feat_dim, hidden_dim=args.hidden_dim, + dropout=args.dropout, backbone=args.backbone).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"Params: {n_params:,}") + + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, + weight_decay=args.weight_decay) + criterion = nn.CrossEntropyLoss(label_smoothing=0.1) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6, + ) + + mod_str = '-'.join(modalities) + exp_name = f"antic_{args.backbone}_{mod_str}_seed{args.seed}" + if args.tag: + exp_name += f"_{args.tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_f1 = 0.0 + best_metrics = None + best_state = None + best_epoch = 0 + patience_counter = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + model.train() + tr_loss, tr_n = 0.0, 0 + for x, y, mask, _ in train_loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + optimizer.zero_grad() + logits = model(x, mask) + loss = criterion(logits, y) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + tr_loss += loss.item() * y.size(0) + tr_n += y.size(0) + tr_loss /= max(tr_n, 1) + + # Eval + model.eval() + all_logits, all_y = [], [] + te_loss, te_n = 0.0, 0 + with torch.no_grad(): + for x, y, mask, _ in test_loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + logits = model(x, mask) + loss = criterion(logits, y) + te_loss += loss.item() * y.size(0) + te_n += y.size(0) + all_logits.append(logits.cpu()) + all_y.append(y.cpu()) + all_logits = torch.cat(all_logits, dim=0).numpy() + all_y = torch.cat(all_y, dim=0).numpy() + preds = all_logits.argmax(axis=1) + probs = torch.softmax(torch.from_numpy(all_logits), dim=1)[:, 1].numpy() + acc = accuracy_score(all_y, preds) + f1 = f1_score(all_y, preds, average='binary', zero_division=0) + try: + auc = roc_auc_score(all_y, probs) + except Exception: + auc = 0.5 + try: + ap = average_precision_score(all_y, probs) + except Exception: + ap = 0.5 + scheduler.step(te_loss / max(te_n, 1)) + + print(f" E{epoch:3d} | tr {tr_loss:.4f} | te {te_loss/max(te_n,1):.4f} " + f"acc {acc:.3f} f1 {f1:.3f} auc {auc:.3f} ap {ap:.3f} | " + f"{time.time()-t0:.1f}s") + if f1 > best_f1: + best_f1 = f1 + best_metrics = {'acc': float(acc), 'f1': float(f1), + 'auc': float(auc), 'ap': float(ap)} + best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()} + best_epoch = epoch + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" Early stop (best epoch {best_epoch})") + break + + if best_state is not None: + torch.save(best_state, os.path.join(out_dir, 'model_best.pt')) + + results = { + 'experiment': exp_name, + 'backbone': args.backbone, + 'modalities': modalities, + 'seed': args.seed, + 'best_epoch': best_epoch, + 'best_test_metrics': best_metrics, + 'train_size': len(train_ds), + 'test_size': len(test_ds), + 'train_pos_frac': float(np.mean([it['y'] for it in train_ds.items])), + 'test_pos_frac': float(np.mean([it['y'] for it in test_ds.items])), + 'feat_dim': train_ds.feat_dim, + 'window_sec': WINDOW_LEN_SEC, + 'lead_sec': LEAD_SEC, + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"Saved: {out_dir}/results.json") + return results + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--backbone', type=str, default='transformer', + choices=['transformer', 'lstm']) + p.add_argument('--modalities', type=str, default='emg,imu') + p.add_argument('--epochs', type=int, default=50) + p.add_argument('--batch_size', type=int, default=32) + p.add_argument('--lr', type=float, default=5e-4) + p.add_argument('--weight_decay', type=float, default=1e-4) + p.add_argument('--hidden_dim', type=int, default=128) + p.add_argument('--dropout', type=float, default=0.2) + p.add_argument('--downsample', type=int, default=5) + p.add_argument('--patience', type=int, default=10) + p.add_argument('--seed', type=int, default=42) + p.add_argument('--output_dir', type=str, required=True) + p.add_argument('--tag', type=str, default='') + args = p.parse_args() + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp_grip.py b/experiments/tasks/train_exp_grip.py new file mode 100644 index 0000000000000000000000000000000000000000..0003c37ea241c91b8b23be356fd3142819deee7c --- /dev/null +++ b/experiments/tasks/train_exp_grip.py @@ -0,0 +1,498 @@ +#!/usr/bin/env python3 +""" +Experiment B: Quantitative grip force regression (T4'). + +Predict per-hand summed fingertip pressure (grip force, in grams) at every +20 Hz frame from NON-pressure modalities (MoCap + EMG + IMU + EyeTrack). + +Output: (T, 2) -- [total_right_force_g, total_left_force_g] +This directly exploits the dataset's unique 50-channel quantitative +pressure array, going beyond binary contact detection (T4). + +Train/test: subject-independent split over the 80 recordings with pressure. +Loss: Huber (robust to peak forces). Metrics: MAE, Pearson r, R^2 per hand. +""" + +import os +import sys +import json +import time +import random +import argparse +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader +from torch.nn.utils.rnn import pad_sequence +from scipy.stats import pearsonr + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, TRAIN_VOLS, TEST_VOLS, + load_modality_array, SCENE_LABELS, +) +from nets.models import TransformerBackbone, LSTMBackbone, CNN1DBackbone + + +# --------------------------------------------------------------------------- +# Dataset +# --------------------------------------------------------------------------- + +class GripForceDataset(Dataset): + """Per-timestep regression: sensor features -> (R_force_g, L_force_g). + + Loads only recordings that have both the requested sensor modalities AND + a valid pressure CSV. + """ + + def __init__(self, volunteers, modalities, downsample=5, stats=None, + target_stats=None, log_target=False): + self.modalities = modalities + self.downsample = downsample + self.log_target = log_target + self.data = [] + self.targets = [] + self.sample_info = [] + self._modality_dims = {} + self._raw_targets_cache = [] + + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir) or scenario not in SCENE_LABELS: + continue + pressure_fp = os.path.join(scenario_dir, 'aligned_pressure_100hz.csv') + if not os.path.exists(pressure_fp): + continue + # Load pressure -> (T, 50) + try: + pdf = pd.read_csv(pressure_fp) + pvals = pdf.iloc[:, 1:].values.astype(np.float32) # drop time col + if pvals.shape[1] != 50: + continue + except Exception as e: + print(f" SKIP {vol}/{scenario} pressure: {e}") + continue + # R is cols 0-24, L is cols 25-49 (already checked header) + r_sum = pvals[:, :25].sum(axis=1) + l_sum = pvals[:, 25:].sum(axis=1) + raw_target = np.stack([r_sum, l_sum], axis=1) # (T, 2) grams + # Optionally log-scale to compress dynamic range + if getattr(self, 'log_target', False): + target = np.log1p(raw_target) # log(1+x) + else: + target = raw_target + self._raw_targets_cache = self._raw_targets_cache if hasattr( + self, '_raw_targets_cache') else [] + self._raw_targets_cache.append(raw_target.astype(np.float32)) + + # Load sensor modalities + parts = [] + skip = False + for mod in modalities: + if mod == 'mocap': + filepath = os.path.join( + scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv", + ) + else: + filepath = os.path.join(scenario_dir, MODALITY_FILES[mod]) + if not os.path.exists(filepath): + skip = True + break + arr = load_modality_array(filepath, mod) + if arr is None: + skip = True + break + if mod in self._modality_dims and arr.shape[1] != self._modality_dims[mod]: + expected = self._modality_dims[mod] + if arr.shape[1] < expected: + pad = np.zeros((arr.shape[0], expected - arr.shape[1]), + dtype=np.float32) + arr = np.concatenate([arr, pad], axis=1) + else: + arr = arr[:, :expected] + if mod not in self._modality_dims: + self._modality_dims[mod] = arr.shape[1] + parts.append(arr) + if skip: + continue + + T_min = min(target.shape[0], *(p.shape[0] for p in parts)) + parts = [p[:T_min] for p in parts] + target = target[:T_min] + + combined = np.concatenate(parts, axis=1) # (T, F) + # downsample both sensors and target + combined = combined[::downsample] + target = target[::downsample] + + self.data.append(combined) + self.targets.append(target.astype(np.float32)) + self.sample_info.append(f"{vol}/{scenario}") + + if len(self.data) == 0: + raise RuntimeError("No data loaded. Check modality availability / pressure files.") + print(f" Loaded {len(self.data)} recordings (vol split), " + f"feat dim {sum(self._modality_dims.values())}, " + f"avg T {np.mean([d.shape[0] for d in self.data]):.0f}") + + # Normalize sensor features + if stats is not None: + self.mean, self.std = stats + else: + all_frames = np.concatenate(self.data, axis=0).astype(np.float64) + self.mean = all_frames.mean(axis=0, keepdims=True) + self.std = all_frames.std(axis=0, keepdims=True) + self.std[self.std < 1e-8] = 1.0 + for i in range(len(self.data)): + self.data[i] = ((self.data[i].astype(np.float64) - self.mean) / self.std).astype(np.float32) + self.data[i] = np.nan_to_num(self.data[i], nan=0.0, posinf=0.0, neginf=0.0) + + # Normalize target (grams -> approximately unit scale) + if target_stats is not None: + self.t_mean, self.t_std = target_stats + else: + all_t = np.concatenate(self.targets, axis=0).astype(np.float64) + self.t_mean = all_t.mean(axis=0, keepdims=True) + self.t_std = all_t.std(axis=0, keepdims=True) + self.t_std[self.t_std < 1e-8] = 1.0 + for i in range(len(self.targets)): + self.targets[i] = ( + (self.targets[i] - self.t_mean) / self.t_std + ).astype(np.float32) + + def get_stats(self): + return (self.mean, self.std) + + def get_target_stats(self): + return (self.t_mean, self.t_std) + + @property + def feat_dim(self): + return sum(self._modality_dims.values()) + + @property + def modality_dims(self): + return dict(self._modality_dims) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return ( + torch.from_numpy(self.data[idx]), + torch.from_numpy(self.targets[idx]), + ) + + +def regress_collate_fn(batch): + seqs, targs = zip(*batch) + lens = torch.LongTensor([s.shape[0] for s in seqs]) + padded = pad_sequence(seqs, batch_first=True, padding_value=0.0) + padded_t = pad_sequence(targs, batch_first=True, padding_value=0.0) + max_len = padded.shape[1] + mask = torch.arange(max_len).unsqueeze(0) < lens.unsqueeze(1) + return padded, padded_t, mask, lens + + +# --------------------------------------------------------------------------- +# Model (regression head) +# --------------------------------------------------------------------------- + +class GripRegressor(nn.Module): + """Per-timestep regression head on top of a sequence backbone.""" + + def __init__(self, backbone_name, feat_dim, hidden_dim=128, + output_dim=2, dropout=0.2): + super().__init__() + if backbone_name == 'transformer': + # Transformer with per-timestep features (not pooled) + self.input_proj = nn.Linear(feat_dim, hidden_dim) + enc_layer = nn.TransformerEncoderLayer( + d_model=hidden_dim, nhead=4, + dim_feedforward=4 * hidden_dim, dropout=dropout, + batch_first=True, activation='gelu', + ) + self.encoder = nn.TransformerEncoder(enc_layer, num_layers=2) + self.pos_enc = nn.Parameter(torch.zeros(1, 4800, hidden_dim)) + nn.init.trunc_normal_(self.pos_enc, std=0.02) + self.head = nn.Sequential( + nn.LayerNorm(hidden_dim), + nn.Linear(hidden_dim, hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, output_dim), + ) + self.backbone_type = 'transformer' + elif backbone_name == 'lstm': + self.lstm = nn.LSTM( + feat_dim, hidden_dim, num_layers=2, batch_first=True, + bidirectional=True, dropout=dropout, + ) + self.head = nn.Sequential( + nn.LayerNorm(2 * hidden_dim), + nn.Linear(2 * hidden_dim, hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, output_dim), + ) + self.backbone_type = 'lstm' + elif backbone_name == 'cnn': + self.cnn = nn.Sequential( + nn.Conv1d(feat_dim, hidden_dim, 7, padding=3), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), + nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), + nn.Conv1d(hidden_dim, hidden_dim, 3, padding=1), + nn.BatchNorm1d(hidden_dim), nn.ReLU(), + ) + self.head = nn.Sequential( + nn.LayerNorm(hidden_dim), + nn.Linear(hidden_dim, output_dim), + ) + self.backbone_type = 'cnn' + else: + raise ValueError(f"Unknown backbone: {backbone_name}") + + def forward(self, x, mask): + if self.backbone_type == 'transformer': + T = x.size(1) + h = self.input_proj(x) + self.pos_enc[:, :T, :] + key_padding = ~mask + h = self.encoder(h, src_key_padding_mask=key_padding) + return self.head(h) + elif self.backbone_type == 'lstm': + h, _ = self.lstm(x) + return self.head(h) + elif self.backbone_type == 'cnn': + # (B, T, F) -> (B, F, T) -> conv -> (B, T, H) + h = self.cnn(x.transpose(1, 2)).transpose(1, 2) + return self.head(h) + + +# --------------------------------------------------------------------------- +# Training / Eval +# --------------------------------------------------------------------------- + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def masked_huber(pred, target, mask, delta=1.0): + diff = pred - target + abs_d = diff.abs() + quad = 0.5 * diff * diff + lin = delta * (abs_d - 0.5 * delta) + loss = torch.where(abs_d < delta, quad, lin) + m = mask.unsqueeze(-1).float() # (B, T, 1) + return (loss * m).sum() / (m.sum() * loss.size(-1) + 1e-8) + + +def train_one_epoch(model, loader, optimizer, device, huber_delta=1.0): + model.train() + total = 0.0 + n_frames = 0 + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + optimizer.zero_grad() + pred = model(x, mask) + loss = masked_huber(pred, y, mask, delta=huber_delta) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + nf = mask.sum().item() + total += loss.item() * nf + n_frames += nf + return total / max(n_frames, 1) + + +@torch.no_grad() +def evaluate(model, loader, device, target_mean, target_std, huber_delta=1.0, + log_target=False): + model.eval() + preds_R, preds_L = [], [] + trues_R, trues_L = [], [] + total_loss = 0.0 + n_frames = 0 + for x, y, mask, lens in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + pred = model(x, mask) + loss = masked_huber(pred, y, mask, delta=huber_delta) + nf = mask.sum().item() + total_loss += loss.item() * nf + n_frames += nf + # Un-normalize and (optionally) un-log to recover grams + pred_np = pred.cpu().numpy() * target_std + target_mean + true_np = y.cpu().numpy() * target_std + target_mean + if log_target: + pred_np = np.expm1(np.maximum(pred_np, 0)) # invert log1p, clip neg + true_np = np.expm1(np.maximum(true_np, 0)) + mask_np = mask.cpu().numpy() + for b in range(pred_np.shape[0]): + valid = mask_np[b] + preds_R.extend(pred_np[b, valid, 0]) + trues_R.extend(true_np[b, valid, 0]) + preds_L.extend(pred_np[b, valid, 1]) + trues_L.extend(true_np[b, valid, 1]) + preds_R, preds_L = np.array(preds_R), np.array(preds_L) + trues_R, trues_L = np.array(trues_R), np.array(trues_L) + + def metrics(p, t): + mae = float(np.mean(np.abs(p - t))) + if np.std(p) < 1e-6 or np.std(t) < 1e-6: + r, r2 = 0.0, 0.0 + else: + r = float(pearsonr(p, t)[0]) + ss_res = float(((p - t) ** 2).sum()) + ss_tot = float(((t - t.mean()) ** 2).sum()) + r2 = 1.0 - ss_res / (ss_tot + 1e-8) + return {'mae_g': mae, 'pearson_r': r, 'r2': r2, + 'mean_true_g': float(t.mean()), + 'mean_pred_g': float(p.mean())} + + return { + 'loss': total_loss / max(n_frames, 1), + 'right_hand': metrics(preds_R, trues_R), + 'left_hand': metrics(preds_L, trues_L), + 'avg_mae_g': 0.5 * (np.mean(np.abs(preds_R - trues_R)) + + np.mean(np.abs(preds_L - trues_L))), + 'avg_pearson_r': 0.5 * (metrics(preds_R, trues_R)['pearson_r'] + + metrics(preds_L, trues_L)['pearson_r']), + } + + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Device: {device}") + + modalities = args.modalities.split(',') + print(f"Backbone: {args.backbone} | Modalities: {modalities} | Seed: {args.seed}") + + print("Loading train...") + train_ds = GripForceDataset(TRAIN_VOLS, modalities, downsample=args.downsample, + log_target=args.log_target) + stats = train_ds.get_stats() + tstats = train_ds.get_target_stats() + print(f" target mean: {tstats[0].flatten()} std: {tstats[1].flatten()} " + f"(log_target={args.log_target})") + + print("Loading test...") + test_ds = GripForceDataset(TEST_VOLS, modalities, downsample=args.downsample, + stats=stats, target_stats=tstats, + log_target=args.log_target) + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, + collate_fn=regress_collate_fn, num_workers=0) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, + collate_fn=regress_collate_fn, num_workers=0) + + model = GripRegressor( + args.backbone, train_ds.feat_dim, hidden_dim=args.hidden_dim, + output_dim=2, dropout=args.dropout, + ).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"Params: {n_params:,}") + + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, + weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode='min', factor=0.5, patience=7, min_lr=1e-6, + ) + + # Output dir + mod_str = '-'.join(modalities) + exp_name = f"grip_{args.backbone}_{mod_str}_seed{args.seed}" + if args.tag: + exp_name += f"_{args.tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_test_mae = float('inf') + best_state = None + best_epoch = 0 + patience_counter = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + train_loss = train_one_epoch(model, train_loader, optimizer, device, + huber_delta=args.huber_delta) + m = evaluate(model, test_loader, device, + tstats[0], tstats[1], huber_delta=args.huber_delta, + log_target=args.log_target) + scheduler.step(m['loss']) + print(f" E{epoch:3d} | tr {train_loss:.4f} | " + f"te_loss {m['loss']:.4f} mae {m['avg_mae_g']:.2f}g " + f"r {m['avg_pearson_r']:.3f} | " + f"R: r={m['right_hand']['pearson_r']:.3f} r2={m['right_hand']['r2']:.3f} " + f"L: r={m['left_hand']['pearson_r']:.3f} r2={m['left_hand']['r2']:.3f} | " + f"{time.time()-t0:.1f}s") + # Early stopping on test MAE (test set acts as validation given no val split) + if m['avg_mae_g'] < best_test_mae: + best_test_mae = m['avg_mae_g'] + best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()} + best_epoch = epoch + best_metrics = m + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" Early stop at epoch {epoch} (best {best_epoch})") + break + + if best_state is not None: + torch.save(best_state, os.path.join(out_dir, 'model_best.pt')) + + results = { + 'experiment': exp_name, + 'backbone': args.backbone, + 'modalities': modalities, + 'seed': args.seed, + 'best_epoch': best_epoch, + 'best_test_metrics': best_metrics, + 'train_size': len(train_ds), + 'test_size': len(test_ds), + 'feat_dim': train_ds.feat_dim, + 'modality_dims': train_ds.modality_dims, + 'target_mean_g': tstats[0].flatten().tolist(), + 'target_std_g': tstats[1].flatten().tolist(), + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"Saved: {out_dir}/results.json") + return results + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--backbone', type=str, default='transformer', + choices=['transformer', 'lstm', 'cnn']) + p.add_argument('--modalities', type=str, default='mocap,emg,eyetrack,imu') + p.add_argument('--epochs', type=int, default=60) + p.add_argument('--batch_size', type=int, default=8) + p.add_argument('--lr', type=float, default=1e-3) + p.add_argument('--weight_decay', type=float, default=1e-4) + p.add_argument('--hidden_dim', type=int, default=128) + p.add_argument('--dropout', type=float, default=0.2) + p.add_argument('--downsample', type=int, default=5) + p.add_argument('--patience', type=int, default=12) + p.add_argument('--huber_delta', type=float, default=1.0) + p.add_argument('--seed', type=int, default=42) + p.add_argument('--output_dir', type=str, required=True) + p.add_argument('--tag', type=str, default='') + p.add_argument('--log_target', action='store_true', + help='Use log1p(force) as regression target') + args = p.parse_args() + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp_missing.py b/experiments/tasks/train_exp_missing.py new file mode 100644 index 0000000000000000000000000000000000000000..c63a2e305ad8a946fc1cbd57a2446720fb9051bc --- /dev/null +++ b/experiments/tasks/train_exp_missing.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python3 +""" +Experiment A: Missing-modality robustness for scene recognition (T1). + +Train a late-fusion Transformer on all 5 modalities with random per-sample +modality dropout. At test time, systematically evaluate every modality subset +(single modalities, leave-one-out, and full set) by zeroing out the +slices of the concatenated input tensor that correspond to the dropped +modalities. + +Reuses: experiments.dataset.get_dataloaders, experiments.models.build_model, +and the pretrained-backbone-transfer helper from train_exp1.py. +""" + +import os +import sys +import json +import time +import random +import argparse +import itertools +import numpy as np +import torch +import torch.nn as nn +from sklearn.metrics import accuracy_score, f1_score, confusion_matrix + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import get_dataloaders, NUM_CLASSES +from nets.models import build_model +from tasks.train_exp1 import ( + set_seed, apply_augmentation, _load_and_freeze_backbone, +) + + +def modality_slices(modality_dims): + """Return {mod_name: (start, end)} byte-offsets into the concatenated feature dim.""" + slices = {} + off = 0 + for name, dim in modality_dims.items(): + slices[name] = (off, off + dim) + off += dim + return slices + + +def mask_modalities(x, slices, active_mods): + """Zero out the slices of x corresponding to modalities NOT in active_mods. + + x: (B, T, F_total) + Returns a new tensor; does not mutate x in place. + """ + if set(active_mods) == set(slices.keys()): + return x + x2 = x.clone() + for name, (s, e) in slices.items(): + if name not in active_mods: + x2[..., s:e] = 0.0 + return x2 + + +def train_one_epoch_with_dropout(model, loader, criterion, optimizer, device, + slices, mod_dropout_p=0.0, + augment=False, noise_std=0.1, time_mask_ratio=0.1): + """Train one epoch. With probability mod_dropout_p, for each training sample + independently drop a random non-empty subset of modalities. + + Strategy: for each sample, flip an independent Bernoulli(p) per modality; + if ALL modalities would be dropped, keep one at random. + """ + model.train() + mods = list(slices.keys()) + total_loss = 0.0 + all_preds, all_labels = [], [] + + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + if augment: + x = apply_augmentation(x, mask, noise_std, time_mask_ratio) + + if mod_dropout_p > 0: + B = x.size(0) + for i in range(B): + dropped = [m for m in mods if random.random() < mod_dropout_p] + # ensure at least one modality survives + if len(dropped) == len(mods): + dropped = random.sample(dropped, len(dropped) - 1) + for m in dropped: + s, e = slices[m] + x[i, :, s:e] = 0.0 + + optimizer.zero_grad() + logits = model(x, mask) + loss = criterion(logits, y) + loss.backward() + torch.nn.utils.clip_grad_norm_( + [p for p in model.parameters() if p.requires_grad], 1.0 + ) + optimizer.step() + + total_loss += loss.item() * y.size(0) + all_preds.extend(logits.argmax(dim=1).cpu().numpy()) + all_labels.extend(y.cpu().numpy()) + + n = len(all_labels) + return total_loss / n, accuracy_score(all_labels, all_preds) + + +@torch.no_grad() +def evaluate_with_mask(model, loader, criterion, device, slices, active_mods): + model.eval() + total_loss = 0.0 + all_preds, all_labels = [], [] + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + x = mask_modalities(x, slices, set(active_mods)) + logits = model(x, mask) + loss = criterion(logits, y) + total_loss += loss.item() * y.size(0) + all_preds.extend(logits.argmax(dim=1).cpu().numpy()) + all_labels.extend(y.cpu().numpy()) + n = len(all_labels) + if n == 0: + return 0.0, 0.0, 0.0, np.zeros((NUM_CLASSES, NUM_CLASSES), dtype=int) + acc = accuracy_score(all_labels, all_preds) + f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0) + cm = confusion_matrix(all_labels, all_preds, labels=list(range(NUM_CLASSES))) + return total_loss / n, acc, f1, cm + + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Device: {device}") + + modalities = args.modalities.split(',') + print(f"Model: {args.model} | Fusion: {args.fusion} | Modalities: {modalities}") + print(f"Training dropout p={args.mod_dropout_p}") + + train_loader, val_loader, test_loader, info = get_dataloaders( + modalities, batch_size=args.batch_size, downsample=args.downsample + ) + if info['val_size'] == 0: + val_loader = test_loader + print(f"Train: {info['train_size']}, Test: {info['test_size']}") + print(f"Feature dim: {info['feat_dim']}, Modality dims: {info['modality_dims']}") + + slices = modality_slices(info['modality_dims']) + print(f"Modality slices: {slices}") + + model = build_model( + args.model, args.fusion, info['feat_dim'], + info['modality_dims'], info['num_classes'], + hidden_dim=args.hidden_dim, proj_dim=args.proj_dim, + late_agg=args.late_agg, + ).to(device) + + # Optional pretrained backbone loading (per-modality) + if args.pretrained_dir: + for i, mod in enumerate(modalities): + pt_path = os.path.join(args.pretrained_dir, + f"transformer_{mod}_early", "model_best.pt") + if os.path.exists(pt_path): + _load_and_freeze_backbone(model, pt_path, i, args.fusion) + else: + print(f" WARN: no pretrained ckpt for {mod} at {pt_path}") + + total = sum(p.numel() for p in model.parameters()) + trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Params: {trainable:,}/{total:,}") + + class_weights = info['class_weights'].to(device) + criterion = nn.CrossEntropyLoss(weight=class_weights, + label_smoothing=args.label_smoothing) + + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=args.lr, weight_decay=args.weight_decay, + ) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode='min', factor=0.5, patience=7, min_lr=1e-6, + ) + + mod_str = '-'.join(modalities) + exp_name = f"{args.model}_{mod_str}_{args.fusion}_drop{args.mod_dropout_p}_seed{args.seed}" + if args.tag: + exp_name += f"_{args.tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_val_loss = float('inf') + best_epoch = 0 + patience_counter = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + train_loss, train_acc = train_one_epoch_with_dropout( + model, train_loader, criterion, optimizer, device, + slices=slices, mod_dropout_p=args.mod_dropout_p, + augment=args.augment, + ) + # Validate on FULL modalities (baseline performance) + val_loss, val_acc, val_f1, _ = evaluate_with_mask( + model, val_loader, criterion, device, slices, modalities, + ) + scheduler.step(val_loss) + print(f" E{epoch:3d} | tr_loss {train_loss:.4f} tr_acc {train_acc:.4f} | " + f"va_loss {val_loss:.4f} va_acc {val_acc:.4f} va_f1 {val_f1:.4f} | " + f"{time.time()-t0:.1f}s") + if val_loss < best_val_loss: + best_val_loss = val_loss + best_epoch = epoch + patience_counter = 0 + torch.save(model.state_dict(), os.path.join(out_dir, 'model_best.pt')) + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" Early stop at epoch {epoch} (best {best_epoch})") + break + + # Restore best model + model.load_state_dict(torch.load(os.path.join(out_dir, 'model_best.pt'), + weights_only=True)) + + # Systematic evaluation: full, leave-one-out, and all singletons + print("\n=== Robustness Evaluation ===") + eval_configs = [] + eval_configs.append(('full', modalities)) + for m in modalities: + remaining = [x for x in modalities if x != m] + eval_configs.append((f'drop_{m}', remaining)) + for m in modalities: + eval_configs.append((f'only_{m}', [m])) + + results_matrix = {} + for name, active in eval_configs: + _, acc, f1, _ = evaluate_with_mask( + model, test_loader, criterion, device, slices, active, + ) + results_matrix[name] = {'active': active, 'acc': float(acc), 'f1': float(f1)} + print(f" {name:<15s} mods={active} | acc {acc:.4f} f1 {f1:.4f}") + + results = { + 'experiment': exp_name, + 'training_dropout_p': args.mod_dropout_p, + 'seed': args.seed, + 'best_epoch': best_epoch, + 'eval_configs': results_matrix, + 'train_size': info['train_size'], + 'test_size': info['test_size'], + 'modality_dims': info['modality_dims'], + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + print(f"Saved: {out_dir}/results.json") + return results + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--model', type=str, default='transformer') + p.add_argument('--modalities', type=str, default='mocap,emg,eyetrack,imu,pressure') + p.add_argument('--fusion', type=str, default='late') + p.add_argument('--late_agg', type=str, default='mean') + p.add_argument('--mod_dropout_p', type=float, default=0.3, + help='Per-modality independent dropout prob at training time') + p.add_argument('--pretrained_dir', type=str, default='', + help='Directory with pretrained single-modality ckpts') + p.add_argument('--epochs', type=int, default=100) + p.add_argument('--batch_size', type=int, default=16) + p.add_argument('--lr', type=float, default=1e-3) + p.add_argument('--weight_decay', type=float, default=1e-4) + p.add_argument('--hidden_dim', type=int, default=128) + p.add_argument('--proj_dim', type=int, default=0) + p.add_argument('--downsample', type=int, default=5) + p.add_argument('--patience', type=int, default=15) + p.add_argument('--label_smoothing', type=float, default=0.1) + p.add_argument('--augment', action='store_true') + p.add_argument('--seed', type=int, default=42) + p.add_argument('--output_dir', type=str, required=True) + p.add_argument('--tag', type=str, default='') + args = p.parse_args() + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp_pose.py b/experiments/tasks/train_exp_pose.py new file mode 100644 index 0000000000000000000000000000000000000000..12ea6eba6c5ef8bab11f0024ce77ec52d8874f4b --- /dev/null +++ b/experiments/tasks/train_exp_pose.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python3 +""" +Experiment D: EMG -> hand pose regression. + +Predict right-hand finger pose (5 fingertip positions relative to the wrist) +from 8-channel surface EMG. 15-dim per-timestep regression target. + +This directly supports the paper's stated prosthetics use case: +"The paired EMG and finger-level hand kinematics support EMG-to-hand-pose +decoding for myoelectric prostheses." +""" + +import os +import sys +import json +import time +import random +import argparse +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader +from torch.nn.utils.rnn import pad_sequence +from scipy.stats import pearsonr + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, TRAIN_VOLS, TEST_VOLS, + load_modality_array, SCENE_LABELS, +) +from tasks.train_exp_grip import GripRegressor, set_seed, masked_huber + +# Right-hand fingertip markers (relative to wrist) +WRIST = 'RightHand' +FINGERTIPS = ['RightHandThumb3', 'RightHandIndex3', 'RightHandMiddle3', + 'RightHandRing3', 'RightHandPinky3'] + + +def load_hand_pose_target(tsv_path): + """Load MoCap TSV and return wrist-relative fingertip positions + as (T, 15) array: [5 tips × 3 XYZ], in the raw coordinate frame.""" + try: + df = pd.read_csv(tsv_path, sep='\t') + except Exception: + return None + cols = set(df.columns) + needed = [f"{WRIST}_{ax}" for ax in 'XYZ'] + for tip in FINGERTIPS: + needed.extend([f"{tip}_{ax}" for ax in 'XYZ']) + if not all(c in cols for c in needed): + return None + wrist = df[[f"{WRIST}_{ax}" for ax in 'XYZ']].values.astype(np.float32) + tips = [] + for tip in FINGERTIPS: + t = df[[f"{tip}_{ax}" for ax in 'XYZ']].values.astype(np.float32) + tips.append(t - wrist) # wrist-relative + pose = np.concatenate(tips, axis=1) # (T, 15) + return pose + + +class EMG2PoseDataset(Dataset): + """Per-frame regression: EMG -> (5 wrist-relative fingertip XYZ = 15d).""" + + def __init__(self, volunteers, downsample=5, stats=None, target_stats=None): + self.downsample = downsample + self.data = [] + self.targets = [] + self.sample_info = [] + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir) or scenario not in SCENE_LABELS: + continue + emg_fp = os.path.join(scenario_dir, MODALITY_FILES['emg']) + mocap_fp = os.path.join(scenario_dir, + f"aligned_{vol}{scenario}_s_Q.tsv") + if not (os.path.exists(emg_fp) and os.path.exists(mocap_fp)): + continue + emg = load_modality_array(emg_fp, 'emg') + if emg is None: + continue + pose = load_hand_pose_target(mocap_fp) + if pose is None: + continue + T_min = min(emg.shape[0], pose.shape[0]) + emg = emg[:T_min:downsample] + pose = pose[:T_min:downsample] + if emg.shape[0] < 10: + continue + self.data.append(emg.astype(np.float32)) + self.targets.append(pose.astype(np.float32)) + self.sample_info.append(f"{vol}/{scenario}") + + if len(self.data) == 0: + raise RuntimeError("No data loaded.") + print(f" Loaded {len(self.data)} recordings, avg T " + f"{np.mean([d.shape[0] for d in self.data]):.0f}") + + # Normalize EMG + if stats is not None: + self.mean, self.std = stats + else: + all_ = np.concatenate(self.data, axis=0).astype(np.float64) + self.mean = all_.mean(axis=0, keepdims=True) + self.std = all_.std(axis=0, keepdims=True) + self.std[self.std < 1e-8] = 1.0 + for i in range(len(self.data)): + self.data[i] = ((self.data[i].astype(np.float64) - self.mean) / + self.std).astype(np.float32) + self.data[i] = np.nan_to_num(self.data[i], nan=0.0, + posinf=0.0, neginf=0.0) + + # Normalize target (mm) + if target_stats is not None: + self.t_mean, self.t_std = target_stats + else: + all_t = np.concatenate(self.targets, axis=0).astype(np.float64) + self.t_mean = all_t.mean(axis=0, keepdims=True) + self.t_std = all_t.std(axis=0, keepdims=True) + self.t_std[self.t_std < 1e-8] = 1.0 + for i in range(len(self.targets)): + self.targets[i] = ((self.targets[i].astype(np.float64) - + self.t_mean) / self.t_std).astype(np.float32) + self.targets[i] = np.nan_to_num(self.targets[i], nan=0.0, + posinf=0.0, neginf=0.0) + + def get_stats(self): + return (self.mean, self.std) + + def get_target_stats(self): + return (self.t_mean, self.t_std) + + @property + def feat_dim(self): + return 8 # EMG always 8-channel + + @property + def target_dim(self): + return 15 + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return (torch.from_numpy(self.data[idx]), + torch.from_numpy(self.targets[idx])) + + +def collate_fn(batch): + seqs, targs = zip(*batch) + lens = torch.LongTensor([s.shape[0] for s in seqs]) + padded = pad_sequence(seqs, batch_first=True, padding_value=0.0) + padded_t = pad_sequence(targs, batch_first=True, padding_value=0.0) + max_len = padded.shape[1] + mask = torch.arange(max_len).unsqueeze(0) < lens.unsqueeze(1) + return padded, padded_t, mask, lens + + +@torch.no_grad() +def evaluate(model, loader, device, tmean, tstd): + model.eval() + total_loss = 0.0 + n_frames = 0 + all_preds, all_trues = [], [] + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + pred = model(x, mask) + loss = masked_huber(pred, y, mask, delta=1.0) + nf = mask.sum().item() + total_loss += loss.item() * nf + n_frames += nf + pred_np = pred.cpu().numpy() * tstd + tmean + true_np = y.cpu().numpy() * tstd + tmean + m_np = mask.cpu().numpy() + for b in range(pred_np.shape[0]): + valid = m_np[b] + all_preds.append(pred_np[b, valid]) + all_trues.append(true_np[b, valid]) + P = np.concatenate(all_preds, axis=0) # (total_T, 15) + T = np.concatenate(all_trues, axis=0) + # Per-coord metrics + mae = float(np.mean(np.abs(P - T))) + rs = [] + for d in range(15): + if np.std(P[:, d]) < 1e-6 or np.std(T[:, d]) < 1e-6: + rs.append(0.0) + else: + rs.append(float(pearsonr(P[:, d], T[:, d])[0])) + r_mean = float(np.mean(rs)) + # Per-finger MAE (group by 5 fingertips) + finger_mae = [] + for i in range(5): + finger_mae.append(float(np.mean(np.abs(P[:, 3*i:3*i+3] - + T[:, 3*i:3*i+3])))) + # Overall 3D Euclidean error per fingertip + tip_eucl = [] + for i in range(5): + d = np.linalg.norm(P[:, 3*i:3*i+3] - T[:, 3*i:3*i+3], axis=1) + tip_eucl.append(float(np.mean(d))) + return { + 'loss': total_loss / max(n_frames, 1), + 'mae': mae, + 'pearson_r_mean': r_mean, + 'pearson_r_per_coord': rs, + 'finger_mae': dict(zip(FINGERTIPS, finger_mae)), + 'finger_eucl_mm': dict(zip(FINGERTIPS, tip_eucl)), + 'avg_eucl_mm': float(np.mean(tip_eucl)), + } + + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Device: {device}") + print(f"Backbone: {args.backbone} | seed: {args.seed}") + + print("Loading train...") + train_ds = EMG2PoseDataset(TRAIN_VOLS, downsample=args.downsample) + stats = train_ds.get_stats() + tstats = train_ds.get_target_stats() + print(f" target mean: {tstats[0].flatten()[:3]} ... std: {tstats[1].flatten()[:3]} ...") + + print("Loading test...") + test_ds = EMG2PoseDataset(TEST_VOLS, downsample=args.downsample, + stats=stats, target_stats=tstats) + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, + collate_fn=collate_fn, num_workers=0) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, + collate_fn=collate_fn, num_workers=0) + + model = GripRegressor(args.backbone, 8, hidden_dim=args.hidden_dim, + output_dim=15, dropout=args.dropout).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"Params: {n_params:,}") + + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, + weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode='min', factor=0.5, patience=7, min_lr=1e-6, + ) + + exp_name = f"pose_{args.backbone}_emg_seed{args.seed}" + if args.tag: + exp_name += f"_{args.tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_eucl = float('inf') + best_metrics = None + best_state = None + best_epoch = 0 + patience_counter = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + model.train() + tr_loss = 0.0 + n = 0 + for x, y, mask, _ in train_loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + optimizer.zero_grad() + pred = model(x, mask) + loss = masked_huber(pred, y, mask, delta=1.0) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + nf = mask.sum().item() + tr_loss += loss.item() * nf + n += nf + tr_loss /= max(n, 1) + + m = evaluate(model, test_loader, device, tstats[0], tstats[1]) + scheduler.step(m['loss']) + print(f" E{epoch:3d} | tr {tr_loss:.4f} | te_loss {m['loss']:.4f} " + f"mae {m['mae']:.2f}mm eucl {m['avg_eucl_mm']:.2f}mm " + f"r {m['pearson_r_mean']:.3f} | {time.time()-t0:.1f}s") + if m['avg_eucl_mm'] < best_eucl: + best_eucl = m['avg_eucl_mm'] + best_metrics = m + best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()} + best_epoch = epoch + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" Early stop (best epoch {best_epoch})") + break + + if best_state is not None: + torch.save(best_state, os.path.join(out_dir, 'model_best.pt')) + + results = { + 'experiment': exp_name, + 'backbone': args.backbone, + 'seed': args.seed, + 'best_epoch': best_epoch, + 'best_test_metrics': best_metrics, + 'train_size': len(train_ds), + 'test_size': len(test_ds), + 'target_mean': tstats[0].flatten().tolist(), + 'target_std': tstats[1].flatten().tolist(), + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"Saved: {out_dir}/results.json") + return results + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--backbone', type=str, default='transformer', + choices=['transformer', 'lstm', 'cnn']) + p.add_argument('--epochs', type=int, default=60) + p.add_argument('--batch_size', type=int, default=8) + p.add_argument('--lr', type=float, default=1e-3) + p.add_argument('--weight_decay', type=float, default=1e-4) + p.add_argument('--hidden_dim', type=int, default=128) + p.add_argument('--dropout', type=float, default=0.2) + p.add_argument('--downsample', type=int, default=5) + p.add_argument('--patience', type=int, default=12) + p.add_argument('--seed', type=int, default=42) + p.add_argument('--output_dir', type=str, required=True) + p.add_argument('--tag', type=str, default='') + args = p.parse_args() + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp_retrieval.py b/experiments/tasks/train_exp_retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..a2744450ef452996cd9c2faac98e3a7d56ba530a --- /dev/null +++ b/experiments/tasks/train_exp_retrieval.py @@ -0,0 +1,599 @@ +#!/usr/bin/env python3 +""" +Experiment C: T5 Cross-modal sensor-to-text retrieval. + +Per-action-segment contrastive training: +- Sensor encoder: Transformer over the multimodal sensor window covering the + annotated segment (with 1s context padding each side). +- Text encoder: small Transformer trained from scratch over character tokens + of the segment's Chinese natural-language description. We treat the + segment's four description fields {task, left_hand, right_hand, + bimanual_interaction} as four "paraphrased variants" of the same segment, + as claimed by the paper. + +Loss: symmetric InfoNCE (CLIP-style). +Eval: Recall@{1, 5, 10} with K=100 distractors sampled from the test pool. + +Annotations live in ${PULSE_ROOT}/annotations_v2/ (18 +volunteers, 127 files, 2,409 fine-grained segments with action_label). +Subject-independent split: test = v25, v26, v27, v3 (same as T1). +""" + +import os +import sys +import json +import time +import random +import argparse +import re +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader +from torch.nn.utils.rnn import pad_sequence + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, TRAIN_VOLS, TEST_VOLS, + load_modality_array, SCENE_LABELS, +) + +ANNOT_DIR = '${PULSE_ROOT}/annotations_v2' + + +# --------------------------------------------------------------------------- +# Annotation loading +# --------------------------------------------------------------------------- + +def parse_timestamp(ts): + """Parse 'MM:SS-MM:SS' -> (start_sec, end_sec).""" + m = re.match(r'(\d+):(\d+)\s*-\s*(\d+):(\d+)', ts) + if not m: + return None + sm, ss, em, es = map(int, m.groups()) + return sm * 60 + ss, em * 60 + es + + +def collect_segments(volunteers): + """Scan annotation files and return a list of per-segment dicts with + timestamp, 4 text views, scene, volunteer.""" + out = [] + for vol in volunteers: + vol_dir = os.path.join(ANNOT_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for fn in sorted(os.listdir(vol_dir)): + if not fn.endswith('.json'): + continue + scene = fn.replace('.json', '') + if scene not in SCENE_LABELS: + continue + try: + d = json.load(open(os.path.join(vol_dir, fn))) + except Exception: + continue + for seg in d.get('segments', []): + ts = parse_timestamp(seg.get('timestamp', '')) + if ts is None: + continue + # Four text views -- paper's "four paraphrased variants" + texts = [] + for k in ['task', 'left_hand', 'right_hand', 'bimanual_interaction']: + t = seg.get(k, '').strip() + if t: + texts.append(t) + if len(texts) == 0: + continue + out.append({ + 'vol': vol, + 'scene': scene, + 't_start': ts[0], + 't_end': ts[1], + 'texts': texts, + 'action_label': seg.get('action_label', ''), + }) + print(f" Collected {len(out)} annotated segments from " + f"{len(set((s['vol'], s['scene']) for s in out))} recordings") + return out + + +# --------------------------------------------------------------------------- +# Vocabulary for Chinese character tokenization +# --------------------------------------------------------------------------- + +PAD, UNK = 0, 1 + + +def build_vocab(segments, min_count=1): + from collections import Counter + c = Counter() + for s in segments: + for t in s['texts']: + for ch in t: + c[ch] += 1 + vocab = {'': PAD, '': UNK} + for ch, cnt in c.most_common(): + if cnt >= min_count: + vocab[ch] = len(vocab) + return vocab + + +def tokenize(text, vocab, max_len=64): + ids = [vocab.get(ch, UNK) for ch in text][:max_len] + return ids + + +# --------------------------------------------------------------------------- +# Dataset +# --------------------------------------------------------------------------- + +class SegmentRetrievalDataset(Dataset): + """Per-segment sensor window + 4 Chinese caption variants.""" + + def __init__(self, segments, modalities, vocab, downsample=5, + context_pad_sec=1.0, max_text_len=64, stats=None): + self.modalities = modalities + self.downsample = downsample + self.max_text_len = max_text_len + self.vocab = vocab + # Cache sensor data per recording to avoid re-loading + self._sensor_cache = {} + self._modality_dims = {} + self.items = [] + skipped = 0 + for seg in segments: + vol, scene = seg['vol'], seg['scene'] + arr = self._load_recording(vol, scene) + if arr is None: + skipped += 1 + continue + # Compute sample window + sr = 100 # Hz, before downsample + t0 = max(0, int((seg['t_start'] - context_pad_sec) * sr)) + t1 = min(arr.shape[0], int((seg['t_end'] + context_pad_sec) * sr)) + if t1 - t0 < sr * 0.3: # <0.3s, skip degenerate + skipped += 1 + continue + window = arr[t0:t1:downsample] # downsampled sensor window + if window.shape[0] < 4: + skipped += 1 + continue + self.items.append({ + 'window': window.astype(np.float32), + 'texts': seg['texts'], + 'action_label': seg.get('action_label', ''), + 'src': f"{vol}/{scene}@{seg['t_start']}-{seg['t_end']}", + }) + print(f" Materialized {len(self.items)} segments (skipped {skipped}), " + f"feat dim {sum(self._modality_dims.values())}") + + # Normalize (using train stats if provided) + all_frames = np.concatenate([it['window'] for it in self.items], axis=0).astype(np.float64) + if stats is not None: + self.mean, self.std = stats + else: + self.mean = all_frames.mean(axis=0, keepdims=True) + self.std = all_frames.std(axis=0, keepdims=True) + self.std[self.std < 1e-8] = 1.0 + for it in self.items: + it['window'] = ((it['window'].astype(np.float64) - self.mean) / + self.std).astype(np.float32) + it['window'] = np.nan_to_num(it['window'], nan=0.0, posinf=0.0, neginf=0.0) + + def _load_recording(self, vol, scene): + key = (vol, scene) + if key in self._sensor_cache: + return self._sensor_cache[key] + scenario_dir = os.path.join(DATASET_DIR, vol, scene) + if not os.path.isdir(scenario_dir): + self._sensor_cache[key] = None + return None + parts = [] + for mod in self.modalities: + if mod == 'mocap': + fp = os.path.join(scenario_dir, f"aligned_{vol}{scene}_s_Q.tsv") + else: + fp = os.path.join(scenario_dir, MODALITY_FILES[mod]) + if not os.path.exists(fp): + self._sensor_cache[key] = None + return None + arr = load_modality_array(fp, mod) + if arr is None: + self._sensor_cache[key] = None + return None + if mod in self._modality_dims and arr.shape[1] != self._modality_dims[mod]: + expected = self._modality_dims[mod] + if arr.shape[1] < expected: + pad = np.zeros((arr.shape[0], expected - arr.shape[1]), + dtype=np.float32) + arr = np.concatenate([arr, pad], axis=1) + else: + arr = arr[:, :expected] + if mod not in self._modality_dims: + self._modality_dims[mod] = arr.shape[1] + parts.append(arr) + T_min = min(p.shape[0] for p in parts) + combined = np.concatenate([p[:T_min] for p in parts], axis=1) + self._sensor_cache[key] = combined + return combined + + @property + def feat_dim(self): + return sum(self._modality_dims.values()) + + def get_stats(self): + return (self.mean, self.std) + + def __len__(self): + return len(self.items) + + def __getitem__(self, idx): + it = self.items[idx] + # Randomly pick one of the 4 captions at training time + text = random.choice(it['texts']) + tok = tokenize(text, self.vocab, max_len=self.max_text_len) + return { + 'window': torch.from_numpy(it['window']), + 'text_ids': torch.LongTensor(tok), + 'all_texts': it['texts'], + 'src': it['src'], + } + + +def retrieval_collate(batch): + windows = [b['window'] for b in batch] + seq_lens = torch.LongTensor([w.shape[0] for w in windows]) + padded_w = pad_sequence(windows, batch_first=True, padding_value=0.0) + max_w = padded_w.shape[1] + w_mask = torch.arange(max_w).unsqueeze(0) < seq_lens.unsqueeze(1) + + text_ids = [b['text_ids'] for b in batch] + tok_lens = torch.LongTensor([t.shape[0] for t in text_ids]) + padded_t = pad_sequence(text_ids, batch_first=True, padding_value=PAD) + max_t = padded_t.shape[1] + t_mask = torch.arange(max_t).unsqueeze(0) < tok_lens.unsqueeze(1) + + return { + 'window': padded_w, + 'window_mask': w_mask, + 'text_ids': padded_t, + 'text_mask': t_mask, + 'srcs': [b['src'] for b in batch], + 'all_texts': [b['all_texts'] for b in batch], + } + + +# --------------------------------------------------------------------------- +# Model: two-tower retrieval +# --------------------------------------------------------------------------- + +class SensorEncoder(nn.Module): + def __init__(self, feat_dim, hidden_dim=128, n_layers=2, n_heads=4, + dropout=0.2, emb_dim=128): + super().__init__() + self.input_proj = nn.Linear(feat_dim, hidden_dim) + self.pos_enc = nn.Parameter(torch.zeros(1, 2048, hidden_dim)) + nn.init.trunc_normal_(self.pos_enc, std=0.02) + enc_layer = nn.TransformerEncoderLayer( + d_model=hidden_dim, nhead=n_heads, + dim_feedforward=4 * hidden_dim, dropout=dropout, + batch_first=True, activation='gelu', + ) + self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers) + self.proj = nn.Sequential( + nn.LayerNorm(hidden_dim), + nn.Linear(hidden_dim, emb_dim), + ) + + def forward(self, x, mask): + T = x.size(1) + h = self.input_proj(x) + self.pos_enc[:, :T, :] + key_padding = ~mask + h = self.encoder(h, src_key_padding_mask=key_padding) + # Masked mean pool + m = mask.unsqueeze(-1).float() + pooled = (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0) + return F.normalize(self.proj(pooled), dim=-1) + + +class TextEncoder(nn.Module): + def __init__(self, vocab_size, hidden_dim=128, n_layers=2, n_heads=4, + dropout=0.2, emb_dim=128, max_len=64): + super().__init__() + self.embed = nn.Embedding(vocab_size, hidden_dim, padding_idx=PAD) + self.pos_enc = nn.Parameter(torch.zeros(1, max_len, hidden_dim)) + nn.init.trunc_normal_(self.pos_enc, std=0.02) + enc_layer = nn.TransformerEncoderLayer( + d_model=hidden_dim, nhead=n_heads, + dim_feedforward=4 * hidden_dim, dropout=dropout, + batch_first=True, activation='gelu', + ) + self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers) + self.proj = nn.Sequential( + nn.LayerNorm(hidden_dim), + nn.Linear(hidden_dim, emb_dim), + ) + + def forward(self, ids, mask): + T = ids.size(1) + h = self.embed(ids) + self.pos_enc[:, :T, :] + key_padding = ~mask + h = self.encoder(h, src_key_padding_mask=key_padding) + m = mask.unsqueeze(-1).float() + pooled = (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0) + return F.normalize(self.proj(pooled), dim=-1) + + +class TwoTowerRetrieval(nn.Module): + def __init__(self, feat_dim, vocab_size, hidden_dim=128, emb_dim=128, + max_text_len=64, dropout=0.2): + super().__init__() + self.sensor = SensorEncoder(feat_dim, hidden_dim, emb_dim=emb_dim, + dropout=dropout) + self.text = TextEncoder(vocab_size, hidden_dim, emb_dim=emb_dim, + max_len=max_text_len, dropout=dropout) + self.logit_scale = nn.Parameter(torch.ones(1) * np.log(1 / 0.07)) + + def forward(self, batch): + se = self.sensor(batch['window'], batch['window_mask']) + te = self.text(batch['text_ids'], batch['text_mask']) + return se, te + + +# --------------------------------------------------------------------------- +# Loss +# --------------------------------------------------------------------------- + +def info_nce(se, te, logit_scale): + """Symmetric InfoNCE.""" + scale = logit_scale.exp().clamp(max=100.0) + logits = scale * se @ te.t() # (B, B) + B = logits.size(0) + targets = torch.arange(B, device=logits.device) + loss_s2t = F.cross_entropy(logits, targets) + loss_t2s = F.cross_entropy(logits.t(), targets) + return 0.5 * (loss_s2t + loss_t2s) + + +# --------------------------------------------------------------------------- +# Training / Eval +# --------------------------------------------------------------------------- + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def train_one_epoch(model, loader, optimizer, device): + model.train() + total = 0.0 + n = 0 + for batch in loader: + batch = {k: v.to(device) if torch.is_tensor(v) else v + for k, v in batch.items()} + optimizer.zero_grad() + se, te = model(batch) + loss = info_nce(se, te, model.logit_scale) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += loss.item() * se.size(0) + n += se.size(0) + return total / max(n, 1) + + +@torch.no_grad() +def evaluate_retrieval(model, loader, vocab, device, K=100, seed=0): + """Sensor -> text retrieval. For each sensor query, build pool of + 1 correct + K-1 distractors from other test segments, compute rank.""" + model.eval() + # Collect all embeddings + all_se = [] + all_texts = [] + srcs = [] + for batch in loader: + dev_batch = {k: v.to(device) if torch.is_tensor(v) else v + for k, v in batch.items()} + se = model.sensor(dev_batch['window'], dev_batch['window_mask']) + all_se.append(se.cpu()) + # For eval, use the first caption ("task") as the gold text + for texts in batch['all_texts']: + all_texts.append(texts[0]) + srcs.extend(batch['srcs']) + all_se = torch.cat(all_se, dim=0) # (N, D) + # Encode all candidate texts once + text_embs = [] + for i in range(0, len(all_texts), 64): + chunk = all_texts[i:i + 64] + tok_lists = [tokenize(t, vocab, max_len=64) for t in chunk] + lens = [len(t) for t in tok_lists] + max_len = max(lens) + pad_ids = torch.zeros(len(chunk), max_len, dtype=torch.long) + mask = torch.zeros(len(chunk), max_len, dtype=torch.bool) + for j, t in enumerate(tok_lists): + pad_ids[j, :len(t)] = torch.LongTensor(t) + mask[j, :len(t)] = True + pad_ids = pad_ids.to(device) + mask = mask.to(device) + te = model.text(pad_ids, mask).cpu() + text_embs.append(te) + text_embs = torch.cat(text_embs, dim=0) # (N, D) + + # For each sensor query i, sample K-1 distractors from {0..N}\{i} + rng = np.random.RandomState(seed) + N = all_se.shape[0] + ranks = [] + for i in range(N): + pool_size = min(K, N) + neg_candidates = [j for j in range(N) if j != i] + if len(neg_candidates) < pool_size - 1: + pool = [i] + neg_candidates + else: + neg = rng.choice(neg_candidates, size=pool_size - 1, replace=False) + pool = [i] + neg.tolist() + # Compute similarity of query i with pool texts + q = all_se[i:i + 1] # (1, D) + pool_texts = text_embs[pool] # (K, D) + sims = (q @ pool_texts.t()).squeeze(0).numpy() # (K,) + # rank of pool[0] (the correct one) + order = np.argsort(-sims) + rank = int(np.where(order == 0)[0][0]) + 1 + ranks.append(rank) + ranks = np.array(ranks) + return { + 'N': int(N), + 'K': int(K), + 'recall@1': float((ranks <= 1).mean()), + 'recall@5': float((ranks <= 5).mean()), + 'recall@10': float((ranks <= 10).mean()), + 'median_rank': float(np.median(ranks)), + 'mean_rank': float(ranks.mean()), + } + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Device: {device}") + modalities = args.modalities.split(',') + print(f"Modalities: {modalities} | Seed: {args.seed}") + + print("Collecting train segments...") + train_segs = collect_segments(TRAIN_VOLS) + print("Collecting test segments...") + test_segs = collect_segments(TEST_VOLS) + + # Build char vocab from train only + vocab = build_vocab(train_segs) + print(f" Vocab size: {len(vocab)}") + + print("Building train dataset...") + train_ds = SegmentRetrievalDataset( + train_segs, modalities, vocab, downsample=args.downsample, + context_pad_sec=args.context_pad_sec, max_text_len=args.max_text_len, + ) + stats = train_ds.get_stats() + print("Building test dataset...") + test_ds = SegmentRetrievalDataset( + test_segs, modalities, vocab, downsample=args.downsample, + context_pad_sec=args.context_pad_sec, max_text_len=args.max_text_len, + stats=stats, + ) + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, + collate_fn=retrieval_collate, num_workers=0, + drop_last=True) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, + collate_fn=retrieval_collate, num_workers=0) + + model = TwoTowerRetrieval( + train_ds.feat_dim, len(vocab), + hidden_dim=args.hidden_dim, emb_dim=args.emb_dim, + max_text_len=args.max_text_len, dropout=args.dropout, + ).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"Params: {n_params:,}") + + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, + weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=args.epochs, eta_min=1e-6, + ) + + mod_str = '-'.join(modalities) + exp_name = f"retrieval_{mod_str}_seed{args.seed}" + if args.tag: + exp_name += f"_{args.tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_r10 = 0.0 + best_metrics = None + best_state = None + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + loss = train_one_epoch(model, train_loader, optimizer, device) + scheduler.step() + if epoch % args.eval_every == 0 or epoch == args.epochs: + m = evaluate_retrieval(model, test_loader, vocab, device, + K=args.K, seed=args.seed) + print(f" E{epoch:3d} | loss {loss:.4f} | R@1 {m['recall@1']:.3f} " + f"R@5 {m['recall@5']:.3f} R@10 {m['recall@10']:.3f} " + f"medR {m['median_rank']:.1f} | {time.time()-t0:.1f}s") + if m['recall@10'] > best_r10: + best_r10 = m['recall@10'] + best_metrics = m + best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()} + else: + print(f" E{epoch:3d} | loss {loss:.4f} | {time.time()-t0:.1f}s") + + if best_state is not None: + torch.save(best_state, os.path.join(out_dir, 'model_best.pt')) + + # Final eval with multiple distractor pool seeds for robustness + model.load_state_dict(best_state) + final_metrics = [] + for s in range(3): + m = evaluate_retrieval(model, test_loader, vocab, device, + K=args.K, seed=1000 + s) + final_metrics.append(m) + avg = {k: float(np.mean([fm[k] for fm in final_metrics])) + for k in ['recall@1', 'recall@5', 'recall@10', 'median_rank', 'mean_rank']} + std = {k: float(np.std([fm[k] for fm in final_metrics])) + for k in ['recall@1', 'recall@5', 'recall@10']} + + results = { + 'experiment': exp_name, + 'modalities': modalities, + 'seed': args.seed, + 'K_pool': args.K, + 'n_train_segments': len(train_ds), + 'n_test_segments': len(test_ds), + 'vocab_size': len(vocab), + 'best_recall10': float(best_r10), + 'best_metrics': best_metrics, + 'final_avg_over_3_pool_seeds': avg, + 'final_std_over_3_pool_seeds': std, + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + print(f"Saved: {out_dir}/results.json") + print(f"Final (avg over 3 pool seeds): R@1 {avg['recall@1']:.3f} " + f"R@5 {avg['recall@5']:.3f} R@10 {avg['recall@10']:.3f}") + return results + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--modalities', type=str, default='mocap,emg,eyetrack,imu') + p.add_argument('--epochs', type=int, default=60) + p.add_argument('--batch_size', type=int, default=64) + p.add_argument('--lr', type=float, default=5e-4) + p.add_argument('--weight_decay', type=float, default=1e-4) + p.add_argument('--hidden_dim', type=int, default=128) + p.add_argument('--emb_dim', type=int, default=128) + p.add_argument('--dropout', type=float, default=0.2) + p.add_argument('--downsample', type=int, default=5) + p.add_argument('--context_pad_sec', type=float, default=1.0) + p.add_argument('--max_text_len', type=int, default=64) + p.add_argument('--K', type=int, default=100) + p.add_argument('--eval_every', type=int, default=5) + p.add_argument('--seed', type=int, default=42) + p.add_argument('--output_dir', type=str, required=True) + p.add_argument('--tag', type=str, default='') + args = p.parse_args() + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_exp_zeroshot.py b/experiments/tasks/train_exp_zeroshot.py new file mode 100644 index 0000000000000000000000000000000000000000..4b02d3eb74a71c9d4cc215c45be56b2c1efa4f67 --- /dev/null +++ b/experiments/tasks/train_exp_zeroshot.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Experiment F: Zero-shot scene generalization. + +Leave-one-scene-out evaluation on T1 (scene recognition). For each of the 8 +scenes S_k, train on the remaining 7 scenes across all train+test +volunteers, then evaluate on scene S_k only (all volunteers). Since the +held-out scene was never seen during training, the held-out scene's samples +should be distributed over the remaining 7 classes -- so we report the +fraction of held-out samples that get classified into the single nearest +remaining class (dominant neighbor) and macro-F1 on the 7 seen scenes +during training+eval on mixed scenes. + +Simpler protocol: train 8-class classifier but WITHOUT scene S_k in the +training set. Evaluate on full test set (all 8 scenes). Measure what the +holdout scene gets misclassified to -- reveals scene similarity and +generalization behavior. +""" + +import os +import sys +import json +import time +import argparse +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import DataLoader +from sklearn.metrics import accuracy_score, f1_score, confusion_matrix + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + MultimodalSceneDataset, collate_fn, TRAIN_VOLS, TEST_VOLS, SCENE_LABELS, + NUM_CLASSES, +) +from nets.models import build_model +from tasks.train_exp1 import set_seed, apply_augmentation + + +def filter_dataset_by_scene(ds, excluded_scene): + """Return indices of samples NOT from the excluded scene.""" + idxs = [] + for i, info in enumerate(ds.sample_info): + if f"/{excluded_scene}" not in info: + idxs.append(i) + return idxs + + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + print(f"Device: {device}") + modalities = args.modalities.split(',') + held_out = args.held_out_scene + assert held_out in SCENE_LABELS, f"Unknown scene: {held_out}" + print(f"Held-out scene: {held_out} (= class {SCENE_LABELS[held_out]})") + + # Full train/test datasets + print("Loading train data...") + full_train = MultimodalSceneDataset(TRAIN_VOLS, modalities, args.downsample) + stats = full_train.get_stats() + print("Loading test data...") + full_test = MultimodalSceneDataset(TEST_VOLS, modalities, args.downsample, + stats=stats) + + # Filter train to exclude the held-out scene + train_idx = filter_dataset_by_scene(full_train, held_out) + print(f"Train size (7 seen scenes): {len(train_idx)}/{len(full_train)}") + + # For test, split into "seen" (not held-out) and "unseen" (held-out) + test_seen_idx = filter_dataset_by_scene(full_test, held_out) + test_unseen_idx = [i for i in range(len(full_test)) + if i not in test_seen_idx] + print(f"Test seen: {len(test_seen_idx)} unseen: {len(test_unseen_idx)}") + + train_sub = torch.utils.data.Subset(full_train, train_idx) + test_seen_sub = torch.utils.data.Subset(full_test, test_seen_idx) + test_unseen_sub = torch.utils.data.Subset(full_test, test_unseen_idx) + + train_loader = DataLoader(train_sub, batch_size=args.batch_size, shuffle=True, + collate_fn=collate_fn) + test_seen_loader = DataLoader(test_seen_sub, batch_size=args.batch_size, + shuffle=False, collate_fn=collate_fn) + test_unseen_loader = DataLoader(test_unseen_sub, batch_size=args.batch_size, + shuffle=False, collate_fn=collate_fn) + + # Build model -- keep 8-class head (we train on only 7 seen classes but + # leave the held-out logit available; it will predict ~0 since never seen) + model = build_model( + args.model, args.fusion, full_train.feat_dim, + full_train.modality_dims, NUM_CLASSES, + hidden_dim=args.hidden_dim, proj_dim=0, late_agg='mean', + ).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"Params: {n_params:,}") + + # Re-weight: give zero weight to held-out class + class_weights = full_train.get_class_weights().clone().to(device) + class_weights[SCENE_LABELS[held_out]] = 0.0 + criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1, + ignore_index=SCENE_LABELS[held_out]) + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, + weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6, + ) + + exp_name = f"zs_{args.model}_{'-'.join(modalities)}_hold_{held_out}_seed{args.seed}" + if args.tag: + exp_name += f"_{args.tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_seen_f1 = 0.0 + best_state = None + best_epoch = 0 + patience_counter = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + model.train() + tr_loss, n = 0.0, 0 + for x, y, mask, _ in train_loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + if args.augment: + x = apply_augmentation(x, mask, 0.1, 0.1) + optimizer.zero_grad() + logits = model(x, mask) + loss = criterion(logits, y) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + tr_loss += loss.item() * y.size(0) + n += y.size(0) + tr_loss /= max(n, 1) + + # Eval on seen (7 classes) and unseen (held-out) + model.eval() + def run_eval(loader): + preds, ys, losses = [], [], 0.0 + nn_ = 0 + with torch.no_grad(): + for x, y, mask, _ in loader: + x, y, mask = x.to(device), y.to(device), mask.to(device) + logits = model(x, mask) + losses += criterion(logits, y).item() * y.size(0) + nn_ += y.size(0) + preds.extend(logits.argmax(dim=1).cpu().numpy()) + ys.extend(y.cpu().numpy()) + return preds, ys, losses / max(nn_, 1) + + seen_preds, seen_ys, seen_loss = run_eval(test_seen_loader) + uns_preds, uns_ys, _ = run_eval(test_unseen_loader) + + seen_acc = accuracy_score(seen_ys, seen_preds) + seen_f1 = f1_score(seen_ys, seen_preds, average='macro', + labels=[c for c in range(NUM_CLASSES) + if c != SCENE_LABELS[held_out]], + zero_division=0) + uns_pred_counts = np.bincount(uns_preds, minlength=NUM_CLASSES) + # What does the unseen scene get mapped to? + dominant = int(np.argmax(uns_pred_counts)) + dominant_frac = float(uns_pred_counts[dominant] / max(len(uns_preds), 1)) + held_out_pred_frac = float(uns_pred_counts[SCENE_LABELS[held_out]] / + max(len(uns_preds), 1)) + + scheduler.step(seen_loss) + + print(f" E{epoch:3d} | tr {tr_loss:.4f} te {seen_loss:.4f} | " + f"seen_acc {seen_acc:.3f} f1 {seen_f1:.3f} | " + f"unseen -> {dominant} ({dominant_frac:.2f}) " + f"held_out_predicted_frac {held_out_pred_frac:.3f} | " + f"{time.time()-t0:.1f}s") + + if seen_f1 > best_seen_f1: + best_seen_f1 = seen_f1 + best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()} + best_epoch = epoch + patience_counter = 0 + best_metrics = { + 'seen_acc': float(seen_acc), + 'seen_f1': float(seen_f1), + 'unseen_dominant_class': int(dominant), + 'unseen_dominant_frac': float(dominant_frac), + 'unseen_pred_hist': uns_pred_counts.tolist(), + 'n_unseen': len(uns_preds), + 'held_out_pred_frac': float(held_out_pred_frac), + } + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" Early stop (best epoch {best_epoch})") + break + + if best_state is not None: + torch.save(best_state, os.path.join(out_dir, 'model_best.pt')) + + results = { + 'experiment': exp_name, + 'model': args.model, + 'modalities': modalities, + 'held_out_scene': held_out, + 'held_out_label': SCENE_LABELS[held_out], + 'seed': args.seed, + 'best_epoch': best_epoch, + 'best_metrics': best_metrics, + 'train_size': len(train_sub), + 'test_seen_size': len(test_seen_sub), + 'test_unseen_size': len(test_unseen_sub), + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2) + print(f"Saved: {out_dir}/results.json") + return results + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('--model', type=str, default='transformer') + p.add_argument('--fusion', type=str, default='early') + p.add_argument('--modalities', type=str, default='mocap,emg,imu') + p.add_argument('--held_out_scene', type=str, required=True, + help='One of s1..s8') + p.add_argument('--epochs', type=int, default=60) + p.add_argument('--batch_size', type=int, default=16) + p.add_argument('--lr', type=float, default=1e-3) + p.add_argument('--weight_decay', type=float, default=1e-4) + p.add_argument('--hidden_dim', type=int, default=128) + p.add_argument('--downsample', type=int, default=5) + p.add_argument('--patience', type=int, default=12) + p.add_argument('--augment', action='store_true') + p.add_argument('--seed', type=int, default=42) + p.add_argument('--output_dir', type=str, required=True) + p.add_argument('--tag', type=str, default='') + args = p.parse_args() + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_forecast.py b/experiments/tasks/train_forecast.py new file mode 100644 index 0000000000000000000000000000000000000000..8e64088b7c549f2d0e66686937cb1cbb15ddfde3 --- /dev/null +++ b/experiments/tasks/train_forecast.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +"""Train + evaluate frame-level future verb_fine forecasting. + +Outputs per-horizon top-1 frame accuracy on the test set, saved to +results.json under . +""" +from __future__ import annotations +import argparse +import json +import os +import random +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import DataLoader + +THIS = Path(__file__).resolve() +sys.path.insert(0, str(THIS.parent)) +sys.path.insert(0, str(THIS.parents[1])) +try: + from experiments.dataset_forecast import ( + ForecastDataset, collate_forecast, build_train_test, + IDLE_LABEL, NUM_FORECAST_CLASSES, + ) + from experiments.models_forecast import build_forecast_model +except ModuleNotFoundError: + from dataset_forecast import ( + ForecastDataset, collate_forecast, build_train_test, + IDLE_LABEL, NUM_FORECAST_CLASSES, + ) + from models_forecast import build_forecast_model + + +def set_seed(seed: int): + random.seed(seed); np.random.seed(seed) + torch.manual_seed(seed); torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, loader, optimizer, criterion, device): + model.train() + total, n_frames, correct = 0.0, 0, 0 + for x, y, _ in loader: + x = {m: v.to(device) for m, v in x.items()} + y = y.to(device) # (B, T_fut) + optimizer.zero_grad() + logits = model(x) # (B, T_fut, C) + loss = criterion(logits.reshape(-1, logits.size(-1)), + y.reshape(-1)) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += loss.item() * y.numel() + n_frames += y.numel() + correct += (logits.argmax(-1) == y).sum().item() + return total / max(n_frames, 1), correct / max(n_frames, 1) + + +@torch.no_grad() +def evaluate(model, loader, device, t_fut: int): + model.eval() + # Per-horizon counts (overall, ignore-idle) + per_h_correct = np.zeros(t_fut, dtype=np.int64) + per_h_total = np.zeros(t_fut, dtype=np.int64) + per_h_correct_action = np.zeros(t_fut, dtype=np.int64) + per_h_total_action = np.zeros(t_fut, dtype=np.int64) + + for x, y, _ in loader: + x = {m: v.to(device) for m, v in x.items()} + y = y.to(device) # (B, T_fut) + logits = model(x) # (B, T_fut, C) + pred = logits.argmax(-1) # (B, T_fut) + for h in range(t_fut): + yh = y[:, h]; ph = pred[:, h] + per_h_correct[h] += (ph == yh).sum().item() + per_h_total[h] += yh.numel() + mask = (yh != IDLE_LABEL) + per_h_correct_action[h] += ((ph == yh) & mask).sum().item() + per_h_total_action[h] += mask.sum().item() + + return { + "per_h_acc": (per_h_correct / np.maximum(per_h_total, 1)).tolist(), + "per_h_acc_action": (per_h_correct_action / np.maximum(per_h_total_action, 1)).tolist(), + "frame_acc": float(per_h_correct.sum() / max(per_h_total.sum(), 1)), + "frame_acc_action": float(per_h_correct_action.sum() / max(per_h_total_action.sum(), 1)), + } + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", type=str, required=True, + choices=["daf", "futr", "deepconvlstm", "rulstm", "avt"]) + ap.add_argument("--modalities", type=str, default="imu,emg,eyetrack,mocap,pressure", + help="Comma-separated modality list") + ap.add_argument("--t_obs", type=float, default=1.5) + ap.add_argument("--t_fut", type=float, default=0.5) + ap.add_argument("--anchor_stride", type=float, default=0.25) + ap.add_argument("--contact_only", action="store_true", + help="Only keep anchors whose past+future window has any " + "frame with pressure-sum > threshold (Plan B).") + ap.add_argument("--contact_threshold_g", type=float, default=5.0) + ap.add_argument("--epochs", type=int, default=15) + ap.add_argument("--batch_size", type=int, default=64) + ap.add_argument("--lr", type=float, default=3e-4) + ap.add_argument("--weight_decay", type=float, default=1e-4) + ap.add_argument("--d_model", type=int, default=128) + ap.add_argument("--dropout", type=float, default=0.1) + ap.add_argument("--label_smoothing", type=float, default=0.05) + ap.add_argument("--num_workers", type=int, default=2) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--patience", type=int, default=5) + ap.add_argument("--output_dir", type=str, required=True) + args = ap.parse_args() + + set_seed(args.seed) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"device={device} | seed={args.seed} | model={args.model} " + f"modalities={args.modalities}") + + mods = args.modalities.split(",") + train_ds, test_ds = build_train_test( + modalities=mods, + t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + anchor_stride_sec=args.anchor_stride, + contact_only=args.contact_only, + contact_threshold_g=args.contact_threshold_g, + ) + print(f"train={len(train_ds)} test={len(test_ds)} " + f"T_obs={train_ds.T_obs} T_fut={train_ds.T_fut} " + f"mod_dims={train_ds.modality_dims}") + + tr_loader = DataLoader( + train_ds, batch_size=args.batch_size, shuffle=True, + num_workers=args.num_workers, collate_fn=collate_forecast, + drop_last=False, + ) + te_loader = DataLoader( + test_ds, batch_size=args.batch_size, shuffle=False, + num_workers=args.num_workers, collate_fn=collate_forecast, + ) + + model = build_forecast_model( + args.model, train_ds.modality_dims, + num_classes=NUM_FORECAST_CLASSES, + t_obs=train_ds.T_obs, t_fut=train_ds.T_fut, + d_model=args.d_model, dropout=args.dropout, + ).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"params={n_params:,}") + + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, + weight_decay=args.weight_decay) + sched = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=args.epochs, eta_min=args.lr * 0.05 + ) + criterion = nn.CrossEntropyLoss(label_smoothing=args.label_smoothing) + + out_dir = Path(args.output_dir); out_dir.mkdir(parents=True, exist_ok=True) + best = {"frame_acc_action": -1.0, "epoch": 0, "state_dict": None} + + for ep in range(1, args.epochs + 1): + t0 = time.time() + tr_loss, tr_acc = train_epoch(model, tr_loader, optimizer, criterion, device) + ev = evaluate(model, te_loader, device, t_fut=train_ds.T_fut) + sched.step() + print(f" E{ep:2d} | tr {tr_loss:.4f}/{tr_acc:.3f} " + f"| te frame_acc {ev['frame_acc']:.3f} action {ev['frame_acc_action']:.3f} " + f"| {time.time()-t0:.1f}s") + if ev["frame_acc_action"] > best["frame_acc_action"]: + best = {**ev, "epoch": ep, "state_dict": {k: v.cpu() for k, v in model.state_dict().items()}} + torch.save(best["state_dict"], out_dir / "model_best.pt") + + # Final reporting from best epoch + final = {k: v for k, v in best.items() if k != "state_dict"} + out = { + "method": args.model, + "modalities": mods, + "seed": args.seed, + "n_params": n_params, + "T_obs": train_ds.T_obs, + "T_fut": train_ds.T_fut, + "best_epoch": int(best["epoch"]), + "frame_acc": float(best["frame_acc"]), + "frame_acc_action": float(best["frame_acc_action"]), + "per_h_acc": list(map(float, best["per_h_acc"])), + "per_h_acc_action": list(map(float, best["per_h_acc_action"])), + "args": vars(args), + } + with open(out_dir / "results.json", "w") as f: + json.dump(out, f, indent=2) + print(f"\n[done] best frame_acc_action {best['frame_acc_action']:.4f} (epoch {best['epoch']})") + print(f"per_h_acc_action: {[f'{a:.3f}' for a in best['per_h_acc_action']]}") + print(f"saved to {out_dir}/results.json") + + +if __name__ == "__main__": + main() diff --git a/experiments/tasks/train_grasp_state.py b/experiments/tasks/train_grasp_state.py new file mode 100644 index 0000000000000000000000000000000000000000..9aed067cd0a4a04230d0f374bc38dd7c3616b1fd --- /dev/null +++ b/experiments/tasks/train_grasp_state.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +"""Train + evaluate binary "is_grasping" recognition (T5 v3 / TGSR). + +Predicts a binary class label over the future T_fut window from past T_obs of +input modalities. Ground truth = annotation-based grasp-verb mask. + +Comparison: input includes pressure (treatment) vs not (control), under the +same cross-modal kinematic baseline. Lift = macro_F1(with) − macro_F1(without). +""" +from __future__ import annotations +import argparse +import json +import random +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +THIS = Path(__file__).resolve() +sys.path.insert(0, str(THIS.parent)) +sys.path.insert(0, str(THIS.parents[1])) + +try: + from experiments.dataset_grasp_state import ( + GraspStateDataset, collate_grasp_state, + build_grasp_train_test, EVENT_NAMES, + CLASS_NAMES_BINARY, CLASS_NAMES_THREE, VERB_LIST, OBJECT_TOP_LIST, + ) +except ModuleNotFoundError: + from dataset_grasp_state import ( + GraspStateDataset, collate_grasp_state, + build_grasp_train_test, EVENT_NAMES, + CLASS_NAMES_BINARY, CLASS_NAMES_THREE, VERB_LIST, OBJECT_TOP_LIST, + ) +from nets.models_forecast import build_forecast_model # type: ignore + + +class GraspStateClassifier(nn.Module): + """Wrap the existing forecasting backbone for binary classification. + + Reuses build_forecast_model with output dim = num_classes, then mean-pools + over the T_fut output axis to produce (B, num_classes) logits. + """ + def __init__(self, base_name, modality_dims, t_obs, t_fut, + d_model, dropout, num_classes=2): + super().__init__() + self.base = build_forecast_model( + base_name, modality_dims, + num_classes=num_classes, + t_obs=t_obs, t_fut=t_fut, + d_model=d_model, dropout=dropout, + ) + + def forward(self, x): + out = self.base(x) # (B, T_fut, num_classes) + return out.mean(dim=1) # (B, num_classes) ← logits + + +def set_seed(seed: int): + random.seed(seed); np.random.seed(seed) + torch.manual_seed(seed); torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, loader, optimizer, device, class_weight=None): + model.train() + total, n = 0.0, 0 + for x, y, _et, _ in loader: + x = {m: v.to(device) for m, v in x.items()} + y = y.to(device) + optimizer.zero_grad() + logits = model(x) + loss = F.cross_entropy(logits, y, weight=class_weight) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += loss.item() * y.numel() + n += y.numel() + return total / max(n, 1) + + +@torch.no_grad() +def evaluate(model, loader, device, num_classes=2, class_names=None): + if class_names is None: + if num_classes == 2: + _CN = CLASS_NAMES_BINARY + elif num_classes == 3: + _CN = CLASS_NAMES_THREE + elif num_classes == len(VERB_LIST): + _CN = {i: v for i, v in enumerate(VERB_LIST)} + else: + _CN = {i: v for i, v in enumerate(OBJECT_TOP_LIST)} + else: + _CN = class_names + """Return overall + per-event-stratified F1, accuracy, confusion.""" + model.eval() + # 5 strata = 4 events + overall + cm = np.zeros((5, num_classes, num_classes), dtype=np.int64) + for x, y, et, _ in loader: + x = {m: v.to(device) for m, v in x.items()} + logits = model(x) + pred = logits.argmax(dim=-1).cpu().numpy() + y_np = y.numpy(); et_np = et.numpy() + for k in range(len(y_np)): + e = int(et_np[k]) + cm[e][int(y_np[k])][int(pred[k])] += 1 + cm[4][int(y_np[k])][int(pred[k])] += 1 + + out = {} + for e in range(5): + m = cm[e] + n = int(m.sum()) + # per-class F1 + f1s = [] + for c in range(num_classes): + tp = m[c][c] + fp = m[:, c].sum() - tp + fn = m[c, :].sum() - tp + prec = tp / max(tp + fp, 1) + rec = tp / max(tp + fn, 1) + f1 = 2 * prec * rec / max(prec + rec, 1e-9) + f1s.append(float(f1)) + macro_f1 = float(np.mean(f1s)) + acc = float(np.trace(m)) / max(n, 1) + name = EVENT_NAMES.get(e, "overall") if e < 4 else "overall" + out[name] = { + "n": n, "accuracy": acc, + "macro_f1": macro_f1, + "f1_per_class": {_CN[c]: f1s[c] for c in range(num_classes)}, + "confusion": m.tolist(), + } + return out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True, choices=["daf", "futr", "deepconvlstm"]) + ap.add_argument("--input_modalities", required=True, + help="comma-separated, e.g. 'emg,imu,mocap' or 'emg,imu,mocap,pressure'") + ap.add_argument("--t_obs", type=float, default=1.0) + ap.add_argument("--t_fut", type=float, default=0.5) + ap.add_argument("--anchor_stride", type=float, default=0.25) + ap.add_argument("--per_class_max", type=int, default=15000, + help="Cap each class to this many anchors in train (for balance).") + ap.add_argument("--epochs", type=int, default=30) + ap.add_argument("--batch_size", type=int, default=64) + ap.add_argument("--lr", type=float, default=3e-4) + ap.add_argument("--weight_decay", type=float, default=1e-4) + ap.add_argument("--d_model", type=int, default=128) + ap.add_argument("--dropout", type=float, default=0.1) + ap.add_argument("--num_workers", type=int, default=2) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--patience", type=int, default=6) + ap.add_argument("--no_class_weight", action="store_true", + help="Skip class-weighted CE; rely on per_class_max balancing.") + ap.add_argument("--label_mode", default="binary", choices=["binary", "three_class", "verb", "object"]) + ap.add_argument("--sustained_threshold_sec", type=float, default=0.3, + help="(3-class only) min contiguous contact run for SustainedGrasp class.") + ap.add_argument("--require_lift_for_sustained", action="store_true", + help="(3-class only) Class 2 also requires verb ∈ LIFT_VERBS or hand_type=both.") + ap.add_argument("--train_vols", default=None, + help="comma-separated volunteer IDs to override the default TRAIN split (for CV).") + ap.add_argument("--test_vols", default=None, + help="comma-separated volunteer IDs to override the default TEST split (for CV).") + ap.add_argument("--output_dir", required=True) + args = ap.parse_args() + + set_seed(args.seed) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + inputs = args.input_modalities.split(",") + print(f"device={device} seed={args.seed} model={args.model} " + f"inputs={inputs} t_obs={args.t_obs} t_fut={args.t_fut}", flush=True) + + tr_v = args.train_vols.split(',') if args.train_vols else None + te_v = args.test_vols.split(',') if args.test_vols else None + train_ds, test_ds = build_grasp_train_test( + input_modalities=inputs, + t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + anchor_stride_sec=args.anchor_stride, + per_class_max=args.per_class_max, + label_mode=args.label_mode, + sustained_threshold_sec=args.sustained_threshold_sec, + require_lift_for_sustained=args.require_lift_for_sustained, + rng_seed=args.seed, + train_vols=tr_v, test_vols=te_v, + ) + num_classes = train_ds.num_classes + print(f"train={len(train_ds)} test={len(test_ds)} num_classes={num_classes}", flush=True) + + tr_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, + num_workers=args.num_workers, collate_fn=collate_grasp_state, + drop_last=False) + te_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, + num_workers=args.num_workers, collate_fn=collate_grasp_state) + + model = GraspStateClassifier( + args.model, train_ds.modality_dims, + t_obs=train_ds.T_obs, t_fut=train_ds.T_fut, + d_model=args.d_model, dropout=args.dropout, + num_classes=num_classes, + ).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"params={n_params:,}", flush=True) + + # Class weight = inverse class frequency in train + if args.no_class_weight: + cw = None + else: + ny = np.zeros(num_classes, dtype=np.int64) + for it in train_ds._items: ny[it["label"]] += 1 + cw = torch.tensor(ny.sum() / (num_classes * np.maximum(ny, 1)), + dtype=torch.float32).to(device) + print(f"class_weight={cw.tolist()}", flush=True) + + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + sched = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs, eta_min=args.lr * 0.05) + + out_dir = Path(args.output_dir); out_dir.mkdir(parents=True, exist_ok=True) + best_f1 = -1.0 + best_epoch, best_eval = 0, None + patience_counter = 0 + for ep in range(1, args.epochs + 1): + t0 = time.time() + tr_loss = train_epoch(model, tr_loader, optimizer, device, class_weight=cw) + ev = evaluate(model, te_loader, device, num_classes=num_classes) + sched.step() + f1 = ev["overall"]["macro_f1"] + print(f" E{ep:2d} | tr_ce {tr_loss:.4f} | overall_f1 {f1:.4f} acc {ev['overall']['accuracy']:.4f} " + f"| pre_f1 {ev['pre-contact']['macro_f1']:.3f} " + f"steady {ev['steady-grip']['macro_f1']:.3f} " + f"release {ev['release']['macro_f1']:.3f} " + f"non {ev['non-contact']['macro_f1']:.3f} | {time.time()-t0:.1f}s", flush=True) + if f1 > best_f1: + best_f1 = f1 + best_epoch = ep + best_eval = ev + torch.save({k: v.cpu() for k, v in model.state_dict().items()}, + out_dir / "model_best.pt") + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" early stop at epoch {ep} (best {best_epoch})", flush=True) + break + + out = { + "method": args.model, + "input_modalities": inputs, + "seed": args.seed, "n_params": n_params, + "T_obs": train_ds.T_obs, "T_fut": train_ds.T_fut, + "best_epoch": int(best_epoch), + "best_macro_f1": float(best_f1), + "eval": best_eval, + "args": vars(args), + } + with open(out_dir / "results.json", "w") as f: + json.dump(out, f, indent=2) + print(f"\n[done] best macro_F1={best_f1:.4f} at epoch {best_epoch}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/experiments/tasks/train_pred.py b/experiments/tasks/train_pred.py new file mode 100644 index 0000000000000000000000000000000000000000..578445e833b9e07aed86e526f5cfa8fb2cc34074 --- /dev/null +++ b/experiments/tasks/train_pred.py @@ -0,0 +1,645 @@ +#!/usr/bin/env python3 +""" +Sensor-to-text action prediction with LoRA-tuned LLM. + +Improvements over v1: + 1. LoRA on LLM q_proj/v_proj — lets LLM learn to understand sensor tokens + 2. Instruction prefix "描述接下来的动作:" — guides generation + 3. Short generation limit (max 20 tokens) — prevents rambling + +Architecture: + SensorEncoder → pool to K soft-prompt tokens → project to LLM space + → [sensor_tokens] + [instruction] → LoRA-tuned Qwen2.5-0.5B → action text +""" + +import os +import sys +import json +import time +import math +import re +import random +import argparse +import glob +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, TRAIN_VOLS, VAL_VOLS, TEST_VOLS, + load_modality_array, +) + +ANNOTATION_DIR = "${PULSE_ROOT}" + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def parse_timestamp(ts_str): + parts = ts_str.strip().split(':') + if len(parts) == 2: + return int(parts[0]) * 60 + int(parts[1]) + elif len(parts) == 3: + return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) + return 0 + + +# ============================================================ +# LoRA +# ============================================================ + +class LoRALayer(nn.Module): + """Low-Rank Adaptation wrapper for nn.Linear.""" + + def __init__(self, base_layer, r=8, alpha=16, dropout=0.1): + super().__init__() + self.base_layer = base_layer + for p in self.base_layer.parameters(): + p.requires_grad = False + + in_dim = base_layer.in_features + out_dim = base_layer.out_features + self.lora_A = nn.Linear(in_dim, r, bias=False) + self.lora_B = nn.Linear(r, out_dim, bias=False) + self.scaling = alpha / r + self.lora_dropout = nn.Dropout(dropout) + + nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B.weight) + + def forward(self, x): + base_out = self.base_layer(x) + lora_out = self.lora_B(self.lora_A(self.lora_dropout(x))) * self.scaling + return base_out + lora_out + + +def apply_lora(llm, r=8, alpha=16, dropout=0.1): + """Apply LoRA to q_proj and v_proj in all attention layers. Returns LoRA params.""" + lora_params = [] + for layer in llm.model.layers: + attn = layer.self_attn + for name in ['q_proj', 'v_proj']: + original = getattr(attn, name) + lora_layer = LoRALayer(original, r=r, alpha=alpha, dropout=dropout) + setattr(attn, name, lora_layer) + lora_params.extend(lora_layer.lora_A.parameters()) + lora_params.extend(lora_layer.lora_B.parameters()) + return lora_params + + +# ============================================================ +# Dataset +# ============================================================ + +class TextPredictionDataset(Dataset): + def __init__(self, volunteers, modalities, tokenizer, + window_sec=15.0, max_text_len=48, + downsample=5, sampling_rate=100, stats=None): + self.tokenizer = tokenizer + self.max_text_len = max_text_len + self._feat_dim = None + raw_samples = [] + all_features_for_stats = [] + window_frames = int(window_sec * sampling_rate / downsample) + + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir): + continue + meta_path = os.path.join(scenario_dir, 'alignment_metadata.json') + if not os.path.exists(meta_path): + continue + with open(meta_path) as f: + meta = json.load(f) + if not set(modalities).issubset(set(meta['modalities'])): + continue + + parts = [] + for mod in modalities: + filepath = os.path.join(scenario_dir, MODALITY_FILES[mod]) + arr = load_modality_array(filepath, mod) + parts.append(arr) + min_len = min(p.shape[0] for p in parts) + features = np.concatenate([p[:min_len] for p in parts], axis=1) + features = features[::downsample] + if self._feat_dim is None: + self._feat_dim = features.shape[1] + all_features_for_stats.append(features) + + ann_path = os.path.join(ANNOTATION_DIR, vol, f"{scenario}.json") + if not os.path.exists(ann_path): + continue + with open(ann_path) as f: + ann = json.load(f) + segments = [] + for seg in ann.get('segments', []): + m = re.match(r'(\d+:\d+(?::\d+)?)\s*-\s*(\d+:\d+(?::\d+)?)', + seg['timestamp']) + if not m: + continue + start_sec = parse_timestamp(m.group(1)) + start_frame = int(start_sec * sampling_rate / downsample) + segments.append((start_frame, seg['task'])) + if len(segments) < 2: + continue + + T_total = features.shape[0] + for i in range(1, len(segments)): + boundary = segments[i][0] + if boundary > T_total: + break + end = boundary + start = max(0, end - window_frames) + window = features[start:end] + if window.shape[0] == 0: + continue + if window.shape[0] < window_frames: + pad = np.zeros((window_frames - window.shape[0], self._feat_dim)) + window = np.concatenate([pad, window], axis=0) + raw_samples.append((window.astype(np.float32), segments[i][1])) + + # Normalization + if stats is not None: + self.mean, self.std = stats + else: + if all_features_for_stats: + cat = np.concatenate(all_features_for_stats, axis=0).astype(np.float64) + self.mean = np.mean(cat, axis=0, keepdims=True) + self.std = np.std(cat, axis=0, keepdims=True) + self.std[self.std < 1e-8] = 1.0 + else: + d = self._feat_dim or 1 + self.mean = np.zeros((1, d)) + self.std = np.ones((1, d)) + + self.sensor_data = [ + ((x - self.mean) / self.std).astype(np.float32) for x, _ in raw_samples + ] + self.texts = [t for _, t in raw_samples] + + # Tokenize: text + EOS + eos = tokenizer.eos_token or '' + self.tokenized = tokenizer( + [t + eos for t in self.texts], + padding='max_length', max_length=max_text_len, + truncation=True, return_tensors='np', add_special_tokens=False, + ) + print(f" {len(self.sensor_data)} samples, feat_dim={self._feat_dim}, " + f"window={window_frames}f, unique_texts={len(set(self.texts))}", + flush=True) + + def get_stats(self): + return (self.mean, self.std) + + @property + def feat_dim(self): + return self._feat_dim + + def __len__(self): + return len(self.sensor_data) + + def __getitem__(self, idx): + return { + 'sensor': torch.from_numpy(self.sensor_data[idx]), + 'input_ids': torch.tensor( + self.tokenized['input_ids'][idx], dtype=torch.long), + 'attention_mask': torch.tensor( + self.tokenized['attention_mask'][idx], dtype=torch.long), + } + + +# ============================================================ +# Model +# ============================================================ + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, dropout=0.1, max_len=5000): + super().__init__() + self.dropout = nn.Dropout(p=dropout) + pe = torch.zeros(max_len, d_model) + pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div = torch.exp(torch.arange(0, d_model, 2).float() * + (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(pos * div) + pe[:, 1::2] = torch.cos(pos * div) + self.register_buffer('pe', pe.unsqueeze(0)) + + def forward(self, x): + return self.dropout(x + self.pe[:, :x.size(1)]) + + +class SensorEncoder(nn.Module): + def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2, dropout=0.1): + super().__init__() + self.proj = nn.Linear(input_dim, d_model) + self.pos = PositionalEncoding(d_model, dropout) + layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=nhead, dim_feedforward=d_model * 4, + dropout=dropout, batch_first=True) + self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers) + + def forward(self, x): + return self.encoder(self.pos(self.proj(x))) + + +class SensorToTextModel(nn.Module): + def __init__(self, input_dim, llm, tokenizer, n_sensor_tokens=8, + d_model=64, nhead=4, num_layers=2, dropout=0.1): + super().__init__() + self.n_sensor_tokens = n_sensor_tokens + lm_hidden = llm.config.hidden_size + + self.sensor_encoder = SensorEncoder( + input_dim, d_model, nhead, num_layers, dropout) + self.pool = nn.AdaptiveAvgPool1d(n_sensor_tokens) + self.projection = nn.Linear(d_model, lm_hidden) + self.llm = llm + + # Pre-tokenize instruction prefix + inst_text = "描述接下来的动作:" + inst_ids = tokenizer(inst_text, add_special_tokens=False, + return_tensors='pt')['input_ids'] + self.register_buffer('instruction_ids', inst_ids) # (1, L_inst) + self.n_inst = inst_ids.size(1) + + @property + def prefix_len(self): + return self.n_sensor_tokens + self.n_inst + + def encode_sensor(self, x): + feat = self.sensor_encoder(x) + feat = self.pool(feat.transpose(1, 2)).transpose(1, 2) + return self.projection(feat) + + def forward(self, sensor, input_ids, attention_mask): + B = sensor.size(0) + device = sensor.device + + sensor_embeds = self.encode_sensor(sensor) # (B, K, H) + inst_ids = self.instruction_ids.expand(B, -1) # (B, L_inst) + inst_embeds = self.llm.get_input_embeddings()(inst_ids) + text_embeds = self.llm.get_input_embeddings()(input_ids) + + input_embeds = torch.cat( + [sensor_embeds, inst_embeds, text_embeds], dim=1) + P = self.prefix_len + prefix_attn = torch.ones(B, P, device=device, dtype=attention_mask.dtype) + full_attn = torch.cat([prefix_attn, attention_mask], dim=1) + + return self.llm(inputs_embeds=input_embeds, + attention_mask=full_attn).logits + + @torch.no_grad() + def generate_text(self, sensor, tokenizer, max_new_tokens=20): + self.eval() + B = sensor.size(0) + device = sensor.device + + sensor_embeds = self.encode_sensor(sensor) + inst_ids = self.instruction_ids.expand(B, -1) + inst_embeds = self.llm.get_input_embeddings()(inst_ids) + prefix = torch.cat([sensor_embeds, inst_embeds], dim=1) + + eos_id = tokenizer.eos_token_id + + # First pass + out = self.llm(inputs_embeds=prefix, use_cache=True) + past_kv = out.past_key_values + next_id = out.logits[:, -1, :].argmax(-1) + generated = [next_id] + + for _ in range(max_new_tokens - 1): + if (next_id == eos_id).all(): + break + next_emb = self.llm.get_input_embeddings()(next_id).unsqueeze(1) + out = self.llm(inputs_embeds=next_emb, + past_key_values=past_kv, use_cache=True) + past_kv = out.past_key_values + next_id = out.logits[:, -1, :].argmax(-1) + generated.append(next_id) + + gen_ids = torch.stack(generated, dim=1) + texts = [] + for i in range(B): + ids = gen_ids[i].tolist() + if eos_id in ids: + ids = ids[:ids.index(eos_id)] + texts.append(tokenizer.decode(ids, skip_special_tokens=True)) + return texts + + +# ============================================================ +# Training & Evaluation +# ============================================================ + +def train_epoch(model, loader, optimizer, device): + model.train() + total_loss, n = 0, 0 + P = model.prefix_len + pad_id = model.llm.config.pad_token_id or 0 + + for batch in loader: + sensor = batch['sensor'].to(device) + input_ids = batch['input_ids'].to(device) + attention_mask = batch['attention_mask'].to(device) + + optimizer.zero_grad() + logits = model(sensor, input_ids, attention_mask) + + L = input_ids.size(1) + pred = logits[:, P - 1: P - 1 + L, :] + loss = F.cross_entropy( + pred.reshape(-1, pred.size(-1)), + input_ids.reshape(-1), + ignore_index=pad_id) + loss.backward() + torch.nn.utils.clip_grad_norm_( + [p for p in model.parameters() if p.requires_grad], 1.0) + optimizer.step() + + total_loss += loss.item() * sensor.size(0) + n += sensor.size(0) + return total_loss / max(n, 1) + + +@torch.no_grad() +def eval_loss_only(model, loader, device): + model.eval() + total_loss, n = 0, 0 + P = model.prefix_len + pad_id = model.llm.config.pad_token_id or 0 + for batch in loader: + sensor = batch['sensor'].to(device) + input_ids = batch['input_ids'].to(device) + attention_mask = batch['attention_mask'].to(device) + logits = model(sensor, input_ids, attention_mask) + L = input_ids.size(1) + pred = logits[:, P - 1: P - 1 + L, :] + loss = F.cross_entropy( + pred.reshape(-1, pred.size(-1)), + input_ids.reshape(-1), ignore_index=pad_id) + total_loss += loss.item() * sensor.size(0) + n += sensor.size(0) + return total_loss / max(n, 1) + + +@torch.no_grad() +def eval_with_generation(model, loader, tokenizer, device): + model.eval() + total_loss, n = 0, 0 + P = model.prefix_len + pad_id = model.llm.config.pad_token_id or 0 + all_preds, all_refs = [], [] + + for batch in loader: + sensor = batch['sensor'].to(device) + input_ids = batch['input_ids'].to(device) + attention_mask = batch['attention_mask'].to(device) + + logits = model(sensor, input_ids, attention_mask) + L = input_ids.size(1) + pred = logits[:, P - 1: P - 1 + L, :] + loss = F.cross_entropy( + pred.reshape(-1, pred.size(-1)), + input_ids.reshape(-1), ignore_index=pad_id) + total_loss += loss.item() * sensor.size(0) + n += sensor.size(0) + + texts = model.generate_text(sensor, tokenizer, max_new_tokens=20) + all_preds.extend(texts) + refs = tokenizer.batch_decode(input_ids, skip_special_tokens=True) + all_refs.extend(refs) + + em = sum(p.strip() == r.strip() + for p, r in zip(all_preds, all_refs)) / max(len(all_preds), 1) + + char_correct, char_ptot, char_rtot = 0, 0, 0 + for p, r in zip(all_preds, all_refs): + ps, rs = p.strip(), r.strip() + for j in range(min(len(ps), len(rs))): + if ps[j] == rs[j]: + char_correct += 1 + char_ptot += len(ps) + char_rtot += len(rs) + prec = char_correct / max(char_ptot, 1) + rec = char_correct / max(char_rtot, 1) + char_f1 = 2 * prec * rec / max(prec + rec, 1e-8) + + return { + 'loss': total_loss / max(n, 1), + 'exact_match': em, + 'char_precision': prec, + 'char_recall': rec, + 'char_f1': char_f1, + }, all_preds, all_refs + + +# ============================================================ +# Main +# ============================================================ + +def run_experiment(args): + set_seed(args.seed) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + modalities = args.modalities.split(',') + + print(f"\n{'='*60}", flush=True) + print(f"Sensor → LLM Text (LoRA + instruction prefix)", flush=True) + print(f"Mods: {modalities} | LLM: {args.llm_name}", flush=True) + print(f"LoRA r={args.lora_r} alpha={args.lora_alpha}", flush=True) + print(f"{'='*60}", flush=True) + + # LLM + print("Loading LLM...", flush=True) + from transformers import AutoTokenizer, AutoModelForCausalLM + tokenizer = AutoTokenizer.from_pretrained( + args.llm_name, trust_remote_code=True, local_files_only=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + llm = AutoModelForCausalLM.from_pretrained( + args.llm_name, trust_remote_code=True, + torch_dtype=torch.float32, local_files_only=True, + ).to(device) + llm.config.pad_token_id = tokenizer.pad_token_id + + # Freeze all LLM params first + for p in llm.parameters(): + p.requires_grad = False + + # Apply LoRA + lora_params = apply_lora(llm, r=args.lora_r, alpha=args.lora_alpha) + lora_param_count = sum(p.numel() for p in lora_params) + print(f"LoRA params: {lora_param_count:,} (r={args.lora_r})", flush=True) + + # Datasets + train_ds = TextPredictionDataset( + TRAIN_VOLS, modalities, tokenizer, + window_sec=args.window_sec, max_text_len=args.max_text_len, + downsample=args.downsample) + stats = train_ds.get_stats() + val_ds = TextPredictionDataset( + VAL_VOLS, modalities, tokenizer, + window_sec=args.window_sec, max_text_len=args.max_text_len, + downsample=args.downsample, stats=stats) + test_ds = TextPredictionDataset( + TEST_VOLS, modalities, tokenizer, + window_sec=args.window_sec, max_text_len=args.max_text_len, + downsample=args.downsample, stats=stats) + + if len(train_ds) == 0: + print("ERROR: No training samples!", flush=True) + return None + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, + shuffle=True, drop_last=False) + val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False) + + # Model + model = SensorToTextModel( + train_ds.feat_dim, llm, tokenizer, + n_sensor_tokens=args.n_sensor_tokens, d_model=args.hidden_dim) + model = model.to(device) # move ALL submodules + buffers to GPU + + # Collect trainable params + sensor_params = list(model.sensor_encoder.parameters()) + \ + list(model.projection.parameters()) + all_trainable = sensor_params + lora_params + trainable_count = sum(p.numel() for p in all_trainable) + total_count = sum(p.numel() for p in model.parameters()) + print(f"Trainable: {trainable_count:,} / Total: {total_count:,}", flush=True) + + optimizer = torch.optim.AdamW([ + {'params': sensor_params, 'lr': args.lr}, + {'params': lora_params, 'lr': args.lr * 0.2}, + ], weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, patience=7, factor=0.5, min_lr=1e-6) + + mod_str = '-'.join(modalities) + exp_name = f"pred_llm_{mod_str}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_val_loss = float('inf') + best_epoch = 0 + patience_ctr = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + tr_loss = train_epoch(model, train_loader, optimizer, device) + + if epoch % 5 == 0 or epoch <= 2 or patience_ctr >= args.patience - 2: + val_m, _, _ = eval_with_generation( + model, val_loader, tokenizer, device) + print(f" Epoch {epoch:3d} | TrLoss={tr_loss:.4f} | " + f"Val: loss={val_m['loss']:.4f} EM={val_m['exact_match']:.4f} " + f"charF1={val_m['char_f1']:.4f} | {time.time()-t0:.1f}s", + flush=True) + else: + val_loss = eval_loss_only(model, val_loader, device) + val_m = {'loss': val_loss} + print(f" Epoch {epoch:3d} | TrLoss={tr_loss:.4f} | " + f"Val: loss={val_loss:.4f} | {time.time()-t0:.1f}s", + flush=True) + + scheduler.step(val_m['loss']) + + if val_m['loss'] < best_val_loss: + best_val_loss = val_m['loss'] + best_epoch = epoch + patience_ctr = 0 + # Save sensor encoder + projection + LoRA weights + save_sd = {} + for k, v in model.state_dict().items(): + if k.startswith('llm.'): + if 'lora_A' in k or 'lora_B' in k: + save_sd[k] = v + else: + save_sd[k] = v + torch.save(save_sd, os.path.join(out_dir, 'model_best.pt')) + else: + patience_ctr += 1 + if patience_ctr >= args.patience: + print(f" Early stopping at epoch {epoch}", flush=True) + break + + # Test + best_sd = torch.load(os.path.join(out_dir, 'model_best.pt'), + weights_only=True) + model.load_state_dict(best_sd, strict=False) + test_m, test_preds, test_refs = eval_with_generation( + model, test_loader, tokenizer, device) + + print(f"\n--- Test (best epoch {best_epoch}) ---", flush=True) + for k, v in test_m.items(): + print(f" {k}: {v:.4f}", flush=True) + + print("\nSample predictions:", flush=True) + indices = random.sample(range(len(test_preds)), min(15, len(test_preds))) + for i in indices: + tag = "OK" if test_preds[i].strip() == test_refs[i].strip() else "XX" + print(f" [{tag}] Pred: {test_preds[i].strip()}", flush=True) + print(f" Ref: {test_refs[i].strip()}", flush=True) + + results = { + 'experiment': exp_name, + 'modalities': modalities, + 'best_epoch': best_epoch, + 'test_metrics': {k: float(v) for k, v in test_m.items()}, + 'trainable_params': trainable_count, + 'lora_params': lora_param_count, + 'train_samples': len(train_ds), + 'val_samples': len(val_ds), + 'test_samples': len(test_ds), + 'args': vars(args), + 'sample_predictions': [ + {'pred': test_preds[i].strip(), 'ref': test_refs[i].strip()} + for i in indices + ], + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + print(f" Saved to {out_dir}", flush=True) + return results + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--modalities', type=str, default='imu') + parser.add_argument('--window_sec', type=float, default=15.0) + parser.add_argument('--llm_name', type=str, + default='${PULSE_ROOT}/models/qwen2.5-0.5b') + parser.add_argument('--lora_r', type=int, default=8) + parser.add_argument('--lora_alpha', type=int, default=16) + parser.add_argument('--n_sensor_tokens', type=int, default=8) + parser.add_argument('--max_text_len', type=int, default=48) + parser.add_argument('--epochs', type=int, default=50) + parser.add_argument('--batch_size', type=int, default=8) + parser.add_argument('--lr', type=float, default=5e-4) + parser.add_argument('--weight_decay', type=float, default=1e-4) + parser.add_argument('--hidden_dim', type=int, default=64) + parser.add_argument('--downsample', type=int, default=5) + parser.add_argument('--patience', type=int, default=15) + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--output_dir', type=str, + default='${PULSE_ROOT}/results/pred_llm2') + args = parser.parse_args() + os.makedirs(args.output_dir, exist_ok=True) + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_pred_cls.py b/experiments/tasks/train_pred_cls.py new file mode 100644 index 0000000000000000000000000000000000000000..35ee215573ec91e5ef9c0d62bea2f7d0429f1ba3 --- /dev/null +++ b/experiments/tasks/train_pred_cls.py @@ -0,0 +1,691 @@ +#!/usr/bin/env python3 +""" +Action Prediction via Verb-Category Classification. + +Instead of generating free-form text (which fails with ~2000 unique labels / ~1600 samples), +we classify the next action into ~20 verb categories extracted from text annotations. + +Architecture: Transformer encoder (proven in exp1 with F1=0.771 on scene recognition). +""" + +import os +import sys +import json +import time +import math +import re +import random +import argparse +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader +from sklearn.metrics import accuracy_score, f1_score, classification_report + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from data.dataset import ( + DATASET_DIR, MODALITY_FILES, TRAIN_VOLS, VAL_VOLS, TEST_VOLS, + load_modality_array, +) + +ANNOTATION_DIR = "${PULSE_ROOT}" + + +# ============================================================ +# Action Verb Taxonomy +# ============================================================ + +VERB_MAP_RULES = [ + # Grab/Pick up + ('抓取', '抓取'), ('拿起', '抓取'), ('拿出', '抓取'), + ('从.*取出', '抓取'), ('从.*抓取', '抓取'), ('从.*提取', '抓取'), + ('从.*取下', '抓取'), ('从.*抽出', '抓取'), ('从.*拔出', '抓取'), + ('双手抓', '抓取'), ('双手协.*抓', '抓取'), ('分别抓', '抓取'), + ('伸手', '抓取'), + # Place/Put down + ('放置', '放置'), ('放回', '放置'), ('放入', '放置'), + ('丢弃', '放置'), ('归还', '放置'), + # Move/Carry + ('移动', '移动'), ('搬运', '移动'), ('移开', '移动'), + ('推入', '移动'), ('推动', '移动'), ('拉开', '移动'), ('拉出', '移动'), + ('搬移', '移动'), ('转移', '移动'), ('递送', '移动'), + ('交接', '移动'), ('传递', '移动'), ('滑动', '移动'), + ('分别持握.*移', '移动'), + # Adjust/Align + ('调整', '调整'), ('对齐', '调整'), ('微调', '调整'), + ('重新', '调整'), ('摆正', '调整'), ('归位', '调整'), + # Fold + ('折叠', '折叠'), ('二次折叠', '折叠'), ('对折', '折叠'), + # Unfold/Open + ('展开', '展开'), ('打开', '展开'), ('揭开', '展开'), + ('拆开', '展开'), ('撕开', '展开'), ('掀开', '展开'), + # Wipe/Clean/Smooth + ('擦拭', '擦拭'), ('抚平', '擦拭'), ('清洁', '擦拭'), ('清理', '擦拭'), + # Rotate/Screw + ('旋转', '旋转'), ('旋紧', '旋转'), ('旋开', '旋转'), + ('拧开', '旋转'), ('拧紧', '旋转'), + # Lift + ('提起', '提起'), ('抬起', '提起'), ('举起', '提起'), ('翻起', '提起'), + # Pour/Fill + ('倾倒', '倾倒'), ('装填', '倾倒'), ('倒入', '倾倒'), ('倒出', '倾倒'), + ('舀取', '倾倒'), ('注入', '倾倒'), ('从.*舀', '倾倒'), + # Organize/Stack + ('整理', '整理'), ('堆叠', '整理'), ('排列', '整理'), + ('收纳', '整理'), ('码放', '整理'), + # Check/Inspect + ('检查', '检查'), ('确认', '检查'), ('查看', '检查'), + ('保持', '检查'), ('观察', '检查'), + # Press + ('按压', '按压'), ('压实', '按压'), ('压平', '按压'), + # Cover/Close + ('盖上', '盖合'), ('关闭', '盖合'), ('密封', '盖合'), ('合上', '盖合'), + ('封口', '盖合'), ('封箱', '盖合'), + # Separate + ('分离', '分离'), ('分开', '分离'), + # Stick/Fix + ('粘贴', '粘贴'), ('固定', '粘贴'), ('贴上', '粘贴'), ('加固', '粘贴'), + # Release + ('释放', '释放'), + # Use/Operate + ('使用', '操作'), ('操作', '操作'), ('搅拌', '操作'), + ('切割', '操作'), ('切断', '操作'), ('剪断', '操作'), ('修剪', '操作'), + # Flip + ('翻转', '翻转'), ('翻面', '翻转'), + # Prepare/Complete + ('准备', '其他'), ('完成', '其他'), ('最终', '其他'), + # "将..." sub-patterns + ('将.*放', '放置'), ('将.*装', '倾倒'), ('将.*倒', '倾倒'), + ('将.*移', '移动'), ('将.*折', '折叠'), ('将.*盖', '盖合'), + ('将.*展', '展开'), ('将.*提', '提起'), ('将.*拉', '移动'), + ('将.*推', '移动'), ('将.*擦', '擦拭'), ('将.*抓', '抓取'), + ('将.*旋', '旋转'), ('将.*拧', '旋转'), ('将.*整', '整理'), + ('将.*调', '调整'), ('将.*对', '调整'), ('将.*贴', '粘贴'), + ('将.*翻', '翻转'), ('将.*压', '按压'), ('将.*插', '操作'), + ('将.*切', '操作'), ('将.*固', '粘贴'), ('将.*封', '盖合'), + ('将', '操作'), + ('双手', '操作'), ('再次', '调整'), +] + +ACTION_CLASSES_FINE = [ + '抓取', '放置', '移动', '调整', '擦拭', '折叠', '旋转', + '操作', '盖合', '整理', '展开', '倾倒', '检查', '提起', + '释放', '粘贴', '分离', '按压', '翻转', '其他', +] + +# 8 coarse super-categories (merge small classes) +ACTION_CLASSES_COARSE = [ + '抓取', '放置', '移动', '调整', '擦拭', '折叠', '旋转', '其他', +] +FINE_TO_COARSE = { + '抓取': '抓取', '放置': '放置', '移动': '移动', + '调整': '调整', '整理': '调整', + '擦拭': '擦拭', + '折叠': '折叠', '展开': '折叠', + '旋转': '旋转', '盖合': '旋转', + '操作': '其他', '倾倒': '其他', '检查': '其他', '提起': '其他', + '释放': '其他', '粘贴': '其他', '分离': '其他', '按压': '其他', + '翻转': '其他', '其他': '其他', +} + +# Will be set by main() based on --coarse flag +ACTION_CLASSES = None +NUM_ACTION_CLASSES = None +ACTION_TO_IDX = None + + +def init_classes(coarse=False): + global ACTION_CLASSES, NUM_ACTION_CLASSES, ACTION_TO_IDX + if coarse: + ACTION_CLASSES = ACTION_CLASSES_COARSE + else: + ACTION_CLASSES = ACTION_CLASSES_FINE + NUM_ACTION_CLASSES = len(ACTION_CLASSES) + ACTION_TO_IDX = {c: i for i, c in enumerate(ACTION_CLASSES)} + + +def text_to_action_class(text, coarse=False): + fine_label = '其他' + for pattern, label in VERB_MAP_RULES: + if re.search(pattern, text): + fine_label = label + break + if coarse: + return FINE_TO_COARSE.get(fine_label, '其他') + return fine_label + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def parse_timestamp(ts_str): + parts = ts_str.strip().split(':') + if len(parts) == 2: + return int(parts[0]) * 60 + int(parts[1]) + elif len(parts) == 3: + return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]) + return 0 + + +# ============================================================ +# Dataset +# ============================================================ + +class ActionPredDataset(Dataset): + def __init__(self, volunteers, modalities, + window_sec=15.0, downsample=5, sampling_rate=100, stats=None, + coarse=False, mode='prediction'): + self._feat_dim = None + self.mode = mode # 'prediction' or 'recognition' + raw_samples = [] + all_features_for_stats = [] + window_frames = int(window_sec * sampling_rate / downsample) + self.window_frames = window_frames + + for vol in volunteers: + vol_dir = os.path.join(DATASET_DIR, vol) + if not os.path.isdir(vol_dir): + continue + for scenario in sorted(os.listdir(vol_dir)): + scenario_dir = os.path.join(vol_dir, scenario) + if not os.path.isdir(scenario_dir): + continue + meta_path = os.path.join(scenario_dir, 'alignment_metadata.json') + if not os.path.exists(meta_path): + continue + with open(meta_path) as f: + meta = json.load(f) + if not set(modalities).issubset(set(meta['modalities'])): + continue + + parts = [] + for mod in modalities: + filepath = os.path.join(scenario_dir, MODALITY_FILES[mod]) + arr = load_modality_array(filepath, mod) + parts.append(arr) + min_len = min(p.shape[0] for p in parts) + features = np.concatenate([p[:min_len] for p in parts], axis=1) + features = features[::downsample] + if self._feat_dim is None: + self._feat_dim = features.shape[1] + all_features_for_stats.append(features) + + ann_path = os.path.join(ANNOTATION_DIR, vol, f"{scenario}.json") + if not os.path.exists(ann_path): + continue + with open(ann_path) as f: + ann = json.load(f) + segments = [] + for seg in ann.get('segments', []): + m = re.match(r'(\d+:\d+(?::\d+)?)\s*-\s*(\d+:\d+(?::\d+)?)', + seg['timestamp']) + if not m: + continue + start_sec = parse_timestamp(m.group(1)) + end_sec = parse_timestamp(m.group(2)) + start_frame = int(start_sec * sampling_rate / downsample) + end_frame = int(end_sec * sampling_rate / downsample) + action_cls = text_to_action_class(seg['task'], coarse=coarse) + label_idx = ACTION_TO_IDX[action_cls] + segments.append((start_frame, end_frame, label_idx, seg['task'])) + + if mode == 'prediction' and len(segments) < 2: + continue + if mode == 'recognition' and len(segments) < 1: + continue + + T_total = features.shape[0] + + if mode == 'prediction': + # Use sensor data BEFORE segment boundary to predict NEXT action + for i in range(1, len(segments)): + boundary = segments[i][0] + if boundary > T_total: + break + end = boundary + start = max(0, end - window_frames) + window = features[start:end] + if window.shape[0] == 0: + continue + actual_len = window.shape[0] + if actual_len < window_frames: + pad = np.zeros((window_frames - actual_len, self._feat_dim)) + window = np.concatenate([pad, window], axis=0) + mask = np.zeros(window_frames, dtype=np.float32) + mask[window_frames - actual_len:] = 1.0 + else: + mask = np.ones(window_frames, dtype=np.float32) + prev_label = segments[i - 1][2] + raw_samples.append(( + window.astype(np.float32), mask, + segments[i][2], segments[i][3], prev_label + )) + else: + # Recognition: use sensor data FROM the segment to classify current action + for i in range(len(segments)): + seg_start = segments[i][0] + seg_end = min(segments[i][1], T_total) + if seg_start >= seg_end: + continue + window = features[seg_start:seg_end] + if window.shape[0] == 0: + continue + actual_len = window.shape[0] + if actual_len > window_frames: + # Take center crop + offset = (actual_len - window_frames) // 2 + window = window[offset:offset + window_frames] + actual_len = window_frames + if actual_len < window_frames: + pad = np.zeros((window_frames - actual_len, self._feat_dim)) + window = np.concatenate([pad, window], axis=0) + mask = np.zeros(window_frames, dtype=np.float32) + mask[window_frames - actual_len:] = 1.0 + else: + mask = np.ones(window_frames, dtype=np.float32) + prev_label = segments[i - 1][2] if i > 0 else segments[i][2] + raw_samples.append(( + window.astype(np.float32), mask, + segments[i][2], segments[i][3], prev_label + )) + + # Normalization + if stats is not None: + self.mean, self.std = stats + else: + if all_features_for_stats: + cat = np.concatenate(all_features_for_stats, axis=0).astype(np.float64) + self.mean = np.mean(cat, axis=0, keepdims=True) + self.std = np.std(cat, axis=0, keepdims=True) + self.std[self.std < 1e-8] = 1.0 + else: + d = self._feat_dim or 1 + self.mean = np.zeros((1, d)) + self.std = np.ones((1, d)) + + self.data = [] + self.labels = [] + self.texts = [] + self.masks = [] + self.prev_labels = [] + for x, mask, label, text, prev_label in raw_samples: + self.data.append(((x - self.mean) / self.std).astype(np.float32)) + self.masks.append(mask) + self.labels.append(label) + self.texts.append(text) + self.prev_labels.append(prev_label) + + from collections import Counter + dist = Counter(self.labels) + print(f" {len(self.data)} samples, feat_dim={self._feat_dim}, " + f"window={window_frames}f ({window_sec}s), " + f"classes={len(dist)}", flush=True) + for cls_name in ACTION_CLASSES: + idx = ACTION_TO_IDX[cls_name] + print(f" {cls_name}: {dist.get(idx, 0)}", flush=True) + + def get_stats(self): + return (self.mean, self.std) + + @property + def feat_dim(self): + return self._feat_dim + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return { + 'features': torch.from_numpy(self.data[idx]), + 'mask': torch.from_numpy(self.masks[idx]), + 'label': self.labels[idx], + 'prev_label': self.prev_labels[idx], + } + + +# ============================================================ +# Model: Transformer Classifier +# ============================================================ + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, dropout=0.1, max_len=5000): + super().__init__() + self.dropout = nn.Dropout(p=dropout) + pe = torch.zeros(max_len, d_model) + pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div = torch.exp(torch.arange(0, d_model, 2).float() * + (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(pos * div) + pe[:, 1::2] = torch.cos(pos * div) + self.register_buffer('pe', pe.unsqueeze(0)) + + def forward(self, x): + return self.dropout(x + self.pe[:, :x.size(1)]) + + +class TransformerClassifier(nn.Module): + def __init__(self, input_dim, num_classes, d_model=64, nhead=4, + num_layers=2, dropout=0.2, use_prev_action=False): + super().__init__() + self.use_prev_action = use_prev_action + self.proj = nn.Linear(input_dim, d_model) + self.pos = PositionalEncoding(d_model, dropout) + layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=nhead, dim_feedforward=d_model * 4, + dropout=dropout, batch_first=True) + self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers) + self.attn_pool = nn.Linear(d_model, 1) + + # Previous action embedding + if use_prev_action: + self.action_embed = nn.Embedding(num_classes, d_model) + cls_input_dim = d_model * 2 # sensor pooled + action embedding + else: + cls_input_dim = d_model + + self.classifier = nn.Sequential( + nn.LayerNorm(cls_input_dim), + nn.Dropout(dropout), + nn.Linear(cls_input_dim, num_classes), + ) + self.output_dim = d_model + + def forward(self, x, mask=None, prev_action=None): + x = self.pos(self.proj(x)) + if mask is not None: + src_key_padding_mask = (mask == 0) + else: + src_key_padding_mask = None + x = self.encoder(x, src_key_padding_mask=src_key_padding_mask) + + # Attention pooling + attn_w = self.attn_pool(x).squeeze(-1) + if mask is not None: + attn_w = attn_w.masked_fill(mask == 0, -1e9) + attn_w = torch.softmax(attn_w, dim=1) + pooled = (x * attn_w.unsqueeze(-1)).sum(dim=1) + + if self.use_prev_action and prev_action is not None: + act_emb = self.action_embed(prev_action) + pooled = torch.cat([pooled, act_emb], dim=1) + + return self.classifier(pooled) + + +# ============================================================ +# Training & Evaluation +# ============================================================ + +def train_epoch(model, loader, optimizer, criterion, device, + augment=False, noise_std=0.1, time_mask_ratio=0.1): + model.train() + total_loss, correct, total = 0, 0, 0 + for batch in loader: + features = batch['features'].to(device) + mask = batch['mask'].to(device) + labels = torch.tensor(batch['label'], dtype=torch.long).to(device) + prev_action = torch.tensor(batch['prev_label'], dtype=torch.long).to(device) + + if augment: + noise = torch.randn_like(features) * noise_std + features = features + noise * mask.unsqueeze(-1) + B, T, C = features.shape + mask_len = int(T * time_mask_ratio) + if mask_len > 0: + for i in range(B): + valid_len = mask[i].sum().int().item() + if valid_len > mask_len: + valid_start = T - valid_len # data is right-aligned (left-padded) + start = random.randint(0, valid_len - mask_len) + features[i, valid_start + start:valid_start + start + mask_len, :] = 0.0 + + optimizer.zero_grad() + logits = model(features, mask, prev_action=prev_action) + loss = criterion(logits, labels) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + + total_loss += loss.item() * features.size(0) + preds = logits.argmax(dim=1) + correct += (preds == labels).sum().item() + total += features.size(0) + return total_loss / max(total, 1), correct / max(total, 1) + + +@torch.no_grad() +def evaluate(model, loader, criterion, device): + model.eval() + total_loss, all_preds, all_labels = 0, [], [] + n = 0 + for batch in loader: + features = batch['features'].to(device) + mask = batch['mask'].to(device) + labels = torch.tensor(batch['label'], dtype=torch.long).to(device) + prev_action = torch.tensor(batch['prev_label'], dtype=torch.long).to(device) + + logits = model(features, mask, prev_action=prev_action) + loss = criterion(logits, labels) + total_loss += loss.item() * features.size(0) + n += features.size(0) + + preds = logits.argmax(dim=1) + all_preds.extend(preds.cpu().numpy()) + all_labels.extend(labels.cpu().numpy()) + + all_preds = np.array(all_preds) + all_labels = np.array(all_labels) + acc = accuracy_score(all_labels, all_preds) + f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0) + f1_weighted = f1_score(all_labels, all_preds, average='weighted', zero_division=0) + + return { + 'loss': total_loss / max(n, 1), + 'accuracy': acc, + 'f1_macro': f1_macro, + 'f1_weighted': f1_weighted, + }, all_preds, all_labels + + +# ============================================================ +# Main +# ============================================================ + +def run_experiment(args): + set_seed(args.seed) + init_classes(coarse=args.coarse) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + modalities = args.modalities.split(',') + + granularity = "8 coarse" if args.coarse else "20 fine" + task_name = "Recognition" if args.mode == 'recognition' else "Prediction" + print(f"\n{'='*60}", flush=True) + print(f"Action {task_name} — Verb Classification ({granularity} classes)", flush=True) + print(f"Modalities: {modalities} | prev_action: {args.use_prev_action}", flush=True) + print(f"Window: {args.window_sec}s | d_model: {args.hidden_dim} | " + f"augment: {args.augment}", flush=True) + print(f"{'='*60}", flush=True) + + # Datasets + train_ds = ActionPredDataset( + TRAIN_VOLS, modalities, + window_sec=args.window_sec, downsample=args.downsample, + coarse=args.coarse, mode=args.mode) + stats = train_ds.get_stats() + val_ds = ActionPredDataset( + VAL_VOLS, modalities, + window_sec=args.window_sec, downsample=args.downsample, stats=stats, + coarse=args.coarse, mode=args.mode) + test_ds = ActionPredDataset( + TEST_VOLS, modalities, + window_sec=args.window_sec, downsample=args.downsample, stats=stats, + coarse=args.coarse, mode=args.mode) + + if len(train_ds) == 0: + print("ERROR: No training samples!", flush=True) + return None + + train_loader = DataLoader(train_ds, batch_size=args.batch_size, + shuffle=True, drop_last=False) + val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False) + test_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False) + + # Model + model = TransformerClassifier( + train_ds.feat_dim, NUM_ACTION_CLASSES, + d_model=args.hidden_dim, nhead=4, num_layers=2, dropout=args.dropout, + use_prev_action=args.use_prev_action, + ).to(device) + param_count = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Trainable params: {param_count:,}", flush=True) + + # Class weights for imbalanced data + from collections import Counter + label_dist = Counter(train_ds.labels) + weights = torch.zeros(NUM_ACTION_CLASSES) + for idx, cnt in label_dist.items(): + weights[idx] = 1.0 / max(cnt, 1) + weights = weights / weights.sum() * NUM_ACTION_CLASSES + criterion = nn.CrossEntropyLoss( + weight=weights.to(device), + label_smoothing=args.label_smoothing) + + optimizer = torch.optim.AdamW( + model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + optimizer, patience=7, factor=0.5, min_lr=1e-6) + + mod_str = '-'.join(modalities) + tag = "coarse" if args.coarse else "fine" + prev_tag = "_prev" if args.use_prev_action else "" + mode_tag = "recog" if args.mode == 'recognition' else "pred" + extra_tag = f"_{args.tag}" if args.tag else "" + exp_name = f"{mode_tag}_cls_{tag}{prev_tag}_{mod_str}{extra_tag}" + out_dir = os.path.join(args.output_dir, exp_name) + os.makedirs(out_dir, exist_ok=True) + + best_val_f1 = -1 + best_epoch = 0 + patience_ctr = 0 + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + tr_loss, tr_acc = train_epoch( + model, train_loader, optimizer, criterion, device, + augment=args.augment, noise_std=args.noise_std, + time_mask_ratio=args.time_mask_ratio) + + val_m, _, _ = evaluate(model, val_loader, criterion, device) + dt = time.time() - t0 + + print(f" Epoch {epoch:3d} | TrLoss={tr_loss:.4f} TrAcc={tr_acc:.4f} | " + f"Val: loss={val_m['loss']:.4f} acc={val_m['accuracy']:.4f} " + f"F1m={val_m['f1_macro']:.4f} F1w={val_m['f1_weighted']:.4f} | " + f"{dt:.1f}s", flush=True) + + scheduler.step(val_m['loss']) + + if val_m['f1_weighted'] > best_val_f1: + best_val_f1 = val_m['f1_weighted'] + best_epoch = epoch + patience_ctr = 0 + torch.save(model.state_dict(), os.path.join(out_dir, 'model_best.pt')) + else: + patience_ctr += 1 + if patience_ctr >= args.patience: + print(f" Early stopping at epoch {epoch}", flush=True) + break + + # Test + model.load_state_dict(torch.load( + os.path.join(out_dir, 'model_best.pt'), weights_only=True)) + test_m, test_preds, test_labels = evaluate( + model, test_loader, criterion, device) + + print(f"\n--- Test (best epoch {best_epoch}) ---", flush=True) + for k, v in test_m.items(): + print(f" {k}: {v:.4f}", flush=True) + + # Per-class report + present_classes = sorted(set(test_labels) | set(test_preds)) + target_names = [ACTION_CLASSES[i] for i in present_classes] + report = classification_report( + test_labels, test_preds, + labels=present_classes, target_names=target_names, + zero_division=0, output_dict=True) + print("\nPer-class results:", flush=True) + for cls_name in target_names: + r = report[cls_name] + print(f" {cls_name:<6}: P={r['precision']:.3f} R={r['recall']:.3f} " + f"F1={r['f1-score']:.3f} N={r['support']}", flush=True) + + # Sample predictions + print("\nSample predictions:", flush=True) + indices = random.sample(range(len(test_preds)), min(15, len(test_preds))) + for i in indices: + p_name = ACTION_CLASSES[test_preds[i]] + r_name = ACTION_CLASSES[test_labels[i]] + tag = "OK" if test_preds[i] == test_labels[i] else "XX" + orig_text = test_ds.texts[i] if i < len(test_ds.texts) else "?" + print(f" [{tag}] Pred={p_name:<6} Ref={r_name:<6} ({orig_text})", flush=True) + + results = { + 'experiment': exp_name, + 'modalities': modalities, + 'best_epoch': best_epoch, + 'test_metrics': {k: float(v) for k, v in test_m.items()}, + 'trainable_params': param_count, + 'train_samples': len(train_ds), + 'val_samples': len(val_ds), + 'test_samples': len(test_ds), + 'num_classes': NUM_ACTION_CLASSES, + 'class_names': ACTION_CLASSES, + 'per_class_report': {k: v for k, v in report.items() + if k in target_names}, + 'args': vars(args), + } + with open(os.path.join(out_dir, 'results.json'), 'w') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + print(f" Saved to {out_dir}", flush=True) + return results + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--modalities', type=str, default='imu') + parser.add_argument('--window_sec', type=float, default=15.0) + parser.add_argument('--epochs', type=int, default=80) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--lr', type=float, default=1e-3) + parser.add_argument('--weight_decay', type=float, default=1e-4) + parser.add_argument('--hidden_dim', type=int, default=64) + parser.add_argument('--dropout', type=float, default=0.2) + parser.add_argument('--downsample', type=int, default=5) + parser.add_argument('--patience', type=int, default=20) + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--augment', action='store_true') + parser.add_argument('--noise_std', type=float, default=0.1) + parser.add_argument('--time_mask_ratio', type=float, default=0.1) + parser.add_argument('--label_smoothing', type=float, default=0.1) + parser.add_argument('--mode', type=str, default='prediction', + choices=['prediction', 'recognition'], + help='prediction=next action, recognition=current action') + parser.add_argument('--coarse', action='store_true', + help='Use 8 coarse classes instead of 20 fine classes') + parser.add_argument('--use_prev_action', action='store_true', + help='Use previous action label as additional input') + parser.add_argument('--output_dir', type=str, + default='${PULSE_ROOT}/results/pred_cls') + parser.add_argument('--tag', type=str, default='', + help='Optional tag appended to experiment name') + args = parser.parse_args() + os.makedirs(args.output_dir, exist_ok=True) + run_experiment(args) + + +if __name__ == '__main__': + main() diff --git a/experiments/tasks/train_seqpred.py b/experiments/tasks/train_seqpred.py new file mode 100644 index 0000000000000000000000000000000000000000..6aab3d8ab1668ed226467e958f9a72ed1ad136c0 --- /dev/null +++ b/experiments/tasks/train_seqpred.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +""" +Training loop for T10 Triplet Next-Action Prediction. + +Usage example: + python3 experiments/train_seqpred.py \ + --model dailyactformer \ + --modalities imu,emg,eyetrack,mocap,pressure \ + --t_obs 8 --t_fut 2 \ + --epochs 40 --batch_size 32 --lr 3e-4 \ + --output_dir results/seqpred/ours_all5_tfut2_seed42 \ + --seed 42 +""" + +from __future__ import annotations + +# pandas must be imported BEFORE torch/numpy to avoid a GLIBCXX load-order bug +# on this cluster (libstdc++ from Anaconda vs system). +import pandas # noqa: F401 + +import argparse +import json +import os +import random +import sys +import time +from pathlib import Path +from typing import Dict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +# Make sibling modules importable from either (a) the neurips26 root (running +# as `python experiments/train_seqpred.py`) or (b) the frozen row/code/ folder +# (running via the per-row run.sh after setup_row.sh snapshots the code). +THIS = Path(__file__).resolve() +sys.path.insert(0, str(THIS.parent)) # row/code/ +sys.path.insert(0, str(THIS.parents[1])) # neurips26/ + +try: + from experiments.dataset_seqpred import ( + TripletSeqPredDataset, build_train_test, collate_triplet, + TRAIN_VOLS_V3, TEST_VOLS_V3, + ) + from experiments.models_seqpred import build_model + from experiments.taxonomy import ( + NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, NUM_HAND, + ) +except ModuleNotFoundError: + from dataset_seqpred import ( + TripletSeqPredDataset, build_train_test, collate_triplet, + TRAIN_VOLS_V3, TEST_VOLS_V3, + ) + from models_seqpred import build_model + from taxonomy import ( + NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, NUM_HAND, + ) + + +# --------------------------------------------------------------------------- +# Utilities +# --------------------------------------------------------------------------- + +def set_seed(seed: int) -> None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def top_k_correct(logits: torch.Tensor, target: torch.Tensor, k: int) -> torch.Tensor: + """Return a bool tensor (B,) indicating whether `target` is in top-k of logits.""" + k = min(k, logits.size(1)) + _, top = logits.topk(k, dim=1) + return (top == target.unsqueeze(1)).any(dim=1) + + +def mean_class_recall(logits: torch.Tensor, target: torch.Tensor, + num_classes: int) -> float: + pred = logits.argmax(dim=1) + recall_per_cls = [] + for c in range(num_classes): + sel = (target == c) + n = int(sel.sum().item()) + if n == 0: + continue + r = float((pred[sel] == c).float().mean().item()) + recall_per_cls.append(r) + return float(np.mean(recall_per_cls)) if recall_per_cls else 0.0 + + +def build_class_weights(counts: np.ndarray) -> torch.Tensor: + """Inverse-frequency weights, normalized so mean weight = 1.""" + counts = counts.astype(np.float32).clip(min=1.0) + w = 1.0 / counts + w = w / w.mean() + return torch.from_numpy(w) + + +# --------------------------------------------------------------------------- +# Core loss +# --------------------------------------------------------------------------- + +def triplet_loss( + logits: Dict[str, torch.Tensor], + y: Dict[str, torch.Tensor], + weights: Dict[str, torch.Tensor], + lambda_cfg: Dict[str, float], + label_smoothing: float = 0.05, +) -> Dict[str, torch.Tensor]: + losses = {} + for head in ("verb_fine", "verb_composite", "noun", "hand"): + w = weights.get(head, None) + if w is not None: + w = w.to(logits[head].device) + l = F.cross_entropy( + logits[head], y[head], weight=w, + label_smoothing=label_smoothing, + ) + losses[head] = l + total = sum(lambda_cfg.get(k, 1.0) * losses[k] for k in losses) + losses["total"] = total + return losses + + +# --------------------------------------------------------------------------- +# Eval +# --------------------------------------------------------------------------- + +@torch.no_grad() +def evaluate(model, loader, device) -> Dict[str, float]: + model.eval() + all_logits: Dict[str, list] = {k: [] for k in + ("verb_fine", "verb_composite", "noun", "hand")} + all_y: Dict[str, list] = {k: [] for k in + ("verb_fine", "verb_composite", "noun", "hand")} + + for batch in loader: + # Backward-compatible unpack: collate returns 5 or 6 elements. + if len(batch) == 6: + x, mask, lens, y, meta, prev = batch + else: + x, mask, lens, y, meta = batch + prev = None + x = {m: t.to(device) for m, t in x.items()} + mask = mask.to(device) + kwargs = {} + if prev is not None and getattr(model, "use_prev_action", False): + kwargs["prev_v_comp"] = prev["verb_composite"].to(device) + kwargs["prev_noun"] = prev["noun"].to(device) + logits = model(x, mask, **kwargs) + for k in all_logits: + all_logits[k].append(logits[k].cpu()) + all_y[k].append(y[k]) + + logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()} + y_cat = {k: torch.cat(v, dim=0) for k, v in all_y.items()} + + m = {} + for k, K in [("verb_fine", NUM_VERB_FINE), + ("verb_composite", NUM_VERB_COMPOSITE), + ("noun", NUM_NOUN), + ("hand", NUM_HAND)]: + preds = logits_cat[k].argmax(dim=1) + acc1 = float((preds == y_cat[k]).float().mean().item()) + m[f"{k}_top1"] = acc1 + if K > 5: + acc5 = float(top_k_correct(logits_cat[k], y_cat[k], 5).float().mean().item()) + m[f"{k}_top5"] = acc5 + m[f"{k}_mcr"] = mean_class_recall(logits_cat[k], y_cat[k], K) + + # Per-head argmax predictions + vf_pred = logits_cat["verb_fine"].argmax(dim=1) + n_pred = logits_cat["noun"].argmax(dim=1) + h_pred = logits_cat["hand"].argmax(dim=1) + + # Headline (current default): action_vn = (verb_fine, noun) joint top-1. + # Hand is dropped from the joint metric because the hand label is dominated + # by a single majority class (~48% train, ~42% test) so a constant predictor + # already saturates it; including hand in the joint compresses the signal + # from the verb / noun heads where models actually learn. Hand is still + # reported separately as `hand_top1`. + vn_correct = (vf_pred == y_cat["verb_fine"]) & (n_pred == y_cat["noun"]) + m["action_vn_top1"] = float(vn_correct.float().mean().item()) + + # Top-5 action over (verb_fine, noun) + vf_top5 = top_k_correct(logits_cat["verb_fine"], y_cat["verb_fine"], 5) + n_top5 = top_k_correct(logits_cat["noun"], y_cat["noun"], 5) + m["action_vn_top5"] = float((vf_top5 & n_top5).float().mean().item()) + + # Legacy: include hand in the joint, kept for backward compatibility with + # earlier reports. Will be deprecated. + vfn_h_correct = vn_correct & (h_pred == y_cat["hand"]) + m["action_top1"] = float(vfn_h_correct.float().mean().item()) + h_top1 = (h_pred == y_cat["hand"]) + m["action_top5"] = float((vf_top5 & n_top5 & h_top1).float().mean().item()) + return m + + +# --------------------------------------------------------------------------- +# Modality dropout (train-time only) +# --------------------------------------------------------------------------- + +def apply_modality_dropout(x: Dict[str, torch.Tensor], p: float) -> Dict[str, torch.Tensor]: + """Per-sample per-modality dropout: zero out each (sample, modality) cell + independently with probability p, but force-keep at least one modality + per sample so the model never receives an all-zero input.""" + if p <= 0.0: + return x + mods = list(x.keys()) + if len(mods) <= 1: + return x + any_t = next(iter(x.values())) + B = any_t.shape[0] + device = any_t.device + keep = (torch.rand(B, len(mods), device=device) >= p) + forced = torch.randint(len(mods), (B,), device=device) + keep[torch.arange(B, device=device), forced] = True + out = {} + for i, m in enumerate(mods): + km = keep[:, i].to(x[m].dtype).view(B, *([1] * (x[m].ndim - 1))) + out[m] = x[m] * km + return out + + +# --------------------------------------------------------------------------- +# Main training +# --------------------------------------------------------------------------- + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", type=str, default="deepconvlstm", + choices=["deepconvlstm", "dailyactformer", + "rulstm", "futr", "afft", + "handformer", "actionllm"]) + ap.add_argument("--modalities", type=str, + default="imu,emg,eyetrack,mocap,pressure") + ap.add_argument("--t_obs", type=float, default=8.0, + help="Anticipation mode only: observation window length (s).") + ap.add_argument("--t_fut", type=float, default=2.0, + help="Anticipation mode only: prediction horizon (s).") + ap.add_argument("--mode", type=str, default="recognition", + choices=["recognition", "anticipation"], + help="recognition = classify segment from its own [start,end] sensor " + "window (default). anticipation = legacy T10 setup, predict from " + "[start-t_fut-t_obs, start-t_fut].") + ap.add_argument("--downsample", type=int, default=5) + + ap.add_argument("--epochs", type=int, default=40) + ap.add_argument("--batch_size", type=int, default=32) + ap.add_argument("--lr", type=float, default=3e-4) + ap.add_argument("--weight_decay", type=float, default=1e-4) + ap.add_argument("--grad_clip", type=float, default=1.0) + ap.add_argument("--label_smoothing", type=float, default=0.05) + ap.add_argument("--dropout", type=float, default=0.1, + help="Dropout used inside DAF stems / transformer / pool.") + ap.add_argument("--use_prev_action", action="store_true", + help="Condition DAF on previous-segment (verb_composite, noun) " + "labels via embedding concat to pooled features. Only DAF " + "uses this; baselines ignore it.") + ap.add_argument("--modality_dropout", type=float, default=0.0, + help="Train-time per-sample per-modality dropout prob " + "(0.0=off). At least one modality is always kept.") + + ap.add_argument("--use_class_weights", action="store_true", + help="Weight CE by inverse class frequency (better for tail).") + ap.add_argument("--lambda_verb_fine", type=float, default=1.0) + ap.add_argument("--lambda_verb_composite", type=float, default=0.5) + ap.add_argument("--lambda_noun", type=float, default=1.0) + ap.add_argument("--lambda_hand", type=float, default=0.5) + + ap.add_argument("--patience", type=int, default=12) + ap.add_argument("--warmup_epochs", type=int, default=0, + help="Linear LR warmup over the first N epochs (0=off).") + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--output_dir", type=str, required=True) + ap.add_argument("--num_workers", type=int, default=0) + ap.add_argument("--tag", type=str, default="") + args = ap.parse_args() + + set_seed(args.seed) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if args.mode == "anticipation": + print(f"[cfg] model={args.model} modalities={args.modalities} " + f"mode={args.mode} T_obs={args.t_obs}s T_fut={args.t_fut}s seed={args.seed}") + else: + print(f"[cfg] model={args.model} modalities={args.modalities} " + f"mode={args.mode} (segment-aligned window) seed={args.seed}") + print(f"[cfg] device={device} epochs={args.epochs} lr={args.lr} " + f"batch_size={args.batch_size}") + + mods = tuple(args.modalities.split(",")) + train_ds, test_ds = build_train_test( + modalities=mods, t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + downsample=args.downsample, mode=args.mode, + ) + print(f"[data] train={len(train_ds)} test={len(test_ds)} " + f"modality_dims={train_ds.modality_dims}") + + # Class counts for weighting (train only) + counts = train_ds.class_counts() + weights: Dict[str, torch.Tensor] = {} + if args.use_class_weights: + for k in ("verb_fine", "verb_composite", "noun", "hand"): + weights[k] = build_class_weights(counts[k]) + + train_loader = DataLoader( + train_ds, batch_size=args.batch_size, shuffle=True, + collate_fn=collate_triplet, num_workers=args.num_workers, drop_last=True, + ) + test_loader = DataLoader( + test_ds, batch_size=args.batch_size, shuffle=False, + collate_fn=collate_triplet, num_workers=args.num_workers, + ) + + # For DailyActFormer: causal mask only when doing anticipation; bidirectional + # attention for recognition (the default). Other models ignore unknown kwargs. + extra_kwargs = {} + if args.model in ("dailyactformer", "ours", "daf"): + extra_kwargs["causal"] = (args.mode == "anticipation") + extra_kwargs["dropout"] = args.dropout + # Every model class now accepts use_prev_action; pass it uniformly. + extra_kwargs["use_prev_action"] = args.use_prev_action + model = build_model(args.model, train_ds.modality_dims, **extra_kwargs).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"[model] {args.model} params={n_params:,}") + + opt = torch.optim.AdamW( + model.parameters(), lr=args.lr, weight_decay=args.weight_decay, + ) + if args.warmup_epochs > 0: + warmup = torch.optim.lr_scheduler.LinearLR( + opt, start_factor=1.0 / max(1, args.warmup_epochs), end_factor=1.0, + total_iters=args.warmup_epochs, + ) + cosine = torch.optim.lr_scheduler.CosineAnnealingLR( + opt, T_max=max(1, args.epochs - args.warmup_epochs), + eta_min=args.lr * 0.05, + ) + sched = torch.optim.lr_scheduler.SequentialLR( + opt, schedulers=[warmup, cosine], milestones=[args.warmup_epochs], + ) + else: + sched = torch.optim.lr_scheduler.CosineAnnealingLR( + opt, T_max=args.epochs, eta_min=args.lr * 0.05, + ) + + lambda_cfg = { + "verb_fine": args.lambda_verb_fine, + "verb_composite": args.lambda_verb_composite, + "noun": args.lambda_noun, + "hand": args.lambda_hand, + } + + # Output directory + out_dir = Path(args.output_dir) + if args.tag: + out_dir = out_dir.parent / f"{out_dir.name}_{args.tag}" + out_dir.mkdir(parents=True, exist_ok=True) + with open(out_dir / "config.json", "w") as f: + json.dump(vars(args) | {"n_params": n_params}, f, indent=2) + + best = {"action_vn_top1": -1.0, "action_top1": -1.0} + best_epoch = 0 + best_path = out_dir / "model_best.pt" + patience = 0 + history = [] + + for epoch in range(1, args.epochs + 1): + t0 = time.time() + model.train() + losses_epoch = {k: 0.0 for k in + ("verb_fine", "verb_composite", "noun", "hand", "total")} + n_batches = 0 + for batch in train_loader: + if len(batch) == 6: + x, mask, lens, y, meta, prev = batch + else: + x, mask, lens, y, meta = batch + prev = None + x = {m: t.to(device) for m, t in x.items()} + mask = mask.to(device) + y = {k: v.to(device) for k, v in y.items()} + + if args.modality_dropout > 0.0: + x = apply_modality_dropout(x, args.modality_dropout) + + kwargs = {} + if prev is not None and getattr(model, "use_prev_action", False): + kwargs["prev_v_comp"] = prev["verb_composite"].to(device) + kwargs["prev_noun"] = prev["noun"].to(device) + + opt.zero_grad() + logits = model(x, mask, **kwargs) + l = triplet_loss(logits, y, weights, lambda_cfg, + label_smoothing=args.label_smoothing) + l["total"].backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) + opt.step() + + for k in losses_epoch: + losses_epoch[k] += float(l[k].detach().item()) + n_batches += 1 + + for k in losses_epoch: + losses_epoch[k] /= max(1, n_batches) + sched.step() + + metrics = evaluate(model, test_loader, device) + dur = time.time() - t0 + + print( + f" E{epoch:3d} loss={losses_epoch['total']:.3f} " + f"(vf={losses_epoch['verb_fine']:.2f} " + f"n={losses_epoch['noun']:.2f} " + f"h={losses_epoch['hand']:.2f}) | " + f"act_vn@1={metrics['action_vn_top1']:.3f} " + f"vf@1={metrics['verb_fine_top1']:.3f} " + f"n@1={metrics['noun_top1']:.3f} " + f"h@1={metrics['hand_top1']:.3f} | " + f"{dur:.1f}s", + flush=True, + ) + + history.append({"epoch": epoch, **losses_epoch, **metrics}) + if metrics["action_vn_top1"] > best["action_vn_top1"]: + best = dict(metrics) + best_epoch = epoch + patience = 0 + torch.save( + {"state_dict": {k: v.cpu().clone() + for k, v in model.state_dict().items()}, + "epoch": epoch, + "metrics": metrics}, + best_path, + ) + else: + patience += 1 + if patience >= args.patience: + print(f" early stop at epoch {epoch} (best epoch {best_epoch})") + break + + # Write results + results = { + "best_epoch": best_epoch, + "best_test_metrics": best, + "history": history, + "n_params": n_params, + "train_size": len(train_ds), + "test_size": len(test_ds), + "train_class_counts": {k: v.tolist() for k, v in counts.items()}, + "modality_dims": train_ds.modality_dims, + "args": vars(args), + } + with open(out_dir / "results.json", "w") as f: + json.dump(results, f, indent=2) + print(f"\n[done] best action_vn@1 = {best['action_vn_top1']:.4f} " + f"(legacy action@1 = {best['action_top1']:.4f}, epoch {best_epoch}) " + f"saved to {out_dir}") + + +if __name__ == "__main__": + main() diff --git a/experiments/tasks/train_signal_forecast.py b/experiments/tasks/train_signal_forecast.py new file mode 100644 index 0000000000000000000000000000000000000000..fc4f2374c06284c2dbaf2c53b7558b129a0b6852 --- /dev/null +++ b/experiments/tasks/train_signal_forecast.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 +"""Train + evaluate frame-level future-signal forecasting (T8 v2). + +Predicts the raw future signal of one target modality (IMU, EMG, or MoCap) +from past T_obs of input modalities. Reports skill score against persistence +baseline, broken down by 4 contact-event types. + +Three configurations supported (driven by --modalities): + A. Target-only e.g. --modalities imu (target IMU) + B. Target + Pressure e.g. --modalities imu,pressure (target IMU) + C. Target + Pressure (zeroed) set --modalities imu,pressure --zero_pressure_at_eval + This loads the same checkpoint trained as B and re-evaluates with the + pressure channel forced to zero at test time, isolating pressure's + causal contribution net of model capacity. + +Skill score = 1 - MSE(pred, true) / MSE(persistence, true) +where persistence = repeat last observed target frame T_fut times. +""" +from __future__ import annotations +import argparse +import json +import random +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import DataLoader + +THIS = Path(__file__).resolve() +sys.path.insert(0, str(THIS.parent)) +sys.path.insert(0, str(THIS.parents[1])) +sys.path.insert(0, str(THIS.parents[1] / "table8" / "code")) + +try: + from experiments.dataset_signal_forecast import ( + SignalForecastDataset, collate_signal_forecast, + build_signal_train_test, EVENT_NAMES, + ) +except ModuleNotFoundError: + from dataset_signal_forecast import ( + SignalForecastDataset, collate_signal_forecast, + build_signal_train_test, EVENT_NAMES, + ) +from nets.models_forecast import build_forecast_model # type: ignore + + +def set_seed(seed: int): + random.seed(seed); np.random.seed(seed) + torch.manual_seed(seed); torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, loader, optimizer, device): + """Model predicts residual to persistence: target = y - y_last.""" + model.train() + total, n = 0.0, 0 + for x, y, y_last, _et, _ in loader: + x = {m: v.to(device) for m, v in x.items()} + y = y.to(device) + y_last = y_last.to(device).unsqueeze(1) # (B, 1, target_dim) + residual_target = y - y_last # (B, T_fut, target_dim) + optimizer.zero_grad() + pred = model(x) # (B, T_fut, target_dim) — residual + loss = ((pred - residual_target) ** 2).mean() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += loss.item() * y.numel() + n += y.numel() + return total / max(n, 1) + + +@torch.no_grad() +def evaluate(model, loader, device, t_fut: int, target_dim: int, + zero_pressure: bool = False): + """Return per-event-type and overall: MSE_model, MSE_persist, skill_score, + plus per-horizon skill_score.""" + model.eval() + # Accumulators: (4 event types + 1 overall) x ... + sse_m = np.zeros((5, t_fut), dtype=np.float64) + sse_p = np.zeros((5, t_fut), dtype=np.float64) + n_pairs = np.zeros((5, t_fut), dtype=np.int64) + + for x, y, y_last, et, _ in loader: + x = {m: v.to(device) for m, v in x.items()} + if zero_pressure and "pressure" in x: + x["pressure"] = torch.zeros_like(x["pressure"]) + y = y.to(device) # (B, T_fut, D) + y_last = y_last.to(device).unsqueeze(1) # (B, 1, D) + pred = model(x) # (B, T_fut, D) — residual + pred_full = pred + y_last # back to y-space + persist = y_last.expand_as(y) # (B, T_fut, D) + m_err = ((pred_full - y) ** 2).mean(dim=-1) # (B, T_fut) + p_err = ((persist - y) ** 2).mean(dim=-1) # (B, T_fut) + et_np = et.numpy() + m_np, p_np = m_err.cpu().numpy(), p_err.cpu().numpy() + for k in range(m_np.shape[0]): + e = int(et_np[k]) + sse_m[e] += m_np[k]; sse_p[e] += p_np[k]; n_pairs[e] += 1 + sse_m[4] += m_np[k]; sse_p[4] += p_np[k]; n_pairs[4] += 1 + + out = {} + for e in range(5): + n = max(int(n_pairs[e].max()), 1) + mse_m = (sse_m[e] / np.maximum(n_pairs[e], 1)).mean() + mse_p = (sse_p[e] / np.maximum(n_pairs[e], 1)).mean() + skill = 1.0 - (mse_m / mse_p) if mse_p > 1e-9 else 0.0 + # per-horizon skill + per_h_m = sse_m[e] / np.maximum(n_pairs[e], 1) + per_h_p = sse_p[e] / np.maximum(n_pairs[e], 1) + per_h_skill = (1.0 - per_h_m / np.maximum(per_h_p, 1e-9)).tolist() + name = EVENT_NAMES.get(e, "overall") if e < 4 else "overall" + out[name] = { + "n_anchors": int(n), + "mse_model": float(mse_m), + "mse_persist": float(mse_p), + "skill_score": float(skill), + "per_h_skill": [float(s) for s in per_h_skill], + } + return out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True, choices=["daf", "futr", "deepconvlstm"]) + ap.add_argument("--input_modalities", required=True, + help="e.g. 'imu' or 'imu,pressure'") + ap.add_argument("--target_modality", required=True, choices=["imu", "emg", "mocap"]) + ap.add_argument("--t_obs", type=float, default=1.5) + ap.add_argument("--t_fut", type=float, default=0.5) + ap.add_argument("--anchor_stride", type=float, default=0.25) + ap.add_argument("--per_event_max", type=int, default=8000, + help="Cap each event-type pool to this many anchors (per split). " + "Use a large number to keep all anchors.") + ap.add_argument("--epochs", type=int, default=25) + ap.add_argument("--batch_size", type=int, default=64) + ap.add_argument("--lr", type=float, default=3e-4) + ap.add_argument("--weight_decay", type=float, default=1e-4) + ap.add_argument("--d_model", type=int, default=128) + ap.add_argument("--dropout", type=float, default=0.1) + ap.add_argument("--num_workers", type=int, default=2) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--patience", type=int, default=5) + ap.add_argument("--zero_pressure_at_eval", action="store_true", + help="Eval-only: zero out the pressure input (causal-ablation control).") + ap.add_argument("--load_checkpoint", type=str, default=None, + help="Skip training, load checkpoint and run only eval (for control C).") + ap.add_argument("--output_dir", required=True) + args = ap.parse_args() + + set_seed(args.seed) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + inputs = args.input_modalities.split(",") + print(f"device={device} seed={args.seed} model={args.model} " + f"inputs={inputs} target={args.target_modality} " + f"t_obs={args.t_obs} t_fut={args.t_fut} " + f"zero_pressure_at_eval={args.zero_pressure_at_eval}", flush=True) + + train_ds, test_ds = build_signal_train_test( + input_modalities=inputs, + target_modality=args.target_modality, + t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + anchor_stride_sec=args.anchor_stride, + per_event_max=args.per_event_max, + rng_seed=args.seed, + ) + target_dim = train_ds.target_dim + print(f"train={len(train_ds)} test={len(test_ds)} target_dim={target_dim}", + flush=True) + + tr_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, + num_workers=args.num_workers, collate_fn=collate_signal_forecast, + drop_last=False) + te_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, + num_workers=args.num_workers, collate_fn=collate_signal_forecast) + + # Build model with output dim = target_dim (regression) + model = build_forecast_model( + args.model, train_ds.modality_dims, + num_classes=target_dim, + t_obs=train_ds.T_obs, t_fut=train_ds.T_fut, + d_model=args.d_model, dropout=args.dropout, + ).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"params={n_params:,}", flush=True) + + out_dir = Path(args.output_dir); out_dir.mkdir(parents=True, exist_ok=True) + + # ---- Eval-only mode (config C: load checkpoint trained as B, re-eval) ---- + if args.load_checkpoint is not None: + print(f"loading checkpoint {args.load_checkpoint}", flush=True) + sd = torch.load(args.load_checkpoint, map_location=device) + model.load_state_dict(sd) + ev = evaluate(model, te_loader, device, + t_fut=train_ds.T_fut, target_dim=target_dim, + zero_pressure=args.zero_pressure_at_eval) + out = { + "method": args.model, + "input_modalities": inputs, + "target_modality": args.target_modality, + "seed": args.seed, + "n_params": n_params, + "T_obs": train_ds.T_obs, "T_fut": train_ds.T_fut, "target_dim": target_dim, + "best_epoch": -1, "mode": "eval_only", + "zero_pressure_at_eval": bool(args.zero_pressure_at_eval), + "loaded_from": args.load_checkpoint, + "eval": ev, + "args": vars(args), + } + with open(out_dir / "results.json", "w") as f: + json.dump(out, f, indent=2) + print(f"[done] overall skill_score = {ev['overall']['skill_score']:.4f}", flush=True) + for e in ("non-contact", "pre-contact", "steady-grip", "release"): + print(f" {e:14s} skill={ev[e]['skill_score']:+.4f} (n={ev[e]['n_anchors']})", flush=True) + return + + # ---- Standard training (config A or B) ---- + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + sched = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs, eta_min=args.lr * 0.05) + + best_skill = -1e9 + best_epoch = 0 + best_eval = None + patience_counter = 0 + for ep in range(1, args.epochs + 1): + t0 = time.time() + tr_loss = train_epoch(model, tr_loader, optimizer, device) + ev = evaluate(model, te_loader, device, + t_fut=train_ds.T_fut, target_dim=target_dim, + zero_pressure=False) + sched.step() + skill = ev["overall"]["skill_score"] + print(f" E{ep:2d} | tr_mse {tr_loss:.4f} | te_skill {skill:+.4f} " + f"| pre {ev['pre-contact']['skill_score']:+.3f} " + f"steady {ev['steady-grip']['skill_score']:+.3f} " + f"release {ev['release']['skill_score']:+.3f} " + f"non {ev['non-contact']['skill_score']:+.3f} " + f"| {time.time()-t0:.1f}s", flush=True) + if skill > best_skill: + best_skill = skill + best_epoch = ep + best_eval = ev + torch.save({k: v.cpu() for k, v in model.state_dict().items()}, + out_dir / "model_best.pt") + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" early stop at epoch {ep} (best {best_epoch})", flush=True) + break + + out = { + "method": args.model, + "input_modalities": inputs, + "target_modality": args.target_modality, + "seed": args.seed, + "n_params": n_params, + "T_obs": train_ds.T_obs, "T_fut": train_ds.T_fut, "target_dim": target_dim, + "best_epoch": int(best_epoch), + "best_skill": float(best_skill), + "eval": best_eval, + "args": vars(args), + } + with open(out_dir / "results.json", "w") as f: + json.dump(out, f, indent=2) + print(f"\n[done] best skill={best_skill:+.4f} at epoch {best_epoch}", flush=True) + print(f"saved to {out_dir}/results.json", flush=True) + + +if __name__ == "__main__": + main() diff --git a/experiments/tasks/train_signal_forecast_priv.py b/experiments/tasks/train_signal_forecast_priv.py new file mode 100644 index 0000000000000000000000000000000000000000..78595138bed01b35e411130512cdad6f2f4e1596 --- /dev/null +++ b/experiments/tasks/train_signal_forecast_priv.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +"""Train + evaluate T8 v3 — privileged future-pressure conditioning (Option B). + +Compared to train_signal_forecast.py: + - Inputs: past 1.5s of `input_modalities` (e.g. just target modality) + + future T_fut s of pressure (privileged side channel) + - Output: future T_fut s of `target_modality` + - Comparison baseline (A_priv): existing `_no_pressure` runs from T8 v2. + - This run is the B_priv group; lift = skill(B_priv) - skill(A_priv). + +If lift >> 0, future pressure trajectory carries information about future +kinematics that past kinematics alone do not encode. This directly tests +the Johansson 1984 hypothesis at the algorithmic level. +""" +from __future__ import annotations +import argparse +import json +import random +import sys +import time +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import DataLoader + +THIS = Path(__file__).resolve() +sys.path.insert(0, str(THIS.parent)) +sys.path.insert(0, str(THIS.parents[1])) + +from data.dataset_signal_forecast import ( + SignalForecastDataset, collate_signal_forecast, + build_signal_train_test, EVENT_NAMES, +) +from nets.models_forecast_priv import DAFFuturePressure + + +def set_seed(seed: int): + random.seed(seed); np.random.seed(seed) + torch.manual_seed(seed); torch.cuda.manual_seed_all(seed) + + +def train_epoch(model, loader, optimizer, device): + model.train() + total, n = 0.0, 0 + for x, y, y_last, fp, _et, _ in loader: + x = {m: v.to(device) for m, v in x.items()} + y = y.to(device) + y_last = y_last.to(device).unsqueeze(1) + fp = fp.to(device) + residual_target = y - y_last + optimizer.zero_grad() + pred = model(x, fp) + loss = ((pred - residual_target) ** 2).mean() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + total += loss.item() * y.numel() + n += y.numel() + return total / max(n, 1) + + +@torch.no_grad() +def evaluate(model, loader, device, t_fut, target_dim): + model.eval() + sse_m = np.zeros((5, t_fut), dtype=np.float64) + sse_p = np.zeros((5, t_fut), dtype=np.float64) + n_pairs = np.zeros((5, t_fut), dtype=np.int64) + + for x, y, y_last, fp, et, _ in loader: + x = {m: v.to(device) for m, v in x.items()} + y = y.to(device) + y_last = y_last.to(device).unsqueeze(1) + fp = fp.to(device) + pred = model(x, fp) # residual + pred_full = pred + y_last + persist = y_last.expand_as(y) + m_err = ((pred_full - y) ** 2).mean(dim=-1) + p_err = ((persist - y) ** 2).mean(dim=-1) + et_np = et.numpy() + m_np, p_np = m_err.cpu().numpy(), p_err.cpu().numpy() + for k in range(m_np.shape[0]): + e = int(et_np[k]) + sse_m[e] += m_np[k]; sse_p[e] += p_np[k]; n_pairs[e] += 1 + sse_m[4] += m_np[k]; sse_p[4] += p_np[k]; n_pairs[4] += 1 + + out = {} + for e in range(5): + n = max(int(n_pairs[e].max()), 1) + mse_m = (sse_m[e] / np.maximum(n_pairs[e], 1)).mean() + mse_p = (sse_p[e] / np.maximum(n_pairs[e], 1)).mean() + skill = 1.0 - (mse_m / mse_p) if mse_p > 1e-9 else 0.0 + per_h_skill = (1.0 - (sse_m[e] / np.maximum(n_pairs[e], 1)) / + np.maximum(sse_p[e] / np.maximum(n_pairs[e], 1), 1e-9)).tolist() + name = EVENT_NAMES.get(e, "overall") if e < 4 else "overall" + out[name] = { + "n_anchors": int(n), + "mse_model": float(mse_m), + "mse_persist": float(mse_p), + "skill_score": float(skill), + "per_h_skill": [float(s) for s in per_h_skill], + } + return out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--input_modalities", required=True, + help="comma-separated; pressure NOT included unless you want past pressure too") + ap.add_argument("--target_modality", required=True, choices=["imu", "emg", "mocap"]) + ap.add_argument("--t_obs", type=float, default=1.5) + ap.add_argument("--t_fut", type=float, default=0.5) + ap.add_argument("--anchor_stride", type=float, default=0.25) + ap.add_argument("--per_event_max", type=int, default=8000) + ap.add_argument("--epochs", type=int, default=25) + ap.add_argument("--batch_size", type=int, default=64) + ap.add_argument("--lr", type=float, default=3e-4) + ap.add_argument("--weight_decay", type=float, default=1e-4) + ap.add_argument("--d_model", type=int, default=128) + ap.add_argument("--dropout", type=float, default=0.1) + ap.add_argument("--num_workers", type=int, default=2) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--patience", type=int, default=6) + ap.add_argument("--output_dir", required=True) + args = ap.parse_args() + + set_seed(args.seed) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + inputs = args.input_modalities.split(",") + print(f"device={device} seed={args.seed} model=DAF-priv " + f"inputs={inputs} target={args.target_modality} " + f"t_obs={args.t_obs} t_fut={args.t_fut}", flush=True) + + train_ds, test_ds = build_signal_train_test( + input_modalities=inputs, + target_modality=args.target_modality, + t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + anchor_stride_sec=args.anchor_stride, + per_event_max=args.per_event_max, + include_future_pressure=True, + rng_seed=args.seed, + ) + target_dim = train_ds.target_dim + print(f"train={len(train_ds)} test={len(test_ds)} target_dim={target_dim}", + flush=True) + + tr_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, + num_workers=args.num_workers, + collate_fn=collate_signal_forecast, drop_last=False) + te_loader = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, + num_workers=args.num_workers, + collate_fn=collate_signal_forecast) + + model = DAFFuturePressure( + train_ds.modality_dims, target_dim=target_dim, + t_obs=train_ds.T_obs, t_fut=train_ds.T_fut, + future_pressure_dim=50, + d_model=args.d_model, dropout=args.dropout, + ).to(device) + n_params = sum(p.numel() for p in model.parameters()) + print(f"params={n_params:,}", flush=True) + + optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, + weight_decay=args.weight_decay) + sched = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=args.epochs, eta_min=args.lr * 0.05 + ) + + out_dir = Path(args.output_dir); out_dir.mkdir(parents=True, exist_ok=True) + best_skill = -1e9 + best_epoch, best_eval = 0, None + patience_counter = 0 + for ep in range(1, args.epochs + 1): + t0 = time.time() + tr_loss = train_epoch(model, tr_loader, optimizer, device) + ev = evaluate(model, te_loader, device, + t_fut=train_ds.T_fut, target_dim=target_dim) + sched.step() + skill = ev["overall"]["skill_score"] + print(f" E{ep:2d} | tr_mse {tr_loss:.4f} | te_skill {skill:+.4f} " + f"| pre {ev['pre-contact']['skill_score']:+.3f} " + f"steady {ev['steady-grip']['skill_score']:+.3f} " + f"release {ev['release']['skill_score']:+.3f} " + f"non {ev['non-contact']['skill_score']:+.3f} " + f"| {time.time()-t0:.1f}s", flush=True) + if skill > best_skill: + best_skill = skill + best_epoch = ep + best_eval = ev + torch.save({k: v.cpu() for k, v in model.state_dict().items()}, + out_dir / "model_best.pt") + patience_counter = 0 + else: + patience_counter += 1 + if patience_counter >= args.patience: + print(f" early stop at epoch {ep} (best {best_epoch})", flush=True) + break + + out = { + "method": "daf_priv", + "input_modalities": inputs, + "target_modality": args.target_modality, + "future_pressure": True, + "seed": args.seed, "n_params": n_params, + "T_obs": train_ds.T_obs, "T_fut": train_ds.T_fut, "target_dim": target_dim, + "best_epoch": int(best_epoch), "best_skill": float(best_skill), + "eval": best_eval, "args": vars(args), + } + with open(out_dir / "results.json", "w") as f: + json.dump(out, f, indent=2) + print(f"\n[done] best skill={best_skill:+.4f} at epoch {best_epoch}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/experiments/taxonomy.py b/experiments/taxonomy.py new file mode 100644 index 0000000000000000000000000000000000000000..6743f0ceb6886e5783b991063499334bd9004721 --- /dev/null +++ b/experiments/taxonomy.py @@ -0,0 +1,203 @@ +""" +Taxonomy for T10 Next-Action Triplet Prediction on DailyAct-5M. + +Design decisions (fixed per user): + * VERB_FINE: 17 primitives observed in annotations_v3 (Strategy: keep all) + * VERB_COMPOSITE: 6 classes by manual rollup + * NOUN: keep nouns with >=50 segments (Strategy A: drop others entirely) + * HAND: 3 classes {left, right, both} + +The noun list is *frozen* in taxonomy_v3.json so class indices stay stable even +as more annotations are added. Regenerate with `build_taxonomy.py` when you are +ready to lock the final list. +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import Dict, List, Optional + +# --------------------------------------------------------------------------- +# Verb (fine, 17 classes) +# --------------------------------------------------------------------------- + +VERB_FINE: List[str] = [ + "grasp", "move", "place", "adjust", + "pick_up", "hold", "pull", "put_down", + "close", "release", "rotate", "open", + "insert", "push", "align", "remove", + "stabilize", +] +NUM_VERB_FINE = len(VERB_FINE) # 17 +VERB_FINE_IDX: Dict[str, int] = {v: i for i, v in enumerate(VERB_FINE)} + + +# --------------------------------------------------------------------------- +# Verb (composite, 6 classes) — manual rollup +# --------------------------------------------------------------------------- + +VERB_COMPOSITE: List[str] = [ + "grasp-family", # grasp, pick_up, hold + "place-family", # place, put_down + "transport", # move, pull, push + "adjust", # adjust, align, stabilize + "state-change", # open, close, rotate, insert, remove + "release", # release +] +NUM_VERB_COMPOSITE = len(VERB_COMPOSITE) # 6 +VERB_COMPOSITE_IDX: Dict[str, int] = {v: i for i, v in enumerate(VERB_COMPOSITE)} + +_FINE_TO_COMPOSITE: Dict[str, str] = { + "grasp": "grasp-family", + "pick_up": "grasp-family", + "hold": "grasp-family", + "place": "place-family", + "put_down": "place-family", + "move": "transport", + "pull": "transport", + "push": "transport", + "adjust": "adjust", + "align": "adjust", + "stabilize": "adjust", + "open": "state-change", + "close": "state-change", + "rotate": "state-change", + "insert": "state-change", + "remove": "state-change", + "release": "release", +} +assert set(_FINE_TO_COMPOSITE.keys()) == set(VERB_FINE), ( + "Verb rollup must cover every fine verb" +) + + +def verb_fine_to_composite_idx(verb_fine: str) -> int: + """Map a fine verb string -> composite class index (0..5).""" + composite = _FINE_TO_COMPOSITE[verb_fine] + return VERB_COMPOSITE_IDX[composite] + + +# --------------------------------------------------------------------------- +# Hand (3 classes) +# --------------------------------------------------------------------------- + +HAND: List[str] = ["left", "right", "both"] +NUM_HAND = len(HAND) +HAND_IDX: Dict[str, int] = {h: i for i, h in enumerate(HAND)} + + +# --------------------------------------------------------------------------- +# Noun — canonical merge table (handles mild annotator inconsistency) +# --------------------------------------------------------------------------- + +NOUN_CANONICAL: Dict[str, str] = { + "折叠雨伞": "folding umbrella", + "mouse": "wired mouse", +} + + +def canonical_noun(n: str) -> str: + """Map raw noun string -> canonical name (handles CJK leak + aliases).""" + return NOUN_CANONICAL.get(n, n) + + +# --------------------------------------------------------------------------- +# Noun list — frozen per-release, loaded from JSON for reproducibility +# --------------------------------------------------------------------------- + +TAXONOMY_FROZEN_PATH = Path(__file__).parent / "taxonomy_v3.json" +NOUN_KEEP_THRESHOLD = 50 + + +def _load_frozen() -> Optional[dict]: + if not TAXONOMY_FROZEN_PATH.exists(): + return None + with open(TAXONOMY_FROZEN_PATH) as f: + return json.load(f) + + +_frozen = _load_frozen() + +if _frozen is not None: + NOUN: List[str] = list(_frozen["nouns"]) + FROZEN_ANNOTATION_COUNT: int = _frozen.get("annotation_file_count", -1) + FROZEN_SEGMENT_COUNT: int = _frozen.get("total_segments", -1) +else: + # Bootstrap list from the initial 167-file scan (Apr 24). Overwritten when + # build_taxonomy.py is run against the final 283-file set. + NOUN = [ + "towel", "sealed jar", "box", "tablecloth", "pot", "tape", "rice bowl", + "pants", "spoon", "marker", "cloth", "plate", "laptop", + "toothbrush case", "tea canister", "hanger", "wired keyboard", + "wired mouse", "laptop power adapter", "seasoning bottle", "mug", + "seasoning jar", "tray", "document", "coat", "tea bag", "water cup", + "shirt", + ] + FROZEN_ANNOTATION_COUNT = 167 + FROZEN_SEGMENT_COUNT = 4140 + +NUM_NOUN = len(NOUN) +NOUN_IDX: Dict[str, int] = {n: i for i, n in enumerate(NOUN)} + + +def noun_to_idx(raw_noun: str) -> Optional[int]: + """Map raw noun -> class index, or None if noun should be dropped (Strategy A).""" + canon = canonical_noun(raw_noun) + return NOUN_IDX.get(canon, None) + + +# --------------------------------------------------------------------------- +# One-shot classify +# --------------------------------------------------------------------------- + +def classify_segment(action_annotation: dict) -> Optional[dict]: + """Convert a raw annotation dict into triplet label indices. + + Returns None if any field is missing or the noun is not in the kept list + (Strategy A: drop the segment). + """ + verb = action_annotation.get("action_name") + noun = action_annotation.get("object_name") + hand = action_annotation.get("hand_type") + if not (verb and noun and hand): + return None + if verb not in VERB_FINE_IDX: + return None + if hand not in HAND_IDX: + return None + n_idx = noun_to_idx(noun) + if n_idx is None: + return None + v_fine_idx = VERB_FINE_IDX[verb] + return { + "verb_fine": v_fine_idx, + "verb_composite": verb_fine_to_composite_idx(verb), + "noun": n_idx, + "hand": HAND_IDX[hand], + } + + +# --------------------------------------------------------------------------- +# Summary for logging / sanity +# --------------------------------------------------------------------------- + +def summary() -> str: + lines = [] + lines.append(f"Verb fine : {NUM_VERB_FINE}") + lines.append(f"Verb composite : {NUM_VERB_COMPOSITE}") + lines.append(f"Noun : {NUM_NOUN} (kept at >= {NOUN_KEEP_THRESHOLD} segments)") + lines.append(f"Hand : {NUM_HAND}") + lines.append(f"Frozen from : {FROZEN_ANNOTATION_COUNT} files, " + f"{FROZEN_SEGMENT_COUNT} segments") + return "\n".join(lines) + + +if __name__ == "__main__": + print(summary()) + print() + print("Verb fine list:", VERB_FINE) + print("Composite: ", VERB_COMPOSITE) + print("Noun list: ", NOUN) + print("Hand list: ", HAND) diff --git a/experiments/taxonomy_v3.json b/experiments/taxonomy_v3.json new file mode 100644 index 0000000000000000000000000000000000000000..1a7fb046e8f13f6433e526bcb04f84e12f03b39b --- /dev/null +++ b/experiments/taxonomy_v3.json @@ -0,0 +1,136 @@ +{ + "threshold": 50, + "annotation_file_count": 283, + "total_segments": 7768, + "dropped_unknown_verb": 0, + "dropped_unknown_hand": 0, + "surviving_segments": 7422, + "verbs": [ + "grasp", + "move", + "place", + "adjust", + "pick_up", + "hold", + "pull", + "put_down", + "close", + "release", + "rotate", + "open", + "insert", + "push", + "align", + "remove", + "stabilize" + ], + "verb_composite": [ + "grasp-family", + "place-family", + "transport", + "adjust", + "state-change", + "release" + ], + "hand": [ + "left", + "right", + "both" + ], + "nouns": [ + "sealed jar", + "towel", + "tablecloth", + "box", + "pot", + "rice bowl", + "tape", + "pants", + "spoon", + "plate", + "marker", + "cloth", + "laptop", + "coat", + "seasoning jar", + "hanger", + "tea canister", + "toothbrush case", + "mug", + "wired mouse", + "tea bag", + "wired keyboard", + "water cup", + "laptop power adapter", + "tray", + "shirt", + "scissors", + "folding umbrella", + "document", + "seasoning bottle", + "wallet", + "suitcase", + "stapler", + "paper" + ], + "noun_counts": { + "sealed jar": 718, + "towel": 486, + "tablecloth": 475, + "box": 460, + "pot": 423, + "rice bowl": 403, + "tape": 389, + "pants": 319, + "spoon": 267, + "plate": 255, + "marker": 254, + "cloth": 238, + "laptop": 222, + "coat": 203, + "seasoning jar": 203, + "hanger": 198, + "tea canister": 193, + "toothbrush case": 138, + "mug": 132, + "wired mouse": 131, + "tea bag": 126, + "wired keyboard": 126, + "water cup": 123, + "laptop power adapter": 121, + "tray": 107, + "shirt": 96, + "scissors": 95, + "folding umbrella": 93, + "document": 89, + "seasoning bottle": 77, + "wallet": 72, + "suitcase": 70, + "stapler": 67, + "paper": 53 + }, + "verb_counts": { + "pull": 223, + "pick_up": 300, + "grasp": 2034, + "move": 1559, + "close": 250, + "put_down": 249, + "place": 1288, + "adjust": 829, + "hold": 198, + "remove": 75, + "open": 191, + "push": 82, + "rotate": 182, + "insert": 77, + "release": 164, + "align": 44, + "stabilize": 23 + }, + "hand_counts": { + "right": 2778, + "both": 3466, + "left": 1524 + } +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5cbaf41b2f3e9a1096e54676d71d8351197565c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +# Core +numpy>=1.24 +pandas>=2.0 +scipy>=1.10 +scikit-learn>=1.3 + +# Deep learning +torch>=2.0 +torchvision>=0.15 +transformers>=4.40 + +# Plotting (figures only; not required for training/eval) +matplotlib>=3.7 + +# Video I/O (for scene-cam feature extraction; optional) +opencv-python>=4.8 diff --git a/scripts/build_paper_tables.py b/scripts/build_paper_tables.py new file mode 100644 index 0000000000000000000000000000000000000000..29e5c8a1a31d29d694de9717e0e2ec7cc40b825b --- /dev/null +++ b/scripts/build_paper_tables.py @@ -0,0 +1,868 @@ +#!/usr/bin/env python3 +"""把论文已有 (T1–T6) + 新跑 (T10) 的全部 result tables 汇总成统一的论文风格 markdown 表。 + +输出:${PULSE_ROOT}/results/paper_style_tables.md + +风格约定: +- 全部叙事中文 +- 指标标题带方向箭头 ↑ / ↓(越高越好 / 越低越好) +- 行按主指标从优到劣排序 +- 每张表后写「这张表说明 / 对我们有利不利」结论 +- Part A:论文 PDF 里现有的 ~15 张表(数据从 paper/sections/*.tex 手抄进来,静态) +- Part B:新跑 T10 五张表(从 135 个 eval_macrof1.json 自动汇总) +""" + +from __future__ import annotations + +import json +from pathlib import Path +from statistics import mean, stdev +from typing import Dict, List + +REPO = Path("${PULSE_ROOT}") +OUT = REPO / "results" / "paper_style_tables.md" + + +# =========================================================================== +# 通用工具 +# =========================================================================== + +def fmt(vals: List[float], digits: int = 4) -> str: + if not vals: + return "—" + if len(vals) == 1: + return f"{vals[0]:.{digits}f}" + return f"{mean(vals):.{digits}f} $\\pm$ {stdev(vals):.{digits}f}" + + +def fmt_meanstd(m: float, s: float, digits: int = 3) -> str: + if s is None: + return f"{m:.{digits}f}" + return f"{m:.{digits}f} $\\pm$ {s:.{digits}f}" + + +def maybe_bold(s: str, is_best: bool) -> str: + return f"**{s}**" if is_best else s + + +# =========================================================================== +# Part B 工具:加载 135 个 eval JSON +# =========================================================================== + +def load_seed_metrics(seed_dir: Path) -> Dict | None: + e = seed_dir / "eval_macrof1.json" + r = seed_dir / "results.json" + if not e.exists() or not r.exists(): + return None + with open(e) as f: + ev = json.load(f) + with open(r) as f: + rs = json.load(f) + return {"eval": ev, "args": rs["args"], "best_epoch": rs.get("best_epoch")} + + +def collect_row(table: str, row: str) -> List[Dict]: + out = [] + rd = REPO / table / row + if not rd.is_dir(): + return out + for sd in sorted((rd / "seeds").glob("seed*")): + m = load_seed_metrics(sd) + if m is not None: + out.append(m) + return out + + +def aggregate_row(seeds: List[Dict]) -> Dict | None: + if not seeds: + return None + keys = ["action_acc", + "verb_fine_acc", "verb_fine_macro_f1", "verb_fine_weighted_f1", + "noun_acc", "noun_macro_f1", "noun_weighted_f1", + "hand_acc", "hand_macro_f1"] + out: Dict = {} + for k in keys: + vals = [s["eval"][k] for s in seeds if k in s["eval"]] + out[k] = {"mean": mean(vals) if vals else 0.0, + "std": stdev(vals) if len(vals) > 1 else 0.0, + "fmt": fmt(vals)} + out["n_params"] = seeds[0]["eval"]["n_params"] + out["modalities"] = seeds[0]["args"]["modalities"] + out["model"] = seeds[0]["args"]["model"] + out["t_fut"] = seeds[0]["args"]["t_fut"] + return out + + +MOD_DISPLAY = {"imu": "IMU", "emg": "EMG", "eyetrack": "Eye", + "mocap": "MoCap", "pressure": "Pressure"} + +def fmt_mods(s: str) -> str: + return "+".join(MOD_DISPLAY.get(m, m) for m in s.split(",")) + + +def bold_best_t10(rows: List[Dict], metric_key: str): + means = [r["agg"][metric_key]["mean"] for r in rows if r.get("agg")] + if not means: + return + best = max(means) + for r in rows: + if r.get("agg") is None: + continue + r.setdefault("best", set()) + if r["agg"][metric_key]["mean"] == best: + r["best"].add(metric_key) + + +def cell_t10(r: Dict, metric_key: str) -> str: + if r.get("agg") is None: + return "—" + s = r["agg"][metric_key]["fmt"] + return maybe_bold(s, metric_key in r.get("best", set())) + + +# =========================================================================== +# 文档头 +# =========================================================================== + +lines: List[str] = [] +def push(s: str = ""): + lines.append(s) + +push("# DailyAct-5M 全部 result tables(论文已有 + 新跑 T10)") +push() +push("**统一风格约定**:") +push() +push("- 指标标题带方向箭头(↑ 越高越好,↓ 越低越好)") +push("- 行按主指标从优到劣排序;每个指标列内,最优值 **加粗**") +push("- 每张表后写「这张表说明」+「对我们有利还是不利」(🟢 有利 / 🟡 半利半弊 / 🔴 不利)") +push("- 模态简写:`IMU` / `EMG` / `Eye` / `MoCap` / `Pressure`,加号表示并集(`IMU+MoCap+EMG`)") +push() +push("**目录**") +push() +push("- Part A:论文 PDF (`main.pdf`) 里现有的 result tables(已发表内容)") +push(" - A.1 场景识别(T1):4 张") +push(" - A.2 SyncFuse 组件消融(T1 扩展):1 张") +push(" - A.5 抓取接触检测(T2):1 张") +push(" - A.6 缺失模态鲁棒性(T6):1 张") +push(" - A.7 抓取相关回归 / 预判(T4 / T5):2 张") +push(" - A.8 跨模态检索(T3):1 张") +push(" - A.9 诊断表(zero-shot / per-subject):2 张") +push("- Part B:新跑 T10 Triplet Next-Action Prediction 的 5 张表") +push() +push("---") +push() + + +# =========================================================================== +# Part A:论文已有表(数据手抄自 paper/sections/*.tex) +# =========================================================================== + +push("# Part A — 论文 PDF 里现有的 result tables") +push() +push("> 这些数据来自 `paper/sections/results.tex` / `paper/sections/supplementary.tex`," + "**已经写进 main.pdf**。这里只是用统一中文风格重排。") +push() + +# --------------------------------------------------------------------------- +# A.1.1 Table tab:scene-single-vs-multi +# --------------------------------------------------------------------------- + +push("## A.1 场景识别(T1)") +push() +push("### A.1.1 单模态 vs 多模态(`tab:scene-single-vs-multi`)") +push() +push("Transformer backbone,5 seeds。") +push() +# Data: Configuration, Modalities, F1 mean, F1 std, Acc mean, Acc std +data = [ + ("IMU only", "IMU", 0.573, 0.073, 0.624, 0.073), + ("IMU+MoCap+EMG (late)", "IMU+MoCap+EMG", 0.607, 0.057, 0.616, 0.046), + ("IMU+MoCap+EMG (late, pretrained)", "IMU+MoCap+EMG", 0.696, 0.045, 0.696, 0.046), +] +data_sorted = sorted(data, key=lambda x: -x[2]) # sort by F1 desc +best_f1 = max(x[2] for x in data_sorted) +best_acc = max(x[4] for x in data_sorted) +push("| 排名 | Configuration | Modalities | Mean F1 ↑ | Mean Acc ↑ |") +push("|---|---|---|---|---|") +for rank, (cfg, mods, f1, sf1, acc, sacc) in enumerate(data_sorted, 1): + push(f"| {rank} | {cfg} | {mods} | " + f"{maybe_bold(fmt_meanstd(f1,sf1), f1==best_f1)} | " + f"{maybe_bold(fmt_meanstd(acc,sacc), acc==best_acc)} |") +push() +push("**这张表说明:**") +push() +push("- 单模 IMU 0.573 → 加 MoCap+EMG 后 0.607(+3.4 pp)→ 加 pretrained backbone 0.696(+8.9 pp)。") +push("- 三行单调上升,**多模态 + pretrained transfer** 是这一节的核心设计选择。") +push() +push("**对我们有利吗?🟢 有利。** 这是论文 T1 的承重墙之一,故事干净,数字单调。") +push() + +# --------------------------------------------------------------------------- +# A.1.2 Table tab:scene-pretrain +# --------------------------------------------------------------------------- + +push("### A.1.2 Pretrain × Augmentation 消融(`tab:scene-pretrain`)") +push() +push("Late fusion + 3 modalities,5 seeds。") +push() +data = [ + ("No augment, No pretrain", False, False, 0.607, "baseline"), + ("Yes augment, No pretrain", True, False, 0.556, "−5.1 pp"), + ("No augment, Yes pretrain", False, True, 0.696, "+8.9 pp"), + ("Yes augment, Yes pretrain", True, True, 0.681, "+7.4 pp"), +] +data_sorted = sorted(data, key=lambda x: -x[3]) +best_f1 = max(x[3] for x in data_sorted) +push("| 排名 | Augmentation | Pretrained | Mean F1 ↑ | Improvement |") +push("|---|---|---|---|---|") +for rank, (label, aug, pre, f1, imp) in enumerate(data_sorted, 1): + push(f"| {rank} | {'Yes' if aug else 'No'} | {'Yes' if pre else 'No'} | " + f"{maybe_bold(f'{f1:.3f}', f1==best_f1)} | {imp} |") +push() +push("**这张表说明:**") +push() +push("- Pretrain 有效(+8.9 pp);**Augmentation 反而伤模型**(−5.1 pp,在 102 训练样本下增广引入分布伪影)。") +push("- 最佳组合是 `No augment + Yes pretrain` = 0.696。") +push() +push("**对我们有利吗?🟡 半利半弊。** Pretrain 正向是好故事;augment 反向需要在文里圆," + "现稿用 \"distributional artifacts\" 解释,可能被审稿人质疑。") +push() + +# --------------------------------------------------------------------------- +# A.1.3 Table tab:scene-published (vs DeepConvLSTM, TinyHAR, InceptionTime) +# --------------------------------------------------------------------------- + +push("### A.1.3 与已发表 baseline 对比(`tab:scene-published`)") +push() +push("Acc / Macro F1 越高越好。所有方法在相同 subject-independent split 上跑。") +push() +data = [ + ("DeepConvLSTM (Ordóñez '16)", "IMU", "early", 0.240, 0.137, "Repro"), + ("DeepConvLSTM (Ordóñez '16)", "IMU+MoCap+EMG", "late", 0.240, 0.137, "Repro"), + ("TinyHAR (Zhou '22)", "IMU", "early", 0.480, 0.405, "Repro"), + ("InceptionTime (Fawaz '20)", "IMU", "early", 0.480, 0.445, "Repro"), + ("InceptionTime (Fawaz '20)", "IMU+MoCap+EMG", "late", 0.440, 0.402, "Repro"), + ("Transformer (Ours)", "IMU", "early", 0.720, 0.658, "**Ours**"), + ("Transformer + Pretrain (Ours)", "IMU+MoCap+EMG", "late", 0.760, 0.763, "**Ours**"), +] +data_sorted = sorted(data, key=lambda x: -x[3]) +best_acc = max(x[3] for x in data_sorted) +best_f1 = max(x[4] for x in data_sorted) +push("| 排名 | Method | Type | Modality | Fusion | Acc ↑ | Macro F1 ↑ |") +push("|---|---|---|---|---|---|---|") +for rank, (m, mods, fu, acc, f1, t) in enumerate(data_sorted, 1): + push(f"| {rank} | {m} | {t} | {mods} | {fu} | " + f"{maybe_bold(f'{acc:.3f}', acc==best_acc)} | " + f"{maybe_bold(f'{f1:.3f}', f1==best_f1)} |") +push() +push("**这张表说明:**") +push() +push("- Transformer + Pretrain (Ours) 拿到 Acc **0.760** / F1 **0.763**,**全场最高**,大幅超过 DeepConvLSTM(0.137)、TinyHAR(0.405)、InceptionTime(0.445)。") +push("- DeepConvLSTM 在我们这个长序列(1–4 min)上塌陷成 all-Idle 预测,F1 只有 0.137。") +push() +push("**对我们有利吗?🟢 强有利。** 对 3 个已发表 baseline 全胜,差距巨大。是 paper 的核心 selling table 之一。") +push() + +# --------------------------------------------------------------------------- +# A.1.4 Table tab:scene-published-ext (SyncFuse vs MulT, Perceiver IO, etc) +# --------------------------------------------------------------------------- + +push("### A.1.4 扩展 baseline 对比 + SyncFuse(`tab:scene-published-ext`)") +push() +push("4-mod(MoCap+EMG+Eye+IMU)统一 split,3 seeds。") +push() +data = [ + ("ActionSense LSTM (DelPreto '22)", "MoCap+EMG+Eye+IMU", 0.160, 0.005, 0.267, 0.019, "1.2M", "Repro"), + ("Perceiver IO (Jaegle '21)", "MoCap+EMG+Eye+IMU", 0.205, 0.053, 0.280, 0.033, "1.4M", "Repro"), + ("ST-GCN (Yan '18)", "MoCap", 0.282, 0.093, 0.333, 0.082, "7.0M", "Repro"), + ("EMG-CNN (sEMG lit.)", "EMG", 0.292, 0.012, 0.347, 0.038, "146K", "Repro"), + ("LIMU-BERT (Xu '21)", "IMU", 0.345, 0.047, 0.413, 0.019, "1.3M", "Repro"), + ("CTR-GCN (Chen '21)", "MoCap", 0.375, 0.061, 0.387, 0.038, "3.8M", "Repro"), + ("MulT (Tsai '19)", "MoCap+EMG+IMU", 0.466, 0.129, 0.493, 0.100, "3.9M", "Repro"), + ("SyncFuse (Ours)", "MoCap+EMG+Eye+IMU", 0.516, 0.039, 0.520, 0.033, "3.9M", "**Ours**"), +] +data_sorted = sorted(data, key=lambda x: -x[2]) +best_f1 = max(x[2] for x in data_sorted) +best_acc = max(x[4] for x in data_sorted) +push("| 排名 | Method | Type | Modalities | Macro F1 ↑ | Accuracy ↑ | Params |") +push("|---|---|---|---|---|---|---|") +for rank, (m, mods, f1, sf, acc, sa, p, t) in enumerate(data_sorted, 1): + push(f"| {rank} | {m} | {t} | {mods} | " + f"{maybe_bold(fmt_meanstd(f1,sf), f1==best_f1)} | " + f"{maybe_bold(fmt_meanstd(acc,sa), acc==best_acc)} | {p} |") +push() +push("**这张表说明:**") +push() +push("- **SyncFuse (Ours) 排第 1**:Macro F1 0.516,比 MulT 第 2(0.466)+5 pp;且 std 0.039 是所有多模态方法里最低。") +push("- 单模态方法(ST-GCN / CTR-GCN / LIMU-BERT)处于中段;最差的是 ActionSense LSTM(0.160)和 Perceiver IO(0.205)。") +push() +push("**对我们有利吗?🟢 强有利。** SyncFuse 在 7 个新 baseline 上**全胜**且 std 最低,可作为方法贡献的核心证据。") +push() + +# --------------------------------------------------------------------------- +# A.2 Table tab:syncfuse-ablation +# --------------------------------------------------------------------------- + +push("## A.2 SyncFuse 组件消融") +push() +push("### A.2.1 SyncFuse 组件消融(`tab:syncfuse-ablation`)") +push() +push("seed 42,4-modal,Macro F1 ↑。") +push() +data = [ + ("Full SyncFuse", 0.535, "—"), + ("− modality dropout (p=0)", 0.504, "−3.1 pp"), + ("− learnable late fusion(改成简单平均)", 0.482, "−5.3 pp"), + ("− cross-modal temporal-shift attention", 0.450, "−8.5 pp"), +] +data_sorted = sorted(data, key=lambda x: -x[1]) +best_f1 = max(x[1] for x in data_sorted) +push("| 排名 | Configuration | Macro F1 ↑ | Δ vs full |") +push("|---|---|---|---|") +for rank, (cfg, f1, d) in enumerate(data_sorted, 1): + push(f"| {rank} | {cfg} | {maybe_bold(f'{f1:.3f}', f1==best_f1)} | {d} |") +push() +push("**这张表说明:**") +push() +push("- Full = 0.535(排第 1)。三个新组件都正向贡献。") +push("- 最大贡献来自 **cross-modal temporal-shift attention**(去掉降 8.5 pp);其次 learnable late fusion(−5.3 pp);modality dropout 最弱(−3.1 pp)。") +push() +push("**对我们有利吗?🟢 有利。** 三个组件都正向贡献,且 cross-modal temporal-shift 与论文 case study(EMG 比 motion 早 ~20ms)逻辑闭环,可以作为方法 motivation 的有力证据。") +push() + +# --------------------------------------------------------------------------- +# A.5 Table tab:contact (T2) +# --------------------------------------------------------------------------- + +push("## A.5 抓取接触检测(T2)") +push() +push("### A.5.1 Grasp Contact Detection(`tab:contact`)") +push() +push("R-F1 / L-F1 = 右 / 左手 F1。") +push() +data = [ + ("CNN", "EMG", 0.646, 0.663, 0.628, "Ours"), + ("LSTM", "EMG", 0.669, 0.694, 0.645, "Ours"), + ("TCN", "MoCap", 0.667, 0.688, 0.647, "Ours"), + ("DeepConvLSTM", "EMG", 0.670, 0.696, 0.644, "Repro"), + ("InceptionTime", "EMG", 0.663, 0.690, 0.635, "Repro"), + ("UnderPressure", "EMG", 0.669, 0.703, 0.635, "Repro"), + ("ASFormer", "IMU", 0.673, 0.698, 0.648, "Repro"), +] +data_sorted = sorted(data, key=lambda x: -x[2]) +best = {i: max(d[i] for d in data) for i in (2,3,4)} +push("| 排名 | Model | Type | Input | Avg F1 ↑ | R-F1 ↑ | L-F1 ↑ |") +push("|---|---|---|---|---|---|---|") +for rank, (m, inp, avg, r, l, t) in enumerate(data_sorted, 1): + push(f"| {rank} | {m} | {t} | {inp} | " + f"{maybe_bold(f'{avg:.3f}', avg==best[2])} | " + f"{maybe_bold(f'{r:.3f}', r==best[3])} | " + f"{maybe_bold(f'{l:.3f}', l==best[4])} |") +push() +push("**这张表说明:**") +push() +push("- 所有方法 Avg F1 挤在 0.646–0.673,**没有任何方法显著领先**。") +push("- ASFormer(IMU)Avg F1 0.673 第 1,但与第 7 名(CNN+EMG 0.646)只差 2.7 pp。") +push("- EMG 是公认最好的输入(physiological proxy);加多模态没改进。") +push() +push("**对我们有利吗?🟡 中性。** 所有方法挤一团说明 \"benchmark 没有偏向某方法\"," + "可作为 dataset 公平性证据,但没有方法故事。") +push() + +# --------------------------------------------------------------------------- +# A.6 Table tab:missing-mod (T6) +# --------------------------------------------------------------------------- + +push("## A.6 缺失模态鲁棒性(T6)") +push() +push("### A.6.1 Missing-Modality Robustness(`tab:missing-mod`)") +push() +push("8-class scene recognition。两种训练模式对比:baseline(无 dropout,3 seeds)和" + "p=0.3 modality dropout 训练(5 seeds)。Test F1 ↑。") +push() +data = [ + ("Full", "MoCap+EMG+Eye+IMU", 0.661, 0.048, 0.672, 0.076, "Eval cfg"), + ("drop MoCap", "EMG+Eye+IMU", 0.307, 0.019, 0.492, 0.096, "Leave-one-out"), + ("drop EMG", "MoCap+Eye+IMU", 0.671, 0.051, 0.666, 0.040, "Leave-one-out"), + ("drop EyeTrack","MoCap+EMG+IMU", 0.667, 0.021, 0.630, 0.072, "Leave-one-out"), + ("drop IMU", "MoCap+EMG+Eye", 0.464, 0.017, 0.440, 0.049, "Leave-one-out"), + ("only MoCap", "MoCap", 0.403, 0.027, 0.356, 0.059, "Singleton"), + ("only EMG", "EMG", 0.082, 0.032, 0.218, 0.075, "Singleton"), + ("only IMU", "IMU", 0.309, 0.039, 0.442, 0.067, "Singleton"), +] +# sort by dropout F1 desc +data_sorted = sorted(data, key=lambda x: -x[4]) +best_b = max(x[2] for x in data) +best_d = max(x[4] for x in data) +push("| 排名 | Eval config | Active modalities | Baseline F1 ↑ (no drop, 3 seed) | Dropout F1 ↑ (p=0.3, 5 seed) | Δ |") +push("|---|---|---|---|---|---|") +for rank, (cfg, mods, b, sb, d, sd, group) in enumerate(data_sorted, 1): + push(f"| {rank} | {cfg} | {mods} | " + f"{maybe_bold(fmt_meanstd(b,sb), b==best_b)} | " + f"{maybe_bold(fmt_meanstd(d,sd), d==best_d)} | {d-b:+.3f} |") +push() +push("**这张表说明:**") +push() +push("- **Dropout 训练在 8 个测试配置中,有 5 个胜出**(剩下 3 个 leave-one-out 略输或持平)。") +push("- 最显著的 gain 在 **drop MoCap**(+18.5 pp),只剩 IMU 单模(+13.3 pp),只剩 EMG 单模(+13.6 pp)。") +push("- Full-modality 自身也涨 +1.1 pp(0.661 → 0.672),deployment 友好且不牺牲 clean-test 性能。") +push("- (说明:EyeTrack 设计上不作为单独模态使用,因此只出现在 leave-one-out 和 full 配置,Singleton 一组中省略。)") +push() +push("**对我们有利吗?🟢 强有利。** 这是 paper T6 的核心 finding,strictly dominate baseline,对 SyncFuse 故事有力支撑。") +push() + +# --------------------------------------------------------------------------- +# A.7 Tables T4 / T5 +# --------------------------------------------------------------------------- + +push("## A.7 抓取相关回归 / 预判(T4 / T5)") +push() +push("### A.7.1 T4 EMG → Hand Pose Regression(`tab:emg-pose`)") +push() +push("3D Euclidean error ↓(mm,越低越好);Pearson r ↑。") +push() +data = [ + ("LSTM", 0.146, 0.094, 44.6, 0.9, 90.6, 2.0), + ("Transformer", 0.197, 0.018, 43.3, 0.3, 88.2, 0.5), +] +data_sorted = sorted(data, key=lambda x: x[5]) # sort by 3D error asc (lower better) +best_r = max(x[1] for x in data) +best_mae = min(x[3] for x in data) +best_3d = min(x[5] for x in data) +push("| 排名 | Backbone | Pearson r ↑ | MAE ↓ (mm) | Avg 3D Eucl ↓ (mm) |") +push("|---|---|---|---|---|") +for rank, (b, r, sr, mae, smae, eu, seu) in enumerate(data_sorted, 1): + push(f"| {rank} | {b} | " + f"{maybe_bold(fmt_meanstd(r,sr), r==best_r)} | " + f"{maybe_bold(fmt_meanstd(mae,smae,1), mae==best_mae)} | " + f"{maybe_bold(fmt_meanstd(eu,seu,1), eu==best_3d)} |") +push() +push("**这张表说明:**") +push() +push("- Transformer 比 LSTM 略好(r 0.197 vs 0.146,3D error 88 vs 91 mm)。") +push("- r ≈ 0.2 在噪声上方,但 88 mm 在 100 mm 指尖到手腕的尺度下几乎没法用。") +push() +push("**对我们有利吗?🟡 弱正向。** r ≈ 0.2 高于噪声但绝对精度不够,作为 open challenge 比作为 \"我们解决了\" 合理。") +push() + +push("### A.7.2 T5 Grasp Onset Anticipation(`tab:anticipation`)") +push() +push("二分类:1s 窗口预测下一 500 ms 是否会发生 contact。AUC / AP 是不平衡时的稳健指标。") +push() +data = [ + ("EMG", 0.715, 0.020, 0.829, 0.010, 0.626, 0.041, 0.798, 0.029), + ("EMG+IMU", 0.704, 0.013, 0.826, 0.009, 0.492, 0.031, 0.713, 0.015), + ("MoCap+EMG+IMU+Eye", 0.687, 0.035, 0.810, 0.030, 0.532, 0.007, 0.731, 0.033), +] +data_sorted = sorted(data, key=lambda x: -x[5]) # sort by AUC desc +best_auc = max(x[5] for x in data) +best_ap = max(x[7] for x in data) +push("| 排名 | Modalities | Acc ↑ | F1 ↑ | AUC ↑ | AP ↑ |") +push("|---|---|---|---|---|---|") +for rank, (mods, acc, sacc, f1, sf1, auc, sauc, ap, sap) in enumerate(data_sorted, 1): + push(f"| {rank} | {mods} | {fmt_meanstd(acc,sacc)} | {fmt_meanstd(f1,sf1)} | " + f"{maybe_bold(fmt_meanstd(auc,sauc), auc==best_auc)} | " + f"{maybe_bold(fmt_meanstd(ap,sap), ap==best_ap)} |") +push() +push("**这张表说明:**") +push() +push("- **EMG 单模 AUC 0.626 / AP 0.798,排第 1**;加 IMU 反而降到 AUC 0.492。") +push("- 与 case study(EMG 比 motion 早 ~20ms 激活)逻辑闭环。") +push() +push("**对我们有利吗?🟢 有利。** \"EMG-only > 多模态\" 与论文 \"多模态融合不总有利\" 主线一致,且与 sub-frame timing 故事联动。") +push() + +# --------------------------------------------------------------------------- +# A.8 Table tab:retrieval (T3) +# --------------------------------------------------------------------------- + +push("## A.8 跨模态检索(T3)") +push() +push("### A.8.1 Sensor → Text Retrieval(`tab:retrieval`)") +push() +push("Pool size K=100,chance R@1/5/10 = 1%/5%/10%。Median rank ↓ 越低越好。") +push() +data = [ + ("MoCap", 0.035, 0.001, 0.142, 0.003, 0.245, 0.016, 26.3, 0.6), + ("EMG+IMU", 0.035, 0.004, 0.153, 0.018, 0.266, 0.012, 26.3, 2.3), + ("MoCap+EMG+Eye+IMU", 0.037, 0.003, 0.161, 0.017, 0.277, 0.021, 25.2, 0.7), +] +data_sorted = sorted(data, key=lambda x: -x[5]) # sort by R@10 desc +best_r1 = max(x[1] for x in data) +best_r5 = max(x[3] for x in data) +best_r10 = max(x[5] for x in data) +best_med = min(x[7] for x in data) +push("| 排名 | Modalities | R@1 ↑ | R@5 ↑ | R@10 ↑ | Median rank ↓ |") +push("|---|---|---|---|---|---|") +for rank, (mods, r1, sr1, r5, sr5, r10, sr10, med, smed) in enumerate(data_sorted, 1): + push(f"| {rank} | {mods} | " + f"{maybe_bold(fmt_meanstd(r1,sr1), r1==best_r1)} | " + f"{maybe_bold(fmt_meanstd(r5,sr5), r5==best_r5)} | " + f"{maybe_bold(fmt_meanstd(r10,sr10), r10==best_r10)} | " + f"{maybe_bold(fmt_meanstd(med,smed,1), med==best_med)} |") +push() +push("**这张表说明:**") +push() +push("- 4-mod 在 R@1 / R@5 / R@10 / median rank 全部排第 1。") +push("- 三组都达 chance 的 ~2.5–2.8×,但绝对 R@1 只有 3.7%(从零训中文文本 encoder)。") +push() +push("**对我们有利吗?🟡 中性。** 多模 > 单模的趋势对故事友好,但绝对值低,需要在文里说明这是首次的 retrieval baseline,后续工作可以用 pretrained Chinese LM。") +push() + +# --------------------------------------------------------------------------- +# A.9 Diagnostic tables +# --------------------------------------------------------------------------- + +push("## A.9 诊断表") +push() +push("### A.9.1 Zero-shot Scene Generalization(`tab:zeroshot`)") +push() +push("Leave-one-scene-out:从 7 个 scene 训,测留出的 1 个 scene。Dom.\\ frac.\\ = 留出样本被分到 dominant 邻居的比例。") +push() +data = [ + ("s1 office", "s4 cleaning", 0.67, 0.533, 3), + ("s2 package", "s5 table-set", 0.67, 0.538, 3), + ("s3 kitchen", "s2 package", 0.67, 0.576, 3), + ("s4 cleaning", "s1 office", 0.33, 0.623, 3), + ("s5 table-set", "s1 office", 0.33, 0.604, 3), + ("s6 luggage", "s5 table-set", 0.67, 0.671, 3), + ("s7 coffee", "s3 kitchen", 0.50, 0.524, 4), + ("s8 clothes", "s5 table-set", 1.00, 0.623, 3), +] +data_sorted = sorted(data, key=lambda x: -x[3]) # sort by Seen F1 +best_f1 = max(x[3] for x in data) +push("| 排名 | Held-out scene | Dominant neighbour | Dom. frac. | Seen F1(7 类)↑ | N test |") +push("|---|---|---|---|---|---|") +for rank, (held, neigh, dom, f1, n) in enumerate(data_sorted, 1): + push(f"| {rank} | {held} | {neigh} | {dom:.2f} | " + f"{maybe_bold(f'{f1:.3f}', f1==best_f1)} | {n} |") +push() +push("**这张表说明:**") +push() +push("- 每个 held-out scene 都被映射到一个**特定**邻居(office↔cleaning 互为映射,package→table-set,clothes→table-set 100%)。") +push("- 这些映射跟语义相似性吻合(都涉及 large-scale upper-body motion)。") +push() +push("**对我们有利吗?🟢 有利。** Zero-shot 是论文的副产品 finding,展示 dataset 的语义结构是可解释的,加分项。") +push() + +push("### A.9.2 Per-Subject Breakdown(`tab:per-subject`)") +push() +push("T6 dropout-trained 4-mod Transformer,5 seeds。") +push() +data = [ + ("v25", 8, 0.875, 0.112, 0.900, 0.094), + ("v26", 8, 0.396, 0.150, 0.525, 0.122), + ("v27", 8, 0.571, 0.119, 0.650, 0.122), + ("v3", 1, 0.600, 0.490, 0.600, 0.490), +] +data_sorted = sorted(data, key=lambda x: -x[2]) +best_f1 = max(x[2] for x in data) +best_acc = max(x[4] for x in data) +push("| 排名 | Volunteer | N records | F1 ↑ | Acc ↑ |") +push("|---|---|---|---|---|") +for rank, (v, n, f1, sf1, acc, sacc) in enumerate(data_sorted, 1): + push(f"| {rank} | {v} | {n} | " + f"{maybe_bold(fmt_meanstd(f1,sf1), f1==best_f1)} | " + f"{maybe_bold(fmt_meanstd(acc,sacc), acc==best_acc)} |") +push() +push("总体(25 records):F1 = 0.672 ± 0.076,Acc = 0.688 ± 0.069。") +push() +push("**这张表说明:**") +push() +push("- v25 和 v26 在同模型上 F1 相差 **0.479**(0.875 vs 0.396);v25 90% 准确,v26 只 50%。") +push("- 大部分 \"seed variance\" 实际是 \"across-subject variance\";单个离群被试可影响整体 ±8 pp。") +push() +push("**对我们有利吗?🟢 有利。** 这是给未来工作的 guideline(\"按 subject 分层报告\"),展示我们对评测协议的细致思考。") +push() +push("---") +push() + + +# =========================================================================== +# Part B:新跑 T10 五张表(从 eval_macrof1.json 自动汇总) +# =========================================================================== + +push("# Part B — 新跑 T10 Triplet Next-Action Prediction(5 张表)") +push() +push("**任务定义**:对每个标注 segment k,以 `start(k) − T_fut` 为锚点,取 `[anchor − 8s, anchor]` 这 8 秒(20 Hz)作输入," + "预测四元组 `(verb_fine, verb_composite, noun, hand)`(类数 17 / 6 / 34 / 3)。") +push() +push("**数据划分**:subject-independent test = 4 留出 vol(`v14, v30, v34, v38, v41`),共 773 个 (segment, recording)。" + "每行报 5 seed `{42, 123, 456, 789, 1024}` 的 mean ± std。") +push() +push("**指标**:") +push("- **Action Acc ↑** = top-1 accuracy on (verb_fine ∧ noun ∧ hand)。主指标。") +push("- **Verb_fine Macro F1 ↑** = 17 类细粒度动词 macro F1。") +push("- **Noun Macro F1 ↑** = 34 类名词 macro F1。") +push("- **Hand Acc ↑** = 3 类手分类 accuracy。") +push() + +# --------------------------------------------------------------------------- +# B.1 Table T10.1 主对比 +# --------------------------------------------------------------------------- + +MODEL_DISPLAY = { + "dailyactformer": "DailyActFormer (Ours)", + "deepconvlstm": "DeepConvLSTM", + "rulstm": "RU-LSTM", + "futr": "FUTR", + "afft": "AFFT", + "handformer": "HandFormer", + "actionllm": "ActionLLM (surrogate)", +} +OURS = {"dailyactformer"} + +push("## B.1 Table T10.1 — 主对比:Ours vs 7 个复现 baseline") +push() +push("所有方法 `T_fut = 2s`。每个 baseline 在它原始 paper 推荐的模态子集上训练;`DailyActFormer (Ours)` 在全 5 模态上训练。") +push() +table1_rows_def = [ + "row01_ours_dailyactformer_all5", + "row02_deepconvlstm_imu", + "row03_deepconvlstm_3mod", + "row04_rulstm_imu_mocap", + "row05_futr_3mod", + "row06_afft_4mod", + "row07_handformer_mocap", + "row08_actionllm_3mod", +] +t1_data = [] +for rn in table1_rows_def: + seeds = collect_row("table1_main_comparison", rn) + agg = aggregate_row(seeds) + if agg is None: + continue + t1_data.append({ + "name": MODEL_DISPLAY[agg["model"]], + "is_ours": agg["model"] in OURS, + "modalities": fmt_mods(agg["modalities"]), + "agg": agg, + "best": set(), + }) +for k in ["action_acc", "verb_fine_macro_f1", "noun_macro_f1", "hand_acc"]: + bold_best_t10(t1_data, k) +t1_data.sort(key=lambda r: r["agg"]["action_acc"]["mean"], reverse=True) + +push("| 排名 | Method | Type | Modalities | Action Acc ↑ | Verb_fine Macro F1 ↑ | Noun Macro F1 ↑ | Hand Acc ↑ | Params |") +push("|---|---|---|---|---|---|---|---|---|") +for rank, r in enumerate(t1_data, 1): + type_tag = "**Ours**" if r["is_ours"] else "Repro" + push(f"| {rank} | {r['name']} | {type_tag} | {r['modalities']} | " + f"{cell_t10(r,'action_acc')} | {cell_t10(r,'verb_fine_macro_f1')} | " + f"{cell_t10(r,'noun_macro_f1')} | {cell_t10(r,'hand_acc')} | " + f"{r['agg']['n_params']:,} |") +push() +ours_rank = next((i for i, r in enumerate(t1_data, 1) if r["is_ours"]), None) +push("**这张表说明:**") +push() +push(f"- DAF(Ours)在 8 个模型里 Action Acc 排名 **第 {ours_rank}**;排第 1 的是 `{t1_data[0]['name']}`。") +push("- 但分头看:DAF 在 **Noun Macro F1** 维度领先大多数 baseline(0.0691,仅次于 AFFT 的 0.0796)、" + "在 **Verb_fine Macro F1** 上 0.0496 也属第二梯队;**真正全面领先的是 AFFT(IMU+EMG+Eye+MoCap)**。") +push("- Hand Acc 全部聚集在 0.37–0.40 区间(3 类随机 = 0.333),所有模型都没在 hand 维度真正学到东西。") +push() +push("**对我们有利吗?🔴 不利**(以 Action Acc 为单一标准);🟡 半利半弊(同时报 Macro F1 时)。") +push() +push("- 不利点:headline Action Acc DAF 没赢,论文 \"我们大幅领先\" 的故事讲不出来。") +push("- 缓解点:同时报 Macro F1,DAF 在 Noun 上排第 2,Verb_fine 上排中段,可以改成 \"DAF 在长尾类上稳健\"。") +push("- 关键问题:**真正威胁 DAF 的是 AFFT,不是 DeepConvLSTM**。") +push() + +# --------------------------------------------------------------------------- +# B.2 Table T10.2 Horizon +# --------------------------------------------------------------------------- + +push("## B.2 Table T10.2 — Horizon 曲线(Ours,5 modalities)") +push() +push("`DailyActFormer` 全 5 模态,变化 `T_fut`。") +push() +t3_data = [] +for rn, tf in [("row01_ours_tfut1s", 1), ("row02_ours_tfut2s", 2), + ("row03_ours_tfut5s", 5), ("row04_ours_tfut10s", 10), + ("row05_ours_tfut15s", 15)]: + seeds = collect_row("table3_horizon_curve", rn) + agg = aggregate_row(seeds) + if agg is None: + continue + t3_data.append({"t_fut": tf, "agg": agg, "best": set()}) +for k in ["action_acc", "verb_fine_macro_f1", "noun_macro_f1", "hand_acc"]: + bold_best_t10(t3_data, k) +t3_data.sort(key=lambda r: r["agg"]["action_acc"]["mean"], reverse=True) + +push("| 排名 | T_fut (s) | Action Acc ↑ | Verb_fine Macro F1 ↑ | Noun Macro F1 ↑ | Hand Acc ↑ |") +push("|---|---|---|---|---|---|") +for rank, r in enumerate(t3_data, 1): + push(f"| {rank} | {r['t_fut']} | {cell_t10(r,'action_acc')} | " + f"{cell_t10(r,'verb_fine_macro_f1')} | {cell_t10(r,'noun_macro_f1')} | " + f"{cell_t10(r,'hand_acc')} |") +push() +push("**这张表说明:**") +push() +push("- 排序后正好对应 T_fut 自然顺序(1 → 2 → 5 → 10 → 15s),**单调下降**。") +push("- 1s 与 2s 几乎打平,5s 略降,10s 明显掉,15s 接近随机。") +push() +push("**对我们有利吗?🟢 有利。** 5 张新表里**唯一干净**的结果,可独立成图作为 \"DAF 在 1–5s 短期可用\" 的故事。") +push() + +# --------------------------------------------------------------------------- +# B.3 Table T10.3 Modality ablation +# --------------------------------------------------------------------------- + +push("## B.3 Table T10.3 — 模态消融(Ours,T_fut=2s)") +push() +push("`DailyActFormer` 在不同模态子集上训练,`T_fut = 2s`。") +push() +t4_data = [] +for rn, label in [("row01_full_5mod", "Full (5 mod)"), + ("row02_no_pressure", "− Pressure"), + ("row03_no_eyetrack", "− EyeTrack"), + ("row04_no_emg", "− EMG"), + ("row05_no_imu", "− IMU"), + ("row06_no_mocap", "− MoCap"), + ("row07_imu_emg_only", "IMU + EMG only"), + ("row08_mocap_only", "MoCap only")]: + seeds = collect_row("table4_modality_ablation", rn) + agg = aggregate_row(seeds) + if agg is None: + continue + t4_data.append({"label": label, "modalities": fmt_mods(agg["modalities"]), + "agg": agg, "best": set()}) +for k in ["action_acc", "verb_fine_macro_f1", "noun_macro_f1", "hand_acc"]: + bold_best_t10(t4_data, k) +t4_data.sort(key=lambda r: r["agg"]["action_acc"]["mean"], reverse=True) + +push("| 排名 | Configuration | Modalities | Action Acc ↑ | Verb_fine Macro F1 ↑ | Noun Macro F1 ↑ | Hand Acc ↑ |") +push("|---|---|---|---|---|---|---|") +for rank, r in enumerate(t4_data, 1): + push(f"| {rank} | {r['label']} | {r['modalities']} | " + f"{cell_t10(r,'action_acc')} | {cell_t10(r,'verb_fine_macro_f1')} | " + f"{cell_t10(r,'noun_macro_f1')} | {cell_t10(r,'hand_acc')} |") +push() +push("**这张表说明:**") +push() +push("- **去掉 Pressure 反而最高**(0.0318 排第 1,比 Full +22%),Pressure 是噪声而非信号。") +push("- **去掉 MoCap 大幅下降**(0.0153,−41%),MoCap 是最重要的模态。") +push("- IMU+EMG only 谷底(0.0136),MoCap only 中段(0.0228)。") +push() +push("**对我们有利吗?🟡 半利半弊。** MoCap 重要性是好故事;Pressure 反向需要在文里圆。") +push() + +# --------------------------------------------------------------------------- +# B.4 Table T10.4 Component ablation +# --------------------------------------------------------------------------- + +push("## B.4 Table T10.4 — 组件消融(Ours,5 modalities,T_fut=2s)") +push() +push("`DailyActFormer` 默认配置(`row01 full`)与逐项关掉一个设计组件后的对比。" + "⚠ row05 因 `run.sh` bug 实际跑出来与 row01 一致。") +push() +t5_data = [] +for rn, label, note in [("row01_full", "Full(默认)", ""), + ("row02_no_composite_head", "− Composite head", "λ_verb_composite=0"), + ("row03_equal_lambda", "Equal λ(全 1.0)", ""), + ("row04_no_class_weight", "− Class weight", ""), + ("row05_no_label_smoothing", "− Label smoothing", "**⚠ run.sh bug,实际 = row01**")]: + seeds = collect_row("table5_component_ablation", rn) + agg = aggregate_row(seeds) + if agg is None: + continue + t5_data.append({"label": label, "note": note, "agg": agg, "best": set()}) +for k in ["action_acc", "verb_fine_macro_f1", "noun_macro_f1", "hand_acc"]: + bold_best_t10(t5_data, k) +t5_data.sort(key=lambda r: r["agg"]["action_acc"]["mean"], reverse=True) + +push("| 排名 | Configuration | Action Acc ↑ | Verb_fine Macro F1 ↑ | Noun Macro F1 ↑ | Hand Acc ↑ | Notes |") +push("|---|---|---|---|---|---|---|") +for rank, r in enumerate(t5_data, 1): + push(f"| {rank} | {r['label']} | {cell_t10(r,'action_acc')} | " + f"{cell_t10(r,'verb_fine_macro_f1')} | {cell_t10(r,'noun_macro_f1')} | " + f"{cell_t10(r,'hand_acc')} | {r['note']} |") +push() +push("**这张表说明:**") +push() +push("- **关掉 class weight 反而排第 1**(0.0468,比 Full +79%);所有四指标全部最优。**默认 `--use_class_weights` 在伤模型**。") +push("- Equal λ 与 Full 几乎打平(0.0269 vs 0.0261)。") +push("- 关掉 composite head 略降(0.0223),这个组件在帮 DAF。") +push() +push("**对我们有利吗?🔴 不利(对默认配置)→ 🟢 救命行(给改进方向)。**") +push() +push("- 默认 class weight 反而是瓶颈,论文如果讲 \"用 class weight 处理长尾\" 就破了。") +push("- 但 0.0468 这个数字 **远超 Table T10.1 所有 baseline**(最高 DeepConvLSTM-3mod 才 0.0279);把 DAF 默认改为 \"no class weight\" 后 Table T10.1 完全可以翻盘。") +push() + +# --------------------------------------------------------------------------- +# B.5 Table T10.5 Modality dropout +# --------------------------------------------------------------------------- + +push("## B.5 Table T10.5 — 训练时模态 dropout(Ours,5 modalities,T_fut=2s)") +push() +push("每个 batch 里,每个 sample 的每个模态独立以 `p` 概率被整张零置(保证至少留 1 个)。") +push() +t7_data = [] +seeds_full = collect_row("table5_component_ablation", "row01_full") +agg_full = aggregate_row(seeds_full) +if agg_full: + t7_data.append({"label": "Default (p=0)", "agg": agg_full, "best": set()}) +seeds_drop = collect_row("table7_missing_modality", "row01_train_with_modality_dropout") +agg_drop = aggregate_row(seeds_drop) +if agg_drop: + t7_data.append({"label": "+ modality_dropout (p=0.3)", "agg": agg_drop, "best": set()}) +for k in ["action_acc", "verb_fine_macro_f1", "noun_macro_f1", "hand_acc"]: + bold_best_t10(t7_data, k) +t7_data.sort(key=lambda r: r["agg"]["action_acc"]["mean"], reverse=True) + +push("| 排名 | Setting | Action Acc ↑ | Verb_fine Macro F1 ↑ | Noun Macro F1 ↑ | Hand Acc ↑ |") +push("|---|---|---|---|---|---|") +for rank, r in enumerate(t7_data, 1): + push(f"| {rank} | {r['label']} | {cell_t10(r,'action_acc')} | " + f"{cell_t10(r,'verb_fine_macro_f1')} | {cell_t10(r,'noun_macro_f1')} | " + f"{cell_t10(r,'hand_acc')} |") +push() +push("**这张表说明:**") +push() +push("- 加 `p=0.3` modality dropout 后所有指标略降(Action Acc 0.0233 vs 0.0261,−10%),std 也变大。") +push() +push("**对我们有利吗?🔴 不利,且与论文 T6 叙事矛盾。**") +push() +push("- 论文 A.6.1(`tab:missing-mod`)中 modality dropout 在 T6 上 strictly dominate baseline,这里 T10 上反而伤性能。") +push("- 可能解释:T6 是 sequence-level scene(标签强),T10 是 segment-level next-action(标签细),dropout 在 T10 上去掉的有效信号过多。") +push() + +# --------------------------------------------------------------------------- +# 最终总结 +# --------------------------------------------------------------------------- + +push("---") +push() +push("# 全部表格综合速览") +push() +push("| 区块 | 表 | 主指标第 1 名 | 对我们 |") +push("|---|---|---|---|") +push("| Part A T1 单 vs 多 | A.1.1 | IME late + pretrained 0.696 F1 | 🟢 |") +push("| Part A T1 pretrain 消融 | A.1.2 | No augment + Pretrain 0.696 F1 | 🟡 |") +push("| Part A T1 vs 已发表 | A.1.3 | Transformer+Pretrain (Ours) 0.760 Acc | 🟢 强 |") +push("| Part A T1 扩展 + SyncFuse | A.1.4 | SyncFuse (Ours) 0.516 F1 | 🟢 强 |") +push("| Part A SyncFuse 消融 | A.2.1 | Full 0.535 F1 | 🟢 |") +push("| Part A T2 contact | A.5.1 | ASFormer 0.673 Avg F1 | 🟡 |") +push("| Part A T6 missing-mod | A.6.1 | drop+EMG 0.671 F1 | 🟢 强 |") +push("| Part A T4 EMG→pose | A.7.1 | Transformer r 0.197 | 🟡 |") +push("| Part A T5 anticipation | A.7.2 | EMG-only AUC 0.626 | 🟢 |") +push("| Part A T3 retrieval | A.8.1 | 4-mod R@10 0.277 | 🟡 |") +push("| Part A zero-shot | A.9.1 | s6 luggage F1 0.671 | 🟢 |") +push("| Part A per-subject | A.9.2 | v25 F1 0.875 | 🟢 |") +push("| Part B T10.1 主对比 | B.1 | DeepConvLSTM-3mod 0.0279 Action Acc | 🔴 |") +push("| Part B T10.2 horizon | B.2 | T_fut=1s 0.0262 Action Acc | 🟢 |") +push("| Part B T10.3 模态消融 | B.3 | −Pressure 0.0318 Action Acc | 🟡 |") +push("| Part B T10.4 组件消融 | B.4 | −Class weight **0.0468** Action Acc | 🔴 → 🟢 救命行 |") +push("| Part B T10.5 dropout | B.5 | Default 0.0261 Action Acc | 🔴 |") +push() +push("**总判断**:") +push() +push("- Part A(已写进 paper):**整体可投**,5 张强表 + 4 张中性 + 3 张需要话术圆,论文 narrative 已经准备好防御。") +push("- Part B(新跑 T10):**现稿不可投**;但 Table T10.4 row04 的 0.0468 是改进方向,先用 1 seed 验证 \"DAF + no_class_weight\",成了再 5 seed 全表重跑,T10.1 可以翻盘。") +push() +push("由 `scripts/build_paper_tables.py` 从 `paper/sections/*.tex` 手抄数据 + 135 个 `eval_macrof1.json` 自动汇总。") + +OUT.parent.mkdir(parents=True, exist_ok=True) +with open(OUT, "w") as f: + f.write("\n".join(lines) + "\n") +print(f"Wrote {OUT}") diff --git a/scripts/dispatch_eval.sh b/scripts/dispatch_eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..fc41303e543797041c12a848312b80de8f5764fe --- /dev/null +++ b/scripts/dispatch_eval.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Dispatch 16 eval jobs in parallel — one per (modalities_canonical, t_obs, t_fut) tuple. +set -euo pipefail + +PYTHON=python +EVAL=${PULSE_ROOT}/scripts/eval_subset.py +PARTITION=${PARTITION:-gpuA800} +GPU_GRES=${GPU_GRES:-gpu:1} +LOG_DIR=${PULSE_ROOT}/results/eval_logs +mkdir -p "$LOG_DIR" + +# 16 distinct subsets enumerated by inspecting all results.json files. +# Each line: || +SUBSETS=( + "emg,eyetrack,imu|8.0|2.0" + "emg,eyetrack,imu,mocap|8.0|2.0" + "emg,eyetrack,imu,mocap,pressure|8.0|1.0" + "emg,eyetrack,imu,mocap,pressure|8.0|2.0" + "emg,eyetrack,imu,mocap,pressure|8.0|5.0" + "emg,eyetrack,imu,mocap,pressure|8.0|10.0" + "emg,eyetrack,imu,mocap,pressure|8.0|15.0" + "emg,eyetrack,imu,pressure|8.0|2.0" + "emg,eyetrack,mocap,pressure|8.0|2.0" + "emg,imu|8.0|2.0" + "emg,imu,mocap|8.0|2.0" + "emg,imu,mocap,pressure|8.0|2.0" + "eyetrack,imu,mocap,pressure|8.0|2.0" + "imu|8.0|2.0" + "imu,mocap|8.0|2.0" + "mocap|8.0|2.0" +) + +idx=0 +for entry in "${SUBSETS[@]}"; do + IFS='|' read -r mods t_obs t_fut <<< "$entry" + idx=$((idx+1)) + tag=$(echo "${mods}_o${t_obs}_f${t_fut}" | tr ',.' '_') + job_name="evalT10_${idx}_${tag}" + job_name=$(echo "$job_name" | cut -c1-60) # SLURM job names cap at ~60 chars + out="${LOG_DIR}/${tag}.out" + err="${LOG_DIR}/${tag}.err" + cmd="export PYTHONUNBUFFERED=1; ${PYTHON} ${EVAL} --modalities ${mods} --t_obs ${t_obs} --t_fut ${t_fut}" + sbatch -J "${job_name}" -p "${PARTITION}" --gres="${GPU_GRES}" \ + -N 1 -n 1 --cpus-per-task=4 --mem=32G \ + -t 0:20:00 -o "${out}" -e "${err}" \ + --export=ALL --wrap="${cmd}" + echo "submitted ${job_name}" +done + +echo "" +echo "All 16 dispatched. Logs: ${LOG_DIR}/" diff --git a/scripts/eval_macrof1.py b/scripts/eval_macrof1.py new file mode 100644 index 0000000000000000000000000000000000000000..f19ededa52d000a5d82c364c425050a90c005e62 --- /dev/null +++ b/scripts/eval_macrof1.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +"""Re-evaluate all 135 trained seeds with paper-style metrics. + +For each /seeds/seed*/model_best.pt: +- Reload the model with the right modalities +- Build the test loader for that modality subset +- Run inference, collect predictions +- Compute Acc, Macro-F1, Weighted-F1 per head (verb_fine, verb_composite, + noun, hand) and for the joint "action" (= verb_fine ∧ noun ∧ hand) +- Write /eval_macrof1.json + +Cache the test_ds per modality subset so we don't rebuild it 135 times. +""" + +from __future__ import annotations + +import json +import os +import sys +import time +from pathlib import Path + +import pandas as pd # noqa: F401 (dataset_seqpred imports pandas first) +import numpy as np +import torch +from sklearn.metrics import f1_score, accuracy_score +from torch.utils.data import DataLoader + +REPO = Path("${PULSE_ROOT}") +sys.path.insert(0, str(REPO / "experiments")) + +from dataset_seqpred import ( # noqa: E402 + TripletSeqPredDataset, build_train_test, collate_triplet, + TRAIN_VOLS_V3, TEST_VOLS_V3, +) +from models_seqpred import build_model # noqa: E402 + + +def find_seed_dirs(): + out = [] + for table_name in [ + "table1_main_comparison", + "table3_horizon_curve", + "table4_modality_ablation", + "table5_component_ablation", + "table7_missing_modality", + ]: + td = REPO / table_name + for row_dir in sorted(td.glob("row*")): + for sd in sorted((row_dir / "seeds").glob("seed*")): + if (sd / "model_best.pt").exists() and (sd / "results.json").exists(): + out.append(sd) + return out + + +_test_cache = {} # (modalities_tuple, t_obs, t_fut) -> (test_loader, modality_dims) + + +def get_test_loader(modalities, t_obs, t_fut, downsample, num_workers=0): + key = (tuple(modalities), float(t_obs), float(t_fut), int(downsample)) + if key in _test_cache: + return _test_cache[key] + print(f" [build test loader] modalities={modalities} t_obs={t_obs} t_fut={t_fut}", + flush=True) + train_ds, test_ds = build_train_test( + modalities=list(modalities), + t_obs_sec=t_obs, t_fut_sec=t_fut, downsample=downsample, + ) + test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, + collate_fn=collate_triplet, num_workers=num_workers) + md = test_ds.modality_dims + _test_cache[key] = (test_loader, md) + return test_loader, md + + +def eval_one(seed_dir: Path, device: torch.device): + res_p = seed_dir / "results.json" + with open(res_p) as f: + results = json.load(f) + args = results["args"] + model_name = args["model"] + modalities = args["modalities"].split(",") + t_obs = args["t_obs"] + t_fut = args["t_fut"] + downsample = args.get("downsample", 5) + + test_loader, modality_dims = get_test_loader(modalities, t_obs, t_fut, downsample) + + model = build_model(model_name, modality_dims).to(device) + state = torch.load(seed_dir / "model_best.pt", map_location=device, + weights_only=False) + model.load_state_dict(state["state_dict"]) + model.eval() + + all_logits = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")} + all_y = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")} + + with torch.no_grad(): + for x, mask, lens, y, meta in test_loader: + x = {m: t.to(device) for m, t in x.items()} + mask = mask.to(device) + logits = model(x, mask) + for k in all_logits: + all_logits[k].append(logits[k].cpu()) + all_y[k].append(y[k]) + + logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()} + y_cat = {k: torch.cat(v, dim=0).numpy() for k, v in all_y.items()} + pred_cat = {k: logits_cat[k].argmax(dim=1).numpy() for k in logits_cat} + + out = {} + for k in ("verb_fine", "verb_composite", "noun", "hand"): + out[f"{k}_acc"] = float(accuracy_score(y_cat[k], pred_cat[k])) + out[f"{k}_macro_f1"] = float(f1_score(y_cat[k], pred_cat[k], + average="macro", zero_division=0)) + out[f"{k}_weighted_f1"] = float(f1_score(y_cat[k], pred_cat[k], + average="weighted", zero_division=0)) + + # Joint action = verb_fine AND noun AND hand correct + correct = ((pred_cat["verb_fine"] == y_cat["verb_fine"]) & + (pred_cat["noun"] == y_cat["noun"]) & + (pred_cat["hand"] == y_cat["hand"])) + out["action_acc"] = float(correct.mean()) + + # n_params (cheap) + out["n_params"] = sum(p.numel() for p in model.parameters()) + + out_p = seed_dir / "eval_macrof1.json" + with open(out_p, "w") as f: + json.dump(out, f, indent=2) + return out + + +def main(): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"device={device}", flush=True) + seed_dirs = find_seed_dirs() + print(f"Found {len(seed_dirs)} seed dirs", flush=True) + t0 = time.time() + n_ok = 0 + n_fail = 0 + for i, sd in enumerate(seed_dirs, 1): + try: + res = eval_one(sd, device) + n_ok += 1 + if i % 10 == 0 or i <= 3: + rel = sd.relative_to(REPO) + print(f" [{i:>3}/{len(seed_dirs)}] {rel} " + f"action_acc={res['action_acc']:.4f} " + f"verb_fine_macroF1={res['verb_fine_macro_f1']:.4f} " + f"noun_macroF1={res['noun_macro_f1']:.4f}", + flush=True) + except Exception as e: + n_fail += 1 + print(f" [{i:>3}/{len(seed_dirs)}] FAIL {sd.relative_to(REPO)}: {e}", + flush=True) + dur = time.time() - t0 + print(f"Done. ok={n_ok} fail={n_fail} elapsed={dur:.1f}s", flush=True) + + +if __name__ == "__main__": + main() diff --git a/scripts/eval_subset.py b/scripts/eval_subset.py new file mode 100644 index 0000000000000000000000000000000000000000..9e6e80e5702caf0a69169aa9eccd98a782a717d7 --- /dev/null +++ b/scripts/eval_subset.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +"""Per-subset evaluator. + +Given a (modalities, t_obs, t_fut) triple, evaluate ALL trained seed dirs +across all 27 rows whose results.json matches that triple. Builds the test +dataset exactly once for the given triple, then iterates over matching +seeds, loads each model_best.pt, runs inference, and writes +/eval_macrof1.json. + +Used by dispatch_eval.sh to run 16 of these in parallel on the cluster. +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +from pathlib import Path + +import pandas as pd # noqa: F401 (must come before torch on this cluster) +import numpy as np +import torch +from sklearn.metrics import f1_score, accuracy_score +from torch.utils.data import DataLoader + +REPO = Path("${PULSE_ROOT}") +sys.path.insert(0, str(REPO / "experiments")) + +from dataset_seqpred import ( # noqa: E402 + build_train_test, collate_triplet, +) +from models_seqpred import build_model # noqa: E402 + + +def find_matching_seeds(mods_canon: str, t_obs: float, t_fut: float): + out = [] + for tt in [ + "table1_main_comparison", + "table3_horizon_curve", + "table4_modality_ablation", + "table5_component_ablation", + "table7_missing_modality", + ]: + td = REPO / tt + for row_dir in sorted(td.glob("row*")): + seed42 = row_dir / "seeds" / "seed42" / "results.json" + if not seed42.exists(): + continue + with open(seed42) as f: + d = json.load(f) + a = d["args"] + row_mods_canon = ",".join(sorted(a["modalities"].split(","))) + if (row_mods_canon == mods_canon + and abs(float(a["t_obs"]) - t_obs) < 1e-6 + and abs(float(a["t_fut"]) - t_fut) < 1e-6): + for sd in sorted((row_dir / "seeds").glob("seed*")): + if (sd / "model_best.pt").exists() and (sd / "results.json").exists(): + out.append(sd) + return out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--modalities", required=True, + help="Sorted comma-separated list, e.g. 'emg,eyetrack,imu,mocap,pressure'") + ap.add_argument("--t_obs", type=float, required=True) + ap.add_argument("--t_fut", type=float, required=True) + args = ap.parse_args() + + seed_dirs = find_matching_seeds(args.modalities, args.t_obs, args.t_fut) + print(f"Subset key=({args.modalities!r}, t_obs={args.t_obs}, t_fut={args.t_fut})", flush=True) + print(f"Matched {len(seed_dirs)} seed dirs", flush=True) + for sd in seed_dirs: + print(f" {sd.relative_to(REPO)}", flush=True) + if not seed_dirs: + return + + # Each seed dir's args.modalities preserves the original (possibly unsorted) + # order, which determines the model's branch ordering. We use the first + # matching seed's order to build the test loader, then for any seed dir + # whose original order differs we rebuild — but in practice all seeds in + # a row share the same order, and rows with same canonical-set but different + # original order appear together in the dispatcher's same job (since the + # canonical key matches), so we have to handle order divergence. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"device={device}", flush=True) + + # Group seed_dirs by the original (un-sorted) modality list each used, + # because different orders → different branch indices in the model. + orders = {} + for sd in seed_dirs: + with open(sd / "results.json") as f: + d = json.load(f) + orig_mods = d["args"]["modalities"] # original order + orders.setdefault(orig_mods, []).append((sd, d)) + print(f"Distinct original modality orderings under this canonical key: {len(orders)}", + flush=True) + + n_ok, n_fail = 0, 0 + t0 = time.time() + for orig_mods, group in orders.items(): + mods_list = orig_mods.split(",") + print(f"\n=== Building test loader for original order: {mods_list} ===", + flush=True) + tb0 = time.time() + train_ds, test_ds = build_train_test( + modalities=mods_list, + t_obs_sec=args.t_obs, t_fut_sec=args.t_fut, + ) + del train_ds # only need test stats which test_ds carries + test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, + collate_fn=collate_triplet, num_workers=0) + modality_dims = test_ds.modality_dims + print(f" build took {time.time()-tb0:.1f}s; test n={len(test_ds)}", + flush=True) + + for sd, results in group: + args_d = results["args"] + try: + model = build_model(args_d["model"], modality_dims).to(device) + state = torch.load(sd / "model_best.pt", map_location=device, + weights_only=False) + model.load_state_dict(state["state_dict"]) + model.eval() + + all_logits = {k: [] for k in + ("verb_fine", "verb_composite", "noun", "hand")} + all_y = {k: [] for k in + ("verb_fine", "verb_composite", "noun", "hand")} + with torch.no_grad(): + for x, mask, lens, y, meta in test_loader: + x = {m: t.to(device) for m, t in x.items()} + mask = mask.to(device) + logits = model(x, mask) + for k in all_logits: + all_logits[k].append(logits[k].cpu()) + all_y[k].append(y[k]) + + logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()} + y_cat = {k: torch.cat(v, dim=0).numpy() for k, v in all_y.items()} + pred_cat = {k: logits_cat[k].argmax(dim=1).numpy() for k in logits_cat} + + out = {} + for k in ("verb_fine", "verb_composite", "noun", "hand"): + out[f"{k}_acc"] = float(accuracy_score(y_cat[k], pred_cat[k])) + out[f"{k}_macro_f1"] = float(f1_score(y_cat[k], pred_cat[k], + average="macro", zero_division=0)) + out[f"{k}_weighted_f1"] = float(f1_score(y_cat[k], pred_cat[k], + average="weighted", zero_division=0)) + correct = ((pred_cat["verb_fine"] == y_cat["verb_fine"]) & + (pred_cat["noun"] == y_cat["noun"]) & + (pred_cat["hand"] == y_cat["hand"])) + out["action_acc"] = float(correct.mean()) + out["n_params"] = sum(p.numel() for p in model.parameters()) + + with open(sd / "eval_macrof1.json", "w") as f: + json.dump(out, f, indent=2) + print(f" OK {sd.relative_to(REPO)} action_acc={out['action_acc']:.4f}", + flush=True) + n_ok += 1 + # free model + del model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + except Exception as e: + print(f" FAIL {sd.relative_to(REPO)}: {e}", flush=True) + n_fail += 1 + + print(f"\nSubset done. ok={n_ok} fail={n_fail} elapsed={time.time()-t0:.1f}s", + flush=True) + + +if __name__ == "__main__": + main() diff --git a/scripts/eval_topk_v3.py b/scripts/eval_topk_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..1fc2b9c3c3a66272bd073cbe05d35a4ab040fc53 --- /dev/null +++ b/scripts/eval_topk_v3.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +"""Re-evaluate v3 saved models to compute action_vn@3 and action_vn@5. + +Loads model_best.pt from each seed dir, runs test set, computes: + - action_vn_top1 / top3 / top5 (verb_fine top-K AND noun top-K) + - verb_fine_top1 / top3 / top5 + - noun_top1 / top3 / top5 + +Writes results into /eval_topk.json so the aggregator can pick them up. +""" + +from __future__ import annotations +import json, sys, time +from pathlib import Path + +import pandas as pd # noqa +import torch +from torch.utils.data import DataLoader + +REPO = Path("${PULSE_ROOT}") +sys.path.insert(0, str(REPO / "experiments")) + +from dataset_seqpred import build_train_test, collate_triplet # noqa +from models_seqpred import build_model # noqa + + +def topk_correct(logits, y, k): + if k > logits.shape[1]: + k = logits.shape[1] + _, topk = logits.topk(k, dim=1) + return (topk == y.unsqueeze(1)).any(dim=1) + + +def find_v3_seed_dirs(): + """Walk table1_main_comparison/row*/seeds_v3{,_bidir,_sf}/seed*/model_best.pt""" + out = [] + base = REPO / "table1_main_comparison" + for row_dir in sorted(base.glob("row*")): + for sub in ("seeds_v3", "seeds_v3_bidir", "seeds_v3_sf"): + for sd in sorted((row_dir / sub).glob("seed*")): + if (sd / "model_best.pt").exists() and (sd / "results.json").exists(): + out.append(sd) + return out + + +_loader_cache = {} + + +def main(): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"device={device}", flush=True) + seed_dirs = find_v3_seed_dirs() + print(f"Found {len(seed_dirs)} v3 seed dirs", flush=True) + + t0 = time.time() + n_ok, n_fail = 0, 0 + for i, sd in enumerate(seed_dirs, 1): + try: + with open(sd / "results.json") as f: + results = json.load(f) + args = results["args"] + mods_list = args["modalities"].split(",") + mods_key = tuple(mods_list) + mode = args.get("mode", "anticipation") + + if (mods_key, mode) not in _loader_cache: + print(f" [build loader] mode={mode} modalities={mods_list}", flush=True) + train_ds, test_ds = build_train_test(modalities=mods_list, mode=mode) + del train_ds + test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, + collate_fn=collate_triplet, num_workers=0) + _loader_cache[(mods_key, mode)] = (test_loader, test_ds.modality_dims) + test_loader, modality_dims = _loader_cache[(mods_key, mode)] + + extra = {} + if args["model"] in ("dailyactformer", "ours", "daf"): + extra["causal"] = (mode == "anticipation") + model = build_model(args["model"], modality_dims, **extra).to(device) + state = torch.load(sd / "model_best.pt", map_location=device, weights_only=False) + model.load_state_dict(state["state_dict"]) + model.eval() + + all_logits = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")} + all_y = {k: [] for k in ("verb_fine", "verb_composite", "noun", "hand")} + with torch.no_grad(): + for x, mask, lens, y, meta in test_loader: + x = {m: t.to(device) for m, t in x.items()} + mask = mask.to(device) + logits = model(x, mask) + for k in all_logits: + all_logits[k].append(logits[k].cpu()) + all_y[k].append(y[k]) + + logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()} + y_cat = {k: torch.cat(v, dim=0) for k, v in all_y.items()} + + out = {} + for k in ("verb_fine", "verb_composite", "noun", "hand"): + preds_top1 = logits_cat[k].argmax(dim=1) + out[f"{k}_top1"] = float((preds_top1 == y_cat[k]).float().mean()) + out[f"{k}_top3"] = float(topk_correct(logits_cat[k], y_cat[k], 3).float().mean()) + out[f"{k}_top5"] = float(topk_correct(logits_cat[k], y_cat[k], 5).float().mean()) + + # Joint action_vn (verb_fine ∧ noun) at top-1, top-3, top-5 + for K, lbl in [(1, "top1"), (3, "top3"), (5, "top5")]: + vf_ok = topk_correct(logits_cat["verb_fine"], y_cat["verb_fine"], K) + n_ok2 = topk_correct(logits_cat["noun"], y_cat["noun"], K) + out[f"action_vn_{lbl}"] = float((vf_ok & n_ok2).float().mean()) + + with open(sd / "eval_topk.json", "w") as f: + json.dump(out, f, indent=2) + n_ok += 1 + if i % 5 == 0 or i <= 3: + rel = sd.relative_to(REPO) + print(f" [{i:>3}/{len(seed_dirs)}] {rel} vn@1={out['action_vn_top1']:.4f} vn@3={out['action_vn_top3']:.4f} vn@5={out['action_vn_top5']:.4f}", flush=True) + del model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + except Exception as e: + n_fail += 1 + print(f" [{i:>3}/{len(seed_dirs)}] FAIL {sd.relative_to(REPO)}: {e}", flush=True) + + print(f"Done. ok={n_ok} fail={n_fail} elapsed={time.time()-t0:.1f}s", flush=True) + + +if __name__ == "__main__": + main() diff --git a/scripts/summarize_135.sh b/scripts/summarize_135.sh new file mode 100644 index 0000000000000000000000000000000000000000..052558c14d5dc4129fe87d0968716c56061e7580 --- /dev/null +++ b/scripts/summarize_135.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Aggregate 135 SLURM job results (265051-265185). +# Writes a markdown summary to neurips26/results/run__summary.md +set -uo pipefail + +ROOT=${PULSE_ROOT} +JID_LO=265051 +JID_HI=265185 +TS=$(date -u +%Y%m%d_%H%M) +OUT="${ROOT}/results/run_${TS}_summary.md" +mkdir -p "${ROOT}/results" + +# tmp scratch +TMP=$(mktemp -d) +trap 'rm -rf "$TMP"' EXIT + +# 1. Walk all seed dirs in submission order; classify each. +# For each seed dir, pick the slurm_.out matching one of our jids. +# Status is OK if "[done] best" present, FAIL if traceback/error, TIMEOUT +# if SLURM cancelled it for time, RUNNING if no exit yet, MISSING if no log. +ORDER_FILE="$TMP/order.tsv" # tabletag\trow\tseed\tjid\tstatus\tacc\tepochs\tepoch_best +: > "$ORDER_FILE" + +for tt in table1_main_comparison table3_horizon_curve table4_modality_ablation table5_component_ablation table7_missing_modality; do + for row_dir in "${ROOT}/${tt}"/row*; do + [ -d "$row_dir" ] || continue + row=$(basename "$row_dir") + for seed in 42 123 456 789 1024; do + sd="${row_dir}/seeds/seed${seed}" + [ -d "$sd" ] || { printf "%s\t%s\t%d\t-\tMISSING_DIR\t-\t-\t-\n" "$tt" "$row" "$seed" >> "$ORDER_FILE"; continue; } + log=$(ls "${sd}"/slurm_*.out 2>/dev/null | head -1) + if [ -z "$log" ]; then + printf "%s\t%s\t%d\t-\tNO_LOG\t-\t-\t-\n" "$tt" "$row" "$seed" >> "$ORDER_FILE" + continue + fi + jid=$(basename "$log" | sed 's/^slurm_//; s/\.out$//') + # Determine status + if grep -q "^\[done\] best" "$log"; then + status=OK + line=$(grep "^\[done\] best" "$log" | head -1) + acc=$(echo "$line" | grep -oE "action@1 = [0-9.]+" | awk '{print $3}') + epoch_best=$(echo "$line" | grep -oE "epoch [0-9]+" | head -1 | awk '{print $2}') + # last reported epoch number + last_e=$(grep -E "^ E +[0-9]+" "$log" | tail -1 | awk '{print $2}') + printf "%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n" "$tt" "$row" "$seed" "$jid" "OK" "${acc}" "${last_e:-?}" "${epoch_best:-?}" >> "$ORDER_FILE" + elif grep -qE "DUE TO TIME LIMIT|CANCELLED.*TIME" "$log"; then + printf "%s\t%s\t%d\t%s\tTIMEOUT\t-\t-\t-\n" "$tt" "$row" "$seed" "$jid" >> "$ORDER_FILE" + elif grep -qE "Traceback|RuntimeError|invalid choice|CUDA error" "$log"; then + err=$(grep -E "Traceback|RuntimeError|invalid choice|CUDA error" "$log" | tail -1 | head -c 120) + printf "%s\t%s\t%d\t%s\tFAIL\t-\t-\t-\t%s\n" "$tt" "$row" "$seed" "$jid" "$err" >> "$ORDER_FILE" + elif squeue -j "$jid" -h 2>/dev/null | grep -q .; then + printf "%s\t%s\t%d\t%s\tRUNNING\t-\t-\t-\n" "$tt" "$row" "$seed" "$jid" >> "$ORDER_FILE" + else + # fell off queue without [done] and without typical error markers + printf "%s\t%s\t%d\t%s\tEXITED_NO_DONE\t-\t-\t-\n" "$tt" "$row" "$seed" "$jid" >> "$ORDER_FILE" + fi + done + done +done + +# 2. Build markdown +{ + echo "# Run summary — $(date '+%Y-%m-%d %H:%M %Z')" + echo + echo "Job range: \`${JID_LO}-${JID_HI}\` (135 expected)" + echo + echo "## Overall status" + echo + echo "| status | count |" + echo "|---|---|" + awk -F'\t' '{print $5}' "$ORDER_FILE" | sort | uniq -c | awk '{printf "| %s | %d |\n", $2, $1}' + echo + echo "## Per-row mean ± std (action@1)" + echo + echo "| table | row | n_ok | n_fail | mean | std | best_seed | best_acc | epochs (median) | best_epoch (median) |" + echo "|---|---|---:|---:|---:|---:|---|---:|---:|---:|" + awk -F'\t' '{key=$1"\t"$2; if($5=="OK"){n[key]++; sum[key]+=$6; ss[key]+=($6*$6); if($6>maxa[key]){maxa[key]=$6; bestseed[key]=$3} le[key]=le[key]" "$7; be[key]=be[key]" "$8} else if($5!="OK"){fail[key]++}} + END{for(k in n){tt=k; sub(/\t.*/,"",tt); rr=k; sub(/.*\t/,"",rr); + m=sum[k]/n[k]; v=ss[k]/n[k] - m*m; if(v<0)v=0; sd=sqrt(v); + # median of last_epoch list + split(le[k], A, " "); cnt=0; for(i in A){if(A[i]!=""){cnt++; B[cnt]=A[i]+0}} + asort(B); med_le=cnt? B[int((cnt+1)/2)] : "-"; delete B; + split(be[k], A, " "); cnt=0; for(i in A){if(A[i]!=""){cnt++; B[cnt]=A[i]+0}} + asort(B); med_be=cnt? B[int((cnt+1)/2)] : "-"; + fk=fail[k]+0; + printf "| %s | %s | %d | %d | %.4f | %.4f | seed%s | %.4f | %s | %s |\n", tt, rr, n[k], fk, m, sd, bestseed[k], maxa[k], med_le, med_be + }}' "$ORDER_FILE" | sort + echo + echo "## Failed / non-OK jobs" + echo + awk -F'\t' '$5!="OK" {printf "- **%s/%s seed%s** jid=%s status=%s %s\n", $1,$2,$3,$4,$5,$9}' "$ORDER_FILE" || true + if ! awk -F'\t' '$5!="OK"' "$ORDER_FILE" | grep -q .; then + echo "_None._" + fi + echo + echo "## Notes / known operational concerns" + echo + echo "- These are operational results only. Most jobs trigger early-stop (patience=12) at epoch 1–18 instead of running the full 40 epochs, because validation metric saturates very early." + echo "- \`best action@1\` observed in spot-check ranged 0.6%–3.4% (17 verb × 34 noun = 578 action classes; random ≈ 0.17%). This is a model/hyperparameter issue, not an infra issue." + echo "- If you want to revisit hparams: try larger patience, lower lr, or warmup. The data loader and GPU stack are confirmed working (cu121 / A800)." + echo + echo "## Per-table seed-level details" + echo + for tt in table1_main_comparison table3_horizon_curve table4_modality_ablation table5_component_ablation table7_missing_modality; do + echo "### ${tt}" + echo + echo "| row | seed42 | seed123 | seed456 | seed789 | seed1024 |" + echo "|---|---|---|---|---|---|" + awk -F'\t' -v tt="$tt" '$1==tt {key=$2; cell=($5=="OK"? sprintf("%.4f",$6) : "·"$5); arr[key,$3]=cell; rows[key]=1} + END{for(r in rows){printf "| %s | %s | %s | %s | %s | %s |\n", r, (arr[r,42]!=""?arr[r,42]:"-"), (arr[r,123]!=""?arr[r,123]:"-"), (arr[r,456]!=""?arr[r,456]:"-"), (arr[r,789]!=""?arr[r,789]:"-"), (arr[r,1024]!=""?arr[r,1024]:"-")}}' "$ORDER_FILE" | sort + echo + done +} > "$OUT" + +echo "Wrote $OUT" +ls -la "$OUT"