velvet-pine-22 committed 17 days ago

Commit

b4b2877

verified ·

1 Parent(s): 6f63aa1

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

LICENSE +21 -0
README.md +152 -0
experiments/__init__.py +0 -0
experiments/analysis/__init__.py +0 -0
experiments/analysis/aggregate_new_exps.py +166 -0
experiments/analysis/aggregate_t1_extended.py +60 -0
experiments/analysis/analysis_figures.py +444 -0
experiments/analysis/build_taxonomy.py +136 -0
experiments/analysis/check_seg_lengths.py +229 -0
experiments/analysis/data_statistics_figure.py +126 -0
experiments/analysis/exp_per_subject.py +150 -0
experiments/analysis/extract_video_features.py +208 -0
experiments/analysis/extract_videomae_features.py +276 -0
experiments/analysis/gen_val_comparison.py +74 -0
experiments/analysis/generate_action_labels.py +133 -0
experiments/analysis/generate_coarse_annotations.py +296 -0
experiments/analysis/grasp_phase_analysis.py +442 -0
experiments/analysis/modality_viz.py +145 -0
experiments/analysis/reannotate_actions.py +363 -0
experiments/data/__init__.py +0 -0
experiments/data/__pycache__/dataset.cpython-312.pyc +0 -0
experiments/data/dataset.py +332 -0
experiments/data/dataset_forecast.py +319 -0
experiments/data/dataset_grasp_state.py +571 -0
experiments/data/dataset_seqpred.py +533 -0
experiments/data/dataset_signal_forecast.py +391 -0
experiments/nets/__init__.py +0 -0
experiments/nets/__pycache__/models_seqpred.cpython-312.pyc +0 -0
experiments/nets/baselines_published/__init__.py +0 -0
experiments/nets/baselines_published/baselines.py +488 -0
experiments/nets/baselines_published/syncfuse.py +270 -0
experiments/nets/models.py +648 -0
experiments/nets/models_forecast.py +269 -0
experiments/nets/models_forecast_priv.py +76 -0
experiments/nets/models_seqpred.py +806 -0
experiments/nets/published_models.py +699 -0
experiments/s9_primitives.json +76 -0
experiments/slurm/freeze_all_rows.sh +179 -0
experiments/slurm/run_ablation_fix.sh +33 -0
experiments/slurm/run_ablation_fusion.sh +174 -0
experiments/slurm/run_asformer_exp3.sh +44 -0
experiments/slurm/run_exp1.sh +40 -0
experiments/slurm/run_exp1_fusion.sh +36 -0
experiments/slurm/run_exp1_parallel.sh +67 -0
experiments/slurm/run_exp1_small.sh +84 -0
experiments/slurm/run_exp1_small2.sh +85 -0
experiments/slurm/run_exp1_small3.sh +137 -0
experiments/slurm/run_exp1_v3.sh +68 -0
experiments/slurm/run_exp1_v4.sh +69 -0
experiments/slurm/run_exp1_v5.sh +62 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2026 Anonymous Authors (under double-blind review for NeurIPS 2026)
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,152 @@

+---
+license: mit
+language:
+  - en
+library_name: pytorch
+tags:
+  - multi-modal
+  - daily-activity
+  - wearable-sensors
+  - benchmark
+---
+# PULSE — Code Repository
+Reference implementation, training scripts, and benchmark baselines for the
+**PULSE** dataset paper (under double-blind review at NeurIPS 2026 Evaluations &
+Datasets Track).
+> **Dataset:** [`velvet-pine-22/PULSE`](https://huggingface.co/datasets/velvet-pine-22/PULSE)
+> · **Sample subset (≈285 MB):** [`velvet-pine-22/PULSE-sample`](https://huggingface.co/datasets/velvet-pine-22/PULSE-sample)
+## Repository layout
+```
+PULSE-code/
+├── experiments/
+│   ├── data/                     # PyTorch Dataset wrappers
+│   │   ├── dataset.py                  # core multi-modal dataset (T1, T2)
+│   │   ├── dataset_seqpred.py          # T2 fine-grained action recognition
+│   │   ├── dataset_grasp_state.py      # T3 grasp onset anticipation
+│   │   ├── dataset_forecast.py         # auxiliary forecasting heads
+│   │   └── dataset_signal_forecast.py  # T5 tactile-driven motion forecast
+│   │
+│   ├── nets/                     # Model architectures
+│   │   ├── models.py                   # backbone networks (Transformer / LSTM / 1D-CNN)
+│   │   ├── models_seqpred.py           # DailyActFormer (DAF) — multi-modal Transformer
+│   │   ├── models_forecast.py          # forecasting heads
+│   │   ├── models_forecast_priv.py     # privileged-tactile variants for T5
+│   │   ├── published_models.py         # third-party model implementations
+│   │   └── baselines_published/        # 7 published baselines (re-implementation)
+│   │       ├── baselines.py            #   DeepConvLSTM / InceptionTime / MS-TCN / etc.
+│   │       └── syncfuse.py             #   under-pressure-style multi-modal fusion
+│   │
+│   ├── tasks/                    # Training + evaluation entry points
+│   │   ├── train_exp1.py               # T1 — scene recognition
+│   │   ├── train_seqpred.py            # T2 — action recognition (DAF + ablations)
+│   │   ├── train_grasp_state.py        # T3 — grasp onset anticipation
+│   │   ├── train_pred_cls.py           # T3 alt classification head
+│   │   ├── train_exp_missing.py        # T4 — missing-modality robustness
+│   │   ├── train_signal_forecast.py    # T5 — tactile-driven motion forecasting
+│   │   ├── train_signal_forecast_priv.py  # T5 privileged variants
+│   │   ├── train_baselines_t1.py       # baselines for T1
+│   │   ├── train_exp{2,3,4}.py         # ablation experiments
+│   │   ├── train_exp_{anticipate,grip,pose,retrieval,zeroshot}.py  # auxiliary
+│   │   ├── train_pred.py / train_forecast.py
+│   │   ├── eval_baselines.py / eval_combined.py
+│   │   └── published_baselines.py      # baseline registry
+│   │
+│   ├── analysis/                 # Case study, figures, data prep utilities
+│   │   ├── grasp_phase_analysis.py     # case study (gaze→EMG→hand→contact cascade)
+│   │   ├── modality_viz.py / analysis_figures.py / data_statistics_figure.py
+│   │   ├── extract_video_features.py / extract_videomae_features.py
+│   │   ├── build_taxonomy.py / generate_action_labels.py / generate_coarse_annotations.py
+│   │   ├── reannotate_actions.py / gen_val_comparison.py
+│   │   ├── exp_per_subject.py / check_seg_lengths.py
+│   │   └── aggregate_*.py              # collate run results
+│   │
+│   ├── slurm/                    # 60+ SLURM launch scripts (one per main experiment)
+│   │   └── run_*.sh
+│   │
+│   ├── taxonomy.py               # shared 18-primitive taxonomy
+│   ├── s9_primitives.json
+│   └── taxonomy_v3.json
+│
+├── scripts/                      # Top-level utilities (not task-specific)
+│   ├── build_paper_tables.py     # collates results JSONs into LaTeX tables
+│   ├── eval_macrof1.py / eval_subset.py / eval_topk_v3.py
+│   └── dispatch_eval.sh          # batch dispatcher
+│
+├── LICENSE                       # MIT
+├── requirements.txt              # Python deps
+└── README.md
+```
+## Quick start
+```bash
+# 1. Set up Python environment
+python -m venv .venv && source .venv/bin/activate
+pip install -r requirements.txt
+# 2. Point at the PULSE dataset (download from HuggingFace first)
+export PULSE_ROOT=/path/to/PULSE   # the dataset root (not this code repo)
+# 3. Run a training entry point as a module (from the experiments/ directory)
+cd experiments
+python -m tasks.train_seqpred \
+    --root "$PULSE_ROOT" \
+    --modalities mocap emg eyetrack imu pressure \
+    --output_dir runs/t2_daf
+# 4. Reproduce paper tables (after training all benchmarks)
+cd ..
+python scripts/build_paper_tables.py \
+    --results_root experiments/runs/ \
+    --out tables/
+```
+> **Why `python -m tasks.train_seqpred` and not `python tasks/train_seqpred.py`?**
+> The training scripts import sibling modules (`from data.dataset import …`,
+> `from nets.models import …`). Running with `-m` from the `experiments/`
+> directory makes Python treat `data/`, `nets/`, `tasks/`, and `analysis/` as
+> top-level packages so the imports resolve cleanly.
+## Reproducing the benchmark tasks
+| Task | Entry point | Output |
+|---|---|---|
+| T1 — Scene recognition (8-way) | `tasks.train_exp1` | scene-classification metrics |
+| T2 — Fine-grained action recognition | `tasks.train_seqpred` | verb / noun / hand top-k accuracy |
+| T3 — Grasp onset anticipation | `tasks.train_grasp_state` / `tasks.train_pred_cls` | anticipation F1 / time-to-contact |
+| T4 — Missing-modality robustness | `tasks.train_exp_missing` + `tasks.eval_combined` | per-modality ablation table |
+| T5 — Tactile-driven grasp-state recognition | `tasks.train_signal_forecast` (+ `_priv` variants) | sub-second grasp-state metrics |
+| T6 — Cross-modal pressure prediction | `tasks.train_forecast` / `tasks.train_signal_forecast` | pressure reconstruction metrics |
+The exact command lines (with hyperparameters, seeds, GPU configs) used for
+every paper table are checked in under `experiments/slurm/run_*.sh`, one
+SLURM script per paper experiment. Output JSON files from these runs are
+collated into LaTeX tables by `scripts/build_paper_tables.py`.
+## Hardware
+Headline experiments were run on **NVIDIA A800 (80 GB)** GPUs. A single seed of
+DailyActFormer T2 trains in ~6 hours on one A800. Most baselines fit on a
+single 24 GB consumer GPU.
+## License & attribution
+Code is released under **MIT** (see `LICENSE`). The PULSE dataset itself is
+released under **CC BY-NC 4.0** (see the dataset repository).
+## Citation
+```bibtex
+@inproceedings{anonymous2026pulse,
+  title     = {PULSE: A Synchronized Five-Modality Dataset for Multi-Modal Daily Activity Understanding},
+  author    = {Anonymous Authors},
+  booktitle = {Submitted to NeurIPS 2026 Evaluations and Datasets Track},
+  year      = {2026},
+  note      = {Under double-blind review}
+}
+```

experiments/__init__.py ADDED Viewed

File without changes

experiments/analysis/__init__.py ADDED Viewed

File without changes

experiments/analysis/aggregate_new_exps.py ADDED Viewed

	@@ -0,0 +1,166 @@

+#!/usr/bin/env python3
+"""Aggregate results from the three new benchmark experiments."""
+import os
+import json
+import glob
+import numpy as np
+ROOT = '${PULSE_ROOT}/results/exp_new'
+def load_results(pattern):
+    files = sorted(glob.glob(pattern))
+    results = []
+    for f in files:
+        try:
+            results.append(json.load(open(f)))
+        except Exception as e:
+            print(f"  ERR: {f}: {e}")
+    return results
+def aggregate_expA():
+    """Missing modality: average across seeds per eval config."""
+    print("\n" + "=" * 70)
+    print("EXP A: Missing-modality robustness")
+    print("=" * 70)
+    for subdir in ['expA_missing', 'expA_baseline']:
+        files = load_results(f'{ROOT}/{subdir}/*/results.json')
+        if not files:
+            print(f"  No results yet for {subdir}")
+            continue
+        print(f"\n-- {subdir} (n seeds = {len(files)}) --")
+        # Group by eval config name; accumulate F1/Acc over seeds
+        config_stats = {}
+        for r in files:
+            if 'eval_configs' not in r:
+                continue
+            for name, info in r['eval_configs'].items():
+                config_stats.setdefault(name, {'f1': [], 'acc': [], 'active': info['active']})
+                config_stats[name]['f1'].append(info['f1'])
+                config_stats[name]['acc'].append(info['acc'])
+        # Order: full, leave-one-out, singletons
+        full_names = [n for n in config_stats if n == 'full']
+        drop_names = sorted([n for n in config_stats if n.startswith('drop_')])
+        only_names = sorted([n for n in config_stats if n.startswith('only_')])
+        print(f"  {'Config':<22s}  {'Active modalities':<42s}  "
+              f"{'F1 mean±std':<14s}  {'Acc mean±std':<14s}")
+        print('  ' + '-' * 96)
+        for grp in [full_names, drop_names, only_names]:
+            for name in grp:
+                d = config_stats[name]
+                f1_m, f1_s = np.mean(d['f1']), np.std(d['f1'])
+                ac_m, ac_s = np.mean(d['acc']), np.std(d['acc'])
+                active = ','.join(d['active'])
+                print(f"  {name:<22s}  {active:<42s}  "
+                      f"{f1_m:.3f}±{f1_s:.3f}    {ac_m:.3f}±{ac_s:.3f}")
+def aggregate_expB():
+    """Grip regression: group by (backbone, mod_config), average over seeds."""
+    print("\n" + "=" * 70)
+    print("EXP B: Grip force regression")
+    print("=" * 70)
+    files = load_results(f'{ROOT}/expB_grip/*/results.json')
+    if not files:
+        print("  No results yet")
+        return
+    # Group
+    groups = {}
+    for r in files:
+        if 'best_test_metrics' not in r:
+            continue
+        key = (r['backbone'], ','.join(r['modalities']))
+        groups.setdefault(key, []).append(r)
+    rows = []
+    for (bb, mods), rs in groups.items():
+        mae_R = [r['best_test_metrics']['right_hand']['mae_g'] for r in rs]
+        mae_L = [r['best_test_metrics']['left_hand']['mae_g'] for r in rs]
+        r_R = [r['best_test_metrics']['right_hand']['pearson_r'] for r in rs]
+        r_L = [r['best_test_metrics']['left_hand']['pearson_r'] for r in rs]
+        r2_R = [r['best_test_metrics']['right_hand']['r2'] for r in rs]
+        r2_L = [r['best_test_metrics']['left_hand']['r2'] for r in rs]
+        mae_avg = [r['best_test_metrics']['avg_mae_g'] for r in rs]
+        r_avg = [r['best_test_metrics']['avg_pearson_r'] for r in rs]
+        rows.append({
+            'backbone': bb,
+            'modalities': mods,
+            'n_seeds': len(rs),
+            'mae_R': (np.mean(mae_R), np.std(mae_R)),
+            'mae_L': (np.mean(mae_L), np.std(mae_L)),
+            'mae_avg': (np.mean(mae_avg), np.std(mae_avg)),
+            'r_R': (np.mean(r_R), np.std(r_R)),
+            'r_L': (np.mean(r_L), np.std(r_L)),
+            'r_avg': (np.mean(r_avg), np.std(r_avg)),
+            'r2_R': (np.mean(r2_R), np.std(r2_R)),
+            'r2_L': (np.mean(r2_L), np.std(r2_L)),
+        })
+    rows.sort(key=lambda r: r['r_avg'][0], reverse=True)
+    print(f"  {'Backbone':<12s}  {'Modalities':<30s}  N  "
+          f"{'MAE(g) avg':<14s}  {'Pearson r avg':<14s}  {'R²(R)':<12s}  {'R²(L)':<12s}")
+    print('  ' + '-' * 102)
+    for row in rows:
+        print(f"  {row['backbone']:<12s}  {row['modalities']:<30s}  {row['n_seeds']}  "
+              f"{row['mae_avg'][0]:.1f}±{row['mae_avg'][1]:.1f}    "
+              f"{row['r_avg'][0]:.3f}±{row['r_avg'][1]:.3f}    "
+              f"{row['r2_R'][0]:.3f}±{row['r2_R'][1]:.3f}    "
+              f"{row['r2_L'][0]:.3f}±{row['r2_L'][1]:.3f}")
+def aggregate_expC():
+    """T5 retrieval: group by mod config, average over seeds."""
+    print("\n" + "=" * 70)
+    print("EXP C: T5 Cross-modal text retrieval")
+    print("=" * 70)
+    files = load_results(f'{ROOT}/expC_retrieval/*/results.json')
+    if not files:
+        print("  No results yet")
+        return
+    groups = {}
+    for r in files:
+        if 'final_avg_over_3_pool_seeds' not in r:
+            continue
+        key = ','.join(r['modalities'])
+        groups.setdefault(key, []).append(r)
+    rows = []
+    for mods, rs in groups.items():
+        r1 = [r['final_avg_over_3_pool_seeds']['recall@1'] for r in rs]
+        r5 = [r['final_avg_over_3_pool_seeds']['recall@5'] for r in rs]
+        r10 = [r['final_avg_over_3_pool_seeds']['recall@10'] for r in rs]
+        medR = [r['final_avg_over_3_pool_seeds']['median_rank'] for r in rs]
+        rows.append({
+            'modalities': mods,
+            'n_seeds': len(rs),
+            'r1': (np.mean(r1), np.std(r1)),
+            'r5': (np.mean(r5), np.std(r5)),
+            'r10': (np.mean(r10), np.std(r10)),
+            'medR': (np.mean(medR), np.std(medR)),
+            'n_test': rs[0].get('n_test_segments', 0),
+            'K': rs[0].get('K_pool', 100),
+        })
+    rows.sort(key=lambda r: r['r10'][0], reverse=True)
+    print(f"  {'Modalities':<30s}  N  N_test  K  "
+          f"{'R@1':<12s}  {'R@5':<12s}  {'R@10':<12s}  {'medR':<12s}")
+    print('  ' + '-' * 100)
+    for row in rows:
+        print(f"  {row['modalities']:<30s}  {row['n_seeds']}  {row['n_test']:<6d}  {row['K']:<2d}  "
+              f"{row['r1'][0]:.3f}±{row['r1'][1]:.3f}  "
+              f"{row['r5'][0]:.3f}±{row['r5'][1]:.3f}  "
+              f"{row['r10'][0]:.3f}±{row['r10'][1]:.3f}  "
+              f"{row['medR'][0]:.1f}±{row['medR'][1]:.1f}")
+def main():
+    aggregate_expA()
+    aggregate_expB()
+    aggregate_expC()
+if __name__ == '__main__':
+    main()

experiments/analysis/aggregate_t1_extended.py ADDED Viewed

	@@ -0,0 +1,60 @@

+#!/usr/bin/env python3
+"""Aggregate T1 extended benchmark results.
+Prints a Markdown-style table sorted by F1 desc."""
+import os
+import json
+import glob
+import numpy as np
+from collections import defaultdict
+ROOT = '${PULSE_ROOT}/results/t1_extended'
+def collect(pattern):
+    by_key = defaultdict(list)
+    for f in sorted(glob.glob(pattern)):
+        try:
+            r = json.load(open(f))
+        except Exception as e:
+            print(f"  ERR reading {f}: {e}")
+            continue
+        key = r.get('method', os.path.basename(os.path.dirname(f)))
+        # Distinguish ablations by tag
+        tag = r.get('args', {}).get('tag', '')
+        if tag:
+            key = f"{key}_{tag}"
+        by_key[key].append(r)
+    return by_key
+def main():
+    groups = collect(f'{ROOT}/*/results.json')
+    rows = []
+    for key, rs in groups.items():
+        f1s = [r['test_f1'] for r in rs]
+        accs = [r['test_acc'] for r in rs]
+        mods = ','.join(rs[0]['modalities'])
+        rows.append({
+            'method': key,
+            'modalities': mods,
+            'n_seeds': len(rs),
+            'f1_mean': np.mean(f1s),
+            'f1_std': np.std(f1s),
+            'acc_mean': np.mean(accs),
+            'acc_std': np.std(accs),
+            'n_params': rs[0].get('n_params', 0),
+        })
+    rows.sort(key=lambda r: r['f1_mean'], reverse=True)
+    print(f"\n{'Method':<28s} {'Modalities':<32s}  N  {'F1 mean±std':<14s}  "
+          f"{'Acc mean±std':<14s}  Params")
+    print('-' * 110)
+    for r in rows:
+        print(f"{r['method']:<28s} {r['modalities']:<32s}  {r['n_seeds']}  "
+              f"{r['f1_mean']:.3f}±{r['f1_std']:.3f}   "
+              f"{r['acc_mean']:.3f}±{r['acc_std']:.3f}   "
+              f"{r['n_params']:,}")
+if __name__ == '__main__':
+    main()

experiments/analysis/analysis_figures.py ADDED Viewed

	@@ -0,0 +1,444 @@

+#!/usr/bin/env python3
+"""Generate three showcase figures for the main paper:
+  1. Eye-Hand-Contact coordination (gaze fixation + hand velocity + pressure)
+  2. Pressure fingerprints per action category
+  3. 3D hand trajectory colored by pressure
+"""
+import os, glob, json, re
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from scipy.signal import savgol_filter
+DATASET = "${PULSE_ROOT}/dataset"
+OUT_DIR = "${PULSE_ROOT}/paper/figures"
+os.makedirs(OUT_DIR, exist_ok=True)
+PRESSURE_THRESHOLD = 5.0
+FPS = 100
+# ============================================================
+# Shared data-loading helpers
+# ============================================================
+def load_pressure(scenario_dir):
+    """Return (T, 2) array of (right_total, left_total) pressure."""
+    f = os.path.join(scenario_dir, "aligned_pressure_100hz.csv")
+    if not os.path.exists(f):
+        return None
+    df = pd.read_csv(f, low_memory=False)
+    r_cols = [c for c in df.columns if c.startswith('R') and c.endswith('(g)')]
+    l_cols = [c for c in df.columns if c.startswith('L') and c.endswith('(g)')]
+    if len(r_cols) < 20 or len(l_cols) < 20:
+        return None
+    r = df[r_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values
+    l = df[l_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values
+    return r, l  # (T, 25) each
+def load_emg(scenario_dir):
+    f = os.path.join(scenario_dir, "aligned_emg_100hz.csv")
+    if not os.path.exists(f):
+        return None
+    df = pd.read_csv(f, low_memory=False)
+    numeric = [c for c in df.select_dtypes(include=[np.number]).columns
+               if c not in ('time', 'UTC', 'Frame')]
+    if len(numeric) < 4:
+        return None
+    return np.nan_to_num(df[numeric].values.astype(np.float32))
+def load_gaze(scenario_dir):
+    f = os.path.join(scenario_dir, "aligned_eyetrack_100hz.csv")
+    if not os.path.exists(f):
+        return None
+    df = pd.read_csv(f, low_memory=False)
+    gx_col = [c for c in df.columns if 'Gaze X' in c and 'Scene Cam' in c]
+    gy_col = [c for c in df.columns if 'Gaze Y' in c and 'Scene Cam' in c]
+    if gx_col and gy_col:
+        gx = pd.to_numeric(df[gx_col[0]], errors='coerce').fillna(0).values
+        gy = pd.to_numeric(df[gy_col[0]], errors='coerce').fillna(0).values
+        return np.stack([gx, gy], axis=1)
+    return None
+def load_mocap_hand(scenario_dir, vol, scenario):
+    """Return wrist 3D position (T,3) and tip position summary."""
+    f = os.path.join(scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv")
+    if not os.path.exists(f):
+        return None, None
+    df = pd.read_csv(f, sep='\t', low_memory=False)
+    # Right hand wrist (try several naming patterns)
+    candidates = [
+        ['RightHand_X','RightHand_Y','RightHand_Z'],
+        ['R_Hand_X','R_Hand_Y','R_Hand_Z'],
+        ['Q_RWristIn_X','Q_RWristIn_Y','Q_RWristIn_Z'],
+    ]
+    r_wrist = None
+    for cs in candidates:
+        if all(c in df.columns for c in cs):
+            r_wrist = df[cs].apply(pd.to_numeric, errors='coerce').fillna(0).values
+            break
+    l_wrist = None
+    for cs_l in [['LeftHand_X','LeftHand_Y','LeftHand_Z'],
+                 ['L_Hand_X','L_Hand_Y','L_Hand_Z'],
+                 ['Q_LWristIn_X','Q_LWristIn_Y','Q_LWristIn_Z']]:
+        if all(c in df.columns for c in cs_l):
+            l_wrist = df[cs_l].apply(pd.to_numeric, errors='coerce').fillna(0).values
+            break
+    return r_wrist, l_wrist
+def compute_velocity(position, window=5):
+    """Magnitude of velocity (after smoothing)."""
+    vel = np.zeros_like(position)
+    vel[1:] = position[1:] - position[:-1]
+    mag = np.linalg.norm(vel, axis=1)
+    try:
+        mag = savgol_filter(mag, window_length=min(window*2+1, len(mag)-1 if len(mag)%2==0 else len(mag)), polyorder=2)
+    except:
+        pass
+    return mag
+def detect_grasp_events(hand_pressure, threshold=PRESSURE_THRESHOLD, min_gap=50):
+    """Detect pressure onset events."""
+    total = hand_pressure.sum(axis=1) if hand_pressure.ndim == 2 else hand_pressure
+    above = total > threshold
+    onsets = []
+    last_state = False
+    for i, a in enumerate(above):
+        if a and not last_state:
+            if i + 10 < len(above) and np.mean(above[i:i+10]) > 0.7:
+                if not onsets or i - onsets[-1] > min_gap:
+                    onsets.append(i)
+                last_state = True
+        elif not a and last_state:
+            if i + 5 < len(above) and np.mean(above[i:i+5]) < 0.3:
+                last_state = False
+    return onsets
+def emg_envelope(emg, window=20):
+    rect = np.abs(emg - np.mean(emg, axis=0))
+    kernel = np.ones(window) / window
+    env = np.stack([np.convolve(rect[:, c], kernel, mode='same') for c in range(rect.shape[1])], axis=1)
+    return env.sum(axis=1)
+def gaze_velocity(gaze_xy, window=5):
+    """Magnitude of gaze velocity — high = saccade, low = fixation."""
+    v = np.zeros_like(gaze_xy)
+    v[1:] = gaze_xy[1:] - gaze_xy[:-1]
+    mag = np.linalg.norm(v, axis=1)
+    try:
+        mag = savgol_filter(mag, window_length=min(window*2+1, 15), polyorder=2)
+    except:
+        pass
+    return mag
+# ============================================================
+# FIGURE 1: Eye-Hand-Contact coordination
+# ============================================================
+def make_eye_hand_contact_figure():
+    print("=== Figure 1: Eye-Hand-Contact coordination ===")
+    context = 200  # 2s before + 0.5s after
+    after = 50
+    events = []  # list of dicts: gaze_vel, hand_vel, pressure, all shape (context+after,)
+    for vol_dir in sorted(glob.glob(f"{DATASET}/v*")):
+        vol = os.path.basename(vol_dir)
+        for sd in sorted(glob.glob(f"{vol_dir}/s*")):
+            scenario = os.path.basename(sd)
+            meta_path = os.path.join(sd, "alignment_metadata.json")
+            if not os.path.exists(meta_path):
+                continue
+            meta = json.load(open(meta_path))
+            if not {'pressure', 'eyetrack', 'mocap'}.issubset(set(meta['modalities'])):
+                continue
+            p = load_pressure(sd)
+            g = load_gaze(sd)
+            r_wrist, _ = load_mocap_hand(sd, vol, scenario)
+            if p is None or g is None or r_wrist is None:
+                continue
+            r_p, _ = p
+            min_len = min(len(r_p), len(g), len(r_wrist))
+            r_p, g, r_wrist = r_p[:min_len], g[:min_len], r_wrist[:min_len]
+            hand_vel = compute_velocity(r_wrist)
+            gvel = gaze_velocity(g)
+            total_p = r_p.sum(axis=1)
+            onsets = detect_grasp_events(r_p)
+            for o in onsets:
+                if o < context or o + after >= min_len:
+                    continue
+                # Require quiescent pre-grasp
+                rest_window = gvel[o-150:o-100]
+                vel_rest = hand_vel[o-150:o-100]
+                if np.mean(vel_rest) > hand_vel[o-50:o].mean() * 0.5:
+                    continue
+                gv_seg = gvel[o-context:o+after]
+                hv_seg = hand_vel[o-context:o+after]
+                pr_seg = total_p[o-context:o+after]
+                if len(gv_seg) != context+after or np.isnan(gv_seg).any():
+                    continue
+                events.append({'gv': gv_seg, 'hv': hv_seg, 'p': pr_seg})
+            if len(events) > 400:
+                break
+        if len(events) > 400:
+            break
+    print(f"  Collected {len(events)} events")
+    if len(events) < 50:
+        print("  Not enough events, skipping")
+        return
+    # Gaze: fixation = low gaze velocity, so use "1 - normalized gaze velocity"
+    # This represents "gaze fixation stability"
+    def norm01(arr):
+        arr = np.array(arr)
+        arr = arr - arr.min(axis=1, keepdims=True)
+        mx = arr.max(axis=1, keepdims=True)
+        return arr / (mx + 1e-8)
+    gv_stack = norm01([e['gv'] for e in events])
+    hv_stack = norm01([e['hv'] for e in events])
+    p_stack = norm01([e['p'] for e in events])
+    # Smooth gaze to show fixation trend
+    # Gaze fixation = low velocity. Plot (1 - gaze_velocity) -> rises as gaze fixates
+    gaze_fix = 1 - gv_stack  # high = fixating
+    # Normalize each event's fix to [0,1] for display
+    gaze_fix_plot = norm01(gaze_fix)
+    time_axis = np.arange(-context, after) * 10  # ms
+    fig, ax = plt.subplots(figsize=(9, 4.5))
+    for stack, color, label in [
+        (gaze_fix_plot, '#8E44AD', 'Gaze fixation'),
+        (hv_stack, '#3498DB', 'Hand velocity'),
+        (p_stack, '#27AE60', 'Pressure (contact)'),
+    ]:
+        mean = stack.mean(axis=0)
+        std = stack.std(axis=0)
+        ax.plot(time_axis, mean, color=color, linewidth=2.5, label=label)
+        ax.fill_between(time_axis, mean - std*0.4, mean + std*0.4, color=color, alpha=0.15)
+    ax.axvline(0, color='black', linestyle='--', linewidth=1.2, alpha=0.7)
+    ax.set_xlabel('Time relative to contact onset (ms)', fontsize=12)
+    ax.set_ylabel('Normalized amplitude', fontsize=12)
+    ax.set_title(f'Gaze → Hand → Contact coordination ({len(events)} events)',
+                 fontsize=13, fontweight='bold')
+    ax.set_xlim(-2000, 500)
+    ax.legend(loc='upper left', fontsize=10, frameon=True)
+    ax.grid(True, alpha=0.3)
+    ax.set_ylim(-0.05, 1.1)
+    plt.tight_layout()
+    out_path = os.path.join(OUT_DIR, 'eye_hand_contact.pdf')
+    plt.savefig(out_path, dpi=150, bbox_inches='tight')
+    plt.savefig(out_path.replace('.pdf', '.png'), dpi=150, bbox_inches='tight')
+    plt.close()
+    print(f"  Saved {out_path}")
+# ============================================================
+# FIGURE 2: Pressure fingerprints per action category
+# ============================================================
+def make_pressure_fingerprints():
+    print("\n=== Figure 2: Pressure fingerprints ===")
+    import sys
+    sys.path.insert(0, '${PULSE_ROOT}')
+    from experiments.train_exp2 import load_annotations
+    # For each action class, accumulate mean pressure profile (50 channels)
+    action_r_sum = {}  # action -> (sum 25 channels, count)
+    action_l_sum = {}
+    for vol_dir in sorted(glob.glob(f"{DATASET}/v*")):
+        vol = os.path.basename(vol_dir)
+        for sd in sorted(glob.glob(f"{vol_dir}/s*")):
+            scenario = os.path.basename(sd)
+            meta_path = os.path.join(sd, "alignment_metadata.json")
+            if not os.path.exists(meta_path):
+                continue
+            meta = json.load(open(meta_path))
+            if 'pressure' not in set(meta['modalities']):
+                continue
+            p = load_pressure(sd)
+            if p is None:
+                continue
+            r_p, l_p = p
+            labels = load_annotations(vol, scenario, len(r_p), sampling_rate=100, use_coarse=False)
+            if labels is None:
+                continue
+            labels = labels[:len(r_p)]
+            from experiments.train_exp2 import ACTION_NAMES
+            for a_id, a_name in ACTION_NAMES.items():
+                if a_name == 'Idle':
+                    continue
+                mask = labels == a_id
+                if mask.sum() < 10:
+                    continue
+                r_mean = r_p[mask].mean(axis=0)
+                l_mean = l_p[mask].mean(axis=0)
+                if a_name not in action_r_sum:
+                    action_r_sum[a_name] = [np.zeros(25), 0]
+                    action_l_sum[a_name] = [np.zeros(25), 0]
+                action_r_sum[a_name][0] += r_mean * mask.sum()
+                action_r_sum[a_name][1] += mask.sum()
+                action_l_sum[a_name][0] += l_mean * mask.sum()
+                action_l_sum[a_name][1] += mask.sum()
+    # Compute mean for each action
+    results = {}
+    for a_name in action_r_sum:
+        r_cnt = action_r_sum[a_name][1]
+        l_cnt = action_l_sum[a_name][1]
+        if r_cnt == 0 or l_cnt == 0:
+            continue
+        results[a_name] = {
+            'r': action_r_sum[a_name][0] / r_cnt,
+            'l': action_l_sum[a_name][0] / l_cnt,
+        }
+    print(f"  Action categories: {list(results.keys())}")
+    if not results:
+        print("  No data")
+        return
+    # Pick top 6 by frequency (they have most data)
+    # Sort by right-hand count
+    sorted_actions = sorted(results.keys(),
+                            key=lambda a: action_r_sum[a][1], reverse=True)[:6]
+    # Plot as 2-row grid: top row = right hand, bottom row = left hand (or combine as single image)
+    # Use 25 points arranged as a 5x5 grid (stylized hand layout)
+    # Actual finger layout is complex; for visualization use simple grid
+    # Layout (rough hand analogy): arrange as fingertips at top, palm base at bottom
+    # Index mapping — 25 points, organized heuristically:
+    # row 0 (fingertips): 1-5
+    # row 1-2: finger segments
+    # row 3-4: palm area
+    def point_to_xy(idx):
+        """Map channel index (0-24) to 2D hand position (stylized)."""
+        # Simple 5x5 grid
+        row = idx // 5
+        col = idx % 5
+        return col, 4 - row  # flip y so fingertips at top
+    n = len(sorted_actions)
+    fig, axes = plt.subplots(2, n, figsize=(2.0 * n, 4.8), squeeze=False)
+    vmax = max(max(results[a]['r'].max(), results[a]['l'].max()) for a in sorted_actions)
+    for i, a in enumerate(sorted_actions):
+        for row, (hand, title) in enumerate([('r', 'Right'), ('l', 'Left')]):
+            ax = axes[row][i]
+            data = results[a][hand]
+            grid = np.zeros((5, 5))
+            for idx, v in enumerate(data):
+                x, y = point_to_xy(idx)
+                grid[4-y, x] = v
+            im = ax.imshow(grid, cmap='hot', vmin=0, vmax=vmax, aspect='equal')
+            ax.set_xticks([]); ax.set_yticks([])
+            if row == 0:
+                ax.set_title(a, fontsize=11, fontweight='bold')
+            if i == 0:
+                ax.set_ylabel(title, fontsize=10)
+    fig.suptitle('Per-action fingertip pressure signatures (mean across events)',
+                 fontsize=12, fontweight='bold', y=0.98)
+    cbar = fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.7, pad=0.02)
+    cbar.set_label('Pressure (g)', fontsize=10)
+    plt.savefig(os.path.join(OUT_DIR, 'pressure_fingerprints.pdf'), bbox_inches='tight')
+    plt.savefig(os.path.join(OUT_DIR, 'pressure_fingerprints.png'), dpi=150, bbox_inches='tight')
+    plt.close()
+    print(f"  Saved pressure_fingerprints.pdf")
+# ============================================================
+# FIGURE 3: 3D hand trajectory colored by pressure
+# ============================================================
+def make_3d_trajectory():
+    """Plot right-wrist 3D trajectories around a grasp onset, coloured by pressure.
+    Walks a fixed list of candidate (volunteer, scenario) recordings and keeps
+    at most three for which the recording directory exists, both
+    load_pressure() and load_mocap_hand() return data, and
+    detect_grasp_events() reports at least one onset. For each kept recording
+    a window of up to 150 samples either side of the first onset is drawn as
+    a 3D poly-line, one subplot per recording. Saves hand_trajectory_3d.pdf
+    and hand_trajectory_3d.png into OUT_DIR; prints a message and returns
+    without saving if no candidate qualifies.
+    """
+    print("\n=== Figure 3: 3D hand trajectory + pressure coloring ===")
+    # Axes3D is not referenced below. NOTE(review): presumably imported for its
+    # side effect of registering the '3d' projection on older matplotlib --
+    # confirm before removing.
+    from mpl_toolkits.mplot3d import Axes3D
+    # Pick a few illustrative recordings with rich grasping — use v1 s3 (kitchen) or similar
+    candidates = [('v1', 's3'), ('v2', 's4'), ('v1', 's5'), ('v1', 's7')]
+    picked = []
+    for vol, scn in candidates:
+        sd = f"{DATASET}/{vol}/{scn}"
+        if not os.path.isdir(sd):
+            continue
+        p = load_pressure(sd)
+        r_wrist, _ = load_mocap_hand(sd, vol, scn)
+        if p is None or r_wrist is None:
+            continue
+        r_p, _ = p
+        # Truncate both streams to their common length before indexing them together.
+        min_len = min(len(r_p), len(r_wrist))
+        # Sum over axis 1 (presumably the pressure channels) -> one value per sample.
+        total_p = r_p[:min_len].sum(axis=1)
+        r_wrist = r_wrist[:min_len]
+        # Take a window that contains a grasp
+        onsets = detect_grasp_events(r_p[:min_len])
+        if not onsets:
+            continue
+        # Take ~3s centred on first onset
+        o = onsets[0]
+        # Clamp the window to the recording, so it can be shorter than 300 samples.
+        start = max(0, o - 150)
+        end = min(min_len, o + 150)
+        traj = r_wrist[start:end]
+        pressure = total_p[start:end]
+        picked.append((vol, scn, traj, pressure))
+        if len(picked) >= 3:
+            break
+    if not picked:
+        print("  No valid recordings found")
+        return
+    fig = plt.figure(figsize=(3.5 * len(picked), 4))
+    for i, (vol, scn, traj, pr) in enumerate(picked):
+        ax = fig.add_subplot(1, len(picked), i+1, projection='3d')
+        # Normalize pressure for coloring
+        # Scaled by this window's own maximum, so colours are not comparable
+        # across subplots; the epsilon avoids division by zero.
+        pr_norm = pr / (pr.max() + 1e-6)
+        # Plot as colored line segments
+        # Each two-point segment takes the colour of its starting sample.
+        for j in range(len(traj) - 1):
+            x = traj[j:j+2, 0]
+            y = traj[j:j+2, 1]
+            z = traj[j:j+2, 2]
+            c = plt.cm.coolwarm(pr_norm[j])
+            ax.plot(x, y, z, color=c, linewidth=2.5, alpha=0.85)
+        # Mark contact point
+        # The sample with the highest summed pressure inside the window.
+        contact_idx = np.argmax(pr)
+        # A label is set here, but no legend is drawn in this function.
+        ax.scatter(traj[contact_idx, 0], traj[contact_idx, 1], traj[contact_idx, 2],
+                   color='red', s=50, marker='*', zorder=5, label='Peak contact')
+        ax.set_title(f'{vol}/{scn}', fontsize=10)
+        ax.set_xlabel('X', fontsize=8); ax.set_ylabel('Y', fontsize=8); ax.set_zlabel('Z', fontsize=8)
+        ax.tick_params(labelsize=7)
+    # Colorbar
+    # Built from a standalone mappable on [0, 1] because the lines above were
+    # coloured manually rather than through a mappable artist.
+    sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=matplotlib.colors.Normalize(vmin=0, vmax=1))
+    sm.set_array([])
+    cbar = fig.colorbar(sm, ax=fig.axes, shrink=0.6, pad=0.02)
+    cbar.set_label('Normalised pressure', fontsize=10)
+    fig.suptitle('Right-hand wrist 3D trajectory coloured by fingertip pressure',
+                 fontsize=12, fontweight='bold', y=1.02)
+    plt.savefig(os.path.join(OUT_DIR, 'hand_trajectory_3d.pdf'), bbox_inches='tight')
+    plt.savefig(os.path.join(OUT_DIR, 'hand_trajectory_3d.png'), dpi=150, bbox_inches='tight')
+    plt.close()
+    print(f"  Saved hand_trajectory_3d.pdf")
+# Script entry point: run the three figure builders in order, then report
+# the directory the outputs were written to.
+if __name__ == '__main__':
+    make_eye_hand_contact_figure()
+    make_pressure_fingerprints()
+    make_3d_trajectory()
+    print("\nAll figures generated in", OUT_DIR)

experiments/analysis/build_taxonomy.py ADDED Viewed

	@@ -0,0 +1,136 @@

+#!/usr/bin/env python3
+"""
+Rebuild the frozen taxonomy JSON from the current annotations_v3/ state.
+Run this *once* after annotation is complete to lock the 28+ noun list. Later
+experiments load the frozen list via taxonomy.py, so class indices don't
+drift if more annotations are ever added.
+Usage:
+    python3 experiments/analysis/build_taxonomy.py
+    python3 experiments/analysis/build_taxonomy.py --threshold 50 --out experiments/taxonomy_v3.json
+"""
+import argparse
+import glob
+import json
+import os
+from collections import Counter
+from pathlib import Path
+REPO = Path(__file__).resolve().parents[1]
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument(
+        "--annotations_dir",
+        default=str(REPO / "annotations_v3"),
+        help="Directory containing v*/s*.json annotation files",
+    )
+    ap.add_argument("--threshold", type=int, default=50,
+                    help="Minimum noun frequency to keep (Strategy A drops the rest)")
+    ap.add_argument(
+        "--out",
+        default=str(REPO / "experiments" / "taxonomy_v3.json"),
+        help="Output frozen taxonomy JSON",
+    )
+    args = ap.parse_args()
+    # Late import so building the list doesn't depend on the frozen file
+    # being present yet.
+    import sys
+    sys.path.insert(0, str(REPO))
+    from experiments.taxonomy import (
+        VERB_FINE, VERB_COMPOSITE, HAND, NOUN_CANONICAL, canonical_noun,
+    )
+    paths = sorted(glob.glob(os.path.join(args.annotations_dir, "v*", "s*.json")))
+    if not paths:
+        raise SystemExit(f"No json files under {args.annotations_dir}")
+    verbs, nouns, hands = Counter(), Counter(), Counter()
+    total = 0
+    dropped_unknown_verb = 0
+    dropped_unknown_hand = 0
+    for p in paths:
+        try:
+            with open(p) as f:
+                d = json.load(f)
+        except Exception as e:
+            print(f"  WARN: could not parse {p}: {e}")
+            continue
+        for s in d.get("segments", []):
+            a = s.get("action_annotation", {})
+            v = a.get("action_name")
+            n = a.get("object_name")
+            h = a.get("hand_type")
+            if not (v and n and h):
+                continue
+            total += 1
+            if v not in VERB_FINE:
+                dropped_unknown_verb += 1
+                continue
+            if h not in HAND:
+                dropped_unknown_hand += 1
+                continue
+            verbs[v] += 1
+            nouns[canonical_noun(n)] += 1
+            hands[h] += 1
+    kept = [n for n, c in nouns.most_common() if c >= args.threshold]
+    # Stable alphabetical ordering within kept-set, so re-runs that swap two
+    # near-tie classes don't flip indices.
+    kept = sorted(kept, key=lambda n: (-nouns[n], n))
+    surviving_segs = 0
+    for p in paths:
+        with open(p) as f:
+            d = json.load(f)
+        for s in d.get("segments", []):
+            a = s.get("action_annotation", {})
+            v = a.get("action_name")
+            n = a.get("object_name")
+            h = a.get("hand_type")
+            if not (v and n and h):
+                continue
+            if v not in VERB_FINE or h not in HAND:
+                continue
+            if canonical_noun(n) not in kept:
+                continue
+            surviving_segs += 1
+    out = {
+        "threshold":             args.threshold,
+        "annotation_file_count": len(paths),
+        "total_segments":        total,
+        "dropped_unknown_verb":  dropped_unknown_verb,
+        "dropped_unknown_hand":  dropped_unknown_hand,
+        "surviving_segments":    surviving_segs,
+        "verbs":                 VERB_FINE,
+        "verb_composite":        VERB_COMPOSITE,
+        "hand":                  HAND,
+        "nouns":                 kept,
+        "noun_counts":           {n: nouns[n] for n in kept},
+        "verb_counts":           dict(verbs),
+        "hand_counts":           dict(hands),
+    }
+    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
+    with open(args.out, "w") as f:
+        json.dump(out, f, ensure_ascii=False, indent=2)
+    print(f"Scanned {len(paths)} files, {total} segments")
+    print(f"Dropped (unknown verb / hand): {dropped_unknown_verb} / "
+          f"{dropped_unknown_hand}")
+    print(f"Kept {len(kept)} nouns (>= {args.threshold}):")
+    for n in kept:
+        print(f"  {n}: {nouns[n]}")
+    print(f"Surviving segments (Strategy A): "
+          f"{surviving_segs} / {total}  "
+          f"({100 * surviving_segs / max(1, total):.1f}%)")
+    print(f"Wrote {args.out}")
+if __name__ == "__main__":
+    main()

experiments/analysis/check_seg_lengths.py ADDED Viewed

	@@ -0,0 +1,229 @@

+#!/usr/bin/env python3
+"""
+Analyze segment lengths in the recognition dataset.
+For each annotation file, computes segment lengths in:
+- Raw frames (at 100Hz sampling rate)
+- Downsampled frames (downsample=5 -> 20Hz effective)
+Reports statistics and distribution relative to window_frames used in training.
+"""
+import os
+import sys
+import json
+import re
+import numpy as np
+from collections import defaultdict
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from data.dataset import DATASET_DIR, TRAIN_VOLS, VAL_VOLS, TEST_VOLS
+ANNOTATION_DIR = "${PULSE_ROOT}"
+SAMPLING_RATE = 100  # Hz
+DOWNSAMPLE = 5
+def parse_timestamp(ts_str):
+    parts = ts_str.strip().split(':')
+    if len(parts) == 2:
+        return int(parts[0]) * 60 + int(parts[1])
+    elif len(parts) == 3:
+        return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
+    return 0
+def main():
+    all_vols = TRAIN_VOLS + VAL_VOLS + TEST_VOLS
+    # Collect segment lengths
+    raw_lengths_sec = []       # in seconds
+    raw_lengths_frames = []    # in raw 100Hz frames
+    ds_lengths_frames = []     # in downsampled frames (100/5 = 20Hz)
+    split_stats = defaultdict(list)  # split -> list of ds_lengths
+    total_scenarios = 0
+    total_segments = 0
+    skipped_segments = 0
+    for vol in sorted(all_vols):
+        # Determine split
+        if vol in TRAIN_VOLS:
+            split = 'train'
+        elif vol in VAL_VOLS:
+            split = 'val'
+        else:
+            split = 'test'
+        ann_vol_dir = os.path.join(ANNOTATION_DIR, vol)
+        if not os.path.isdir(ann_vol_dir):
+            print(f"WARNING: No annotation dir for {vol}")
+            continue
+        for ann_file in sorted(os.listdir(ann_vol_dir)):
+            if not ann_file.endswith('.json'):
+                continue
+            scenario = ann_file.replace('.json', '')
+            ann_path = os.path.join(ann_vol_dir, ann_file)
+            # Also check that corresponding dataset dir exists
+            scenario_dir = os.path.join(DATASET_DIR, vol, scenario)
+            if not os.path.isdir(scenario_dir):
+                continue
+            with open(ann_path) as f:
+                ann = json.load(f)
+            total_scenarios += 1
+            for seg in ann.get('segments', []):
+                m = re.match(r'(\d+:\d+(?::\d+)?)\s*-\s*(\d+:\d+(?::\d+)?)',
+                             seg['timestamp'])
+                if not m:
+                    skipped_segments += 1
+                    continue
+                start_sec = parse_timestamp(m.group(1))
+                end_sec = parse_timestamp(m.group(2))
+                if end_sec <= start_sec:
+                    skipped_segments += 1
+                    continue
+                duration_sec = end_sec - start_sec
+                raw_frames = duration_sec * SAMPLING_RATE
+                ds_frames = int(end_sec * SAMPLING_RATE / DOWNSAMPLE) - int(start_sec * SAMPLING_RATE / DOWNSAMPLE)
+                raw_lengths_sec.append(duration_sec)
+                raw_lengths_frames.append(raw_frames)
+                ds_lengths_frames.append(ds_frames)
+                split_stats[split].append(ds_frames)
+                total_segments += 1
+    # Convert to numpy
+    raw_sec = np.array(raw_lengths_sec)
+    raw_fr = np.array(raw_lengths_frames)
+    ds_fr = np.array(ds_lengths_frames)
+    print("=" * 70)
+    print("SEGMENT LENGTH ANALYSIS FOR RECOGNITION DATASET")
+    print("=" * 70)
+    print(f"\nTotal scenarios: {total_scenarios}")
+    print(f"Total valid segments: {total_segments}")
+    print(f"Skipped segments (bad timestamp): {skipped_segments}")
+    print(f"Sampling rate: {SAMPLING_RATE} Hz")
+    print(f"Downsample factor: {DOWNSAMPLE}")
+    print(f"Effective rate after downsample: {SAMPLING_RATE / DOWNSAMPLE} Hz")
+    # --- Raw seconds ---
+    print("\n" + "-" * 70)
+    print("SEGMENT DURATION (seconds)")
+    print("-" * 70)
+    print(f"  Min:    {raw_sec.min():.1f}s")
+    print(f"  Max:    {raw_sec.max():.1f}s")
+    print(f"  Mean:   {raw_sec.mean():.2f}s")
+    print(f"  Median: {np.median(raw_sec):.1f}s")
+    print(f"  Std:    {raw_sec.std():.2f}s")
+    # Percentiles
+    for p in [5, 10, 25, 50, 75, 90, 95]:
+        print(f"  P{p:2d}:    {np.percentile(raw_sec, p):.1f}s")
+    # --- Raw frames (100Hz) ---
+    print("\n" + "-" * 70)
+    print("SEGMENT LENGTH (raw frames @ 100Hz)")
+    print("-" * 70)
+    print(f"  Min:    {raw_fr.min()}")
+    print(f"  Max:    {raw_fr.max()}")
+    print(f"  Mean:   {raw_fr.mean():.1f}")
+    print(f"  Median: {np.median(raw_fr):.0f}")
+    # --- Downsampled frames ---
+    print("\n" + "-" * 70)
+    print(f"SEGMENT LENGTH (downsampled frames @ {SAMPLING_RATE/DOWNSAMPLE:.0f}Hz)")
+    print("-" * 70)
+    print(f"  Min:    {ds_fr.min()}")
+    print(f"  Max:    {ds_fr.max()}")
+    print(f"  Mean:   {ds_fr.mean():.1f}")
+    print(f"  Median: {np.median(ds_fr):.0f}")
+    print(f"  Std:    {ds_fr.std():.1f}")
+    for p in [5, 10, 25, 50, 75, 90, 95]:
+        print(f"  P{p:2d}:    {np.percentile(ds_fr, p):.0f}")
+    # --- Comparison with window_frames ---
+    print("\n" + "-" * 70)
+    print("COMPARISON WITH window_frames SETTINGS")
+    print("-" * 70)
+    # Common window_sec values and their corresponding window_frames
+    for window_sec in [5.0, 10.0, 15.0, 20.0, 30.0]:
+        wf = int(window_sec * SAMPLING_RATE / DOWNSAMPLE)
+        shorter = (ds_fr < wf).sum()
+        equal_or_longer = (ds_fr >= wf).sum()
+        longer = (ds_fr > wf).sum()
+        pct_shorter = 100.0 * shorter / len(ds_fr)
+        pct_longer = 100.0 * longer / len(ds_fr)
+        print(f"\n  window_sec={window_sec:5.1f}s -> window_frames={wf}")
+        print(f"    Segments SHORTER than window: {shorter:4d} ({pct_shorter:5.1f}%) -> will be PADDED")
+        print(f"    Segments LONGER  than window: {longer:4d} ({pct_longer:5.1f}%) -> will be CENTER-CROPPED")
+    # --- Thresholds in downsampled frames ---
+    print("\n" + "-" * 70)
+    print("PERCENTAGE SHORTER THAN THRESHOLDS (downsampled frames)")
+    print("-" * 70)
+    for thresh in [20, 40, 60, 100, 200, 300, 400, 500, 1000, 2000]:
+        pct = 100.0 * (ds_fr < thresh).sum() / len(ds_fr)
+        print(f"  < {thresh:5d} frames ({thresh * DOWNSAMPLE / SAMPLING_RATE:6.1f}s): {pct:5.1f}%")
+    # --- Per-split stats ---
+    print("\n" + "-" * 70)
+    print("PER-SPLIT STATISTICS (downsampled frames)")
+    print("-" * 70)
+    for split in ['train', 'val', 'test']:
+        arr = np.array(split_stats[split])
+        if len(arr) == 0:
+            print(f"  {split}: no segments")
+            continue
+        print(f"\n  {split.upper()} ({len(arr)} segments):")
+        print(f"    Min={arr.min()}, Max={arr.max()}, Mean={arr.mean():.1f}, Median={np.median(arr):.0f}")
+    # --- Histogram (text-based) ---
+    print("\n" + "-" * 70)
+    print("HISTOGRAM OF SEGMENT DURATIONS (seconds)")
+    print("-" * 70)
+    bins = [0, 1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 60, 120, 300, 600]
+    for i in range(len(bins) - 1):
+        count = ((raw_sec >= bins[i]) & (raw_sec < bins[i + 1])).sum()
+        pct = 100.0 * count / len(raw_sec)
+        bar = '#' * int(pct / 2)
+        print(f"  [{bins[i]:4d}-{bins[i+1]:4d})s: {count:5d} ({pct:5.1f}%) {bar}")
+    # Last bin: >= 600
+    count = (raw_sec >= bins[-1]).sum()
+    pct = 100.0 * count / len(raw_sec)
+    bar = '#' * int(pct / 2)
+    print(f"  [{bins[-1]:4d}+   )s: {count:5d} ({pct:5.1f}%) {bar}")
+    # --- Key insight ---
+    print("\n" + "=" * 70)
+    print("KEY INSIGHTS")
+    print("=" * 70)
+    median_sec = np.median(raw_sec)
+    mean_sec = raw_sec.mean()
+    print(f"  Median segment duration: {median_sec:.1f}s ({median_sec * SAMPLING_RATE / DOWNSAMPLE:.0f} ds-frames)")
+    print(f"  Mean segment duration:   {mean_sec:.1f}s ({mean_sec * SAMPLING_RATE / DOWNSAMPLE:.0f} ds-frames)")
+    print()
+    # Suggest optimal window
+    p95_sec = np.percentile(raw_sec, 95)
+    print(f"  95th percentile duration: {p95_sec:.1f}s")
+    print(f"  -> A window of {p95_sec:.0f}s would cover 95% of segments without cropping")
+    print(f"  -> Current default window_sec=15.0 -> window_frames={int(15.0 * SAMPLING_RATE / DOWNSAMPLE)}")
+    wf15 = int(15.0 * SAMPLING_RATE / DOWNSAMPLE)
+    pct_crop = 100.0 * (ds_fr > wf15).sum() / len(ds_fr)
+    pct_pad = 100.0 * (ds_fr < wf15).sum() / len(ds_fr)
+    print(f"     {pct_pad:.1f}% segments padded, {pct_crop:.1f}% center-cropped")
+if __name__ == '__main__':
+    main()

experiments/analysis/data_statistics_figure.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""Generate dataset statistics figure from the currently-available annotations.
+Panels (3):
+    (a) Recording duration distribution per scene (boxplot)
+    (b) Segment length distribution (histogram)
+    (c) Top-20 manipulated objects by segment count
+Note: panel for motor-primitive frequency is deferred until the 18-primitive
+annotation pipeline (anno.py) is rerun across all recordings.
+"""
+import json, re
+from pathlib import Path
+from collections import Counter
+import numpy as np
+import matplotlib.pyplot as plt
+ANNO_DIR = Path("${PULSE_ROOT}/annotations_by_scene")
+OUT = Path("${PULSE_ROOT}/paper/figures/dataset_stats.pdf")
+# Chinese -> English object name mapping (from anno.py OBJECT_TRANSLATIONS)
+OBJ_EN = {
+    "笔记本电脑": "laptop", "有线鼠标": "wired mouse", "有线键盘": "wired keyboard",
+    "马克笔": "marker", "胶带": "tape", "笔记本电源": "laptop power", "折叠伞": "umbrella",
+    "剪刀": "scissors", "钱包": "wallet", "纸": "paper", "订书机": "stapler",
+    "纸箱": "box", "文件": "document", "架子": "rack", "桌布": "tablecloth", "罐子": "jar",
+    "调料瓶": "seasoning bottle", "密封罐": "sealed jar", "厨房纸巾": "kitchen paper",
+    "抹布": "cloth", "茶包": "tea bag", "饭碗": "rice bowl", "菜盘": "plate",
+    "菜锅": "pot", "勺子": "spoon", "水杯": "water cup", "茶杯": "tea cup",
+    "茶壶": "teapot", "食物残渣": "food residue", "垃圾桶": "trash bin",
+    "纸巾": "tissue", "餐垫": "placemat", "托盘": "tray", "清洁喷雾": "spray",
+    "食物": "food", "电源": "power adapter", "移动硬盘": "HDD", "鼠标": "mouse",
+    "笔记本充电器": "laptop charger", "转换插头": "plug adapter", "插线板": "power strip",
+    "线材收纳包": "cable organizer", "衬衫": "shirt", "裤子": "pants",
+    "牙膏": "toothpaste", "牙刷": "toothbrush", "牙刷盒": "toothbrush case",
+    "剃须刀": "razor", "毛巾": "towel", "皮鞋": "shoes", "鞋袋": "shoe bag",
+    "耳机": "headphones", "护照套": "passport holder", "证件夹": "ID holder",
+    "纸巾包": "tissue pack", "行李箱": "suitcase", "马克杯": "mug",
+    "调料罐": "seasoning jar", "茶罐": "tea canister", "外套": "coat",
+    "围巾": "scarf", "衣架": "hanger",
+}
+def parse_t(ts: str) -> float:
+    parts = ts.split(":")
+    if len(parts) == 2:  # MM:SS
+        m, s = parts
+        return int(m) * 60 + int(s)
+    h, m, s = parts
+    return int(h) * 3600 + int(m) * 60 + int(s)
+# Accumulators filled by the scan below:
+#   durations   -- per scene key "S1".."S8", one value (minutes) per recording
+#   seg_lengths -- end minus start, in seconds, for every parsed segment
+#   objects     -- how many segments mention each object name
+durations = {f"S{i}": [] for i in range(1, 9)}
+seg_lengths = []
+objects = Counter()
+for v_dir in sorted(ANNO_DIR.glob("v*")):
+    for jf in sorted(v_dir.glob("s*.json")):
+        # File stem upper-cased (e.g. "s3" -> "S3") to match the durations keys.
+        scene = jf.stem.upper()
+        try:
+            data = json.loads(jf.read_text())
+        except Exception:
+            # Unreadable or invalid files are skipped silently.
+            continue
+        segs = data.get("segments", [])
+        if not segs:
+            continue
+        # Latest segment end seen in this file; used below as the recording duration.
+        max_end = 0
+        for seg in segs:
+            ts = seg.get("timestamp", "")
+            if "-" not in ts:
+                continue
+            try:
+                start, end = ts.split("-")
+                s_sec, e_sec = parse_t(start), parse_t(end)
+                seg_lengths.append(e_sec - s_sec)
+                max_end = max(max_end, e_sec)
+                # Objects may be dicts with a "name" key or bare values; names
+                # without an entry in OBJ_EN are counted under the original name.
+                for o in seg.get("objects", []) or []:
+                    nm = o.get("name") if isinstance(o, dict) else o
+                    if nm:
+                        objects[OBJ_EN.get(nm, nm)] += 1
+            except Exception:
+                # A failure part-way through keeps whatever was already recorded
+                # for this segment and moves on to the next one.
+                continue
+        # Files whose scene key is not S1..S8 contribute segments and objects
+        # but no duration.
+        if max_end > 0 and scene in durations:
+            durations[scene].append(max_end / 60.0)
+print(f"Per-scene durations: { {s: len(v) for s, v in durations.items()} }")
+print(f"Total segments: {len(seg_lengths)}")
+print(f"Unique objects: {len(objects)}")
+top_obj = objects.most_common(5)
+print(f"Top objects: {top_obj}")
+fig, axes = plt.subplots(1, 3, figsize=(12, 3.5))
+# (a) Duration boxplot per scene
+ax = axes[0]
+scene_order = [f"S{i}" for i in range(1, 9)]
+data = [durations[s] for s in scene_order]
+ax.boxplot(data, tick_labels=scene_order, showfliers=False, patch_artist=True,
+           boxprops=dict(facecolor="#b3cde3"))
+ax.set_ylabel("Recording duration (min)")
+ax.set_title("(a) Recording duration per scene")
+ax.grid(axis="y", alpha=0.3)
+# (b) Segment length histogram
+ax = axes[1]
+seg_arr = np.array(seg_lengths)
+seg_arr = seg_arr[seg_arr <= 10]
+ax.hist(seg_arr, bins=np.arange(0, 11) - 0.5, color="#8c96c6", edgecolor="black")
+ax.set_xlabel("Segment length (s)")
+ax.set_ylabel("Segment count")
+ax.set_title(f"(b) Segment length (n={len(seg_lengths)})")
+ax.set_xticks(range(0, 11))
+ax.grid(axis="y", alpha=0.3)
+# (c) Top-20 objects
+ax = axes[2]
+objs, ocounts = zip(*objects.most_common(20))
+ax.barh(objs[::-1], ocounts[::-1], color="#74c476")
+ax.set_xlabel("Segment count")
+ax.set_title("(c) Top-20 manipulated objects")
+ax.tick_params(axis="y", labelsize=8)
+ax.grid(axis="x", alpha=0.3)
+fig.tight_layout()
+fig.savefig(OUT, bbox_inches="tight")
+fig.savefig(str(OUT).replace(".pdf", ".png"), dpi=140, bbox_inches="tight")
+print(f"Saved: {OUT}")

experiments/analysis/exp_per_subject.py ADDED Viewed

	@@ -0,0 +1,150 @@

+#!/usr/bin/env python3
+"""
+Experiment G: Per-subject diagnostic analysis.
+Load the best scene-recognition checkpoint(s) from previous T1 runs and
+produce a per-test-volunteer breakdown of F1 and Accuracy. Reveals whether
+aggregate metrics are driven by one or two outlier subjects, as reviewers
+often ask.
+Runs CPU-side; no training.
+"""
+import os
+import sys
+import json
+import glob
+import argparse
+import numpy as np
+import torch
+from sklearn.metrics import accuracy_score, f1_score
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from data.dataset import (
+    MultimodalSceneDataset, TEST_VOLS, SCENE_LABELS, NUM_CLASSES,
+    get_dataloaders,
+)
+from nets.models import build_model
+def per_subject_eval(model, device, modalities, stats, downsample):
+    """Evaluate one model across each test volunteer separately."""
+    breakdown = {}
+    for vol in TEST_VOLS:
+        ds = MultimodalSceneDataset([vol], modalities, downsample=downsample,
+                                    stats=stats)
+        if len(ds) == 0:
+            breakdown[vol] = {'n': 0}
+            continue
+        preds, ys = [], []
+        model.eval()
+        with torch.no_grad():
+            for i in range(len(ds)):
+                x, y = ds[i]
+                x = x.to(device).unsqueeze(0)
+                mask = torch.ones(1, x.size(1), dtype=torch.bool).to(device)
+                logits = model(x, mask)
+                preds.append(logits.argmax(dim=1).cpu().item())
+                ys.append(y)
+        breakdown[vol] = {
+            'n': len(ds),
+            'acc': float(accuracy_score(ys, preds)),
+            'f1': float(f1_score(ys, preds, average='macro', zero_division=0)),
+            'preds': preds,
+            'labels': ys,
+            'samples': ds.sample_info,
+        }
+    return breakdown
+def run_on_checkpoint(ckpt_path, args_json_path, output_dir):
+    ckpt_args = json.load(open(args_json_path))['args']
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    modalities = ckpt_args['modalities'] if isinstance(ckpt_args['modalities'], list) \
+                 else ckpt_args['modalities'].split(',')
+    downsample = ckpt_args.get('downsample', 5)
+    # Get train stats
+    _, _, _, info = get_dataloaders(modalities,
+                                    batch_size=ckpt_args.get('batch_size', 16),
+                                    downsample=downsample)
+    # Need the actual stats object -- re-load train set to compute
+    tr_ds = MultimodalSceneDataset(
+        __import__('experiments.dataset', fromlist=['TRAIN_VOLS']).TRAIN_VOLS,
+        modalities, downsample=downsample)
+    stats = tr_ds.get_stats()
+    model = build_model(
+        ckpt_args.get('model', 'transformer'),
+        ckpt_args.get('fusion', 'late'),
+        info['feat_dim'], info['modality_dims'], NUM_CLASSES,
+        hidden_dim=ckpt_args.get('hidden_dim', 128),
+        proj_dim=ckpt_args.get('proj_dim', 0),
+        late_agg=ckpt_args.get('late_agg', 'mean'),
+    ).to(device)
+    try:
+        sd = torch.load(ckpt_path, weights_only=True, map_location=device)
+    except Exception:
+        sd = torch.load(ckpt_path, map_location=device)
+    model.load_state_dict(sd, strict=False)
+    breakdown = per_subject_eval(model, device, modalities, stats, downsample)
+    # Overall F1
+    all_preds, all_ys = [], []
+    for v, info_v in breakdown.items():
+        if info_v.get('n', 0) > 0:
+            all_preds.extend(info_v['preds'])
+            all_ys.extend(info_v['labels'])
+    overall_f1 = float(f1_score(all_ys, all_preds, average='macro', zero_division=0))
+    overall_acc = float(accuracy_score(all_ys, all_preds))
+    # Per-subject summary
+    summary = {
+        'ckpt': ckpt_path,
+        'modalities': modalities,
+        'overall': {'acc': overall_acc, 'f1': overall_f1,
+                    'n': len(all_preds)},
+        'per_subject': {
+            v: {'n': b.get('n'), 'acc': b.get('acc'), 'f1': b.get('f1')}
+            for v, b in breakdown.items()
+        },
+        'detail': breakdown,
+    }
+    os.makedirs(output_dir, exist_ok=True)
+    out_path = os.path.join(output_dir, os.path.basename(
+        os.path.dirname(ckpt_path)) + '_per_subject.json')
+    with open(out_path, 'w') as f:
+        json.dump(summary, f, indent=2)
+    print(f"Per-subject breakdown saved: {out_path}")
+    print(f"Overall F1: {overall_f1:.4f}  Acc: {overall_acc:.4f}")
+    for v, b in summary['per_subject'].items():
+        print(f"  {v}: n={b['n']} acc={b.get('acc'):.3f} f1={b.get('f1'):.3f}"
+              if b.get('n') else f"  {v}: (empty)")
+    return summary
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument('--exp_root', type=str, required=True,
+                   help='Directory containing run subdirs with model_best.pt and results.json')
+    p.add_argument('--output_dir', type=str, required=True)
+    args = p.parse_args()
+    runs = []
+    for sub in sorted(os.listdir(args.exp_root)):
+        if sub == 'slurm_logs':
+            continue
+        ckpt = os.path.join(args.exp_root, sub, 'model_best.pt')
+        res = os.path.join(args.exp_root, sub, 'results.json')
+        if os.path.exists(ckpt) and os.path.exists(res):
+            runs.append((ckpt, res))
+    print(f"Found {len(runs)} runs with checkpoints.")
+    for ckpt, res in runs:
+        try:
+            run_on_checkpoint(ckpt, res, args.output_dir)
+        except Exception as e:
+            print(f"  FAIL {ckpt}: {e}")
+if __name__ == '__main__':
+    main()

experiments/analysis/extract_video_features.py ADDED Viewed

	@@ -0,0 +1,208 @@

+#!/usr/bin/env python3
+"""
+Extract video features from Scene Camera videos using a pretrained backbone.
+Uses an ImageNet-pretrained ViT-B/16 from torchvision (not CLIP weights, despite the class name), which is lightweight and doesn't need video-specific pretraining.
+Output: per-frame feature vectors saved as .npy files, aligned to 100Hz sensor data.
+"""
+import os
+import sys
+import json
+import glob
+import argparse
+import numpy as np
+import cv2
+import torch
+import torch.nn as nn
+from torchvision import transforms
+DATASET_DIR = "${PULSE_ROOT}/dataset"
+class CLIPFeatureExtractor:
+    """Frame-level feature extractor built on torchvision's ViT-B/16.
+    Despite the class name, the weights loaded below are torchvision's
+    ImageNet-1k ViT-B/16 weights, not CLIP weights. The classification head
+    is replaced by an identity so the model outputs features instead of
+    class scores.
+    """
+    def __init__(self, device='cpu'):
+        """Load the pretrained model onto `device` and keep its preprocessing transform."""
+        self.device = device
+        # Use torchvision's pretrained ViT
+        from torchvision.models import vit_b_16, ViT_B_16_Weights
+        weights = ViT_B_16_Weights.IMAGENET1K_V1
+        model = vit_b_16(weights=weights)
+        # Remove classification head, keep feature extractor
+        model.heads = nn.Identity()
+        model.eval()
+        self.model = model.to(device)
+        # Preprocessing that ships with the chosen weights.
+        self.transform = weights.transforms()
+        self.feat_dim = 768  # ViT-B/16 feature dimension
+    @torch.no_grad()
+    def extract_batch(self, frames):
+        """Extract features from a batch of frames.
+        Args:
+            frames: list of numpy arrays (H, W, 3) in BGR format
+        Returns:
+            features: numpy array (N, feat_dim)
+        """
+        tensors = []
+        for frame in frames:
+            # BGR -> RGB -> PIL-like tensor
+            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            # HWC -> CHW float; dividing by 255 assumes 8-bit frames -- TODO confirm.
+            tensor = torch.from_numpy(rgb).permute(2, 0, 1).float() / 255.0
+            tensor = self.transform(tensor)
+            tensors.append(tensor)
+        # torch.stack needs at least one tensor, so `frames` must be non-empty.
+        batch = torch.stack(tensors).to(self.device)
+        features = self.model(batch)
+        return features.cpu().numpy()
+def find_scene_video(scenario_dir, vol, scenario):
+    """Find the Scene Camera video file."""
+    pattern = os.path.join(scenario_dir, f"trimmed_{vol}{scenario}*Scene Cam.mp4")
+    matches = glob.glob(pattern)
+    return matches[0] if matches else None
+def extract_features_for_video(extractor, video_path, target_fps=100,
+                               batch_size=32, sample_fps=2):
+    """Extract features from a video file.
+    Args:
+        extractor: feature extractor
+        video_path: path to video file
+        target_fps: target frame rate to align with sensor data (100Hz)
+        batch_size: batch size for feature extraction
+        sample_fps: extract features at this rate (e.g., 2 = every 0.5s)
+            Features are then interpolated to target_fps.
+    Returns:
+        features: numpy array (T_target, feat_dim) aligned to target_fps
+    """
+    cap = cv2.VideoCapture(video_path)
+    video_fps = cap.get(cv2.CAP_PROP_FPS)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    duration = total_frames / video_fps
+    # Sample frames at sample_fps
+    sample_interval = int(video_fps / sample_fps)
+    sample_indices = list(range(0, total_frames, sample_interval))
+    print(f"    Video: {total_frames} frames @ {video_fps:.1f}fps = {duration:.1f}s")
+    print(f"    Sampling {len(sample_indices)} frames @ {sample_fps}fps")
+    # Extract features in batches
+    all_features = []
+    batch_frames = []
+    batch_indices = []
+    for idx in sample_indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+        ret, frame = cap.read()
+        if not ret:
+            break
+        batch_frames.append(frame)
+        batch_indices.append(idx)
+        if len(batch_frames) >= batch_size:
+            feats = extractor.extract_batch(batch_frames)
+            all_features.append(feats)
+            batch_frames = []
+            if len(all_features) % 10 == 0:
+                print(f"      Processed {len(all_features) * batch_size} frames...")
+    if batch_frames:
+        feats = extractor.extract_batch(batch_frames)
+        all_features.append(feats)
+    cap.release()
+    if not all_features:
+        return None
+    features = np.concatenate(all_features, axis=0)  # (N_samples, feat_dim)
+    sample_times = np.array(batch_indices[:features.shape[0]]) / video_fps  # seconds
+    # Interpolate to target_fps (100Hz)
+    target_times = np.arange(0, duration, 1.0 / target_fps)
+    n_target = len(target_times)
+    # Linear interpolation per feature dimension
+    from scipy.interpolate import interp1d
+    if len(sample_times) < 2:
+        # Not enough samples, repeat
+        interpolated = np.tile(features[0], (n_target, 1))
+    else:
+        interp_func = interp1d(
+            sample_times, features, axis=0,
+            kind='linear', fill_value='extrapolate'
+        )
+        interpolated = interp_func(target_times).astype(np.float32)
+    print(f"    Output: {interpolated.shape} @ {target_fps}Hz")
+    return interpolated
+def main():
+    parser = argparse.ArgumentParser(description='Extract video features')
+    parser.add_argument('--sample_fps', type=int, default=2,
+                        help='Sample rate for feature extraction (default: 2fps)')
+    parser.add_argument('--batch_size', type=int, default=16,
+                        help='Batch size for feature extraction')
+    parser.add_argument('--device', type=str, default='cuda',
+                        help='Device (cuda or cpu)')
+    args = parser.parse_args()
+    device = args.device if torch.cuda.is_available() and args.device == 'cuda' else 'cpu'
+    print(f"Device: {device}")
+    print("Loading ViT-B/16 feature extractor...")
+    extractor = CLIPFeatureExtractor(device=device)
+    print(f"Feature dim: {extractor.feat_dim}")
+    # Process all volunteers and scenarios
+    processed = 0
+    skipped = 0
+    for vol_dir in sorted(glob.glob(f"{DATASET_DIR}/v*")):
+        vol = os.path.basename(vol_dir)
+        for scenario_dir in sorted(glob.glob(f"{vol_dir}/s*")):
+            scenario = os.path.basename(scenario_dir)
+            output_path = os.path.join(scenario_dir, "video_features_100hz.npy")
+            # Skip if already extracted
+            if os.path.exists(output_path):
+                print(f"[{vol}/{scenario}] Already exists, skipping")
+                skipped += 1
+                continue
+            # Find video
+            video_path = find_scene_video(scenario_dir, vol, scenario)
+            if video_path is None:
+                print(f"[{vol}/{scenario}] No Scene Camera video found, skipping")
+                skipped += 1
+                continue
+            print(f"\n[{vol}/{scenario}]")
+            print(f"  Video: {os.path.basename(video_path)}")
+            features = extract_features_for_video(
+                extractor, video_path,
+                batch_size=args.batch_size,
+                sample_fps=args.sample_fps,
+            )
+            if features is not None:
+                np.save(output_path, features)
+                print(f"  Saved: {output_path} ({features.shape})")
+                processed += 1
+            else:
+                print(f"  FAILED: Could not extract features")
+    print(f"\n{'='*60}")
+    print(f"Done! Processed: {processed}, Skipped: {skipped}")
+    print(f"Feature files: {DATASET_DIR}/*/*/video_features_100hz.npy")
+if __name__ == '__main__':
+    main()

experiments/analysis/extract_videomae_features.py ADDED Viewed

	@@ -0,0 +1,276 @@

+#!/usr/bin/env python3
+"""
+Extract video features using VideoMAE (pretrained on Kinetics-400).
+Process 16-frame video clips to capture temporal dynamics.
+Output: per-frame feature vectors aligned to 100Hz sensor data.
+"""
+import os
+import sys
+import json
+import glob
+import argparse
+import numpy as np
+import cv2
+import torch
+DATASET_DIR = "${PULSE_ROOT}/dataset"
+MODEL_NAME = "${PULSE_ROOT}/models/videomae-base-kinetics"
+class VideoMAEFeatureExtractor:
+    """Extract features using VideoMAE-Base (16-frame clips). Multi-GPU enabled."""
+    def __init__(self, device='cpu'):
+        from transformers import VideoMAEModel, VideoMAEImageProcessor
+        import torch.nn as nn
+        self.device = device
+        self.processor = VideoMAEImageProcessor.from_pretrained(MODEL_NAME)
+        model = VideoMAEModel.from_pretrained(MODEL_NAME).to(device)
+        model.eval()
+        # Wrap with DataParallel if multiple GPUs available
+        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
+            self.n_gpus = torch.cuda.device_count()
+            print(f"  Using DataParallel across {self.n_gpus} GPUs")
+            self.model = nn.DataParallel(model)
+            self.num_frames = model.config.num_frames
+            self.feat_dim = model.config.hidden_size
+        else:
+            self.n_gpus = 1
+            self.model = model
+            self.num_frames = model.config.num_frames
+            self.feat_dim = model.config.hidden_size
+    @torch.no_grad()
+    def extract_clip(self, frames):
+        """Extract feature from a single 16-frame clip.
+        Args:
+            frames: list of 16 RGB numpy arrays (H, W, 3)
+        Returns:
+            feature: numpy array (feat_dim,) - mean-pooled patch tokens
+        """
+        # Pad/truncate to exactly num_frames
+        if len(frames) < self.num_frames:
+            frames = frames + [frames[-1]] * (self.num_frames - len(frames))
+        elif len(frames) > self.num_frames:
+            # uniform sampling
+            indices = np.linspace(0, len(frames) - 1, self.num_frames, dtype=int)
+            frames = [frames[i] for i in indices]
+        inputs = self.processor(frames, return_tensors="pt")
+        pixel_values = inputs["pixel_values"].to(self.device)
+        outputs = self.model(pixel_values)
+        # Average pool over all patch tokens
+        feature = outputs.last_hidden_state.mean(dim=1).squeeze(0)  # (768,)
+        return feature.cpu().numpy()
+    @torch.no_grad()
+    def extract_clip_batch(self, clips):
+        """Extract features from a batch of clips.
+        Args:
+            clips: list of clips, each is a list of 16 RGB frames
+        Returns:
+            features: numpy array (B, feat_dim)
+        """
+        # Process each clip
+        all_pixel_values = []
+        for frames in clips:
+            if len(frames) < self.num_frames:
+                frames = frames + [frames[-1]] * (self.num_frames - len(frames))
+            elif len(frames) > self.num_frames:
+                indices = np.linspace(0, len(frames) - 1, self.num_frames, dtype=int)
+                frames = [frames[i] for i in indices]
+            inputs = self.processor(frames, return_tensors="pt")
+            all_pixel_values.append(inputs["pixel_values"])
+        batch = torch.cat(all_pixel_values, dim=0).to(self.device)
+        outputs = self.model(batch)
+        features = outputs.last_hidden_state.mean(dim=1)  # (B, 768)
+        return features.cpu().numpy()
+def find_scene_video(scenario_dir, vol, scenario):
+    pattern = os.path.join(scenario_dir, f"trimmed_{vol}{scenario}*Scene Cam.mp4")
+    matches = glob.glob(pattern)
+    return matches[0] if matches else None
+def extract_features_for_video(extractor, video_path, target_fps=100,
+                               clip_stride_sec=0.5, batch_size=4):
+    """Extract VideoMAE features from a video.
+    Strategy (fast):
+    - Sequentially decode video ONCE, downsample to 8fps and store frames in RAM
+    - Build clips by indexing into the in-memory frame array (no random seeks)
+    """
+    import time
+    t0 = time.time()
+    cap = cv2.VideoCapture(video_path)
+    video_fps = cap.get(cv2.CAP_PROP_FPS)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    duration = total_frames / video_fps
+    # Read all frames sequentially, downsample to ~16fps (every video_fps/16 frame)
+    decode_fps = 16  # we sample frames at this rate from the video
+    decode_stride = max(1, int(round(video_fps / decode_fps)))
+    print(f"    Video: {total_frames} frames @ {video_fps:.1f}fps = {duration:.1f}s")
+    print(f"    Decoding sequentially with stride {decode_stride} (~{video_fps/decode_stride:.1f}fps)...")
+    # Pre-resize to model input size during decoding to save memory
+    # VideoMAE expects 224x224
+    target_size = 224
+    decoded_frames = []  # list of (H, W, 3) uint8 RGB arrays
+    decoded_times = []   # corresponding timestamps in seconds
+    frame_idx = 0
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        if frame_idx % decode_stride == 0:
+            # Resize early to save memory
+            resized = cv2.resize(frame, (target_size, target_size), interpolation=cv2.INTER_AREA)
+            rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+            decoded_frames.append(rgb)
+            decoded_times.append(frame_idx / video_fps)
+        frame_idx += 1
+    cap.release()
+    decoded_frames = np.array(decoded_frames)  # (N, 224, 224, 3)
+    decoded_times = np.array(decoded_times)
+    decode_time = time.time() - t0
+    print(f"    Decoded {len(decoded_frames)} frames in {decode_time:.1f}s")
+    # Build clips: each clip = 16 frames spanning ~1 second
+    # Sample 16 consecutive frames from in-memory array
+    frames_per_clip = 16
+    n_decoded = len(decoded_frames)
+    if n_decoded < 4:
+        return None
+    # Each clip occupies 16 frames at ~16fps = 1 second
+    clip_centers_sec = np.arange(0.5, duration - 0.5, clip_stride_sec)
+    n_clips = len(clip_centers_sec)
+    print(f"    Building {n_clips} clips (stride={clip_stride_sec}s, {frames_per_clip} frames each)")
+    all_features = []
+    clip_times = []
+    batch_clips = []
+    batch_times = []
+    t1 = time.time()
+    for center_sec in clip_centers_sec:
+        # Find decoded frames within ±0.5s window
+        center_idx = np.searchsorted(decoded_times, center_sec)
+        half = frames_per_clip // 2
+        start = max(0, center_idx - half)
+        end = min(n_decoded, start + frames_per_clip)
+        start = max(0, end - frames_per_clip)
+        if end - start < 4:
+            continue
+        clip = list(decoded_frames[start:end])
+        # Pad if needed
+        if len(clip) < frames_per_clip:
+            clip = clip + [clip[-1]] * (frames_per_clip - len(clip))
+        batch_clips.append(clip)
+        batch_times.append(center_sec)
+        if len(batch_clips) >= batch_size:
+            feats = extractor.extract_clip_batch(batch_clips)
+            all_features.append(feats)
+            clip_times.extend(batch_times)
+            batch_clips = []
+            batch_times = []
+    if batch_clips:
+        feats = extractor.extract_clip_batch(batch_clips)
+        all_features.append(feats)
+        clip_times.extend(batch_times)
+    inference_time = time.time() - t1
+    print(f"    Inference time: {inference_time:.1f}s ({len(clip_times)} clips)")
+    if not all_features:
+        return None
+    features = np.concatenate(all_features, axis=0)  # (N_clips, 768)
+    clip_times = np.array(clip_times[:features.shape[0]])
+    # Interpolate to target_fps (100Hz)
+    target_times = np.arange(0, duration, 1.0 / target_fps)
+    n_target = len(target_times)
+    from scipy.interpolate import interp1d
+    if len(clip_times) < 2:
+        interpolated = np.tile(features[0], (n_target, 1))
+    else:
+        interp_func = interp1d(
+            clip_times, features, axis=0,
+            kind='linear', fill_value='extrapolate'
+        )
+        interpolated = interp_func(target_times).astype(np.float32)
+    print(f"    Output: {interpolated.shape} @ {target_fps}Hz")
+    return interpolated
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--clip_stride', type=float, default=0.5,
+                        help='Clip extraction stride in seconds (default: 0.5)')
+    parser.add_argument('--batch_size', type=int, default=4)
+    parser.add_argument('--device', type=str, default='cuda')
+    parser.add_argument('--output_name', type=str, default='video_features_videomae_100hz.npy')
+    args = parser.parse_args()
+    device = args.device if torch.cuda.is_available() and args.device == 'cuda' else 'cpu'
+    print(f"Device: {device}")
+    print(f"Loading VideoMAE from {MODEL_NAME}...")
+    extractor = VideoMAEFeatureExtractor(device=device)
+    print(f"Feature dim: {extractor.feat_dim}, num frames per clip: {extractor.num_frames}")
+    processed = 0
+    skipped = 0
+    for vol_dir in sorted(glob.glob(f"{DATASET_DIR}/v*")):
+        vol = os.path.basename(vol_dir)
+        for scenario_dir in sorted(glob.glob(f"{vol_dir}/s*")):
+            scenario = os.path.basename(scenario_dir)
+            output_path = os.path.join(scenario_dir, args.output_name)
+            if os.path.exists(output_path):
+                print(f"[{vol}/{scenario}] exists, skip")
+                skipped += 1
+                continue
+            video_path = find_scene_video(scenario_dir, vol, scenario)
+            if video_path is None:
+                print(f"[{vol}/{scenario}] no video, skip")
+                skipped += 1
+                continue
+            print(f"\n[{vol}/{scenario}]")
+            features = extract_features_for_video(
+                extractor, video_path,
+                clip_stride_sec=args.clip_stride,
+                batch_size=args.batch_size,
+            )
+            if features is not None:
+                np.save(output_path, features)
+                print(f"  Saved: {output_path} ({features.shape})")
+                processed += 1
+            else:
+                print(f"  FAILED")
+    print(f"\nDone! Processed: {processed}, Skipped: {skipped}")
+if __name__ == '__main__':
+    main()

experiments/analysis/gen_val_comparison.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import os, sys, json, torch
+sys.path.insert(0, '${PULSE_ROOT}')
+os.environ['HF_HUB_OFFLINE'] = '1'
+os.environ['TRANSFORMERS_OFFLINE'] = '1'
+from tasks.train_pred import (
+    TextPredictionDataset, SensorToTextModel, apply_lora, set_seed
+)
+from data.dataset import TRAIN_VOLS, VAL_VOLS, TEST_VOLS
+set_seed(42)
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Load tokenizer & LLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
+llm_path = '${PULSE_ROOT}/models/qwen2.5-0.5b'
+tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True, local_files_only=True)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+llm = AutoModelForCausalLM.from_pretrained(
+    llm_path, trust_remote_code=True, torch_dtype=torch.float32, local_files_only=True
+).to(device)
+llm.config.pad_token_id = tokenizer.pad_token_id
+for p in llm.parameters():
+    p.requires_grad = False
+lora_params = apply_lora(llm, r=8, alpha=16)
+modalities = ['mocap', 'emg', 'imu']
+# Build datasets
+train_ds = TextPredictionDataset(TRAIN_VOLS, modalities, tokenizer, window_sec=15.0, downsample=5)
+stats = train_ds.get_stats()
+val_ds = TextPredictionDataset(VAL_VOLS, modalities, tokenizer, window_sec=15.0, downsample=5, stats=stats)
+test_ds = TextPredictionDataset(TEST_VOLS, modalities, tokenizer, window_sec=15.0, downsample=5, stats=stats)
+# Build model & load weights
+model = SensorToTextModel(train_ds.feat_dim, llm, tokenizer, n_sensor_tokens=8, d_model=64)
+model.to(device)
+ckpt_path = '${PULSE_ROOT}/results/pred_llm2/pred_llm_mocap-emg-imu/model_best.pt'
+sd = torch.load(ckpt_path, weights_only=True, map_location=device)
+model.load_state_dict(sd, strict=False)
+model.eval()
+out_path = '${PULSE_ROOT}/docs/pred_llm2_val_comparison.txt'
+from torch.utils.data import DataLoader
+with open(out_path, 'w') as f:
+    for split_name, ds in [('Validation', val_ds), ('Test', test_ds)]:
+        loader = DataLoader(ds, batch_size=8, shuffle=False)
+        f.write(f"{'='*70}\n")
+        f.write(f"{split_name} Set — mocap,emg,imu (best charF1=0.0324)\n")
+        f.write(f"Samples: {len(ds)}\n")
+        f.write(f"{'='*70}\n\n")
+        idx = 0
+        for batch in loader:
+            sensor = batch['sensor'].to(device)
+            preds = model.generate_text(sensor, tokenizer, max_new_tokens=20)
+            refs = [ds.texts[idx + i] for i in range(len(preds))]
+            for p, r in zip(preds, refs):
+                match = "OK" if p.strip() == r.strip() else "XX"
+                f.write(f"[{match}] #{idx+1}\n")
+                f.write(f"  Pred: {p.strip()}\n")
+                f.write(f"  Ref:  {r.strip()}\n\n")
+                idx += 1
+        # Stats
+        f.write(f"\n--- {split_name} Summary ---\n")
+        f.write(f"Total: {idx}\n\n")
+print(f"Written to {out_path}")

experiments/analysis/generate_action_labels.py ADDED Viewed

	@@ -0,0 +1,133 @@

+#!/usr/bin/env python3
+"""
+Generate action labels by clustering task descriptions using text embeddings.
+No manual rules — uses sentence-transformers + K-Means clustering.
+"""
+import os
+import json
+import glob
+import argparse
+import numpy as np
+from collections import Counter
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+ANNOTATION_DIR = "${PULSE_ROOT}"
+def collect_tasks():
+    """Collect all task descriptions from all annotation files."""
+    tasks = []
+    for path in sorted(glob.glob(os.path.join(ANNOTATION_DIR, 'v*/s*.json'))):
+        with open(path) as f:
+            data = json.load(f)
+        for seg in data.get('segments', []):
+            tasks.append(seg['task'])
+    return tasks
+def embed_texts(texts):
+    """Encode texts using sentence-transformers (multilingual model)."""
+    try:
+        from sentence_transformers import SentenceTransformer
+        model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
+        embeddings = model.encode(texts, show_progress_bar=True, batch_size=128)
+        print(f"Encoded {len(texts)} texts with sentence-transformers, dim={embeddings.shape[1]}")
+        return embeddings
+    except Exception as e:
+        print(f"sentence-transformers failed ({e}), falling back to TF-IDF")
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=3000)
+        X = vec.fit_transform(texts).toarray()
+        print(f"Encoded {len(texts)} texts with TF-IDF char n-grams, dim={X.shape[1]}")
+        return X
+def cluster_tasks(tasks, k_range=(10, 30)):
+    unique_tasks = sorted(set(tasks))
+    print(f"Total segments: {len(tasks)}, Unique task texts: {len(unique_tasks)}")
+    X = embed_texts(unique_tasks)
+    # Find optimal K via silhouette score
+    best_k, best_score = k_range[0], -1
+    scores = {}
+    for k in range(k_range[0], k_range[1] + 1):
+        km = KMeans(n_clusters=k, random_state=42, n_init=10)
+        labels = km.fit_predict(X)
+        score = silhouette_score(X, labels, sample_size=min(2000, len(unique_tasks)))
+        scores[k] = score
+        if score > best_score:
+            best_score = score
+            best_k = k
+        print(f"  K={k}: silhouette={score:.4f}" + (" *" if k == best_k else ""))
+    print(f"\nBest K={best_k} (silhouette={best_score:.4f})")
+    # Final clustering
+    km = KMeans(n_clusters=best_k, random_state=42, n_init=10)
+    labels = km.fit_predict(X)
+    task_to_cluster = {task: int(labels[i]) for i, task in enumerate(unique_tasks)}
+    # Representative task per cluster (closest to centroid)
+    cluster_representatives = {}
+    cluster_members = {}
+    for cid in range(best_k):
+        member_idx = [i for i, l in enumerate(labels) if l == cid]
+        members = [unique_tasks[i] for i in member_idx]
+        cluster_members[cid] = members
+        centroid = km.cluster_centers_[cid]
+        dists = np.linalg.norm(X[member_idx] - centroid, axis=1)
+        closest = member_idx[np.argmin(dists)]
+        cluster_representatives[cid] = unique_tasks[closest]
+    return task_to_cluster, cluster_representatives, cluster_members, best_k, scores
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output_dir', type=str,
+                        default='${PULSE_ROOT}/results/pred')
+    parser.add_argument('--k_min', type=int, default=10)
+    parser.add_argument('--k_max', type=int, default=30)
+    args = parser.parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+    tasks = collect_tasks()
+    task_to_cluster, representatives, members, K, scores = cluster_tasks(
+        tasks, k_range=(args.k_min, args.k_max)
+    )
+    # Print summary
+    segment_counts = Counter(task_to_cluster[t] for t in tasks)
+    print(f"\n{'='*60}")
+    print(f"Clusters (K={K}):")
+    for cid in range(K):
+        rep = representatives[cid]
+        n_unique = len(members[cid])
+        n_segs = segment_counts.get(cid, 0)
+        examples = [m for m in members[cid] if m != rep][:3]
+        print(f"\n  [{cid:2d}] ({n_segs:4d} segs, {n_unique:3d} unique) \"{rep}\"")
+        for ex in examples:
+            print(f"        - {ex}")
+    # Save
+    output = {
+        'num_classes': K,
+        'task_to_cluster': task_to_cluster,
+        'cluster_representatives': {str(k): v for k, v in representatives.items()},
+        'cluster_sizes_unique': {str(k): len(v) for k, v in members.items()},
+        'cluster_sizes_segments': {str(k): v for k, v in segment_counts.items()},
+        'silhouette_scores': {str(k): v for k, v in scores.items()},
+    }
+    out_path = os.path.join(args.output_dir, 'action_labels.json')
+    with open(out_path, 'w') as f:
+        json.dump(output, f, indent=2, ensure_ascii=False)
+    print(f"\nSaved to {out_path}")
+if __name__ == '__main__':
+    main()

experiments/analysis/generate_coarse_annotations.py ADDED Viewed

	@@ -0,0 +1,296 @@

+#!/usr/bin/env python3
+"""
+Generate coarse-grained annotations by merging consecutive fine-grained segments
+into composite actions (8-15s duration) using LLM.
+Input:  annotations_v2/ (fine-grained, ~2-3s segments, 11 classes)
+Output: annotations_coarse/ (coarse-grained, ~8-15s segments, ~6 classes)
+Does NOT modify annotations_v2/.
+"""
+import os
+import json
+import re
+import time
+import glob
+import urllib.request
+from collections import Counter
+INPUT_DIR = "${PULSE_ROOT}/annotations_v2"
+OUTPUT_DIR = "${PULSE_ROOT}/annotations_coarse"
+API_URL = "https://api.chatanywhere.tech/v1/chat/completions"
+API_KEYS = [
+    "sk-MN5n1uEETyaky96fLJdHqZobXF1f7KmOrZHzwD3lt585asFQ",
+    "sk-YnYrtPdAXwlE12hRpi6dYqlE1RRVR3LDVBka6wKaefU4iQRY",
+    "sk-jOZtodDv6OxUOMu3NuJ8lzffjwBlshn9OHY5KSmqmPTtc9qs",
+    "sk-qAaKTKYIRF24btu1oQWgubWG4UdA92bILNtzOkHNEPAcCxdB",
+    "sk-MgCBBonblMrCFnSXd6fJZaBLTCfCJ5FjYZfSe2e46bgmyktk",
+    "sk-79e30kYRgduuf2fSU0Lsc814YjNkClXXzQqIbx0iLS40IOEH",
+    "sk-h9Tej4tW6AQC6fT0njfzrPKXEk6fBwpiSvvQd0aJAhw4UwLz",
+    "sk-k2QNHt5wAH26Fw8hZuPWuVXw8Psd1jX09qusiA6PdBj5Vzuu",
+    "sk-w7EkTblciNI44cwosHXi0PGZNUf1hnJmpzOQ85va9VPdAKbz",
+    "sk-Dexs5ZF7OjFCq7CZW45wJ8EKoGtIswv6rsLUMzUXXkWBDBBJ",
+]
+# Scene id -> short Chinese description inserted into the LLM prompt.
+# English glosses (comments only; the strings themselves are sent to the model):
+#   s1: office desk tidying and work preparation
+#   s2: packing and sending a parcel
+#   s3: organizing kitchen seasonings
+#   s4: clearing the table after a meal
+#   s5: setting the table before a meal
+#   s6: packing a suitcase for a business trip
+#   s7: brewing coffee / drinks
+#   s8: tidying the drying rack and storing clothes
+SCENE_DESCRIPTIONS = {
+    "s1": "办公桌面整理与工作准备",
+    "s2": "快递打包发送",
+    "s3": "厨房调料整理",
+    "s4": "清理餐后桌面",
+    "s5": "餐前桌面布置",
+    "s6": "商务旅行行李箱打包",
+    "s7": "冲泡咖啡/饮品",
+    "s8": "晾衣架整理与衣物收纳",
+}
+# Prompt fragment listing the six coarse action classes with Chinese explanations:
+# Manipulate, CleanOrganize, Transfer, Assemble, FoodPrep, Idle. The class names
+# here must match the set accepted when the LLM answer is validated.
+COARSE_CATEGORIES = """粗粒度动作类别（共6类）：
+1. Manipulate - 操作物体（抓取、调整、放置某个物体的完整过程，包含拿起→操作→放下的组合）
+2. CleanOrganize - 清洁/整理（擦桌子、理线、整理桌面、叠衣服等持续性整理活动）
+3. Transfer - 搬运/传递（将物体从一个位置搬到另一个位置的过程）
+4. Assemble - 组装/连接/包装（封箱、贴胶带、盖盖子、插电源、拧瓶盖等需要精细对准的操作）
+5. FoodPrep - 食物/饮品准备（倒水、倒调料、搅拌、冲泡等与食物饮品相关的操作）
+6. Idle - 空闲/过渡（无明确操作的间隔）
+"""
+# Index into API_KEYS of the key to use next; advanced by call_llm on errors.
+current_key_idx = 0
+# Number of LLM requests that returned a parsable response; reported by main().
+call_count = 0
+def call_llm(prompt, max_tokens=1500, retries=3):
+    global current_key_idx, call_count
+    for attempt in range(retries * len(API_KEYS)):
+        key = API_KEYS[current_key_idx]
+        try:
+            data = json.dumps({
+                "model": "gpt-4o-mini",
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": max_tokens,
+                "temperature": 0.1,
+            }).encode()
+            req = urllib.request.Request(
+                API_URL, data=data,
+                headers={"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
+            )
+            resp = urllib.request.urlopen(req, timeout=30)
+            result = json.loads(resp.read())
+            call_count += 1
+            return result["choices"][0]["message"]["content"]
+        except Exception as e:
+            err = str(e)
+            if any(k in err for k in ["429", "quota", "limit", "402", "403"]):
+                current_key_idx = (current_key_idx + 1) % len(API_KEYS)
+            else:
+                time.sleep(0.5)
+                current_key_idx = (current_key_idx + 1) % len(API_KEYS)
+    return None
+def parse_ts(ts_str):
+    """Parse 'MM:SS' to seconds."""
+    m = re.match(r'(\d+):(\d+)', ts_str.strip())
+    if m:
+        return int(m.group(1)) * 60 + int(m.group(2))
+    return 0
+def format_ts(sec):
+    """Format seconds to 'MM:SS'."""
+    return f"{sec//60:02d}:{sec%60:02d}"
+def merge_segments_with_llm(segments, scene_id):
+    """Use LLM to merge fine-grained segments into coarse composite actions."""
+    scene_desc = SCENE_DESCRIPTIONS.get(scene_id, "日常活动")
+    # Build segment list
+    seg_lines = []
+    for i, seg in enumerate(segments):
+        label = seg.get("action_label", "Idle")
+        seg_lines.append(f"{i+1}. [{seg['timestamp']}] {label}: {seg['task']}")
+    seg_text = "\n".join(seg_lines)
+    prompt = f"""你是一个动作标注专家。以下是一段"{scene_desc}"录制中的细粒度动作序列（每个2-3秒）。
+请将相关的连续动作合并为粗粒度复合动作，每个复合动作持续5-15秒。
+合并规则：
+- 围绕同一个物体的连续操作合并为一个（如"抓取杯子→调整→放下"合并为一个Manipulate）
+- 连续的整理/清洁动作合并
+- 合并后的时间范围 = 第一个子动作的开始时间 到 最后一个子动作的结束时间
+- 如果中间有短暂Idle（≤3秒），可以包含进去
+- 每个复合动作必须从6个类别中选一个
+{COARSE_CATEGORIES}
+细粒度动作序列：
+{seg_text}
+请严格按以下JSON格式返回，不要添加任何额外文字：
+[{{"timestamp": "MM:SS-MM:SS", "coarse_action": "类别名", "description": "简��描述这段复合动作", "fine_segments": [子动作编号列表]}}]"""
+    response = call_llm(prompt, max_tokens=2000)
+    if response is None:
+        return None
+    try:
+        match = re.search(r'\[.*\]', response, re.DOTALL)
+        if match:
+            results = json.loads(match.group())
+            valid = []
+            for r in results:
+                if all(k in r for k in ["timestamp", "coarse_action", "description"]):
+                    # Validate category
+                    if r["coarse_action"] in {"Manipulate", "CleanOrganize", "Transfer",
+                                               "Assemble", "FoodPrep", "Idle"}:
+                        valid.append(r)
+            return valid
+    except (json.JSONDecodeError, KeyError) as e:
+        print(f"  Parse error: {e}")
+    return None
+def process_file(input_path, vol, scenario):
+    """Process one annotation file."""
+    data = json.load(open(input_path))
+    segments = data["segments"]
+    if not segments:
+        return {"fine_segments": segments, "coarse_segments": []}, 0
+    print(f"  Merging {len(segments)} fine segments...")
+    coarse = merge_segments_with_llm(segments, scenario)
+    if coarse is None:
+        # Fallback: simple time-based merging without LLM
+        print(f"  LLM failed, using fallback merge")
+        coarse = fallback_merge(segments)
+    result = {
+        "fine_segments": segments,
+        "coarse_segments": coarse,
+    }
+    return result, len(coarse)
+def fallback_merge(segments):
+    """Simple rule-based merging as fallback."""
+    if not segments:
+        return []
+    coarse = []
+    group = [segments[0]]
+    for seg in segments[1:]:
+        # Parse timestamps
+        prev_ts = group[-1]["timestamp"]
+        curr_ts = seg["timestamp"]
+        m1 = re.match(r'(\d+:\d+)\s*-\s*(\d+:\d+)', prev_ts)
+        m2 = re.match(r'(\d+:\d+)\s*-\s*(\d+:\d+)', curr_ts)
+        if not m1 or not m2:
+            group.append(seg)
+            continue
+        prev_end = parse_ts(m1.group(2))
+        curr_start = parse_ts(m2.group(1))
+        gap = curr_start - prev_end
+        # Merge if gap ≤ 3s and group duration < 15s
+        group_start = parse_ts(re.match(r'(\d+:\d+)', group[0]["timestamp"]).group(1))
+        curr_end = parse_ts(m2.group(2))
+        group_duration = curr_end - group_start
+        if gap <= 3 and group_duration <= 15:
+            group.append(seg)
+        else:
+            # Emit current group
+            coarse.append(_emit_group(group))
+            group = [seg]
+    if group:
+        coarse.append(_emit_group(group))
+    return coarse
+def _emit_group(group):
+    """Create a coarse segment from a group of fine segments."""
+    m_start = re.match(r'(\d+:\d+)', group[0]["timestamp"])
+    m_end = re.match(r'\d+:\d+\s*-\s*(\d+:\d+)', group[-1]["timestamp"])
+    start = m_start.group(1) if m_start else "00:00"
+    end = m_end.group(1) if m_end else "00:00"
+    labels = [seg.get("action_label", "Idle") for seg in group]
+    label_counts = Counter(labels)
+    dominant = label_counts.most_common(1)[0][0]
+    # Map fine label to coarse
+    label_map = {
+        "Grasp": "Manipulate", "Place": "Manipulate", "Arrange": "CleanOrganize",
+        "Wipe": "CleanOrganize", "Fold": "CleanOrganize", "Transport": "Transfer",
+        "OpenClose": "Assemble", "TearCut": "Assemble",
+        "Pour": "FoodPrep", "Stir": "FoodPrep", "Idle": "Idle",
+    }
+    coarse_label = label_map.get(dominant, "Manipulate")
+    tasks = [seg["task"] for seg in group]
+    desc = tasks[0] if len(tasks) == 1 else f"{tasks[0]}...{tasks[-1]}"
+    return {
+        "timestamp": f"{start}-{end}",
+        "coarse_action": coarse_label,
+        "description": desc[:80],
+        "fine_segments": list(range(1, len(group) + 1)),
+    }
+def main():
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    total_fine = 0
+    total_coarse = 0
+    total_files = 0
+    coarse_labels = Counter()
+    for vol_dir in sorted(glob.glob(f"{INPUT_DIR}/v*")):
+        vol = os.path.basename(vol_dir)
+        out_dir = os.path.join(OUTPUT_DIR, vol)
+        os.makedirs(out_dir, exist_ok=True)
+        for ann_file in sorted(glob.glob(f"{vol_dir}/s*.json")):
+            scenario = os.path.basename(ann_file).replace(".json", "")
+            print(f"[{vol}/{scenario}]", flush=True)
+            result, n_coarse = process_file(ann_file, vol, scenario)
+            out_path = os.path.join(out_dir, f"{scenario}.json")
+            with open(out_path, "w", encoding="utf-8") as f:
+                json.dump(result, f, ensure_ascii=False, indent=2)
+            n_fine = len(result["fine_segments"])
+            total_fine += n_fine
+            total_coarse += n_coarse
+            total_files += 1
+            for seg in result["coarse_segments"]:
+                coarse_labels[seg["coarse_action"]] += 1
+            print(f"  {n_fine} fine → {n_coarse} coarse segments", flush=True)
+    print(f"\n{'='*60}")
+    print(f"Total: {total_files} files")
+    print(f"  Fine segments:   {total_fine}")
+    print(f"  Coarse segments: {total_coarse}")
+    print(f"  Compression:     {total_fine/max(total_coarse,1):.1f}x")
+    print(f"  API calls:       {call_count}")
+    print(f"\n  Coarse label distribution:")
+    for label, count in coarse_labels.most_common():
+        print(f"    {label:<20} {count:>5} ({count/max(total_coarse,1)*100:.1f}%)")
+    print(f"\n  Output: {OUTPUT_DIR}")
+if __name__ == "__main__":
+    main()

experiments/analysis/grasp_phase_analysis.py ADDED Viewed

	@@ -0,0 +1,442 @@

+#!/usr/bin/env python3
+"""
+Grasp Phase Timing Analysis — Flagship visualization for the paper.
+Classic neuroscience finding:
+  Eye gaze → EMG activation → Hand motion → Pressure contact
+This script:
+1. Detects grasp events (pressure onset: 0 → >5g)
+2. Looks back in time to find:
+   - EMG envelope activation onset
+   - Hand velocity peak (from MoCap)
+   - Eye gaze fixation (if available)
+3. Computes statistics over all grasp events
+4. Produces the canonical "grasp phase" timing figure
+"""
+import os
+import glob
+import json
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from scipy import signal as scisig
+from collections import defaultdict
+DATASET_DIR = "${PULSE_ROOT}/dataset"
+OUTPUT_DIR = "${PULSE_ROOT}/results/grasp_phase"
+SAMPLING_RATE = 100  # Hz
+PRESSURE_THRESHOLD = 5.0  # grams
+CONTEXT_WINDOW_SEC = 2.0  # look back 2s before contact
+CONTEXT_FRAMES = int(CONTEXT_WINDOW_SEC * SAMPLING_RATE)
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+def load_pressure(scenario_dir):
+    """Load pressure data and return (T, 2) array: [right_total, left_total]."""
+    f = os.path.join(scenario_dir, "aligned_pressure_100hz.csv")
+    if not os.path.exists(f):
+        return None
+    df = pd.read_csv(f, low_memory=False)
+    r_cols = [c for c in df.columns if c.startswith('R') and c.endswith('(g)')]
+    l_cols = [c for c in df.columns if c.startswith('L') and c.endswith('(g)')]
+    if not r_cols or not l_cols:
+        return None
+    r = df[r_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values.sum(axis=1)
+    l = df[l_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values.sum(axis=1)
+    return np.stack([r, l], axis=1)  # (T, 2)
+def load_emg(scenario_dir):
+    """Load EMG data: (T, 8) array."""
+    f = os.path.join(scenario_dir, "aligned_emg_100hz.csv")
+    if not os.path.exists(f):
+        return None
+    df = pd.read_csv(f, low_memory=False)
+    # Find EMG channel columns (e.g., EMG1...EMG8 or channels)
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    numeric_cols = [c for c in numeric_cols if c not in ('Frame', 'Time', 'time', 'UTC')]
+    if len(numeric_cols) < 4:
+        return None
+    arr = df[numeric_cols].values.astype(np.float32)
+    arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
+    return arr
+def load_mocap(scenario_dir, vol, scenario):
+    """Load MoCap hand position, return (T, 3) right hand velocity magnitude, (T, 3) left hand."""
+    f = os.path.join(scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv")
+    if not os.path.exists(f):
+        return None, None
+    df = pd.read_csv(f, sep='\t', low_memory=False)
+    # Find right/left hand position columns
+    # Try common naming patterns
+    r_cols = [c for c in df.columns if 'RightHand' in c and (c.endswith('_X') or c.endswith('_Y') or c.endswith('_Z'))]
+    l_cols = [c for c in df.columns if 'LeftHand' in c and (c.endswith('_X') or c.endswith('_Y') or c.endswith('_Z'))]
+    if not r_cols or not l_cols:
+        # Try alternative naming
+        r_cols = [c for c in df.columns if 'R_Hand' in c or 'RHand' in c][:3]
+        l_cols = [c for c in df.columns if 'L_Hand' in c or 'LHand' in c][:3]
+    if not r_cols or not l_cols:
+        return None, None
+    r_pos = df[r_cols[:3]].apply(pd.to_numeric, errors='coerce').fillna(0).values
+    l_pos = df[l_cols[:3]].apply(pd.to_numeric, errors='coerce').fillna(0).values
+    return r_pos, l_pos
+def compute_emg_envelope(emg, window_size=20):
+    """Rectify and low-pass filter EMG to get envelope."""
+    # Rectify
+    rectified = np.abs(emg - np.mean(emg, axis=0))
+    # Moving average
+    kernel = np.ones(window_size) / window_size
+    envelope = np.zeros_like(rectified)
+    for ch in range(rectified.shape[1]):
+        envelope[:, ch] = np.convolve(rectified[:, ch], kernel, mode='same')
+    # Sum across channels and normalize
+    total = envelope.sum(axis=1)
+    if total.max() > total.min():
+        total = (total - total.min()) / (total.max() - total.min() + 1e-8)
+    return total  # (T,)
+def compute_velocity(position, window=3):
+    """Compute velocity magnitude from 3D position."""
+    vel = np.zeros_like(position)
+    vel[1:] = position[1:] - position[:-1]
+    vel_mag = np.linalg.norm(vel, axis=1)
+    # Smooth
+    kernel = np.ones(window) / window
+    vel_mag = np.convolve(vel_mag, kernel, mode='same')
+    return vel_mag  # (T,)
+def detect_grasp_events(pressure_1d, threshold=5.0, min_duration=10, min_gap=50):
+    """Detect pressure onset events (0 → >threshold).
+    Returns list of onset frame indices.
+    """
+    above = pressure_1d > threshold
+    # Hysteresis smoothing: require persistence
+    onsets = []
+    last_state = False
+    stable_counter = 0
+    for i, a in enumerate(above):
+        if a and not last_state:
+            # Candidate onset, check persistence
+            if i + min_duration < len(above) and np.mean(above[i:i+min_duration]) > 0.7:
+                if not onsets or i - onsets[-1] > min_gap:
+                    onsets.append(i)
+                last_state = True
+        elif not a and last_state:
+            # Check if really released
+            if i + 5 < len(above) and np.mean(above[i:i+5]) < 0.3:
+                last_state = False
+    return onsets
+def find_signal_onset(signal, ref_idx, window_frames, threshold_ratio=0.3):
+    """Find the LATEST pre-contact onset of signal activation.
+    Strategy: walk backward from ref_idx. Look for the last sample that's
+    still 'active' (> baseline + threshold_ratio * (peak-baseline)).
+    The first 'inactive' sample going backward marks the onset.
+    Returns: frame index of onset relative to ref_idx (negative = before).
+    """
+    start = max(0, ref_idx - window_frames)
+    segment = signal[start:ref_idx + 1]  # pre-contact window
+    if len(segment) < 10:
+        return None
+    # Baseline: lower quartile of the pre-contact window (robust to activation)
+    # Only use the earliest 30% as baseline estimate
+    early_part = segment[:max(10, int(len(segment) * 0.3))]
+    baseline = np.percentile(early_part, 25)
+    # Peak of the pre-contact activation
+    peak = np.max(segment)
+    if peak - baseline < 1e-4:
+        return None
+    threshold = baseline + (peak - baseline) * threshold_ratio
+    # Walk BACKWARD from ref_idx: find the last consecutive 'active' region
+    # ending at ref_idx, then the onset is where that region starts
+    above = segment > threshold
+    if not above[-1]:
+        # Not active at contact - use threshold crossing pattern
+        # Find the rising edge closest to ref_idx
+        rising = np.where(np.diff(above.astype(int)) == 1)[0]
+        if len(rising) == 0:
+            return None
+        onset_local = rising[-1] + 1  # first active frame
+    else:
+        # Active at contact - walk back to find onset
+        onset_local = len(segment) - 1
+        while onset_local > 0 and above[onset_local - 1]:
+            onset_local -= 1
+    onset_global = start + onset_local
+    return onset_global - ref_idx  # negative = before contact
+def is_clean_grasp(emg_env, velocity, pressure_trace, onset, look_back=150, rest_window=50):
+    """Check if this is a CLEAN grasp starting from rest.
+    Requires: EMG and velocity are both low in the REST window (onset-150 ~ onset-100).
+    """
+    rest_start = onset - look_back
+    rest_end = onset - (look_back - rest_window)
+    if rest_start < 0:
+        return False
+    # Quiescent rest period: EMG and velocity both low
+    emg_rest = emg_env[rest_start:rest_end].mean()
+    vel_rest = velocity[rest_start:rest_end].mean()
+    # Compare to the entire pre-contact activation
+    emg_pre = emg_env[rest_end:onset]
+    vel_pre = velocity[rest_end:onset]
+    if len(emg_pre) < 10:
+        return False
+    # The rest period should be significantly lower than the activation period
+    emg_active = np.percentile(emg_pre, 75)
+    vel_active = np.percentile(vel_pre, 75)
+    emg_increase = emg_active - emg_rest
+    vel_increase = vel_active - vel_rest
+    # Require meaningful increase from rest to activation
+    emg_dyn = emg_env.max() - emg_env.min()
+    vel_dyn = velocity.max() - velocity.min()
+    if emg_dyn < 1e-6 or vel_dyn < 1e-6:
+        return False
+    return (emg_increase / emg_dyn > 0.1) and (vel_increase / vel_dyn > 0.1)
+def analyze_one_scenario(vol, scenario):
+    """Analyze clean grasp events starting from rest."""
+    scenario_dir = os.path.join(DATASET_DIR, vol, scenario)
+    pressure = load_pressure(scenario_dir)
+    emg = load_emg(scenario_dir)
+    mocap_r, mocap_l = load_mocap(scenario_dir, vol, scenario)
+    if pressure is None or emg is None or mocap_r is None:
+        return None
+    min_len = min(pressure.shape[0], emg.shape[0], mocap_r.shape[0])
+    pressure = pressure[:min_len]
+    emg = emg[:min_len]
+    mocap_r = mocap_r[:min_len]
+    mocap_l = mocap_l[:min_len]
+    emg_env = compute_emg_envelope(emg)
+    vel_r = compute_velocity(mocap_r)
+    vel_l = compute_velocity(mocap_l)
+    events = []
+    for hand_name, hand_pressure, hand_vel in [
+        ('right', pressure[:, 0], vel_r),
+        ('left', pressure[:, 1], vel_l),
+    ]:
+        onsets = detect_grasp_events(hand_pressure, threshold=PRESSURE_THRESHOLD)
+        for onset in onsets:
+            if onset < CONTEXT_FRAMES:
+                continue
+            # Filter: only clean grasps starting from rest
+            if not is_clean_grasp(emg_env, hand_vel, hand_pressure, onset):
+                continue
+            # Find EMG onset: look for sustained activation rising from rest
+            emg_delay = find_signal_onset(emg_env, onset, CONTEXT_FRAMES, threshold_ratio=0.3)
+            motion_delay = find_signal_onset(hand_vel, onset, CONTEXT_FRAMES, threshold_ratio=0.3)
+            if emg_delay is None or motion_delay is None:
+                continue
+            # Sanity check: delays should be within [-1500, 0] ms
+            if emg_delay * 10 < -1500 or emg_delay * 10 > 0:
+                continue
+            if motion_delay * 10 < -1500 or motion_delay * 10 > 0:
+                continue
+            start = onset - CONTEXT_FRAMES
+            end = onset + 50
+            events.append({
+                'pressure': hand_pressure[start:end],
+                'emg': emg_env[start:end],
+                'velocity': hand_vel[start:end],
+                'hand': hand_name,
+                'onset_idx': onset,
+                'emg_delay_ms': emg_delay * 10,
+                'motion_delay_ms': motion_delay * 10,
+            })
+    return events
+def main():
+    all_events = []
+    stats = defaultdict(int)
+    for vol_dir in sorted(glob.glob(f"{DATASET_DIR}/v*")):
+        vol = os.path.basename(vol_dir)
+        for scenario_dir in sorted(glob.glob(f"{vol_dir}/s*")):
+            scenario = os.path.basename(scenario_dir)
+            meta_path = os.path.join(scenario_dir, 'alignment_metadata.json')
+            if not os.path.exists(meta_path):
+                continue
+            meta = json.load(open(meta_path))
+            # Need all 3 modalities
+            if not {'pressure', 'emg', 'mocap'}.issubset(set(meta['modalities'])):
+                stats['no_modality'] += 1
+                continue
+            events = analyze_one_scenario(vol, scenario)
+            if events is None:
+                stats['load_error'] += 1
+                continue
+            all_events.extend(events)
+            stats['scenarios'] += 1
+            stats['events'] += len(events)
+            print(f"[{vol}/{scenario}] {len(events)} grasp events", flush=True)
+    print(f"\n=== Summary ===")
+    print(f"Scenarios processed: {stats['scenarios']}")
+    print(f"Total grasp events:  {stats['events']}")
+    print(f"Loading errors:      {stats['load_error']}")
+    print(f"Missing modality:    {stats['no_modality']}")
+    if not all_events:
+        print("No events found!")
+        return
+    # Extract delays
+    emg_delays = np.array([e['emg_delay_ms'] for e in all_events])
+    motion_delays = np.array([e['motion_delay_ms'] for e in all_events])
+    print(f"\n=== Timing Statistics (ms, negative = before contact) ===")
+    print(f"EMG onset delay:    mean={emg_delays.mean():.1f}  median={np.median(emg_delays):.1f}  std={emg_delays.std():.1f}")
+    print(f"Motion peak delay:  mean={motion_delays.mean():.1f}  median={np.median(motion_delays):.1f}  std={motion_delays.std():.1f}")
+    # Save statistics
+    stats_dict = {
+        'n_events': len(all_events),
+        'emg_delay_ms': {'mean': float(emg_delays.mean()), 'median': float(np.median(emg_delays)),
+                          'std': float(emg_delays.std()), 'p25': float(np.percentile(emg_delays, 25)),
+                          'p75': float(np.percentile(emg_delays, 75))},
+        'motion_delay_ms': {'mean': float(motion_delays.mean()), 'median': float(np.median(motion_delays)),
+                            'std': float(motion_delays.std()), 'p25': float(np.percentile(motion_delays, 25)),
+                            'p75': float(np.percentile(motion_delays, 75))},
+    }
+    with open(os.path.join(OUTPUT_DIR, 'timing_stats.json'), 'w') as f:
+        json.dump(stats_dict, f, indent=2)
+    # ============ Figure 1: Aligned signal traces (averaged) ============
+    # Filter to events that have sufficient context
+    valid = [e for e in all_events if len(e['pressure']) == CONTEXT_FRAMES + 50]
+    print(f"\nEvents with full context: {len(valid)} / {len(all_events)}")
+    if len(valid) < 10:
+        print("Not enough events for plotting")
+        return
+    # Normalize signals (per-event max)
+    def normalize(sigs):
+        sigs = np.stack(sigs)
+        # Normalize each to [0, 1]
+        sigs = sigs - sigs.min(axis=1, keepdims=True)
+        maxs = sigs.max(axis=1, keepdims=True)
+        sigs = sigs / (maxs + 1e-8)
+        return sigs
+    pressure_stack = normalize([e['pressure'] for e in valid])
+    emg_stack = normalize([e['emg'] for e in valid])
+    vel_stack = normalize([e['velocity'] for e in valid])
+    time_axis = np.arange(-CONTEXT_FRAMES, 50) * 10  # ms
+    fig, ax = plt.subplots(figsize=(9, 5))
+    # Plot mean ± std
+    for sigs, color, label in [
+        (emg_stack, '#E74C3C', 'EMG envelope'),
+        (vel_stack, '#3498DB', 'Hand velocity'),
+        (pressure_stack, '#27AE60', 'Pressure (contact)'),
+    ]:
+        mean = sigs.mean(axis=0)
+        std = sigs.std(axis=0)
+        ax.plot(time_axis, mean, color=color, linewidth=2.5, label=label)
+        ax.fill_between(time_axis, mean - std * 0.5, mean + std * 0.5, color=color, alpha=0.15)
+    ax.axvline(0, color='black', linestyle='--', linewidth=1.2, alpha=0.7, label='Contact onset')
+    ax.axvline(emg_delays.mean(), color='#E74C3C', linestyle=':', alpha=0.8)
+    ax.axvline(motion_delays.mean(), color='#3498DB', linestyle=':', alpha=0.8)
+    # Annotations
+    ax.annotate(f'EMG\n{emg_delays.mean():.0f}ms',
+                xy=(emg_delays.mean(), 0.85), ha='center', fontsize=10, color='#C0392B',
+                bbox=dict(boxstyle="round,pad=0.3", fc='#FADBD8', ec='#E74C3C', alpha=0.9))
+    ax.annotate(f'Motion\n{motion_delays.mean():.0f}ms',
+                xy=(motion_delays.mean(), 0.65), ha='center', fontsize=10, color='#1F618D',
+                bbox=dict(boxstyle="round,pad=0.3", fc='#D6EAF8', ec='#3498DB', alpha=0.9))
+    ax.set_xlabel('Time relative to contact onset (ms)', fontsize=12)
+    ax.set_ylabel('Normalized amplitude', fontsize=12)
+    ax.set_title(f'Grasp Phase Timing ({len(valid)} events, {stats["scenarios"]} recordings)',
+                 fontsize=13, fontweight='bold')
+    ax.set_xlim(-CONTEXT_WINDOW_SEC * 1000, 500)
+    ax.legend(loc='upper left', frameon=True, fontsize=10)
+    ax.grid(True, alpha=0.3)
+    ax.set_ylim(-0.05, 1.1)
+    plt.tight_layout()
+    fig_path = os.path.join(OUTPUT_DIR, 'grasp_phase_timing.png')
+    plt.savefig(fig_path, dpi=150, bbox_inches='tight')
+    plt.savefig(fig_path.replace('.png', '.pdf'), bbox_inches='tight')
+    print(f"Saved figure: {fig_path}")
+    # ============ Figure 2: Delay distributions ============
+    fig, axes = plt.subplots(1, 2, figsize=(11, 4))
+    axes[0].hist(emg_delays, bins=30, color='#E74C3C', alpha=0.7, edgecolor='black')
+    axes[0].axvline(emg_delays.mean(), color='black', linestyle='--', linewidth=2, label=f'Mean: {emg_delays.mean():.0f}ms')
+    axes[0].axvline(np.median(emg_delays), color='grey', linestyle=':', linewidth=2, label=f'Median: {np.median(emg_delays):.0f}ms')
+    axes[0].set_xlabel('EMG onset - Contact onset (ms)', fontsize=11)
+    axes[0].set_ylabel('Count', fontsize=11)
+    axes[0].set_title('EMG → Contact Delay', fontsize=12, fontweight='bold')
+    axes[0].legend(fontsize=10)
+    axes[0].grid(True, alpha=0.3)
+    axes[1].hist(motion_delays, bins=30, color='#3498DB', alpha=0.7, edgecolor='black')
+    axes[1].axvline(motion_delays.mean(), color='black', linestyle='--', linewidth=2, label=f'Mean: {motion_delays.mean():.0f}ms')
+    axes[1].axvline(np.median(motion_delays), color='grey', linestyle=':', linewidth=2, label=f'Median: {np.median(motion_delays):.0f}ms')
+    axes[1].set_xlabel('Motion onset - Contact onset (ms)', fontsize=11)
+    axes[1].set_ylabel('Count', fontsize=11)
+    axes[1].set_title('Hand Motion → Contact Delay', fontsize=12, fontweight='bold')
+    axes[1].legend(fontsize=10)
+    axes[1].grid(True, alpha=0.3)
+    plt.tight_layout()
+    fig2_path = os.path.join(OUTPUT_DIR, 'delay_distributions.png')
+    plt.savefig(fig2_path, dpi=150, bbox_inches='tight')
+    plt.savefig(fig2_path.replace('.png', '.pdf'), bbox_inches='tight')
+    print(f"Saved figure: {fig2_path}")
+    print(f"\nAll outputs saved to: {OUTPUT_DIR}")
+if __name__ == '__main__':
+    main()

experiments/analysis/modality_viz.py ADDED Viewed

	@@ -0,0 +1,145 @@

+"""Visualize mocap skeleton frames, IMU waveforms, EMG waveforms."""
+import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D  # noqa
+# Recording directory the figures are generated from, and the figure output directory.
+# NOTE(review): these are plain string literals, so "${PULSE_ROOT}" is not
+# expanded by Python here — confirm how the placeholder is meant to be resolved.
+REC = "${PULSE_ROOT}/dataset/v1/s1"
+OUT = "${PULSE_ROOT}/paper/figures"
+# Created at import time so the savefig calls below have a target directory.
+os.makedirs(OUT, exist_ok=True)
+# ---- Skeleton bone definition (marker pairs) ----
+# Each tuple names two markers; plot_skeletons draws a line between them when
+# both markers exist in the loaded data and neither position is NaN.
+BONES = [
+    # torso
+    ("HeadTop","HeadFront"),("HeadL","HeadR"),("HeadFront","SpineTop"),
+    ("SpineTop","Chest"),("Chest","WaistLFront"),("Chest","WaistRFront"),
+    ("WaistLFront","WaistLBack"),("WaistRFront","WaistRBack"),
+    ("WaistLBack","BackL"),("WaistRBack","BackR"),("BackL","BackR"),
+    ("SpineTop","LShoulderTop"),("SpineTop","RShoulderTop"),
+    ("LShoulderTop","LShoulderBack"),("RShoulderTop","RShoulderBack"),
+    # left arm
+    ("LShoulderTop","LArm"),("LArm","LElbowOut"),("LElbowOut","LElbowBack"),
+    ("LElbowOut","LForearmRoll"),("LForearmRoll","LWristOut"),
+    ("LWristOut","LWristIn"),("LWristOut","LHandOut"),("LWristIn","LHandIn"),
+    ("LHandOut","LIndex2"),("LIndex2","LIndexTip"),
+    ("LHandOut","LMiddle2"),("LMiddle2","LMiddleTip"),
+    ("LHandIn","LRing2"),("LRing2","LRingTip"),
+    ("LHandIn","LPinky2"),("LPinky2","LPinkyTip"),
+    ("LWristIn","LThumb1"),("LThumb1","LThumbTip"),
+    # right arm
+    ("RShoulderTop","RArm"),("RArm","RElbowOut"),("RElbowOut","RElbowBack"),
+    ("RElbowOut","RForearmRoll"),("RForearmRoll","RWristOut"),
+    ("RWristOut","RWristIn"),("RWristOut","RHandOut"),("RWristIn","RHandIn"),
+    ("RHandOut","RIndex2"),("RIndex2","RIndexTip"),
+    ("RHandOut","RMiddle2"),("RMiddle2","RMiddleTip"),
+    ("RHandIn","RRing2"),("RRing2","RRingTip"),
+    ("RHandIn","RPinky2"),("RPinky2","RPinkyTip"),
+    ("RWristIn","RThumb1"),("RThumb1","RThumbTip"),
+]
+def load_mocap(path):
+    df = pd.read_csv(path)
+    # Extract x,y,z for each marker ignoring Type cols
+    markers = {}
+    for col in df.columns:
+        if col.startswith("Q_") and col.endswith(" X"):
+            name = col[2:-2]
+            xs = df[f"Q_{name} X"].to_numpy()
+            ys = df[f"Q_{name} Y"].to_numpy()
+            zs = df[f"Q_{name} Z"].to_numpy()
+            markers[name] = np.stack([xs, ys, zs], axis=-1)
+    return df["Time"].to_numpy(), markers
+def plot_skeletons():
+    """Draw four mocap frames as 3D stick figures and save them as PDF and PNG.
+
+    Reads ``aligned_mocap_100hz.csv`` from ``REC`` and writes
+    ``mocap_skeleton.pdf`` / ``.png`` into ``OUT``.
+    """
+    t, mk = load_mocap(os.path.join(REC, "aligned_mocap_100hz.csv"))
+    N = len(t)
+    # pick 4 frame indices evenly spaced between 10% and 90% of the recording
+    candidate = np.linspace(int(0.1*N), int(0.9*N), 4).astype(int)
+    fig = plt.figure(figsize=(12, 3.2))
+    for i, fr in enumerate(candidate):
+        ax = fig.add_subplot(1, 4, i+1, projection='3d')
+        # gather all points at this frame, dropping markers with any NaN coordinate
+        pts = np.array([mk[n][fr] for n in mk])
+        pts = pts[~np.isnan(pts).any(axis=1)]
+        if len(pts) == 0:
+            # no valid marker in this frame: the subplot is left empty
+            continue
+        # draw bones
+        for a, b in BONES:
+            if a in mk and b in mk:
+                pa, pb = mk[a][fr], mk[b][fr]
+                if np.isnan(pa).any() or np.isnan(pb).any():
+                    continue
+                ax.plot([pa[0], pb[0]], [pa[1], pb[1]], [pa[2], pb[2]],
+                        color='#2266aa', lw=1.2)
+        ax.scatter(pts[:, 0], pts[:, 1], pts[:, 2], s=4, c='#cc3333', alpha=0.8)
+        # equal aspect: cube centred on the marker centroid, half-width = half the largest extent
+        c = pts.mean(0)
+        r = np.ptp(pts, axis=0).max() / 2
+        ax.set_xlim(c[0]-r, c[0]+r); ax.set_ylim(c[1]-r, c[1]+r); ax.set_zlim(c[2]-r, c[2]+r)
+        ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([])
+        ax.set_title(f"t={t[fr]:.1f}s", fontsize=9)
+        ax.view_init(elev=12, azim=-75)
+    fig.suptitle("MoCap skeleton frames (56-marker Qualisys, v1/s1)", fontsize=11)
+    fig.tight_layout()
+    out = os.path.join(OUT, "mocap_skeleton.pdf")
+    # save the vector version first, then a 150-dpi raster with the same stem
+    fig.savefig(out, bbox_inches='tight'); fig.savefig(out.replace('.pdf', '.png'), dpi=150, bbox_inches='tight')
+    plt.close(fig)
+    print("Saved", out)
+def plot_imu():
+    df = pd.read_csv(os.path.join(REC, "aligned_imu_100hz.csv"))
+    t = df["time"].to_numpy(); t = t - t[0]
+    # pick 5 body locations (WT0..WT9 order roughly: wrists, forearms, upper arms, shins, thighs, torso)
+    sites = [("WT0", "Wrist R"), ("WT2", "Forearm R"),
+             ("WT4", "Upper arm R"), ("WT6", "Shin R"), ("WT9", "Torso")]
+    fig, axes = plt.subplots(len(sites), 1, figsize=(9, 6), sharex=True)
+    # crop to 20s window mid-recording
+    mid = len(t)//2
+    sl = slice(max(0, mid-1000), min(len(t), mid+1000))
+    for ax, (sid, lbl) in zip(axes, sites):
+        for comp, col in zip(["x", "y", "z"], ["#d62728", "#2ca02c", "#1f77b4"]):
+            ax.plot(t[sl], df[f"{sid}_acc_{comp}"].to_numpy()[sl], color=col, lw=0.8, label=f"acc_{comp}")
+        ax.set_ylabel(lbl, fontsize=9)
+        ax.grid(alpha=0.3)
+    axes[0].legend(loc="upper right", ncol=3, fontsize=8)
+    axes[-1].set_xlabel("Time (s)")
+    fig.suptitle("IMU 3-axis acceleration across 5 body sites (v1/s1, 20s window)", fontsize=11)
+    fig.tight_layout()
+    out = os.path.join(OUT, "imu_waveforms.pdf")
+    fig.savefig(out, bbox_inches='tight'); fig.savefig(out.replace('.pdf', '.png'), dpi=150, bbox_inches='tight')
+    plt.close(fig)
+    print("Saved", out)
+def plot_emg():
+    df = pd.read_csv(os.path.join(REC, "aligned_emg_100hz.csv"))
+    t = df["time"].to_numpy(); t = t - t[0]
+    ch = [f"emg_{i}" for i in range(1, 9)]
+    # 20s window mid-recording
+    mid = len(t)//2
+    sl = slice(max(0, mid-1000), min(len(t), mid+1000))
+    fig, axes = plt.subplots(8, 1, figsize=(9, 7), sharex=True)
+    for ax, c in zip(axes, ch):
+        sig = df[c].to_numpy()[sl]
+        ax.plot(t[sl], sig, color="#555", lw=0.5)
+        # envelope overlay
+        env = pd.Series(np.abs(sig)).rolling(20, min_periods=1).mean().to_numpy()
+        ax.plot(t[sl], env, color="#d62728", lw=0.9)
+        ax.set_ylabel(c, fontsize=8)
+        ax.grid(alpha=0.3)
+    axes[-1].set_xlabel("Time (s)")
+    fig.suptitle("Surface EMG 8-channel raw (grey) with rectified envelope (red), v1/s1, 20s window",
+                 fontsize=11)
+    fig.tight_layout()
+    out = os.path.join(OUT, "emg_waveforms.pdf")
+    fig.savefig(out, bbox_inches='tight'); fig.savefig(out.replace('.pdf', '.png'), dpi=150, bbox_inches='tight')
+    plt.close(fig)
+    print("Saved", out)
+# Script entry point: render the skeleton, IMU and EMG figures in sequence.
+if __name__ == "__main__":
+    plot_skeletons()
+    plot_imu()
+    plot_emg()

experiments/analysis/reannotate_actions.py ADDED Viewed

	@@ -0,0 +1,363 @@

+#!/usr/bin/env python3
+"""
+Re-annotate action segments using LLM (GPT-4o-mini).
+1. Re-classify existing segments with better accuracy
+2. Infer actions in unlabeled gaps based on context (scene, surrounding actions)
+3. Output improved annotations with higher coverage
+"""
+import os
+import sys
+import json
+import re
+import time
+import copy
+import glob
+import urllib.request
+from collections import Counter
+ANN_DIR = "${PULSE_ROOT}/annotations_by_scene"
+OUTPUT_DIR = "${PULSE_ROOT}/annotations_v2"
+DATASET_DIR = "${PULSE_ROOT}/dataset"
+API_URL = "https://api.chatanywhere.tech/v1/chat/completions"
+API_KEYS = [
+    "sk-MN5n1uEETyaky96fLJdHqZobXF1f7KmOrZHzwD3lt585asFQ",
+    "sk-YnYrtPdAXwlE12hRpi6dYqlE1RRVR3LDVBka6wKaefU4iQRY",
+    "sk-jOZtodDv6OxUOMu3NuJ8lzffjwBlshn9OHY5KSmqmPTtc9qs",
+    "sk-qAaKTKYIRF24btu1oQWgubWG4UdA92bILNtzOkHNEPAcCxdB",
+    "sk-MgCBBonblMrCFnSXd6fJZaBLTCfCJ5FjYZfSe2e46bgmyktk",
+    "sk-79e30kYRgduuf2fSU0Lsc814YjNkClXXzQqIbx0iLS40IOEH",
+    "sk-h9Tej4tW6AQC6fT0njfzrPKXEk6fBwpiSvvQd0aJAhw4UwLz",
+    "sk-k2QNHt5wAH26Fw8hZuPWuVXw8Psd1jX09qusiA6PdBj5Vzuu",
+    "sk-w7EkTblciNI44cwosHXi0PGZNUf1hnJmpzOQ85va9VPdAKbz",
+    "sk-Dexs5ZF7OjFCq7CZW45wJ8EKoGtIswv6rsLUMzUXXkWBDBBJ",
+]
+# Scene id -> scene description (Chinese). The text is interpolated verbatim
+# into the LLM prompts, so it is runtime data and must not be edited casually.
+SCENE_DESCRIPTIONS = {
+    "s1": "办公桌面整理与工作准备（整理文件、电源线、鼠标、笔记本电脑等）",
+    "s2": "快递打包发送（折叠纸箱、放入物品、封箱、贴标签等）",
+    "s3": "厨房调料整理（拿取调料瓶、倒调料、拧瓶盖、擦拭等）",
+    "s4": "清理餐后桌面（收碗碟、擦桌子、整理餐具、倒残渣等）",
+    "s5": "餐前桌面布置（铺桌布、摆放餐具碗碟、放杯子等）",
+    "s6": "商务旅行行李箱打包（折叠衣物、放入行李箱、整理物品等）",
+    "s7": "冲泡咖啡/饮品（取杯子、放咖啡粉/茶包、倒热水、搅拌等）",
+    "s8": "晾衣架整理与衣物收纳（取衣架、挂衣服、折叠衣物等）",
+}
+# Definition of the 11 action categories (Chinese), also embedded verbatim in
+# the prompts. The English category names listed here (Grasp ... Idle) are the
+# only labels the prompts ask the model to return.
+ACTION_CATEGORIES = """动作类别定义（共11类）：
+1. Grasp - 抓取/拿起物体（手从无接触到接触并握住物体）
+2. Place - 放置/放下物体（将物体放到某个位置并释放）
+3. Pour - 倾倒/注入液体或颗粒（倒水、倒调料、倒咖啡粉等）
+4. Wipe - 擦拭/清洁表面（用抹布或手擦桌面、瓶身等）
+5. Fold - 折叠/卷起（折衣服、折桌布、折纸箱等）
+6. OpenClose - 打开/关闭/旋开/旋紧（开盒子、拧瓶盖、拉拉链、合箱盖等）
+7. Stir - 搅拌（搅拌咖啡、搅拌饮品等）
+8. TearCut - 撕/剪/粘贴（撕胶带、剪快递单、贴标签等）
+9. Arrange - 整理/摆放/调整位置（摆餐具、整理文件、调整物品位置、理线等）
+10. Transport - 搬运/移动物体到较远位置（把包裹搬到架子、把碗端到水槽等）
+11. Idle - 空闲/过渡/无明确操作（双手无目的性动作、等待、观察等）
+注意：
+- 只有真正没有任何手部操作时才标Idle
+- "调整姿态"、"检查物体"等属于Arrange
+- "插入"、"装入"等属于Place
+- "提起并移动"如果距离短属于Grasp，距离远属于Transport
+"""
+# Module-level state shared by call_llm: index of the API key currently in
+# use, and the number of successful API calls so far.
+current_key_idx = 0
+call_count = 0
+def call_llm(prompt, max_tokens=1000, retries=3):
+    """Call LLM API with automatic key rotation."""
+    global current_key_idx, call_count
+    for attempt in range(retries * len(API_KEYS)):
+        key = API_KEYS[current_key_idx]
+        try:
+            data = json.dumps({
+                "model": "gpt-4o-mini",
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": max_tokens,
+                "temperature": 0.1,
+            }).encode()
+            req = urllib.request.Request(
+                API_URL, data=data,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {key}",
+                }
+            )
+            resp = urllib.request.urlopen(req, timeout=30)
+            result = json.loads(resp.read())
+            call_count += 1
+            return result["choices"][0]["message"]["content"]
+        except Exception as e:
+            err = str(e)
+            if "429" in err or "quota" in err or "limit" in err or "402" in err:
+                # Key exhausted, rotate
+                print(f"  Key {current_key_idx+1} exhausted, rotating...")
+                current_key_idx = (current_key_idx + 1) % len(API_KEYS)
+            elif "timeout" in err.lower():
+                time.sleep(1)
+            else:
+                print(f"  API error: {err[:100]}")
+                current_key_idx = (current_key_idx + 1) % len(API_KEYS)
+                time.sleep(0.5)
+    print("  WARNING: All API keys failed!")
+    return None
+def reclassify_segments(segments, scene_id):
+    """Use LLM to reclassify all segments in a recording."""
+    scene_desc = SCENE_DESCRIPTIONS.get(scene_id, "日常活动")
+    # Build segment list for prompt
+    seg_list = []
+    for i, seg in enumerate(segments):
+        seg_list.append(f"{i+1}. [{seg['timestamp']}] {seg['task']}")
+    seg_text = "\n".join(seg_list)
+    prompt = f"""你是一个人体动作标注专家。请为以下每个动作片段分配一个动作类别。
+场景：{scene_desc}
+{ACTION_CATEGORIES}
+动作片段列表：
+{seg_text}
+请严格按以下JSON格式返回，不要添加任何额外文字：
+[{{"id": 1, "action": "类别名"}}, {{"id": 2, "action": "类别名"}}, ...]
+每个action必须是以下之一：Grasp, Place, Pour, Wipe, Fold, OpenClose, Stir, TearCut, Arrange, Transport, Idle"""
+    response = call_llm(prompt, max_tokens=len(segments) * 40)
+    if response is None:
+        return None
+    # Parse response
+    try:
+        # Extract JSON from response
+        match = re.search(r'\[.*\]', response, re.DOTALL)
+        if match:
+            results = json.loads(match.group())
+            return {r["id"]: r["action"] for r in results}
+    except (json.JSONDecodeError, KeyError) as e:
+        print(f"  Parse error: {e}, response: {response[:200]}")
+    return None
+def infer_gap_actions(scene_id, before_seg, after_seg, gap_start, gap_end):
+    """Use LLM to infer what actions likely happened in an unlabeled gap."""
+    scene_desc = SCENE_DESCRIPTIONS.get(scene_id, "日常活动")
+    gap_duration = gap_end - gap_start
+    before_text = f"[{before_seg['timestamp']}] {before_seg['task']}" if before_seg else "（录制开始）"
+    after_text = f"[{after_seg['timestamp']}] {after_seg['task']}" if after_seg else "（录制结束）"
+    prompt = f"""你是一个人体动作标注专家。在一段日常活动录制中，有一段时间没有被标注。请根据场景和前后动作推断这段时间内最可能发生的动作。
+场景：{scene_desc}
+未标注时间段：{gap_start//60:02d}:{gap_start%60:02d} - {gap_end//60:02d}:{gap_end%60:02d}（共{gap_duration}秒）
+前一个标注动作：{before_text}
+后一个标注动作：{after_text}
+{ACTION_CATEGORIES}
+请推断这段时间内可能发生的动作序列。每个动作段落2-4秒，时间用MM:SS格式。
+如果确实是空闲等待，标注为Idle。
+严格按以下JSON格式返回，不要添加任何额外文字：
+[{{"timestamp": "MM:SS-MM:SS", "task": "动作描述", "action": "类别名"}}]
+每个action必须是以下之一：Grasp, Place, Pour, Wipe, Fold, OpenClose, Stir, TearCut, Arrange, Transport, Idle"""
+    response = call_llm(prompt, max_tokens=500)
+    if response is None:
+        return []
+    try:
+        match = re.search(r'\[.*\]', response, re.DOTALL)
+        if match:
+            results = json.loads(match.group())
+            # Validate timestamps
+            valid = []
+            for r in results:
+                if "timestamp" in r and "action" in r and "task" in r:
+                    ts_match = re.match(r'(\d+):(\d+)\s*-\s*(\d+):(\d+)', r["timestamp"])
+                    if ts_match:
+                        s = int(ts_match.group(1))*60 + int(ts_match.group(2))
+                        e = int(ts_match.group(3))*60 + int(ts_match.group(4))
+                        if gap_start <= s < e <= gap_end:
+                            valid.append(r)
+            return valid
+    except (json.JSONDecodeError, KeyError) as e:
+        print(f"  Parse error: {e}")
+    return []
+def get_recording_duration(vol, scenario):
+    """Get total recording duration in seconds."""
+    meta_path = os.path.join(DATASET_DIR, vol, scenario, "alignment_metadata.json")
+    if os.path.exists(meta_path):
+        meta = json.load(open(meta_path))
+        if "aligned_length_sec" in meta:
+            return meta["aligned_length_sec"]
+        if "aligned_length_frames" in meta:
+            return meta["aligned_length_frames"] / 100.0
+    return None
+def process_one_file(ann_path, vol, scenario):
+    """Process one annotation file: reclassify + fill gaps."""
+    data = json.load(open(ann_path))
+    segments = data["segments"]
+    if not segments:
+        return data, {"reclassified": 0, "gaps_filled": 0}
+    # Step 1: Reclassify existing segments
+    print(f"  Reclassifying {len(segments)} segments...")
+    classifications = reclassify_segments(segments, scenario)
+    if classifications:
+        for i, seg in enumerate(segments):
+            action = classifications.get(i + 1)
+            if action and action in {"Grasp", "Place", "Pour", "Wipe", "Fold",
+                                      "OpenClose", "Stir", "TearCut", "Arrange",
+                                      "Transport", "Idle"}:
+                seg["action_label"] = action
+            else:
+                seg["action_label"] = "Idle"
+    else:
+        # Fallback: keep without label
+        for seg in segments:
+            seg["action_label"] = "Idle"
+    reclassified = sum(1 for s in segments if "action_label" in s)
+    # Step 2: Find and fill gaps ≥ 3 seconds
+    # Parse all timestamps
+    parsed = []
+    for seg in segments:
+        m = re.match(r'(\d+):(\d+)\s*-\s*(\d+):(\d+)', seg["timestamp"])
+        if m:
+            s = int(m.group(1))*60 + int(m.group(2))
+            e = int(m.group(3))*60 + int(m.group(4))
+            parsed.append((s, e, seg))
+    parsed.sort()
+    total_dur = get_recording_duration(vol, scenario)
+    new_segments = []
+    gaps_filled = 0
+    for i in range(len(parsed)):
+        new_segments.append(parsed[i][2])
+        # Check gap after this segment
+        if i < len(parsed) - 1:
+            gap_start = parsed[i][1]
+            gap_end = parsed[i + 1][0]
+        elif total_dur:
+            gap_start = parsed[i][1]
+            gap_end = int(total_dur)
+        else:
+            continue
+        gap_duration = gap_end - gap_start
+        if gap_duration >= 3:
+            before_seg = parsed[i][2]
+            after_seg = parsed[i + 1][2] if i < len(parsed) - 1 else None
+            print(f"    Filling gap {gap_start}s-{gap_end}s ({gap_duration}s)...")
+            inferred = infer_gap_actions(scenario, before_seg, after_seg, gap_start, gap_end)
+            for inf in inferred:
+                new_seg = {
+                    "timestamp": inf["timestamp"],
+                    "task": inf["task"],
+                    "action_label": inf["action"],
+                    "source": "llm_inferred",
+                    "left_hand": "",
+                    "right_hand": "",
+                    "bimanual_interaction": "",
+                    "objects": [],
+                }
+                new_segments.append(new_seg)
+                gaps_filled += 1
+    # Also check gap at the beginning
+    if parsed and parsed[0][0] >= 3:
+        print(f"    Filling start gap 0s-{parsed[0][0]}s...")
+        inferred = infer_gap_actions(scenario, None, parsed[0][2], 0, parsed[0][0])
+        for inf in inferred:
+            new_seg = {
+                "timestamp": inf["timestamp"],
+                "task": inf["task"],
+                "action_label": inf["action"],
+                "source": "llm_inferred",
+                "left_hand": "",
+                "right_hand": "",
+                "bimanual_interaction": "",
+                "objects": [],
+            }
+            new_segments.insert(0, new_seg)
+            gaps_filled += 1
+    # Sort by timestamp
+    def sort_key(seg):
+        m = re.match(r'(\d+):(\d+)', seg["timestamp"])
+        return int(m.group(1))*60 + int(m.group(2)) if m else 0
+    new_segments.sort(key=sort_key)
+    result = copy.deepcopy(data)
+    result["segments"] = new_segments
+    return result, {"reclassified": reclassified, "gaps_filled": gaps_filled}
+def main():
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    total_reclassified = 0
+    total_gaps_filled = 0
+    total_files = 0
+    for vol_dir in sorted(glob.glob(f"{ANN_DIR}/v*")):
+        vol = os.path.basename(vol_dir)
+        out_vol_dir = os.path.join(OUTPUT_DIR, vol)
+        os.makedirs(out_vol_dir, exist_ok=True)
+        for ann_file in sorted(glob.glob(f"{vol_dir}/s*.json")):
+            scenario = os.path.basename(ann_file).replace(".json", "")
+            print(f"\n[{vol}/{scenario}]", flush=True)
+            result, stats = process_one_file(ann_file, vol, scenario)
+            # Save
+            out_path = os.path.join(out_vol_dir, f"{scenario}.json")
+            with open(out_path, "w", encoding="utf-8") as f:
+                json.dump(result, f, ensure_ascii=False, indent=2)
+            total_reclassified += stats["reclassified"]
+            total_gaps_filled += stats["gaps_filled"]
+            total_files += 1
+            print(f"  Done: {stats['reclassified']} reclassified, {stats['gaps_filled']} gaps filled",
+                  flush=True)
+    print(f"\n{'='*60}")
+    print(f"Total: {total_files} files processed")
+    print(f"  Reclassified: {total_reclassified} segments")
+    print(f"  Gap-filled:   {total_gaps_filled} new segments")
+    print(f"  API calls:    {call_count}")
+    print(f"  Output:       {OUTPUT_DIR}")
+if __name__ == "__main__":
+    main()

experiments/data/__init__.py ADDED Viewed

File without changes

experiments/data/__pycache__/dataset.cpython-312.pyc ADDED Viewed

Binary file (18.8 kB). View file

experiments/data/dataset.py ADDED Viewed

	@@ -0,0 +1,332 @@

+"""
+Multimodal scene dataset for Experiment 1: Activity Recognition.
+Loads aligned 100Hz multi-modal data, supports modality selection,
+subject-independent splits, and variable-length sequence handling.
+"""
+import os
+import json
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import Dataset, DataLoader
+from torch.nn.utils.rnn import pad_sequence
+DATASET_DIR = "${PULSE_ROOT}/dataset"
+MODALITY_FILES = {
+    'mocap': None,  # Special: uses aligned_{vol}{scene}_s_Q.tsv (skeleton data)
+    'emg': 'aligned_emg_100hz.csv',
+    'eyetrack': 'aligned_eyetrack_100hz.csv',
+    'imu': 'aligned_imu_100hz.csv',
+    'pressure': 'aligned_pressure_100hz.csv',
+    'video': 'video_features_100hz.npy',  # ViT-B/16 (ImageNet)
+    'videomae': 'video_features_videomae_100hz.npy',  # VideoMAE (Kinetics-400)
+}
+def get_modality_filepath(scenario_dir, modality, vol=None, scenario=None):
+    """Return the file path for a given modality.
+    Mocap uses a special naming pattern: aligned_{vol}{scene}_s_Q.tsv
+    All other modalities use MODALITY_FILES directly.
+    """
+    if modality == 'mocap':
+        if vol is None or scenario is None:
+            raise ValueError("vol and scenario required for mocap modality")
+        return os.path.join(scenario_dir, f"aligned_{vol}{scenario}_s_Q.tsv")
+    return os.path.join(scenario_dir, MODALITY_FILES[modality])
+# Column names that are never used as features when loading CSV/TSV tables.
+SKIP_COLS = {'Frame', 'Time', 'time', 'UTC'}
+# Columns whose name ends with one of these suffixes are dropped as well.
+SKIP_COL_SUFFIXES = (' Type',)
+# Eyetrack exports sometimes include volunteer-specific marker/ICA columns.
+# Benchmark inputs use the fixed 24 core gaze columns below; recordings missing
+# any core column are skipped instead of truncating the full dataset.
+# NOTE(review): EYETRACK_SKIP_PATTERNS is not referenced in the visible part of
+# this file — confirm whether another module still uses it.
+EYETRACK_SKIP_PATTERNS = ('Index Of Cognitive Activity', 'Marker Coordinates', 'Markers_')
+# Order matters: features are selected in exactly this column order.
+EYETRACK_CORE_COLS = [
+    'Dikablis Glasses 3_Eye Data_Original_Pupil X',
+    'Dikablis Glasses 3_Eye Data_Original_Pupil Y',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil X',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil Y',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil Area',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil Height',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Pupil Width',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Fixations_Fixations',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Fixations_Fixations Duration',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Saccades_Saccades',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Saccades_Saccades Duration',
+    'Dikablis Glasses 3_Eye Data_Original_Left Eye_Saccades_Saccades Angle',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil X',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil Y',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil Area',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil Height',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Pupil Width',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Fixations_Fixations',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Fixations_Fixations Duration',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Saccades_Saccades',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Saccades_Saccades Duration',
+    'Dikablis Glasses 3_Eye Data_Original_Right Eye_Saccades_Saccades Angle',
+    'Dikablis Glasses 3_Field Data_Scene Cam_Original_Gaze_Gaze X',
+    'Dikablis Glasses 3_Field Data_Scene Cam_Original_Gaze_Gaze Y',
+]
+# (volunteer, scenario) pairs whose eyetrack file is always rejected on load.
+EYETRACK_EXCLUDED_RECORDINGS = {('v1', 's1'), ('v14', 's8')}
+# Scenario name -> class index: 's1'..'s8' map to 0..7.
+SCENE_LABELS = {f's{i}': i - 1 for i in range(1, 9)}
+NUM_CLASSES = 8
+# Subject-independent split: volunteer directory names per partition.
+TRAIN_VOLS = ['v1', 'v2', 'v11', 'v12', 'v13', 'v15', 'v16', 'v17', 'v19', 'v20', 'v21', 'v22', 'v23', 'v24']
+VAL_VOLS = []  # No separate val set; use train for early stopping or cross-val
+TEST_VOLS = ['v25', 'v26', 'v27', 'v3']
+def _preprocess_mocap_skeleton(arr, feat_cols):
+    """Convert absolute skeleton coords to hip-relative positions + velocity.
+    Input:  (T, F) with absolute XYZ + quaternions
+    Output: (T, F + N_pos) where N_pos = number of XYZ position features
+            [hip-relative features, XYZ velocity]
+    """
+    col_to_idx = {c: i for i, c in enumerate(feat_cols)}
+    # Find hip position for subtraction
+    hip_x_idx = col_to_idx.get('Hips_X')
+    hip_y_idx = col_to_idx.get('Hips_Y')
+    hip_z_idx = col_to_idx.get('Hips_Z')
+    if hip_x_idx is None:
+        return arr  # No hip joint found, skip preprocessing
+    # Identify all position columns (_X, _Y, _Z)
+    x_indices = [i for i, c in enumerate(feat_cols) if c.endswith('_X')]
+    y_indices = [i for i, c in enumerate(feat_cols) if c.endswith('_Y')]
+    z_indices = [i for i, c in enumerate(feat_cols) if c.endswith('_Z')]
+    all_pos_indices = sorted(x_indices + y_indices + z_indices)
+    # 1. Make XYZ positions hip-relative
+    arr_rel = arr.copy()
+    hip_xyz = arr[:, [hip_x_idx, hip_y_idx, hip_z_idx]]  # (T, 3)
+    for idx in x_indices:
+        arr_rel[:, idx] -= hip_xyz[:, 0]
+    for idx in y_indices:
+        arr_rel[:, idx] -= hip_xyz[:, 1]
+    for idx in z_indices:
+        arr_rel[:, idx] -= hip_xyz[:, 2]
+    # 2. Compute velocity of position features only
+    pos_data = arr_rel[:, all_pos_indices]  # (T, N_pos)
+    velocity = np.zeros_like(pos_data)
+    velocity[1:] = pos_data[1:] - pos_data[:-1]
+    # 3. Concatenate: [hip-relative features (pos+quat), position velocity]
+    return np.concatenate([arr_rel, velocity], axis=1)
+def load_modality_array(filepath, modality):
+    """Load a modality CSV/TSV/NPY and return numpy_array.
+    Returns None if data is corrupted (extreme values or mostly zeros)."""
+    # Video features stored as .npy
+    if filepath.endswith('.npy'):
+        if not os.path.exists(filepath):
+            return None
+        arr = np.load(filepath).astype(np.float32)
+        arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
+        return arr
+    # Mocap uses TSV with tab separator
+    sep = '\t' if filepath.endswith('.tsv') else ','
+    df = pd.read_csv(filepath, sep=sep, low_memory=False)
+    df.columns = [str(c).strip() for c in df.columns]
+    if modality == 'eyetrack':
+        parts = os.path.normpath(filepath).split(os.sep)
+        if len(parts) >= 3 and (parts[-3], parts[-2]) in EYETRACK_EXCLUDED_RECORDINGS:
+            return None
+    feat_cols = [c for c in df.columns
+                 if c not in SKIP_COLS
+                 and not any(c.endswith(s) for s in SKIP_COL_SUFFIXES)]
+    if modality == 'eyetrack':
+        feat_cols = [c for c in EYETRACK_CORE_COLS if c in feat_cols]
+        if len(feat_cols) != len(EYETRACK_CORE_COLS):
+            return None
+    sub = df[feat_cols]
+    # Coerce non-numeric columns
+    obj_cols = sub.select_dtypes(include=['object']).columns
+    if len(obj_cols) > 0:
+        sub = sub.copy()
+        sub[obj_cols] = sub[obj_cols].apply(pd.to_numeric, errors='coerce')
+    arr = sub.values.astype(np.float64)
+    arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
+    # Quality check: reject samples with extreme values (corrupted data)
+    max_abs = np.max(np.abs(arr))
+    if max_abs > 1e6:
+        return None  # Corrupted
+    # Quality check: reject samples that are mostly zeros (sensor dropout).
+    # Pressure and EMG are legitimately zero for long periods (rest, no grip)
+    # so we only apply the strict near-total-loss check to the modalities
+    # where a flat-zero stream is a clear dropout signal.
+    if modality not in ("pressure", "emg"):
+        zero_ratio = np.mean(arr == 0.0)
+        if zero_ratio > 0.9:
+            return None  # Near-total data loss
+    # Mocap skeleton: convert to hip-relative + velocity
+    if modality == 'mocap' and filepath.endswith('.tsv'):
+        arr = _preprocess_mocap_skeleton(arr, feat_cols)
+    arr = arr.astype(np.float32)
+    return arr
+class MultimodalSceneDataset(Dataset):
+    """Dataset for scene-level classification from multimodal time series."""
+    def __init__(self, volunteers, modalities, downsample=5, stats=None):
+        self.modalities = modalities
+        self.downsample = downsample
+        self.data = []
+        self.labels = []
+        self.sample_info = []
+        self._modality_dims = {}
+        for vol in volunteers:
+            vol_dir = os.path.join(DATASET_DIR, vol)
+            if not os.path.isdir(vol_dir):
+                continue
+            for scenario in sorted(os.listdir(vol_dir)):
+                scenario_dir = os.path.join(vol_dir, scenario)
+                if not os.path.isdir(scenario_dir) or scenario not in SCENE_LABELS:
+                    continue
+                meta_path = os.path.join(scenario_dir, 'alignment_metadata.json')
+                if not os.path.exists(meta_path):
+                    continue
+                with open(meta_path) as f:
+                    meta = json.load(f)
+                available = set(meta['modalities'])
+                if not set(modalities).issubset(available):
+                    continue
+                parts = []
+                skip = False
+                for mod in modalities:
+                    if mod == 'mocap':
+                        # Skeleton data: aligned_{vol}{scene}_s_Q.tsv
+                        tsv_name = f"aligned_{vol}{scenario}_s_Q.tsv"
+                        filepath = os.path.join(scenario_dir, tsv_name)
+                    else:
+                        filepath = os.path.join(scenario_dir, MODALITY_FILES[mod])
+                    if not os.path.exists(filepath):
+                        skip = True
+                        break
+                    arr = load_modality_array(filepath, mod)
+                    if arr is None:
+                        print(f"  SKIP {vol}/{scenario} {mod}: corrupted data", flush=True)
+                        skip = True
+                        break
+                    # Validate dimension consistency
+                    if mod in self._modality_dims and arr.shape[1] != self._modality_dims[mod]:
+                        print(f"  WARNING: {vol}/{scenario} {mod} dim {arr.shape[1]} "
+                              f"!= expected {self._modality_dims[mod]}, padding/truncating",
+                              flush=True)
+                        expected = self._modality_dims[mod]
+                        if arr.shape[1] < expected:
+                            pad = np.zeros((arr.shape[0], expected - arr.shape[1]), dtype=np.float32)
+                            arr = np.concatenate([arr, pad], axis=1)
+                        else:
+                            arr = arr[:, :expected]
+                    if mod not in self._modality_dims:
+                        self._modality_dims[mod] = arr.shape[1]
+                    parts.append(arr)
+                if skip:
+                    continue
+                min_len = min(p.shape[0] for p in parts)
+                parts = [p[:min_len] for p in parts]
+                combined = np.concatenate(parts, axis=1)
+                combined = combined[::downsample]
+                self.data.append(combined)
+                self.labels.append(SCENE_LABELS[scenario])
+                self.sample_info.append(f"{vol}/{scenario}")
+        print(f"  Loaded {len(self.data)} samples, modality dims: {self._modality_dims}, "
+              f"total feat dim: {sum(self._modality_dims.values())}", flush=True)
+        # Normalization (compute in float64 to avoid overflow)
+        if stats is not None:
+            self.mean, self.std = stats
+        else:
+            self._compute_stats()
+        for i in range(len(self.data)):
+            self.data[i] = ((self.data[i].astype(np.float64) - self.mean) / self.std).astype(np.float32)
+            self.data[i] = np.nan_to_num(self.data[i], nan=0.0, posinf=0.0, neginf=0.0)
+    def _compute_stats(self):
+        # Use float64 for accumulation to prevent overflow
+        all_frames = np.concatenate(self.data, axis=0).astype(np.float64)
+        self.mean = np.mean(all_frames, axis=0, keepdims=True)
+        self.std = np.std(all_frames, axis=0, keepdims=True)
+        self.std[self.std < 1e-8] = 1.0
+    def get_stats(self):
+        return (self.mean, self.std)
+    @property
+    def feat_dim(self):
+        return sum(self._modality_dims.values())
+    @property
+    def modality_dims(self):
+        return dict(self._modality_dims)
+    def get_class_weights(self):
+        counts = np.bincount(self.labels, minlength=NUM_CLASSES).astype(np.float32)
+        counts[counts == 0] = 1.0
+        weights = 1.0 / counts
+        weights = weights / weights.sum() * NUM_CLASSES
+        return torch.FloatTensor(weights)
+    def __len__(self):
+        return len(self.data)
+    def __getitem__(self, idx):
+        return torch.from_numpy(self.data[idx]), self.labels[idx]
+def collate_fn(batch):
+    """Pad variable-length sequences and create masks."""
+    sequences, labels = zip(*batch)
+    lengths = torch.LongTensor([s.shape[0] for s in sequences])
+    padded = pad_sequence(sequences, batch_first=True, padding_value=0.0)
+    max_len = padded.shape[1]
+    mask = torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)
+    labels = torch.LongTensor(labels)
+    return padded, labels, mask, lengths
+def get_dataloaders(modalities, batch_size=16, downsample=5, num_workers=0):
+    """Create train/val/test DataLoaders with proper normalization."""
+    print("Loading training data...", flush=True)
+    train_ds = MultimodalSceneDataset(TRAIN_VOLS, modalities, downsample)
+    stats = train_ds.get_stats()
+    print("Loading validation data...", flush=True)
+    val_ds = MultimodalSceneDataset(VAL_VOLS, modalities, downsample, stats=stats)
+    print("Loading test data...", flush=True)
+    test_ds = MultimodalSceneDataset(TEST_VOLS, modalities, downsample, stats=stats)
+    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
+                              collate_fn=collate_fn, num_workers=num_workers,
+                              drop_last=False)
+    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False,
+                            collate_fn=collate_fn, num_workers=num_workers)
+    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False,
+                             collate_fn=collate_fn, num_workers=num_workers)
+    info = {
+        'feat_dim': train_ds.feat_dim,
+        'modality_dims': train_ds.modality_dims,
+        'num_classes': NUM_CLASSES,
+        'train_size': len(train_ds),
+        'val_size': len(val_ds),
+        'test_size': len(test_ds),
+        'class_weights': train_ds.get_class_weights(),
+    }
+    return train_loader, val_loader, test_loader, info

experiments/data/dataset_forecast.py ADDED Viewed

	@@ -0,0 +1,319 @@

+"""Frame-level future motor-primitive forecasting dataset.
+Task definition
+---------------
+At a sampled anchor time t in a recording:
+  past   = sensor frames over [t - T_obs, t]            ← input
+  future = per-frame verb_fine labels over (t, t + T_fut]   ← target
+We use NUM_VERB_FINE (= 17) as a sentinel "idle / no segment" class for
+frames not covered by any annotated segment, so every future frame has a
+valid label (output cardinality = NUM_VERB_FINE + 1 = 18).
+Anchors are sampled at fixed stride within each recording so the model
+sees both intra-segment future (mostly stationary) and across-boundary
+future (where the next-action label changes — the interesting cases).
+"""
+from __future__ import annotations
+import os
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+# Put this file's directory and its parent on sys.path so the flat-module
+# fallback imports below can resolve when the package import fails.
+THIS = Path(__file__).resolve()
+sys.path.insert(0, str(THIS.parent))
+sys.path.insert(0, str(THIS.parents[1]))
+# Try the package-qualified imports first; fall back to flat module names
+# (found via the sys.path entries added above) on ModuleNotFoundError.
+try:
+    from experiments.dataset_seqpred import (
+        SAMPLING_RATE_HZ, _load_recording_sensors, _load_annotations,
+        parse_ts_range, TRAIN_VOLS_V3, TEST_VOLS_V3,
+        DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR,
+    )
+    from experiments.taxonomy import (
+        classify_segment, NUM_VERB_FINE,
+    )
+except ModuleNotFoundError:
+    from dataset_seqpred import (
+        SAMPLING_RATE_HZ, _load_recording_sensors, _load_annotations,
+        parse_ts_range, TRAIN_VOLS_V3, TEST_VOLS_V3,
+        DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR,
+    )
+    from taxonomy import classify_segment, NUM_VERB_FINE
+# Extra class index appended after the verb_fine classes.
+IDLE_LABEL = NUM_VERB_FINE        # = 17, sentinel for "no segment covers this frame"
+NUM_FORECAST_CLASSES = NUM_VERB_FINE + 1   # = 18
+class ForecastDataset(Dataset):
+    """Forecast next T_fut seconds of per-frame verb_fine given past T_obs."""
+    def __init__(
+        self,
+        volunteers: Sequence[str],
+        modalities: Sequence[str],
+        t_obs_sec: float = 1.5,
+        t_fut_sec: float = 0.5,
+        anchor_stride_sec: float = 0.25,
+        downsample: int = 5,
+        dataset_dir: Path = DEFAULT_DATASET_DIR,
+        annot_dir: Path = DEFAULT_ANNOT_DIR,
+        stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None,
+        expected_dims: Optional[Dict[str, int]] = None,
+        contact_only: bool = False,
+        contact_threshold_g: float = 5.0,
+        log: bool = True,
+    ):
+        super().__init__()
+        self.modalities = list(modalities)
+        self.t_obs_sec = float(t_obs_sec)
+        self.t_fut_sec = float(t_fut_sec)
+        self.anchor_stride_sec = float(anchor_stride_sec)
+        self.downsample = int(downsample)
+        self.sr = SAMPLING_RATE_HZ // self.downsample
+        self.dataset_dir = Path(dataset_dir)
+        self.annot_dir   = Path(annot_dir)
+        self.contact_only = bool(contact_only)
+        self.contact_threshold_g = float(contact_threshold_g)
+        # Output time-step counts (after downsample)
+        self.T_obs = int(round(self.t_obs_sec * self.sr))
+        self.T_fut = int(round(self.t_fut_sec * self.sr))
+        self._items: List[dict] = []
+        # Pre-seed modality dims if caller (e.g. test set) provides them
+        self._modality_dims: Dict[str, int] = dict(expected_dims) if expected_dims else {}
+        for vol in volunteers:
+            vol_dir = self.dataset_dir / vol
+            if not vol_dir.is_dir():
+                continue
+            for scenario_dir in sorted(vol_dir.glob("s*")):
+                if not scenario_dir.is_dir():
+                    continue
+                scene = scenario_dir.name
+                annot_path = self.annot_dir / vol / f"{scene}.json"
+                if not annot_path.exists():
+                    continue
+                # Always include pressure for the filter, even if model
+                # doesn't see it as input. We separate "filter sensors"
+                # (load_mods) from "model input sensors" (self.modalities).
+                load_mods = list(dict.fromkeys(list(self.modalities) + ["pressure"]))
+                try:
+                    sensors_all = _load_recording_sensors(
+                        scenario_dir, vol, scene, load_mods
+                    )
+                except Exception:
+                    continue
+                if sensors_all is None or any(a is None for a in sensors_all.values()):
+                    continue
+                pressure_full = sensors_all.get("pressure")  # (T, 50)
+                # Subset to model-input modalities for everything downstream
+                sensors = {m: sensors_all[m] for m in self.modalities}
+                # Track modality dim consistency
+                for m, arr in sensors.items():
+                    if m in self._modality_dims:
+                        target = self._modality_dims[m]
+                        if arr.shape[1] != target:
+                            if arr.shape[1] < target:
+                                pad = np.zeros((arr.shape[0], target - arr.shape[1]),
+                                               dtype=np.float32)
+                                sensors[m] = np.concatenate([arr, pad], axis=1)
+                            else:
+                                sensors[m] = arr[:, :target]
+                    else:
+                        self._modality_dims[m] = arr.shape[1]
+                T_avail = min(a.shape[0] for a in sensors.values())
+                if T_avail < (self.T_obs + self.T_fut) * self.downsample:
+                    continue
+                # Build per-frame verb_fine timeline at full 100 Hz
+                timeline = np.full(T_avail, IDLE_LABEL, dtype=np.int64)
+                segs = _load_annotations(annot_path)
+                for seg in segs:
+                    a = seg.get("action_annotation", {})
+                    labels = classify_segment(a)
+                    if labels is None:
+                        continue
+                    start_sec, end_sec = parse_ts_range(seg.get("timestamp", ""))
+                    s = int(round(start_sec * SAMPLING_RATE_HZ))
+                    e = int(round(end_sec * SAMPLING_RATE_HZ))
+                    s = max(0, s); e = min(T_avail, e)
+                    if e > s:
+                        timeline[s:e] = labels["verb_fine"]
+                # Downsample timeline to 20 Hz
+                timeline_ds = timeline[::self.downsample]
+                T_ds = len(timeline_ds)
+                # Downsample sensors to 20 Hz (kept as full record;
+                # we'll slice windows below)
+                sensors_ds = {m: arr[::self.downsample] for m, arr in sensors.items()}
+                # Build contact mask at 20 Hz (per-frame): is pressure-sum > thr?
+                # Pressure is 50 channels; we follow the T2 contact convention
+                # (sum across all fingertips and threshold at 5 g).
+                if pressure_full is not None:
+                    pressure_ds = pressure_full[::self.downsample]
+                    contact_ds = pressure_ds.sum(axis=1) > self.contact_threshold_g
+                else:
+                    contact_ds = np.zeros(T_ds, dtype=bool)
+                # Sample anchors at fixed stride (in 20 Hz frames)
+                stride = max(1, int(round(self.anchor_stride_sec * self.sr)))
+                first_anchor = self.T_obs
+                last_anchor = T_ds - self.T_fut
+                if last_anchor <= first_anchor:
+                    continue
+                for anchor in range(first_anchor, last_anchor + 1, stride):
+                    # contact-rich filter: any contact frame in past or future window?
+                    if self.contact_only:
+                        win = contact_ds[max(0, anchor - self.T_obs):
+                                         min(T_ds, anchor + self.T_fut)]
+                        if not win.any():
+                            continue
+                    past_slice = {m: arr[anchor - self.T_obs:anchor]
+                                  for m, arr in sensors_ds.items()}
+                    fut_labels = timeline_ds[anchor:anchor + self.T_fut].copy()
+                    # length sanity
+                    if any(w.shape[0] != self.T_obs for w in past_slice.values()):
+                        continue
+                    if fut_labels.shape[0] != self.T_fut:
+                        continue
+                    self._items.append({
+                        "x": past_slice,                  # dict[mod] -> (T_obs, F_mod)
+                        "y_seq": fut_labels,              # (T_fut,) int in [0..17]
+                        "meta": {"vol": vol, "scene": scene, "anchor_idx": int(anchor)},
+                    })
+        if not self._items:
+            raise RuntimeError("ForecastDataset: collected 0 anchors. Check annot_dir / modalities.")
+        # Per-modality z-score using training stats
+        if stats is None:
+            stats = self._compute_stats()
+        self._stats = stats
+        self._apply_stats(stats)
+        if log:
+            print(f"[ForecastDataset] vols={len(volunteers)} "
+                  f"anchors={len(self._items)} "
+                  f"T_obs={self.T_obs} T_fut={self.T_fut} "
+                  f"contact_only={self.contact_only} "
+                  f"modality_dims={self._modality_dims} "
+                  f"sr={self.sr}Hz", flush=True)
+    # ----- Stats / normalization -----
+    def _compute_stats(self) -> Dict[str, Tuple[np.ndarray, np.ndarray]]:
+        accs = {m: [] for m in self._modality_dims}
+        for it in self._items:
+            for m, w in it["x"].items():
+                accs[m].append(w)
+        out = {}
+        for m, ws in accs.items():
+            cat = np.concatenate(ws, axis=0)
+            mu = cat.mean(axis=0)
+            sd = cat.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd)
+            out[m] = (mu.astype(np.float32), sd.astype(np.float32))
+        return out
+    def _apply_stats(self, stats):
+        for it in self._items:
+            for m, w in it["x"].items():
+                if m in stats:
+                    mu, sd = stats[m]
+                    it["x"][m] = ((w - mu) / sd).astype(np.float32)
+    # ----- Dataset protocol -----
+    def __len__(self):
+        return len(self._items)
+    def __getitem__(self, idx):
+        it = self._items[idx]
+        x = {m: torch.from_numpy(np.ascontiguousarray(w)) for m, w in it["x"].items()}
+        y_seq = torch.from_numpy(np.ascontiguousarray(it["y_seq"]))   # (T_fut,)
+        return x, y_seq, it["meta"]
+    @property
+    def modality_dims(self):
+        return dict(self._modality_dims)
+    def class_freq(self) -> np.ndarray:
+        c = np.zeros(NUM_FORECAST_CLASSES, dtype=np.int64)
+        for it in self._items:
+            for v in it["y_seq"]:
+                c[int(v)] += 1
+        return c
+def collate_forecast(batch):
+    """Stack (x_dict, y_seq, meta) -> batched tensors. All samples share T_obs/T_fut."""
+    xs, ys, metas = zip(*batch)
+    B = len(batch)
+    mods = list(xs[0].keys())
+    x_out: Dict[str, torch.Tensor] = {}
+    for m in mods:
+        x_out[m] = torch.stack([x[m] for x in xs], dim=0)  # (B, T_obs, F_mod)
+    y_out = torch.stack(ys, dim=0)                          # (B, T_fut)
+    return x_out, y_out, list(metas)
+def build_train_test(
+    modalities: Sequence[str],
+    t_obs_sec: float = 1.5,
+    t_fut_sec: float = 0.5,
+    anchor_stride_sec: float = 0.25,
+    downsample: int = 5,
+    dataset_dir: Path = DEFAULT_DATASET_DIR,
+    annot_dir: Path = DEFAULT_ANNOT_DIR,
+    contact_only: bool = False,
+    contact_threshold_g: float = 5.0,
+):
+    train = ForecastDataset(
+        TRAIN_VOLS_V3, modalities=modalities,
+        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec,
+        anchor_stride_sec=anchor_stride_sec, downsample=downsample,
+        dataset_dir=dataset_dir, annot_dir=annot_dir,
+        contact_only=contact_only, contact_threshold_g=contact_threshold_g,
+        stats=None, log=True,
+    )
+    test = ForecastDataset(
+        TEST_VOLS_V3, modalities=modalities,
+        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec,
+        anchor_stride_sec=anchor_stride_sec, downsample=downsample,
+        dataset_dir=dataset_dir, annot_dir=annot_dir,
+        contact_only=contact_only, contact_threshold_g=contact_threshold_g,
+        stats=train._stats, expected_dims=train._modality_dims, log=True,
+    )
+    return train, test
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--modalities", type=str, default="imu,emg,eyetrack,mocap,pressure")
+    ap.add_argument("--t_obs", type=float, default=1.5)
+    ap.add_argument("--t_fut", type=float, default=0.5)
+    ap.add_argument("--stride", type=float, default=0.25)
+    args = ap.parse_args()
+    mods = args.modalities.split(",")
+    tr, te = build_train_test(
+        modalities=mods,
+        t_obs_sec=args.t_obs, t_fut_sec=args.t_fut,
+        anchor_stride_sec=args.stride,
+    )
+    print(f"\nTrain={len(tr)}  Test={len(te)}  T_obs={tr.T_obs}  T_fut={tr.T_fut}")
+    print(f"Train class freq:\n{tr.class_freq()}")
+    print(f"Test  class freq:\n{te.class_freq()}")
+    x, y, meta = tr[0]
+    print(f"Sample: x={ {m: tuple(v.shape) for m,v in x.items()} }  y_seq={tuple(y.shape)}")

experiments/data/dataset_grasp_state.py ADDED Viewed

	@@ -0,0 +1,571 @@

+"""Anchor-based binary "is_grasping" classification dataset (T5 v3 / TGSR).
+At each sampled anchor t in a recording:
+  past   = sensor frames over [t - T_obs, t]                       ← input
+  label  = majority vote of grasp-annotation mask over (t, t+T_fut] ← binary class
+Ground-truth source: annotations_v3 verb segments. A frame is marked
+"is_grasp" if it falls inside a segment whose action_name belongs to
+GRASP_VERBS (set below). The label is annotation-derived, completely
+independent of pressure — so adding/removing pressure as input does
+NOT leak the label.
+This is the cleanest test of "does pressure improve recognition of
+object-interaction state when human-annotated grasp segments are GT?"
+"""
+from __future__ import annotations
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+THIS = Path(__file__).resolve()
+sys.path.insert(0, str(THIS.parent))
+sys.path.insert(0, str(THIS.parents[1]))
+try:
+    from experiments.dataset_seqpred import (
+        SAMPLING_RATE_HZ, _load_recording_sensors,
+        TRAIN_VOLS_V3, TEST_VOLS_V3,
+        DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR,
+    )
+except ModuleNotFoundError:
+    from dataset_seqpred import (
+        SAMPLING_RATE_HZ, _load_recording_sensors,
+        TRAIN_VOLS_V3, TEST_VOLS_V3,
+        DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR,
+    )
+# Verbs whose annotated segments count as "grasping": build_grasp_mask() marks
+# every frame inside a segment with one of these action_name values as True.
+GRASP_VERBS = {
+    "grasp", "hold", "pick_up", "move", "place", "put_down",
+    "pull", "rotate", "insert", "remove",
+}
+# User-specified subset of action verbs that mean "the object has been lifted
+# off its resting surface and held in hand" (used as Class 2 stricter definition).
+LIFT_VERBS = {"grasp", "open", "move", "pick_up", "hold"}
+# Multi-class verb taxonomy (annotations_v3 verb_fine universe).
+# Verb 0 = background (anchor outside any segment).
+VERB_LIST = [
+    "background",
+    "grasp", "move", "place", "adjust", "pick_up",
+    "close", "put_down", "pull", "hold", "open",
+    "rotate", "release", "push", "insert", "remove",
+    "align", "stabilize",
+]
+# Reverse lookup: verb name -> class index in VERB_LIST.
+VERB_TO_IDX = {v: i for i, v in enumerate(VERB_LIST)}
+# Top-15 most common object categories with non-zero coverage in the
+# pressure-bearing test set (annotations_v3 survey of TRAIN+TEST_VOLS_V3).
+# Index 0 = "_other": anchor outside any segment OR object not in top-15.
+# Note: "coat" excluded because it appears only in v14, which has no
+# pressure-aligned sessions and is silently dropped by the loader.
+OBJECT_TOP_LIST = [
+    "_other",
+    "sealed jar", "towel", "tablecloth", "box", "pot",
+    "rice bowl", "tape", "pants", "spoon", "plate",
+    "marker", "cloth", "laptop", "toothbrush case", "tea canister",
+]
+# Reverse lookup: object name -> class index in OBJECT_TOP_LIST.
+OBJECT_TO_IDX = {o: i for i, o in enumerate(OBJECT_TOP_LIST)}
+# Display names for the event_type codes that GraspStateDataset derives from
+# past/future contact (0..3), and for the label classes of each label_mode.
+EVENT_NAMES = {0: "non-contact", 1: "pre-contact", 2: "steady-grip", 3: "release"}
+CLASS_NAMES_BINARY = {0: "non-grasp", 1: "grasp"}
+CLASS_NAMES_THREE  = {0: "no-grasp", 1: "attempted", 2: "sustained"}
+# Back-compat default (used by binary code paths)
+CLASS_NAMES = CLASS_NAMES_BINARY
+def _parse_one(x: str, fmt_mode: str) -> float:
+    p = x.split(":")
+    if len(p) == 2:
+        return int(p[0]) * 60 + int(p[1])
+    if fmt_mode == "hhmmss":
+        return int(p[0]) * 3600 + int(p[1]) * 60 + int(p[2])
+    return int(p[0]) * 60 + int(p[1]) + int(p[2]) / 30.0  # mmssff @ 30fps
+def _detect_fmt(segments, rec_sec: float) -> str:
+    for s in segments:
+        b = s["timestamp"].split("-")[1]
+        p = b.split(":")
+        if len(p) == 3:
+            hh = int(p[0]) * 3600 + int(p[1]) * 60 + int(p[2])
+            if hh > rec_sec * 1.05:
+                return "mmssff"
+    return "hhmmss"
+def build_object_label(annot_path: Path, n_frames: int,
+                       sr: int = SAMPLING_RATE_HZ) -> np.ndarray:
+    """Per-frame object index (top-15 + '_other' fallback as class 0)."""
+    label = np.zeros(n_frames, dtype=np.int8)
+    if not annot_path.exists():
+        return label
+    try:
+        ann = json.load(open(annot_path))
+    except Exception:
+        return label
+    segments = ann.get("segments", [])
+    if not segments:
+        return label
+    rec_sec = n_frames / sr
+    fmt = _detect_fmt(segments, rec_sec)
+    for s in segments:
+        obj = s.get("action_annotation", {}).get("object_name")
+        idx = OBJECT_TO_IDX.get(obj, 0)
+        if idx == 0:
+            continue  # leave as 0 ("_other"/background)
+        try:
+            a, b = s["timestamp"].split("-")
+            t0 = _parse_one(a, fmt); t1 = _parse_one(b, fmt)
+        except Exception:
+            continue
+        if t1 <= t0 or t1 > rec_sec * 1.10:
+            continue
+        i0 = max(0, int(round(t0 * sr)))
+        i1 = min(n_frames, int(round(t1 * sr)))
+        label[i0:i1] = idx
+    return label
+def build_lift_eligible_mask(annot_path: Path, n_frames: int,
+                             sr: int = SAMPLING_RATE_HZ) -> np.ndarray:
+    """Per-frame bool: True if frame is inside a segment that meets the
+    lifted-grasp criterion: verb ∈ LIFT_VERBS  OR  hand_type == 'both'.
+    Used by 3-class label_mode when require_lift_for_sustained=True."""
+    mask = np.zeros(n_frames, dtype=bool)
+    if not annot_path.exists():
+        return mask
+    try:
+        ann = json.load(open(annot_path))
+    except Exception:
+        return mask
+    segments = ann.get("segments", [])
+    if not segments:
+        return mask
+    rec_sec = n_frames / sr
+    fmt = _detect_fmt(segments, rec_sec)
+    for s in segments:
+        a = s.get("action_annotation", {})
+        verb = a.get("action_name")
+        hand = a.get("hand_type", "")
+        is_lift = (verb in LIFT_VERBS) or (hand == "both")
+        if not is_lift:
+            continue
+        try:
+            ts0, ts1 = s["timestamp"].split("-")
+            t0 = _parse_one(ts0, fmt); t1 = _parse_one(ts1, fmt)
+        except Exception:
+            continue
+        if t1 <= t0 or t1 > rec_sec * 1.10:
+            continue
+        i0 = max(0, int(round(t0 * sr)))
+        i1 = min(n_frames, int(round(t1 * sr)))
+        mask[i0:i1] = True
+    return mask
+def build_verb_label(annot_path: Path, n_frames: int,
+                     sr: int = SAMPLING_RATE_HZ) -> np.ndarray:
+    """Per-frame verb index (int8). Default (no segment) = 0 (background)."""
+    label = np.zeros(n_frames, dtype=np.int8)
+    if not annot_path.exists():
+        return label
+    try:
+        ann = json.load(open(annot_path))
+    except Exception:
+        return label
+    segments = ann.get("segments", [])
+    if not segments:
+        return label
+    rec_sec = n_frames / sr
+    fmt = _detect_fmt(segments, rec_sec)
+    for s in segments:
+        verb = s.get("action_annotation", {}).get("action_name")
+        v_idx = VERB_TO_IDX.get(verb, 0)        # unknown verb → background
+        if v_idx == 0:
+            continue
+        try:
+            a, b = s["timestamp"].split("-")
+            t0 = _parse_one(a, fmt); t1 = _parse_one(b, fmt)
+        except Exception:
+            continue
+        if t1 <= t0 or t1 > rec_sec * 1.10:
+            continue
+        i0 = max(0, int(round(t0 * sr)))
+        i1 = min(n_frames, int(round(t1 * sr)))
+        label[i0:i1] = v_idx
+    return label
+def build_grasp_mask(annot_path: Path, n_frames: int,
+                     sr: int = SAMPLING_RATE_HZ) -> np.ndarray:
+    """Return bool array of shape (n_frames,)."""
+    mask = np.zeros(n_frames, dtype=bool)
+    if not annot_path.exists():
+        return mask
+    try:
+        ann = json.load(open(annot_path))
+    except Exception:
+        return mask
+    segments = ann.get("segments", [])
+    if not segments:
+        return mask
+    rec_sec = n_frames / sr
+    fmt = _detect_fmt(segments, rec_sec)
+    for s in segments:
+        verb = s.get("action_annotation", {}).get("action_name")
+        if verb not in GRASP_VERBS:
+            continue
+        try:
+            a, b = s["timestamp"].split("-")
+            t0 = _parse_one(a, fmt); t1 = _parse_one(b, fmt)
+        except Exception:
+            continue
+        if t1 <= t0 or t1 > rec_sec * 1.10:
+            continue
+        i0 = max(0, int(round(t0 * sr)))
+        i1 = min(n_frames, int(round(t1 * sr)))
+        mask[i0:i1] = True
+    return mask
+class GraspStateDataset(Dataset):
+    """Predict a grasp-related label over a future window from past sensor signals.
+    One item is created per anchor frame. Its input is the T_obs frames before
+    the anchor for every input modality; its label summarises the T_fut frames
+    starting at the anchor, according to ``label_mode``:
+      * "binary"      - 2 classes: share of grasp-mask frames >= majority_threshold
+      * "three_class" - 3 classes: no-grasp / attempted / sustained (pressure contact)
+      * "verb"        - len(VERB_LIST) classes: most frequent verb index
+      * "object"      - len(OBJECT_TOP_LIST) classes: most frequent object index
+    Every item also carries an ``event_type`` (0..3, see EVENT_NAMES) derived from
+    pressure contact before/after the anchor. __getitem__ returns
+    ``(x_dict, label, event_type, meta)``.
+    """
+    def __init__(
+        self,
+        volunteers: Sequence[str],
+        input_modalities: Sequence[str],
+        t_obs_sec: float = 1.0,
+        t_fut_sec: float = 0.5,
+        anchor_stride_sec: float = 0.25,
+        downsample: int = 5,
+        dataset_dir: Path = DEFAULT_DATASET_DIR,
+        annot_dir: Path = DEFAULT_ANNOT_DIR,
+        contact_threshold_g: float = 5.0,        # legacy sum-threshold (kept for back-compat, unused if use_per_cell_contact=True)
+        per_cell_threshold_g: float = 10.0,      # per-cell threshold to declare a sensor cell "active"
+        min_active_cells: int = 3,               # need ≥ this many active cells to declare contact
+        use_per_cell_contact: bool = True,       # NEW default: use per-cell active-count for event_type
+        label_mode: str = "binary",              # "binary", "three_class", "verb", or "object"
+        sustained_threshold_sec: float = 0.3,    # (3-class only) min contiguous contact for "Sustained"
+        require_lift_for_sustained: bool = False,  # (3-class only) Class 2 also requires verb ∈ LIFT_VERBS
+        per_class_max: Optional[int] = None,
+        input_stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None,
+        expected_input_dims: Optional[Dict[str, int]] = None,
+        majority_threshold: float = 0.5,
+        rng_seed: int = 0,
+        log: bool = True,
+    ):
+        """Scan all recordings of ``volunteers`` and collect anchors.
+        Args not documented inline:
+            volunteers: volunteer directory names under ``dataset_dir``.
+            input_modalities: modalities returned as model input.
+            t_obs_sec / t_fut_sec: past / future window lengths in seconds.
+            anchor_stride_sec: spacing between consecutive anchors in seconds.
+            downsample: keep every ``downsample``-th frame.
+            per_class_max: if set, randomly cap each class pool to this size.
+            input_stats: per-modality (mean, std); computed from this
+                dataset's own items when None.
+            expected_input_dims: per-modality feature dims to enforce by
+                zero-padding or truncating columns.
+            majority_threshold: minimum share of grasp frames in the future
+                window for the annotation to count as "grasp".
+            rng_seed: seed of the generator used for per-class subsampling.
+            log: print a one-line summary when done.
+        Raises:
+            ValueError: unknown ``label_mode``.
+            RuntimeError: no anchor could be collected.
+        """
+        super().__init__()
+        self.input_modalities = list(input_modalities)
+        self.t_obs_sec = float(t_obs_sec)
+        self.t_fut_sec = float(t_fut_sec)
+        self.anchor_stride_sec = float(anchor_stride_sec)
+        self.downsample = int(downsample)
+        # Effective frame rate after downsampling.
+        self.sr = SAMPLING_RATE_HZ // self.downsample
+        self.dataset_dir = Path(dataset_dir)
+        self.annot_dir = Path(annot_dir)
+        self.contact_threshold_g = float(contact_threshold_g)
+        self.per_cell_threshold_g = float(per_cell_threshold_g)
+        self.min_active_cells = int(min_active_cells)
+        self.use_per_cell_contact = bool(use_per_cell_contact)
+        self.label_mode = str(label_mode)
+        if self.label_mode not in ("binary", "three_class", "verb", "object"):
+            raise ValueError(f"label_mode must be binary|three_class|verb|object, got {label_mode}")
+        if self.label_mode == "binary":
+            self.num_classes = 2
+        elif self.label_mode == "three_class":
+            self.num_classes = 3
+        elif self.label_mode == "verb":
+            self.num_classes = len(VERB_LIST)
+        else:  # object
+            self.num_classes = len(OBJECT_TOP_LIST)
+        self.sustained_threshold_sec = float(sustained_threshold_sec)
+        self.require_lift_for_sustained = bool(require_lift_for_sustained)
+        self.per_class_max = per_class_max
+        self.majority_threshold = float(majority_threshold)
+        # Window lengths in downsampled frames.
+        self.T_obs = int(round(self.t_obs_sec * self.sr))
+        self.T_fut = int(round(self.t_fut_sec * self.sr))
+        self._items: List[dict] = []
+        self._modality_dims: Dict[str, int] = dict(expected_input_dims) if expected_input_dims else {}
+        rng = np.random.default_rng(rng_seed)
+        # Load pressure even if not in inputs, for event_type stratification.
+        load_mods = list(dict.fromkeys(list(self.input_modalities) + ["pressure"]))
+        # Per-class anchor pool
+        pools: Dict[int, List[dict]] = {c: [] for c in range(self.num_classes)}
+        sustained_thresh_frames = int(round(self.sustained_threshold_sec * self.sr))
+        for vol in volunteers:
+            vol_dir = self.dataset_dir / vol
+            if not vol_dir.is_dir():
+                continue
+            for scenario_dir in sorted(vol_dir.glob("s*")):
+                if not scenario_dir.is_dir():
+                    continue
+                scene = scenario_dir.name
+                annot_path = self.annot_dir / vol / f"{scene}.json"
+                if not annot_path.exists():
+                    continue
+                # Best-effort: a recording that fails to load is skipped silently.
+                try:
+                    sensors_all = _load_recording_sensors(
+                        scenario_dir, vol, scene, load_mods
+                    )
+                except Exception:
+                    continue
+                if sensors_all is None or any(a is None for a in sensors_all.values()):
+                    continue
+                pressure_full = sensors_all["pressure"]                  # (T, 50)
+                input_arrs = {m: sensors_all[m] for m in self.input_modalities}
+                # Pad/truncate columns in place (or register the dim on first sight).
+                for m, arr in input_arrs.items():
+                    self._enforce_dim(input_arrs, m, arr, self._modality_dims)
+                T_avail = min(a.shape[0] for a in input_arrs.values())
+                T_avail = min(T_avail, pressure_full.shape[0])
+                # Too short to hold even one past+future window.
+                if T_avail < (self.T_obs + self.T_fut) * self.downsample:
+                    continue
+                # Build grasp mask at 100 Hz, then downsample.
+                mask_full = build_grasp_mask(annot_path, T_avail,
+                                             sr=SAMPLING_RATE_HZ)
+                if self.label_mode == "verb":
+                    verb_full = build_verb_label(annot_path, T_avail, sr=SAMPLING_RATE_HZ)
+                    verb_ds   = verb_full[:T_avail:self.downsample]
+                else:
+                    verb_ds = None
+                if self.label_mode == "object":
+                    obj_full = build_object_label(annot_path, T_avail, sr=SAMPLING_RATE_HZ)
+                    obj_ds   = obj_full[:T_avail:self.downsample]
+                else:
+                    obj_ds = None
+                if self.label_mode == "three_class" and self.require_lift_for_sustained:
+                    lift_full = build_lift_eligible_mask(annot_path, T_avail, sr=SAMPLING_RATE_HZ)
+                    lift_eligible_ds = lift_full[:T_avail:self.downsample]
+                else:
+                    lift_eligible_ds = None
+                # Plain decimation (every downsample-th frame, no filtering).
+                input_ds = {m: arr[:T_avail:self.downsample] for m, arr in input_arrs.items()}
+                pressure_ds = pressure_full[:T_avail:self.downsample]
+                mask_ds = mask_full[:T_avail:self.downsample]
+                T_ds = mask_ds.shape[0]
+                if self.use_per_cell_contact:
+                    # n_active per frame: count cells with value > per_cell_threshold_g
+                    n_active = (pressure_ds > self.per_cell_threshold_g).sum(axis=1)
+                    contact_frame = n_active >= self.min_active_cells
+                else:
+                    pressure_sum = pressure_ds.sum(axis=1)
+                    contact_frame = pressure_sum > self.contact_threshold_g
+                stride = max(1, int(round(self.anchor_stride_sec * self.sr)))
+                first_anchor = self.T_obs
+                last_anchor = T_ds - self.T_fut
+                # NOTE(review): when last_anchor == first_anchor exactly one anchor
+                # would still fit, but the recording is skipped — confirm intended.
+                if last_anchor <= first_anchor:
+                    continue
+                for anchor in range(first_anchor, last_anchor + 1, stride):
+                    fut_mask = mask_ds[anchor:anchor + self.T_fut]
+                    if fut_mask.shape[0] != self.T_fut:
+                        continue
+                    annotation_is_grasp = fut_mask.mean() >= self.majority_threshold
+                    if self.label_mode == "binary":
+                        label = int(annotation_is_grasp)
+                    elif self.label_mode == "three_class":
+                        if not annotation_is_grasp:
+                            label = 0  # NoGrasp
+                        else:
+                            # longest contiguous run of contact in future window
+                            fut_contact = contact_frame[anchor:anchor + self.T_fut]
+                            longest = 0; cur = 0
+                            for v in fut_contact:
+                                if v: cur += 1; longest = max(longest, cur)
+                                else: cur = 0
+                            is_sustained = longest >= sustained_thresh_frames
+                            if is_sustained and self.require_lift_for_sustained:
+                                # Demote to Class 1 unless majority of future window is in
+                                # a "lift-eligible" segment (verb ∈ LIFT_VERBS or hand=both).
+                                fut_lift = lift_eligible_ds[anchor:anchor + self.T_fut]
+                                if fut_lift.mean() < 0.5:
+                                    is_sustained = False
+                            label = 2 if is_sustained else 1
+                    elif self.label_mode == "verb":
+                        # Most frequent verb index in the future window (ties -> lowest index).
+                        fut_v = verb_ds[anchor:anchor + self.T_fut]
+                        counts = np.bincount(fut_v, minlength=self.num_classes)
+                        label = int(np.argmax(counts))
+                    else:  # object — majority object in future window
+                        fut_o = obj_ds[anchor:anchor + self.T_fut]
+                        counts = np.bincount(fut_o, minlength=self.num_classes)
+                        label = int(np.argmax(counts))
+                    # event_type for stratification (4-class transition taxonomy)
+                    past_high = contact_frame[anchor - self.T_obs:anchor].mean() > 0.5
+                    fut_high  = contact_frame[anchor:anchor + self.T_fut].mean() > 0.5
+                    if not past_high and not fut_high: et = 0
+                    elif not past_high and fut_high:   et = 1
+                    elif past_high and fut_high:       et = 2
+                    else:                              et = 3
+                    past_slice = {m: arr[anchor - self.T_obs:anchor]
+                                  for m, arr in input_ds.items()}
+                    if any(w.shape[0] != self.T_obs for w in past_slice.values()):
+                        continue
+                    item = {
+                        "x": past_slice,
+                        "label": label,
+                        "event_type": et,
+                        "meta": {"vol": vol, "scene": scene, "anchor_idx": int(anchor)},
+                    }
+                    pools[label].append(item)
+        # Balance classes if requested (cap larger pool to per_class_max)
+        if self.per_class_max is not None:
+            for c, pool in pools.items():
+                if len(pool) > self.per_class_max:
+                    idx = rng.choice(len(pool), size=self.per_class_max, replace=False)
+                    # sorted() keeps the surviving items in their original order.
+                    pools[c] = [pool[i] for i in sorted(idx)]
+        # Items end up grouped by class index, in pool order.
+        self._items = [it for c in range(self.num_classes) for it in pools[c]]
+        if not self._items:
+            raise RuntimeError("GraspStateDataset: collected 0 anchors.")
+        # Z-score inputs
+        if input_stats is None:
+            input_stats = self._compute_input_stats()
+        self._input_stats = input_stats
+        self._apply_input_stats(input_stats)
+        if log:
+            if self.label_mode == "binary":
+                class_names = CLASS_NAMES_BINARY
+            elif self.label_mode == "three_class":
+                class_names = CLASS_NAMES_THREE
+            elif self.label_mode == "verb":
+                class_names = {i: v for i, v in enumerate(VERB_LIST)}
+            else:  # object
+                class_names = {i: v for i, v in enumerate(OBJECT_TOP_LIST)}
+            counts_class = {class_names[c]: sum(1 for it in self._items if it["label"] == c)
+                            for c in range(self.num_classes)}
+            counts_event = {EVENT_NAMES[k]: sum(1 for it in self._items if it["event_type"] == k)
+                            for k in (0, 1, 2, 3)}
+            print(f"[GraspStateDataset] vols={len(volunteers)} "
+                  f"inputs={self.input_modalities} "
+                  f"anchors={len(self._items)} class={counts_class} "
+                  f"event={counts_event} "
+                  f"T_obs={self.T_obs} T_fut={self.T_fut} sr={self.sr}Hz "
+                  f"input_dims={self._modality_dims}", flush=True)
+    @staticmethod
+    def _enforce_dim(arrs, m, arr, dim_dict):
+        """Make modality ``m`` match its registered feature dimension.
+        If ``dim_dict`` already has a dim for ``m``, ``arrs[m]`` is replaced by
+        ``arr`` zero-padded or truncated along axis 1 to that dim; otherwise
+        ``arr.shape[1]`` is registered in ``dim_dict`` as the dim for ``m``.
+        """
+        if m in dim_dict:
+            tgt = dim_dict[m]
+            if arr.shape[1] != tgt:
+                if arr.shape[1] < tgt:
+                    pad = np.zeros((arr.shape[0], tgt - arr.shape[1]), dtype=np.float32)
+                    arrs[m] = np.concatenate([arr, pad], axis=1)
+                else:
+                    arrs[m] = arr[:, :tgt]
+        else:
+            dim_dict[m] = arr.shape[1]
+    def _compute_input_stats(self):
+        """Return per-modality (mean, std) float32 arrays over all stored windows."""
+        accs = {m: [] for m in self._modality_dims}
+        for it in self._items:
+            for m, w in it["x"].items():
+                accs[m].append(w)
+        out = {}
+        for m, ws in accs.items():
+            cat = np.concatenate(ws, axis=0)
+            mu = cat.mean(axis=0).astype(np.float32)
+            # Std below 1e-6 is replaced by 1.0 to avoid dividing by ~0.
+            sd = cat.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd)
+            out[m] = (mu, sd.astype(np.float32))
+        return out
+    def _apply_input_stats(self, stats):
+        """Z-score each stored window in place; modalities absent from ``stats`` are left as-is."""
+        for it in self._items:
+            for m, w in it["x"].items():
+                if m in stats:
+                    mu, sd = stats[m]
+                    it["x"][m] = ((w - mu) / sd).astype(np.float32)
+    # Number of stored items.
+    def __len__(self): return len(self._items)
+    def __getitem__(self, idx):
+        """Return ``(x, label, event_type, meta)`` for item ``idx``; ``x`` maps modality -> tensor."""
+        it = self._items[idx]
+        x = {m: torch.from_numpy(np.ascontiguousarray(w)) for m, w in it["x"].items()}
+        label = int(it["label"])
+        et = int(it["event_type"])
+        return x, label, et, it["meta"]
+    # Copy of the modality -> feature-dimension mapping.
+    @property
+    def modality_dims(self): return dict(self._modality_dims)
+def collate_grasp_state(batch):
+    xs, labels, ets, metas = zip(*batch)
+    mods = list(xs[0].keys())
+    x_out = {m: torch.stack([x[m] for x in xs], dim=0) for m in mods}
+    y_out = torch.tensor(labels, dtype=torch.long)
+    et_out = torch.tensor(ets, dtype=torch.long)
+    return x_out, y_out, et_out, list(metas)
+def build_grasp_train_test(
+    input_modalities,
+    t_obs_sec=1.0, t_fut_sec=0.5, anchor_stride_sec=0.25,
+    downsample=5,
+    dataset_dir=DEFAULT_DATASET_DIR, annot_dir=DEFAULT_ANNOT_DIR,
+    contact_threshold_g=5.0, per_class_max=None,
+    label_mode="binary", sustained_threshold_sec=0.3,
+    require_lift_for_sustained=False,
+    rng_seed=0,
+    train_vols=None, test_vols=None,
+):
+    if train_vols is None: train_vols = TRAIN_VOLS_V3
+    if test_vols is None:  test_vols  = TEST_VOLS_V3
+    train = GraspStateDataset(
+        train_vols, input_modalities=input_modalities,
+        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec,
+        anchor_stride_sec=anchor_stride_sec, downsample=downsample,
+        dataset_dir=dataset_dir, annot_dir=annot_dir,
+        contact_threshold_g=contact_threshold_g, per_class_max=per_class_max,
+        label_mode=label_mode, sustained_threshold_sec=sustained_threshold_sec,
+        require_lift_for_sustained=require_lift_for_sustained,
+        rng_seed=rng_seed, log=True,
+    )
+    test = GraspStateDataset(
+        test_vols, input_modalities=input_modalities,
+        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec,
+        anchor_stride_sec=anchor_stride_sec, downsample=downsample,
+        dataset_dir=dataset_dir, annot_dir=annot_dir,
+        contact_threshold_g=contact_threshold_g, per_class_max=None,  # don't cap test
+        label_mode=label_mode, sustained_threshold_sec=sustained_threshold_sec,
+        require_lift_for_sustained=require_lift_for_sustained,
+        input_stats=train._input_stats,
+        expected_input_dims=train._modality_dims,
+        rng_seed=rng_seed + 1, log=True,
+    )
+    return train, test
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--input_modalities", default="emg,imu,mocap")
+    ap.add_argument("--t_obs", type=float, default=1.0)
+    ap.add_argument("--t_fut", type=float, default=0.5)
+    args = ap.parse_args()
+    tr, te = build_grasp_train_test(
+        input_modalities=args.input_modalities.split(","),
+        t_obs_sec=args.t_obs, t_fut_sec=args.t_fut,
+    )
+    x, y, et, meta = tr[0]
+    print(f"sample: x={ {m: tuple(v.shape) for m,v in x.items()} } y={y} et={et}")

experiments/data/dataset_seqpred.py ADDED Viewed

	@@ -0,0 +1,533 @@

+"""
+Segment-to-Next-Segment Triplet Prediction dataset (T10).
+For every annotated action segment k in every recording:
+    anchor_t      = start_time(segment_k) - T_fut      (seconds)
+    observation   = sensor frames in [anchor_t - T_obs, anchor_t]
+    target        = triplet labels of segment_k: (verb_fine, verb_composite,
+                                                  noun, hand)
+Segments whose observation window would spill before t=0 of the recording
+are skipped (no left-padding), so we never mix noise with real sensor data.
+Strategy A is enforced in taxonomy.classify_segment(): segments whose noun is
+not in the kept set (<50 occurrences) are dropped entirely.
+Per-modality tensors are returned as a dict so downstream models can either
+concat them (single-flow baselines) or keep them separate (our cross-modal
+fusion model). A float mask is returned alongside the sensor tensor so
+variable-length obs windows can be padded within a batch.
+"""
+from __future__ import annotations
+# pandas must be imported BEFORE torch/numpy to avoid a GLIBCXX load-order bug
+# on this cluster.
+import pandas as pd
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+# Make sibling modules importable from either (a) the neurips26 root, or
+# (b) the frozen row/code/ folder (populated by setup_row.sh).
+_THIS = Path(__file__).resolve()
+sys.path.insert(0, str(_THIS.parent))         # code/ itself
+sys.path.insert(0, str(_THIS.parent.parent))  # neurips26/
+try:
+    from data.dataset import (  # noqa: E402
+        MODALITY_FILES, load_modality_array,
+    )
+    from experiments.taxonomy import (  # noqa: E402
+        classify_segment, NOUN, NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN,
+        NUM_HAND,
+    )
+except ModuleNotFoundError:
+    from dataset import (  # noqa: E402
+        MODALITY_FILES, load_modality_array,
+    )
+    from taxonomy import (  # noqa: E402
+        classify_segment, NOUN, NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN,
+        NUM_HAND,
+    )
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# Hard-code the dataset and annotation paths. The frozen row/code/ folders sit
+# at arbitrary depths under the repo, so relative-to-__file__ discovery is
+# unreliable. An env override is available for e.g. running on a mirror.
+REPO = Path(os.environ.get(
+    "DAILYACT_REPO", "${PULSE_ROOT}"
+))
+DEFAULT_DATASET_DIR = REPO / "aligned_gy"
+DEFAULT_ANNOT_DIR   = REPO / "annotations_v3"
+SAMPLING_RATE_HZ = 100
+# 5x downsample -> 20 Hz. Matches the existing pipeline in dataset.py.
+DEFAULT_DOWNSAMPLE = 5
+VALID_MODALITIES = ("mocap", "emg", "eyetrack", "imu", "pressure")
+# Fixed subject-independent split. Hand-picked 5 test volunteers with full
+# 8-scene coverage, spread across the ID range. Any volunteer not listed
+# below but annotated in v3 is assumed to be train data (so the lists stay
+# stable as more volunteers get annotated).
+TEST_VOLS_V3  = ["v14", "v30", "v34", "v38", "v41"]
+TRAIN_VOLS_V3 = [
+    "v1",  "v2",  "v3",  "v4",  "v5",  "v6",  "v7",  "v8",  "v9",  "v10",
+    "v11", "v12", "v13",        "v15", "v16", "v17", "v18", "v19", "v20",
+    "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+    "v31", "v32", "v33",        "v35", "v36", "v37",        "v39", "v40",
+]
+assert set(TRAIN_VOLS_V3).isdisjoint(TEST_VOLS_V3), "Split must be disjoint"
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _parse_ts(ts: str) -> float:
+    """Parse 'HH:MM:SS' or 'MM:SS' (or 'M:S') into seconds."""
+    parts = ts.strip().split(":")
+    try:
+        if len(parts) == 2:
+            return float(parts[0]) * 60 + float(parts[1])
+        if len(parts) == 3:
+            return float(parts[0]) * 3600 + float(parts[1]) * 60 + float(parts[2])
+    except ValueError:
+        return 0.0
+    return 0.0
+def parse_ts_range(ts_range: str) -> Tuple[float, float]:
+    """Parse 'MM:SS-MM:SS' or 'HH:MM:SS-HH:MM:SS' into (start_sec, end_sec)."""
+    if "-" not in ts_range:
+        return 0.0, 0.0
+    a, b = ts_range.split("-", 1)
+    return _parse_ts(a), _parse_ts(b)
+def _load_recording_sensors(
+    scenario_dir: Path, vol: str, scenario: str,
+    modalities: Sequence[str],
+) -> Optional[Dict[str, np.ndarray]]:
+    """Load each requested modality as a (T, F_mod) float32 array at 100 Hz.
+    Returns None if any requested modality is missing or corrupted."""
+    out: Dict[str, np.ndarray] = {}
+    for mod in modalities:
+        if mod == "mocap":
+            fp = scenario_dir / f"aligned_{vol}{scenario}_s_Q.tsv"
+        else:
+            fp = scenario_dir / MODALITY_FILES[mod]
+        if not fp.exists():
+            return None
+        arr = load_modality_array(str(fp), mod)
+        if arr is None:
+            return None
+        out[mod] = arr.astype(np.float32)
+    # Align lengths across modalities (take min); all start at sensor t=0.
+    T = min(a.shape[0] for a in out.values())
+    for m in out:
+        out[m] = out[m][:T]
+    return out
+def _load_annotations(annot_path: Path) -> List[dict]:
+    with open(annot_path) as f:
+        d = json.load(f)
+    return d.get("segments", [])
+# ---------------------------------------------------------------------------
+# Dataset
+# ---------------------------------------------------------------------------
+class TripletSeqPredDataset(Dataset):
+    """One sample per (annotated segment, recording) pair.
+    Every window is sliced, decimated and z-scored inside __init__ and kept
+    in memory; __getitem__ only wraps the stored arrays in tensors.
+    __getitem__ returns a 4-tuple (x, y, meta, prev):
+        x:     dict {mod_name: FloatTensor(T, F_mod)}; in recognition mode T
+               follows the segment length, so it differs between samples
+        y:     dict {'verb_fine': int, 'verb_composite': int,
+                     'noun': int, 'hand': int}
+        meta:  dict {'vol', 'scene', 'seg_idx'} plus 'anchor_sec' in
+               anticipation mode or 'start_sec' / 'end_sec' in recognition mode
+        prev:  dict {'verb_composite': int, 'noun': int} holding the labels of
+               the previous kept segment of the same recording, or
+               NUM_VERB_COMPOSITE / NUM_NOUN when there is none yet
+    """
+    def __init__(
+        self,
+        volunteers: Sequence[str],
+        modalities: Sequence[str] = ("imu", "mocap", "emg", "eyetrack", "pressure"),
+        t_obs_sec: float = 8.0,
+        t_fut_sec: float = 2.0,
+        downsample: int = DEFAULT_DOWNSAMPLE,
+        dataset_dir: Path = DEFAULT_DATASET_DIR,
+        annot_dir: Path = DEFAULT_ANNOT_DIR,
+        stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None,
+        min_seg_duration_sec: float = 0.4,
+        log: bool = True,
+        mode: str = "recognition",
+    ):
+        """Scan the recordings of `volunteers` and build all samples eagerly.
+        Args:
+            volunteers: volunteer directory names looked up under dataset_dir;
+                names without a directory are skipped silently.
+            modalities: names from VALID_MODALITIES to load for every sample.
+            t_obs_sec: observation-window length in seconds (anticipation mode).
+            t_fut_sec: gap in seconds between the end of the observation
+                window and the segment start (anticipation mode).
+            downsample: keep every `downsample`-th frame of each window.
+            dataset_dir: root holding <vol>/<scene>/ sensor directories.
+            annot_dir: root holding <vol>/<scene>.json annotation files.
+            stats: per-modality (mean, std) pairs to normalise with; when None
+                they are computed from the samples collected here. The means
+                are indexed as mu.shape[1], i.e. 2-D arrays as produced by
+                _compute_stats.
+            min_seg_duration_sec: segments shorter than this are dropped.
+            log: print a one-off summary of the scan.
+            mode: 'recognition' (window = the segment itself) or
+                'anticipation' (window ends t_fut_sec before the segment).
+        Raises:
+            ValueError: unknown modality or mode.
+            RuntimeError: no sample survived the filters.
+        """
+        for m in modalities:
+            if m not in VALID_MODALITIES:
+                raise ValueError(f"Unknown modality: {m}")
+        if mode not in ("recognition", "anticipation"):
+            raise ValueError(f"mode must be 'recognition' or 'anticipation', got {mode!r}")
+        self.modalities = tuple(modalities)
+        self.t_obs_sec = float(t_obs_sec)
+        self.t_fut_sec = float(t_fut_sec)
+        self.downsample = int(downsample)
+        self.dataset_dir = Path(dataset_dir)
+        self.annot_dir   = Path(annot_dir)
+        self.mode = mode
+        # Effective obs-window length in frames at the post-downsample rate.
+        sr = SAMPLING_RATE_HZ // self.downsample       # 20 Hz
+        self.T_frames = int(round(self.t_obs_sec * sr))  # used only for anticipation
+        self._sr_down = sr
+        self._items: List[dict] = []
+        self._modality_dims: Dict[str, int] = {}
+        # If re-using training-set stats, force each modality's feature
+        # layout to match so we never apply a (14,)-mean to (24,)-data.
+        if stats is not None:
+            for m, (mu, _) in stats.items():
+                self._modality_dims[m] = mu.shape[1]
+        # Bookkeeping for the summary line; exposed afterwards as
+        # self.stats_counts.
+        stats_counts = {
+            "recordings_scanned":    0,
+            "recordings_used":       0,
+            "segments_seen":         0,
+            "seg_dropped_label":     0,  # Strategy A + invalid verb/hand
+            "seg_dropped_too_early": 0,  # obs window before t=0
+            "seg_dropped_short":     0,
+            "seg_kept":              0,
+        }
+        for vol in volunteers:
+            vol_dir = self.dataset_dir / vol
+            if not vol_dir.is_dir():
+                continue
+            for scenario_dir in sorted(vol_dir.glob("s*")):
+                if not scenario_dir.is_dir():
+                    continue
+                scene = scenario_dir.name
+                # Only the eight scenes s1..s8 are considered.
+                if scene not in {f"s{i}" for i in range(1, 9)}:
+                    continue
+                annot_path = self.annot_dir / vol / f"{scene}.json"
+                if not annot_path.exists():
+                    continue
+                stats_counts["recordings_scanned"] += 1
+                sensors = _load_recording_sensors(scenario_dir, vol, scene,
+                                                  self.modalities)
+                # None means a modality file was missing or unreadable.
+                if sensors is None:
+                    continue
+                # Store / validate per-modality dim
+                for m, arr in sensors.items():
+                    if m in self._modality_dims:
+                        if arr.shape[1] != self._modality_dims[m]:
+                            # Pad or truncate to match the first seen dim.
+                            target = self._modality_dims[m]
+                            if arr.shape[1] < target:
+                                pad = np.zeros((arr.shape[0], target - arr.shape[1]),
+                                               dtype=np.float32)
+                                sensors[m] = np.concatenate([arr, pad], axis=1)
+                            else:
+                                sensors[m] = arr[:, :target]
+                    else:
+                        self._modality_dims[m] = arr.shape[1]
+                segs = _load_annotations(annot_path)
+                rec_used = False
+                # BOS index for first segment in a recording (or after dropped segs).
+                BOS_VC = NUM_VERB_COMPOSITE   # = 6
+                BOS_N  = NUM_NOUN              # = 34
+                prev_vc, prev_n = BOS_VC, BOS_N
+                for seg_idx, seg in enumerate(segs):
+                    stats_counts["segments_seen"] += 1
+                    a = seg.get("action_annotation", {})
+                    labels = classify_segment(a)
+                    if labels is None:
+                        stats_counts["seg_dropped_label"] += 1
+                        # do not advance prev (skipped segment doesn't update context)
+                        continue
+                    start_sec, end_sec = parse_ts_range(seg.get("timestamp", ""))
+                    if end_sec - start_sec < min_seg_duration_sec:
+                        stats_counts["seg_dropped_short"] += 1
+                        continue
+                    if self.mode == "anticipation":
+                        # Window is [anchor - t_obs, anchor], with the anchor
+                        # placed t_fut seconds before the segment starts.
+                        anchor_sec = start_sec - self.t_fut_sec
+                        obs_start_sec = anchor_sec - self.t_obs_sec
+                        if obs_start_sec < 0:
+                            stats_counts["seg_dropped_too_early"] += 1
+                            continue
+                        i0 = int(round(obs_start_sec * SAMPLING_RATE_HZ))
+                        i1 = int(round(anchor_sec * SAMPLING_RATE_HZ))
+                        meta_extra = {"anchor_sec": anchor_sec}
+                    else:  # recognition
+                        # Use the segment's own [start, end] as the input window.
+                        i0 = int(round(start_sec * SAMPLING_RATE_HZ))
+                        i1 = int(round(end_sec * SAMPLING_RATE_HZ))
+                        meta_extra = {"start_sec": start_sec, "end_sec": end_sec}
+                    T_avail = min(a.shape[0] for a in sensors.values())
+                    # NOTE(review): a window running past the end of the
+                    # recording is counted under "seg_dropped_too_early" even
+                    # though it is too late rather than too early.
+                    if i1 > T_avail:
+                        stats_counts["seg_dropped_too_early"] += 1
+                        continue
+                    if i0 < 0:
+                        i0 = 0  # safety; recognition mode shouldn't hit this
+                    window: Dict[str, np.ndarray] = {}
+                    for m, arr in sensors.items():
+                        w = arr[i0:i1]
+                        # Downsample: decimate every `downsample`-th frame.
+                        w = w[::self.downsample]
+                        window[m] = w
+                    # Must have at least 4 post-downsample frames to be useful.
+                    min_T = min(w.shape[0] for w in window.values())
+                    if min_T < 4:
+                        stats_counts["seg_dropped_short"] += 1
+                        continue
+                    self._items.append({
+                        "x": window,
+                        "y": labels,
+                        "prev": {"verb_composite": prev_vc, "noun": prev_n},
+                        "meta": {
+                            "vol": vol, "scene": scene,
+                            "seg_idx": seg_idx, **meta_extra,
+                        },
+                    })
+                    stats_counts["seg_kept"] += 1
+                    # Update context for next kept segment in this recording.
+                    prev_vc = labels["verb_composite"]
+                    prev_n  = labels["noun"]
+                    rec_used = True
+                if rec_used:
+                    stats_counts["recordings_used"] += 1
+        if len(self._items) == 0:
+            raise RuntimeError(
+                "No samples collected. Check annot_dir, modalities, t_obs, t_fut."
+            )
+        # Per-modality z-score normalization using training-set stats.
+        if stats is None:
+            stats = self._compute_stats()
+        self._stats = stats
+        self._apply_stats(stats)
+        if log:
+            print(f"[TripletSeqPredDataset:{self.mode}] "
+                  f"vols={len(volunteers)} "
+                  f"recs_scan={stats_counts['recordings_scanned']} "
+                  f"recs_used={stats_counts['recordings_used']} "
+                  f"segs_seen={stats_counts['segments_seen']} "
+                  f"kept={stats_counts['seg_kept']} "
+                  f"drop_label={stats_counts['seg_dropped_label']} "
+                  f"drop_early={stats_counts['seg_dropped_too_early']} "
+                  f"drop_short={stats_counts['seg_dropped_short']}",
+                  flush=True)
+            print(f"  modality_dims={self._modality_dims} "
+                  f"T_frames={self.T_frames} sr_down={sr}Hz",
+                  flush=True)
+        self.stats_counts = stats_counts
+    # ----- stats (per-modality mean/std on training split) -----
+    def _compute_stats(self) -> Dict[str, Tuple[np.ndarray, np.ndarray]]:
+        """Compute per-modality (mean, std) over all frames of all samples.
+        Windows are concatenated along the time axis and reduced in float64;
+        both outputs keep a leading axis of size 1 and are returned as
+        float32. Standard deviations below 1e-8 are replaced by 1.0 so
+        constant channels do not divide by (almost) zero.
+        """
+        acc: Dict[str, List[np.ndarray]] = {m: [] for m in self.modalities}
+        for it in self._items:
+            for m, w in it["x"].items():
+                acc[m].append(w.astype(np.float64))
+        out: Dict[str, Tuple[np.ndarray, np.ndarray]] = {}
+        for m, arrs in acc.items():
+            cat = np.concatenate(arrs, axis=0)
+            mu  = cat.mean(axis=0, keepdims=True)
+            sd  = cat.std(axis=0, keepdims=True)
+            sd[sd < 1e-8] = 1.0
+            out[m] = (mu.astype(np.float32), sd.astype(np.float32))
+        return out
+    def _apply_stats(self, stats: Dict[str, Tuple[np.ndarray, np.ndarray]]) -> None:
+        """Z-score every stored window in place with the given (mean, std).
+        NaN and +/-inf values left after normalisation are replaced by 0.
+        `stats` must contain an entry for every modality of every sample.
+        """
+        for it in self._items:
+            for m, w in it["x"].items():
+                mu, sd = stats[m]
+                z = (w.astype(np.float32) - mu) / sd
+                z = np.nan_to_num(z, nan=0.0, posinf=0.0, neginf=0.0)
+                it["x"][m] = z.astype(np.float32)
+    def get_stats(self) -> Dict[str, Tuple[np.ndarray, np.ndarray]]:
+        """Return the (mean, std) pairs used to normalise this dataset."""
+        return self._stats
+    # ----- Dataset protocol -----
+    def __len__(self) -> int:
+        """Number of kept segments."""
+        return len(self._items)
+    def __getitem__(self, idx: int):
+        """Return (x, y, meta, prev) for sample `idx`.
+        x holds one tensor per modality created with torch.from_numpy from
+        the stored arrays; y, meta and prev are the stored dicts themselves,
+        not copies.
+        """
+        it = self._items[idx]
+        x = {m: torch.from_numpy(w) for m, w in it["x"].items()}
+        y = it["y"]
+        meta = it["meta"]
+        # Items without a stored context fall back to the BOS sentinels.
+        prev = it.get("prev", {"verb_composite": NUM_VERB_COMPOSITE, "noun": NUM_NOUN})
+        return x, y, meta, prev
+    # ----- convenience -----
+    @property
+    def modality_dims(self) -> Dict[str, int]:
+        """Copy of the per-modality feature widths."""
+        return dict(self._modality_dims)
+    @property
+    def total_feat_dim(self) -> int:
+        """Sum of all per-modality feature widths."""
+        return sum(self._modality_dims.values())
+    def class_counts(self) -> Dict[str, np.ndarray]:
+        """Count samples per class for each of the four label heads.
+        Returns one int64 histogram per head, sized by NUM_VERB_FINE,
+        NUM_VERB_COMPOSITE, NUM_NOUN and NUM_HAND respectively.
+        """
+        vf = np.zeros(NUM_VERB_FINE, dtype=np.int64)
+        vc = np.zeros(NUM_VERB_COMPOSITE, dtype=np.int64)
+        n  = np.zeros(NUM_NOUN, dtype=np.int64)
+        h  = np.zeros(NUM_HAND, dtype=np.int64)
+        for it in self._items:
+            y = it["y"]
+            vf[y["verb_fine"]] += 1
+            vc[y["verb_composite"]] += 1
+            n[y["noun"]] += 1
+            h[y["hand"]] += 1
+        return {"verb_fine": vf, "verb_composite": vc, "noun": n, "hand": h}
+# ---------------------------------------------------------------------------
+# Collate: pad each modality to the max T_frames in the batch
+# ---------------------------------------------------------------------------
+def collate_triplet(batch):
+    """Stack samples into batched tensors. Backward-compatible: accepts
+    samples of either (x, y, meta) or (x, y, meta, prev) form.
+    Returned:
+        x:      dict[mod] -> FloatTensor (B, T_max, F_mod)
+        mask:   BoolTensor (B, T_max)
+        lens:   LongTensor (B,)
+        y:      dict (each -> LongTensor (B,))
+        meta:   list of dicts
+        prev:   dict {'verb_composite': LongTensor (B,), 'noun': LongTensor (B,)}
+                values are class indices, with NUM_VERB_COMPOSITE / NUM_NOUN
+                used as a BOS sentinel for the first segment in a recording.
+    """
+    has_prev = len(batch[0]) >= 4
+    if has_prev:
+        xs, ys, metas, prevs = zip(*batch)
+    else:
+        xs, ys, metas = zip(*batch)
+        prevs = [{"verb_composite": NUM_VERB_COMPOSITE, "noun": NUM_NOUN} for _ in batch]
+    B = len(batch)
+    mods = list(xs[0].keys())
+    lens = torch.tensor([x[mods[0]].shape[0] for x in xs], dtype=torch.long)
+    T_max = int(lens.max().item())
+    x_out: Dict[str, torch.Tensor] = {}
+    for m in mods:
+        F = xs[0][m].shape[1]
+        padded = torch.zeros(B, T_max, F, dtype=torch.float32)
+        for i, x in enumerate(xs):
+            w = x[m]
+            padded[i, :w.shape[0]] = w
+        x_out[m] = padded
+    ar = torch.arange(T_max).unsqueeze(0)
+    mask = ar < lens.unsqueeze(1)
+    y_out = {
+        k: torch.tensor([y[k] for y in ys], dtype=torch.long)
+        for k in ("verb_fine", "verb_composite", "noun", "hand")
+    }
+    prev_out = {
+        "verb_composite": torch.tensor([p["verb_composite"] for p in prevs], dtype=torch.long),
+        "noun":           torch.tensor([p["noun"]           for p in prevs], dtype=torch.long),
+    }
+    return x_out, mask, lens, y_out, list(metas), prev_out
+# ---------------------------------------------------------------------------
+# Convenience: build paired train/test datasets with shared normalization
+# ---------------------------------------------------------------------------
+def build_train_test(
+    modalities: Sequence[str] = ("imu", "mocap", "emg", "eyetrack", "pressure"),
+    t_obs_sec: float = 8.0,
+    t_fut_sec: float = 2.0,
+    downsample: int = DEFAULT_DOWNSAMPLE,
+    dataset_dir: Path = DEFAULT_DATASET_DIR,
+    annot_dir: Path = DEFAULT_ANNOT_DIR,
+    mode: str = "recognition",
+) -> Tuple["TripletSeqPredDataset", "TripletSeqPredDataset"]:
+    train = TripletSeqPredDataset(
+        TRAIN_VOLS_V3, modalities=modalities,
+        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, downsample=downsample,
+        dataset_dir=dataset_dir, annot_dir=annot_dir, mode=mode,
+    )
+    test = TripletSeqPredDataset(
+        TEST_VOLS_V3, modalities=modalities,
+        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec, downsample=downsample,
+        dataset_dir=dataset_dir, annot_dir=annot_dir,
+        stats=train.get_stats(), mode=mode,
+    )
+    return train, test
+# ---------------------------------------------------------------------------
+# CLI: quick sanity check
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--modalities", type=str, default="imu,emg,eyetrack")
+    ap.add_argument("--t_obs", type=float, default=8.0)
+    ap.add_argument("--t_fut", type=float, default=2.0)
+    ap.add_argument("--smoke_n", type=int, default=3,
+                    help="Inspect first N samples per split")
+    args = ap.parse_args()
+    mods = args.modalities.split(",")
+    print(f"Building train/test with modalities={mods} "
+          f"t_obs={args.t_obs}s t_fut={args.t_fut}s ...")
+    train, test = build_train_test(
+        modalities=mods,
+        t_obs_sec=args.t_obs,
+        t_fut_sec=args.t_fut,
+    )
+    print(f"train: {len(train)} samples | test: {len(test)} samples")
+    for name, ds in [("train", train), ("test", test)]:
+        counts = ds.class_counts()
+        print(f"\n[{name}] class counts:")
+        print("  verb_fine:",      counts["verb_fine"].tolist())
+        print("  verb_composite:", counts["verb_composite"].tolist())
+        print("  noun (sum):",     int(counts["noun"].sum()),
+              "nonzero:", int((counts["noun"] > 0).sum()))
+        print("  hand:",           counts["hand"].tolist())
+        print(f"\n[{name}] first {args.smoke_n} samples:")
+        for i in range(min(args.smoke_n, len(ds))):
+            x, y, meta = ds[i]
+            shape_str = " ".join(f"{m}:{tuple(x[m].shape)}" for m in x)
+            print(f"  {i:3d} {meta['vol']}/{meta['scene']}#{meta['seg_idx']:3d} "
+                  f"anchor={meta['anchor_sec']:.2f}s  y={y}  {shape_str}")

experiments/data/dataset_signal_forecast.py ADDED Viewed

	@@ -0,0 +1,391 @@

+"""Frame-level future *signal* forecasting dataset (T8 v2).
+Task definition
+---------------
+At a sampled anchor t in a recording:
+  past   = sensor frames over [t - T_obs, t]                   ← input
+  future = target-modality frames over (t, t + T_fut]          ← regression target
+Unlike the v1 ForecastDataset (which targets per-frame verb-fine class), this
+predicts the raw *signal* values of one chosen target modality. This directly
+tests the Johansson 1984 / Monzée 2003 hypothesis that cutaneous force
+feedback drives sub-second motor planning at the *signal* level (motor
+commands / kinematics), not at the level of slow-changing semantic verbs.
+Anchor stratification (4 event types based on contact transitions)
+------------------------------------------------------------------
+For each candidate anchor, we compute pressure_sum on past and future windows
+and label it by the (past_majority_contact, future_majority_contact) pair:
+    type 0 = non-contact   (past low, future low)   — control: pressure ~ 0
+    type 1 = pre-contact   (past low, future high)  — pressure foretells onset
+    type 2 = steady-grip   (past high, future high) — sustained contact dynamics
+    type 3 = release       (past high, future low)  — letting-go dynamics
+Per-event-type counts are reported and (optionally) capped to balance.
+Evaluation is broken down per event type so we can see WHERE pressure helps.
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+THIS = Path(__file__).resolve()
+sys.path.insert(0, str(THIS.parent))
+sys.path.insert(0, str(THIS.parents[1]))
+try:
+    from experiments.dataset_seqpred import (
+        SAMPLING_RATE_HZ, _load_recording_sensors,
+        TRAIN_VOLS_V3, TEST_VOLS_V3,
+        DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR,
+    )
+except ModuleNotFoundError:
+    from dataset_seqpred import (
+        SAMPLING_RATE_HZ, _load_recording_sensors,
+        TRAIN_VOLS_V3, TEST_VOLS_V3,
+        DEFAULT_DATASET_DIR, DEFAULT_ANNOT_DIR,
+    )
+EVENT_NAMES = {0: "non-contact", 1: "pre-contact", 2: "steady-grip", 3: "release"}
+class SignalForecastDataset(Dataset):
+    """Predict future T_fut frames of `target_modality` from past T_obs of `input_modalities`.
+    Anchors are laid out on a regular stride over each downsampled recording
+    and tagged with one of the four EVENT_NAMES types from the summed
+    pressure before and after the anchor. All windows are materialised and
+    z-scored in __init__.
+    __getitem__ returns (x, y, y_last, event_type, meta), or
+    (x, y, y_last, fp, event_type, meta) when include_future_pressure is set.
+    """
+    def __init__(
+        self,
+        volunteers: Sequence[str],
+        input_modalities: Sequence[str],
+        target_modality: str,
+        t_obs_sec: float = 1.5,
+        t_fut_sec: float = 0.5,
+        anchor_stride_sec: float = 0.25,
+        downsample: int = 5,
+        dataset_dir: Path = DEFAULT_DATASET_DIR,
+        annot_dir: Path = DEFAULT_ANNOT_DIR,
+        contact_threshold_g: float = 5.0,
+        per_event_max: Optional[int] = None,
+        input_stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]] = None,
+        target_stats: Optional[Tuple[np.ndarray, np.ndarray]] = None,
+        future_pressure_stats: Optional[Tuple[np.ndarray, np.ndarray]] = None,
+        expected_input_dims: Optional[Dict[str, int]] = None,
+        expected_target_dim: Optional[int] = None,
+        include_future_pressure: bool = False,
+        rng_seed: int = 0,
+        log: bool = True,
+    ):
+        """Collect, balance and normalise all anchors for `volunteers`.
+        Args:
+            volunteers: volunteer directory names under dataset_dir; missing
+                directories are skipped.
+            input_modalities: modalities returned in x.
+            target_modality: modality whose future frames form y.
+            t_obs_sec / t_fut_sec: past / future window lengths in seconds.
+            anchor_stride_sec: spacing between consecutive anchors in seconds.
+            downsample: keep every `downsample`-th frame of each recording.
+            dataset_dir / annot_dir: data roots; a recording is used only if
+                <annot_dir>/<vol>/<scene>.json exists (the file itself is not
+                read here).
+            contact_threshold_g: summed-pressure level above which a frame
+                counts as contact (presumably grams, judging by the name --
+                TODO confirm).
+            per_event_max: optional cap on the number of anchors per event type.
+            input_stats / target_stats / future_pressure_stats: (mean, std) to
+                re-use; each is computed from this dataset when None.
+            expected_input_dims / expected_target_dim: feature widths to pad
+                or truncate to; when absent, the first recording seen fixes them.
+            include_future_pressure: also store the future pressure frames.
+            rng_seed: seed of the generator used for the per-event cap.
+            log: print a one-line summary.
+        Raises:
+            RuntimeError: no anchor was collected.
+        """
+        super().__init__()
+        self.input_modalities = list(input_modalities)
+        self.target_modality = str(target_modality)
+        self.t_obs_sec = float(t_obs_sec)
+        self.t_fut_sec = float(t_fut_sec)
+        self.anchor_stride_sec = float(anchor_stride_sec)
+        self.downsample = int(downsample)
+        # Frame rate after decimation.
+        self.sr = SAMPLING_RATE_HZ // self.downsample
+        self.dataset_dir = Path(dataset_dir)
+        self.annot_dir = Path(annot_dir)
+        self.contact_threshold_g = float(contact_threshold_g)
+        self.per_event_max = per_event_max
+        self.include_future_pressure = bool(include_future_pressure)
+        # Window lengths in downsampled frames.
+        self.T_obs = int(round(self.t_obs_sec * self.sr))
+        self.T_fut = int(round(self.t_fut_sec * self.sr))
+        self._items: List[dict] = []
+        self._modality_dims: Dict[str, int] = dict(expected_input_dims) if expected_input_dims else {}
+        # -1 marks "not known yet"; the first recording then sets it.
+        self._target_dim: int = int(expected_target_dim) if expected_target_dim else -1
+        rng = np.random.default_rng(rng_seed)
+        # Modalities to load: union of inputs + target + pressure (for filter)
+        load_mods = list(dict.fromkeys(
+            list(self.input_modalities) + [self.target_modality, "pressure"]
+        ))
+        # Per-event-type pool of candidate anchor records
+        pools: Dict[int, List[dict]] = {0: [], 1: [], 2: [], 3: []}
+        for vol in volunteers:
+            vol_dir = self.dataset_dir / vol
+            if not vol_dir.is_dir():
+                continue
+            for scenario_dir in sorted(vol_dir.glob("s*")):
+                if not scenario_dir.is_dir():
+                    continue
+                scene = scenario_dir.name
+                annot_path = self.annot_dir / vol / f"{scene}.json"
+                if not annot_path.exists():
+                    continue
+                # NOTE(review): any exception raised while loading silently
+                # drops the whole recording; nothing is logged.
+                try:
+                    sensors_all = _load_recording_sensors(
+                        scenario_dir, vol, scene, load_mods
+                    )
+                except Exception:
+                    continue
+                if sensors_all is None or any(a is None for a in sensors_all.values()):
+                    continue
+                pressure_full = sensors_all["pressure"]      # (T, 50)
+                target_full = sensors_all[self.target_modality]
+                input_arrs = {m: sensors_all[m] for m in self.input_modalities}
+                # Track input modality dims
+                for m, arr in input_arrs.items():
+                    self._enforce_dim(input_arrs, m, arr, self._modality_dims)
+                # Track target dim
+                if self._target_dim < 0:
+                    self._target_dim = target_full.shape[1]
+                elif target_full.shape[1] != self._target_dim:
+                    # Zero-pad or truncate the feature axis to the known width.
+                    if target_full.shape[1] < self._target_dim:
+                        pad = np.zeros((target_full.shape[0], self._target_dim - target_full.shape[1]),
+                                       dtype=np.float32)
+                        target_full = np.concatenate([target_full, pad], axis=1)
+                    else:
+                        target_full = target_full[:, :self._target_dim]
+                T_avail = min(a.shape[0] for a in input_arrs.values())
+                T_avail = min(T_avail, target_full.shape[0], pressure_full.shape[0])
+                # Too short to hold even one past + future window.
+                if T_avail < (self.T_obs + self.T_fut) * self.downsample:
+                    continue
+                # Downsample to 20 Hz
+                input_ds = {m: arr[:T_avail:self.downsample] for m, arr in input_arrs.items()}
+                target_ds = target_full[:T_avail:self.downsample]
+                pressure_ds = pressure_full[:T_avail:self.downsample]
+                T_ds = target_ds.shape[0]
+                pressure_sum = pressure_ds.sum(axis=1)        # (T_ds,)
+                stride = max(1, int(round(self.anchor_stride_sec * self.sr)))
+                first_anchor = self.T_obs
+                last_anchor = T_ds - self.T_fut
+                # NOTE(review): `<=` also skips a recording that has room for
+                # exactly one anchor (last_anchor == first_anchor).
+                if last_anchor <= first_anchor:
+                    continue
+                for anchor in range(first_anchor, last_anchor + 1, stride):
+                    past_p = pressure_sum[anchor - self.T_obs:anchor]
+                    fut_p = pressure_sum[anchor:anchor + self.T_fut]
+                    # "High" = more than half of the window's frames exceed
+                    # the contact threshold.
+                    past_high = (past_p > self.contact_threshold_g).mean() > 0.5
+                    fut_high = (fut_p > self.contact_threshold_g).mean() > 0.5
+                    # Event type as keyed in EVENT_NAMES.
+                    if not past_high and not fut_high:
+                        et = 0
+                    elif not past_high and fut_high:
+                        et = 1
+                    elif past_high and fut_high:
+                        et = 2
+                    else:
+                        et = 3
+                    past_slice = {m: arr[anchor - self.T_obs:anchor]
+                                  for m, arr in input_ds.items()}
+                    past_target_last = target_ds[anchor - 1].copy()         # (target_dim,)
+                    fut_target = target_ds[anchor:anchor + self.T_fut].copy()
+                    # Defensive length checks; incomplete windows are dropped.
+                    if any(w.shape[0] != self.T_obs for w in past_slice.values()):
+                        continue
+                    if fut_target.shape[0] != self.T_fut:
+                        continue
+                    item = {
+                        "x": past_slice,
+                        "y": fut_target,
+                        "y_last": past_target_last,                          # for persistence
+                        "event_type": int(et),
+                        "meta": {"vol": vol, "scene": scene, "anchor_idx": int(anchor)},
+                    }
+                    if self.include_future_pressure:
+                        fut_press = pressure_ds[anchor:anchor + self.T_fut].copy()
+                        if fut_press.shape[0] != self.T_fut:
+                            continue
+                        item["fp"] = fut_press                              # (T_fut, 50)
+                    pools[et].append(item)
+        # Cap per-event count if requested (uniform downsample for balance)
+        for et, pool in pools.items():
+            if self.per_event_max is not None and len(pool) > self.per_event_max:
+                idx = rng.choice(len(pool), size=self.per_event_max, replace=False)
+                # Sorting the drawn indices keeps the original anchor order.
+                pools[et] = [pool[i] for i in sorted(idx)]
+        # Items are stored grouped by event type, 0 through 3.
+        self._items = [it for et in (0, 1, 2, 3) for it in pools[et]]
+        if not self._items:
+            raise RuntimeError("SignalForecastDataset: collected 0 anchors.")
+        # Z-score inputs and target separately
+        if input_stats is None:
+            input_stats = self._compute_input_stats()
+        self._input_stats = input_stats
+        self._apply_input_stats(input_stats)
+        if target_stats is None:
+            target_stats = self._compute_target_stats()
+        self._target_stats = target_stats
+        self._apply_target_stats(target_stats)
+        if self.include_future_pressure:
+            if future_pressure_stats is None:
+                future_pressure_stats = self._compute_fp_stats()
+            self._fp_stats = future_pressure_stats
+            self._apply_fp_stats(future_pressure_stats)
+        else:
+            self._fp_stats = None
+        if log:
+            counts = {EVENT_NAMES[k]: sum(1 for it in self._items if it["event_type"] == k)
+                      for k in (0, 1, 2, 3)}
+            print(f"[SignalForecastDataset] vols={len(volunteers)} "
+                  f"target={self.target_modality} inputs={self.input_modalities} "
+                  f"anchors={len(self._items)} {counts} "
+                  f"T_obs={self.T_obs} T_fut={self.T_fut} sr={self.sr}Hz "
+                  f"input_dims={self._modality_dims} target_dim={self._target_dim}",
+                  flush=True)
+    @staticmethod
+    def _enforce_dim(arrs, m, arr, dim_dict):
+        """Make arrs[m] match the feature width recorded for modality m.
+        When dim_dict already has a width for m, `arr` is zero-padded or
+        truncated along axis 1 and written back to arrs[m]; otherwise the
+        width of `arr` is recorded in dim_dict. Both dicts are mutated in place.
+        """
+        if m in dim_dict:
+            target = dim_dict[m]
+            if arr.shape[1] != target:
+                if arr.shape[1] < target:
+                    pad = np.zeros((arr.shape[0], target - arr.shape[1]), dtype=np.float32)
+                    arrs[m] = np.concatenate([arr, pad], axis=1)
+                else:
+                    arrs[m] = arr[:, :target]
+        else:
+            dim_dict[m] = arr.shape[1]
+    def _compute_input_stats(self):
+        """Per-modality (mean, std) over all frames of all input windows.
+        Standard deviations below 1e-6 are replaced by 1.0. Every modality
+        listed in self._modality_dims must occur in the items, otherwise the
+        concatenation of an empty list fails.
+        """
+        accs = {m: [] for m in self._modality_dims}
+        for it in self._items:
+            for m, w in it["x"].items():
+                accs[m].append(w)
+        out = {}
+        for m, ws in accs.items():
+            cat = np.concatenate(ws, axis=0)
+            mu = cat.mean(axis=0).astype(np.float32)
+            sd = cat.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd)
+            out[m] = (mu, sd.astype(np.float32))
+        return out
+    def _apply_input_stats(self, stats):
+        """Z-score each input window in place; modalities absent from `stats` are left as they are."""
+        for it in self._items:
+            for m, w in it["x"].items():
+                if m in stats:
+                    mu, sd = stats[m]
+                    it["x"][m] = ((w - mu) / sd).astype(np.float32)
+    def _compute_target_stats(self):
+        """(mean, std) over all future target frames; std below 1e-6 becomes 1.0."""
+        ys = np.concatenate([it["y"] for it in self._items], axis=0)
+        mu = ys.mean(axis=0).astype(np.float32)
+        sd = ys.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd)
+        return (mu, sd.astype(np.float32))
+    def _apply_target_stats(self, stats):
+        """Z-score y and y_last in place with the same (mean, std)."""
+        mu, sd = stats
+        for it in self._items:
+            it["y"] = ((it["y"] - mu) / sd).astype(np.float32)
+            it["y_last"] = ((it["y_last"] - mu) / sd).astype(np.float32)
+    def _compute_fp_stats(self):
+        """(mean, std) over all stored future-pressure frames; std below 1e-6 becomes 1.0."""
+        fps = np.concatenate([it["fp"] for it in self._items], axis=0)
+        mu = fps.mean(axis=0).astype(np.float32)
+        sd = fps.std(axis=0); sd = np.where(sd < 1e-6, 1.0, sd)
+        return (mu, sd.astype(np.float32))
+    def _apply_fp_stats(self, stats):
+        """Z-score the stored future-pressure windows in place."""
+        mu, sd = stats
+        for it in self._items:
+            it["fp"] = ((it["fp"] - mu) / sd).astype(np.float32)
+    def __len__(self):
+        """Number of stored anchors."""
+        return len(self._items)
+    def __getitem__(self, idx):
+        """Return one anchor as tensors.
+        Yields (x, y, y_last, et, meta), or (x, y, y_last, fp, et, meta) when
+        include_future_pressure is set. Arrays are made contiguous before
+        being handed to torch.from_numpy.
+        """
+        it = self._items[idx]
+        x = {m: torch.from_numpy(np.ascontiguousarray(w)) for m, w in it["x"].items()}
+        y = torch.from_numpy(np.ascontiguousarray(it["y"]))                # (T_fut, target_dim)
+        y_last = torch.from_numpy(np.ascontiguousarray(it["y_last"]))      # (target_dim,)
+        et = int(it["event_type"])
+        if self.include_future_pressure:
+            fp = torch.from_numpy(np.ascontiguousarray(it["fp"]))          # (T_fut, 50)
+            return x, y, y_last, fp, et, it["meta"]
+        return x, y, y_last, et, it["meta"]
+    @property
+    def modality_dims(self):
+        """Copy of the per-input-modality feature widths."""
+        return dict(self._modality_dims)
+    @property
+    def target_dim(self):
+        """Feature width of the target modality."""
+        return self._target_dim
+def collate_signal_forecast(batch):
+    if len(batch[0]) == 6:                               # has future pressure
+        xs, ys, ylasts, fps, ets, metas = zip(*batch)
+        mods = list(xs[0].keys())
+        x_out = {m: torch.stack([x[m] for x in xs], dim=0) for m in mods}
+        y_out = torch.stack(ys, dim=0)
+        yl_out = torch.stack(ylasts, dim=0)
+        fp_out = torch.stack(fps, dim=0)                  # (B, T_fut, 50)
+        et_out = torch.tensor(ets, dtype=torch.long)
+        return x_out, y_out, yl_out, fp_out, et_out, list(metas)
+    xs, ys, ylasts, ets, metas = zip(*batch)
+    mods = list(xs[0].keys())
+    x_out = {m: torch.stack([x[m] for x in xs], dim=0) for m in mods}
+    y_out = torch.stack(ys, dim=0)
+    yl_out = torch.stack(ylasts, dim=0)
+    et_out = torch.tensor(ets, dtype=torch.long)
+    return x_out, y_out, yl_out, et_out, list(metas)
+def build_signal_train_test(
+    input_modalities, target_modality,
+    t_obs_sec=1.5, t_fut_sec=0.5, anchor_stride_sec=0.25,
+    downsample=5,
+    dataset_dir=DEFAULT_DATASET_DIR, annot_dir=DEFAULT_ANNOT_DIR,
+    contact_threshold_g=5.0, per_event_max=None,
+    include_future_pressure=False,
+    rng_seed=0,
+):
+    train = SignalForecastDataset(
+        TRAIN_VOLS_V3, input_modalities=input_modalities,
+        target_modality=target_modality,
+        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec,
+        anchor_stride_sec=anchor_stride_sec, downsample=downsample,
+        dataset_dir=dataset_dir, annot_dir=annot_dir,
+        contact_threshold_g=contact_threshold_g, per_event_max=per_event_max,
+        include_future_pressure=include_future_pressure,
+        rng_seed=rng_seed, log=True,
+    )
+    test = SignalForecastDataset(
+        TEST_VOLS_V3, input_modalities=input_modalities,
+        target_modality=target_modality,
+        t_obs_sec=t_obs_sec, t_fut_sec=t_fut_sec,
+        anchor_stride_sec=anchor_stride_sec, downsample=downsample,
+        dataset_dir=dataset_dir, annot_dir=annot_dir,
+        contact_threshold_g=contact_threshold_g, per_event_max=per_event_max,
+        input_stats=train._input_stats, target_stats=train._target_stats,
+        future_pressure_stats=train._fp_stats,
+        expected_input_dims=train._modality_dims,
+        expected_target_dim=train._target_dim,
+        include_future_pressure=include_future_pressure,
+        rng_seed=rng_seed + 1, log=True,
+    )
+    return train, test
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--input_modalities", default="imu")
+    ap.add_argument("--target_modality", default="imu")
+    ap.add_argument("--t_obs", type=float, default=1.5)
+    ap.add_argument("--t_fut", type=float, default=0.5)
+    args = ap.parse_args()
+    tr, te = build_signal_train_test(
+        input_modalities=args.input_modalities.split(","),
+        target_modality=args.target_modality,
+        t_obs_sec=args.t_obs, t_fut_sec=args.t_fut,
+    )
+    x, y, y_last, et, meta = tr[0]
+    print(f"Sample: x={ {m: tuple(v.shape) for m,v in x.items()} } y={tuple(y.shape)} y_last={tuple(y_last.shape)} event_type={et}")

experiments/nets/__init__.py ADDED Viewed

File without changes

experiments/nets/__pycache__/models_seqpred.cpython-312.pyc ADDED Viewed

Binary file (44.4 kB). View file

experiments/nets/baselines_published/__init__.py ADDED Viewed

File without changes

experiments/nets/baselines_published/baselines.py ADDED Viewed

	@@ -0,0 +1,488 @@

+"""
+Published baselines for T1 Scene Recognition, reproduced on DailyAct-5M.
+Each method accepts a concatenated feature tensor (B, T, F_total) where F_total
+is the sum of the active modality dims; the per-modality slices are recorded in
+the `modality_dims` dict. Each method then uses the subset of modalities its
+original paper intended.
+All methods output a (B, num_classes) logit tensor.
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+def _slice(x, mod_dims, wanted):
+    """Slice the concatenated feature tensor to keep only `wanted` modalities,
+    in the order given. mod_dims is an ordered dict. Returns
+    {name: tensor(B,T,d_name)} plus the concat."""
+    parts = {}
+    offset = 0
+    for name, d in mod_dims.items():
+        if name in wanted:
+            parts[name] = x[..., offset:offset + d]
+        offset += d
+    assert len(parts) > 0, f"None of {wanted} in {list(mod_dims.keys())}"
+    return parts
+# ---------------------------------------------------------------------------
+# 1) ST-GCN  (Yan et al., AAAI 2018)
+#    Spatio-temporal graph CNN for skeleton action recognition.
+#    We treat the MoCap skeleton as the graph (the classes below default to
+#    n_joints=52).
+# ---------------------------------------------------------------------------
+class STGCNBlock(nn.Module):
+    """One spatial-then-temporal graph-conv block with a residual connection.
+
+    Works on tensors laid out as (B, C, T, V), where V must equal `n_joints`.
+    The joint adjacency is a free (n_joints, n_joints) parameter initialised
+    near the identity; no handcrafted skeleton graph is used.
+
+    Args:
+        in_ch / out_ch: input / output channel counts.
+        n_joints: size of the joint axis V.
+        stride: temporal stride of the 9-tap temporal conv (and of the
+            residual projection).
+        dropout: dropout probability applied after the temporal BatchNorm.
+    """
+    def __init__(self, in_ch, out_ch, n_joints, stride=1, dropout=0.2):
+        super().__init__()
+        # Spatial graph conv: learnable adjacency (fully learned, no handcrafted A)
+        self.A = nn.Parameter(torch.eye(n_joints) + 0.1 * torch.randn(n_joints, n_joints))
+        self.spatial = nn.Conv2d(in_ch, out_ch, kernel_size=(1, 1), bias=False)
+        self.spatial_bn = nn.BatchNorm2d(out_ch)
+        # 9-tap conv along time only; padding 4 keeps T unchanged when stride == 1
+        self.temporal = nn.Conv2d(out_ch, out_ch, kernel_size=(9, 1),
+                                  padding=(4, 0), stride=(stride, 1))
+        self.temporal_bn = nn.BatchNorm2d(out_ch)
+        self.dropout = nn.Dropout(dropout)
+        # Residual branch needs a 1x1 projection whenever the output shape changes
+        if in_ch != out_ch or stride != 1:
+            self.res = nn.Conv2d(in_ch, out_ch, kernel_size=1,
+                                 stride=(stride, 1))
+        else:
+            self.res = nn.Identity()
+    def forward(self, x):
+        """Map (B, in_ch, T, V) to (B, out_ch, T', V); T' < T only when stride > 1."""
+        # x: (B, C, T, V)
+        res = self.res(x)
+        # spatial: aggregate along joints via A
+        h = self.spatial(x)
+        # softmax over the last axis: each source joint's outgoing weights sum to 1
+        h = torch.einsum('bctv,vw->bctw', h, F.softmax(self.A, dim=-1))
+        h = self.spatial_bn(h)
+        h = F.relu(h)
+        # temporal
+        h = self.temporal(h)
+        h = self.temporal_bn(h)
+        h = self.dropout(h)
+        return F.relu(h + res)
+class STGCN(nn.Module):
+    """ST-GCN on MoCap skeleton. We assume the MoCap modality is 620-dim
+    (hip-relative + velocity) and reshape to ~56 joints."""
+    def __init__(self, feat_dim_mocap, num_classes, hidden=64, n_joints=52):
+        super().__init__()
+        self.n_joints = n_joints
+        # MoCap feat is (T, 620). 52 joints × 4 (xyz+quat_type), or we take per-joint xyz-only = 156.
+        # In this repo, 620 = 52 markers * 4 cols + velocity features. We'll
+        # reshape by slicing to 3*52=156 "primary" coords, padded if needed.
+        self.coord_dim = 3  # we'll treat each joint as having 3 coords (XYZ)
+        self.proj_in = nn.Linear(feat_dim_mocap, n_joints * self.coord_dim)
+        self.blocks = nn.ModuleList([
+            STGCNBlock(self.coord_dim, hidden, n_joints),
+            STGCNBlock(hidden, hidden, n_joints),
+            STGCNBlock(hidden, hidden * 2, n_joints, stride=2),
+            STGCNBlock(hidden * 2, hidden * 2, n_joints),
+            STGCNBlock(hidden * 2, hidden * 4, n_joints, stride=2),
+            STGCNBlock(hidden * 4, hidden * 4, n_joints),
+        ])
+        self.head = nn.Sequential(
+            nn.Dropout(0.3),
+            nn.Linear(hidden * 4, num_classes),
+        )
+    def forward(self, x_mocap, mask=None):
+        # x_mocap: (B, T, feat_dim_mocap)
+        B, T, _ = x_mocap.shape
+        h = self.proj_in(x_mocap)  # (B, T, n_joints * 3)
+        h = h.reshape(B, T, self.n_joints, self.coord_dim).permute(0, 3, 1, 2)  # (B, C, T, V)
+        for blk in self.blocks:
+            h = blk(h)
+        # Global mean pool over time & joints (with mask if provided)
+        if mask is not None:
+            # mask: (B, T), h: (B, C, T', V) where T' may be < T due to stride
+            T_ = h.shape[2]
+            m = mask[:, :T_].float().unsqueeze(1).unsqueeze(-1)  # (B, 1, T', 1)
+            h = (h * m).sum(dim=(2, 3)) / (m.sum(dim=(2, 3)) * h.shape[3] + 1e-8)
+        else:
+            h = h.mean(dim=(2, 3))
+        return self.head(h)
+# ---------------------------------------------------------------------------
+# 2) CTR-GCN  (Chen et al., ICCV 2021)
+#    Channel-wise Topology Refinement GCN — learns a separate adjacency
+#    matrix per channel group, known as SOTA for skeleton action recognition.
+# ---------------------------------------------------------------------------
+class CTRGC(nn.Module):
+    """Simplified CTR-GC block: learnable per-channel topology refinement.
+
+    A static learnable adjacency `A` is corrected by a data-dependent term
+    scaled by `alpha`. NOTE: in this simplified version the data-dependent
+    topology is averaged over channels before use, so the refinement that is
+    actually applied is shared across channels (one (V, V) matrix per sample).
+
+    Args:
+        in_ch / out_ch: input / output channel counts.
+        n_joints: size of the joint axis V.
+        rel_reduction: the query/key projections use out_ch // rel_reduction
+            channels.
+    """
+    def __init__(self, in_ch, out_ch, n_joints, rel_reduction=4):
+        super().__init__()
+        self.n_joints = n_joints
+        self.conv1 = nn.Conv2d(in_ch, out_ch // rel_reduction, 1)
+        self.conv2 = nn.Conv2d(in_ch, out_ch // rel_reduction, 1)
+        self.conv3 = nn.Conv2d(in_ch, out_ch, 1)
+        # alpha starts at 0, so at initialisation only the static A is used
+        self.alpha = nn.Parameter(torch.zeros(1))
+        self.A = nn.Parameter(torch.eye(n_joints) + 0.1 * torch.randn(n_joints, n_joints))
+    def forward(self, x):
+        """Map (B, in_ch, T, V) to (B, out_ch, T, V)."""
+        # x: (B, C, T, V)
+        # q/k are pooled over time (dim=2), so the topology is per-sample, not per-frame
+        q = self.conv1(x).mean(dim=2)        # (B, C', V)
+        k = self.conv2(x).mean(dim=2)        # (B, C', V)
+        v = self.conv3(x)                    # (B, C_out, T, V)
+        # Channel-specific topology refinement
+        # pairwise differences q_i - k_j -> (B, C', V, V), squashed then row-normalised
+        topology = F.softmax(torch.tanh(q.unsqueeze(-1) - k.unsqueeze(-2)), dim=-1)
+        # topology: (B, C', V, V); we average across channels to get a shared (B, V, V)
+        topology = topology.mean(dim=1)
+        # static adjacency (used raw, not normalised) + scaled dynamic correction
+        A = self.A.unsqueeze(0) + self.alpha * topology
+        # apply A to v
+        out = torch.einsum('bctv,bvw->bctw', v, A)
+        return out
+class CTRGCNBlock(nn.Module):
+    """CTR-GC graph conv, then a 9-tap temporal conv, with a residual connection.
+
+    Input (B, in_ch, T, V) -> output (B, out_ch, T', V); T' < T only when
+    `stride` > 1.
+    """
+    def __init__(self, in_ch, out_ch, n_joints, stride=1):
+        super().__init__()
+        self.gc = CTRGC(in_ch, out_ch, n_joints)
+        self.bn = nn.BatchNorm2d(out_ch)
+        # temporal conv along T only; padding 4 keeps T unchanged when stride == 1
+        self.tcn = nn.Sequential(
+            nn.Conv2d(out_ch, out_ch, (9, 1), padding=(4, 0), stride=(stride, 1)),
+            nn.BatchNorm2d(out_ch),
+        )
+        # 1x1 projection on the residual path whenever the output shape changes
+        if in_ch != out_ch or stride != 1:
+            self.res = nn.Conv2d(in_ch, out_ch, 1, stride=(stride, 1))
+        else:
+            self.res = nn.Identity()
+    def forward(self, x):
+        """Apply graph conv -> BN -> ReLU -> temporal conv, add residual, ReLU."""
+        res = self.res(x)
+        h = self.gc(x)
+        h = self.bn(h)
+        h = F.relu(h)
+        h = self.tcn(h)
+        return F.relu(h + res)
+class CTRGCN(nn.Module):
+    def __init__(self, feat_dim_mocap, num_classes, hidden=64, n_joints=52):
+        super().__init__()
+        self.n_joints = n_joints
+        self.coord_dim = 3
+        self.proj_in = nn.Linear(feat_dim_mocap, n_joints * self.coord_dim)
+        self.blocks = nn.ModuleList([
+            CTRGCNBlock(self.coord_dim, hidden, n_joints),
+            CTRGCNBlock(hidden, hidden, n_joints),
+            CTRGCNBlock(hidden, hidden * 2, n_joints, stride=2),
+            CTRGCNBlock(hidden * 2, hidden * 4, n_joints, stride=2),
+        ])
+        self.head = nn.Sequential(
+            nn.Dropout(0.3),
+            nn.Linear(hidden * 4, num_classes),
+        )
+    def forward(self, x_mocap, mask=None):
+        B, T, _ = x_mocap.shape
+        h = self.proj_in(x_mocap)
+        h = h.reshape(B, T, self.n_joints, self.coord_dim).permute(0, 3, 1, 2)
+        for blk in self.blocks:
+            h = blk(h)
+        h = h.mean(dim=(2, 3))
+        return self.head(h)
+# ---------------------------------------------------------------------------
+# 3) LIMU-BERT  (Xu et al., SenSys 2021)
+#    IMU self-supervised pretraining via masked reconstruction + fine-tune.
+#    We implement a simpler variant: BERT-style encoder with optional
+#    pretraining head.
+# ---------------------------------------------------------------------------
+class LIMUBertEncoder(nn.Module):
+    """Transformer encoder over IMU frames with a learned positional table.
+
+    Args:
+        feat_dim_imu: width of the per-frame IMU feature vector.
+        hidden: model width.
+        n_layers / n_heads: encoder depth / attention heads.
+        dropout: dropout inside the encoder layers.
+    """
+    def __init__(self, feat_dim_imu, hidden=128, n_layers=4, n_heads=4, dropout=0.1):
+        super().__init__()
+        self.in_proj = nn.Linear(feat_dim_imu, hidden)
+        # learned positional table with 4096 slots; forward() slices the first T
+        self.pos = nn.Parameter(torch.zeros(1, 4096, hidden))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        layer = nn.TransformerEncoderLayer(
+            d_model=hidden, nhead=n_heads, dim_feedforward=4 * hidden,
+            dropout=dropout, batch_first=True, activation='gelu',
+        )
+        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
+    def forward(self, x, mask):
+        """Encode x (B, T, feat_dim_imu) into token features (B, T, hidden).
+
+        `mask` is inverted with `~` before being passed as the key-padding
+        mask, so it must be a bool tensor where True marks a valid frame.
+        """
+        T = x.size(1)
+        h = self.in_proj(x) + self.pos[:, :T, :]
+        h = self.encoder(h, src_key_padding_mask=~mask)
+        return h
+class LIMUBert(nn.Module):
+    """Supervised-only variant: encoder + classifier head. Paper's
+    pretraining is a masked-recon objective; for simplicity we report the
+    supervised-only baseline here."""
+    def __init__(self, feat_dim_imu, num_classes, hidden=128, n_layers=4,
+                 n_heads=4, dropout=0.1):
+        super().__init__()
+        self.encoder = LIMUBertEncoder(feat_dim_imu, hidden, n_layers, n_heads, dropout)
+        self.head = nn.Sequential(
+            nn.LayerNorm(hidden),
+            nn.Dropout(dropout),
+            nn.Linear(hidden, num_classes),
+        )
+    def forward(self, x_imu, mask):
+        h = self.encoder(x_imu, mask)
+        m = mask.unsqueeze(-1).float()
+        pooled = (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0)
+        return self.head(pooled)
+# ---------------------------------------------------------------------------
+# 4) EMG-CNN  (standard 1D CNN baseline from sEMG classification literature)
+#    E.g. Atzori et al. — multi-layer CNN with moving-window input.
+# ---------------------------------------------------------------------------
+class EMGCNN(nn.Module):
+    """Three-layer 1D CNN over the time axis, masked mean pooling, linear head.
+
+    Args:
+        feat_dim_emg: number of EMG feature channels per frame.
+        num_classes: number of output logits.
+        hidden: base channel width (the last conv outputs 4 * hidden).
+    """
+    def __init__(self, feat_dim_emg, num_classes, hidden=64):
+        super().__init__()
+        # all convs are stride 1 with "same" padding, so T is preserved
+        self.cnn = nn.Sequential(
+            nn.Conv1d(feat_dim_emg, hidden, 7, padding=3),
+            nn.BatchNorm1d(hidden), nn.ReLU(), nn.Dropout(0.3),
+            nn.Conv1d(hidden, hidden * 2, 5, padding=2),
+            nn.BatchNorm1d(hidden * 2), nn.ReLU(), nn.Dropout(0.3),
+            nn.Conv1d(hidden * 2, hidden * 4, 3, padding=1),
+            nn.BatchNorm1d(hidden * 4), nn.ReLU(),
+        )
+        self.head = nn.Linear(hidden * 4, num_classes)
+    def forward(self, x_emg, mask):
+        """x_emg: (B, T, feat_dim_emg); mask: (B, T). Returns (B, num_classes) logits."""
+        # (B, T, feat_dim_emg) -> (B, feat_dim_emg, T) for conv1d
+        h = self.cnn(x_emg.transpose(1, 2))
+        # Masked pool
+        m = mask.unsqueeze(1).float()
+        T_ = h.size(2)
+        # Safeguard only: with the stride-1 convs above the lengths already
+        # match. If they ever differ, the mask is resampled and re-binarised.
+        if m.size(2) != T_:
+            m = F.adaptive_avg_pool1d(m, T_)
+            m = (m > 0.5).float()
+        # clamp avoids division by zero for a fully masked sample
+        pooled = (h * m).sum(dim=2) / m.sum(dim=2).clamp(min=1.0)
+        return self.head(pooled)
+# ---------------------------------------------------------------------------
+# 5) ActionSense baseline  (DelPreto et al., NeurIPS '22)
+#    Simple 2-layer MLP per modality + shared bidirectional LSTM + classifier.
+# ---------------------------------------------------------------------------
+class ActionSenseLSTM(nn.Module):
+    """Per-modality MLPs feeding a shared 2-layer bidirectional LSTM.
+
+    Args:
+        modality_dims: {name: width}; its iteration order must match the order
+            in which the modalities are concatenated on the last axis of `x`.
+        num_classes: number of output logits.
+        hidden: width of each per-modality MLP and of the LSTM hidden state.
+    """
+    def __init__(self, modality_dims: dict, num_classes, hidden=128):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = modality_dims
+        # one two-Linear MLP per modality, each mapping its slice to `hidden`
+        self.per_mod = nn.ModuleDict({
+            name: nn.Sequential(
+                nn.Linear(d, hidden), nn.ReLU(), nn.Dropout(0.2),
+                nn.Linear(hidden, hidden), nn.ReLU(),
+            ) for name, d in modality_dims.items()
+        })
+        concat_dim = hidden * len(modality_dims)
+        self.lstm = nn.LSTM(concat_dim, hidden, num_layers=2,
+                            batch_first=True, bidirectional=True, dropout=0.2)
+        # bidirectional -> the LSTM output width is 2 * hidden
+        self.head = nn.Linear(hidden * 2, num_classes)
+    def forward(self, x, mask):
+        """x: (B, T, F_total); mask: (B, T). Returns (B, num_classes) logits."""
+        # x: (B, T, F_total), slice by modality
+        offset = 0
+        feats = []
+        for name in self.mod_names:
+            d = self.mod_dims[name]
+            x_m = x[..., offset:offset + d]
+            offset += d
+            feats.append(self.per_mod[name](x_m))
+        h = torch.cat(feats, dim=-1)  # (B, T, hidden * M)
+        # NOTE(review): the LSTM also runs over padded frames; the mask is
+        # applied only in the pooling below.
+        h, _ = self.lstm(h)
+        m = mask.unsqueeze(-1).float()
+        pooled = (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0)
+        return self.head(pooled)
+# ---------------------------------------------------------------------------
+# 6) MulT  (Multimodal Transformer, Tsai et al., ACL 2019)
+#    Core idea: cross-modal attention between every pair of modalities.
+#    For a 3-modality input (A, B, C), produce
+#    {A->B, A->C, B->A, B->C, C->A, C->B} via directed cross-attention.
+# ---------------------------------------------------------------------------
+class CrossModalTransformer(nn.Module):
+    def __init__(self, d_model, n_heads=4, n_layers=2, dropout=0.1):
+        super().__init__()
+        self.layers = nn.ModuleList([
+            nn.TransformerDecoderLayer(
+                d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+                dropout=dropout, batch_first=True, activation='gelu',
+            ) for _ in range(n_layers)
+        ])
+    def forward(self, q, kv, q_mask, kv_mask):
+        # q: (B, T_q, D), kv: (B, T_kv, D)
+        h = q
+        for layer in self.layers:
+            h = layer(h, kv,
+                      tgt_key_padding_mask=~q_mask,
+                      memory_key_padding_mask=~kv_mask)
+        return h
+class MulT(nn.Module):
+    """Multimodal Transformer. Uses MoCap + EMG + IMU as 3 modalities
+    (EyeTrack/Pressure omitted to match original 3-mod paper design)."""
+    def __init__(self, modality_dims: dict, num_classes, d_model=128,
+                 n_layers=2, n_heads=4, dropout=0.1):
+        super().__init__()
+        self.mod_names = [m for m in ['mocap', 'emg', 'imu'] if m in modality_dims]
+        if len(self.mod_names) < 2:
+            self.mod_names = list(modality_dims.keys())[:3]
+        self.mod_dims = {m: modality_dims[m] for m in self.mod_names}
+        self.in_proj = nn.ModuleDict({
+            m: nn.Linear(d, d_model) for m, d in self.mod_dims.items()
+        })
+        # Pairwise cross-attention
+        self.cross = nn.ModuleDict({
+            f"{a}_to_{b}": CrossModalTransformer(d_model, n_heads, n_layers, dropout)
+            for a in self.mod_names for b in self.mod_names if a != b
+        })
+        # Self-attention after cross
+        self.self_tx = nn.ModuleDict({
+            m: nn.TransformerEncoder(
+                nn.TransformerEncoderLayer(
+                    d_model=d_model, nhead=n_heads,
+                    dim_feedforward=4 * d_model, dropout=dropout,
+                    batch_first=True, activation='gelu',
+                ), num_layers=1,
+            ) for m in self.mod_names
+        })
+        total_dim = d_model * len(self.mod_names) * len(self.mod_names)
+        self.head = nn.Sequential(
+            nn.LayerNorm(total_dim),
+            nn.Dropout(dropout),
+            nn.Linear(total_dim, num_classes),
+        )
+    def forward(self, x, mask):
+        # Slice modalities from x
+        offset = 0
+        projs = {}
+        # Walk through all known mod_dims to find offsets
+        # We need the FULL modality_dims order, which we don't have here;
+        # expect caller to already supply x with exactly mod_names in order.
+        # Workaround: assume caller passes mod_names order matching projection.
+        for m in self.mod_names:
+            d = self.mod_dims[m]
+            projs[m] = self.in_proj[m](x[..., offset:offset + d])
+            offset += d
+        # Cross-attention: each modality attends to each other
+        fused = {m: [] for m in self.mod_names}
+        for a in self.mod_names:
+            for b in self.mod_names:
+                if a == b:
+                    fused[a].append(projs[a])
+                else:
+                    out = self.cross[f"{a}_to_{b}"](projs[a], projs[b], mask, mask)
+                    fused[a].append(out)
+        # Self-attention + pool per modality
+        pooled = []
+        for a in self.mod_names:
+            # Concat all attended-to representations along feature dim
+            cat = torch.cat(fused[a], dim=-1)  # (B, T, D * M)
+            # Actually re-project back to D per stream, then self-attn on stacked
+            # Simplified: self-attention over concatenated, pool, flatten
+            # Here we just pool each separately
+            for i, rep in enumerate(fused[a]):
+                rep = self.self_tx[a](rep)
+                m = mask.unsqueeze(-1).float()
+                p = (rep * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0)
+                pooled.append(p)
+        h = torch.cat(pooled, dim=-1)
+        return self.head(h)
+# ---------------------------------------------------------------------------
+# 7) Perceiver IO  (Jaegle et al., ICML 2021)
+#    Cross-attention from a fixed-size latent query set to all input tokens,
+#    repeated for a few iterations.
+# ---------------------------------------------------------------------------
+class PerceiverBlock(nn.Module):
+    """One Perceiver iteration: latents cross-attend to the inputs, then
+    self-attend among themselves.
+
+    Args:
+        latent_dim: width of both latents and inputs.
+        n_heads: attention heads for both attention stages.
+        dropout: dropout used in both attention stages.
+    """
+    def __init__(self, latent_dim, n_heads, dropout):
+        super().__init__()
+        self.ca = nn.MultiheadAttention(
+            latent_dim, n_heads, dropout=dropout, batch_first=True,
+        )
+        self.norm1 = nn.LayerNorm(latent_dim)
+        self.sa = nn.TransformerEncoderLayer(
+            d_model=latent_dim, nhead=n_heads,
+            dim_feedforward=4 * latent_dim, dropout=dropout,
+            batch_first=True, activation='gelu',
+        )
+    def forward(self, latents, inputs, input_kpm):
+        """latents: (B, N, D); inputs: (B, T, D); input_kpm: (B, T) bool with
+        True = ignore that input position. Returns updated latents (B, N, D)."""
+        # Cross-attn: latents attend to inputs
+        h, _ = self.ca(latents, inputs, inputs, key_padding_mask=input_kpm)
+        # residual + LayerNorm after the cross-attention
+        latents = self.norm1(latents + h)
+        # Self-attn on latents
+        latents = self.sa(latents)
+        return latents
+class PerceiverIO(nn.Module):
+    """Perceiver with N learnable latent queries; supports any modality mix.
+
+    Args:
+        modality_dims: {name: width}; its iteration order must match the order
+            in which the modalities are concatenated on the last axis of `x`.
+        num_classes: number of output logits.
+        latent_dim: width of tokens and latents.
+        n_latents: number of learnable latent vectors.
+        n_layers: number of PerceiverBlock iterations.
+        n_heads / dropout: forwarded to every PerceiverBlock.
+    """
+    def __init__(self, modality_dims: dict, num_classes,
+                 latent_dim=128, n_latents=32, n_layers=3, n_heads=4, dropout=0.1):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = modality_dims
+        # Per-modality input projection to latent_dim, with modality-id embedding
+        self.in_proj = nn.ModuleDict({
+            m: nn.Linear(d, latent_dim) for m, d in modality_dims.items()
+        })
+        self.mod_emb = nn.Parameter(torch.randn(len(self.mod_names), latent_dim) * 0.02)
+        # Positional encoding (shared); 4096 slots, forward() slices the first T
+        self.pos = nn.Parameter(torch.zeros(1, 4096, latent_dim))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        # Learnable latents
+        self.latents = nn.Parameter(torch.randn(n_latents, latent_dim) * 0.02)
+        self.blocks = nn.ModuleList([
+            PerceiverBlock(latent_dim, n_heads, dropout) for _ in range(n_layers)
+        ])
+        self.head = nn.Sequential(
+            nn.LayerNorm(latent_dim),
+            nn.Linear(latent_dim, num_classes),
+        )
+    def forward(self, x, mask):
+        """x: (B, T, F_total); mask: (B, T) bool, True = valid frame.
+        Returns (B, num_classes) logits."""
+        B, T, _ = x.shape
+        # Project each modality + add modality embedding
+        offset = 0
+        tokens = []
+        for i, m in enumerate(self.mod_names):
+            d = self.mod_dims[m]
+            tok = self.in_proj[m](x[..., offset:offset + d])  # (B, T, D)
+            tok = tok + self.mod_emb[i]
+            offset += d
+            tokens.append(tok)
+        # Modalities are NOT concatenated along time here. Their per-frame
+        # tokens are averaged (early fusion in latent space), which keeps a
+        # single length-T sequence, and the shared positional encoding is
+        # added afterwards.
+        h = torch.stack(tokens, dim=2).mean(dim=2)  # (B, T, D)
+        h = h + self.pos[:, :T, :]
+        input_kpm = ~mask  # (B, T), True = ignore
+        # Iterative cross-attention
+        latents = self.latents.unsqueeze(0).expand(B, -1, -1)  # (B, N, D)
+        for blk in self.blocks:
+            latents = blk(latents, h, input_kpm)
+        # Mean-pool latents
+        pooled = latents.mean(dim=1)
+        return self.head(pooled)

experiments/nets/baselines_published/syncfuse.py ADDED Viewed

	@@ -0,0 +1,270 @@

+"""
+SyncFuse — our proposed method for T1 scene recognition.
+Four components (all toggleable via args for ablation):
+ (1) Modality dropout:    per-sample independent Bernoulli(p=0.3) drop on each
+                          modality during training; at test time all modalities
+                          are active. Keeps at least 1 modality.
+ (2) Pretrained transfer: each per-modality backbone is optionally loaded from
+                          an independently pretrained single-modality
+                          checkpoint and frozen during fine-tuning.
+ (3) Cross-modal temporal-shift attention:
+                          a late cross-attention block where EMG queries
+                          attend to MoCap keys/values (and, symmetrically,
+                          MoCap queries to EMG) at a LEARNED temporal offset Δ
+                          (Gumbel-softmax over {-3,...,+3} bins by default; at
+                          20 Hz that is ±150 ms). Motivated by the paper's
+                          case-study finding that EMG leads motion by ~20 ms,
+                          i.e. by less than one frame.
+ (4) Learnable late fusion:
+                          per-modality classifier logits are combined with a
+                          learnable softmax-weighted average (temperature is
+                          also learned). Equivalent to `late_agg='learned'`
+                          in the repo's existing LateFusionModel.
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import random
+def masked_mean(x, mask):
+    m = mask.unsqueeze(-1).float()
+    return (x * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0)
+# ---------------------------------------------------------------------------
+# Per-modality Transformer branch (same as repo's TransformerBackbone)
+# ---------------------------------------------------------------------------
+class ModTransformer(nn.Module):
+    """Single-modality Transformer encoder that returns token-level features.
+
+    Args:
+        feat_dim: width of the per-frame input features.
+        hidden: model width; also exposed as `output_dim`.
+        n_layers / n_heads: encoder depth / attention heads.
+        dropout: dropout inside the encoder layers.
+    """
+    def __init__(self, feat_dim, hidden=128, n_layers=2, n_heads=4, dropout=0.1):
+        super().__init__()
+        self.in_proj = nn.Linear(feat_dim, hidden)
+        # learned positional table with 4096 slots; forward() slices the first T
+        self.pos = nn.Parameter(torch.zeros(1, 4096, hidden))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        layer = nn.TransformerEncoderLayer(
+            d_model=hidden, nhead=n_heads, dim_feedforward=4 * hidden,
+            dropout=dropout, batch_first=True, activation='gelu',
+        )
+        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
+        self.output_dim = hidden
+    def forward(self, x, mask):
+        """Encode x (B, T, feat_dim). `mask` is inverted with `~` to form the
+        key-padding mask, so it must be bool with True = valid frame."""
+        # x: (B, T, feat_dim)
+        T = x.size(1)
+        h = self.in_proj(x) + self.pos[:, :T, :]
+        h = self.encoder(h, src_key_padding_mask=~mask)
+        return h  # (B, T, hidden) — token-level, NOT pooled
+# ---------------------------------------------------------------------------
+# (3) Cross-modal temporal-shift attention
+# ---------------------------------------------------------------------------
+class TemporalShiftAttention(nn.Module):
+    """Multi-head attention where queries are temporally shifted by a learned
+    offset Δ from the keys. Δ is drawn from a discrete set {-3,...,+3} via
+    straight-through Gumbel-softmax: we sample ONE shift per forward pass,
+    but the softmax weights flow gradient back through shift_logits.
+    At 20 Hz bins, ±3 ≈ ±150 ms, which brackets the paper's ~20 ms EMG-motion
+    lead. Memory cost is ~1 attention pass (not 7)."""
+    def __init__(self, d_model, n_heads=4, dropout=0.1, max_shift=3,
+                 gumbel_tau=1.0):
+        super().__init__()
+        self.max_shift = max_shift
+        self.shifts = list(range(-max_shift, max_shift + 1))
+        self.shift_logits = nn.Parameter(torch.zeros(len(self.shifts)))
+        self.tau = gumbel_tau
+        self.attn = nn.MultiheadAttention(
+            d_model, n_heads, dropout=dropout, batch_first=True,
+        )
+        self.norm = nn.LayerNorm(d_model)
+    def _shift_tensor(self, x, shift, mask):
+        if shift == 0:
+            return x, mask
+        B, T, D = x.shape
+        if shift > 0:
+            pad = torch.zeros(B, shift, D, device=x.device, dtype=x.dtype)
+            x_s = torch.cat([x[:, shift:, :], pad], dim=1)
+            m_s = torch.cat([mask[:, shift:],
+                             torch.zeros(B, shift, device=mask.device, dtype=torch.bool)],
+                            dim=1)
+        else:
+            s = -shift
+            pad = torch.zeros(B, s, D, device=x.device, dtype=x.dtype)
+            x_s = torch.cat([pad, x[:, :-s, :]], dim=1)
+            m_s = torch.cat([torch.zeros(B, s, device=mask.device, dtype=torch.bool),
+                             mask[:, :-s]], dim=1)
+        return x_s, m_s
+    def forward(self, q_tokens, kv_tokens, q_mask, kv_mask, hard=False):
+        if hard or not self.training:
+            # Eval: take the argmax shift
+            with torch.no_grad():
+                idx = self.shift_logits.argmax().item()
+            shift = self.shifts[idx]
+            shifted_kv, shifted_mask = self._shift_tensor(kv_tokens, shift, kv_mask)
+            out, _ = self.attn(q_tokens, shifted_kv, shifted_kv,
+                               key_padding_mask=~shifted_mask)
+            return self.norm(q_tokens + out)
+        # Training: straight-through Gumbel-softmax to sample 1 shift,
+        # with gradient flowing via softmax weights.
+        one_hot = F.gumbel_softmax(self.shift_logits, tau=self.tau, hard=True)
+        # pick the sampled shift (argmax of the hard one-hot)
+        idx = int(one_hot.argmax().item())
+        shift = self.shifts[idx]
+        shifted_kv, shifted_mask = self._shift_tensor(kv_tokens, shift, kv_mask)
+        out, _ = self.attn(q_tokens, shifted_kv, shifted_kv,
+                           key_padding_mask=~shifted_mask)
+        # scale out by the corresponding soft weight to let gradient flow
+        out = out * one_hot[idx]
+        return self.norm(q_tokens + out)
+# ---------------------------------------------------------------------------
+# SyncFuse main model
+# ---------------------------------------------------------------------------
+class SyncFuse(nn.Module):
+    def __init__(self, modality_dims: dict, num_classes, hidden=128, n_heads=4,
+                 n_layers=2, dropout=0.1,
+                 use_xmod_shift=True, use_learned_late=True):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = modality_dims
+        self.use_xmod_shift = use_xmod_shift
+        self.use_learned_late = use_learned_late
+        self.branches = nn.ModuleDict({
+            m: ModTransformer(d, hidden, n_layers, n_heads, dropout)
+            for m, d in modality_dims.items()
+        })
+        self.classifiers = nn.ModuleDict({
+            m: nn.Sequential(nn.LayerNorm(hidden), nn.Dropout(dropout),
+                             nn.Linear(hidden, num_classes))
+            for m in self.mod_names
+        })
+        # Cross-modal temporal-shift: apply to EMG branch attending to MoCap
+        # (and symmetrically MoCap->EMG), only when both modalities are present.
+        if use_xmod_shift and 'emg' in self.mod_names and 'mocap' in self.mod_names:
+            self.xmod_emg2mocap = TemporalShiftAttention(hidden, n_heads, dropout)
+            self.xmod_mocap2emg = TemporalShiftAttention(hidden, n_heads, dropout)
+        else:
+            self.xmod_emg2mocap = None
+            self.xmod_mocap2emg = None
+        if use_learned_late:
+            self.late_logits = nn.Parameter(torch.zeros(len(self.mod_names)))
+            self.late_temperature = nn.Parameter(torch.ones(1))
+    def load_pretrained(self, pretrain_paths: dict, freeze=True):
+        """Load pretrained single-modality checkpoints into branches.
+        pretrain_paths: {modality_name: path_to_checkpoint_state_dict}."""
+        import torch as _torch
+        for m, path in pretrain_paths.items():
+            if m not in self.branches:
+                continue
+            try:
+                sd = _torch.load(path, weights_only=True, map_location='cpu')
+            except TypeError:
+                sd = _torch.load(path, map_location='cpu')
+            # Map SingleModel keys ("backbone.X.*") -> branch keys
+            mapped = {}
+            for k, v in sd.items():
+                if k.startswith('backbone.'):
+                    new_k = k.replace('backbone.', '')
+                    if new_k in self.branches[m].state_dict():
+                        mapped[new_k] = v
+            if mapped:
+                self.branches[m].load_state_dict(mapped, strict=False)
+                if freeze:
+                    for p in self.branches[m].parameters():
+                        p.requires_grad = False
+                print(f"  [SyncFuse] loaded {len(mapped)} tensors into branch '{m}' (frozen={freeze})")
+    def forward(self, x, mask, mod_dropout_p=0.0, training_time=True):
+        """
+        x:    (B, T, F_total) concatenated features
+        mask: (B, T)
+        mod_dropout_p: probability of dropping each modality (training only)
+        """
+        B, T, _ = x.shape
+        # Slice modality features
+        offset = 0
+        feats = {}
+        for m in self.mod_names:
+            d = self.mod_dims[m]
+            feats[m] = x[..., offset:offset + d]
+            offset += d
+        # (1) Modality dropout — per sample, independent per modality
+        active = {m: torch.ones(B, dtype=torch.bool, device=x.device) for m in self.mod_names}
+        if training_time and self.training and mod_dropout_p > 0:
+            drop_map = {m: (torch.rand(B, device=x.device) < mod_dropout_p)
+                        for m in self.mod_names}
+            all_dropped = torch.stack([drop_map[m] for m in self.mod_names], dim=0).all(dim=0)  # (B,)
+            if all_dropped.any():
+                # for all-dropped samples, un-drop one random modality
+                rescue_idx = torch.randint(0, len(self.mod_names),
+                                           (all_dropped.sum().item(),),
+                                           device=x.device)
+                mod_name_tensor = self.mod_names  # python list
+                j = 0
+                for b in range(B):
+                    if all_dropped[b]:
+                        r = mod_name_tensor[rescue_idx[j].item()]
+                        drop_map[r][b] = False
+                        j += 1
+            for m in self.mod_names:
+                active[m] = ~drop_map[m]
+                # zero out dropped features for that branch
+                feats[m] = feats[m] * active[m].view(B, 1, 1).float()
+        # Per-modality encoding
+        tokens = {}
+        for m in self.mod_names:
+            tokens[m] = self.branches[m](feats[m], mask)  # (B, T, hidden)
+        # (3) Cross-modal temporal-shift (bidirectional EMG <-> MoCap)
+        if self.xmod_emg2mocap is not None:
+            tokens['emg'] = self.xmod_emg2mocap(
+                tokens['emg'], tokens['mocap'], mask, mask,
+                hard=not self.training,
+            )
+            tokens['mocap'] = self.xmod_mocap2emg(
+                tokens['mocap'], tokens['emg'], mask, mask,
+                hard=not self.training,
+            )
+        # Pool and classify per modality
+        logits_per = []
+        for m in self.mod_names:
+            pooled = masked_mean(tokens[m], mask)
+            logits_per.append(self.classifiers[m](pooled))
+        stacked = torch.stack(logits_per, dim=0)  # (M, B, C)
+        # Mask out logits from dropped modalities (so they don't dominate)
+        if training_time and self.training and mod_dropout_p > 0:
+            act_mask = torch.stack([active[m].float() for m in self.mod_names], dim=0)  # (M, B)
+            # Re-normalize weights across active modalities
+            if self.use_learned_late:
+                w = F.softmax(self.late_logits / self.late_temperature.clamp(min=0.1), dim=0)
+                w = w.view(-1, 1) * act_mask  # (M, B)
+                w = w / w.sum(dim=0, keepdim=True).clamp(min=1e-6)
+                out = (stacked * w.unsqueeze(-1)).sum(dim=0)
+            else:
+                w = act_mask / act_mask.sum(dim=0, keepdim=True).clamp(min=1e-6)
+                out = (stacked * w.unsqueeze(-1)).sum(dim=0)
+        else:
+            # (4) Learnable late fusion (or simple mean)
+            if self.use_learned_late:
+                w = F.softmax(self.late_logits / self.late_temperature.clamp(min=0.1), dim=0)
+                out = (stacked * w.view(-1, 1, 1)).sum(dim=0)
+            else:
+                out = stacked.mean(dim=0)
+        return out

experiments/nets/models.py ADDED Viewed

	@@ -0,0 +1,648 @@

+"""
+Model definitions for Experiment 1: Scene Recognition.
+Backbones: CNN1D, BiLSTM, Transformer, TinyHAR, DeepConvLSTM, InceptionTime
+Fusion: Early (default), Late, Attention, WeightedLate, GatedLate, Stacking, Product, MoE, FeatureConcat
+Supports optional per-modality projection via proj_dim parameter:
+  proj_dim > 0: project each modality to proj_dim before backbone
+  proj_dim = 0: no projection, use raw features (original behavior)
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# ============================================================
+# Per-modality projection
+# ============================================================
+class ModalityProjector(nn.Module):
+    """Project each modality from its raw dimension to proj_dim."""
+    def __init__(self, modality_dims, proj_dim):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        self.proj_dim = proj_dim
+        self.projectors = nn.ModuleList()
+        for dim in self.mod_dims:
+            self.projectors.append(nn.Sequential(
+                nn.Linear(dim, proj_dim),
+                nn.LayerNorm(proj_dim),
+                nn.ReLU(),
+            ))
+    @property
+    def output_dim(self):
+        return self.proj_dim * len(self.mod_dims)
+    def forward(self, x):
+        """x: (B, T, total_raw_dim) -> (B, T, proj_dim * M)"""
+        parts = []
+        offset = 0
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            parts.append(self.projectors[i](x_mod))
+        return torch.cat(parts, dim=-1)
+# ============================================================
+# Per-modality hidden dim scaling (used when proj_dim=0)
+# ============================================================
+def _compute_per_modality_hidden(mod_dim, base_hidden_dim):
+    if mod_dim >= 128:
+        return max(base_hidden_dim, 48)
+    elif mod_dim >= 32:
+        return base_hidden_dim
+    else:
+        return max(16, base_hidden_dim // 2)
+# ============================================================
+# Backbones
+# ============================================================
+class CNN1DBackbone(nn.Module):
+    def __init__(self, input_dim, hidden_dim=128):
+        super().__init__()
+        self.conv1 = nn.Sequential(
+            nn.Conv1d(input_dim, 64, kernel_size=7, padding=3),
+            nn.BatchNorm1d(64), nn.ReLU(), nn.Dropout(0.1),
+        )
+        self.conv2 = nn.Sequential(
+            nn.Conv1d(64, 128, kernel_size=5, padding=2),
+            nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.1),
+        )
+        self.conv3 = nn.Sequential(
+            nn.Conv1d(128, hidden_dim, kernel_size=3, padding=1),
+            nn.BatchNorm1d(hidden_dim), nn.ReLU(),
+        )
+        self.output_dim = hidden_dim
+    def forward(self, x, mask=None):
+        x = x.permute(0, 2, 1)
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = self.conv3(x)
+        if mask is not None:
+            x = (x * mask.unsqueeze(1).float()).sum(2) / mask.sum(1, keepdim=True).float().clamp(min=1)
+        else:
+            x = x.mean(2)
+        return x
+class LSTMBackbone(nn.Module):
+    def __init__(self, input_dim, hidden_dim=128, num_layers=2, dropout=0.2):
+        super().__init__()
+        self.lstm = nn.LSTM(
+            input_dim, hidden_dim, num_layers=num_layers,
+            batch_first=True, bidirectional=True,
+            dropout=dropout if num_layers > 1 else 0,
+        )
+        self.attn = nn.Linear(hidden_dim * 2, 1)
+        self.output_dim = hidden_dim * 2
+    def forward(self, x, mask=None):
+        out, _ = self.lstm(x)
+        scores = self.attn(out).squeeze(-1)
+        if mask is not None:
+            scores = scores.masked_fill(~mask, float('-inf'))
+        weights = torch.softmax(scores, dim=1)
+        out = (out * weights.unsqueeze(-1)).sum(dim=1)
+        return out
+class TinyHARBackbone(nn.Module):
+    """TinyHAR backbone (Zhou et al., ISWC 2022 Best Paper).
+    Lightweight model for human activity recognition from wearable sensors.
+    Uses multi-scale temporal convolutions + cross-channel interaction + temporal pooling.
+    Input: (B, T, C) with optional mask
+    Output: (B, hidden_dim)
+    """
+    def __init__(self, input_dim, hidden_dim=128, num_scales=4):
+        super().__init__()
+        scale_dim = max(4, hidden_dim // num_scales)
+        actual_hidden = scale_dim * num_scales
+        # Multi-scale temporal convolution feature extraction
+        self.convs = nn.ModuleList()
+        for i in range(num_scales):
+            ks = 2 * (i + 1) + 1  # kernel sizes: 3, 5, 7, 9
+            self.convs.append(nn.Sequential(
+                nn.Conv1d(input_dim, scale_dim, kernel_size=ks, padding=ks // 2),
+                nn.BatchNorm1d(scale_dim),
+                nn.ReLU(),
+            ))
+        # Cross-channel interaction via multi-head self-attention
+        nhead = max(1, min(4, actual_hidden // 8))
+        # Ensure actual_hidden is divisible by nhead
+        while actual_hidden % nhead != 0 and nhead > 1:
+            nhead -= 1
+        self.channel_attn = nn.MultiheadAttention(
+            actual_hidden, num_heads=nhead, batch_first=True, dropout=0.1,
+        )
+        self.channel_norm = nn.LayerNorm(actual_hidden)
+        self.channel_ff = nn.Sequential(
+            nn.Linear(actual_hidden, actual_hidden),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(actual_hidden, actual_hidden),
+        )
+        self.ff_norm = nn.LayerNorm(actual_hidden)
+        # Temporal attention pooling
+        self.temporal_query = nn.Parameter(torch.randn(1, 1, actual_hidden) * 0.02)
+        self.temporal_attn = nn.MultiheadAttention(
+            actual_hidden, num_heads=1, batch_first=True, dropout=0.1,
+        )
+        self.output_dim = actual_hidden
+    def forward(self, x, mask=None):
+        # x: (B, T, C)
+        B, T, C = x.shape
+        x_t = x.permute(0, 2, 1)  # (B, C, T)
+        # Multi-scale feature extraction
+        scale_features = [conv(x_t) for conv in self.convs]
+        x = torch.cat(scale_features, dim=1)  # (B, actual_hidden, T)
+        x = x.permute(0, 2, 1)  # (B, T, actual_hidden)
+        # Cross-channel interaction
+        key_padding_mask = ~mask if mask is not None else None
+        attn_out, _ = self.channel_attn(x, x, x, key_padding_mask=key_padding_mask)
+        x = self.channel_norm(x + attn_out)
+        x = self.ff_norm(x + self.channel_ff(x))
+        # Temporal attention pooling
+        query = self.temporal_query.expand(B, -1, -1)  # (B, 1, actual_hidden)
+        pooled, _ = self.temporal_attn(query, x, x, key_padding_mask=key_padding_mask)
+        return pooled.squeeze(1)  # (B, actual_hidden)
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.1, max_len=5000):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        self.register_buffer('pe', pe)
+    def forward(self, x):
+        x = x + self.pe[:, :x.size(1)]
+        return self.dropout(x)
+class TransformerBackbone(nn.Module):
+    def __init__(self, input_dim, d_model=128, nhead=4, num_layers=2, dropout=0.1):
+        super().__init__()
+        self.input_proj = nn.Linear(input_dim, d_model)
+        self.pos_enc = PositionalEncoding(d_model, dropout=dropout)
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=nhead, dim_feedforward=d_model * 4,
+            dropout=dropout, batch_first=True,
+        )
+        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+        self.output_dim = d_model
+    def forward(self, x, mask=None):
+        x = self.input_proj(x)
+        x = self.pos_enc(x)
+        src_key_padding_mask = ~mask if mask is not None else None
+        x = self.encoder(x, src_key_padding_mask=src_key_padding_mask)
+        if mask is not None:
+            x = (x * mask.unsqueeze(-1).float()).sum(1) / mask.sum(1, keepdim=True).float().clamp(min=1)
+        else:
+            x = x.mean(1)
+        return x
+# ============================================================
+# Full models
+# ============================================================
+def get_backbone(name, input_dim, hidden_dim=128):
+    if name == 'cnn':
+        return CNN1DBackbone(input_dim, hidden_dim)
+    elif name == 'lstm':
+        return LSTMBackbone(input_dim, hidden_dim)
+    elif name == 'transformer':
+        return TransformerBackbone(input_dim, hidden_dim)
+    elif name == 'tinyhar':
+        return TinyHARBackbone(input_dim, hidden_dim)
+    elif name == 'deepconvlstm':
+        from experiments.published_models import DeepConvLSTMBackbone
+        return DeepConvLSTMBackbone(input_dim, hidden_dim)
+    elif name == 'inceptiontime':
+        from experiments.published_models import InceptionTimeBackbone
+        return InceptionTimeBackbone(input_dim, hidden_dim)
+    else:
+        raise ValueError(f"Unknown backbone: {name}")
+def _make_branch(backbone_name, raw_dim, hidden_dim, proj_dim):
+    """Create optional projector + backbone for one modality branch."""
+    if proj_dim > 0:
+        proj = nn.Sequential(
+            nn.Linear(raw_dim, proj_dim),
+            nn.LayerNorm(proj_dim),
+            nn.ReLU(),
+        )
+        bb_input = proj_dim
+        bb_hidden = hidden_dim
+    else:
+        proj = None
+        bb_input = raw_dim
+        bb_hidden = _compute_per_modality_hidden(raw_dim, hidden_dim)
+    bb = get_backbone(backbone_name, bb_input, bb_hidden)
+    return proj, bb
+class SingleModel(nn.Module):
+    """Single backbone + classifier (early fusion or single-modality)."""
+    def __init__(self, backbone_name, input_dim, num_classes, hidden_dim=128,
+                 modality_dims=None, proj_dim=0):
+        super().__init__()
+        self.projector = None
+        if proj_dim > 0 and modality_dims:
+            self.projector = ModalityProjector(modality_dims, proj_dim)
+            actual_input_dim = self.projector.output_dim
+        else:
+            actual_input_dim = input_dim
+        self.backbone = get_backbone(backbone_name, actual_input_dim, hidden_dim)
+        self.classifier = nn.Sequential(
+            nn.Dropout(0.5),
+            nn.Linear(self.backbone.output_dim, num_classes),
+        )
+    def forward(self, x, mask=None):
+        if self.projector is not None:
+            x = self.projector(x)
+        feat = self.backbone(x, mask)
+        return self.classifier(feat)
+class LateFusionModel(nn.Module):
+    """Late fusion: separate backbone per modality, configurable logit aggregation.
+    late_agg='mean': simple average (original)
+    late_agg='confidence': entropy-based confidence weighting (0 extra params)
+    late_agg='learned': temperature-scaled learned weights (M+1 extra params)
+    """
+    def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64,
+                 proj_dim=0, late_agg='mean'):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        self.late_agg = late_agg
+        self.projectors = nn.ModuleList()
+        self.backbones = nn.ModuleList()
+        self.classifiers = nn.ModuleList()
+        for dim in self.mod_dims:
+            proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim)
+            self.projectors.append(proj if proj else nn.Identity())
+            self.backbones.append(bb)
+            self.classifiers.append(nn.Sequential(
+                nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes),
+            ))
+        self._has_proj = proj_dim > 0
+        M = len(self.mod_dims)
+        if late_agg == 'learned':
+            self.modality_logits = nn.Parameter(torch.zeros(M))
+            self.temperature = nn.Parameter(torch.ones(1))
+    def forward(self, x, mask=None):
+        offset = 0
+        all_logits = []
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            if self._has_proj:
+                x_mod = self.projectors[i](x_mod)
+            feat = self.backbones[i](x_mod, mask)
+            all_logits.append(self.classifiers[i](feat))
+        stacked = torch.stack(all_logits, dim=0)  # (M, B, C)
+        if self.late_agg == 'confidence':
+            # Weight by confidence: low entropy → high weight
+            probs = F.softmax(stacked, dim=-1)                    # (M, B, C)
+            entropy = -(probs * (probs + 1e-8).log()).sum(dim=-1)  # (M, B)
+            weights = F.softmax(-entropy, dim=0).unsqueeze(-1)     # (M, B, 1)
+            return (stacked * weights).sum(dim=0)
+        elif self.late_agg == 'learned':
+            weights = F.softmax(self.modality_logits / self.temperature, dim=0)
+            return (stacked * weights.view(-1, 1, 1)).sum(dim=0)
+        else:  # 'mean'
+            return stacked.mean(dim=0)
+class AttentionFusionModel(nn.Module):
+    """Attention fusion: separate encoder per modality -> cross-modal attention -> classifier."""
+    def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        unified_dim = hidden_dim
+        self.projectors = nn.ModuleList()
+        self.backbones = nn.ModuleList()
+        self.feat_projections = nn.ModuleList()
+        for dim in self.mod_dims:
+            proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim)
+            self.projectors.append(proj if proj else nn.Identity())
+            self.backbones.append(bb)
+            if bb.output_dim != unified_dim:
+                self.feat_projections.append(nn.Linear(bb.output_dim, unified_dim))
+            else:
+                self.feat_projections.append(nn.Identity())
+        self._has_proj = proj_dim > 0
+        nhead = 4 if unified_dim % 4 == 0 else (2 if unified_dim % 2 == 0 else 1)
+        self.cross_attn = nn.TransformerEncoderLayer(
+            d_model=unified_dim, nhead=nhead, dim_feedforward=unified_dim * 2,
+            dropout=0.1, batch_first=True,
+        )
+        self.classifier = nn.Sequential(
+            nn.Dropout(0.5), nn.Linear(unified_dim, num_classes),
+        )
+    def forward(self, x, mask=None):
+        offset = 0
+        mod_features = []
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            if self._has_proj:
+                x_mod = self.projectors[i](x_mod)
+            feat = self.backbones[i](x_mod, mask)
+            feat = self.feat_projections[i](feat)
+            mod_features.append(feat)
+        tokens = torch.stack(mod_features, dim=1)
+        tokens = self.cross_attn(tokens)
+        pooled = tokens.mean(dim=1)
+        return self.classifier(pooled)
+class WeightedLateFusionModel(nn.Module):
+    def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        self.projectors = nn.ModuleList()
+        self.backbones = nn.ModuleList()
+        self.classifiers = nn.ModuleList()
+        for dim in self.mod_dims:
+            proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim)
+            self.projectors.append(proj if proj else nn.Identity())
+            self.backbones.append(bb)
+            self.classifiers.append(nn.Sequential(
+                nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes),
+            ))
+        self._has_proj = proj_dim > 0
+        self.modality_weights = nn.Parameter(torch.ones(len(self.mod_dims)))
+    def forward(self, x, mask=None):
+        offset = 0
+        all_logits = []
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            if self._has_proj:
+                x_mod = self.projectors[i](x_mod)
+            feat = self.backbones[i](x_mod, mask)
+            all_logits.append(self.classifiers[i](feat))
+        weights = F.softmax(self.modality_weights, dim=0)
+        stacked = torch.stack(all_logits, dim=0)
+        return (stacked * weights.view(-1, 1, 1)).sum(dim=0)
+class GatedLateFusionModel(nn.Module):
+    def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        M = len(self.mod_dims)
+        self.projectors = nn.ModuleList()
+        self.backbones = nn.ModuleList()
+        self.classifiers = nn.ModuleList()
+        total_feat_dim = 0
+        for dim in self.mod_dims:
+            proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim)
+            self.projectors.append(proj if proj else nn.Identity())
+            self.backbones.append(bb)
+            total_feat_dim += bb.output_dim
+            self.classifiers.append(nn.Sequential(
+                nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes),
+            ))
+        self._has_proj = proj_dim > 0
+        self.gate = nn.Sequential(
+            nn.Linear(total_feat_dim, 32), nn.ReLU(), nn.Linear(32, M),
+        )
+    def forward(self, x, mask=None):
+        offset = 0
+        all_feats, all_logits = [], []
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            if self._has_proj:
+                x_mod = self.projectors[i](x_mod)
+            feat = self.backbones[i](x_mod, mask)
+            all_feats.append(feat)
+            all_logits.append(self.classifiers[i](feat))
+        cat_feats = torch.cat(all_feats, dim=1)
+        gate_weights = F.softmax(self.gate(cat_feats), dim=1)
+        stacked = torch.stack(all_logits, dim=1)
+        return (stacked * gate_weights.unsqueeze(-1)).sum(dim=1)
+class StackingFusionModel(nn.Module):
+    def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        M = len(self.mod_dims)
+        self.projectors = nn.ModuleList()
+        self.backbones = nn.ModuleList()
+        self.classifiers = nn.ModuleList()
+        for dim in self.mod_dims:
+            proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim)
+            self.projectors.append(proj if proj else nn.Identity())
+            self.backbones.append(bb)
+            self.classifiers.append(nn.Sequential(
+                nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes),
+            ))
+        self._has_proj = proj_dim > 0
+        self.meta_learner = nn.Sequential(
+            nn.Linear(M * num_classes, 32), nn.ReLU(),
+            nn.Dropout(0.5), nn.Linear(32, num_classes),
+        )
+    def forward(self, x, mask=None):
+        offset = 0
+        all_logits = []
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            if self._has_proj:
+                x_mod = self.projectors[i](x_mod)
+            feat = self.backbones[i](x_mod, mask)
+            all_logits.append(self.classifiers[i](feat))
+        cat_logits = torch.cat(all_logits, dim=1)
+        return self.meta_learner(cat_logits)
+class ProductOfExpertsModel(nn.Module):
+    def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        self.projectors = nn.ModuleList()
+        self.backbones = nn.ModuleList()
+        self.classifiers = nn.ModuleList()
+        for dim in self.mod_dims:
+            proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim)
+            self.projectors.append(proj if proj else nn.Identity())
+            self.backbones.append(bb)
+            self.classifiers.append(nn.Sequential(
+                nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes),
+            ))
+        self._has_proj = proj_dim > 0
+    def forward(self, x, mask=None):
+        offset = 0
+        log_probs_sum = None
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            if self._has_proj:
+                x_mod = self.projectors[i](x_mod)
+            feat = self.backbones[i](x_mod, mask)
+            logits = self.classifiers[i](feat)
+            log_p = F.log_softmax(logits, dim=1)
+            log_probs_sum = log_p if log_probs_sum is None else log_probs_sum + log_p
+        return log_probs_sum
+class MoEFusionModel(nn.Module):
+    def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        M = len(self.mod_dims)
+        self.top_k = min(2, M)
+        self.projectors = nn.ModuleList()
+        self.backbones = nn.ModuleList()
+        self.classifiers = nn.ModuleList()
+        total_feat_dim = 0
+        for dim in self.mod_dims:
+            proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim)
+            self.projectors.append(proj if proj else nn.Identity())
+            self.backbones.append(bb)
+            total_feat_dim += bb.output_dim
+            self.classifiers.append(nn.Sequential(
+                nn.Dropout(0.5), nn.Linear(bb.output_dim, num_classes),
+            ))
+        self._has_proj = proj_dim > 0
+        self.router = nn.Linear(total_feat_dim, M)
+    def forward(self, x, mask=None):
+        offset = 0
+        all_feats, all_logits = [], []
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            if self._has_proj:
+                x_mod = self.projectors[i](x_mod)
+            feat = self.backbones[i](x_mod, mask)
+            all_feats.append(feat)
+            all_logits.append(self.classifiers[i](feat))
+        cat_feats = torch.cat(all_feats, dim=1)
+        router_logits = self.router(cat_feats)
+        top_vals, top_idx = router_logits.topk(self.top_k, dim=1)
+        top_weights = F.softmax(top_vals, dim=1)
+        stacked = torch.stack(all_logits, dim=1)
+        top_idx_exp = top_idx.unsqueeze(-1).expand(-1, -1, stacked.size(-1))
+        selected = stacked.gather(1, top_idx_exp)
+        return (selected * top_weights.unsqueeze(-1)).sum(dim=1)
+class FeatureConcatFusionModel(nn.Module):
+    """Feature-level late fusion: separate backbones, concatenate features, joint classifier."""
+    def __init__(self, backbone_name, modality_dims, num_classes, hidden_dim=64, proj_dim=0):
+        super().__init__()
+        self.mod_names = list(modality_dims.keys())
+        self.mod_dims = list(modality_dims.values())
+        self.projectors = nn.ModuleList()
+        self.backbones = nn.ModuleList()
+        total_feat_dim = 0
+        for dim in self.mod_dims:
+            proj, bb = _make_branch(backbone_name, dim, hidden_dim, proj_dim)
+            self.projectors.append(proj if proj else nn.Identity())
+            self.backbones.append(bb)
+            total_feat_dim += bb.output_dim
+        self._has_proj = proj_dim > 0
+        self.classifier = nn.Sequential(
+            nn.LayerNorm(total_feat_dim),
+            nn.Dropout(0.5),
+            nn.Linear(total_feat_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(hidden_dim, num_classes),
+        )
+    def forward(self, x, mask=None):
+        offset = 0
+        all_feats = []
+        for i, dim in enumerate(self.mod_dims):
+            x_mod = x[:, :, offset:offset + dim]
+            offset += dim
+            if self._has_proj:
+                x_mod = self.projectors[i](x_mod)
+            feat = self.backbones[i](x_mod, mask)
+            all_feats.append(feat)
+        cat_feats = torch.cat(all_feats, dim=1)
+        return self.classifier(cat_feats)
+def build_model(backbone_name, fusion, input_dim, modality_dims, num_classes,
+                hidden_dim=128, proj_dim=0, late_agg='mean'):
+    """Factory function. proj_dim=0 means no projection (raw features)."""
+    if fusion == 'early':
+        return SingleModel(backbone_name, input_dim, num_classes, hidden_dim,
+                           modality_dims=modality_dims, proj_dim=proj_dim)
+    elif fusion == 'late':
+        return LateFusionModel(backbone_name, modality_dims, num_classes, hidden_dim,
+                               proj_dim, late_agg=late_agg)
+    elif fusion == 'attention':
+        return AttentionFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim)
+    elif fusion == 'weighted_late':
+        return WeightedLateFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim)
+    elif fusion == 'gated_late':
+        return GatedLateFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim)
+    elif fusion == 'stacking':
+        return StackingFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim)
+    elif fusion == 'product':
+        return ProductOfExpertsModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim)
+    elif fusion == 'moe':
+        return MoEFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim)
+    elif fusion == 'feat_concat':
+        return FeatureConcatFusionModel(backbone_name, modality_dims, num_classes, hidden_dim, proj_dim)
+    else:
+        raise ValueError(f"Unknown fusion: {fusion}")

experiments/nets/models_forecast.py ADDED Viewed

	@@ -0,0 +1,269 @@

+"""Frame-level future forecasting models.
+Three baselines (all sharing the same forecast head signature):
+  - TransformerForecast (our DAF-style)
+  - FUTRForecast       (Transformer encoder + parallel query decoder)
+  - DeepConvLSTMForecast (Ordoñez & Roggen 2016 wearable HAR backbone)
+All take a dict {mod: (B, T_obs, F_mod)} and output (B, T_fut, num_classes).
+"""
+from __future__ import annotations
+from typing import Dict, List
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# ---------------------------------------------------------------------------
+# Shared per-modality projection: each modality -> hidden dim d_model
+# ---------------------------------------------------------------------------
+class _PerModalityProj(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], d_model: int):
+        super().__init__()
+        self.proj = nn.ModuleDict({
+            m: nn.Linear(d, d_model) for m, d in modality_dims.items()
+        })
+        self.mod_emb = nn.Parameter(torch.zeros(len(modality_dims), d_model))
+        nn.init.trunc_normal_(self.mod_emb, std=0.02)
+        self.mods = list(modality_dims.keys())
+    def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor:
+        # Concatenate per-modality projections along time? Or sum?
+        # We sum modality-projected features per time step (with modality
+        # embedding broadcast). Equivalent to early-fusion at the d_model
+        # space and is what a "modality-aware Transformer" typically uses.
+        out = None
+        for i, m in enumerate(self.mods):
+            h = self.proj[m](x[m]) + self.mod_emb[i]
+            out = h if out is None else out + h
+        return out / len(self.mods)        # (B, T_obs, d_model)
+# ---------------------------------------------------------------------------
+# 1. Transformer (DAF-style) forecast model
+# ---------------------------------------------------------------------------
+class TransformerForecast(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], num_classes: int,
+                 t_obs: int, t_fut: int, d_model: int = 128,
+                 n_heads: int = 4, n_layers: int = 2, dropout: float = 0.1):
+        super().__init__()
+        self.t_obs = t_obs
+        self.t_fut = t_fut
+        self.num_classes = num_classes
+        self.embed = _PerModalityProj(modality_dims, d_model)
+        self.pos = nn.Parameter(torch.zeros(1, t_obs, d_model))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+            dropout=dropout, batch_first=True, activation="gelu",
+        )
+        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
+        self.queries = nn.Parameter(torch.zeros(1, t_fut, d_model))
+        nn.init.trunc_normal_(self.queries, std=0.02)
+        self.cross_attn = nn.MultiheadAttention(
+            d_model, n_heads, dropout=dropout, batch_first=True
+        )
+        self.norm = nn.LayerNorm(d_model)
+        self.head = nn.Linear(d_model, num_classes)
+    def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor:
+        h = self.embed(x) + self.pos
+        h = self.encoder(h)                                     # (B, T_obs, D)
+        q = self.queries.expand(h.size(0), -1, -1)              # (B, T_fut, D)
+        out, _ = self.cross_attn(q, h, h, need_weights=False)
+        out = self.norm(out)
+        return self.head(out)                                   # (B, T_fut, C)
+# ---------------------------------------------------------------------------
+# 2. FUTR-style forecast (Future Transformer, Gong et al. CVPR 2022)
+#    Same encoder + parallel query decoder. We add a small Transformer
+#    decoder so it's not literally identical to TransformerForecast.
+# ---------------------------------------------------------------------------
+class FUTRForecast(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], num_classes: int,
+                 t_obs: int, t_fut: int, d_model: int = 128,
+                 n_heads: int = 4, n_enc: int = 2, n_dec: int = 1,
+                 dropout: float = 0.1):
+        super().__init__()
+        self.t_obs = t_obs
+        self.t_fut = t_fut
+        self.num_classes = num_classes
+        self.embed = _PerModalityProj(modality_dims, d_model)
+        self.pos = nn.Parameter(torch.zeros(1, t_obs, d_model))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        enc_layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+            dropout=dropout, batch_first=True, activation="gelu",
+        )
+        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_enc)
+        dec_layer = nn.TransformerDecoderLayer(
+            d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+            dropout=dropout, batch_first=True, activation="gelu",
+        )
+        self.decoder = nn.TransformerDecoder(dec_layer, num_layers=n_dec)
+        self.queries = nn.Parameter(torch.zeros(1, t_fut, d_model))
+        nn.init.trunc_normal_(self.queries, std=0.02)
+        self.head = nn.Linear(d_model, num_classes)
+    def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor:
+        memory = self.encoder(self.embed(x) + self.pos)         # (B, T_obs, D)
+        q = self.queries.expand(memory.size(0), -1, -1)         # (B, T_fut, D)
+        out = self.decoder(q, memory)
+        return self.head(out)                                   # (B, T_fut, C)
+# ---------------------------------------------------------------------------
+# 3. DeepConvLSTM-style forecast
+# ---------------------------------------------------------------------------
+class DeepConvLSTMForecast(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], num_classes: int,
+                 t_obs: int, t_fut: int, conv_filters: int = 64,
+                 lstm_hidden: int = 128, n_lstm_layers: int = 2,
+                 dropout: float = 0.1):
+        super().__init__()
+        self.t_obs = t_obs
+        self.t_fut = t_fut
+        self.num_classes = num_classes
+        self.mods = list(modality_dims.keys())
+        in_ch = sum(modality_dims.values())
+        # Same 4-layer conv stack as the original DeepConvLSTM
+        layers = []
+        ch = in_ch
+        for i in range(4):
+            layers.append(nn.Sequential(
+                nn.Conv1d(ch, conv_filters, kernel_size=5, padding=2),
+                nn.BatchNorm1d(conv_filters),
+                nn.ReLU(),
+                nn.Dropout(dropout if i < 3 else 0.2),
+            ))
+            ch = conv_filters
+        self.convs = nn.ModuleList(layers)
+        self.lstm = nn.LSTM(
+            conv_filters, lstm_hidden, num_layers=n_lstm_layers,
+            batch_first=True, dropout=dropout if n_lstm_layers > 1 else 0,
+        )
+        self.head = nn.Linear(lstm_hidden, t_fut * num_classes)
+    def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor:
+        h = torch.cat([x[m] for m in self.mods], dim=-1)        # (B, T_obs, F_total)
+        h = h.permute(0, 2, 1)                                  # (B, F, T_obs)
+        for c in self.convs:
+            h = c(h)
+        h = h.permute(0, 2, 1)                                  # (B, T_obs, conv_filters)
+        out, (h_n, _) = self.lstm(h)
+        feat = h_n[-1]                                          # (B, lstm_hidden)
+        logits = self.head(feat).view(-1, self.t_fut, self.num_classes)
+        return logits
+# ---------------------------------------------------------------------------
+# 4. RU-LSTM (Furnari et al. RAL 2019, "Rolling-Unrolling LSTM for action
+#    anticipation"). Two-phase LSTM: a "rolling" phase encodes the past, an
+#    "unrolling" phase decodes T_fut steps from a repeated learned token.
+# ---------------------------------------------------------------------------
+class RULSTMForecast(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], num_classes: int,
+                 t_obs: int, t_fut: int, d_model: int = 128,
+                 n_lstm_layers: int = 2, dropout: float = 0.1):
+        super().__init__()
+        self.t_obs = t_obs
+        self.t_fut = t_fut
+        self.num_classes = num_classes
+        self.embed = _PerModalityProj(modality_dims, d_model)
+        self.rolling = nn.LSTM(
+            d_model, d_model, num_layers=n_lstm_layers,
+            batch_first=True, dropout=dropout if n_lstm_layers > 1 else 0,
+        )
+        self.unrolling = nn.LSTM(
+            d_model, d_model, num_layers=n_lstm_layers,
+            batch_first=True, dropout=dropout if n_lstm_layers > 1 else 0,
+        )
+        self.fut_init = nn.Parameter(torch.zeros(1, 1, d_model))
+        nn.init.trunc_normal_(self.fut_init, std=0.02)
+        self.head = nn.Linear(d_model, num_classes)
+    def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor:
+        h_past = self.embed(x)                                  # (B, T_obs, D)
+        _, (h_n, c_n) = self.rolling(h_past)
+        B = h_past.size(0)
+        # Use a learned initial future token, repeated T_fut times
+        fut_input = self.fut_init.expand(B, self.t_fut, -1)
+        out, _ = self.unrolling(fut_input, (h_n, c_n))
+        return self.head(out)                                   # (B, T_fut, C)
+# ---------------------------------------------------------------------------
+# 5. AVT (Girdhar & Grauman ICCV 2021, "Anticipative Video Transformer").
+#    Causal Transformer over the concatenation of past + future tokens.
+# ---------------------------------------------------------------------------
+class AVTForecast(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], num_classes: int,
+                 t_obs: int, t_fut: int, d_model: int = 128,
+                 n_heads: int = 4, n_layers: int = 2, dropout: float = 0.1):
+        super().__init__()
+        self.t_obs = t_obs
+        self.t_fut = t_fut
+        self.num_classes = num_classes
+        self.embed = _PerModalityProj(modality_dims, d_model)
+        seq_len = t_obs + t_fut
+        self.pos = nn.Parameter(torch.zeros(1, seq_len, d_model))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+            dropout=dropout, batch_first=True, activation="gelu",
+        )
+        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
+        self.fut_tokens = nn.Parameter(torch.zeros(1, t_fut, d_model))
+        nn.init.trunc_normal_(self.fut_tokens, std=0.02)
+        self.head = nn.Linear(d_model, num_classes)
+        # Causal mask over concatenated [past | future] sequence
+        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
+        self.register_buffer("causal_mask", mask)
+    def forward(self, x: Dict[str, torch.Tensor]) -> torch.Tensor:
+        h_past = self.embed(x)                                  # (B, T_obs, D)
+        B = h_past.size(0)
+        h_fut = self.fut_tokens.expand(B, -1, -1)               # (B, T_fut, D)
+        seq = torch.cat([h_past, h_fut], dim=1) + self.pos
+        out = self.encoder(seq, mask=self.causal_mask)
+        out_fut = out[:, self.t_obs:, :]
+        return self.head(out_fut)                               # (B, T_fut, C)
+# ---------------------------------------------------------------------------
+# Builder
+# ---------------------------------------------------------------------------
+def build_forecast_model(name: str, modality_dims: Dict[str, int],
+                         num_classes: int, t_obs: int, t_fut: int,
+                         d_model: int = 128, dropout: float = 0.1) -> nn.Module:
+    name = name.lower()
+    if name in ("daf", "transformer"):
+        return TransformerForecast(modality_dims, num_classes,
+                                   t_obs=t_obs, t_fut=t_fut,
+                                   d_model=d_model, dropout=dropout)
+    if name == "futr":
+        return FUTRForecast(modality_dims, num_classes,
+                            t_obs=t_obs, t_fut=t_fut,
+                            d_model=d_model, dropout=dropout)
+    if name == "deepconvlstm":
+        return DeepConvLSTMForecast(modality_dims, num_classes,
+                                    t_obs=t_obs, t_fut=t_fut,
+                                    dropout=dropout)
+    if name in ("rulstm", "ru-lstm", "ru_lstm"):
+        return RULSTMForecast(modality_dims, num_classes,
+                              t_obs=t_obs, t_fut=t_fut,
+                              d_model=d_model, dropout=dropout)
+    if name == "avt":
+        return AVTForecast(modality_dims, num_classes,
+                           t_obs=t_obs, t_fut=t_fut,
+                           d_model=d_model, dropout=dropout)
+    raise ValueError(f"Unknown forecast model: {name!r}")

experiments/nets/models_forecast_priv.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""Models for T8 v3 — privileged future-pressure conditioning.
+Wraps the existing TransformerForecast (DAF) to accept future pressure as
+side-channel context. The future pressure trajectory is encoded into T_fut
+tokens that get appended to the past memory; future queries cross-attend
+over the union (past sensors + future pressure). This is privileged
+information (oracle) — at test time we'd not have future pressure — so
+this is a hypothesis-test setup, not a deployable forecaster.
+"""
+from __future__ import annotations
+from typing import Dict
+import torch
+import torch.nn as nn
+class _PerModalityProj(nn.Module):
+    def __init__(self, modality_dims, d_model):
+        super().__init__()
+        self.proj = nn.ModuleDict({
+            m: nn.Linear(d, d_model) for m, d in modality_dims.items()
+        })
+        self.mod_emb = nn.Parameter(torch.zeros(len(modality_dims), d_model))
+        nn.init.trunc_normal_(self.mod_emb, std=0.02)
+        self.mods = list(modality_dims.keys())
+    def forward(self, x):
+        out = None
+        for i, m in enumerate(self.mods):
+            h = self.proj[m](x[m]) + self.mod_emb[i]
+            out = h if out is None else out + h
+        return out / len(self.mods)
+class DAFFuturePressure(nn.Module):
+    """DAF backbone + future-pressure conditioning."""
+    def __init__(self, modality_dims: Dict[str, int], target_dim: int,
+                 t_obs: int, t_fut: int, future_pressure_dim: int = 50,
+                 d_model: int = 128, n_heads: int = 4, n_layers: int = 2,
+                 dropout: float = 0.1):
+        super().__init__()
+        self.t_obs = t_obs
+        self.t_fut = t_fut
+        self.embed = _PerModalityProj(modality_dims, d_model)
+        self.pos = nn.Parameter(torch.zeros(1, t_obs, d_model))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+            dropout=dropout, batch_first=True, activation="gelu",
+        )
+        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
+        # future-pressure encoder
+        self.fp_proj = nn.Linear(future_pressure_dim, d_model)
+        self.fp_pos = nn.Parameter(torch.zeros(1, t_fut, d_model))
+        nn.init.trunc_normal_(self.fp_pos, std=0.02)
+        self.fp_seg = nn.Parameter(torch.zeros(1, 1, d_model))             # segment id
+        nn.init.trunc_normal_(self.fp_seg, std=0.02)
+        # decoder side
+        self.queries = nn.Parameter(torch.zeros(1, t_fut, d_model))
+        nn.init.trunc_normal_(self.queries, std=0.02)
+        self.cross_attn = nn.MultiheadAttention(
+            d_model, n_heads, dropout=dropout, batch_first=True
+        )
+        self.norm = nn.LayerNorm(d_model)
+        self.head = nn.Linear(d_model, target_dim)
+    def forward(self, x: Dict[str, torch.Tensor],
+                future_pressure: torch.Tensor) -> torch.Tensor:
+        h_past = self.encoder(self.embed(x) + self.pos)         # (B, T_obs, D)
+        h_fp = self.fp_proj(future_pressure) + self.fp_pos + self.fp_seg
+        memory = torch.cat([h_past, h_fp], dim=1)                # (B, T_obs+T_fut, D)
+        q = self.queries.expand(memory.size(0), -1, -1)          # (B, T_fut, D)
+        out, _ = self.cross_attn(q, memory, memory, need_weights=False)
+        out = self.norm(out)
+        return self.head(out)                                    # (B, T_fut, target_dim)

experiments/nets/models_seqpred.py ADDED Viewed

	@@ -0,0 +1,806 @@

+"""
+Models for T10 Triplet Next-Action Prediction.
+Three classes live here:
+  * TripletHead          — shared head module producing (verb_fine, verb_composite,
+                           noun, hand) logits from a pooled feature vector.
+  * DeepConvLSTMTriplet  — single-flow CNN+LSTM baseline (concatenates all
+                           available modalities along the feature axis).
+  * DailyActFormer       — our full-modality cross-modal Transformer that keeps
+                           each modality in its own stem, fuses via a modality
+                           token, and runs a causal temporal Transformer. Supports
+                           the anticipatory auxiliary loss mentioned in the paper
+                           plan (currently as a stub; enabled later in training).
+All models take:
+    x:     dict[mod_name -> (B, T, F_mod)]
+    mask:  BoolTensor (B, T)
+and return a dict:
+    {'verb_fine':      (B, NUM_VERB_FINE),
+     'verb_composite': (B, NUM_VERB_COMPOSITE),
+     'noun':           (B, NUM_NOUN),
+     'hand':           (B, NUM_HAND)}
+"""
+from __future__ import annotations
+import math
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# Importable from either (a) neurips26 root, or (b) frozen row/code/ folder.
+_THIS = Path(__file__).resolve()
+sys.path.insert(0, str(_THIS.parent))
+sys.path.insert(0, str(_THIS.parent.parent))
+try:
+    from experiments.taxonomy import (
+        NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, NUM_HAND,
+    )
+except ModuleNotFoundError:
+    from taxonomy import (
+        NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, NUM_HAND,
+    )
+# ---------------------------------------------------------------------------
+# Shared triplet head
+# ---------------------------------------------------------------------------
+class _PrevActionConcat(nn.Module):
+    """Embeds the previous-segment (verb_composite, noun) ground-truth labels
+    and concatenates them to a pooled feature vector. Used by every model
+    when `use_prev_action=True`. The +1 vocab slot is the BOS / no-prev
+    sentinel emitted by the dataset for the first kept segment of each
+    recording. Output dim added to pooled = 2 * prev_emb_dim."""
+    def __init__(self, prev_emb_dim: int = 32):
+        super().__init__()
+        from taxonomy import NUM_VERB_COMPOSITE as _NVC, NUM_NOUN as _NN  # noqa
+        self.vc_emb = nn.Embedding(_NVC + 1, prev_emb_dim)
+        self.n_emb  = nn.Embedding(_NN + 1, prev_emb_dim)
+        self.out_dim = 2 * prev_emb_dim
+    def forward(self, pooled: torch.Tensor,
+                prev_v_comp: Optional[torch.Tensor] = None,
+                prev_noun:   Optional[torch.Tensor] = None) -> torch.Tensor:
+        if prev_v_comp is None or prev_noun is None:
+            B = pooled.size(0)
+            prev_v_comp = torch.full((B,), self.vc_emb.num_embeddings - 1,
+                                     dtype=torch.long, device=pooled.device)
+            prev_noun = torch.full((B,), self.n_emb.num_embeddings - 1,
+                                   dtype=torch.long, device=pooled.device)
+        pe = torch.cat([self.vc_emb(prev_v_comp), self.n_emb(prev_noun)], dim=-1)
+        return torch.cat([pooled, pe], dim=-1)
+class TripletHead(nn.Module):
+    def __init__(self, feat_dim: int, hidden: int = 256, dropout: float = 0.2):
+        super().__init__()
+        self.norm = nn.LayerNorm(feat_dim)
+        self.trunk = nn.Sequential(
+            nn.Linear(feat_dim, hidden),
+            nn.GELU(),
+            nn.Dropout(dropout),
+        )
+        self.verb_fine      = nn.Linear(hidden, NUM_VERB_FINE)
+        self.verb_composite = nn.Linear(hidden, NUM_VERB_COMPOSITE)
+        self.noun           = nn.Linear(hidden, NUM_NOUN)
+        self.hand           = nn.Linear(hidden, NUM_HAND)
+    def forward(self, feat: torch.Tensor) -> Dict[str, torch.Tensor]:
+        h = self.trunk(self.norm(feat))
+        return {
+            "verb_fine":      self.verb_fine(h),
+            "verb_composite": self.verb_composite(h),
+            "noun":           self.noun(h),
+            "hand":           self.hand(h),
+        }
+def _masked_mean_pool(h: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+    """Mean over the time axis of `h` (B, T, D) using a boolean mask (B, T)."""
+    m = mask.to(h.dtype).unsqueeze(-1)
+    return (h * m).sum(dim=1) / m.sum(dim=1).clamp(min=1.0)
+# ---------------------------------------------------------------------------
+# Baseline: DeepConvLSTM (Ordonez & Roggen 2016) adapted for triplet prediction
+# ---------------------------------------------------------------------------
+class DeepConvLSTMTriplet(nn.Module):
+    """Single-flow CNN+LSTM. Concatenates per-modality features on F axis."""
+    def __init__(
+        self,
+        modality_dims: Dict[str, int],
+        conv_filters: int = 64,
+        conv_kernel: int = 5,
+        num_conv_layers: int = 4,
+        lstm_hidden: int = 128,
+        num_lstm_layers: int = 2,
+        dropout: float = 0.2,
+        head_hidden: int = 256,
+        use_prev_action: bool = False,
+        prev_emb_dim: int = 32,
+    ):
+        super().__init__()
+        self.modality_dims = dict(modality_dims)
+        self.use_prev_action = use_prev_action
+        in_ch = sum(modality_dims.values())
+        convs: List[nn.Module] = []
+        c = in_ch
+        for i in range(num_conv_layers):
+            convs.append(nn.Sequential(
+                nn.Conv1d(c, conv_filters, conv_kernel, padding=conv_kernel // 2),
+                nn.BatchNorm1d(conv_filters),
+                nn.ReLU(),
+                nn.Dropout(dropout if i < num_conv_layers - 1 else dropout + 0.1),
+            ))
+            c = conv_filters
+        self.convs = nn.Sequential(*convs)
+        self.lstm = nn.LSTM(
+            conv_filters, lstm_hidden, num_layers=num_lstm_layers,
+            batch_first=True, bidirectional=False,
+            dropout=dropout if num_lstm_layers > 1 else 0.0,
+        )
+        head_in = lstm_hidden
+        if use_prev_action:
+            self.prev_concat = _PrevActionConcat(prev_emb_dim)
+            head_in += self.prev_concat.out_dim
+        else:
+            self.prev_concat = None
+        self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout)
+    def forward(
+        self, x: Dict[str, torch.Tensor], mask: torch.Tensor,
+        prev_v_comp: Optional[torch.Tensor] = None,
+        prev_noun:   Optional[torch.Tensor] = None,
+    ) -> Dict[str, torch.Tensor]:
+        feats = torch.cat([x[m] for m in x], dim=-1).transpose(1, 2)
+        feats = self.convs(feats).transpose(1, 2)
+        out, (h_n, _) = self.lstm(feats)
+        pooled = h_n[-1]
+        if self.use_prev_action:
+            pooled = self.prev_concat(pooled, prev_v_comp, prev_noun)
+        return self.head(pooled)
+# ---------------------------------------------------------------------------
+# Our model: DailyActFormer
+# ---------------------------------------------------------------------------
+class _ModalityStem(nn.Module):
+    """Multi-scale 1-D conv stem (kernels 3, 5, 9) per modality.
+    Borrowed from HandFormer (the top-1 baseline on T10 recognition): three
+    parallel convolutions capture fast (k=3, ~0.15s @ 20Hz), medium (k=5),
+    and slow (k=9, ~0.45s) temporal patterns. Output is a 1×1 fusion of
+    the three branches, projected back to d_model.
+    """
+    def __init__(self, in_dim: int, d_model: int, kernels=(3, 5, 9),
+                 dropout: float = 0.1):
+        super().__init__()
+        self.kernels = kernels
+        self.branches = nn.ModuleList([
+            nn.Conv1d(in_dim, d_model, k, padding=k // 2) for k in kernels
+        ])
+        self.merge = nn.Sequential(
+            nn.GELU(),
+            nn.Conv1d(d_model * len(kernels), d_model, 1),
+        )
+        self.norm = nn.LayerNorm(d_model)
+        self.drop = nn.Dropout(dropout)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (B, T, F_in) -> (B, F_in, T) for conv1d
+        z = x.transpose(1, 2)
+        multi = [c(z) for c in self.branches]                # each (B, D, T)
+        h = self.merge(torch.cat(multi, dim=1)).transpose(1, 2)  # (B, T, D)
+        return self.drop(self.norm(h))
+class _QueryPool(nn.Module):
+    """Learnable-query cross-attention pooling (replaces mean pool).
+    Inspired by FUTR (the top-5 baseline winner): a single learnable query
+    cross-attends to the entire encoder output, producing one summary vector.
+    Compared to a plain mean pool this lets the model weight informative
+    frames more heavily.
+    """
+    def __init__(self, d_model: int, n_heads: int = 4, dropout: float = 0.1):
+        super().__init__()
+        self.q = nn.Parameter(torch.zeros(1, 1, d_model))
+        nn.init.trunc_normal_(self.q, std=0.02)
+        self.attn = nn.MultiheadAttention(
+            d_model, n_heads, dropout=dropout, batch_first=True,
+        )
+        self.norm = nn.LayerNorm(d_model)
+    def forward(self, h: torch.Tensor, key_padding_mask: Optional[torch.Tensor]):
+        # h: (B, T, D); key_padding_mask: (B, T) where True = pad-to-mask-out
+        B = h.size(0)
+        q = self.q.expand(B, -1, -1)
+        out, _ = self.attn(q, h, h, key_padding_mask=key_padding_mask,
+                           need_weights=False)
+        return self.norm(out.squeeze(1))
+class _CrossModalTemporalShift(nn.Module):
+    """Cross-modal temporal-shift attention between two modalities.
+    Motivation (paper case study, §sec:grasp-phase-main): EMG activation leads
+    motion onset by a sub-frame ~20ms in our 100Hz recordings. After the 5x
+    downsample to 20Hz, that lag is ~0.4 frames, but per-subject variability
+    plus slack in our segment annotations introduces a few frames of drift
+    that a fixed alignment cannot capture.
+    We learn a discrete temporal shift Δ ∈ {-max_shift, …, +max_shift} frames
+    applied to one of the two modalities (EMG by default), so the shifted
+    tokens align with the other branch (MoCap) before cross-modal fusion. The
+    shift is sampled via straight-through Gumbel-softmax during training; at
+    inference we take the argmax (deterministic).
+    Inputs are per-modality token sequences (B, T, D). Outputs the same shape.
+    Only the `shift_modality` branch is shifted; other modalities pass through.
+    """
+    def __init__(self, max_shift: int = 3, tau: float = 1.0):
+        super().__init__()
+        self.max_shift = max_shift
+        self.tau = tau
+        # Logits over 2*max_shift+1 categorical shift candidates.
+        self.shift_logits = nn.Parameter(torch.zeros(2 * max_shift + 1))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (B, T, D); produce a shifted version that's a soft-blend over
+        # the shift dimension. Hard at inference, gumbel-softmax at training.
+        if self.training:
+            w = F.gumbel_softmax(self.shift_logits, tau=self.tau, hard=True, dim=-1)
+        else:
+            w = F.one_hot(self.shift_logits.argmax(),
+                          num_classes=2 * self.max_shift + 1).float()
+        shifted = []
+        for i, s in enumerate(range(-self.max_shift, self.max_shift + 1)):
+            shifted.append(w[i] * torch.roll(x, shifts=s, dims=1))
+        return torch.stack(shifted, dim=0).sum(dim=0)
+class _CausalTransformerBlock(nn.Module):
+    """Standard Transformer encoder block with a strictly causal attention mask."""
+    def __init__(self, d_model: int, n_heads: int, mlp_ratio: float = 4.0,
+                 dropout: float = 0.1):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout,
+                                          batch_first=True)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        mlp_dim = int(d_model * mlp_ratio)
+        self.mlp = nn.Sequential(
+            nn.Linear(d_model, mlp_dim), nn.GELU(), nn.Dropout(dropout),
+            nn.Linear(mlp_dim, d_model), nn.Dropout(dropout),
+        )
+    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor,
+                key_padding_mask: Optional[torch.Tensor]) -> torch.Tensor:
+        h = self.norm1(x)
+        h, _ = self.attn(h, h, h, attn_mask=attn_mask,
+                         key_padding_mask=key_padding_mask, need_weights=False)
+        x = x + h
+        x = x + self.mlp(self.norm2(x))
+        return x
+class DailyActFormer(nn.Module):
+    """Cross-modal Transformer that uses every available modality.
+    Architecture outline:
+        per-modality stem  →  learnable modality embedding  →
+        concat across time (each frame -> M modality tokens)  →
+        1 fusion-layer cross-modal attention (compress M→1 per frame)  →
+        temporal Transformer (bidirectional by default; causal when
+        `causal=True` for anticipation-style next-action prediction)
+          →  pooled → TripletHead
+    For simplicity the fusion step is an attention pooling with learnable
+    queries, rather than a full cross-modal block. This keeps the parameter
+    count modest (2–4 M range with d_model=128).
+    """
+    def __init__(
+        self,
+        modality_dims: Dict[str, int],
+        d_model: int = 128,
+        n_layers: int = 4,
+        n_heads: int = 4,
+        dropout: float = 0.1,
+        head_hidden: int = 256,
+        max_T: int = 256,
+        causal: bool = False,
+        xshift_modality: Optional[str] = "emg",
+        xshift_max: int = 3,
+        use_prev_action: bool = False,
+        prev_emb_dim: int = 32,
+    ):
+        super().__init__()
+        self.modalities = list(modality_dims.keys())
+        self.causal = causal
+        self.use_prev_action = use_prev_action
+        # Prev-action concat (shared helper)
+        if use_prev_action:
+            self.prev_concat = _PrevActionConcat(prev_emb_dim)
+            self._prev_extra_dim = self.prev_concat.out_dim
+        else:
+            self.prev_concat = None
+            self._prev_extra_dim = 0
+        # 0) Cross-modal temporal-shift block on one branch (EMG by default).
+        # Disabled if `xshift_modality` is None or not present.
+        if xshift_modality is not None and xshift_modality in modality_dims:
+            self.xshift_modality = xshift_modality
+            self.xshift = _CrossModalTemporalShift(max_shift=xshift_max)
+        else:
+            self.xshift_modality = None
+            self.xshift = None
+        # 1) per-modality 1-D conv stems (each produces d_model features/frame)
+        self.stems = nn.ModuleDict({
+            m: _ModalityStem(F, d_model, dropout=dropout)
+            for m, F in modality_dims.items()
+        })
+        # 2) modality embedding (broadcast-add to per-modality tokens)
+        self.modality_embed = nn.Parameter(
+            torch.zeros(len(self.modalities), d_model)
+        )
+        nn.init.trunc_normal_(self.modality_embed, std=0.02)
+        # 3) per-frame cross-modal fusion: use a single learnable query token
+        self.fusion_q   = nn.Parameter(torch.zeros(1, 1, d_model))
+        self.fusion_kv  = nn.LayerNorm(d_model)
+        self.fusion_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
+        # 4) positional embedding along time (post-fusion)
+        self.pos_embed = nn.Parameter(torch.zeros(1, max_T, d_model))
+        nn.init.trunc_normal_(self.pos_embed, std=0.02)
+        self.max_T = max_T
+        # 5) causal temporal Transformer
+        self.temporal_norm = nn.LayerNorm(d_model)
+        self.temporal = nn.ModuleList([
+            _CausalTransformerBlock(d_model, n_heads, dropout=dropout)
+            for _ in range(n_layers)
+        ])
+        # 6) Pool: learnable-query cross-attention (replaces mean pool, FUTR-style)
+        self.pool = _QueryPool(d_model, n_heads=n_heads, dropout=dropout)
+        # 7) triplet head: input dim = d_model + (optional prev-action embed)
+        head_in = d_model + self._prev_extra_dim
+        self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout)
+        nn.init.trunc_normal_(self.fusion_q, std=0.02)
+    # ---- helpers ----
+    def _causal_mask(self, T: int, device) -> torch.Tensor:
+        # MultiheadAttention wants additive mask with -inf above diag.
+        m = torch.full((T, T), float("-inf"), device=device)
+        m.triu_(diagonal=1)
+        return m
+    # ---- forward ----
+    def forward(
+        self, x: Dict[str, torch.Tensor], mask: torch.Tensor,
+        prev_v_comp: Optional[torch.Tensor] = None,
+        prev_noun: Optional[torch.Tensor] = None,
+        return_features: bool = False,
+    ) -> Dict[str, torch.Tensor]:
+        # Stems: per-modality token streams
+        stem_tokens: List[torch.Tensor] = []
+        mods_in = [m for m in self.modalities if m in x]
+        if not mods_in:
+            raise ValueError("No modality from the model signature was provided.")
+        for i, m in enumerate(mods_in):
+            h = self.stems[m](x[m])                          # (B, T, D)
+            # Cross-modal temporal shift: apply to one branch (e.g. EMG) so it
+            # aligns with the others before fusion. Implements paper SyncFuse's
+            # main novelty (sub-frame anticipatory coupling between EMG/MoCap).
+            if self.xshift is not None and m == self.xshift_modality:
+                h = self.xshift(h)
+            h = h + self.modality_embed[self.modalities.index(m)]
+            stem_tokens.append(h)
+        # Cross-modal fusion: per-frame, attend learnable query over the M stacked
+        # modality tokens. Output is (B, T, D).
+        B, T, D = stem_tokens[0].shape
+        # stack -> (B, T, M, D) -> reshape as (B*T, M, D)
+        stacked = torch.stack(stem_tokens, dim=2)            # (B, T, M, D)
+        M = stacked.size(2)
+        stacked = stacked.reshape(B * T, M, D)
+        kv = self.fusion_kv(stacked)
+        q = self.fusion_q.expand(B * T, -1, -1)
+        fused, _ = self.fusion_attn(q, kv, kv, need_weights=False)
+        fused = fused.reshape(B, T, D)                        # (B, T, D)
+        # Positional embedding + causal temporal Transformer
+        if T > self.max_T:
+            raise ValueError(f"T={T} exceeds max_T={self.max_T}")
+        h = fused + self.pos_embed[:, :T, :]
+        h = self.temporal_norm(h)
+        attn_mask = self._causal_mask(T, h.device) if self.causal else None
+        key_padding = ~mask if mask is not None else None
+        for block in self.temporal:
+            h = block(h, attn_mask=attn_mask, key_padding_mask=key_padding)
+        # Pool: learnable-query cross-attention (FUTR-style) over valid frames
+        pooled = self.pool(h, key_padding_mask=key_padding)
+        # Optional: condition on previous segment's labels
+        if self.use_prev_action:
+            pooled = self.prev_concat(pooled, prev_v_comp, prev_noun)
+        logits = self.head(pooled)
+        if return_features:
+            logits["_pooled"] = pooled
+        return logits
+# ===========================================================================
+# Published baselines, sensor-adapted. Each keeps the original paper's key
+# idea (rolling+unrolling LSTM for RULSTM, causal encoder–decoder for FUTR,
+# early modality-token fusion for AFFT, etc.) but swaps the RGB/feature input
+# for our multimodal sensor streams, and the classification head for our
+# shared TripletHead.
+# ===========================================================================
+# ---------------------------------------------------------------------------
+# RULSTM (Furnari & Farinella, TPAMI 2020) — sensor-adapted
+#   Per-modality rolling LSTM summarises the past, a second unrolling LSTM
+#   takes R-LSTM state and walks `future_steps` steps forward to mimic
+#   anticipation without needing future sensor data. Fusion is late: each
+#   modality produces logits, we average them.
+# ---------------------------------------------------------------------------
+class _RULSTMBranch(nn.Module):
+    def __init__(self, in_dim: int, hidden: int, future_steps: int,
+                 dropout: float = 0.2):
+        super().__init__()
+        self.future_steps = future_steps
+        self.rolling   = nn.LSTM(in_dim, hidden, batch_first=True)
+        self.unrolling = nn.LSTMCell(hidden, hidden)
+        self.drop = nn.Dropout(dropout)
+        self.out_dim = hidden
+    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        # x: (B, T, F_in), mask: (B, T)
+        # Pack-free: LSTM on padded sequences is fine since we pool from h_n.
+        _, (h_n, c_n) = self.rolling(x)           # (1, B, H)
+        h = h_n.squeeze(0); c = c_n.squeeze(0)
+        inp = h
+        for _ in range(self.future_steps):
+            h, c = self.unrolling(inp, (h, c))
+            inp = h
+        return self.drop(h)
+class RULSTMTriplet(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], hidden: int = 128,
+                 future_steps: int = 8, dropout: float = 0.2,
+                 head_hidden: int = 256,
+                 use_prev_action: bool = False, prev_emb_dim: int = 32):
+        super().__init__()
+        self.use_prev_action = use_prev_action
+        self.branches = nn.ModuleDict({
+            m: _RULSTMBranch(F, hidden, future_steps, dropout)
+            for m, F in modality_dims.items()
+        })
+        head_in = hidden
+        if use_prev_action:
+            self.prev_concat = _PrevActionConcat(prev_emb_dim)
+            head_in += self.prev_concat.out_dim
+        else:
+            self.prev_concat = None
+        self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout)
+    def forward(self, x, mask, prev_v_comp=None, prev_noun=None):
+        feats = []
+        for m in x:
+            feats.append(self.branches[m](x[m], mask))
+        fused = torch.stack(feats, dim=0).mean(dim=0)
+        if self.use_prev_action:
+            fused = self.prev_concat(fused, prev_v_comp, prev_noun)
+        return self.head(fused)
+# ---------------------------------------------------------------------------
+# FUTR (Gong et al., CVPR 2022) — sensor-adapted
+#   Transformer encoder over observation frames (with per-frame feature from
+#   concat(modalities)). A decoder query attends over the encoder memory to
+#   produce a single future-action embedding which is fed into the triplet
+#   head. No autoregressive decoding — we only predict 1 target segment.
+# ---------------------------------------------------------------------------
+class FUTRTriplet(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], d_model: int = 128,
+                 n_heads: int = 4, n_layers: int = 3, dropout: float = 0.1,
+                 head_hidden: int = 256, max_T: int = 256,
+                 use_prev_action: bool = False, prev_emb_dim: int = 32):
+        super().__init__()
+        self.use_prev_action = use_prev_action
+        in_dim = sum(modality_dims.values())
+        self.in_proj = nn.Linear(in_dim, d_model)
+        self.pos = nn.Parameter(torch.zeros(1, max_T, d_model))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        self.max_T = max_T
+        enc_layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+            dropout=dropout, batch_first=True, activation="gelu",
+        )
+        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
+        self.future_q = nn.Parameter(torch.zeros(1, 1, d_model))
+        nn.init.trunc_normal_(self.future_q, std=0.02)
+        self.cross_attn = nn.MultiheadAttention(
+            d_model, n_heads, dropout=dropout, batch_first=True,
+        )
+        head_in = d_model
+        if use_prev_action:
+            self.prev_concat = _PrevActionConcat(prev_emb_dim)
+            head_in += self.prev_concat.out_dim
+        else:
+            self.prev_concat = None
+        self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout)
+    def forward(self, x, mask, prev_v_comp=None, prev_noun=None):
+        feats = torch.cat([x[m] for m in x], dim=-1)
+        B, T, _ = feats.shape
+        if T > self.max_T:
+            raise ValueError(f"T={T} exceeds FUTR max_T={self.max_T}")
+        h = self.in_proj(feats) + self.pos[:, :T, :]
+        h = self.encoder(h, src_key_padding_mask=~mask)
+        q = self.future_q.expand(B, -1, -1)
+        out, _ = self.cross_attn(q, h, h, key_padding_mask=~mask,
+                                 need_weights=False)
+        pooled = out.squeeze(1)
+        if self.use_prev_action:
+            pooled = self.prev_concat(pooled, prev_v_comp, prev_noun)
+        return self.head(pooled)
+# ---------------------------------------------------------------------------
+# AFFT (Zhong et al., WACV 2023) — sensor-adapted
+#   Per-modality tokens (one per frame per modality) are concatenated into a
+#   long token sequence of length T*M and passed through an encoder with
+#   causal temporal attention so the model must anticipate strictly from the
+#   past. Fusion happens "anticipatively" inside the attention.
+# ---------------------------------------------------------------------------
+class AFFTTriplet(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], d_model: int = 96,
+                 n_heads: int = 4, n_layers: int = 3, dropout: float = 0.1,
+                 head_hidden: int = 256, max_T: int = 256,
+                 use_prev_action: bool = False, prev_emb_dim: int = 32):
+        super().__init__()
+        self.use_prev_action = use_prev_action
+        self.modalities = list(modality_dims.keys())
+        self.stems = nn.ModuleDict({
+            m: nn.Linear(F, d_model) for m, F in modality_dims.items()
+        })
+        self.mod_embed = nn.Parameter(
+            torch.zeros(len(self.modalities), d_model)
+        )
+        nn.init.trunc_normal_(self.mod_embed, std=0.02)
+        self.pos = nn.Parameter(torch.zeros(1, max_T, d_model))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        self.max_T = max_T
+        self.d_model = d_model
+        self.blocks = nn.ModuleList([
+            _CausalTransformerBlock(d_model, n_heads, dropout=dropout)
+            for _ in range(n_layers)
+        ])
+        head_in = d_model
+        if use_prev_action:
+            self.prev_concat = _PrevActionConcat(prev_emb_dim)
+            head_in += self.prev_concat.out_dim
+        else:
+            self.prev_concat = None
+        self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout)
+    def _expand_causal_mask(self, T: int, M: int, device) -> torch.Tensor:
+        # Token layout: [m0_t0, m1_t0, ..., mM_t0, m0_t1, ..., mM_t(T-1)]
+        # Token at (m, t) can attend to all (m', t') with t' <= t.
+        ts = torch.arange(T, device=device).unsqueeze(1).expand(-1, M).reshape(-1)
+        return ts[:, None] < ts[None, :]          # True where future (mask out)
+    def forward(self, x, mask, prev_v_comp=None, prev_noun=None):
+        # Build per-frame token streams.
+        mods = [m for m in self.modalities if m in x]
+        per_mod_tokens = []
+        B, T, _ = x[mods[0]].shape
+        for i, m in enumerate(mods):
+            h = self.stems[m](x[m]) + self.mod_embed[self.modalities.index(m)]
+            per_mod_tokens.append(h)
+        stacked = torch.stack(per_mod_tokens, dim=2)
+        M = stacked.size(2)
+        tokens = stacked.reshape(B, T * M, self.d_model)
+        if T > self.max_T:
+            raise ValueError(f"T={T} exceeds AFFT max_T={self.max_T}")
+        pos_per_frame = self.pos[:, :T, :].unsqueeze(2).expand(-1, -1, M, -1)
+        tokens = tokens + pos_per_frame.reshape(1, T * M, self.d_model)
+        attn_mask = self._expand_causal_mask(T, M, tokens.device)
+        attn_mask = torch.where(attn_mask, torch.tensor(float("-inf"),
+                                                        device=tokens.device),
+                                torch.tensor(0.0, device=tokens.device))
+        kp = (~mask).unsqueeze(2).expand(-1, -1, M).reshape(B, T * M)
+        for blk in self.blocks:
+            tokens = blk(tokens, attn_mask=attn_mask, key_padding_mask=kp)
+        last_slice = tokens[:, -M:, :]
+        pooled = last_slice.mean(dim=1)
+        if self.use_prev_action:
+            pooled = self.prev_concat(pooled, prev_v_comp, prev_noun)
+        return self.head(pooled)
+# ---------------------------------------------------------------------------
+# HandFormer (Shamil et al., ECCV 2024) — sensor-adapted
+#   Originally on 3D hand poses. We feed it only the MoCap modality (which
+#   contains 10 fingertip joints). Multi-scale 1-D conv over time, followed
+#   by a Transformer. If MoCap is not in `modalities`, falls back to whatever
+#   is provided (but then it's no longer the paper's "pose-only" setup).
+# ---------------------------------------------------------------------------
+class HandFormerTriplet(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], d_model: int = 128,
+                 n_heads: int = 4, n_layers: int = 3, kernels=(3, 5, 9),
+                 dropout: float = 0.1, head_hidden: int = 256, max_T: int = 256,
+                 use_prev_action: bool = False, prev_emb_dim: int = 32):
+        super().__init__()
+        self.use_prev_action = use_prev_action
+        in_dim = sum(modality_dims.values())
+        self.multi_conv = nn.ModuleList([
+            nn.Conv1d(in_dim, d_model, k, padding=k // 2) for k in kernels
+        ])
+        self.conv_merge = nn.Conv1d(d_model * len(kernels), d_model, 1)
+        self.pos = nn.Parameter(torch.zeros(1, max_T, d_model))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        self.max_T = max_T
+        enc_layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+            dropout=dropout, batch_first=True, activation="gelu",
+        )
+        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
+        head_in = d_model
+        if use_prev_action:
+            self.prev_concat = _PrevActionConcat(prev_emb_dim)
+            head_in += self.prev_concat.out_dim
+        else:
+            self.prev_concat = None
+        self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout)
+    def forward(self, x, mask, prev_v_comp=None, prev_noun=None):
+        feats = torch.cat([x[m] for m in x], dim=-1).transpose(1, 2)
+        multi = [c(feats) for c in self.multi_conv]
+        h = self.conv_merge(torch.cat(multi, dim=1))
+        h = h.transpose(1, 2)
+        T = h.size(1)
+        if T > self.max_T:
+            raise ValueError(f"T={T} exceeds HandFormer max_T={self.max_T}")
+        h = h + self.pos[:, :T, :]
+        h = self.encoder(h, src_key_padding_mask=~mask)
+        pooled = _masked_mean_pool(h, mask)
+        if self.use_prev_action:
+            pooled = self.prev_concat(pooled, prev_v_comp, prev_noun)
+        return self.head(pooled)
+# ---------------------------------------------------------------------------
+# Placeholder ActionLLM — a conv-stem sensor encoder + a 2-layer Transformer
+# trained from scratch as a surrogate. The *full* LoRA+Qwen version lives in
+# `train_pred.py` and can be wired in later if the surrogate is too weak.
+# ---------------------------------------------------------------------------
+class ActionLLMSurrogate(nn.Module):
+    def __init__(self, modality_dims: Dict[str, int], d_model: int = 192,
+                 n_heads: int = 6, n_layers: int = 2, dropout: float = 0.1,
+                 head_hidden: int = 256, max_T: int = 256,
+                 use_prev_action: bool = False, prev_emb_dim: int = 32):
+        super().__init__()
+        self.use_prev_action = use_prev_action
+        in_dim = sum(modality_dims.values())
+        self.stem = nn.Sequential(
+            nn.Conv1d(in_dim, d_model, 5, padding=2),
+            nn.GELU(),
+            nn.Conv1d(d_model, d_model, 5, padding=2),
+        )
+        self.pos = nn.Parameter(torch.zeros(1, max_T, d_model))
+        nn.init.trunc_normal_(self.pos, std=0.02)
+        self.max_T = max_T
+        enc_layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=n_heads, dim_feedforward=4 * d_model,
+            dropout=dropout, batch_first=True, activation="gelu",
+        )
+        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
+        head_in = d_model
+        if use_prev_action:
+            self.prev_concat = _PrevActionConcat(prev_emb_dim)
+            head_in += self.prev_concat.out_dim
+        else:
+            self.prev_concat = None
+        self.head = TripletHead(head_in, hidden=head_hidden, dropout=dropout)
+    def forward(self, x, mask, prev_v_comp=None, prev_noun=None):
+        feats = torch.cat([x[m] for m in x], dim=-1).transpose(1, 2)
+        h = self.stem(feats).transpose(1, 2)
+        T = h.size(1)
+        if T > self.max_T:
+            raise ValueError(f"T={T} exceeds ActionLLM max_T={self.max_T}")
+        h = h + self.pos[:, :T, :]
+        h = self.encoder(h, src_key_padding_mask=~mask)
+        pooled = _masked_mean_pool(h, mask)
+        if self.use_prev_action:
+            pooled = self.prev_concat(pooled, prev_v_comp, prev_noun)
+        return self.head(pooled)
+# ---------------------------------------------------------------------------
+# Factory
+# ---------------------------------------------------------------------------
+def build_model(
+    name: str, modality_dims: Dict[str, int], **kwargs,
+) -> nn.Module:
+    name = name.lower()
+    if name in ("deepconvlstm", "dcl"):
+        return DeepConvLSTMTriplet(modality_dims, **kwargs)
+    if name in ("dailyactformer", "ours", "daf"):
+        return DailyActFormer(modality_dims, **kwargs)
+    if name in ("rulstm",):
+        return RULSTMTriplet(modality_dims, **kwargs)
+    if name in ("futr",):
+        return FUTRTriplet(modality_dims, **kwargs)
+    if name in ("afft",):
+        return AFFTTriplet(modality_dims, **kwargs)
+    if name in ("handformer",):
+        return HandFormerTriplet(modality_dims, **kwargs)
+    if name in ("actionllm",):
+        return ActionLLMSurrogate(modality_dims, **kwargs)
+    raise ValueError(f"Unknown model: {name}")
+# ---------------------------------------------------------------------------
+# Smoke-test: build each model, run a random batch, check output shapes.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    B, T = 2, 160
+    dims = {"imu": 180, "emg": 8, "eyetrack": 24}
+    x = {m: torch.randn(B, T, d) for m, d in dims.items()}
+    mask = torch.ones(B, T, dtype=torch.bool)
+    for name in ("deepconvlstm", "dailyactformer", "rulstm", "futr", "afft",
+                 "handformer", "actionllm"):
+        model = build_model(name, dims)
+        n_params = sum(p.numel() for p in model.parameters())
+        out = model(x, mask)
+        print(f"{name:16s} params={n_params:>10,}  shapes="
+              f"vf={tuple(out['verb_fine'].shape)} "
+              f"vc={tuple(out['verb_composite'].shape)} "
+              f"n={tuple(out['noun'].shape)} "
+              f"h={tuple(out['hand'].shape)}")

experiments/nets/published_models.py ADDED Viewed

	@@ -0,0 +1,699 @@

+"""
+Published baseline models for NeurIPS 2026 benchmark experiments.
+Contains faithful implementations of 6 published models:
+  1. DeepConvLSTM (Ordonez & Roggen, Sensors 2016) - Exp1/Exp3
+  2. InceptionTime (Fawaz et al., DMKD 2020) - Exp1/Exp3
+  3. MS-TCN++ (Li et al., TPAMI 2020) - Exp2
+  4. DiffAct (Liu et al., ICCV 2023) - Exp2
+  5. UnderPressure (Mourot et al., SCA/CGF 2022) - Exp3/Exp4a
+  6. emg2pose (Meta, NeurIPS 2024 D&B) - Exp4b
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+# ============================================================
+# 1. DeepConvLSTM (Ordonez & Roggen, Sensors 2016)
+#    "Deep Convolutional and LSTM Recurrent Neural Networks
+#     for Multimodal Wearable Activity Recognition"
+#    4 Conv layers -> 2 LSTM layers -> pooling/per-frame output
+# ============================================================
+class DeepConvLSTMBackbone(nn.Module):
+    """DeepConvLSTM backbone for sequence-level classification (Exp1).
+    Input: (B, T, C), optional mask
+    Output: (B, output_dim)
+    """
+    def __init__(self, input_dim, hidden_dim=128, num_conv_layers=4,
+                 conv_filters=64, conv_kernel=5, num_lstm_layers=2):
+        super().__init__()
+        conv_layers = []
+        in_ch = input_dim
+        for i in range(num_conv_layers):
+            out_ch = conv_filters
+            conv_layers.append(nn.Sequential(
+                nn.Conv1d(in_ch, out_ch, conv_kernel, padding=conv_kernel // 2),
+                nn.BatchNorm1d(out_ch),
+                nn.ReLU(),
+                nn.Dropout(0.1 if i < num_conv_layers - 1 else 0.2),
+            ))
+            in_ch = out_ch
+        self.convs = nn.ModuleList(conv_layers)
+        self.lstm = nn.LSTM(
+            conv_filters, hidden_dim, num_layers=num_lstm_layers,
+            batch_first=True, bidirectional=False,
+            dropout=0.2 if num_lstm_layers > 1 else 0,
+        )
+        self.output_dim = hidden_dim
+    def forward(self, x, mask=None):
+        # x: (B, T, C) -> Conv expects (B, C, T)
+        x = x.permute(0, 2, 1)
+        for conv in self.convs:
+            x = conv(x)
+        x = x.permute(0, 2, 1)  # (B, T, conv_filters)
+        out, (h_n, _) = self.lstm(x)
+        # Use last hidden state
+        feat = h_n[-1]  # (B, hidden_dim)
+        return feat
+class DeepConvLSTMContact(nn.Module):
+    """DeepConvLSTM for frame-level contact detection (Exp3).
+    Input: (B, T, C)
+    Output: (B, T, 2)
+    """
+    def __init__(self, input_dim, hidden_dim=64, num_conv_layers=4,
+                 conv_filters=64, conv_kernel=5):
+        super().__init__()
+        conv_layers = []
+        in_ch = input_dim
+        for i in range(num_conv_layers):
+            conv_layers.append(nn.Sequential(
+                nn.Conv1d(in_ch, conv_filters, conv_kernel, padding=conv_kernel // 2),
+                nn.BatchNorm1d(conv_filters),
+                nn.ReLU(),
+                nn.Dropout(0.1),
+            ))
+            in_ch = conv_filters
+        self.convs = nn.ModuleList(conv_layers)
+        self.lstm = nn.LSTM(conv_filters, hidden_dim, num_layers=2,
+                            batch_first=True, bidirectional=True, dropout=0.2)
+        self.head = nn.Linear(hidden_dim * 2, 2)
+    def forward(self, x):
+        x = x.permute(0, 2, 1)
+        for conv in self.convs:
+            x = conv(x)
+        x = x.permute(0, 2, 1)
+        out, _ = self.lstm(x)
+        return self.head(out)
+# ============================================================
+# 2. InceptionTime (Fawaz et al., DMKD 2020)
+#    "InceptionTime: Finding AlexNet for Time Series Classification"
+#    Inception modules with multi-scale convolutions + residual
+# ============================================================
+class InceptionModule(nn.Module):
+    """Single Inception module for time series."""
+    def __init__(self, in_channels, n_filters=32, kernel_sizes=(9, 19, 39),
+                 bottleneck_channels=32):
+        super().__init__()
+        # Bottleneck
+        self.bottleneck = nn.Conv1d(in_channels, bottleneck_channels, 1, bias=False)
+        # Parallel convolutions with different kernel sizes (odd kernels for symmetric padding)
+        self.convs = nn.ModuleList()
+        for ks in kernel_sizes:
+            self.convs.append(
+                nn.Conv1d(bottleneck_channels, n_filters, ks,
+                          padding=(ks - 1) // 2, bias=False)
+            )
+        # MaxPool branch
+        self.maxpool_conv = nn.Sequential(
+            nn.MaxPool1d(3, stride=1, padding=1),
+            nn.Conv1d(in_channels, n_filters, 1, bias=False),
+        )
+        self.bn = nn.BatchNorm1d(n_filters * (len(kernel_sizes) + 1))
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        # x: (B, C, T)
+        x_bottleneck = self.bottleneck(x)
+        conv_outputs = [conv(x_bottleneck) for conv in self.convs]
+        conv_outputs.append(self.maxpool_conv(x))
+        out = torch.cat(conv_outputs, dim=1)
+        return self.relu(self.bn(out))
+class InceptionBlock(nn.Module):
+    """Stack of Inception modules with a residual connection."""
+    def __init__(self, in_channels, n_filters=32, depth=3):
+        super().__init__()
+        n_out = n_filters * 4  # 3 conv branches + 1 maxpool branch
+        modules = []
+        for i in range(depth):
+            inc = in_channels if i == 0 else n_out
+            modules.append(InceptionModule(inc, n_filters))
+        self.modules_list = nn.ModuleList(modules)
+        # Residual connection
+        self.use_residual = (in_channels != n_out)
+        if self.use_residual:
+            self.residual = nn.Sequential(
+                nn.Conv1d(in_channels, n_out, 1, bias=False),
+                nn.BatchNorm1d(n_out),
+            )
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        residual = x
+        for mod in self.modules_list:
+            x = mod(x)
+        if self.use_residual:
+            residual = self.residual(residual)
+        return self.relu(x + residual)
+class InceptionTimeBackbone(nn.Module):
+    """InceptionTime backbone for sequence-level classification (Exp1).
+    Input: (B, T, C), optional mask
+    Output: (B, output_dim)
+    """
+    def __init__(self, input_dim, hidden_dim=128, n_filters=32, num_blocks=2, depth=3):
+        super().__init__()
+        blocks = []
+        in_ch = input_dim
+        for i in range(num_blocks):
+            blocks.append(InceptionBlock(in_ch, n_filters, depth))
+            in_ch = n_filters * 4
+        self.blocks = nn.ModuleList(blocks)
+        self.output_dim = n_filters * 4
+    def forward(self, x, mask=None):
+        # x: (B, T, C) -> (B, C, T)
+        x = x.permute(0, 2, 1)
+        for block in self.blocks:
+            x = block(x)
+        # Global average pooling with mask
+        if mask is not None:
+            x = (x * mask.unsqueeze(1).float()).sum(2) / mask.sum(1, keepdim=True).float().clamp(min=1)
+        else:
+            x = x.mean(2)
+        return x  # (B, n_filters*4)
+class InceptionTimeContact(nn.Module):
+    """InceptionTime for frame-level contact detection (Exp3).
+    Input: (B, T, C)
+    Output: (B, T, 2)
+    """
+    def __init__(self, input_dim, hidden_dim=64, n_filters=32, num_blocks=2, depth=3):
+        super().__init__()
+        blocks = []
+        in_ch = input_dim
+        for i in range(num_blocks):
+            blocks.append(InceptionBlock(in_ch, n_filters, depth))
+            in_ch = n_filters * 4
+        self.blocks = nn.ModuleList(blocks)
+        self.head = nn.Conv1d(n_filters * 4, 2, 1)
+    def forward(self, x):
+        x = x.permute(0, 2, 1)
+        for block in self.blocks:
+            x = block(x)
+        out = self.head(x)
+        return out.permute(0, 2, 1)  # (B, T, 2)
+# ============================================================
+# 3. MS-TCN++ (Li et al., TPAMI 2020)
+#    "MS-TCN++: Multi-Stage Temporal Convolutional Network
+#     for Action Segmentation"
+#    Key improvement: dual dilated layers in each residual block
+# ============================================================
+class DualDilatedResBlock(nn.Module):
+    """Dual dilated residual block (MS-TCN++ key contribution).
+    Uses two parallel dilated convolutions with different dilation rates
+    to capture both short-range and long-range temporal patterns.
+    """
+    def __init__(self, channels, dilation1, dilation2):
+        super().__init__()
+        # Branch 1: smaller dilation
+        self.conv1_dilated = nn.Conv1d(
+            channels, channels, 3,
+            padding=dilation1, dilation=dilation1
+        )
+        # Branch 2: larger dilation
+        self.conv2_dilated = nn.Conv1d(
+            channels, channels, 3,
+            padding=dilation2, dilation=dilation2
+        )
+        self.conv_fusion = nn.Conv1d(channels, channels, 1)
+        self.bn = nn.BatchNorm1d(channels)
+        self.dropout = nn.Dropout(0.3)
+    def forward(self, x):
+        residual = x
+        out1 = F.relu(self.conv1_dilated(x))
+        out2 = F.relu(self.conv2_dilated(x))
+        out = out1 + out2
+        out = self.dropout(F.relu(self.bn(self.conv_fusion(out))))
+        return out + residual
+class MSTCNPPStage(nn.Module):
+    """Single stage of MS-TCN++ with dual dilated layers."""
+    def __init__(self, in_channels, hidden_channels, num_classes, num_layers=10):
+        super().__init__()
+        self.input_conv = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.layers = nn.ModuleList()
+        for i in range(num_layers):
+            dilation1 = 2 ** i
+            dilation2 = 2 ** (i + 1) if i < num_layers - 1 else 2 ** i
+            self.layers.append(DualDilatedResBlock(hidden_channels, dilation1, dilation2))
+        self.output_conv = nn.Conv1d(hidden_channels, num_classes, 1)
+    def forward(self, x):
+        x = self.input_conv(x)
+        for layer in self.layers:
+            x = layer(x)
+        return self.output_conv(x)
+class MSTCNPP(nn.Module):
+    """MS-TCN++ for temporal action segmentation (Exp2).
+    Input: (B, T, C)
+    Output: list of (B, T, num_classes) per stage
+    """
+    def __init__(self, input_dim, num_classes, hidden_dim=64, num_stages=4, num_layers=10):
+        super().__init__()
+        self.stages = nn.ModuleList()
+        # First stage: input features -> predictions
+        self.stages.append(MSTCNPPStage(input_dim, hidden_dim, num_classes, num_layers))
+        # Refinement stages: predictions -> refined predictions
+        for _ in range(num_stages - 1):
+            self.stages.append(MSTCNPPStage(num_classes, hidden_dim, num_classes, num_layers))
+    def forward(self, x):
+        x = x.permute(0, 2, 1)  # (B, C, T)
+        outputs = []
+        for stage in self.stages:
+            x = stage(x)
+            outputs.append(x.permute(0, 2, 1))  # (B, T, num_classes)
+            # Feed softmax of predictions to next stage
+            if stage != self.stages[-1]:
+                x = F.softmax(x, dim=1)
+        return outputs
+# ============================================================
+# 4. DiffAct (Liu et al., ICCV 2023)
+#    "Diffusion Action Segmentation"
+#    Denoising diffusion model for iterative action refinement.
+#    Simplified but faithful implementation.
+# ============================================================
+class ConditionalLayerNorm(nn.Module):
+    """Layer norm conditioned on diffusion timestep."""
+    def __init__(self, channels):
+        super().__init__()
+        self.norm = nn.GroupNorm(1, channels)  # equivalent to LayerNorm for 1D
+    def forward(self, x):
+        return self.norm(x)
+class DiffActBlock(nn.Module):
+    """Residual block for DiffAct denoising network."""
+    def __init__(self, channels, dilation, time_emb_dim):
+        super().__init__()
+        self.conv1 = nn.Conv1d(channels, channels, 3, padding=dilation, dilation=dilation)
+        self.conv2 = nn.Conv1d(channels, channels, 1)
+        self.norm1 = ConditionalLayerNorm(channels)
+        self.norm2 = ConditionalLayerNorm(channels)
+        self.time_proj = nn.Linear(time_emb_dim, channels)
+        self.dropout = nn.Dropout(0.1)
+    def forward(self, x, time_emb):
+        residual = x
+        x = self.norm1(x)
+        x = F.relu(self.conv1(x))
+        # Add time embedding
+        t = self.time_proj(time_emb).unsqueeze(-1)  # (B, C, 1)
+        x = x + t
+        x = self.norm2(x)
+        x = self.dropout(F.relu(self.conv2(x)))
+        return x + residual
+class DiffActConditionEncoder(nn.Module):
+    """Temporal feature encoder for conditioning the denoising network."""
+    def __init__(self, input_dim, hidden_dim, num_layers=6):
+        super().__init__()
+        self.input_conv = nn.Conv1d(input_dim, hidden_dim, 1)
+        self.layers = nn.ModuleList()
+        for i in range(num_layers):
+            dilation = 2 ** (i % 5)
+            self.layers.append(nn.Sequential(
+                nn.Conv1d(hidden_dim, hidden_dim, 3, padding=dilation, dilation=dilation),
+                nn.BatchNorm1d(hidden_dim),
+                nn.ReLU(),
+                nn.Dropout(0.1),
+            ))
+    def forward(self, x):
+        x = self.input_conv(x)
+        for layer in self.layers:
+            x = layer(x) + x  # residual
+        return x
+class SinusoidalTimeEmbedding(nn.Module):
+    """Sinusoidal positional embedding for diffusion timestep."""
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+        self.mlp = nn.Sequential(
+            nn.Linear(dim, dim * 4),
+            nn.GELU(),
+            nn.Linear(dim * 4, dim),
+        )
+    def forward(self, t):
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=t.device) * -emb)
+        emb = t.unsqueeze(-1).float() * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+        return self.mlp(emb)
+class DiffAct(nn.Module):
+    """DiffAct: Diffusion Action Segmentation (Exp2).
+    During training: noises ground-truth action probabilities and denoises.
+    During inference: iteratively denoises from pure noise.
+    Input: (B, T, C)
+    Output: list of (B, T, num_classes) [final denoised prediction]
+    """
+    def __init__(self, input_dim, num_classes, hidden_dim=64,
+                 num_encoder_layers=6, num_denoise_layers=6,
+                 num_diffusion_steps=10):
+        super().__init__()
+        self.num_classes = num_classes
+        self.num_steps = num_diffusion_steps
+        # Condition encoder: extract temporal features from input
+        self.condition_encoder = DiffActConditionEncoder(input_dim, hidden_dim, num_encoder_layers)
+        # Initial prediction head (non-diffusion baseline)
+        self.initial_head = nn.Conv1d(hidden_dim, num_classes, 1)
+        # Time embedding
+        self.time_emb = SinusoidalTimeEmbedding(hidden_dim)
+        # Denoising network
+        self.denoise_input = nn.Conv1d(num_classes + hidden_dim, hidden_dim, 1)
+        self.denoise_blocks = nn.ModuleList()
+        for i in range(num_denoise_layers):
+            dilation = 2 ** (i % 5)
+            self.denoise_blocks.append(DiffActBlock(hidden_dim, dilation, hidden_dim))
+        self.denoise_output = nn.Conv1d(hidden_dim, num_classes, 1)
+        # Noise schedule (cosine)
+        self._setup_noise_schedule()
+    def _setup_noise_schedule(self):
+        steps = self.num_steps
+        s = 0.008
+        t = torch.linspace(0, steps, steps + 1)
+        alphas_cumprod = torch.cos(((t / steps) + s) / (1 + s) * math.pi * 0.5) ** 2
+        alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+        betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
+        betas = torch.clamp(betas, 0.0001, 0.999)
+        alphas = 1.0 - betas
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
+        self.register_buffer('betas', betas)
+        self.register_buffer('alphas_cumprod', alphas_cumprod)
+        self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
+        self.register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1 - alphas_cumprod))
+    def _add_noise(self, x_start, t, noise=None):
+        """Add noise to x_start at timestep t."""
+        if noise is None:
+            noise = torch.randn_like(x_start)
+        sqrt_alpha = self.sqrt_alphas_cumprod[t].view(-1, 1, 1)
+        sqrt_one_minus = self.sqrt_one_minus_alphas_cumprod[t].view(-1, 1, 1)
+        return sqrt_alpha * x_start + sqrt_one_minus * noise
+    def _denoise_step(self, x_noisy, cond_features, time_emb):
+        """Single denoising step."""
+        x = torch.cat([x_noisy, cond_features], dim=1)  # (B, C+hidden, T)
+        x = self.denoise_input(x)
+        for block in self.denoise_blocks:
+            x = block(x, time_emb)
+        return self.denoise_output(x)
+    def forward(self, x):
+        """
+        Training: returns [initial_pred, denoised_pred]
+        Inference: returns [initial_pred, iteratively_denoised_pred]
+        """
+        x_in = x.permute(0, 2, 1)  # (B, C, T)
+        B, _, T = x_in.shape
+        # Encode condition features
+        cond = self.condition_encoder(x_in)  # (B, hidden, T)
+        initial_logits = self.initial_head(cond).permute(0, 2, 1)  # (B, T, num_classes)
+        if self.training:
+            # Training: noise the initial prediction and denoise (end-to-end)
+            x_start = F.softmax(initial_logits, dim=-1).permute(0, 2, 1)  # (B, C, T)
+            t = torch.randint(0, self.num_steps, (B,), device=x.device)
+            noise = torch.randn_like(x_start)
+            x_noisy = self._add_noise(x_start.detach(), t, noise)
+            time_emb = self.time_emb(t)
+            denoised = self._denoise_step(x_noisy, cond, time_emb)
+            return [initial_logits, denoised.permute(0, 2, 1)]
+        else:
+            # Inference: iterative denoising from noise
+            x_t = torch.randn(B, self.num_classes, T, device=x.device)
+            for step in reversed(range(self.num_steps)):
+                t = torch.full((B,), step, device=x.device, dtype=torch.long)
+                time_emb = self.time_emb(t)
+                pred_noise = self._denoise_step(x_t, cond, time_emb)
+                # Simplified DDPM update
+                alpha = self.alphas_cumprod[step]
+                alpha_prev = self.alphas_cumprod[step - 1] if step > 0 else torch.tensor(1.0)
+                beta = self.betas[step]
+                x_t = (1 / torch.sqrt(1 - beta)) * (
+                    x_t - beta / self.sqrt_one_minus_alphas_cumprod[step] * pred_noise
+                )
+                if step > 0:
+                    x_t = x_t + torch.sqrt(beta) * torch.randn_like(x_t) * 0.5
+            return [initial_logits, x_t.permute(0, 2, 1)]
+# ============================================================
+# 5. UnderPressure (Mourot et al., SCA/CGF 2022)
+#    "UnderPressure: Deep Learning for Foot Contact Detection,
+#     Ground Reaction Force Estimation and Footskate Cleanup"
+#    GRU-based architecture for contact detection + force regression.
+#    Adapted for hand contact detection and MoCap->Pressure prediction.
+# ============================================================
+class UnderPressureContact(nn.Module):
+    """UnderPressure model adapted for hand contact detection (Exp3).
+    Architecture: Conv feature extractor -> BiGRU -> contact prediction head
+    Input: (B, T, C)
+    Output: (B, T, 2) [right_contact, left_contact]
+    """
+    def __init__(self, input_dim, hidden_dim=64, num_gru_layers=2):
+        super().__init__()
+        # Feature extractor (conv layers for local temporal patterns)
+        self.feature_extractor = nn.Sequential(
+            nn.Conv1d(input_dim, hidden_dim, 7, padding=3),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+            nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+        )
+        # BiGRU for temporal modeling
+        self.gru = nn.GRU(
+            hidden_dim, hidden_dim, num_layers=num_gru_layers,
+            batch_first=True, bidirectional=True,
+            dropout=0.2 if num_gru_layers > 1 else 0,
+        )
+        # Contact prediction head
+        self.contact_head = nn.Sequential(
+            nn.Linear(hidden_dim * 2, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(hidden_dim, 2),
+        )
+    def forward(self, x):
+        # x: (B, T, C) -> (B, C, T)
+        feat = self.feature_extractor(x.permute(0, 2, 1))
+        feat = feat.permute(0, 2, 1)  # (B, T, hidden)
+        gru_out, _ = self.gru(feat)
+        return self.contact_head(gru_out)  # (B, T, 2)
+class UnderPressureRegressor(nn.Module):
+    """UnderPressure model adapted for MoCap -> Pressure regression (Exp4a).
+    Architecture: Conv feature extractor -> BiGRU -> pressure regression head
+    Input: (B, T, input_dim)
+    Output: (B, T, output_dim)
+    """
+    def __init__(self, input_dim, output_dim, hidden_dim=128, num_gru_layers=2):
+        super().__init__()
+        self.feature_extractor = nn.Sequential(
+            nn.Conv1d(input_dim, hidden_dim, 7, padding=3),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+            nn.Conv1d(hidden_dim, hidden_dim, 5, padding=2),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+            nn.Conv1d(hidden_dim, hidden_dim, 3, padding=1),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+        )
+        self.gru = nn.GRU(
+            hidden_dim, hidden_dim, num_layers=num_gru_layers,
+            batch_first=True, bidirectional=True,
+            dropout=0.2 if num_gru_layers > 1 else 0,
+        )
+        self.regression_head = nn.Sequential(
+            nn.Linear(hidden_dim * 2, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(hidden_dim, output_dim),
+        )
+    def forward(self, x):
+        feat = self.feature_extractor(x.permute(0, 2, 1))
+        feat = feat.permute(0, 2, 1)
+        gru_out, _ = self.gru(feat)
+        return self.regression_head(gru_out)
+# ============================================================
+# 6. emg2pose (Meta/Facebook Research, NeurIPS 2024 D&B)
+#    "emg2pose: A Large and Diverse Benchmark for
+#     Surface Electromyographic Hand Pose Estimation"
+#    CNN feature extractor + Transformer encoder,
+#    with optional velocity-based integration (vemg2pose).
+# ============================================================
+class EMG2PoseEncoder(nn.Module):
+    """CNN + Transformer encoder from emg2pose."""
+    def __init__(self, input_dim, hidden_dim=128, num_transformer_layers=4, nhead=4):
+        super().__init__()
+        # Multi-scale CNN feature extractor
+        self.conv_small = nn.Sequential(
+            nn.Conv1d(input_dim, hidden_dim // 2, 3, padding=1),
+            nn.BatchNorm1d(hidden_dim // 2),
+            nn.ReLU(),
+        )
+        self.conv_medium = nn.Sequential(
+            nn.Conv1d(input_dim, hidden_dim // 4, 7, padding=3),
+            nn.BatchNorm1d(hidden_dim // 4),
+            nn.ReLU(),
+        )
+        self.conv_large = nn.Sequential(
+            nn.Conv1d(input_dim, hidden_dim // 4, 15, padding=7),
+            nn.BatchNorm1d(hidden_dim // 4),
+            nn.ReLU(),
+        )
+        # Projection to hidden_dim
+        self.proj = nn.Sequential(
+            nn.Conv1d(hidden_dim, hidden_dim, 1),
+            nn.BatchNorm1d(hidden_dim),
+            nn.ReLU(),
+        )
+        # Transformer encoder for temporal modeling
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=hidden_dim, nhead=nhead,
+            dim_feedforward=hidden_dim * 4,
+            dropout=0.1, batch_first=True,
+        )
+        self.transformer = nn.TransformerEncoder(encoder_layer, num_transformer_layers)
+    def forward(self, x):
+        # x: (B, T, C) -> (B, C, T)
+        x_t = x.permute(0, 2, 1)
+        f_small = self.conv_small(x_t)
+        f_medium = self.conv_medium(x_t)
+        f_large = self.conv_large(x_t)
+        feat = torch.cat([f_small, f_medium, f_large], dim=1)
+        feat = self.proj(feat).permute(0, 2, 1)  # (B, T, hidden)
+        return self.transformer(feat)
+class EMG2Pose(nn.Module):
+    """emg2pose model for EMG -> Hand Pose regression (Exp4b).
+    Predicts per-frame hand joint positions from EMG signals.
+    Uses velocity-based integration (vemg2pose variant):
+      predict velocity -> integrate to get positions.
+    Input: (B, T, input_dim)  [EMG channels]
+    Output: (B, T, output_dim)  [hand joint positions]
+    """
+    def __init__(self, input_dim, output_dim, hidden_dim=128,
+                 num_transformer_layers=4, use_velocity=True):
+        super().__init__()
+        self.use_velocity = use_velocity
+        self.encoder = EMG2PoseEncoder(input_dim, hidden_dim, num_transformer_layers)
+        if use_velocity:
+            # Predict velocity, then integrate
+            self.velocity_head = nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim // 2),
+                nn.ReLU(),
+                nn.Dropout(0.1),
+                nn.Linear(hidden_dim // 2, output_dim),
+            )
+            # Learnable initial position
+            self.initial_pos = nn.Parameter(torch.zeros(1, 1, output_dim))
+        else:
+            # Direct position prediction
+            self.position_head = nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim // 2),
+                nn.ReLU(),
+                nn.Dropout(0.1),
+                nn.Linear(hidden_dim // 2, output_dim),
+            )
+    def forward(self, x):
+        features = self.encoder(x)  # (B, T, hidden)
+        if self.use_velocity:
+            velocity = self.velocity_head(features)  # (B, T, output_dim)
+            # Cumulative sum to integrate velocity -> position
+            positions = torch.cumsum(velocity, dim=1) + self.initial_pos
+            return positions
+        else:
+            return self.position_head(features)

experiments/s9_primitives.json ADDED Viewed

	@@ -0,0 +1,76 @@

+{
+  "version": "s9_docx_2025_12_05",
+  "source": "${PULSE_ROOT}",
+  "categories": ["hand", "arm", "body", "fine", "composite"],
+  "primitives": [
+    {"id":  0, "category": "hand", "zh": "伸手",       "en": "reach",                "note": "forward/up/down/side"},
+    {"id":  1, "category": "hand", "zh": "抓握",       "en": "grasp",                "note": "pinch / hold / clamp"},
+    {"id":  2, "category": "hand", "zh": "松开",       "en": "release",              "note": "release object"},
+    {"id":  3, "category": "hand", "zh": "旋转手腕",   "en": "rotate_wrist",         "note": "twist / turn"},
+    {"id":  4, "category": "hand", "zh": "按压",       "en": "press",                "note": "downward force"},
+    {"id":  5, "category": "hand", "zh": "拉动",       "en": "pull",                 "note": "toward self"},
+    {"id":  6, "category": "hand", "zh": "推动",       "en": "push",                 "note": "outward force"},
+    {"id":  7, "category": "hand", "zh": "滑动",       "en": "slide",                "note": "translation motion"},
+    {"id":  8, "category": "hand", "zh": "捏合",       "en": "pinch",                "note": "two/multi finger pinch"},
+    {"id":  9, "category": "hand", "zh": "展开",       "en": "spread_fingers",       "note": "fingers open"},
+    {"id": 10, "category": "arm",  "zh": "抬起",       "en": "raise_arm",            "note": "arm up"},
+    {"id": 11, "category": "arm",  "zh": "放下",       "en": "lower_arm",            "note": "arm down"},
+    {"id": 12, "category": "arm",  "zh": "伸展",       "en": "extend_arm",           "note": "arm straight"},
+    {"id": 13, "category": "arm",  "zh": "弯曲",       "en": "bend_elbow",           "note": "elbow bend"},
+    {"id": 14, "category": "arm",  "zh": "摆动",       "en": "swing_arm",            "note": "left-right / forward-back"},
+    {"id": 15, "category": "arm",  "zh": "环绕",       "en": "circle_arm",           "note": "circular motion"},
+    {"id": 16, "category": "body", "zh": "弯腰",       "en": "bend_torso",           "note": "lean forward"},
+    {"id": 17, "category": "body", "zh": "直立",       "en": "stand_upright",        "note": "return to standing"},
+    {"id": 18, "category": "body", "zh": "蹲下",       "en": "squat_down",           "note": "lower center of mass"},
+    {"id": 19, "category": "body", "zh": "站起",       "en": "stand_up",             "note": "return to height"},
+    {"id": 20, "category": "body", "zh": "转身",       "en": "turn_body",            "note": "torso rotate"},
+    {"id": 21, "category": "body", "zh": "侧身",       "en": "lean_side",            "note": "torso tilt"},
+    {"id": 22, "category": "body", "zh": "迈步",       "en": "step",                 "note": "shift position"},
+    {"id": 23, "category": "fine", "zh": "插入",       "en": "insert",               "note": "object enters"},
+    {"id": 24, "category": "fine", "zh": "拔出",       "en": "extract",              "note": "object exits"},
+    {"id": 25, "category": "fine", "zh": "折叠",       "en": "fold",                 "note": "change shape"},
+    {"id": 26, "category": "fine", "zh": "撕扯",       "en": "tear",                 "note": "separate"},
+    {"id": 27, "category": "fine", "zh": "擦拭",       "en": "wipe",                 "note": "back-and-forth"},
+    {"id": 28, "category": "composite", "zh": "拿起物品",     "en": "pick_up_object",        "note": "reach -> grasp -> raise"},
+    {"id": 29, "category": "composite", "zh": "放下物品",     "en": "put_down_object",       "note": "move -> release -> retract"},
+    {"id": 30, "category": "composite", "zh": "移动物品",     "en": "move_object",           "note": "pick_up -> move -> put_down"},
+    {"id": 31, "category": "composite", "zh": "交换手持物",   "en": "transfer_between_hands","note": "one hand grasp -> other hand take -> first release"},
+    {"id": 32, "category": "composite", "zh": "打开盖子",     "en": "open_lid",              "note": "grasp -> rotate/lift"},
+    {"id": 33, "category": "composite", "zh": "关闭盖子",     "en": "close_lid",             "note": "align -> press/rotate"},
+    {"id": 34, "category": "composite", "zh": "倒入液体",     "en": "pour_liquid",           "note": "lift -> tilt -> control flow -> reset"},
+    {"id": 35, "category": "composite", "zh": "舀取",         "en": "scoop",                 "note": "insert -> raise -> move"},
+    {"id": 36, "category": "composite", "zh": "打开柜门",     "en": "open_cabinet_door",     "note": "grasp handle -> pull"},
+    {"id": 37, "category": "composite", "zh": "关闭柜门",     "en": "close_cabinet_door",    "note": "push -> confirm"},
+    {"id": 38, "category": "composite", "zh": "打开抽屉",     "en": "open_drawer",           "note": "grasp -> pull out"},
+    {"id": 39, "category": "composite", "zh": "按下开关",     "en": "press_switch",          "note": "reach -> press"},
+    {"id": 40, "category": "composite", "zh": "折叠衣物",     "en": "fold_clothing",         "note": "spread -> fold -> flatten"},
+    {"id": 41, "category": "composite", "zh": "叠放物品",     "en": "stack_objects",         "note": "pick_up -> align -> place gently"},
+    {"id": 42, "category": "composite", "zh": "排列物品",     "en": "arrange_objects",       "note": "move -> adjust spacing -> align"},
+    {"id": 43, "category": "composite", "zh": "分类收纳",     "en": "sort_and_store",        "note": "identify -> group -> place"},
+    {"id": 44, "category": "composite", "zh": "擦拭表面",     "en": "wipe_surface",          "note": "take cloth -> press -> back-and-forth"},
+    {"id": 45, "category": "composite", "zh": "扫除垃圾",     "en": "sweep_debris",          "note": "broom -> gather -> dustpan"},
+    {"id": 46, "category": "composite", "zh": "倾倒垃圾",     "en": "dump_trash",            "note": "lift container -> align -> tilt -> pour"},
+    {"id": 47, "category": "composite", "zh": "喷洒液体",     "en": "spray_liquid",          "note": "press nozzle -> move -> release"},
+    {"id": 48, "category": "composite", "zh": "撕胶带",       "en": "tear_tape",             "note": "pull -> tear off"},
+    {"id": 49, "category": "composite", "zh": "贴标签",       "en": "stick_label",           "note": "peel -> align -> press"},
+    {"id": 50, "category": "composite", "zh": "包裹物品",     "en": "wrap_object",           "note": "spread wrap -> place item -> fold -> seal"},
+    {"id": 51, "category": "composite", "zh": "系绳打结",     "en": "tie_knot",              "note": "cross -> through -> tighten"},
+    {"id": 52, "category": "composite", "zh": "拿起笔",       "en": "pick_up_pen",           "note": "pinch -> adjust grip"},
+    {"id": 53, "category": "composite", "zh": "写字",         "en": "write",                 "note": "controlled motion -> apply pressure"},
+    {"id": 54, "category": "composite", "zh": "翻页",         "en": "turn_page",             "note": "pinch corner -> flip"},
+    {"id": 55, "category": "composite", "zh": "插入电源",     "en": "plug_in_power",         "note": "align -> push in"},
+    {"id": 56, "category": "composite", "zh": "连接线缆",     "en": "connect_cable",         "note": "align connector -> insert -> confirm"},
+    {"id": 57, "category": "composite", "zh": "组装部件",     "en": "assemble_parts",        "note": "align -> snap/screw"},
+    {"id": 58, "category": "composite", "zh": "称重",         "en": "weigh",                 "note": "place item -> read scale"},
+    {"id": 59, "category": "composite", "zh": "量取",         "en": "measure_volume",        "note": "pour -> read marking -> adjust"},
+    {"id": 60, "category": "composite", "zh": "计数",         "en": "count",                 "note": "move one by one -> tally"},
+    {"id": 61, "category": "composite", "zh": "挂衣服",       "en": "hang_clothing",         "note": "take hanger -> insert garment -> hang"},
+    {"id": 62, "category": "composite", "zh": "铲猫砂",       "en": "scoop_litter",          "note": "insert -> raise -> sift -> pour"},
+    {"id": 63, "category": "composite", "zh": "搅拌",         "en": "stir",                  "note": "insert spoon -> circular motion"},
+    {"id": 64, "category": "composite", "zh": "剪切",         "en": "cut",                   "note": "hold scissors -> align -> close"}
+  ]
+}

experiments/slurm/freeze_all_rows.sh ADDED Viewed

	@@ -0,0 +1,179 @@

+#!/bin/bash
+# Build the folder structure for every row of Tables 1, 3, 4, 5 and 7 and
+# snapshot the current experiments/ code into each row (via setup_row.sh).
+# Afterwards, cd into any <table>/<row>/ and run ./run.sh to submit the
+# 5 SLURM seeds for that row.
+#
+# The script is meant to be re-runnable: it re-freezes (overwrites) the code
+# snapshot but is not supposed to clobber existing seeds/ outputs.
+#
+# Environment: BASEDIR (optional) overrides the project root; otherwise
+# PULSE_ROOT is used.
+set -euo pipefail
+BASEDIR=${BASEDIR:-${PULSE_ROOT}}
+EXP="${BASEDIR}/experiments"
+SETUP="${EXP}/setup_row.sh"
+# Training flags shared by every row, kept as one space-separated string.
+COMMON="--epochs 40 --batch_size 32 --lr 3e-4 --weight_decay 1e-4"
+COMMON+=" --patience 12 --label_smoothing 0.05 --use_class_weights"
+COMMON+=" --num_workers 2"
+# The full five-modality set used by the headline model.
+ALL5="imu,emg,eyetrack,mocap,pressure"
+row () {
+    # Freeze one table row by delegating to setup_row.sh.
+    # $1=table  $2=row  $3=desc  $4=cli
+    #
+    # COMMON is placed BEFORE the row-specific flags so that a row can
+    # override a shared default. With COMMON last, a row passing
+    # "--label_smoothing 0.0" was followed by COMMON's "--label_smoothing 0.05",
+    # and a last-occurrence-wins parser (e.g. Python argparse) would silently
+    # keep 0.05. NOTE(review): assumes the training CLI resolves repeated
+    # flags that way — confirm against train_seqpred.py.
+    bash "${SETUP}" --table "$1" --row "$2" --desc "$3" --cli "${COMMON} $4"
+}
+# ============================================================
+# Table 1: Main comparison at T_fut=2s
+# ============================================================
+T1=table1_main_comparison
+# The directory must exist before the README is redirected into it; on a
+# fresh checkout the redirect would otherwise fail and `set -e` would abort.
+mkdir -p "${BASEDIR}/${T1}"
+cat > "${BASEDIR}/${T1}/README.md" <<'EOF'
+# Table 1: Main Comparison (Next-Action Prediction, T_fut = 2 s)
+Each baseline is run on its most favourable modality subset; our model
+(DailyActFormer) uses all 5 synchronised modalities. 5 seeds per row;
+report mean ± std of Verb fine Top-1/5, Noun Top-1/5, Hand Top-1, Action
+Top-1 (= verb ∧ noun ∧ hand). Action Top-1 is the headline metric.
+| Row | Method            | Family          | Modalities          |
+|-----|-------------------|-----------------|---------------------|
+| 01  | DailyActFormer    | cross-modal Trf | imu+emg+eye+mocap+P |
+| 02  | DeepConvLSTM      | CNN+LSTM (IMU)  | imu                 |
+| 03  | DeepConvLSTM 3mod | CNN+LSTM        | imu+mocap+emg       |
+| 04  | RULSTM            | rolling LSTM    | imu+mocap           |
+| 05  | FUTR              | long-term Trf   | mocap+imu+emg       |
+| 06  | AFFT              | multimodal Trf  | imu+emg+eye+mocap   |
+| 07  | HandFormer        | hand-pose Trf   | mocap (fingers)     |
+| 08  | ActionLLM (LoRA)  | LLM-based       | imu+emg+eye         |
+EOF
+row "${T1}" row01_ours_dailyactformer_all5 \
+    "Our model, all 5 modalities (headline row)" \
+    "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2"
+row "${T1}" row02_deepconvlstm_imu \
+    "DeepConvLSTM on IMU only (classic HAR baseline)" \
+    "--model deepconvlstm --modalities imu --t_obs 8 --t_fut 2"
+row "${T1}" row03_deepconvlstm_3mod \
+    "DeepConvLSTM on IMU+MoCap+EMG (best 3-modality concat)" \
+    "--model deepconvlstm --modalities imu,mocap,emg --t_obs 8 --t_fut 2"
+row "${T1}" row04_rulstm_imu_mocap \
+    "RULSTM, rolling-unrolling LSTM (IMU + MoCap late fusion)" \
+    "--model rulstm --modalities imu,mocap --t_obs 8 --t_fut 2"
+row "${T1}" row05_futr_3mod \
+    "FUTR (causal transformer) on MoCap+IMU+EMG" \
+    "--model futr --modalities mocap,imu,emg --t_obs 8 --t_fut 2"
+row "${T1}" row06_afft_4mod \
+    "AFFT (anticipative feature fusion transformer) on 4 modalities" \
+    "--model afft --modalities imu,emg,eyetrack,mocap --t_obs 8 --t_fut 2"
+row "${T1}" row07_handformer_mocap \
+    "HandFormer (skeleton-only ECCV'24) on MoCap finger joints" \
+    "--model handformer --modalities mocap --t_obs 8 --t_fut 2"
+row "${T1}" row08_actionllm_3mod \
+    "ActionLLM (Qwen2.5-0.5B + LoRA) on IMU+EMG+EyeTrack" \
+    "--model actionllm --modalities imu,emg,eyetrack --t_obs 8 --t_fut 2"
+# ============================================================
+# Table 3: Horizon curve (DailyActFormer)
+# ============================================================
+T3=table3_horizon_curve
+mkdir -p "${BASEDIR}/${T3}"
+cat > "${BASEDIR}/${T3}/README.md" <<'EOF'
+# Table 3: Prediction Horizon Curve (DailyActFormer, all 5 modalities)
+Same model, varying T_fut. Expect monotonic drop in Action Top-1 as
+horizon grows; plot line graph in the paper alongside this table.
+EOF
+# One row per prediction horizon (seconds); rows are numbered 01, 02, ...
+# in the order the horizons are listed.
+HORIZONS=(1 2 5 10 15)
+row_no=0
+for horizon in "${HORIZONS[@]}"; do
+    row_no=$((row_no + 1))
+    printf -v row_tag '%02d' "${row_no}"
+    row "${T3}" "row${row_tag}_ours_tfut${horizon}s" \
+        "Our model at T_fut=${horizon}s" \
+        "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut ${horizon}"
+done
+# ============================================================
+# Table 4: Modality ablation on DailyActFormer (T_fut=2s)
+# ============================================================
+# Eight rows: the full 5-modality reference, five leave-one-out variants, and
+# two small subsets (IMU+EMG, MoCap only). Every row uses the same model and
+# the same observation/prediction windows; only --modalities changes.
+T4=table4_modality_ablation
+mkdir -p "${BASEDIR}/${T4}"
+cat > "${BASEDIR}/${T4}/README.md" <<'EOF'
+# Table 4: Modality Ablation (DailyActFormer, T_fut = 2 s)
+Same model, progressively remove modalities. Each row trained from scratch.
+EOF
+# NOTE: the column-alignment spaces sit INSIDE the quoted CLI strings, so they
+# are passed to row() verbatim as part of the --cli value.
+row ${T4} row01_full_5mod    "Full 5-modality (reference)"         "--model dailyactformer --modalities imu,emg,eyetrack,mocap,pressure --t_obs 8 --t_fut 2"
+row ${T4} row02_no_pressure  "Drop pressure"                        "--model dailyactformer --modalities imu,emg,eyetrack,mocap          --t_obs 8 --t_fut 2"
+row ${T4} row03_no_eyetrack  "Drop eye-tracking"                    "--model dailyactformer --modalities imu,emg,mocap,pressure          --t_obs 8 --t_fut 2"
+row ${T4} row04_no_emg       "Drop EMG"                             "--model dailyactformer --modalities imu,eyetrack,mocap,pressure     --t_obs 8 --t_fut 2"
+row ${T4} row05_no_imu       "Drop IMU"                             "--model dailyactformer --modalities emg,eyetrack,mocap,pressure     --t_obs 8 --t_fut 2"
+row ${T4} row06_no_mocap     "Drop MoCap"                           "--model dailyactformer --modalities imu,emg,eyetrack,pressure       --t_obs 8 --t_fut 2"
+row ${T4} row07_imu_emg_only "Only IMU + EMG (physiology-light)"    "--model dailyactformer --modalities imu,emg                         --t_obs 8 --t_fut 2"
+row ${T4} row08_mocap_only   "Only MoCap (skeleton-only)"           "--model dailyactformer --modalities mocap                           --t_obs 8 --t_fut 2"
+# ============================================================
+# Table 5: Component ablation (DailyActFormer switches)
+# ============================================================
+T5=table5_component_ablation
+mkdir -p "${BASEDIR}/${T5}"
+cat > "${BASEDIR}/${T5}/README.md" <<'EOF'
+# Table 5: Component Ablation (DailyActFormer, T_fut = 2 s)
+Each row toggles one architectural/training component of our model.
+Component flags are implemented as CLI switches on train_seqpred.py;
+see models_seqpred.py for the corresponding model options.
+EOF
+row "${T5}" row01_full \
+    "Full model (reference)" \
+    "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2"
+row "${T5}" row02_no_composite_head \
+    "Drop the auxiliary verb-composite head (lambda=0)" \
+    "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --lambda_verb_composite 0.0"
+row "${T5}" row03_equal_lambda \
+    "Equal-weight all 4 heads (no prior on verb>hand)" \
+    "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --lambda_verb_composite 1.0 --lambda_hand 1.0"
+row "${T5}" row04_no_class_weight \
+    "No inverse-frequency class weighting" \
+    "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --lambda_verb_composite 0.5"
+# row04 re-exposes the default; the variable-off is the absence of --use_class_weights
+# COMMON always carries that flag, so it is stripped from the generated
+# run.sh after the fact.
+ROW04_RUN_SH="${BASEDIR}/${T5}/row04_no_class_weight/run.sh"
+if [[ -e "${ROW04_RUN_SH}" ]]; then
+    sed -i 's/--use_class_weights //g' "${ROW04_RUN_SH}"
+else
+    # Without the patch this row would train WITH class weights and be
+    # indistinguishable from row01, so make the miss visible.
+    echo "[warn] ${ROW04_RUN_SH} not found; --use_class_weights was NOT removed from row04" >&2
+fi
+# NOTE(review): COMMON also carries --label_smoothing 0.05. Whether the 0.0
+# below takes effect depends on the order in which row() joins the two flag
+# strings — verify in the generated run.sh.
+row "${T5}" row05_no_label_smoothing \
+    "Label smoothing off" \
+    "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --label_smoothing 0.0"
+# ============================================================
+# Table 7: Missing-modality robustness (train once, eval 6 ways)
+# ============================================================
+T7=table7_missing_modality
+mkdir -p "${BASEDIR}/${T7}"
+cat > "${BASEDIR}/${T7}/README.md" <<'EOF'
+# Table 7: Missing-Modality Robustness (T_fut = 2 s)
+Train DailyActFormer with random per-modality dropout (p=0.3). At test time,
+evaluate under 6 configurations: full / drop one modality each. Only the
+training job has its own folder; eval uses the trained checkpoint to fill
+multiple rows of the final table.
+EOF
+# Only the single training job is frozen here.
+row "${T7}" row01_train_with_modality_dropout \
+    "DailyActFormer trained with --modality_dropout 0.3" \
+    "--model dailyactformer --modalities ${ALL5} --t_obs 8 --t_fut 2 --modality_dropout 0.3"
+# The six test-time configurations (full / no_P / no_E / no_emg / no_imu /
+# no_mocap) are left to a separate evaluation script that loads the row01
+# checkpoint and runs evaluate() on modality subsets. See
+# experiments/tasks/eval_missing_modality.py (TBD).
+# Final summary; the brace list is printed literally, not expanded.
+printf '\n'
+printf '%s\n' "[ok] Froze rows under:"
+printf '%s\n' "     ${BASEDIR}/{${T1},${T3},${T4},${T5},${T7}}/"

experiments/slurm/run_ablation_fix.sh ADDED Viewed

	@@ -0,0 +1,33 @@

+#!/bin/bash
+#SBATCH --job-name=ablation_fix
+#SBATCH --partition=gpuA800
+#SBATCH --gres=gpu:1
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=32G
+#SBATCH --time=1:00:00
+#SBATCH --output=${PULSE_ROOT}/results/ablation_fix_%j.log
+# Fix: mocap+emg late+pretrained — pretrain MOCAP branch (idx=0) instead of emg
+#
+# Re-runs the mocap+emg late-fusion ablation for five seeds, loading the
+# pretrained MoCap checkpoint and freezing backbone index 0.
+#
+# NOTE(review): sbatch does not expand shell variables inside #SBATCH lines,
+# so the --output path above is taken literally. Pass the log path on the
+# command line (sbatch --output=...) or hard-code it — confirm for this cluster.
+#
+# pipefail is required: every training command is piped through `tail`, and
+# without it the pipeline reports tail's status, so `set -e` never sees a
+# crashed run.
+set -eo pipefail
+export PYTHONUNBUFFERED=1
+PYTHON=python
+# Fail early with a clear message instead of building root-relative paths.
+BASEDIR=${PULSE_ROOT:?PULSE_ROOT must point at the project root}
+SCRIPT="${BASEDIR}/experiments/train_exp1.py"
+OUTDIR="${BASEDIR}/results/modality_ablation"
+# Shared flags as an array so each word stays a separate argument even if a
+# path contains spaces.
+COMMON=(--model transformer --epochs 100 --batch_size 16 --lr 1e-3
+        --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15
+        --proj_dim 0 --output_dir "${OUTDIR}")
+SEEDS=(42 123 456 789 2024)
+PT_MOCAP="${BASEDIR}/results/exp1_v8/transformer_mocap_early/model_best.pt"
+echo "=== Fix: mocap+emg / late+pretrained(mocap, idx=0) ==="
+for seed in "${SEEDS[@]}"; do
+    echo "  mocap+emg seed=$seed"
+    # Only the last 5 lines of each run are kept in the job log.
+    "${PYTHON}" "${SCRIPT}" --modalities mocap,emg --fusion late --seed "${seed}" \
+        --pretrained_backbone "${PT_MOCAP}" --freeze_backbone_idx 0 \
+        --tag "ablation_pt_s${seed}" "${COMMON[@]}" 2>&1 | tail -5
+done
+echo "=== Done ==="

experiments/slurm/run_ablation_fusion.sh ADDED Viewed

	@@ -0,0 +1,174 @@

+#!/bin/bash
+#SBATCH --job-name=ablation_fuse
+#SBATCH --partition=gpuA800
+#SBATCH --gres=gpu:2
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --time=4:00:00
+#SBATCH --output=${PULSE_ROOT}/results/ablation_fusion_%j.log
+# Test confidence-weighted and learned-weight fusion on all multi-modal combos
+# Compare against existing mean fusion results
+set -e
+export PYTHONUNBUFFERED=1
+PYTHON=python
+BASEDIR=${PULSE_ROOT}
+SCRIPT=${BASEDIR}/experiments/train_exp1.py
+OUTDIR=${BASEDIR}/results/modality_ablation
+COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --proj_dim 0 --output_dir $OUTDIR"
+SEEDS=(42 123 456 789 2024)
+PT_IMU=${BASEDIR}/results/exp1_v7/transformer_imu_early/model_best.pt
+PT_MOCAP=${BASEDIR}/results/exp1_v8/transformer_mocap_early/model_best.pt
+echo "=== Ablation: Confidence & Learned Fusion ==="
+# ============================================================
+# GPU 0: confidence-weighted fusion
+# ============================================================
+(
+export CUDA_VISIBLE_DEVICES=0
+# conf_sweep LABEL MODALITIES CHECKPOINT FREEZE_IDX
+# One late-fusion run with --late_agg confidence per entry of SEEDS; only the
+# last 3 lines of each run's combined stdout/stderr are kept.
+conf_sweep() {
+    local label=$1 mods=$2 ckpt=$3 idx=$4 s
+    echo "--- GPU0: ${label} / confidence ---"
+    for s in "${SEEDS[@]}"; do
+        echo "  ${label} confidence seed=$s"
+        $PYTHON $SCRIPT --modalities $mods --fusion late --late_agg confidence \
+            --seed $s --pretrained_backbone $ckpt --freeze_backbone_idx $idx \
+            --tag ablation_conf_s${s} $COMMON 2>&1 | tail -3
+    done
+}
+# In each call below FREEZE_IDX is the position, within MODALITIES, of the
+# modality whose checkpoint is loaded.
+# mocap+imu / pretrained imu (idx=1)
+conf_sweep "mocap+imu" mocap,imu "$PT_IMU" 1
+# emg+imu / pretrained imu (idx=1)
+conf_sweep "emg+imu" emg,imu "$PT_IMU" 1
+# mocap+emg / pretrained mocap (idx=0)
+conf_sweep "mocap+emg" mocap,emg "$PT_MOCAP" 0
+# mocap+emg+imu / pretrained imu (idx=0, modalities=imu,mocap,emg)
+conf_sweep "mocap+emg+imu" imu,mocap,emg "$PT_IMU" 0
+echo "--- GPU0 Done ---"
+) &
+PID0=$!
+# ============================================================
+# GPU 1: learned-weight fusion
+# ============================================================
+(
+export CUDA_VISIBLE_DEVICES=1
+# learned_sweep LABEL MODALITIES CHECKPOINT FREEZE_IDX
+# One late-fusion run with --late_agg learned per entry of SEEDS; only the
+# last 3 lines of each run's combined stdout/stderr are kept.
+learned_sweep() {
+    local label=$1 mods=$2 ckpt=$3 idx=$4 s
+    echo "--- GPU1: ${label} / learned ---"
+    for s in "${SEEDS[@]}"; do
+        echo "  ${label} learned seed=$s"
+        $PYTHON $SCRIPT --modalities $mods --fusion late --late_agg learned \
+            --seed $s --pretrained_backbone $ckpt --freeze_backbone_idx $idx \
+            --tag ablation_lrn_s${s} $COMMON 2>&1 | tail -3
+    done
+}
+# In each call below FREEZE_IDX is the position, within MODALITIES, of the
+# modality whose checkpoint is loaded.
+# mocap+imu / pretrained imu (idx=1)
+learned_sweep "mocap+imu" mocap,imu "$PT_IMU" 1
+# emg+imu / pretrained imu (idx=1)
+learned_sweep "emg+imu" emg,imu "$PT_IMU" 1
+# mocap+emg / pretrained mocap (idx=0)
+learned_sweep "mocap+emg" mocap,emg "$PT_MOCAP" 0
+# mocap+emg+imu / pretrained imu (idx=0, modalities=imu,mocap,emg)
+learned_sweep "mocap+emg+imu" imu,mocap,emg "$PT_IMU" 0
+echo "--- GPU1 Done ---"
+) &
+PID1=$!
+wait $PID0 $PID1
+# ============================================================
+# Collect results
+# ============================================================
+echo ""
+echo "=== Fusion Comparison ==="
+$PYTHON -c "
+import json, os, numpy as np
+base = '$OUTDIR'
+v8_base = '${BASEDIR}/results/exp1_v8_multiseed'
+v9_base = '${BASEDIR}/results/exp1_v9'
+seeds = [42, 123, 456, 789, 2024]
+configs = [
+    # (label, pattern_template)
+    # mean (from previous ablation run)
+    ('mocap+imu / mean',      base + '/transformer_mocap-imu_late_ablation_pt_s{}/results.json'),
+    ('mocap+imu / confidence', base + '/transformer_mocap-imu_late_ablation_conf_s{}/results.json'),
+    ('mocap+imu / learned',   base + '/transformer_mocap-imu_late_ablation_lrn_s{}/results.json'),
+    ('emg+imu / mean',        base + '/transformer_emg-imu_late_ablation_pt_s{}/results.json'),
+    ('emg+imu / confidence',  base + '/transformer_emg-imu_late_ablation_conf_s{}/results.json'),
+    ('emg+imu / learned',     base + '/transformer_emg-imu_late_ablation_lrn_s{}/results.json'),
+    ('mocap+emg / mean',      base + '/transformer_mocap-emg_late_ablation_pt_s{}/results.json'),
+    ('mocap+emg / confidence', base + '/transformer_mocap-emg_late_ablation_conf_s{}/results.json'),
+    ('mocap+emg / learned',   base + '/transformer_mocap-emg_late_ablation_lrn_s{}/results.json'),
+    ('3mod / mean',           v9_base + '/transformer_imu-mocap-emg_late_pt_s{}/results.json'),
+    ('3mod / confidence',     base + '/transformer_imu-mocap-emg_late_ablation_conf_s{}/results.json'),
+    ('3mod / learned',        base + '/transformer_imu-mocap-emg_late_ablation_lrn_s{}/results.json'),
+]
+print(f'{\"Config\":<30} {\"F1 (mean±std)\":<20} {\"Acc (mean±std)\":<20} N')
+print('-' * 75)
+for label, pat in configs:
+    f1s, accs = [], []
+    for s in seeds:
+        path = pat.format(s)
+        if os.path.exists(path):
+            with open(path) as f:
+                d = json.load(f)
+            f1s.append(d['test_macro_f1'])
+            accs.append(d['test_accuracy'])
+    if f1s:
+        f1 = np.array(f1s)
+        acc = np.array(accs)
+        print(f'{label:<30} {f1.mean():.3f}±{f1.std():.3f}           {acc.mean():.3f}±{acc.std():.3f}           {len(f1s)}')
+    else:
+        print(f'{label:<30} (no results)')
+"
+echo ""
+echo "=== All done ==="

experiments/slurm/run_asformer_exp3.sh ADDED Viewed

	@@ -0,0 +1,44 @@

+#!/bin/bash
+#SBATCH --partition=gpuA800
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=4
+#SBATCH --gres=gpu:1
+#SBATCH --mem=32G
+#SBATCH --time=4:00:00
+#SBATCH --job-name=ASF_exp3
+# sbatch does not expand shell variables inside #SBATCH lines, so the log path
+# is given relative to the job's working directory: submit from $PULSE_ROOT
+# (or override with `sbatch --chdir=...` / `--output=...` on the command line).
+#SBATCH --output=results/asformer_exp3_%j.log
+#
+# Trains train_exp3.py with --model asformer on six modality sets (seed 42),
+# then prints one summary line per results.json found under the output dir.
+# Requires: PULSE_ROOT (repository root) in the job environment.
+#
+# pipefail: without it `python ... | tail -8` reports tail's exit status and
+# `set -e` never notices a failed training run.
+set -euo pipefail
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the repository root}"
+PYTHON=python
+PROJECT=${PULSE_ROOT}
+cd "$PROJECT"
+EXP3_OUT=$PROJECT/results/published_baselines/exp3_asformer
+mkdir -p "$EXP3_OUT"
+echo "=== ASFormer Contact Detection ==="
+for MOD in mocap emg imu "mocap,emg" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu"; do
+    echo "--- ASFormer / ${MOD} ---"
+    "$PYTHON" experiments/train_exp3.py \
+        --model asformer --modalities "$MOD" \
+        --hidden_dim 64 --epochs 50 --batch_size 32 \
+        --lr 1e-3 --weight_decay 1e-4 --downsample 2 \
+        --seed 42 --output_dir "$EXP3_OUT" 2>&1 | tail -8
+done
+echo ""
+echo "=== Results ==="
+for f in "$EXP3_OUT"/*/results.json; do
+    # An unmatched glob stays literal; skip it instead of failing.
+    [ -f "$f" ] || continue
+    # The path is passed as argv rather than spliced into the Python source,
+    # so quotes or backslashes in it cannot break the snippet.
+    "$PYTHON" - "$f" <<'PY'
+import json, sys
+with open(sys.argv[1]) as fp:
+    r = json.load(fp)
+mods = ','.join(r.get('input_modalities', []))
+m = r.get('test_metrics', {})
+print(f'  ASFormer | {mods:<30} | R_F1={m.get("right_f1",0):.4f} L_F1={m.get("left_f1",0):.4f} Avg_F1={m.get("avg_f1",0):.4f}')
+PY
+done

experiments/slurm/run_exp1.sh ADDED Viewed

	@@ -0,0 +1,40 @@

+#!/bin/bash
+#SBATCH -J exp1_scene
+#SBATCH -p gpuA800
+#SBATCH --gres=gpu:1
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH -t 12:00:00
+# sbatch does not expand shell variables inside #SBATCH lines, so the log paths
+# are given relative to the job's working directory: submit from $PULSE_ROOT
+# (or override with `sbatch --chdir=...` / `-o` / `-e` on the command line).
+# Create results/exp1 before submitting so the log files can be opened.
+#SBATCH -o results/exp1/slurm_%j.out
+#SBATCH -e results/exp1/slurm_%j.err
+#
+# Runs train_exp1.py --run_all with the hyper-parameters below and writes to
+# $PULSE_ROOT/results/exp1. The job's exit status is the trainer's.
+# Requires: PULSE_ROOT (repository root) in the job environment.
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the repository root}"
+export PYTHONUNBUFFERED=1
+echo "=== Job Info ==="
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $SLURM_NODELIST"
+echo "Start time: $(date)"
+nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
+echo "================"
+PYTHON=python
+SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py
+OUTDIR=${PULSE_ROOT}/results/exp1
+cd "${PULSE_ROOT}" || exit 1
+# Remember the trainer's status so the footer is still printed and Slurm
+# still records a failed run as failed.
+rc=0
+"$PYTHON" "$SCRIPT" --run_all \
+    --epochs 100 \
+    --batch_size 16 \
+    --lr 1e-3 \
+    --weight_decay 1e-4 \
+    --hidden_dim 128 \
+    --downsample 5 \
+    --patience 15 \
+    --seed 42 \
+    --output_dir "$OUTDIR" || rc=$?
+echo "=== Done ==="
+echo "End time: $(date)"
+exit "$rc"

experiments/slurm/run_exp1_fusion.sh ADDED Viewed

	@@ -0,0 +1,36 @@

+#!/bin/bash
+# Submit all fusion experiments as individual 1-GPU SLURM jobs
+# SLURM scheduler will automatically place them on any available GPU
+PYTHON=python
+SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py
+OUTDIR=${PULSE_ROOT}/results/exp1
+LOGDIR=${OUTDIR}/slurm_logs
+mkdir -p $LOGDIR
+COMMON_ARGS="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR"
+FUSIONS=(weighted_late gated_late stacking product moe late attention)
+MODALITIES=("mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure")
+for fusion in "${FUSIONS[@]}"; do
+    for mods in "${MODALITIES[@]}"; do
+        mod_tag=$(echo $mods | tr ',' '-')
+        job_name="f_${fusion}_${mod_tag}"
+        sbatch \
+            -J "$job_name" \
+            -p gpuA800 \
+            --gres=gpu:1 \
+            -N 1 -n 1 \
+            --cpus-per-task=8 \
+            --mem=32G \
+            -t 3:00:00 \
+            -o "${LOGDIR}/${job_name}_%j.out" \
+            -e "${LOGDIR}/${job_name}_%j.err" \
+            --export=ALL \
+            --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT --fusion $fusion --modalities $mods $COMMON_ARGS"
+        echo "Submitted: $job_name"
+    done
+done
+echo "All 14 fusion experiments submitted!"

experiments/slurm/run_exp1_parallel.sh ADDED Viewed

	@@ -0,0 +1,67 @@

+#!/bin/bash
+# Scene Recognition (Exp1) - Parallelized version
+# Part 1: 9 modality combos × 3 backbones = 27 jobs (early fusion)
+# Part 2: 7 fusion methods × transformer × (3-core + all-5) = 14 jobs
+# Total: 41 jobs
+# Requires: PULSE_ROOT (repository root). Without the guard below an unset
+# PULSE_ROOT would make every path start at the filesystem root.
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the repository root}"
+PYTHON=python
+BASEDIR=${PULSE_ROOT}
+SCRIPT=${BASEDIR}/experiments/train_exp1.py
+OUTDIR=${BASEDIR}/results/exp1_v2
+LOGDIR=${OUTDIR}/slurm_logs
+mkdir -p "$LOGDIR" || exit 1
+COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR"
+MODS=("mocap" "emg" "eyetrack" "imu" "pressure" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,pressure" "mocap,emg,eyetrack,imu,pressure")
+MODELS=("cnn" "lstm" "transformer")
+failed=0
+# submit JOB_NAME LOG_STEM TRAIN_ARGS MESSAGE
+# Queue one 1-GPU job that runs train_exp1.py with TRAIN_ARGS followed by
+# COMMON. Prints MESSAGE on success; on failure reports on stderr and sets
+# `failed` so the script can exit non-zero.
+submit() {
+    if sbatch \
+        -J "$1" -p gpuA800 --gres=gpu:1 -N 1 -n 1 \
+        --cpus-per-task=4 --mem=32G -t 2:00:00 \
+        -o "${LOGDIR}/$2_%j.out" -e "${LOGDIR}/$2_%j.err" \
+        --export=ALL \
+        --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT $3 $COMMON"
+    then
+        echo "$4"
+    else
+        echo "sbatch failed for job $1" >&2
+        failed=1
+    fi
+}
+# Part 1: Modality ablation × 3 backbones
+echo "=== Part 1: Modality Ablation (27 jobs) ==="
+for mods in "${MODS[@]}"; do
+    mod_tag=${mods//,/-}
+    for model in "${MODELS[@]}"; do
+        submit "exp1_${model}_${mod_tag}" "${model}_${mod_tag}_early" \
+            "--model $model --modalities $mods --fusion early" \
+            "  Submitted: $model / $mods / early"
+    done
+done
+# Part 2: Fusion methods × transformer
+FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe")
+FUSION_MODS=("mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure")
+echo ""
+echo "=== Part 2: Fusion Ablation (14 jobs) ==="
+for fmods in "${FUSION_MODS[@]}"; do
+    fmod_tag=${fmods//,/-}
+    for fusion in "${FUSIONS[@]}"; do
+        submit "exp1_tf_${fusion}_${fmod_tag}" "transformer_${fmod_tag}_${fusion}" \
+            "--model transformer --modalities $fmods --fusion $fusion" \
+            "  Submitted: transformer / $fmods / $fusion"
+    done
+done
+echo ""
+echo "Total: 41 jobs | Scene Recognition | Updated IMU data"
+echo "Results: $OUTDIR"
+if [ "$failed" -ne 0 ]; then
+    echo "WARNING: at least one sbatch submission failed (see messages above)" >&2
+    exit 1
+fi

experiments/slurm/run_exp1_small.sh ADDED Viewed

	@@ -0,0 +1,84 @@

+#!/bin/bash
+# Exp1 small model: hidden_dim=32, dropout=0.5, weight_decay=1e-3
+# 3 modalities: mocap, emg, imu (exclude pressure & eyetrack)
+# Output: results/exp1_small
+# NOTE(review): no dropout flag is passed below; 0.5 is presumably the
+# default inside train_exp1.py — confirm.
+# Requires: PULSE_ROOT (repository root). Without the guard below an unset
+# PULSE_ROOT would make every path start at the filesystem root.
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the repository root}"
+PYTHON=python
+SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py
+OUTDIR=${PULSE_ROOT}/results/exp1_small
+LOGDIR=${OUTDIR}/slurm_logs
+mkdir -p "$LOGDIR" || exit 1
+COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-3 --hidden_dim 32 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR"
+failed=0
+# submit JOB_NAME TRAIN_ARGS
+# Queue one 1-GPU, 1-hour job running train_exp1.py with TRAIN_ARGS followed
+# by COMMON; logs go to LOGDIR/JOB_NAME_<jobid>.{out,err}. A failed sbatch is
+# reported on stderr and recorded in `failed` instead of being announced as
+# submitted.
+submit() {
+    if sbatch \
+        -J "$1" -p gpuA800 --gres=gpu:1 -N 1 -n 1 \
+        --cpus-per-task=8 --mem=32G -t 1:00:00 \
+        -o "${LOGDIR}/$1_%j.out" -e "${LOGDIR}/$1_%j.err" \
+        --export=ALL \
+        --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT $2 $COMMON"
+    then
+        echo "Submitted: $1"
+    else
+        echo "sbatch failed for job $1" >&2
+        failed=1
+    fi
+}
+# ============================================================
+# Part 1: Single modality (early fusion = single backbone)
+# ============================================================
+for mod in mocap emg imu; do
+    submit "s_${mod}" "--fusion early --modalities $mod"
+done
+# ============================================================
+# Part 2: Multi-modality early fusion (4 combos)
+# ============================================================
+EARLY_COMBOS=("mocap,emg" "mocap,imu" "emg,imu" "mocap,emg,imu")
+for mods in "${EARLY_COMBOS[@]}"; do
+    submit "e_${mods//,/-}" "--fusion early --modalities $mods"
+done
+# ============================================================
+# Part 3: Fusion methods x modality sets
+# ============================================================
+FUSIONS=(late attention weighted_late gated_late stacking product moe)
+FUSION_MODS=("mocap,emg,imu" "mocap,imu")
+for fusion in "${FUSIONS[@]}"; do
+    for mods in "${FUSION_MODS[@]}"; do
+        submit "f_${fusion}_${mods//,/-}" "--fusion $fusion --modalities $mods"
+    done
+done
+echo ""
+echo "Total: 3 single + 4 early + 14 fusion = 21 jobs submitted!"
+echo "Results will be saved to: $OUTDIR"
+if [ "$failed" -ne 0 ]; then
+    echo "WARNING: at least one sbatch submission failed (see messages above)" >&2
+    exit 1
+fi

experiments/slurm/run_exp1_small2.sh ADDED Viewed

	@@ -0,0 +1,85 @@

+#!/bin/bash
+# Exp1 small2: per-modality hidden_dim + missing emg+imu fusion experiments
+# hidden_dim=32 base, scaled per modality: mocap(211)->48, imu(161)->48, emg(9)->16
+# Output: results/exp1_small2
+# NOTE(review): only --hidden_dim 32 is passed below; the per-modality
+# scaling presumably happens inside train_exp1.py — confirm.
+# Requires: PULSE_ROOT (repository root). Without the guard below an unset
+# PULSE_ROOT would make every path start at the filesystem root.
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the repository root}"
+PYTHON=python
+SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py
+OUTDIR=${PULSE_ROOT}/results/exp1_small2
+LOGDIR=${OUTDIR}/slurm_logs
+mkdir -p "$LOGDIR" || exit 1
+COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-3 --hidden_dim 32 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR"
+failed=0
+# submit JOB_NAME TRAIN_ARGS
+# Queue one 1-GPU, 1-hour job running train_exp1.py with TRAIN_ARGS followed
+# by COMMON; logs go to LOGDIR/JOB_NAME_<jobid>.{out,err}. A failed sbatch is
+# reported on stderr and recorded in `failed` instead of being announced as
+# submitted.
+submit() {
+    if sbatch \
+        -J "$1" -p gpuA800 --gres=gpu:1 -N 1 -n 1 \
+        --cpus-per-task=8 --mem=32G -t 1:00:00 \
+        -o "${LOGDIR}/$1_%j.out" -e "${LOGDIR}/$1_%j.err" \
+        --export=ALL \
+        --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT $2 $COMMON"
+    then
+        echo "Submitted: $1"
+    else
+        echo "sbatch failed for job $1" >&2
+        failed=1
+    fi
+}
+# ============================================================
+# Part 1: Single modality baselines (3 jobs)
+# ============================================================
+for mod in mocap emg imu; do
+    submit "s2_${mod}" "--fusion early --modalities $mod"
+done
+# ============================================================
+# Part 2: Early fusion baselines (3 combos)
+# ============================================================
+EARLY_COMBOS=("emg,imu" "mocap,imu" "mocap,emg,imu")
+for mods in "${EARLY_COMBOS[@]}"; do
+    submit "s2_e_${mods//,/-}" "--fusion early --modalities $mods"
+done
+# ============================================================
+# Part 3: Fusion methods x modality combos (7 methods x 3 combos = 21 jobs)
+# Key addition: emg,imu fusion (was missing in round 1)
+# ============================================================
+FUSIONS=(late attention weighted_late gated_late stacking product moe)
+FUSION_MODS=("emg,imu" "mocap,imu" "mocap,emg,imu")
+for fusion in "${FUSIONS[@]}"; do
+    for mods in "${FUSION_MODS[@]}"; do
+        submit "s2_${fusion}_${mods//,/-}" "--fusion $fusion --modalities $mods"
+    done
+done
+echo ""
+echo "Total: 3 single + 3 early + 21 fusion = 27 jobs submitted!"
+echo "Results will be saved to: $OUTDIR"
+if [ "$failed" -ne 0 ]; then
+    echo "WARNING: at least one sbatch submission failed (see messages above)" >&2
+    exit 1
+fi

experiments/slurm/run_exp1_small3.sh ADDED Viewed

	@@ -0,0 +1,137 @@

+#!/bin/bash
+# Exp1 small3: Data augmentation + Frozen pretrained IMU + Label smoothing
+# Goal: Break the IMU-alone F1=0.771 ceiling with emg+imu fusion
+# Phase 0: pretrain IMU with hidden_dim=48 (matches fusion branch)
+# Baselines: IMU+aug+ls, emg+imu early+aug+ls
+# Group A: 7 fusion + aug + ls (no freeze)
+# Group B: 7 fusion + frozen IMU + ls (no aug)  [dep: phase0]
+# Group C: 7 fusion + frozen IMU + aug + ls      [dep: phase0]
+# Total: 1 + 2 + 7 + 7 + 7 = 24 jobs
+# NOTE(review): the fusion jobs pass --hidden_dim 32 via COMMON while Phase 0
+# uses 48; the IMU branch is presumably widened to 48 inside train_exp1.py —
+# confirm that the checkpoint shapes really match.
+# Requires: PULSE_ROOT (repository root). Without the guard below an unset
+# PULSE_ROOT would make every path start at the filesystem root.
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the repository root}"
+PYTHON=python
+SCRIPT=${PULSE_ROOT}/experiments/train_exp1.py
+OUTDIR=${PULSE_ROOT}/results/exp1_small3
+LOGDIR=${OUTDIR}/slurm_logs
+mkdir -p "$LOGDIR" || exit 1
+COMMON="--model transformer --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-3 --hidden_dim 32 --downsample 5 --patience 15 --seed 42"
+FUSIONS=(late attention weighted_late gated_late stacking product moe)
+failed=0
+# queue JOB_NAME LOG_STEM TRAIN_ARGS [SBATCH_OPTS...]
+# Run sbatch for one 1-GPU, 1-hour job that executes train_exp1.py with
+# TRAIN_ARGS. Extra SBATCH_OPTS (e.g. --parsable, --dependency=...) are placed
+# first on the sbatch command line. Returns sbatch's exit status.
+queue() {
+    local name=$1 stem=$2 train_args=$3
+    shift 3
+    sbatch "$@" \
+        -J "$name" -p gpuA800 --gres=gpu:1 -N 1 -n 1 \
+        --cpus-per-task=8 --mem=32G -t 1:00:00 \
+        -o "${LOGDIR}/${stem}_%j.out" -e "${LOGDIR}/${stem}_%j.err" \
+        --export=ALL \
+        --wrap="export PYTHONUNBUFFERED=1; cd ${PULSE_ROOT}; $PYTHON $SCRIPT $train_args"
+}
+# submit MESSAGE JOB_NAME LOG_STEM TRAIN_ARGS [SBATCH_OPTS...]
+# queue + reporting: prints MESSAGE on success; on failure reports on stderr
+# and sets `failed` so the script can exit non-zero.
+submit() {
+    local msg=$1
+    shift
+    if queue "$@"; then
+        echo "$msg"
+    else
+        echo "sbatch failed for job $1" >&2
+        failed=1
+    fi
+}
+# ============================================================
+# Phase 0: Pretrain IMU with hidden_dim=48 (matches fusion branch)
+# ============================================================
+PHASE0_JOB=$(queue s3_phase0_imu48 phase0_imu48 \
+    "--model transformer --fusion early --modalities imu --hidden_dim 48 --epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-3 --downsample 5 --patience 15 --seed 42 --output_dir ${OUTDIR}/phase0" \
+    --parsable) || PHASE0_JOB=""
+# --parsable prints "jobid;cluster" on multi-cluster setups; keep the id only.
+PHASE0_JOB=${PHASE0_JOB%%;*}
+# Groups B and C hang off this id; an empty or non-numeric value would give
+# a malformed --dependency option, so stop here instead.
+case "$PHASE0_JOB" in
+    ''|*[!0-9]*)
+        echo "Phase 0 submission failed (got '${PHASE0_JOB}'); nothing else submitted" >&2
+        exit 1
+        ;;
+esac
+echo "Phase 0 (IMU h48): job $PHASE0_JOB"
+# Checkpoint path Phase 0 is expected to produce; used by Groups B and C.
+PRETRAINED="${OUTDIR}/phase0/transformer_imu_early/model_best.pt"
+# ============================================================
+# Baselines (no dependency)
+# ============================================================
+# Baseline 1: IMU alone + augment + label_smoothing
+submit "Submitted: baseline IMU+aug+ls" s3_bl_imu_aug bl_imu_aug \
+    "--fusion early --modalities imu $COMMON --augment --label_smoothing 0.1 --tag bl_aug --output_dir $OUTDIR"
+# Baseline 2: emg,imu early + augment + label_smoothing
+submit "Submitted: baseline emg+imu early+aug+ls" s3_bl_ei_aug bl_ei_aug \
+    "--fusion early --modalities emg,imu $COMMON --augment --label_smoothing 0.1 --tag bl_aug --output_dir $OUTDIR"
+# ============================================================
+# Group A: emg+imu x 7 fusion + augment + label_smoothing (no freeze)
+# ============================================================
+for fusion in "${FUSIONS[@]}"; do
+    submit "Submitted: Group A $fusion" "s3_A_${fusion}" "grpA_${fusion}" \
+        "--fusion $fusion --modalities emg,imu $COMMON --augment --label_smoothing 0.1 --tag grpA --output_dir $OUTDIR"
+done
+# ============================================================
+# Group B: emg+imu x 7 fusion + frozen IMU + label_smoothing (no augment)
+# Depends on Phase 0
+# ============================================================
+for fusion in "${FUSIONS[@]}"; do
+    submit "Submitted: Group B $fusion (dep: $PHASE0_JOB)" "s3_B_${fusion}" "grpB_${fusion}" \
+        "--fusion $fusion --modalities emg,imu $COMMON --label_smoothing 0.1 --pretrained_backbone $PRETRAINED --freeze_backbone_idx 1 --tag grpB --output_dir $OUTDIR" \
+        --dependency=afterok:"${PHASE0_JOB}"
+done
+# ============================================================
+# Group C: emg+imu x 7 fusion + frozen IMU + augment + label_smoothing
+# Depends on Phase 0
+# ============================================================
+for fusion in "${FUSIONS[@]}"; do
+    submit "Submitted: Group C $fusion (dep: $PHASE0_JOB)" "s3_C_${fusion}" "grpC_${fusion}" \
+        "--fusion $fusion --modalities emg,imu $COMMON --augment --label_smoothing 0.1 --pretrained_backbone $PRETRAINED --freeze_backbone_idx 1 --tag grpC --output_dir $OUTDIR" \
+        --dependency=afterok:"${PHASE0_JOB}"
+done
+echo ""
+echo "Total: 1 phase0 + 2 baselines + 7 grpA + 7 grpB + 7 grpC = 24 jobs"
+echo "Results: $OUTDIR"
+echo "Phase 0 job ID: $PHASE0_JOB (Groups B & C depend on it)"
+if [ "$failed" -ne 0 ]; then
+    echo "WARNING: at least one sbatch submission failed (see messages above)" >&2
+    exit 1
+fi

experiments/slurm/run_exp1_v3.sh ADDED Viewed

	@@ -0,0 +1,68 @@

+#!/bin/bash
+# Scene Recognition (Exp1 v3) - Train 14 vols / Test 4 vols (no val)
+# v23,v24 moved from val to train; v3 stays in test
+# Part 1: 9 modality combos × 3 backbones = 27 jobs (early fusion)
+# Part 2: 7 fusion methods × transformer × (3-core + all-5) = 14 jobs
+# Total: 41 jobs
+# Requires: PULSE_ROOT (repository root). Without the guard below an unset
+# PULSE_ROOT would make every path start at the filesystem root.
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the repository root}"
+PYTHON=python
+BASEDIR=${PULSE_ROOT}
+SCRIPT=${BASEDIR}/experiments/train_exp1.py
+OUTDIR=${BASEDIR}/results/exp1_v3
+LOGDIR=${OUTDIR}/slurm_logs
+mkdir -p "$LOGDIR" || exit 1
+COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR"
+MODS=("mocap" "emg" "eyetrack" "imu" "pressure" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,pressure" "mocap,emg,eyetrack,imu,pressure")
+MODELS=("cnn" "lstm" "transformer")
+failed=0
+# submit JOB_NAME LOG_STEM TRAIN_ARGS MESSAGE
+# Queue one 1-GPU job that runs train_exp1.py with TRAIN_ARGS followed by
+# COMMON. Prints MESSAGE on success; on failure reports on stderr and sets
+# `failed` so the script can exit non-zero.
+submit() {
+    if sbatch \
+        -J "$1" -p gpuA800 --gres=gpu:1 -N 1 -n 1 \
+        --cpus-per-task=4 --mem=32G -t 2:00:00 \
+        -o "${LOGDIR}/$2_%j.out" -e "${LOGDIR}/$2_%j.err" \
+        --export=ALL \
+        --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT $3 $COMMON"
+    then
+        echo "$4"
+    else
+        echo "sbatch failed for job $1" >&2
+        failed=1
+    fi
+}
+# Part 1: Modality ablation × 3 backbones
+echo "=== Part 1: Modality Ablation (27 jobs) ==="
+for mods in "${MODS[@]}"; do
+    mod_tag=${mods//,/-}
+    for model in "${MODELS[@]}"; do
+        submit "e1v3_${model}_${mod_tag}" "${model}_${mod_tag}_early" \
+            "--model $model --modalities $mods --fusion early" \
+            "  $model / $mods / early"
+    done
+done
+# Part 2: Fusion methods × transformer
+FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe")
+FUSION_MODS=("mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure")
+echo ""
+echo "=== Part 2: Fusion Ablation (14 jobs) ==="
+for fmods in "${FUSION_MODS[@]}"; do
+    fmod_tag=${fmods//,/-}
+    for fusion in "${FUSIONS[@]}"; do
+        # The job name carries the modality tag so the two modality sets do
+        # not share one name in the queue.
+        submit "e1v3_tf_${fusion}_${fmod_tag}" "transformer_${fmod_tag}_${fusion}" \
+            "--model transformer --modalities $fmods --fusion $fusion" \
+            "  transformer / $fmods / $fusion"
+    done
+done
+echo ""
+echo "Total: 41 jobs | Scene Recognition v3 | Train=14vols, Test=4vols"
+echo "Results: $OUTDIR"
+if [ "$failed" -ne 0 ]; then
+    echo "WARNING: at least one sbatch submission failed (see messages above)" >&2
+    exit 1
+fi

experiments/slurm/run_exp1_v4.sh ADDED Viewed

	@@ -0,0 +1,69 @@

+#!/bin/bash
+# Scene Recognition (Exp1 v4) - Per-modality projection to 50 dims
+# All modalities projected to 50d via FC before backbone processing
+# Train 14 vols / Test 4 vols (no val)
+# Part 1: 9 modality combos × 3 backbones = 27 jobs (early fusion)
+# Part 2: 7 fusion methods × transformer × (3-core + all-5) = 14 jobs
+# Total: 41 jobs
+# NOTE(review): no projection flag is passed below; the 50d projection is
+# presumably the default inside train_exp1.py — confirm.
+# Requires: PULSE_ROOT (repository root). Without the guard below an unset
+# PULSE_ROOT would make every path start at the filesystem root.
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the repository root}"
+PYTHON=python
+BASEDIR=${PULSE_ROOT}
+SCRIPT=${BASEDIR}/experiments/train_exp1.py
+OUTDIR=${BASEDIR}/results/exp1_v4
+LOGDIR=${OUTDIR}/slurm_logs
+mkdir -p "$LOGDIR" || exit 1
+COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR"
+MODS=("mocap" "emg" "eyetrack" "imu" "pressure" "mocap,emg,eyetrack" "mocap,emg,eyetrack,imu" "mocap,emg,eyetrack,pressure" "mocap,emg,eyetrack,imu,pressure")
+MODELS=("cnn" "lstm" "transformer")
+failed=0
+# submit JOB_NAME LOG_STEM TRAIN_ARGS MESSAGE
+# Queue one 1-GPU job that runs train_exp1.py with TRAIN_ARGS followed by
+# COMMON. Prints MESSAGE on success; on failure reports on stderr and sets
+# `failed` so the script can exit non-zero.
+submit() {
+    if sbatch \
+        -J "$1" -p gpuA800 --gres=gpu:1 -N 1 -n 1 \
+        --cpus-per-task=4 --mem=32G -t 2:00:00 \
+        -o "${LOGDIR}/$2_%j.out" -e "${LOGDIR}/$2_%j.err" \
+        --export=ALL \
+        --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT $3 $COMMON"
+    then
+        echo "$4"
+    else
+        echo "sbatch failed for job $1" >&2
+        failed=1
+    fi
+}
+# Part 1: Modality ablation × 3 backbones
+echo "=== Part 1: Modality Ablation (27 jobs) ==="
+for mods in "${MODS[@]}"; do
+    mod_tag=${mods//,/-}
+    for model in "${MODELS[@]}"; do
+        submit "e1v4_${model}_${mod_tag}" "${model}_${mod_tag}_early" \
+            "--model $model --modalities $mods --fusion early" \
+            "  $model / $mods / early"
+    done
+done
+# Part 2: Fusion methods × transformer
+FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe")
+FUSION_MODS=("mocap,emg,eyetrack" "mocap,emg,eyetrack,imu,pressure")
+echo ""
+echo "=== Part 2: Fusion Ablation (14 jobs) ==="
+for fmods in "${FUSION_MODS[@]}"; do
+    fmod_tag=${fmods//,/-}
+    for fusion in "${FUSIONS[@]}"; do
+        # The job name carries the modality tag so the two modality sets do
+        # not share one name in the queue.
+        submit "e1v4_tf_${fusion}_${fmod_tag}" "transformer_${fmod_tag}_${fusion}" \
+            "--model transformer --modalities $fmods --fusion $fusion" \
+            "  transformer / $fmods / $fusion"
+    done
+done
+echo ""
+echo "Total: 41 jobs | Scene Recognition v4 | Proj50d | Train=14vols, Test=4vols"
+echo "Results: $OUTDIR"
+if [ "$failed" -ne 0 ]; then
+    echo "WARNING: at least one sbatch submission failed (see messages above)" >&2
+    exit 1
+fi

experiments/slurm/run_exp1_v5.sh ADDED Viewed

	@@ -0,0 +1,62 @@

+#!/bin/bash
+# Scene Recognition (Exp1 v5) - Only imu, mocap, emg
+# Per-modality projection to 50d
+# Train 14 vols / Test 4 vols
+#
+# Submits 28 Slurm jobs (9 single-modality + 12 multi-modality early fusion
+# + 7 fusion-ablation) that each run experiments/train_exp1.py.
+#
+# Required environment:
+#   PULSE_ROOT  project root; experiments/train_exp1.py is read from it and
+#               results/exp1_v5 is created beneath it.
+#
+# Abort early when PULSE_ROOT is unset or empty: otherwise every path below
+# would resolve against "/" (e.g. /results/exp1_v5) and 28 broken jobs would
+# be queued.
+: "${PULSE_ROOT:?PULSE_ROOT must be set to the project root}"
+PYTHON=python
+BASEDIR=${PULSE_ROOT}
+SCRIPT=${BASEDIR}/experiments/train_exp1.py
+OUTDIR=${BASEDIR}/results/exp1_v5
+LOGDIR=${OUTDIR}/slurm_logs
+mkdir -p "$LOGDIR"
+# Training flags shared by every job. Kept as a single string because it is
+# spliced verbatim into the --wrap command line.
+COMMON="--epochs 100 --batch_size 16 --lr 1e-3 --weight_decay 1e-4 --hidden_dim 128 --downsample 5 --patience 15 --seed 42 --output_dir $OUTDIR"
+MODELS=("cnn" "lstm" "transformer")
+FUSIONS=("late" "attention" "weighted_late" "gated_late" "stacking" "product" "moe")
+#######################################
+# Submit one training job to Slurm and print a one-line summary.
+# Globals:   LOGDIR, BASEDIR, PYTHON, SCRIPT, COMMON (read)
+# Arguments: $1 - model backbone passed to --model
+#            $2 - comma-separated modality list passed to --modalities
+#            $3 - fusion method passed to --fusion
+#            $4 - job-name suffix (the job is named "e1v5_<suffix>")
+# Outputs:   sbatch output, then "  <model> / <mods> / <fusion>" on stdout
+#######################################
+submit_job() {
+    local model=$1 mods=$2 fusion=$3 job_suffix=$4
+    # Commas -> dashes so the modality list is safe inside a file name.
+    local mod_tag=${mods//,/-}
+    local log_prefix="${LOGDIR}/${model}_${mod_tag}_${fusion}"
+    sbatch -J "e1v5_${job_suffix}" -p gpuA800 --gres=gpu:1 -N1 -n1 \
+        --cpus-per-task=4 --mem=32G -t 2:00:00 \
+        -o "${log_prefix}_%j.out" \
+        -e "${log_prefix}_%j.err" \
+        --export=ALL \
+        --wrap="export PYTHONUNBUFFERED=1; cd ${BASEDIR}; $PYTHON $SCRIPT --model $model --modalities $mods --fusion $fusion $COMMON"
+    echo "  $model / $mods / $fusion"
+}
+# Part 1: Single modality (3 mods × 3 backbones = 9 jobs)
+echo "=== Part 1: Single Modality (9 jobs) ==="
+for mods in "imu" "mocap" "emg"; do
+    for model in "${MODELS[@]}"; do
+        submit_job "$model" "$mods" early "${model}_${mods}"
+    done
+done
+# Part 2: Multi-modality early fusion (4 combos × 3 backbones = 12 jobs)
+echo ""
+echo "=== Part 2: Multi-Modality Early Fusion (12 jobs) ==="
+for mods in "imu,mocap" "imu,emg" "mocap,emg" "imu,mocap,emg"; do
+    for model in "${MODELS[@]}"; do
+        submit_job "$model" "$mods" early "${model}_${mods//,/-}"
+    done
+done
+# Part 3: Fusion ablation with imu+mocap+emg × transformer (7 jobs)
+echo ""
+echo "=== Part 3: Fusion Ablation - transformer × imu+mocap+emg (7 jobs) ==="
+for fusion in "${FUSIONS[@]}"; do
+    submit_job transformer "imu,mocap,emg" "$fusion" "tf_${fusion}"
+done
+echo ""
+echo "Total: 28 jobs | 3 modalities: imu(160d→50d), mocap(156d→50d), emg(8d→50d)"
+echo "Results: $OUTDIR"