# flow-matching / src / DataLoader.py
# Uploaded by sabertoaster via huggingface_hub (commit 4edc9aa, verified).
import sys
import h5py
import numpy as np
from pathlib import Path
# =============================================================================
# Paths — edit these to match your setup
# =============================================================================
ALGONAUTS_REPO = Path("/raid/lttung05/fmri_encoder/code/algonauts2025")  # checkout providing the `src.data` loaders
FMRI_PATH = Path("/raid/lttung05/fmri_encoder/data/fmri/algonauts_2025.competitors")  # competition fMRI release
FEAT_PATH = Path("/raid/lttung05/fmri_encoder/data/features")  # precomputed stimulus features (HDF5 shards)
# OUTPUT_DIR = Path(__file__).parent / "outputs" / "ridgecv"
# =============================================================================
# Config
# =============================================================================
SUBJECTS = [1, 2, 3, 5]  # subject ids loaded below
MODEL_NAME = "Llama-3.2-3B"  # feature-model directory name under FEAT_PATH
LAYER = "model.layers.11"  # HDF5 dataset key read from each shard
ALPHAS = np.logspace(-2, 10, 20)  # ridge penalty grid, log-spaced 1e-2..1e10
TRAIN_SEASONS = list(range(1, 5))  # Friends S1-S4 -> fit
ALPHA_SEASONS = [5]  # Friends S5 -> alpha selection
VAL_SEASONS = [6]  # Friends S6 -> final eval
TRAIN_MOVIES = ["bourne", "wolf"]  # Movie10 -> fit
# =============================================================================
# Setup
# =============================================================================
sys.path.insert(0, str(ALGONAUTS_REPO))  # make the algonauts2025 repo importable as `src`
from src.data import (
load_algonauts2025_friends_fmri,
load_algonauts2025_movie10_fmri,
episode_filter,
)
# OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# =============================================================================
# FeatureStore: lazy per-episode loading from disk
# =============================================================================
class FeatureStore:
    """Reads one episode from its HDF5 shard on demand; holds no arrays.

    Parameters
    ----------
    root : base features directory (e.g. .../features)
    model : model name (e.g. "Llama-3.2-3B")
    layer : HDF5 key (e.g. "model.layers.11")
    series : "friends" | "movie10" | "ood"
    """

    def __init__(self, root: Path, model: str, layer: str, series: str):
        self._layer = layer
        # Episode id -> shard path; ids are parsed from the file name only,
        # so the actual HDF5 files are never opened here (lazy by design).
        self._paths: dict[str, Path] = {}
        for path in sorted((root / model / series).rglob("*.h5")):
            if path.stem.endswith("_video"):
                # e.g. "sub-01_task-s01e02a_video" -> "s01e02a"
                ep = path.stem.split("-")[-1].split("_")[0]
            else:
                # e.g. "friends_s01e02a" -> "s01e02a"
                ep = path.stem.split("_")[-1]
            self._paths[ep] = path

    def __getitem__(self, ep: "str | tuple") -> np.ndarray:
        """Load one episode's features as float32 (squeezed from the shard).

        Accepts either an episode id or a tuple whose first element is the id.
        Raises KeyError if the episode has no shard in this store.
        """
        key = ep[0] if isinstance(ep, tuple) else ep
        try:
            path = self._paths[key]
        except KeyError:
            # Same exception type callers already catch, but with context.
            raise KeyError(f"episode {key!r} not found in feature store") from None
        # Open read-only explicitly: older h5py defaulted to append mode,
        # which could silently create an empty file for a mistyped path.
        with h5py.File(path, "r") as f:
            return f[self._layer][:].squeeze().astype(np.float32)

    def __contains__(self, ep: "str | tuple") -> bool:
        key = ep[0] if isinstance(ep, tuple) else ep
        return key in self._paths

    def keys(self):
        """Episode ids available in this store."""
        return self._paths.keys()
# =============================================================================
# Load fMRI (compact; keep in memory)
# =============================================================================
print("Loading fMRI data...")
# All Friends seasons needed downstream: fit + alpha-selection + final eval.
friends_fmri = load_algonauts2025_friends_fmri(
    root=FMRI_PATH,
    subjects=SUBJECTS,
    seasons=[*TRAIN_SEASONS, *ALPHA_SEASONS, *VAL_SEASONS],
)
movie10_fmri = load_algonauts2025_movie10_fmri(
    root=FMRI_PATH,
    subjects=SUBJECTS,
    movies=TRAIN_MOVIES,
    runs=[1],
)
# Merge both series into one mapping (same semantics as {**a, **b}:
# on a duplicate key, the Movie10 entry would win).
all_fmri = dict(friends_fmri)
all_fmri.update(movie10_fmri)
print(f" {len(friends_fmri)} Friends + {len(movie10_fmri)} Movie10 episodes")
# Infer dimensions from an arbitrary episode; arrays are 3-D with subjects
# first and parcels last (middle axis is presumably time — confirm in loader).
_sample = next(iter(all_fmri.values()))
n_subs, _, n_parcels = _sample.shape
print(f" n_subs={n_subs}, n_parcels={n_parcels}")
# =============================================================================
# Feature stores (one per series; lazy)
# =============================================================================
print(f"\nBuilding FeatureStore: {MODEL_NAME} / {LAYER}")
# One lazy store per stimulus series; both read from the same features root.
_friends_store, _movie10_store = (
    FeatureStore(FEAT_PATH, MODEL_NAME, LAYER, series)
    for series in ("friends", "movie10")
)
def get_features(ep) -> np.ndarray:
    """Return one episode's feature array (float32, time x feat_dim).

    The Friends store is consulted first; any other episode falls through
    to the Movie10 store (which raises KeyError if it is absent there too).
    """
    store = _friends_store if ep in _friends_store else _movie10_store
    return store[ep]
# Peek at a single episode to learn the feature dimensionality.
_first_ep = next(iter(all_fmri))
feat_dim = get_features(_first_ep).shape[-1]
print(f" feat_dim={feat_dim}")
# =============================================================================
# Episode splits
# =============================================================================
# key=str so mixed id types (plain strings vs tuples) sort without error.
all_episodes = sorted(all_fmri.keys(), key=str)
fit_filter = episode_filter(seasons=TRAIN_SEASONS, movies=TRAIN_MOVIES, runs=[1])
alpha_filter = episode_filter(seasons=ALPHA_SEASONS, movies=[], runs=[1])
val_filter = episode_filter(seasons=VAL_SEASONS, movies=[], runs=[1])
# Each filter is a predicate over episode ids; apply to the shared ordering.
fit_episodes = list(filter(fit_filter, all_episodes))
alpha_episodes = list(filter(alpha_filter, all_episodes))
val_episodes = list(filter(val_filter, all_episodes))
print(f"\nFit episodes: {len(fit_episodes)}")
print(f"Alpha episodes: {len(alpha_episodes)}")
print(f"Val episodes: {len(val_episodes)}")