"""Setup script: load Algonauts-2025 fMRI data and lazy LLM feature stores,
then build the fit / alpha-selection / validation episode splits for RidgeCV."""

import sys
from pathlib import Path

import h5py
import numpy as np

# =============================================================================
# Paths — edit these to match your setup
# =============================================================================
ALGONAUTS_REPO = Path("/raid/lttung05/fmri_encoder/code/algonauts2025")
FMRI_PATH = Path("/raid/lttung05/fmri_encoder/data/fmri/algonauts_2025.competitors")
FEAT_PATH = Path("/raid/lttung05/fmri_encoder/data/features")
# OUTPUT_DIR = Path(__file__).parent / "outputs" / "ridgecv"

# =============================================================================
# Config
# =============================================================================
SUBJECTS = [1, 2, 3, 5]
MODEL_NAME = "Llama-3.2-3B"
LAYER = "model.layers.11"
ALPHAS = np.logspace(-2, 10, 20)   # ridge penalty grid (used downstream)
TRAIN_SEASONS = list(range(1, 5))  # Friends S1-S4 → fit
ALPHA_SEASONS = [5]                # Friends S5 → alpha selection
VAL_SEASONS = [6]                  # Friends S6 → final eval
TRAIN_MOVIES = ["bourne", "wolf"]  # Movie10 → fit

# =============================================================================
# Setup
# =============================================================================
# Make the algonauts2025 repo importable as `src.*` (project-local package).
sys.path.insert(0, str(ALGONAUTS_REPO))
from src.data import (
    load_algonauts2025_friends_fmri,
    load_algonauts2025_movie10_fmri,
    episode_filter,
)

# OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


# =============================================================================
# FeatureStore: lazy per-episode loading from disk
# =============================================================================
class FeatureStore:
    """Reads one episode from its HDF5 shard on demand; holds no arrays.

    Parameters
    ----------
    root : base features directory (e.g. .../features)
    model : model name (e.g. "Llama-3.2-3B")
    layer : HDF5 key (e.g. "model.layers.11")
    series : "friends" | "movie10" | "ood"
    """

    def __init__(self, root: Path, model: str, layer: str, series: str):
        self._layer = layer
        # Episode id → shard path. NOTE(review): if two shards map to the
        # same episode id, the later (sorted) one silently wins — assumed
        # not to occur in this dataset; confirm if shards are regenerated.
        self._paths: dict[str, Path] = {}
        for path in sorted((root / model / series).rglob("*.h5")):
            if path.stem.endswith("_video"):
                # e.g. "...-s01e02a_video" → episode id "s01e02a"
                ep = path.stem.split("-")[-1].split("_")[0]
            else:
                # e.g. "..._bourne01" → episode id "bourne01"
                ep = path.stem.split("_")[-1]
            self._paths[ep] = path

    def __getitem__(self, ep: "str | tuple") -> np.ndarray:
        """Load one episode's features as float32, singleton axes squeezed.

        Accepts a bare episode id or a tuple whose first element is the id.
        Raises KeyError if the episode has no shard.
        """
        key = ep[0] if isinstance(ep, tuple) else ep
        # BUGFIX: open read-only explicitly. h5py's historical default mode
        # was "a" (append), which takes a write lock and can modify/create
        # the file — wrong for a read-only feature store.
        with h5py.File(self._paths[key], "r") as f:
            return f[self._layer][:].squeeze().astype(np.float32)

    def __contains__(self, ep: "str | tuple") -> bool:
        """True if the episode (id or tuple keyed by id) has a shard on disk."""
        key = ep[0] if isinstance(ep, tuple) else ep
        return key in self._paths

    def keys(self):
        """All episode ids discovered under the series directory."""
        return self._paths.keys()


# =============================================================================
# Load fMRI (compact; keep in memory)
# =============================================================================
print("Loading fMRI data...")
friends_fmri = load_algonauts2025_friends_fmri(
    root=FMRI_PATH,
    subjects=SUBJECTS,
    seasons=TRAIN_SEASONS + ALPHA_SEASONS + VAL_SEASONS,
)
movie10_fmri = load_algonauts2025_movie10_fmri(
    root=FMRI_PATH,
    subjects=SUBJECTS,
    movies=TRAIN_MOVIES,
    runs=[1],
)
all_fmri = {**friends_fmri, **movie10_fmri}
print(f" {len(friends_fmri)} Friends + {len(movie10_fmri)} Movie10 episodes")

# Infer dimensions from one episode.
# NOTE(review): assumes every episode array is (n_subs, time, n_parcels);
# only the 3-way unpack below is guaranteed by this code — verify in src.data.
_sample = next(iter(all_fmri.values()))
n_subs, _, n_parcels = _sample.shape
print(f" n_subs={n_subs}, n_parcels={n_parcels}")

# =============================================================================
# Feature stores (one per series; lazy)
# =============================================================================
print(f"\nBuilding FeatureStore: {MODEL_NAME} / {LAYER}")
_friends_store = FeatureStore(FEAT_PATH, MODEL_NAME, LAYER, "friends")
_movie10_store = FeatureStore(FEAT_PATH, MODEL_NAME, LAYER, "movie10")


def get_features(ep) -> np.ndarray:
    """Load features for one episode (float32, shape: time × feat_dim).

    Tries the Friends store first, then falls back to Movie10; a KeyError
    propagates from the Movie10 store if the episode is in neither.
    """
    if ep in _friends_store:
        return _friends_store[ep]
    return _movie10_store[ep]


# Peek to get feat_dim
feat_dim = get_features(next(iter(all_fmri))).shape[-1]
print(f" feat_dim={feat_dim}")

# =============================================================================
# Episode splits
# =============================================================================
# key=str makes the sort total even if keys are mixed str/tuple.
all_episodes = sorted(all_fmri.keys(), key=str)

fit_filter = episode_filter(seasons=TRAIN_SEASONS, movies=TRAIN_MOVIES, runs=[1])
alpha_filter = episode_filter(seasons=ALPHA_SEASONS, movies=[], runs=[1])
val_filter = episode_filter(seasons=VAL_SEASONS, movies=[], runs=[1])

fit_episodes = [ep for ep in all_episodes if fit_filter(ep)]
alpha_episodes = [ep for ep in all_episodes if alpha_filter(ep)]
val_episodes = [ep for ep in all_episodes if val_filter(ep)]

print(f"\nFit episodes: {len(fit_episodes)}")
print(f"Alpha episodes: {len(alpha_episodes)}")
print(f"Val episodes: {len(val_episodes)}")