File size: 5,223 Bytes
4edc9aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import sys
import h5py
import numpy as np
from pathlib import Path

# =============================================================================
# Paths — edit these to match your setup
# =============================================================================
ALGONAUTS_REPO = Path("/raid/lttung05/fmri_encoder/code/algonauts2025")   # repo providing src.data loaders
FMRI_PATH      = Path("/raid/lttung05/fmri_encoder/data/fmri/algonauts_2025.competitors")  # fMRI release root
FEAT_PATH      = Path("/raid/lttung05/fmri_encoder/data/features")        # precomputed feature shards (HDF5)
# OUTPUT_DIR     = Path(__file__).parent / "outputs" / "ridgecv"

# =============================================================================
# Config
# =============================================================================
SUBJECTS      = [1, 2, 3, 5]              # Algonauts subject ids to fit
MODEL_NAME    = "Llama-3.2-3B"            # feature-extraction model directory name
LAYER         = "model.layers.11"         # HDF5 dataset key inside each shard
ALPHAS        = np.logspace(-2, 10, 20)   # ridge regularization grid (log-spaced)

TRAIN_SEASONS = list(range(1, 5))   # Friends S1-S4 → fit
ALPHA_SEASONS = [5]                  # Friends S5    → alpha selection
VAL_SEASONS   = [6]                  # Friends S6    → final eval
TRAIN_MOVIES  = ["bourne", "wolf"]  # Movie10       → fit

# =============================================================================
# Setup
# =============================================================================
# Make the algonauts2025 repo importable without installing it as a package.
sys.path.insert(0, str(ALGONAUTS_REPO))

from src.data import (
    load_algonauts2025_friends_fmri,
    load_algonauts2025_movie10_fmri,
    episode_filter,
)

# OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# =============================================================================
# FeatureStore: lazy per-episode loading from disk
# =============================================================================

class FeatureStore:
    """Reads one episode from its HDF5 shard on demand; holds no arrays.

    Parameters
    ----------
    root   : base features directory (e.g. .../features)
    model  : model name (e.g. "Llama-3.2-3B")
    layer  : HDF5 key (e.g. "model.layers.11")
    series : "friends" | "movie10" | "ood"
    """

    def __init__(self, root: Path, model: str, layer: str, series: str):
        self._layer = layer
        # episode id -> shard path, built once so every lookup is O(1)
        self._paths: dict[str, Path] = {}

        for path in sorted((root / model / series).rglob("*.h5")):
            if path.stem.endswith("_video"):
                # e.g. "task-s01e02a_video" -> "s01e02a"
                ep = path.stem.split("-")[-1].split("_")[0]
            else:
                # e.g. "friends_s01e02a" -> "s01e02a"
                ep = path.stem.split("_")[-1]
            self._paths[ep] = path

    @staticmethod
    def _key(ep: "str | tuple") -> str:
        """Normalize an episode spec (plain id or (id, ...) tuple) to its id."""
        return ep[0] if isinstance(ep, tuple) else ep

    def __getitem__(self, ep: "str | tuple") -> np.ndarray:
        key = self._key(ep)
        if key not in self._paths:
            # Same exception type as a bare dict lookup, but with context.
            raise KeyError(f"no feature shard found for episode {key!r}")
        # Explicit read-only mode: the h5py default mode has changed across
        # versions, and these shards must never be opened writable here.
        with h5py.File(self._paths[key], "r") as f:
            return f[self._layer][:].squeeze().astype(np.float32)

    def __contains__(self, ep: "str | tuple") -> bool:
        return self._key(ep) in self._paths

    def __len__(self) -> int:
        """Number of episodes indexed by this store."""
        return len(self._paths)

    def keys(self):
        return self._paths.keys()


# =============================================================================
# Load fMRI (compact; keep in memory)
# =============================================================================
print("Loading fMRI data...")

# Friends covers fit + alpha-selection + validation seasons in one load.
_all_seasons = TRAIN_SEASONS + ALPHA_SEASONS + VAL_SEASONS
friends_fmri = load_algonauts2025_friends_fmri(
    root=FMRI_PATH, subjects=SUBJECTS, seasons=_all_seasons,
)
movie10_fmri = load_algonauts2025_movie10_fmri(
    root=FMRI_PATH, subjects=SUBJECTS, movies=TRAIN_MOVIES, runs=[1],
)
# Single episode -> array mapping; on duplicate keys Movie10 would win,
# matching the original {**friends, **movie10} merge.
all_fmri = friends_fmri | movie10_fmri
print(f"  {len(friends_fmri)} Friends + {len(movie10_fmri)} Movie10 episodes")

# Infer dimensions from one episode (arrays are subjects × time × parcels)
_sample = next(iter(all_fmri.values()))
n_subs, _, n_parcels = _sample.shape
print(f"  n_subs={n_subs}, n_parcels={n_parcels}")

# =============================================================================
# Feature stores (one per series; lazy)
# =============================================================================
print(f"\nBuilding FeatureStore: {MODEL_NAME} / {LAYER}")
_friends_store, _movie10_store = (
    FeatureStore(FEAT_PATH, MODEL_NAME, LAYER, series)
    for series in ("friends", "movie10")
)


def get_features(ep) -> np.ndarray:
    """Load features for one episode (float32, shape: time × feat_dim)."""
    # Friends takes priority; anything not found there falls back to Movie10.
    store = _friends_store if ep in _friends_store else _movie10_store
    return store[ep]


# Peek at one episode's features to learn the embedding width.
_first_episode = next(iter(all_fmri))
feat_dim = get_features(_first_episode).shape[-1]
print(f"  feat_dim={feat_dim}")

# =============================================================================
# Episode splits
# =============================================================================
# Stable ordering; key=str handles tuple-vs-str episode keys uniformly.
all_episodes = sorted(all_fmri.keys(), key=str)

# Predicates built once, then applied over the full episode list.
fit_filter   = episode_filter(seasons=TRAIN_SEASONS, movies=TRAIN_MOVIES, runs=[1])
alpha_filter = episode_filter(seasons=ALPHA_SEASONS, movies=[],           runs=[1])
val_filter   = episode_filter(seasons=VAL_SEASONS,   movies=[],           runs=[1])

fit_episodes   = list(filter(fit_filter, all_episodes))
alpha_episodes = list(filter(alpha_filter, all_episodes))
val_episodes   = list(filter(val_filter, all_episodes))

print(f"\nFit episodes:   {len(fit_episodes)}")
print(f"Alpha episodes: {len(alpha_episodes)}")
print(f"Val episodes:   {len(val_episodes)}")