File size: 5,223 Bytes
4edc9aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import sys
import h5py
import numpy as np
from pathlib import Path

# =============================================================================
# Paths — edit these to match your setup
# =============================================================================
ALGONAUTS_REPO = Path("/raid/lttung05/fmri_encoder/code/algonauts2025")   # repo providing src.data loaders
FMRI_PATH      = Path("/raid/lttung05/fmri_encoder/data/fmri/algonauts_2025.competitors")  # fMRI release root
FEAT_PATH      = Path("/raid/lttung05/fmri_encoder/data/features")        # precomputed feature shards (HDF5)
# OUTPUT_DIR     = Path(__file__).parent / "outputs" / "ridgecv"

# =============================================================================
# Config
# =============================================================================
SUBJECTS      = [1, 2, 3, 5]              # Algonauts subject ids to fit
MODEL_NAME    = "Llama-3.2-3B"            # feature-extraction model directory name
LAYER         = "model.layers.11"         # HDF5 dataset key inside each shard
ALPHAS        = np.logspace(-2, 10, 20)   # ridge regularization grid (log-spaced)

TRAIN_SEASONS = list(range(1, 5))   # Friends S1-S4 → fit
ALPHA_SEASONS = [5]                  # Friends S5    → alpha selection
VAL_SEASONS   = [6]                  # Friends S6    → final eval
TRAIN_MOVIES  = ["bourne", "wolf"]  # Movie10       → fit

# =============================================================================
# Setup
# =============================================================================
# Make the algonauts2025 repo importable without installing it as a package.
sys.path.insert(0, str(ALGONAUTS_REPO))

from src.data import (
    load_algonauts2025_friends_fmri,
    load_algonauts2025_movie10_fmri,
    episode_filter,
)

# OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# =============================================================================
# FeatureStore: lazy per-episode loading from disk
# =============================================================================

class FeatureStore:
    """Reads one episode from its HDF5 shard on demand; holds no arrays.

    Parameters
    ----------
    root   : base features directory (e.g. .../features)
    model  : model name (e.g. "Llama-3.2-3B")
    layer  : HDF5 key (e.g. "model.layers.11")
    series : "friends" | "movie10" | "ood"
    """

    def __init__(self, root: Path, model: str, layer: str, series: str):
        self._layer = layer
        # episode id -> shard path, built once so every lookup is O(1)
        self._paths: dict[str, Path] = {}

        for path in sorted((root / model / series).rglob("*.h5")):
            if path.stem.endswith("_video"):
                # e.g. "task-s01e02a_video" -> "s01e02a"
                ep = path.stem.split("-")[-1].split("_")[0]
            else:
                # e.g. "friends_s01e02a" -> "s01e02a"
                ep = path.stem.split("_")[-1]
            self._paths[ep] = path

    @staticmethod
    def _key(ep: "str | tuple") -> str:
        """Normalize an episode spec (plain id or (id, ...) tuple) to its id."""
        return ep[0] if isinstance(ep, tuple) else ep

    def __getitem__(self, ep: "str | tuple") -> np.ndarray:
        key = self._key(ep)
        if key not in self._paths:
            # Same exception type as a bare dict lookup, but with context.
            raise KeyError(f"no feature shard found for episode {key!r}")
        # Explicit read-only mode: the h5py default mode has changed across
        # versions, and these shards must never be opened writable here.
        with h5py.File(self._paths[key], "r") as f:
            return f[self._layer][:].squeeze().astype(np.float32)

    def __contains__(self, ep: "str | tuple") -> bool:
        return self._key(ep) in self._paths

    def __len__(self) -> int:
        """Number of episodes indexed by this store."""
        return len(self._paths)

    def keys(self):
        return self._paths.keys()


# =============================================================================
# Load fMRI (compact; keep in memory)
# =============================================================================
print("Loading fMRI data...")

# Friends covers fit + alpha-selection + validation seasons in one load.
_all_seasons = TRAIN_SEASONS + ALPHA_SEASONS + VAL_SEASONS
friends_fmri = load_algonauts2025_friends_fmri(
    root=FMRI_PATH, subjects=SUBJECTS, seasons=_all_seasons,
)
movie10_fmri = load_algonauts2025_movie10_fmri(
    root=FMRI_PATH, subjects=SUBJECTS, movies=TRAIN_MOVIES, runs=[1],
)
# Single episode -> array mapping; on duplicate keys Movie10 would win,
# matching the original {**friends, **movie10} merge.
all_fmri = friends_fmri | movie10_fmri
print(f"  {len(friends_fmri)} Friends + {len(movie10_fmri)} Movie10 episodes")

# Infer dimensions from one episode (arrays are subjects × time × parcels)
_sample = next(iter(all_fmri.values()))
n_subs, _, n_parcels = _sample.shape
print(f"  n_subs={n_subs}, n_parcels={n_parcels}")

# =============================================================================
# Feature stores (one per series; lazy)
# =============================================================================
print(f"\nBuilding FeatureStore: {MODEL_NAME} / {LAYER}")
_friends_store, _movie10_store = (
    FeatureStore(FEAT_PATH, MODEL_NAME, LAYER, series)
    for series in ("friends", "movie10")
)


def get_features(ep) -> np.ndarray:
    """Load features for one episode (float32, shape: time × feat_dim)."""
    # Friends takes priority; anything not found there falls back to Movie10.
    store = _friends_store if ep in _friends_store else _movie10_store
    return store[ep]


# Peek at one episode's features to learn the embedding width.
_first_episode = next(iter(all_fmri))
feat_dim = get_features(_first_episode).shape[-1]
print(f"  feat_dim={feat_dim}")

# =============================================================================
# Episode splits
# =============================================================================
# Stable ordering; key=str handles tuple-vs-str episode keys uniformly.
all_episodes = sorted(all_fmri.keys(), key=str)

# Predicates built once, then applied over the full episode list.
fit_filter   = episode_filter(seasons=TRAIN_SEASONS, movies=TRAIN_MOVIES, runs=[1])
alpha_filter = episode_filter(seasons=ALPHA_SEASONS, movies=[],           runs=[1])
val_filter   = episode_filter(seasons=VAL_SEASONS,   movies=[],           runs=[1])

fit_episodes   = list(filter(fit_filter, all_episodes))
alpha_episodes = list(filter(alpha_filter, all_episodes))
val_episodes   = list(filter(val_filter, all_episodes))

print(f"\nFit episodes:   {len(fit_episodes)}")
print(f"Alpha episodes: {len(alpha_episodes)}")
print(f"Val episodes:   {len(val_episodes)}")