# flow-matching / src / DataLoader.py
# Uploaded by sabertoaster via huggingface_hub (commit 4edc9aa, verified).
import sys
import h5py
import numpy as np
from pathlib import Path
# =============================================================================
# Paths — edit these to match your setup
# =============================================================================
ALGONAUTS_REPO = Path("/raid/lttung05/fmri_encoder/code/algonauts2025")  # checkout providing the `src.data` loaders
FMRI_PATH = Path("/raid/lttung05/fmri_encoder/data/fmri/algonauts_2025.competitors")  # competition fMRI release
FEAT_PATH = Path("/raid/lttung05/fmri_encoder/data/features")  # precomputed stimulus features (HDF5 shards)
# OUTPUT_DIR = Path(__file__).parent / "outputs" / "ridgecv"
# =============================================================================
# Config
# =============================================================================
SUBJECTS = [1, 2, 3, 5]  # subject ids loaded below
MODEL_NAME = "Llama-3.2-3B"  # feature-model directory name under FEAT_PATH
LAYER = "model.layers.11"  # HDF5 dataset key read from each shard
ALPHAS = np.logspace(-2, 10, 20)  # ridge penalty grid, log-spaced 1e-2..1e10
TRAIN_SEASONS = list(range(1, 5))  # Friends S1-S4 -> fit
ALPHA_SEASONS = [5]  # Friends S5 -> alpha selection
VAL_SEASONS = [6]  # Friends S6 -> final eval
TRAIN_MOVIES = ["bourne", "wolf"]  # Movie10 -> fit
# =============================================================================
# Setup
# =============================================================================
sys.path.insert(0, str(ALGONAUTS_REPO))  # make the algonauts2025 repo importable as `src`
from src.data import (
load_algonauts2025_friends_fmri,
load_algonauts2025_movie10_fmri,
episode_filter,
)
# OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# =============================================================================
# FeatureStore: lazy per-episode loading from disk
# =============================================================================
class FeatureStore:
    """Reads one episode from its HDF5 shard on demand; holds no arrays.

    Parameters
    ----------
    root : base features directory (e.g. .../features)
    model : model name (e.g. "Llama-3.2-3B")
    layer : HDF5 key (e.g. "model.layers.11")
    series : "friends" | "movie10" | "ood"
    """

    def __init__(self, root: Path, model: str, layer: str, series: str):
        self._layer = layer
        # Episode id -> shard path; ids are parsed from the file name only,
        # so the actual HDF5 files are never opened here (lazy by design).
        self._paths: dict[str, Path] = {}
        for path in sorted((root / model / series).rglob("*.h5")):
            if path.stem.endswith("_video"):
                # e.g. "sub-01_task-s01e02a_video" -> "s01e02a"
                ep = path.stem.split("-")[-1].split("_")[0]
            else:
                # e.g. "friends_s01e02a" -> "s01e02a"
                ep = path.stem.split("_")[-1]
            self._paths[ep] = path

    def __getitem__(self, ep: "str | tuple") -> np.ndarray:
        """Load one episode's features as float32 (squeezed from the shard).

        Accepts either an episode id or a tuple whose first element is the id.
        Raises KeyError if the episode has no shard in this store.
        """
        key = ep[0] if isinstance(ep, tuple) else ep
        try:
            path = self._paths[key]
        except KeyError:
            # Same exception type callers already catch, but with context.
            raise KeyError(f"episode {key!r} not found in feature store") from None
        # Open read-only explicitly: older h5py defaulted to append mode,
        # which could silently create an empty file for a mistyped path.
        with h5py.File(path, "r") as f:
            return f[self._layer][:].squeeze().astype(np.float32)

    def __contains__(self, ep: "str | tuple") -> bool:
        key = ep[0] if isinstance(ep, tuple) else ep
        return key in self._paths

    def keys(self):
        """Episode ids available in this store."""
        return self._paths.keys()
# =============================================================================
# Load fMRI (compact; keep in memory)
# =============================================================================
print("Loading fMRI data...")
# All Friends seasons needed downstream: fit + alpha-selection + final eval.
friends_fmri = load_algonauts2025_friends_fmri(
    root=FMRI_PATH,
    subjects=SUBJECTS,
    seasons=[*TRAIN_SEASONS, *ALPHA_SEASONS, *VAL_SEASONS],
)
movie10_fmri = load_algonauts2025_movie10_fmri(
    root=FMRI_PATH,
    subjects=SUBJECTS,
    movies=TRAIN_MOVIES,
    runs=[1],
)
# Merge both series into one mapping (same semantics as {**a, **b}:
# on a duplicate key, the Movie10 entry would win).
all_fmri = dict(friends_fmri)
all_fmri.update(movie10_fmri)
print(f" {len(friends_fmri)} Friends + {len(movie10_fmri)} Movie10 episodes")
# Infer dimensions from an arbitrary episode; arrays are 3-D with subjects
# first and parcels last (middle axis is presumably time — confirm in loader).
_sample = next(iter(all_fmri.values()))
n_subs, _, n_parcels = _sample.shape
print(f" n_subs={n_subs}, n_parcels={n_parcels}")
# =============================================================================
# Feature stores (one per series; lazy)
# =============================================================================
print(f"\nBuilding FeatureStore: {MODEL_NAME} / {LAYER}")
# One lazy store per stimulus series; both read from the same features root.
_friends_store, _movie10_store = (
    FeatureStore(FEAT_PATH, MODEL_NAME, LAYER, series)
    for series in ("friends", "movie10")
)
def get_features(ep) -> np.ndarray:
    """Return one episode's feature array (float32, time x feat_dim).

    The Friends store is consulted first; any other episode falls through
    to the Movie10 store (which raises KeyError if it is absent there too).
    """
    store = _friends_store if ep in _friends_store else _movie10_store
    return store[ep]
# Peek at a single episode to learn the feature dimensionality.
_first_ep = next(iter(all_fmri))
feat_dim = get_features(_first_ep).shape[-1]
print(f" feat_dim={feat_dim}")
# =============================================================================
# Episode splits
# =============================================================================
# key=str so mixed id types (plain strings vs tuples) sort without error.
all_episodes = sorted(all_fmri.keys(), key=str)
fit_filter = episode_filter(seasons=TRAIN_SEASONS, movies=TRAIN_MOVIES, runs=[1])
alpha_filter = episode_filter(seasons=ALPHA_SEASONS, movies=[], runs=[1])
val_filter = episode_filter(seasons=VAL_SEASONS, movies=[], runs=[1])
# Each filter is a predicate over episode ids; apply to the shared ordering.
fit_episodes = list(filter(fit_filter, all_episodes))
alpha_episodes = list(filter(alpha_filter, all_episodes))
val_episodes = list(filter(val_filter, all_episodes))
print(f"\nFit episodes: {len(fit_episodes)}")
print(f"Alpha episodes: {len(alpha_episodes)}")
print(f"Val episodes: {len(val_episodes)}")