"""Data loading, cleaning, and splitting.

Pure functions over file paths. Nothing here trains a model or logs to W&B —
that's the train.py boundary.
"""

from __future__ import annotations

from fractions import Fraction

import pandas as pd

from . import config


def load_raw(filename: str) -> pd.DataFrame:
    """Load a raw dataset by filename from data/raw/ (``config.RAW_DIR``).

    Args:
        filename: Name of the file, joined onto ``config.RAW_DIR``.

    Returns:
        The file contents as a DataFrame. Files with a ``.csv`` extension
        (matched case-insensitively) are parsed with ``pd.read_csv``; every
        other file is handed to ``pd.read_parquet``.
    """
    # NOTE(review): assumes config.RAW_DIR is a pathlib.Path (it is joined
    # with "/" and queried for .suffix) — confirm in config.
    path = config.RAW_DIR / filename
    # Lower-case the suffix so "DATA.CSV" is not mistaken for parquet.
    if path.suffix.lower() == ".csv":
        return pd.read_csv(path)
    return pd.read_parquet(path)


def chronological_split(
    df: pd.DataFrame,
    time_col: str,
    train_frac: float = 0.8,
    val_frac: float = 0.1,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Time-ordered train/val/test split. Never shuffle a time series.

    The frame is sorted by ``time_col`` (rows with equal timestamps keep
    their input order), re-indexed from 0, and cut into three contiguous
    slices: the earliest ``train_frac`` of the rows, the next ``val_frac``,
    and whatever remains as the test set. Boundaries are floored to whole
    rows, so the test set absorbs any remainder.

    Args:
        df: Frame to split; it is not modified.
        time_col: Column to order by.
        train_frac: Share of rows for the training slice, in ``[0, 1]``.
        val_frac: Share of rows for the validation slice, in ``[0, 1]``;
            ``train_frac + val_frac`` must not exceed 1.

    Returns:
        ``(train, val, test)`` as slices of the sorted frame.

    Raises:
        ValueError: If a fraction is negative or not finite, or if the two
            fractions sum to more than 1.
        KeyError: If ``time_col`` is not a column of ``df`` (raised by
            ``DataFrame.sort_values``).
    """
    # Do the boundary maths exactly. In binary floating point
    # 0.7 + 0.1 == 0.7999999999999999, so int(10 * (0.7 + 0.1)) truncates to
    # 7 and silently yields an empty validation set. Going through the
    # shortest decimal repr recovers the value the caller wrote (7/10, 1/10).
    # Fraction() rejects "nan"/"inf" with ValueError.
    train_share = Fraction(str(float(train_frac)))
    val_share = Fraction(str(float(val_frac)))
    if train_share < 0 or val_share < 0 or train_share + val_share > 1:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1; "
            f"got train_frac={train_frac!r}, val_frac={val_frac!r}"
        )

    # mergesort is stable: rows sharing a timestamp stay in input order, so
    # the split boundary does not depend on quicksort's arbitrary tie order.
    df_sorted = df.sort_values(time_col, kind="mergesort").reset_index(drop=True)

    n = len(df_sorted)
    train_end = int(n * train_share)
    val_end = int(n * (train_share + val_share))
    return (
        df_sorted.iloc[:train_end],
        df_sorted.iloc[train_end:val_end],
        df_sorted.iloc[val_end:],
    )