"""Deterministic Parquet I/O for `data/processed/` outputs. Implements AGENTS.md ยง6 storage convention: pyarrow engine, snappy compression, index suppressed. Combined with `src.core.determinism.pin_threads`, this writes byte-identical Parquet files across runs. """ from __future__ import annotations from pathlib import Path import pandas as pd def write_parquet(df: pd.DataFrame, output_path: Path) -> None: """Write `df` to `output_path` as deterministic, snappy-compressed Parquet. Creates parent directories as needed. Overwrites any existing file at `output_path`. Raises `IsADirectoryError` if `output_path` resolves to an existing directory (caller passed a directory by mistake). Args: df: DataFrame to persist. Dtypes preserved (uint8 stays uint8, etc.). output_path: Destination file path (parent directories auto-created). Raises: IsADirectoryError: if `output_path` is an existing directory. """ output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) if output_path.is_dir(): raise IsADirectoryError( f"output_path must be a file, got a directory: {output_path}" ) df.to_parquet( output_path, index=False, engine="pyarrow", compression="snappy", )