File size: 1,309 Bytes
c70b852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
"""Deterministic Parquet I/O for `data/processed/` outputs.

Implements AGENTS.md §6 storage convention: pyarrow engine, snappy compression,
index suppressed. Combined with `src.core.determinism.pin_threads`, this writes
byte-identical Parquet files across runs.
"""
from __future__ import annotations

from pathlib import Path

import pandas as pd


def write_parquet(df: pd.DataFrame, output_path: Path) -> None:
    """Write `df` to `output_path` as deterministic, snappy-compressed Parquet.

    Creates parent directories as needed. Overwrites any existing file at
    `output_path`. Raises `IsADirectoryError` if `output_path` resolves to an
    existing directory (caller passed a directory by mistake).

    Args:
        df: DataFrame to persist. Dtypes preserved (uint8 stays uint8, etc.).
        output_path: Destination file path (parent directories auto-created).

    Raises:
        IsADirectoryError: if `output_path` is an existing directory.
    """
    output_path = Path(output_path)
    # Fail fast: validate before any filesystem mutation so the error path
    # leaves the filesystem untouched (previously parent dirs were created
    # before this check ran).
    if output_path.is_dir():
        raise IsADirectoryError(
            f"output_path must be a file, got a directory: {output_path}"
        )
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # pyarrow + snappy + index=False is the project's deterministic-output
    # convention; combined with pinned threads this is byte-stable across runs.
    df.to_parquet(
        output_path, index=False, engine="pyarrow", compression="snappy",
    )