feat(core): extract write_parquet() helper for §6 storage contract
Browse files- src/core/storage.py +36 -0
- tests/core/test_storage.py +61 -0
src/core/storage.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic Parquet I/O for `data/processed/` outputs.
|
| 2 |
+
|
| 3 |
+
Implements AGENTS.md §6 storage convention: pyarrow engine, snappy compression,
|
| 4 |
+
index suppressed. Combined with `src.core.determinism.pin_threads`, this writes
|
| 5 |
+
byte-identical Parquet files across runs.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def write_parquet(df: pd.DataFrame, output_path: Path) -> None:
|
| 15 |
+
"""Write `df` to `output_path` as deterministic, snappy-compressed Parquet.
|
| 16 |
+
|
| 17 |
+
Creates parent directories as needed. Overwrites any existing file at
|
| 18 |
+
`output_path`. Raises `IsADirectoryError` if `output_path` resolves to an
|
| 19 |
+
existing directory (caller passed a directory by mistake).
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
df: DataFrame to persist. Dtypes preserved (uint8 stays uint8, etc.).
|
| 23 |
+
output_path: Destination file path (parent directories auto-created).
|
| 24 |
+
|
| 25 |
+
Raises:
|
| 26 |
+
IsADirectoryError: if `output_path` is an existing directory.
|
| 27 |
+
"""
|
| 28 |
+
output_path = Path(output_path)
|
| 29 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 30 |
+
if output_path.is_dir():
|
| 31 |
+
raise IsADirectoryError(
|
| 32 |
+
f"output_path must be a file, got a directory: {output_path}"
|
| 33 |
+
)
|
| 34 |
+
df.to_parquet(
|
| 35 |
+
output_path, index=False, engine="pyarrow", compression="snappy",
|
| 36 |
+
)
|
tests/core/test_storage.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for src.core.storage."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import hashlib
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
from src.core import storage
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _md5(path: Path) -> str:
|
| 14 |
+
return hashlib.md5(path.read_bytes()).hexdigest()
|
| 15 |
+
|
| 16 |
+
|
class TestWriteParquet:
    """Behavioral contract of `storage.write_parquet` (AGENTS.md §6 storage rules)."""

    def test_writes_parquet_at_path(self, tmp_path: Path):
        frame = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
        destination = tmp_path / "out.parquet"
        storage.write_parquet(frame, destination)
        pd.testing.assert_frame_equal(pd.read_parquet(destination), frame)

    def test_creates_parent_directories(self, tmp_path: Path):
        destination = tmp_path / "deep" / "nested" / "out.parquet"
        storage.write_parquet(pd.DataFrame({"a": [1]}), destination)
        assert destination.exists()

    def test_overwrites_existing_file(self, tmp_path: Path):
        destination = tmp_path / "out.parquet"
        # Second write must clobber the first, not append or error.
        for value in (1, 2):
            storage.write_parquet(pd.DataFrame({"a": [value]}), destination)
        assert pd.read_parquet(destination)["a"].tolist() == [2]

    def test_raises_if_path_is_directory(self, tmp_path: Path):
        directory = tmp_path / "out.parquet"
        directory.mkdir()
        with pytest.raises(IsADirectoryError):
            storage.write_parquet(pd.DataFrame({"a": [1]}), directory)

    def test_byte_deterministic_on_repeat(self, tmp_path: Path):
        frame = pd.DataFrame({"a": list(range(100)), "b": list(range(100, 200))})
        first = tmp_path / "a.parquet"
        second = tmp_path / "b.parquet"
        storage.write_parquet(frame, first)
        storage.write_parquet(frame, second)
        assert _md5(first) == _md5(second)

    def test_preserves_uint8_dtype(self, tmp_path: Path):
        """BBB fingerprints are uint8; writing must not silently widen."""
        frame = pd.DataFrame({"fp_0": pd.Series([0, 1], dtype="uint8")})
        destination = tmp_path / "out.parquet"
        storage.write_parquet(frame, destination)
        assert pd.read_parquet(destination)["fp_0"].dtype == "uint8"

    def test_index_not_persisted(self, tmp_path: Path):
        """index=False must be the default — round-trip should reset to RangeIndex."""
        frame = pd.DataFrame({"a": [1, 2]}, index=["foo", "bar"])
        destination = tmp_path / "out.parquet"
        storage.write_parquet(frame, destination)
        assert list(pd.read_parquet(destination).index) == [0, 1]