# File size: 2,337 Bytes (c70b852)
"""Tests for src.core.storage."""
from __future__ import annotations
import hashlib
from pathlib import Path
import pandas as pd
import pytest
from src.core import storage
def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the bytes stored at *path*."""
    hasher = hashlib.md5()
    # The whole file is read in one go, then fed to the hash object.
    hasher.update(path.read_bytes())
    return hasher.hexdigest()
class TestWriteParquet:
    """Behavioural checks for ``storage.write_parquet``."""

    def test_writes_parquet_at_path(self, tmp_path: Path):
        """A frame written to a path reads back equal to the original."""
        target = tmp_path / "out.parquet"
        frame = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

        storage.write_parquet(frame, target)

        pd.testing.assert_frame_equal(pd.read_parquet(target), frame)

    def test_creates_parent_directories(self, tmp_path: Path):
        """Writing below directories that do not exist yet still yields a file."""
        target = tmp_path.joinpath("deep", "nested", "out.parquet")

        storage.write_parquet(pd.DataFrame({"a": [1]}), target)

        assert target.exists()

    def test_overwrites_existing_file(self, tmp_path: Path):
        """A second write to the same path replaces the earlier contents."""
        target = tmp_path / "out.parquet"

        # Write value 1 first, then value 2 over the same file.
        for value in (1, 2):
            storage.write_parquet(pd.DataFrame({"a": [value]}), target)

        stored = pd.read_parquet(target)
        assert stored["a"].tolist() == [2]

    def test_raises_if_path_is_directory(self, tmp_path: Path):
        """A target path occupied by a directory raises IsADirectoryError."""
        blocker = tmp_path / "out.parquet"
        blocker.mkdir()
        frame = pd.DataFrame({"a": [1]})

        with pytest.raises(IsADirectoryError):
            storage.write_parquet(frame, blocker)

    def test_byte_deterministic_on_repeat(self, tmp_path: Path):
        """Writing the same frame twice produces files with equal MD5 digests."""
        frame = pd.DataFrame({"a": list(range(100)), "b": list(range(100, 200))})
        targets = [tmp_path / "a.parquet", tmp_path / "b.parquet"]

        for target in targets:
            storage.write_parquet(frame, target)

        first_digest, second_digest = (_md5(target) for target in targets)
        assert first_digest == second_digest

    def test_preserves_uint8_dtype(self, tmp_path: Path):
        """BBB fingerprints are uint8; the column must come back as uint8, not widened."""
        fingerprints = pd.Series([0, 1], dtype="uint8")
        target = tmp_path / "out.parquet"

        storage.write_parquet(pd.DataFrame({"fp_0": fingerprints}), target)

        restored = pd.read_parquet(target)
        assert restored["fp_0"].dtype == "uint8"

    def test_index_not_persisted(self, tmp_path: Path):
        """index=False must be the default: custom labels come back as 0, 1."""
        frame = pd.DataFrame({"a": [1, 2]}, index=["foo", "bar"])
        target = tmp_path / "out.parquet"

        storage.write_parquet(frame, target)

        restored_index = pd.read_parquet(target).index
        assert list(restored_index) == [0, 1]
|