File size: 2,337 Bytes
c70b852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Tests for src.core.storage."""
from __future__ import annotations

import hashlib
from pathlib import Path

import pandas as pd
import pytest

from src.core import storage


def _md5(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the bytes stored at *path*.

    Used only to compare two files for byte-level equality in the tests
    below; it is not a security primitive.
    """
    hasher = hashlib.md5()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()


class TestWriteParquet:
    """Behavioural checks for ``storage.write_parquet``."""

    def test_writes_parquet_at_path(self, tmp_path: Path):
        """A written frame reads back equal to the frame that was written."""
        target = tmp_path / "out.parquet"
        original = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

        storage.write_parquet(original, target)

        restored = pd.read_parquet(target)
        pd.testing.assert_frame_equal(restored, original)

    def test_creates_parent_directories(self, tmp_path: Path):
        """Writing below directories that do not exist yet still yields the file."""
        target = tmp_path / "deep" / "nested" / "out.parquet"
        frame = pd.DataFrame({"a": [1]})

        storage.write_parquet(frame, target)

        assert target.exists()

    def test_overwrites_existing_file(self, tmp_path: Path):
        """A second write to the same path replaces the earlier contents."""
        target = tmp_path / "out.parquet"

        # Write twice to the same location; only the last value should survive.
        for value in (1, 2):
            storage.write_parquet(pd.DataFrame({"a": [value]}), target)

        stored_values = pd.read_parquet(target)["a"].tolist()
        assert stored_values == [2]

    def test_raises_if_path_is_directory(self, tmp_path: Path):
        """Pointing the writer at an existing directory raises IsADirectoryError."""
        target = tmp_path / "out.parquet"
        target.mkdir()
        frame = pd.DataFrame({"a": [1]})

        with pytest.raises(IsADirectoryError):
            storage.write_parquet(frame, target)

    def test_byte_deterministic_on_repeat(self, tmp_path: Path):
        """Writing the same frame twice produces byte-identical files."""
        frame = pd.DataFrame(
            {
                "a": list(range(100)),
                "b": list(range(100, 200)),
            }
        )
        first = tmp_path / "a.parquet"
        second = tmp_path / "b.parquet"

        # Both files are written before either is hashed.
        for destination in (first, second):
            storage.write_parquet(frame, destination)

        first_digest = _md5(first)
        second_digest = _md5(second)
        assert first_digest == second_digest

    def test_preserves_uint8_dtype(self, tmp_path: Path):
        """BBB fingerprints are uint8; a round trip must keep that dtype."""
        target = tmp_path / "out.parquet"
        fingerprint_column = pd.Series([0, 1], dtype="uint8")
        frame = pd.DataFrame({"fp_0": fingerprint_column})

        storage.write_parquet(frame, target)

        restored = pd.read_parquet(target)
        assert restored["fp_0"].dtype == "uint8"

    def test_index_not_persisted(self, tmp_path: Path):
        """The frame's index is not stored: reading back yields labels 0, 1."""
        target = tmp_path / "out.parquet"
        labelled = pd.DataFrame({"a": [1, 2]}, index=["foo", "bar"])

        storage.write_parquet(labelled, target)

        restored_index = list(pd.read_parquet(target).index)
        assert restored_index == [0, 1]