File size: 1,235 Bytes
a1b4ce8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from __future__ import annotations

from pathlib import Path

import pandas as pd
from datasets import load_dataset


def load_hf_dataframe(dataset: str, split: str = "train", revision: str | None = None) -> pd.DataFrame:
    """Fetch one split of a Hugging Face dataset and convert it to pandas.

    Args:
        dataset: Forwarded to ``load_dataset`` as its ``path`` argument.
        split: Forwarded as ``split``; defaults to ``"train"``.
        revision: Forwarded as ``revision`` only when it is not ``None``;
            otherwise the argument is omitted from the call entirely.

    Returns:
        Whatever ``to_pandas()`` yields for the loaded split.
    """
    # Omit ``revision`` rather than passing None so the library's own default applies.
    if revision is None:
        loaded_split = load_dataset(path=dataset, split=split)
    else:
        loaded_split = load_dataset(path=dataset, split=split, revision=revision)
    return loaded_split.to_pandas()


def load_dataframe(
    *,
    csv_path: str | None = None,
    hf_dataset: str | None = None,
    hf_split: str = "train",
    hf_revision: str | None = None,
) -> tuple[pd.DataFrame, str]:
    """Read a DataFrame from a CSV file or a Hugging Face dataset.

    Exactly one of ``csv_path`` and ``hf_dataset`` must be given.

    Args:
        csv_path: Path to a CSV file to read with ``pd.read_csv``.
        hf_dataset: Dataset identifier handed to ``load_hf_dataframe``.
        hf_split: Split passed to ``load_hf_dataframe`` (Hugging Face source only).
        hf_revision: Revision passed to ``load_hf_dataframe`` (Hugging Face source only).

    Returns:
        A ``(frame, label)`` pair. ``label`` is ``hf_dataset`` for the
        Hugging Face source, or the stringified CSV path for the CSV source.

    Raises:
        ValueError: If neither source or both sources are supplied.
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    supplied = sum(1 for candidate in (csv_path, hf_dataset) if candidate is not None)
    if supplied != 1:
        raise ValueError("Provide exactly one data source: csv_path or hf_dataset")

    if hf_dataset is None:
        # Exactly one source is set, so it must be the CSV path; the assert
        # narrows the Optional type.
        assert csv_path is not None
        csv_file = Path(csv_path)
        if csv_file.exists():
            return pd.read_csv(csv_file), str(csv_file)
        raise FileNotFoundError(f"CSV file not found: {csv_file}")

    frame = load_hf_dataframe(hf_dataset, split=hf_split, revision=hf_revision)
    return frame, hf_dataset