Spaces:
Sleeping
Sleeping
File size: 1,235 Bytes
from __future__ import annotations
from pathlib import Path
import pandas as pd
from datasets import load_dataset
def load_hf_dataframe(dataset: str, split: str = "train", revision: str | None = None) -> pd.DataFrame:
    """Fetch one split of a Hugging Face dataset and return it as a pandas DataFrame.

    Args:
        dataset: Passed to ``load_dataset`` as its ``path`` argument.
        split: Name of the split to load; defaults to ``"train"``.
        revision: Optional revision, forwarded to ``load_dataset`` only when
            it is not ``None``.

    Returns:
        The result of calling ``to_pandas()`` on the loaded split.
    """
    # The revision keyword is omitted entirely (rather than passed as None)
    # when the caller did not pin one.
    if revision is None:
        loaded_split = load_dataset(path=dataset, split=split)
    else:
        loaded_split = load_dataset(path=dataset, split=split, revision=revision)
    return loaded_split.to_pandas()
def load_dataframe(
    *,
    csv_path: str | None = None,
    hf_dataset: str | None = None,
    hf_split: str = "train",
    hf_revision: str | None = None,
) -> tuple[pd.DataFrame, str]:
    """Read a DataFrame from either a CSV file or a Hugging Face dataset.

    Exactly one of ``csv_path`` and ``hf_dataset`` must be given.

    Args:
        csv_path: Path of a CSV file to read with ``pd.read_csv``.
        hf_dataset: Dataset identifier handed to ``load_hf_dataframe``.
        hf_split: Split forwarded to ``load_hf_dataframe``.
        hf_revision: Revision forwarded to ``load_hf_dataframe``.

    Returns:
        A ``(frame, label)`` pair, where ``label`` is ``hf_dataset`` for the
        Hugging Face source or the stringified CSV path for the CSV source.

    Raises:
        ValueError: If both sources or neither source is provided.
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    # Both tests agree only when zero or two sources were supplied.
    if (csv_path is None) == (hf_dataset is None):
        raise ValueError("Provide exactly one data source: csv_path or hf_dataset")

    if hf_dataset is not None:
        frame = load_hf_dataframe(hf_dataset, split=hf_split, revision=hf_revision)
        return frame, hf_dataset

    # Only the CSV source can be set here; the assert narrows the type for checkers.
    assert csv_path is not None
    csv_file = Path(csv_path)
    if csv_file.exists():
        return pd.read_csv(csv_file), str(csv_file)
    raise FileNotFoundError(f"CSV file not found: {csv_file}")
|