""" HuggingFace Dataset I/O Utilities Reusable functions for uploading and downloading JSONL data to/from HuggingFace Hub. Supports the dataset_name@config_name notation for managing multiple configurations. """ from typing import List, Optional import pandas as pd from datasets import Dataset, load_dataset def list_dataset_configs(dataset_name: str) -> Optional[List[str]]: """ List all available configs for a dataset on HuggingFace Hub. Args: dataset_name: Name of the dataset (e.g., "username/my-dataset") Returns: List of config names, or None if unable to retrieve Example: >>> configs = list_dataset_configs("username/hf-agent-benchmark") >>> print(configs) ['default', 'rubrics', 'evaluations'] """ try: from datasets import get_dataset_config_names configs = get_dataset_config_names(dataset_name) return configs except Exception as e: print(f"✗ Failed to list configs: {type(e).__name__}: {str(e)}") return None def df_to_hub( df: pd.DataFrame, dataset_spec: str, split: str = "train", private: bool = False, ) -> bool: """ Upload a pandas DataFrame directly to HuggingFace Hub as a dataset. This function converts a pandas DataFrame to a HuggingFace Dataset and uploads it to the Hub. This is useful for uploading data directly without creating an intermediate JSONL file. Args: df: pandas DataFrame to upload. All column types should be serializable. Example DataFrame: ``` | question | solution | rubric | |----------|----------|--------| | "How..." | "You..." | {...} | ``` dataset_spec: Dataset specification in the format "dataset_name" or "dataset_name@config_name". Examples: - "username/my-dataset" (uses "default" config) - "username/my-dataset@rubrics" (uses "rubrics" config) - "username/my-dataset@evaluations" (uses "evaluations" config) split: The dataset split name. Defaults to "train". Common values: - "train": Training or main data - "validation": Validation data - "test": Test data private: Whether to create a private dataset. Defaults to False (public). Returns: bool: True if upload succeeded, False otherwise Raises: ValueError: If DataFrame is empty Exception: For HuggingFace Hub upload errors Example: >>> import pandas as pd >>> df = pd.DataFrame({ ... "question": ["How to train?", "What is fine-tuning?"], ... "solution": ["Use trainer...", "Fine-tuning is..."], ... "rubric": ['[{"title": "...", ...}]', '[{"title": "...", ...}]'] ... }) >>> upload_dataframe_to_hf(df, "username/dataset@rubrics") Notes: - Requires authentication via `huggingface-cli login` or HF_TOKEN env var - DataFrame columns with complex objects should be serialized first (e.g., to JSON strings) - If the dataset doesn't exist, it will be created automatically - Empty DataFrames will raise ValueError to prevent uploading invalid data """ # Validate DataFrame if df.empty: raise ValueError("DataFrame is empty") # Parse dataset specification if "@" in dataset_spec: dataset_name, config_name = dataset_spec.split("@", 1) else: dataset_name = dataset_spec config_name = "default" try: print("\nUploading DataFrame to HuggingFace Hub...") print(f" Dataset: {dataset_name}") print(f" Config: {config_name}") print(f" Split: {split}") print(f" Rows: {len(df)}") print(f" Columns: {list(df.columns)}") # Convert DataFrame to HuggingFace Dataset dataset = Dataset.from_pandas(df) # Upload to HuggingFace Hub dataset.push_to_hub( dataset_name, config_name=config_name, split=split, private=private, ) print( f"✓ Successfully uploaded to {dataset_name}@{config_name} (split: {split})" ) return True except Exception as e: print(f"✗ Failed to upload to HuggingFace: {type(e).__name__}: {str(e)}") return False def hub_to_df( dataset_spec: str, split: str = "train", ) -> Optional[pd.DataFrame]: """ Download a dataset from HuggingFace Hub as a pandas DataFrame. This function downloads a dataset from the HuggingFace Hub and returns it as a pandas DataFrame for immediate use in Python. Args: dataset_spec: Dataset specification in the format "dataset_name" or "dataset_name@config_name". Examples: - "username/my-dataset" (uses "default" config) - "username/my-dataset@rubrics" (uses "rubrics" config) - "username/my-dataset@evaluations" (uses "evaluations" config) split: The dataset split to download. Defaults to "train". Common values: - "train": Training or main data - "validation": Validation data - "test": Test data Returns: pd.DataFrame: Downloaded data as pandas DataFrame, or None if failed Raises: ValueError: If the dataset/config/split doesn't exist Exception: For HuggingFace Hub download errors Example: >>> # Download rubrics from specific config >>> df = hub_to_df("username/hf-agent-benchmark@rubrics") >>> print(df.head()) >>> print(f"Shape: {df.shape}") >>> # Download evaluation results >>> results_df = download_hf_to_dataframe( ... "username/hf-agent-benchmark@evaluations", ... split="test" ... ) Notes: - Requires authentication for private datasets via `huggingface-cli login` - Downloaded data will be in the same format as uploaded (preserves structure) - Large datasets may take time to download and consume significant memory - For very large datasets, consider using streaming or download_hf_to_jsonl """ # Parse dataset specification if "@" in dataset_spec: dataset_name, config_name = dataset_spec.split("@", 1) else: dataset_name = dataset_spec config_name = "default" try: print("\nDownloading from HuggingFace Hub...") print(f" Dataset: {dataset_name}") print(f" Config: {config_name}") print(f" Split: {split}") # Download dataset from HuggingFace Hub dataset = load_dataset( dataset_name, name=config_name, split=split, ) print(f" Downloaded {len(dataset)} records") # Convert to pandas DataFrame df = dataset.to_pandas() print("✓ Successfully loaded as DataFrame") print(f" Shape: {df.shape}") print(f" Columns: {list(df.columns)}") return df except Exception as e: print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}") return None