Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """ | |
| HuggingFace Dataset I/O Utilities | |
| Reusable functions for uploading and downloading JSONL data to/from HuggingFace Hub. | |
| Supports the dataset_name@config_name notation for managing multiple configurations. | |
| """ | |
| from typing import List, Optional | |
| import pandas as pd | |
| from datasets import Dataset, load_dataset | |
def list_dataset_configs(dataset_name: str) -> Optional[List[str]]:
    """
    Look up the configuration names published for a Hub dataset.

    This is a best-effort helper: any failure (including a failure to import
    the lookup function itself) is printed and reported as ``None`` instead
    of being raised.

    Args:
        dataset_name: Name of the dataset (e.g., "username/my-dataset")

    Returns:
        Whatever ``datasets.get_dataset_config_names`` returns for the
        dataset (a list of config names), or None if the lookup failed.

    Example:
        >>> configs = list_dataset_configs("username/hf-agent-benchmark")
        >>> print(configs)
        ['default', 'rubrics', 'evaluations']
    """
    found: Optional[List[str]] = None
    try:
        # Imported inside the try block so an import failure is reported
        # through the same handler as a failed lookup.
        from datasets import get_dataset_config_names as fetch_config_names

        found = fetch_config_names(dataset_name)
    except Exception as e:
        print(f"✗ Failed to list configs: {type(e).__name__}: {str(e)}")
    return found
def df_to_hub(
    df: pd.DataFrame,
    dataset_spec: str,
    split: str = "train",
    private: bool = False,
) -> bool:
    """
    Upload a pandas DataFrame directly to HuggingFace Hub as a dataset.

    The DataFrame is converted with ``Dataset.from_pandas`` and pushed with
    ``Dataset.push_to_hub``, so no intermediate JSONL file is needed.

    Args:
        df: pandas DataFrame to upload. Must not be empty. All column types
            should be serializable. Example DataFrame:
            ```
            | question | solution | rubric |
            |----------|----------|--------|
            | "How..." | "You..." | {...}  |
            ```
        dataset_spec: Dataset specification in the format "dataset_name" or
            "dataset_name@config_name". Surrounding whitespace is ignored and
            a missing or empty config name falls back to "default". Examples:
            - "username/my-dataset" (uses "default" config)
            - "username/my-dataset@rubrics" (uses "rubrics" config)
            - "username/my-dataset@evaluations" (uses "evaluations" config)
        split: The dataset split name. Defaults to "train". Common values:
            - "train": Training or main data
            - "validation": Validation data
            - "test": Test data
        private: Whether to create a private dataset. Defaults to False (public).

    Returns:
        bool: True if the upload succeeded. False if the conversion or the
        upload raised; the error is printed, not propagated.

    Raises:
        ValueError: If the DataFrame is empty. This is the only exception
            that reaches the caller.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     "question": ["How to train?", "What is fine-tuning?"],
        ...     "solution": ["Use trainer...", "Fine-tuning is..."],
        ...     "rubric": ['[{"title": "...", ...}]', '[{"title": "...", ...}]']
        ... })
        >>> df_to_hub(df, "username/dataset@rubrics")

    Notes:
        - Requires authentication via `huggingface-cli login` or HF_TOKEN env var
        - DataFrame columns with complex objects should be serialized first
          (e.g., to JSON strings)
        - If the dataset doesn't exist, it will be created automatically
    """
    # Refuse to upload an empty table; this check sits outside the try block
    # on purpose so the ValueError is visible to the caller.
    if df.empty:
        raise ValueError("DataFrame is empty")

    # Split "name@config" on the first "@". A missing or blank config
    # (e.g. "user/ds" or "user/ds@") resolves to "default".
    name_part, _, config_part = dataset_spec.partition("@")
    dataset_name = name_part.strip()
    config_name = config_part.strip() or "default"

    try:
        print("\nUploading DataFrame to HuggingFace Hub...")
        print(f" Dataset: {dataset_name}")
        print(f" Config: {config_name}")
        print(f" Split: {split}")
        print(f" Rows: {len(df)}")
        print(f" Columns: {list(df.columns)}")

        # Convert DataFrame to HuggingFace Dataset
        dataset = Dataset.from_pandas(df)

        # Upload to HuggingFace Hub
        dataset.push_to_hub(
            dataset_name,
            config_name=config_name,
            split=split,
            private=private,
        )
        print(
            f"✓ Successfully uploaded to {dataset_name}@{config_name} (split: {split})"
        )
        return True
    except Exception as e:
        # Best-effort contract: report the failure and signal it via False.
        print(f"✗ Failed to upload to HuggingFace: {type(e).__name__}: {str(e)}")
        return False
def hub_to_df(
    dataset_spec: str,
    split: str = "train",
) -> Optional[pd.DataFrame]:
    """
    Download a dataset from HuggingFace Hub as a pandas DataFrame.

    The requested config and split are loaded with ``load_dataset`` and
    converted with ``Dataset.to_pandas``.

    Args:
        dataset_spec: Dataset specification in the format "dataset_name" or
            "dataset_name@config_name". Surrounding whitespace is ignored and
            a missing or empty config name falls back to "default". Examples:
            - "username/my-dataset" (uses "default" config)
            - "username/my-dataset@rubrics" (uses "rubrics" config)
            - "username/my-dataset@evaluations" (uses "evaluations" config)
        split: The dataset split to download. Defaults to "train". Common values:
            - "train": Training or main data
            - "validation": Validation data
            - "test": Test data

    Returns:
        pd.DataFrame: Downloaded data as pandas DataFrame, or None if the
        download or conversion failed (for example because the
        dataset/config/split does not exist). Failures are printed, not
        raised.

    Example:
        >>> # Download rubrics from specific config
        >>> df = hub_to_df("username/hf-agent-benchmark@rubrics")
        >>> print(df.head())
        >>> print(f"Shape: {df.shape}")
        >>> # Download evaluation results
        >>> results_df = hub_to_df(
        ...     "username/hf-agent-benchmark@evaluations",
        ...     split="test"
        ... )

    Notes:
        - Requires authentication for private datasets via `huggingface-cli login`
        - The whole split is materialized in memory; large datasets may take
          time to download and consume significant memory
        - For very large datasets, consider the streaming mode of
          `datasets.load_dataset` instead of this helper
    """
    # Split "name@config" on the first "@". A missing or blank config
    # (e.g. "user/ds" or "user/ds@") resolves to "default".
    name_part, _, config_part = dataset_spec.partition("@")
    dataset_name = name_part.strip()
    config_name = config_part.strip() or "default"

    try:
        print("\nDownloading from HuggingFace Hub...")
        print(f" Dataset: {dataset_name}")
        print(f" Config: {config_name}")
        print(f" Split: {split}")

        # Download dataset from HuggingFace Hub
        dataset = load_dataset(
            dataset_name,
            name=config_name,
            split=split,
        )
        print(f" Downloaded {len(dataset)} records")

        # Convert to pandas DataFrame
        df = dataset.to_pandas()
        print("✓ Successfully loaded as DataFrame")
        print(f" Shape: {df.shape}")
        print(f" Columns: {list(df.columns)}")
        return df
    except Exception as e:
        # Best-effort contract: report the failure and signal it via None.
        print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}")
        return None