"""
HuggingFace Dataset I/O Utilities
Reusable functions for uploading and downloading JSONL data to/from HuggingFace Hub.
Supports the dataset_name@config_name notation for managing multiple configurations.
"""
from typing import List, Optional
import pandas as pd
from datasets import Dataset, load_dataset
def list_dataset_configs(dataset_name: str) -> Optional[List[str]]:
    """
    Return the config names available for a dataset on HuggingFace Hub.

    Args:
        dataset_name: Name of the dataset (e.g., "username/my-dataset")

    Returns:
        List of config names, or None if they could not be retrieved
        (missing `datasets` install, network failure, unknown dataset, ...).

    Example:
        >>> configs = list_dataset_configs("username/hf-agent-benchmark")
        >>> print(configs)
        ['default', 'rubrics', 'evaluations']
    """
    try:
        # Imported lazily inside the try so that an absent/broken `datasets`
        # installation degrades to a printed warning + None instead of crashing.
        from datasets import get_dataset_config_names

        return get_dataset_config_names(dataset_name)
    except Exception as e:
        print(f"✗ Failed to list configs: {type(e).__name__}: {str(e)}")
        return None
def df_to_hub(
    df: pd.DataFrame,
    dataset_spec: str,
    split: str = "train",
    private: bool = False,
) -> bool:
    """
    Upload a pandas DataFrame directly to HuggingFace Hub as a dataset.

    This function converts a pandas DataFrame to a HuggingFace Dataset and uploads
    it to the Hub. This is useful for uploading data directly without creating an
    intermediate JSONL file.

    Args:
        df: pandas DataFrame to upload. All column types should be serializable.
            Example DataFrame:
            ```
            | question | solution | rubric |
            |----------|----------|--------|
            | "How..." | "You..." | {...}  |
            ```
        dataset_spec: Dataset specification in the format "dataset_name" or
            "dataset_name@config_name". Examples:
            - "username/my-dataset" (uses "default" config)
            - "username/my-dataset@rubrics" (uses "rubrics" config)
            - "username/my-dataset@evaluations" (uses "evaluations" config)
        split: The dataset split name. Defaults to "train". Common values:
            - "train": Training or main data
            - "validation": Validation data
            - "test": Test data
        private: Whether to create a private dataset. Defaults to False (public).

    Returns:
        bool: True if upload succeeded, False otherwise

    Raises:
        ValueError: If DataFrame is empty

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     "question": ["How to train?", "What is fine-tuning?"],
        ...     "solution": ["Use trainer...", "Fine-tuning is..."],
        ...     "rubric": ['[{"title": "...", ...}]', '[{"title": "...", ...}]']
        ... })
        >>> df_to_hub(df, "username/dataset@rubrics")

    Notes:
        - Requires authentication via `huggingface-cli login` or HF_TOKEN env var
        - DataFrame columns with complex objects should be serialized first (e.g., to JSON strings)
        - If the dataset doesn't exist, it will be created automatically
        - Empty DataFrames will raise ValueError to prevent uploading invalid data
        - NOTE(review): `Dataset.from_pandas` preserves a non-default index as an
          extra "__index_level_0__" column; call `df.reset_index(drop=True)`
          before uploading if that is not desired.
        - Hub/network errors are caught and reported; the function returns False
          rather than raising in that case.
    """
    # Validate up front so we never push an empty dataset to the Hub.
    if df.empty:
        raise ValueError("DataFrame is empty")

    # Parse "dataset_name@config_name" notation; a bare name maps to "default".
    if "@" in dataset_spec:
        dataset_name, config_name = dataset_spec.split("@", 1)
    else:
        dataset_name = dataset_spec
        config_name = "default"

    try:
        print("\nUploading DataFrame to HuggingFace Hub...")
        print(f"  Dataset: {dataset_name}")
        print(f"  Config: {config_name}")
        print(f"  Split: {split}")
        print(f"  Rows: {len(df)}")
        print(f"  Columns: {list(df.columns)}")

        # Convert DataFrame to HuggingFace Dataset
        dataset = Dataset.from_pandas(df)

        # Upload to HuggingFace Hub (creates the repo/config if missing).
        dataset.push_to_hub(
            dataset_name,
            config_name=config_name,
            split=split,
            private=private,
        )

        print(
            f"✓ Successfully uploaded to {dataset_name}@{config_name} (split: {split})"
        )
        return True
    except Exception as e:
        # Best-effort boundary: report and signal failure instead of raising.
        print(f"✗ Failed to upload to HuggingFace: {type(e).__name__}: {str(e)}")
        return False
def hub_to_df(
    dataset_spec: str,
    split: str = "train",
) -> Optional[pd.DataFrame]:
    """
    Download a dataset from HuggingFace Hub as a pandas DataFrame.

    This function downloads a dataset from the HuggingFace Hub and returns it as a
    pandas DataFrame for immediate use in Python.

    Args:
        dataset_spec: Dataset specification in the format "dataset_name" or
            "dataset_name@config_name". Examples:
            - "username/my-dataset" (uses "default" config)
            - "username/my-dataset@rubrics" (uses "rubrics" config)
            - "username/my-dataset@evaluations" (uses "evaluations" config)
        split: The dataset split to download. Defaults to "train". Common values:
            - "train": Training or main data
            - "validation": Validation data
            - "test": Test data

    Returns:
        pd.DataFrame: Downloaded data as pandas DataFrame, or None if failed

    Example:
        >>> # Download rubrics from specific config
        >>> df = hub_to_df("username/hf-agent-benchmark@rubrics")
        >>> print(df.head())
        >>> print(f"Shape: {df.shape}")

        >>> # Download evaluation results
        >>> results_df = hub_to_df(
        ...     "username/hf-agent-benchmark@evaluations",
        ...     split="test"
        ... )

    Notes:
        - Requires authentication for private datasets via `huggingface-cli login`
        - Downloaded data will be in the same format as uploaded (preserves structure)
        - Large datasets may take time to download and consume significant memory
        - For very large datasets, consider `load_dataset(..., streaming=True)`
          instead of materializing everything into a DataFrame
        - All download errors (unknown dataset/config/split, network failures)
          are caught and reported; the function returns None rather than raising.
    """
    # Parse "dataset_name@config_name" notation; a bare name maps to "default".
    if "@" in dataset_spec:
        dataset_name, config_name = dataset_spec.split("@", 1)
    else:
        dataset_name = dataset_spec
        config_name = "default"

    try:
        print("\nDownloading from HuggingFace Hub...")
        print(f"  Dataset: {dataset_name}")
        print(f"  Config: {config_name}")
        print(f"  Split: {split}")

        # Download the requested config/split from the Hub.
        dataset = load_dataset(
            dataset_name,
            name=config_name,
            split=split,
        )
        print(f"  Downloaded {len(dataset)} records")

        # Convert to pandas DataFrame
        df = dataset.to_pandas()
        print("✓ Successfully loaded as DataFrame")
        print(f"  Shape: {df.shape}")
        print(f"  Columns: {list(df.columns)}")
        return df
    except Exception as e:
        # Best-effort boundary: report and signal failure instead of raising.
        print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}")
        return None
|