File size: 7,197 Bytes
035d186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
HuggingFace Dataset I/O Utilities

Reusable functions for uploading and downloading JSONL data to/from HuggingFace Hub.
Supports the dataset_name@config_name notation for managing multiple configurations.
"""

from typing import List, Optional

import pandas as pd
from datasets import Dataset, load_dataset


def list_dataset_configs(dataset_name: str) -> Optional[List[str]]:
    """
    Retrieve the names of every config available for a dataset on the Hub.

    Args:
        dataset_name: Name of the dataset (e.g., "username/my-dataset")

    Returns:
        List of config names, or None if the lookup fails for any reason
        (missing dataset, network error, etc.).

    Example:
        >>> configs = list_dataset_configs("username/hf-agent-benchmark")
        >>> print(configs)
        ['default', 'rubrics', 'evaluations']
    """
    try:
        # Imported lazily so the module loads even on older `datasets`
        # versions that may lack this helper.
        from datasets import get_dataset_config_names

        return get_dataset_config_names(dataset_name)
    except Exception as exc:
        # Best-effort: report the failure and signal it via None.
        print(f"✗ Failed to list configs: {type(exc).__name__}: {str(exc)}")
        return None


def df_to_hub(
    df: pd.DataFrame,
    dataset_spec: str,
    split: str = "train",
    private: bool = False,
) -> bool:
    """
    Upload a pandas DataFrame directly to HuggingFace Hub as a dataset.

    This function converts a pandas DataFrame to a HuggingFace Dataset and uploads
    it to the Hub. This is useful for uploading data directly without creating an
    intermediate JSONL file.

    Args:
        df: pandas DataFrame to upload. All column types should be serializable.
            Example DataFrame:
            ```
            | question | solution | rubric |
            |----------|----------|--------|
            | "How..." | "You..." | {...}  |
            ```

        dataset_spec: Dataset specification in the format "dataset_name" or
            "dataset_name@config_name". Examples:
            - "username/my-dataset" (uses "default" config)
            - "username/my-dataset@rubrics" (uses "rubrics" config)
            - "username/my-dataset@evaluations" (uses "evaluations" config)

        split: The dataset split name. Defaults to "train". Common values:
            - "train": Training or main data
            - "validation": Validation data
            - "test": Test data

        private: Whether to create a private dataset. Defaults to False (public).

    Returns:
        bool: True if upload succeeded, False otherwise

    Raises:
        ValueError: If DataFrame is empty

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     "question": ["How to train?", "What is fine-tuning?"],
        ...     "solution": ["Use trainer...", "Fine-tuning is..."],
        ...     "rubric": ['[{"title": "...", ...}]', '[{"title": "...", ...}]']
        ... })
        >>> df_to_hub(df, "username/dataset@rubrics")

    Notes:
        - Requires authentication via `huggingface-cli login` or HF_TOKEN env var
        - DataFrame columns with complex objects should be serialized first (e.g., to JSON strings)
        - If the dataset doesn't exist, it will be created automatically
        - Empty DataFrames will raise ValueError to prevent uploading invalid data
        - Hub errors during upload are caught and reported; the function then
          returns False rather than raising.
    """
    # Validate before doing any work: an empty upload is always a caller bug.
    if df.empty:
        raise ValueError("DataFrame is empty")

    # Parse dataset specification ("name" or "name@config"); only split on the
    # first "@" so config names containing "@" are preserved.
    if "@" in dataset_spec:
        dataset_name, config_name = dataset_spec.split("@", 1)
    else:
        dataset_name = dataset_spec
        config_name = "default"

    try:
        print("\nUploading DataFrame to HuggingFace Hub...")
        print(f"  Dataset: {dataset_name}")
        print(f"  Config: {config_name}")
        print(f"  Split: {split}")
        print(f"  Rows: {len(df)}")
        print(f"  Columns: {list(df.columns)}")

        # Convert DataFrame to HuggingFace Dataset
        dataset = Dataset.from_pandas(df)

        # Upload to HuggingFace Hub
        dataset.push_to_hub(
            dataset_name,
            config_name=config_name,
            split=split,
            private=private,
        )

        print(
            f"✓ Successfully uploaded to {dataset_name}@{config_name} (split: {split})"
        )
        return True

    except Exception as e:
        # Best-effort reporting: surface the error class and message, then
        # signal failure via the boolean return.
        print(f"✗ Failed to upload to HuggingFace: {type(e).__name__}: {str(e)}")
        return False


def hub_to_df(
    dataset_spec: str,
    split: str = "train",
) -> Optional[pd.DataFrame]:
    """
    Download a dataset from HuggingFace Hub as a pandas DataFrame.

    This function downloads a dataset from the HuggingFace Hub and returns it as a
    pandas DataFrame for immediate use in Python.

    Args:
        dataset_spec: Dataset specification in the format "dataset_name" or
            "dataset_name@config_name". Examples:
            - "username/my-dataset" (uses "default" config)
            - "username/my-dataset@rubrics" (uses "rubrics" config)
            - "username/my-dataset@evaluations" (uses "evaluations" config)

        split: The dataset split to download. Defaults to "train". Common values:
            - "train": Training or main data
            - "validation": Validation data
            - "test": Test data

    Returns:
        pd.DataFrame: Downloaded data as pandas DataFrame, or None if the
        download fails (errors are caught, reported, and not re-raised).

    Example:
        >>> # Download rubrics from specific config
        >>> df = hub_to_df("username/hf-agent-benchmark@rubrics")
        >>> print(df.head())
        >>> print(f"Shape: {df.shape}")

        >>> # Download evaluation results
        >>> results_df = hub_to_df(
        ...     "username/hf-agent-benchmark@evaluations",
        ...     split="test"
        ... )

    Notes:
        - Requires authentication for private datasets via `huggingface-cli login`
        - Downloaded data will be in the same format as uploaded (preserves structure)
        - Large datasets may take time to download and consume significant memory
        - For very large datasets, consider loading with `streaming=True` instead
    """
    # Parse dataset specification ("name" or "name@config"); only split on the
    # first "@" so config names containing "@" are preserved. Mirrors df_to_hub.
    if "@" in dataset_spec:
        dataset_name, config_name = dataset_spec.split("@", 1)
    else:
        dataset_name = dataset_spec
        config_name = "default"

    try:
        print("\nDownloading from HuggingFace Hub...")
        print(f"  Dataset: {dataset_name}")
        print(f"  Config: {config_name}")
        print(f"  Split: {split}")

        # Download dataset from HuggingFace Hub
        dataset = load_dataset(
            dataset_name,
            name=config_name,
            split=split,
        )

        print(f"  Downloaded {len(dataset)} records")

        # Convert to pandas DataFrame
        df = dataset.to_pandas()

        print("✓ Successfully loaded as DataFrame")
        print(f"  Shape: {df.shape}")
        print(f"  Columns: {list(df.columns)}")
        return df

    except Exception as e:
        # Best-effort reporting: surface the error class and message, then
        # signal failure via the None return.
        print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}")
        return None