Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Yoan Di Cosmo committed on
Commit ·
56a3625
1
Parent(s): ba44575
modified timeout and rows parameters
Browse files- agent/tools/dataset_tools.py +21 -14
agent/tools/dataset_tools.py
CHANGED
|
@@ -27,7 +27,7 @@ def list_splits(dataset: str) -> ToolResult:
|
|
| 27 |
params = {"dataset": dataset}
|
| 28 |
|
| 29 |
try:
|
| 30 |
-
response = httpx.get(url, params=params, timeout=
|
| 31 |
response.raise_for_status()
|
| 32 |
data = response.json()
|
| 33 |
|
|
@@ -77,7 +77,7 @@ def download_rows(
|
|
| 77 |
split: str,
|
| 78 |
config: str | None = None,
|
| 79 |
offset: int = 0,
|
| 80 |
-
length: int =
|
| 81 |
) -> ToolResult:
|
| 82 |
"""
|
| 83 |
Download rows from a dataset split.
|
|
@@ -87,7 +87,7 @@ def download_rows(
|
|
| 87 |
split: Split name (e.g., "train", "test", "validation")
|
| 88 |
config: Optional config name (for datasets with multiple configs)
|
| 89 |
offset: Starting row index (default: 0)
|
| 90 |
-
length: Number of rows to fetch (default:
|
| 91 |
|
| 92 |
Returns:
|
| 93 |
ToolResult with row data
|
|
@@ -106,7 +106,7 @@ def download_rows(
|
|
| 106 |
params["config"] = config
|
| 107 |
|
| 108 |
try:
|
| 109 |
-
response = httpx.get(url, params=params, timeout=
|
| 110 |
response.raise_for_status()
|
| 111 |
data = response.json()
|
| 112 |
|
|
@@ -138,7 +138,7 @@ def download_rows(
|
|
| 138 |
if rows:
|
| 139 |
first_row = rows[0].get("row", {})
|
| 140 |
formatted_parts.append(f"\nExample row (first row):")
|
| 141 |
-
for key, value in list(first_row.items())[:
|
| 142 |
value_str = str(value)
|
| 143 |
if len(value_str) > 200:
|
| 144 |
value_str = value_str[:200] + "..."
|
|
@@ -179,7 +179,9 @@ DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC = {
|
|
| 179 |
"## When to use\n"
|
| 180 |
"- When you need to know what splits are available for a dataset\n"
|
| 181 |
"- Before downloading rows to identify the correct split name\n"
|
| 182 |
-
"- To check dataset structure and organization\n
|
|
|
|
|
|
|
| 183 |
"## Example\n"
|
| 184 |
"{\n"
|
| 185 |
' "dataset": "facebook/research-plan-gen"\n'
|
|
@@ -204,27 +206,32 @@ DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC = {
|
|
| 204 |
"Fetches a specified number of rows starting from a given offset. Useful for "
|
| 205 |
"sampling data, inspecting dataset contents, or processing datasets in batches.\n\n"
|
| 206 |
"## When to use\n"
|
|
|
|
|
|
|
|
|
|
| 207 |
"- When you need to inspect or sample data from a dataset\n"
|
|
|
|
|
|
|
| 208 |
"- To download specific rows for analysis or processing\n"
|
| 209 |
"- To fetch data in batches (use offset and length parameters)\n\n"
|
| 210 |
"## When NOT to use\n"
|
| 211 |
"- For downloading entire large datasets (use huggingface_hub or datasets library instead)\n"
|
| 212 |
"- When you need to process all data (use streaming or local download)\n\n"
|
| 213 |
"## Examples\n"
|
| 214 |
-
"//
|
| 215 |
"{\n"
|
| 216 |
' "dataset": "facebook/research-plan-gen",\n'
|
| 217 |
' "split": "train",\n'
|
| 218 |
' "config": "arxiv",\n'
|
| 219 |
' "offset": 0,\n'
|
| 220 |
-
' "length":
|
| 221 |
"}\n\n"
|
| 222 |
-
"// Get next batch (rows
|
| 223 |
"{\n"
|
| 224 |
' "dataset": "facebook/research-plan-gen",\n'
|
| 225 |
' "split": "train",\n'
|
| 226 |
-
' "offset":
|
| 227 |
-
' "length":
|
| 228 |
"}"
|
| 229 |
),
|
| 230 |
"parameters": {
|
|
@@ -249,8 +256,8 @@ DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC = {
|
|
| 249 |
},
|
| 250 |
"length": {
|
| 251 |
"type": "integer",
|
| 252 |
-
"description": "Number of rows to fetch (default:
|
| 253 |
-
"default":
|
| 254 |
},
|
| 255 |
},
|
| 256 |
"required": ["dataset", "split"],
|
|
@@ -275,7 +282,7 @@ async def hf_datasets_download_rows_handler(arguments: Dict[str, Any]) -> tuple[
|
|
| 275 |
split=arguments["split"],
|
| 276 |
config=arguments.get("config"),
|
| 277 |
offset=arguments.get("offset", 0),
|
| 278 |
-
length=arguments.get("length",
|
| 279 |
)
|
| 280 |
return result["formatted"], not result.get("isError", False)
|
| 281 |
except Exception as e:
|
|
|
|
| 27 |
params = {"dataset": dataset}
|
| 28 |
|
| 29 |
try:
|
| 30 |
+
response = httpx.get(url, params=params, timeout=15.0)
|
| 31 |
response.raise_for_status()
|
| 32 |
data = response.json()
|
| 33 |
|
|
|
|
| 77 |
split: str,
|
| 78 |
config: str | None = None,
|
| 79 |
offset: int = 0,
|
| 80 |
+
length: int = 5,
|
| 81 |
) -> ToolResult:
|
| 82 |
"""
|
| 83 |
Download rows from a dataset split.
|
|
|
|
| 87 |
split: Split name (e.g., "train", "test", "validation")
|
| 88 |
config: Optional config name (for datasets with multiple configs)
|
| 89 |
offset: Starting row index (default: 0)
|
| 90 |
+
length: Number of rows to fetch (default: 5, max recommended: 1000)
|
| 91 |
|
| 92 |
Returns:
|
| 93 |
ToolResult with row data
|
|
|
|
| 106 |
params["config"] = config
|
| 107 |
|
| 108 |
try:
|
| 109 |
+
response = httpx.get(url, params=params, timeout=30.0)
|
| 110 |
response.raise_for_status()
|
| 111 |
data = response.json()
|
| 112 |
|
|
|
|
| 138 |
if rows:
|
| 139 |
first_row = rows[0].get("row", {})
|
| 140 |
formatted_parts.append(f"\nExample row (first row):")
|
| 141 |
+
for key, value in list(first_row.items())[:20]: # Show up to 20 fields
|
| 142 |
value_str = str(value)
|
| 143 |
if len(value_str) > 200:
|
| 144 |
value_str = value_str[:200] + "..."
|
|
|
|
| 179 |
"## When to use\n"
|
| 180 |
"- When you need to know what splits are available for a dataset\n"
|
| 181 |
"- Before downloading rows to identify the correct split name\n"
|
| 182 |
+
"- To check dataset structure and organization\n"
|
| 183 |
+
"- **CRITICAL: Always use this tool BEFORE training/fine-tuning models via hf_jobs** "
|
| 184 |
+
"to understand the dataset structure and ensure you're using the correct splits\n\n"
|
| 185 |
"## Example\n"
|
| 186 |
"{\n"
|
| 187 |
' "dataset": "facebook/research-plan-gen"\n'
|
|
|
|
| 206 |
"Fetches a specified number of rows starting from a given offset. Useful for "
|
| 207 |
"sampling data, inspecting dataset contents, or processing datasets in batches.\n\n"
|
| 208 |
"## When to use\n"
|
| 209 |
+
"- **CRITICAL: Always use this tool BEFORE training/fine-tuning models via hf_jobs** "
|
| 210 |
+
"to inspect and understand the dataset structure, data format, column names, and data types. "
|
| 211 |
+
"This helps avoid costly mistakes and ensures proper data preprocessing.\n"
|
| 212 |
"- When you need to inspect or sample data from a dataset\n"
|
| 213 |
+
"- To understand the data format and structure before writing training scripts\n"
|
| 214 |
+
"- To verify column names and data types match your expectations\n"
|
| 215 |
"- To download specific rows for analysis or processing\n"
|
| 216 |
"- To fetch data in batches (use offset and length parameters)\n\n"
|
| 217 |
"## When NOT to use\n"
|
| 218 |
"- For downloading entire large datasets (use huggingface_hub or datasets library instead)\n"
|
| 219 |
"- When you need to process all data (use streaming or local download)\n\n"
|
| 220 |
"## Examples\n"
|
| 221 |
+
"// Inspect first 5 rows to understand dataset structure (recommended before training)\n"
|
| 222 |
"{\n"
|
| 223 |
' "dataset": "facebook/research-plan-gen",\n'
|
| 224 |
' "split": "train",\n'
|
| 225 |
' "config": "arxiv",\n'
|
| 226 |
' "offset": 0,\n'
|
| 227 |
+
' "length": 5\n'
|
| 228 |
"}\n\n"
|
| 229 |
+
"// Get next batch (rows 5-10)\n"
|
| 230 |
"{\n"
|
| 231 |
' "dataset": "facebook/research-plan-gen",\n'
|
| 232 |
' "split": "train",\n'
|
| 233 |
+
' "offset": 5,\n'
|
| 234 |
+
' "length": 5\n'
|
| 235 |
"}"
|
| 236 |
),
|
| 237 |
"parameters": {
|
|
|
|
| 256 |
},
|
| 257 |
"length": {
|
| 258 |
"type": "integer",
|
| 259 |
+
"description": "Number of rows to fetch (default: 5, max recommended: 1000). Use small values (1-5) for quick inspection before training.",
|
| 260 |
+
"default": 5,
|
| 261 |
},
|
| 262 |
},
|
| 263 |
"required": ["dataset", "split"],
|
|
|
|
| 282 |
split=arguments["split"],
|
| 283 |
config=arguments.get("config"),
|
| 284 |
offset=arguments.get("offset", 0),
|
| 285 |
+
length=arguments.get("length", 5),
|
| 286 |
)
|
| 287 |
return result["formatted"], not result.get("isError", False)
|
| 288 |
except Exception as e:
|