Yoan Di Cosmo committed on
Commit 73be271 · 1 Parent(s): 5fd5de2

modified timeout and rows parameters

Files changed (1)
  1. agent/tools/dataset_tools.py +21 -14
agent/tools/dataset_tools.py CHANGED
@@ -27,7 +27,7 @@ def list_splits(dataset: str) -> ToolResult:
     params = {"dataset": dataset}
 
     try:
-        response = httpx.get(url, params=params, timeout=30.0)
+        response = httpx.get(url, params=params, timeout=15.0)
         response.raise_for_status()
         data = response.json()
 
@@ -77,7 +77,7 @@ def download_rows(
     split: str,
     config: str | None = None,
     offset: int = 0,
-    length: int = 100,
+    length: int = 5,
 ) -> ToolResult:
     """
     Download rows from a dataset split.
@@ -87,7 +87,7 @@ def download_rows(
         split: Split name (e.g., "train", "test", "validation")
         config: Optional config name (for datasets with multiple configs)
         offset: Starting row index (default: 0)
-        length: Number of rows to fetch (default: 100, max recommended: 1000)
+        length: Number of rows to fetch (default: 5, max recommended: 1000)
 
     Returns:
         ToolResult with row data
@@ -106,7 +106,7 @@ def download_rows(
         params["config"] = config
 
     try:
-        response = httpx.get(url, params=params, timeout=60.0)
+        response = httpx.get(url, params=params, timeout=30.0)
         response.raise_for_status()
         data = response.json()
 
@@ -138,7 +138,7 @@ def download_rows(
     if rows:
         first_row = rows[0].get("row", {})
         formatted_parts.append(f"\nExample row (first row):")
-        for key, value in list(first_row.items())[:5]:  # Show first 5 fields
+        for key, value in list(first_row.items())[:20]:  # Show up to 20 fields
             value_str = str(value)
             if len(value_str) > 200:
                 value_str = value_str[:200] + "..."
@@ -179,7 +179,9 @@ DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC = {
         "## When to use\n"
         "- When you need to know what splits are available for a dataset\n"
         "- Before downloading rows to identify the correct split name\n"
-        "- To check dataset structure and organization\n\n"
+        "- To check dataset structure and organization\n"
+        "- **CRITICAL: Always use this tool BEFORE training/fine-tuning models via hf_jobs** "
+        "to understand the dataset structure and ensure you're using the correct splits\n\n"
         "## Example\n"
         "{\n"
         ' "dataset": "facebook/research-plan-gen"\n'
@@ -204,27 +206,32 @@ DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC = {
         "Fetches a specified number of rows starting from a given offset. Useful for "
         "sampling data, inspecting dataset contents, or processing datasets in batches.\n\n"
         "## When to use\n"
+        "- **CRITICAL: Always use this tool BEFORE training/fine-tuning models via hf_jobs** "
+        "to inspect and understand the dataset structure, data format, column names, and data types. "
+        "This helps avoid costly mistakes and ensures proper data preprocessing.\n"
         "- When you need to inspect or sample data from a dataset\n"
+        "- To understand the data format and structure before writing training scripts\n"
+        "- To verify column names and data types match your expectations\n"
         "- To download specific rows for analysis or processing\n"
        "- To fetch data in batches (use offset and length parameters)\n\n"
         "## When NOT to use\n"
         "- For downloading entire large datasets (use huggingface_hub or datasets library instead)\n"
         "- When you need to process all data (use streaming or local download)\n\n"
         "## Examples\n"
-        "// Get first 100 rows from training split\n"
+        "// Inspect first 5 rows to understand dataset structure (recommended before training)\n"
         "{\n"
         ' "dataset": "facebook/research-plan-gen",\n'
         ' "split": "train",\n'
         ' "config": "arxiv",\n'
         ' "offset": 0,\n'
-        ' "length": 100\n'
+        ' "length": 5\n'
        "}\n\n"
-        "// Get next batch (rows 100-200)\n"
+        "// Get next batch (rows 5-10)\n"
         "{\n"
         ' "dataset": "facebook/research-plan-gen",\n'
         ' "split": "train",\n'
-        ' "offset": 100,\n'
-        ' "length": 100\n'
+        ' "offset": 5,\n'
+        ' "length": 5\n'
         "}"
     ),
     "parameters": {
@@ -249,8 +256,8 @@ DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC = {
         },
         "length": {
             "type": "integer",
-            "description": "Number of rows to fetch (default: 100, max recommended: 1000).",
-            "default": 100,
+            "description": "Number of rows to fetch (default: 5, max recommended: 1000). Use small values (1-5) for quick inspection before training.",
+            "default": 5,
         },
     },
     "required": ["dataset", "split"],
@@ -275,7 +282,7 @@ async def hf_datasets_download_rows_handler(arguments: Dict[str, Any]) -> tuple[
             split=arguments["split"],
             config=arguments.get("config"),
             offset=arguments.get("offset", 0),
-            length=arguments.get("length", 100),
+            length=arguments.get("length", 5),
         )
         return result["formatted"], not result.get("isError", False)
     except Exception as e:
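
For reference, below is a minimal standalone sketch of the request that the updated download_rows defaults produce, assuming the tool wraps the public Hugging Face datasets-server rows API via httpx; the ROWS_URL constant and the dataset, config, and split values are illustrative assumptions, while the length=5 default, the 30-second timeout, and the 20-field display limit come from this commit.

import httpx

# Assumed endpoint: the public datasets-server rows API that this module appears to wrap.
ROWS_URL = "https://datasets-server.huggingface.co/rows"

params = {
    "dataset": "facebook/research-plan-gen",  # example dataset from the tool spec above
    "config": "arxiv",
    "split": "train",
    "offset": 0,
    "length": 5,  # new default: fetch a small sample for quick inspection
}

# Row downloads now use a 30-second timeout (previously 60 seconds).
response = httpx.get(ROWS_URL, params=params, timeout=30.0)
response.raise_for_status()
data = response.json()

# The datasets-server wraps each record under a "row" key; show up to
# 20 fields per row, truncated to 200 characters, as the updated tool does.
for item in data.get("rows", []):
    row = item.get("row", {})
    for key, value in list(row.items())[:20]:
        value_str = str(value)
        if len(value_str) > 200:
            value_str = value_str[:200] + "..."
        print(f"{key}: {value_str}")
    print("-" * 40)

The smaller default keeps responses compact for quickly checking column names and data types before launching a training job; larger batches can still be requested explicitly through the length parameter, up to the recommended maximum of 1000.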