akseljoonas (HF Staff) committed
Commit e8a3722 · Parent: 73be271

Unify dataset tools into single comprehensive hf_inspect_dataset tool

Replaces two separate tools (hf_datasets_list_splits, hf_datasets_download_rows)
with a single tool that provides everything needed for ML tasks in one call:

- Status check via /is-valid endpoint
- All configs and splits via /splits
- Schema with column types via /info
- Sample rows via /first-rows
- Parquet file structure via /parquet

Key improvements:
- All API calls made in parallel for speed
- Auto-detects config/split if not specified
- Supports private/gated datasets via HF_TOKEN
- Special handling for messages column (chat datasets):
  - Shows roles present (user, assistant, system, tool)
  - Shows message keys with presence indicators
  - Detects tool_calls and function_call presence
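
For a sense of the new surface area, a minimal, illustrative sketch of invoking the unified tool through its handler (the argument shape follows the tool spec added in this commit; the asyncio driver is not part of the change):

    import asyncio

    from agent.tools.dataset_tools import hf_inspect_dataset_handler

    async def main() -> None:
        # One call returns status, configs/splits, schema, sample rows, and parquet layout.
        report, ok = await hf_inspect_dataset_handler(
            {"dataset": "nyu-mll/glue", "config": "mrpc", "sample_rows": 5}
        )
        print(report if ok else f"Tool reported an error:\n{report}")

    asyncio.run(main())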

agent/core/tools.py CHANGED
@@ -14,10 +14,8 @@ from mcp.types import EmbeddedResource, ImageContent, TextContent
 
 from agent.config import MCPServerConfig
 from agent.tools.dataset_tools import (
-    DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC,
-    DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC,
-    hf_datasets_download_rows_handler,
-    hf_datasets_list_splits_handler,
+    HF_INSPECT_DATASET_TOOL_SPEC,
+    hf_inspect_dataset_handler,
 )
 from agent.tools.docs_tools import (
     EXPLORE_HF_DOCS_TOOL_SPEC,
@@ -263,18 +261,12 @@ def create_builtin_tools() -> list[ToolSpec]:
             parameters=HF_DOCS_FETCH_TOOL_SPEC["parameters"],
             handler=hf_docs_fetch_handler,
         ),
-        # Datasets server tools
+        # Dataset inspection tool (unified)
         ToolSpec(
-            name=DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC["name"],
-            description=DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC["description"],
-            parameters=DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC["parameters"],
-            handler=hf_datasets_list_splits_handler,
-        ),
-        ToolSpec(
-            name=DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC["name"],
-            description=DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC["description"],
-            parameters=DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC["parameters"],
-            handler=hf_datasets_download_rows_handler,
+            name=HF_INSPECT_DATASET_TOOL_SPEC["name"],
+            description=HF_INSPECT_DATASET_TOOL_SPEC["description"],
+            parameters=HF_INSPECT_DATASET_TOOL_SPEC["parameters"],
+            handler=hf_inspect_dataset_handler,
         ),
         # Planning and job management tools
         ToolSpec(
agent/tools/__init__.py CHANGED
@@ -19,10 +19,8 @@ from agent.tools.github_search_code import (
     github_search_code_handler,
 )
 from agent.tools.dataset_tools import (
-    DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC,
-    DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC,
-    hf_datasets_download_rows_handler,
-    hf_datasets_list_splits_handler,
+    HF_INSPECT_DATASET_TOOL_SPEC,
+    hf_inspect_dataset_handler,
 )
 from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
 from agent.tools.types import ToolResult
@@ -40,8 +38,6 @@ __all__ = [
     "github_read_file_handler",
     "GITHUB_SEARCH_CODE_TOOL_SPEC",
     "github_search_code_handler",
-    "DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC",
-    "hf_datasets_list_splits_handler",
-    "DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC",
-    "hf_datasets_download_rows_handler",
+    "HF_INSPECT_DATASET_TOOL_SPEC",
+    "hf_inspect_dataset_handler",
 ]
agent/tools/dataset_tools.py CHANGED
@@ -1,290 +1,397 @@
-"""
-Hugging Face Dataset Tool - Query datasets via the Datasets Server API
-
-Allows downloading rows and listing splits from Hugging Face datasets.
-"""
-
-from typing import Any, Dict
-
-import httpx
-
-from agent.tools.types import ToolResult
-
-
-def list_splits(dataset: str) -> ToolResult:
-    """
-    List all available splits for a dataset.
-
-    Args:
-        dataset: Dataset identifier (e.g., "facebook/research-plan-gen")
-
-    Returns:
-        ToolResult with split information
-    """
-    base_url = "https://datasets-server.huggingface.co"
-    url = f"{base_url}/splits"
-
-    params = {"dataset": dataset}
-
-    try:
-        response = httpx.get(url, params=params, timeout=15.0)
-        response.raise_for_status()
-        data = response.json()
-
-        splits = data.get("splits", [])
-        if not splits:
-            return {
-                "formatted": f"No splits found for dataset '{dataset}'",
-                "totalResults": 0,
-                "resultsShared": 0,
-                "isError": False,
-            }
-
-        # Format splits information
-        split_info = []
-        for split in splits:
-            split_name = split.get("split", "unknown")
-            num_rows = split.get("num_examples", "unknown")
-            split_info.append(f"- **{split_name}**: {num_rows} rows")
-
-        formatted = f"Available splits for dataset '{dataset}':\n\n" + "\n".join(split_info)
-
-        return {
-            "formatted": formatted,
-            "totalResults": len(splits),
-            "resultsShared": len(splits),
-            "isError": False,
-        }
-
-    except httpx.HTTPStatusError as e:
-        return {
-            "formatted": f"HTTP error {e.response.status_code}: {str(e)}",
-            "totalResults": 0,
-            "resultsShared": 0,
-            "isError": True,
-        }
-    except Exception as e:
-        return {
-            "formatted": f"Failed to list splits: {str(e)}",
-            "totalResults": 0,
-            "resultsShared": 0,
-            "isError": True,
-        }
-
-
-def download_rows(
-    dataset: str,
-    split: str,
-    config: str | None = None,
-    offset: int = 0,
-    length: int = 5,
-) -> ToolResult:
-    """
-    Download rows from a dataset split.
-
-    Args:
-        dataset: Dataset identifier (e.g., "facebook/research-plan-gen")
-        split: Split name (e.g., "train", "test", "validation")
-        config: Optional config name (for datasets with multiple configs)
-        offset: Starting row index (default: 0)
-        length: Number of rows to fetch (default: 5, max recommended: 1000)
-
-    Returns:
-        ToolResult with row data
-    """
-    base_url = "https://datasets-server.huggingface.co"
-    url = f"{base_url}/rows"
-
-    params = {
-        "dataset": dataset,
-        "split": split,
-        "offset": offset,
-        "length": length,
-    }
-
-    if config:
-        params["config"] = config
-
-    try:
-        response = httpx.get(url, params=params, timeout=30.0)
-        response.raise_for_status()
-        data = response.json()
-
-        rows = data.get("rows", [])
-        features = data.get("features", [])
-
-        if not rows:
-            return {
-                "formatted": f"No rows found for dataset '{dataset}', split '{split}' at offset {offset}",
-                "totalResults": 0,
-                "resultsShared": 0,
-                "isError": False,
-            }
-
-        # Format a summary of the rows
-        formatted_parts = [
-            f"Downloaded {len(rows)} rows from dataset '{dataset}'",
-            f"Split: {split}",
-            f"Offset: {offset}",
-        ]
-
-        if config:
-            formatted_parts.append(f"Config: {config}")
-
-        formatted_parts.append(f"\nFeatures: {', '.join([f.get('name', 'unknown') for f in features])}")
-        formatted_parts.append(f"\nTotal rows in response: {len(rows)}")
-
-        # Show first row as example
-        if rows:
-            first_row = rows[0].get("row", {})
-            formatted_parts.append(f"\nExample row (first row):")
-            for key, value in list(first_row.items())[:20]:  # Show up to 20 fields
-                value_str = str(value)
-                if len(value_str) > 200:
-                    value_str = value_str[:200] + "..."
-                formatted_parts.append(f"  - {key}: {value_str}")
-
-        formatted = "\n".join(formatted_parts)
-
-        return {
-            "formatted": formatted,
-            "totalResults": len(rows),
-            "resultsShared": len(rows),
-            "isError": False,
-        }
-
-    except httpx.HTTPStatusError as e:
-        return {
-            "formatted": f"HTTP error {e.response.status_code}: {str(e)}",
-            "totalResults": 0,
-            "resultsShared": 0,
-            "isError": True,
-        }
-    except Exception as e:
-        return {
-            "formatted": f"Failed to download rows: {str(e)}",
-            "totalResults": 0,
-            "resultsShared": 0,
-            "isError": True,
-        }
-
-
-# Tool specifications
-DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC = {
-    "name": "hf_datasets_list_splits",
-    "description": (
-        "List all available splits for a Hugging Face dataset.\n\n"
-        "Use this to discover what splits (train, test, validation, etc.) are available "
-        "for a dataset before downloading rows.\n\n"
-        "## When to use\n"
-        "- When you need to know what splits are available for a dataset\n"
-        "- Before downloading rows to identify the correct split name\n"
-        "- To check dataset structure and organization\n"
-        "- **CRITICAL: Always use this tool BEFORE training/fine-tuning models via hf_jobs** "
-        "to understand the dataset structure and ensure you're using the correct splits\n\n"
-        "## Example\n"
-        "{\n"
-        '  "dataset": "facebook/research-plan-gen"\n'
-        "}"
-    ),
-    "parameters": {
-        "type": "object",
-        "properties": {
-            "dataset": {
-                "type": "string",
-                "description": "Dataset identifier in format 'org/dataset-name' (e.g., 'facebook/research-plan-gen'). Required.",
-            },
-        },
-        "required": ["dataset"],
-    },
-}
-
-DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC = {
-    "name": "hf_datasets_download_rows",
-    "description": (
-        "Download rows from a Hugging Face dataset split via the Datasets Server API.\n\n"
-        "Fetches a specified number of rows starting from a given offset. Useful for "
-        "sampling data, inspecting dataset contents, or processing datasets in batches.\n\n"
-        "## When to use\n"
-        "- **CRITICAL: Always use this tool BEFORE training/fine-tuning models via hf_jobs** "
-        "to inspect and understand the dataset structure, data format, column names, and data types. "
-        "This helps avoid costly mistakes and ensures proper data preprocessing.\n"
-        "- When you need to inspect or sample data from a dataset\n"
-        "- To understand the data format and structure before writing training scripts\n"
-        "- To verify column names and data types match your expectations\n"
-        "- To download specific rows for analysis or processing\n"
-        "- To fetch data in batches (use offset and length parameters)\n\n"
-        "## When NOT to use\n"
-        "- For downloading entire large datasets (use huggingface_hub or datasets library instead)\n"
-        "- When you need to process all data (use streaming or local download)\n\n"
-        "## Examples\n"
-        "// Inspect first 5 rows to understand dataset structure (recommended before training)\n"
-        "{\n"
-        '  "dataset": "facebook/research-plan-gen",\n'
-        '  "split": "train",\n'
-        '  "config": "arxiv",\n'
-        '  "offset": 0,\n'
-        '  "length": 5\n'
-        "}\n\n"
-        "// Get next batch (rows 5-10)\n"
-        "{\n"
-        '  "dataset": "facebook/research-plan-gen",\n'
-        '  "split": "train",\n'
-        '  "offset": 5,\n'
-        '  "length": 5\n'
-        "}"
-    ),
-    "parameters": {
-        "type": "object",
-        "properties": {
-            "dataset": {
-                "type": "string",
-                "description": "Dataset identifier in format 'org/dataset-name' (e.g., 'facebook/research-plan-gen'). Required.",
-            },
-            "split": {
-                "type": "string",
-                "description": "Split name (e.g., 'train', 'test', 'validation'). Required.",
-            },
-            "config": {
-                "type": "string",
-                "description": "Config name (only needed for datasets with multiple configs). Optional.",
-            },
-            "offset": {
-                "type": "integer",
-                "description": "Starting row index (default: 0).",
-                "default": 0,
-            },
-            "length": {
-                "type": "integer",
-                "description": "Number of rows to fetch (default: 5, max recommended: 1000). Use small values (1-5) for quick inspection before training.",
-                "default": 5,
-            },
-        },
-        "required": ["dataset", "split"],
-    },
-}
-
-
-async def hf_datasets_list_splits_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
-    """Handler for listing dataset splits"""
-    try:
-        result = list_splits(dataset=arguments["dataset"])
-        return result["formatted"], not result.get("isError", False)
-    except Exception as e:
-        return f"Error: {str(e)}", False
-
-
-async def hf_datasets_download_rows_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
-    """Handler for downloading dataset rows"""
-    try:
-        result = download_rows(
-            dataset=arguments["dataset"],
-            split=arguments["split"],
-            config=arguments.get("config"),
-            offset=arguments.get("offset", 0),
-            length=arguments.get("length", 5),
-        )
-        return result["formatted"], not result.get("isError", False)
-    except Exception as e:
-        return f"Error: {str(e)}", False
+"""
+Dataset Inspection Tool - Comprehensive dataset analysis in one call
+
+Combines /is-valid, /splits, /info, /first-rows, and /parquet endpoints
+to provide everything needed for ML tasks in a single tool call.
+"""
+
+import asyncio
+import os
+from typing import Any
+
+import httpx
+
+from agent.tools.types import ToolResult
+
+BASE_URL = "https://datasets-server.huggingface.co"
+
+
+def _get_headers() -> dict:
+    """Get auth headers for private/gated datasets"""
+    token = os.environ.get("HF_TOKEN")
+    if token:
+        return {"Authorization": f"Bearer {token}"}
+    return {}
+
+
+async def inspect_dataset(
+    dataset: str,
+    config: str | None = None,
+    split: str | None = None,
+    sample_rows: int = 3,
+) -> ToolResult:
+    """
+    Get comprehensive dataset info in one call.
+    All API calls made in parallel for speed.
+    """
+    headers = _get_headers()
+    output_parts = []
+    errors = []
+
+    async with httpx.AsyncClient(timeout=15, headers=headers) as client:
+        # Phase 1: Parallel calls for structure info (no dependencies)
+        is_valid_task = client.get(f"{BASE_URL}/is-valid", params={"dataset": dataset})
+        splits_task = client.get(f"{BASE_URL}/splits", params={"dataset": dataset})
+        parquet_task = client.get(f"{BASE_URL}/parquet", params={"dataset": dataset})
+
+        results = await asyncio.gather(
+            is_valid_task,
+            splits_task,
+            parquet_task,
+            return_exceptions=True,
+        )
+
+        # Process is-valid
+        if not isinstance(results[0], Exception):
+            try:
+                output_parts.append(_format_status(results[0].json()))
+            except Exception as e:
+                errors.append(f"is-valid: {e}")
+
+        # Process splits and auto-detect config/split
+        configs = []
+        if not isinstance(results[1], Exception):
+            try:
+                splits_data = results[1].json()
+                configs = _extract_configs(splits_data)
+                if not config:
+                    config = configs[0]["name"] if configs else "default"
+                if not split:
+                    split = configs[0]["splits"][0] if configs else "train"
+                output_parts.append(_format_structure(configs))
+            except Exception as e:
+                errors.append(f"splits: {e}")
+
+        if not config:
+            config = "default"
+        if not split:
+            split = "train"
+
+        # Process parquet (will be added at the end)
+        parquet_section = None
+        if not isinstance(results[2], Exception):
+            try:
+                parquet_section = _format_parquet_files(results[2].json())
+            except Exception:
+                pass  # Silently skip if no parquet
+
+        # Phase 2: Parallel calls for content (depend on config/split)
+        info_task = client.get(
+            f"{BASE_URL}/info", params={"dataset": dataset, "config": config}
+        )
+        rows_task = client.get(
+            f"{BASE_URL}/first-rows",
+            params={"dataset": dataset, "config": config, "split": split},
+            timeout=30,
+        )
+
+        content_results = await asyncio.gather(
+            info_task,
+            rows_task,
+            return_exceptions=True,
+        )
+
+        # Process info (schema)
+        if not isinstance(content_results[0], Exception):
+            try:
+                output_parts.append(_format_schema(content_results[0].json(), config))
+            except Exception as e:
+                errors.append(f"info: {e}")
+
+        # Process sample rows
+        if not isinstance(content_results[1], Exception):
+            try:
+                output_parts.append(
+                    _format_samples(
+                        content_results[1].json(), config, split, sample_rows
+                    )
+                )
+            except Exception as e:
+                errors.append(f"rows: {e}")
+
+    # Add parquet section at the end if available
+    if parquet_section:
+        output_parts.append(parquet_section)
+
+    # Combine output
+    formatted = f"# {dataset}\n\n" + "\n\n".join(output_parts)
+    if errors:
+        formatted += f"\n\n**Warnings:** {'; '.join(errors)}"
+
+    return {
+        "formatted": formatted,
+        "totalResults": 1,
+        "resultsShared": 1,
+        "isError": len(output_parts) == 0,
+    }
+
+
+def _format_status(data: dict) -> str:
+    """Format /is-valid response as status line"""
+    available = [
+        k
+        for k in ["viewer", "preview", "search", "filter", "statistics"]
+        if data.get(k)
+    ]
+    if available:
+        return f"## Status\n✓ Valid ({', '.join(available)})"
+    return "## Status\n✗ Dataset may have issues"
+
+
+def _extract_configs(splits_data: dict) -> list[dict]:
+    """Group splits by config"""
+    configs: dict[str, dict] = {}
+    for s in splits_data.get("splits", []):
+        cfg = s.get("config", "default")
+        if cfg not in configs:
+            configs[cfg] = {"name": cfg, "splits": []}
+        configs[cfg]["splits"].append(s.get("split"))
+    return list(configs.values())
+
+
+def _format_structure(configs: list) -> str:
+    """Format splits as markdown table"""
+    lines = ["## Structure", "| Config | Split |", "|--------|-------|"]
+    for cfg in configs:
+        for split_name in cfg["splits"]:
+            lines.append(f"| {cfg['name']} | {split_name} |")
+    return "\n".join(lines)
+
+
+def _format_schema(info: dict, config: str) -> str:
+    """Extract features and format as table"""
+    features = info.get("dataset_info", {}).get("features", {})
+    lines = [f"## Schema ({config})", "| Column | Type |", "|--------|------|"]
+    for col_name, col_info in features.items():
+        col_type = _get_type_str(col_info)
+        lines.append(f"| {col_name} | {col_type} |")
+    return "\n".join(lines)
+
+
+def _get_type_str(col_info: dict) -> str:
+    """Convert feature info to readable type string"""
+    dtype = col_info.get("dtype") or col_info.get("_type", "unknown")
+    if col_info.get("_type") == "ClassLabel":
+        names = col_info.get("names", [])
+        if names and len(names) <= 5:
+            return f"ClassLabel ({', '.join(f'{n}={i}' for i, n in enumerate(names))})"
+        return f"ClassLabel ({len(names)} classes)"
+    return str(dtype)
+
+
+def _format_samples(rows_data: dict, config: str, split: str, limit: int) -> str:
+    """Format sample rows, truncate long values"""
+    rows = rows_data.get("rows", [])[:limit]
+    lines = [f"## Sample Rows ({config}/{split})"]
+
+    messages_col_data = None
+
+    for i, row_wrapper in enumerate(rows, 1):
+        row = row_wrapper.get("row", {})
+        lines.append(f"**Row {i}:**")
+        for key, val in row.items():
+            # Check for messages column and capture first one for format analysis
+            if key.lower() == "messages" and messages_col_data is None:
+                messages_col_data = val
+
+            val_str = str(val)
+            if len(val_str) > 150:
+                val_str = val_str[:150] + "..."
+            lines.append(f"- {key}: {val_str}")
+
+    # If we found a messages column, add format analysis
+    if messages_col_data is not None:
+        messages_format = _format_messages_structure(messages_col_data)
+        if messages_format:
+            lines.append("")
+            lines.append(messages_format)
+
+    return "\n".join(lines)
+
+
+def _format_messages_structure(messages_data: Any) -> str | None:
+    """
+    Analyze and format the structure of a messages column.
+    Common in chat/instruction datasets.
+    """
+    import json
+
+    # Parse if string
+    if isinstance(messages_data, str):
+        try:
+            messages_data = json.loads(messages_data)
+        except json.JSONDecodeError:
+            return None
+
+    if not isinstance(messages_data, list) or not messages_data:
+        return None
+
+    lines = ["## Messages Column Format"]
+
+    # Analyze message structure
+    roles_seen = set()
+    has_tool_calls = False
+    has_tool_results = False
+    message_keys = set()
+
+    for msg in messages_data:
+        if not isinstance(msg, dict):
+            continue
+
+        message_keys.update(msg.keys())
+
+        role = msg.get("role", "")
+        if role:
+            roles_seen.add(role)
+
+        if "tool_calls" in msg or "function_call" in msg:
+            has_tool_calls = True
+        if role in ("tool", "function") or msg.get("tool_call_id"):
+            has_tool_results = True
+
+    # Format the analysis
+    lines.append(
+        f"**Roles:** {', '.join(sorted(roles_seen)) if roles_seen else 'unknown'}"
+    )
+
+    # Show common message keys with presence indicators
+    common_keys = [
+        "role",
+        "content",
+        "tool_calls",
+        "tool_call_id",
+        "name",
+        "function_call",
+    ]
+    key_status = []
+    for key in common_keys:
+        if key in message_keys:
+            key_status.append(f"{key} ✓")
+        else:
+            key_status.append(f"{key} ✗")
+    lines.append(f"**Message keys:** {', '.join(key_status)}")
+
+    if has_tool_calls:
+        lines.append("**Tool calls:** ✓ Present")
+    if has_tool_results:
+        lines.append("**Tool results:** ✓ Present")
+
+    # Show example message structure
+    if messages_data and isinstance(messages_data[0], dict):
+        lines.append("")
+        lines.append("**Example message structure:**")
+        example = messages_data[0]
+        for key, val in example.items():
+            if key == "content":
+                val_preview = (
+                    str(val)[:100] + "..." if len(str(val)) > 100 else str(val)
+                )
+                lines.append(f"  - {key}: {val_preview}")
+            elif key == "tool_calls" and isinstance(val, list) and val:
+                lines.append(f"  - {key}: [{len(val)} tool call(s)]")
+                # Show first tool call structure
+                if isinstance(val[0], dict):
+                    tc = val[0]
+                    lines.append(f"    - type: {tc.get('type', 'function')}")
+                    if "function" in tc:
+                        lines.append(
+                            f"    - function.name: {tc['function'].get('name', '?')}"
+                        )
+                        lines.append("    - function.arguments: <json string>")
+            else:
+                lines.append(f"  - {key}: {val}")
+
+    return "\n".join(lines)
+
+
+def _format_parquet_files(data: dict) -> str | None:
+    """Format parquet file info, return None if no files"""
+    files = data.get("parquet_files", [])
+    if not files:
+        return None
+
+    # Group by config/split
+    groups: dict[str, dict] = {}
+    for f in files:
+        key = f"{f.get('config', 'default')}/{f.get('split', 'train')}"
+        if key not in groups:
+            groups[key] = {"count": 0, "size": 0}
+        groups[key]["count"] += 1
+        groups[key]["size"] += f.get("size", 0)
+
+    lines = ["## Files (Parquet)"]
+    for key, info in groups.items():
+        size_mb = info["size"] / (1024 * 1024)
+        lines.append(f"- {key}: {info['count']} file(s) ({size_mb:.1f} MB)")
+    return "\n".join(lines)
+
+
+# Tool specification
+HF_INSPECT_DATASET_TOOL_SPEC = {
+    "name": "hf_inspect_dataset",
+    "description": (
+        "Inspect a Hugging Face dataset comprehensively in one call.\n\n"
+        "## What you get\n"
+        "- Status check (validates dataset works without errors)\n"
+        "- All configs and splits\n"
+        "- Column names and types (schema)\n"
+        "- Sample rows to understand data format\n"
+        "- Parquet file structure and sizes\n\n"
+        "## CRITICAL\n"
+        "**Always inspect datasets before writing training code** to understand:\n"
+        "- Column names for your dataloader\n"
+        "- Data types and format\n"
+        "- Available splits (train/test/validation)\n\n"
+        "Supports private/gated datasets when HF_TOKEN is set.\n\n"
+        "## Examples\n"
+        '{"dataset": "stanfordnlp/imdb"}\n'
+        '{"dataset": "nyu-mll/glue", "config": "mrpc", "sample_rows": 5}\n'
+    ),
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "dataset": {
+                "type": "string",
+                "description": "Dataset ID in 'org/name' format (e.g., 'stanfordnlp/imdb')",
+            },
+            "config": {
+                "type": "string",
+                "description": "Config/subset name. Auto-detected if not specified.",
+            },
+            "split": {
+                "type": "string",
+                "description": "Split for sample rows. Auto-detected if not specified.",
+            },
+            "sample_rows": {
+                "type": "integer",
+                "description": "Number of sample rows to show (default: 3, max: 10)",
+                "default": 3,
+            },
+        },
+        "required": ["dataset"],
+    },
+}
+
+
+async def hf_inspect_dataset_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
+    """Handler for agent tool router"""
+    try:
+        result = await inspect_dataset(
+            dataset=arguments["dataset"],
+            config=arguments.get("config"),
+            split=arguments.get("split"),
+            sample_rows=min(arguments.get("sample_rows", 3), 10),
+        )
+        return result["formatted"], not result.get("isError", False)
+    except Exception as e:
+        return f"Error inspecting dataset: {str(e)}", False
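
For context, the Datasets Server endpoints the tool fans out to can also be queried directly. A minimal sketch of one such query, assuming only httpx and a public dataset:

    import httpx

    BASE_URL = "https://datasets-server.huggingface.co"

    # /splits takes the dataset id as a query parameter; /info additionally takes
    # a config, and /first-rows takes config and split (mirroring inspect_dataset).
    resp = httpx.get(f"{BASE_URL}/splits", params={"dataset": "stanfordnlp/imdb"}, timeout=15)
    resp.raise_for_status()
    for entry in resp.json()["splits"]:
        print(entry.get("config"), entry.get("split"))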