Commit e8a3722 · Parent(s): 73be271
Unify dataset tools into single comprehensive hf_inspect_dataset tool
Replaces two separate tools (hf_datasets_list_splits, hf_datasets_download_rows)
with a single tool that provides everything needed for ML tasks in one call:
- Status check via /is-valid endpoint
- All configs and splits via /splits
- Schema with column types via /info
- Sample rows via /first-rows
- Parquet file structure via /parquet
Key improvements:
- All API calls made in parallel for speed
- Auto-detects config/split if not specified
- Supports private/gated datasets via HF_TOKEN
- Special handling for messages column (chat datasets):
  - Shows roles present (user, assistant, system, tool)
  - Shows message keys with presence indicators
  - Detects tool_calls and function_call presence
- agent/core/tools.py +7 -15
- agent/tools/__init__.py +4 -8
- agent/tools/dataset_tools.py +350 -243
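
A minimal smoke test of the unified tool, sketched the way the agent's tool router would invoke it (the dataset ID comes from the tool's own examples; assumes the Space's `agent` package is importable, network access to datasets-server.huggingface.co, and HF_TOKEN set for private/gated datasets):

```python
# Hypothetical usage sketch; handler name and return shape match this commit.
import asyncio

from agent.tools.dataset_tools import hf_inspect_dataset_handler


async def main() -> None:
    formatted, ok = await hf_inspect_dataset_handler(
        {"dataset": "stanfordnlp/imdb", "sample_rows": 3}
    )
    print("success:", ok)
    # Markdown report with sections: Status, Structure, Schema, Sample Rows, Files
    print(formatted)


asyncio.run(main())
```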
agent/core/tools.py
CHANGED

@@ -14,10 +14,8 @@ from mcp.types import EmbeddedResource, ImageContent, TextContent
 
 from agent.config import MCPServerConfig
 from agent.tools.dataset_tools import (
-    DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC,
-    DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC,
-    hf_datasets_download_rows_handler,
-    hf_datasets_list_splits_handler,
+    HF_INSPECT_DATASET_TOOL_SPEC,
+    hf_inspect_dataset_handler,
 )
 from agent.tools.docs_tools import (
     EXPLORE_HF_DOCS_TOOL_SPEC,

@@ -263,18 +261,12 @@ def create_builtin_tools() -> list[ToolSpec]:
             parameters=HF_DOCS_FETCH_TOOL_SPEC["parameters"],
             handler=hf_docs_fetch_handler,
         ),
-        #
+        # Dataset inspection tool (unified)
         ToolSpec(
-            name=DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC["name"],
-            description=DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC["description"],
-            parameters=DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC["parameters"],
-            handler=hf_datasets_list_splits_handler,
-        ),
-        ToolSpec(
-            name=DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC["name"],
-            description=DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC["description"],
-            parameters=DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC["parameters"],
-            handler=hf_datasets_download_rows_handler,
+            name=HF_INSPECT_DATASET_TOOL_SPEC["name"],
+            description=HF_INSPECT_DATASET_TOOL_SPEC["description"],
+            parameters=HF_INSPECT_DATASET_TOOL_SPEC["parameters"],
+            handler=hf_inspect_dataset_handler,
         ),
         # Planning and job management tools
         ToolSpec(
agent/tools/__init__.py
CHANGED

@@ -19,10 +19,8 @@ from agent.tools.github_search_code import (
     github_search_code_handler,
 )
 from agent.tools.dataset_tools import (
-    DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC,
-    DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC,
-    hf_datasets_download_rows_handler,
-    hf_datasets_list_splits_handler,
+    HF_INSPECT_DATASET_TOOL_SPEC,
+    hf_inspect_dataset_handler,
 )
 from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
 from agent.tools.types import ToolResult

@@ -40,8 +38,6 @@ __all__ = [
     "github_read_file_handler",
     "GITHUB_SEARCH_CODE_TOOL_SPEC",
     "github_search_code_handler",
-    "DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC",
-    "hf_datasets_list_splits_handler",
-    "DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC",
-    "hf_datasets_download_rows_handler",
+    "HF_INSPECT_DATASET_TOOL_SPEC",
+    "hf_inspect_dataset_handler",
 ]
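
After this change, downstream code imports the single tool pair from the package root; a quick sanity check might look like this (hypothetical usage, not part of the diff):

```python
# The package now re-exports exactly one dataset tool spec and handler.
from agent.tools import HF_INSPECT_DATASET_TOOL_SPEC, hf_inspect_dataset_handler

print(HF_INSPECT_DATASET_TOOL_SPEC["name"])  # -> "hf_inspect_dataset"
```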
agent/tools/dataset_tools.py
CHANGED

@@ -1,290 +1,397 @@
-"""
-...
-"""
-
-from typing import Any, Dict
-
-import httpx
-
-from agent.tools.types import ToolResult
-
-
-def list_splits(dataset: str) -> ToolResult:
-    """
-    ...
-
-    Returns:
-        ToolResult with split information
-    """
-    base_url = "https://datasets-server.huggingface.co"
-    url = f"{base_url}/splits"
-
-    params = {"dataset": dataset}
-
-    try:
-        response = httpx.get(url, params=params, timeout=15.0)
-        response.raise_for_status()
-        data = response.json()
-
-        splits = data.get("splits", [])
-        if not splits:
-            return {
-                "formatted": f"No splits found for dataset '{dataset}'",
-                "totalResults": 0,
-                "resultsShared": 0,
-                "isError": False,
-            }
-
-        # Format splits information
-        split_info = []
-        for split in splits:
-            split_name = split.get("split", "unknown")
-            num_rows = split.get("num_examples", "unknown")
-            split_info.append(f"- **{split_name}**: {num_rows} rows")
-
-        formatted = f"Available splits for dataset '{dataset}':\n\n" + "\n".join(split_info)
-
-        return {
-            "formatted": formatted,
-            "totalResults": len(splits),
-            "resultsShared": len(splits),
-            "isError": False,
-        }
-
-    except httpx.HTTPStatusError as e:
-        return {
-            "formatted": f"HTTP error {e.response.status_code}: {str(e)}",
-            "totalResults": 0,
-            "resultsShared": 0,
-            "isError": True,
-        }
-    except Exception as e:
-        return {
-            "formatted": f"Failed to list splits: {str(e)}",
-            "totalResults": 0,
-            "resultsShared": 0,
-            "isError": True,
-        }
-
-
-def download_rows(
-    dataset: str,
-    split: str,
-    config: str | None = None,
-    offset: int = 0,
-    length: int = 5,
-) -> ToolResult:
-    """
-    ...
-
-    Args:
-        dataset: Dataset identifier (e.g., "facebook/research-plan-gen")
-        split: Split name (e.g., "train", "test", "validation")
-        config: Optional config name (for datasets with multiple configs)
-        offset: Starting row index (default: 0)
-        length: Number of rows to fetch (default: 5, max recommended: 1000)
-
-    Returns:
-        ToolResult with row data
-    """
-    base_url = "https://datasets-server.huggingface.co"
-    url = f"{base_url}/rows"
-
-    params = {
-        "dataset": dataset,
-        "split": split,
-        "offset": offset,
-        "length": length,
-    }
-
-    if config:
-        params["config"] = config
-
-    try:
-        response = httpx.get(url, params=params, timeout=30.0)
-        response.raise_for_status()
-        data = response.json()
-
-        rows = data.get("rows", [])
-        features = data.get("features", [])
-
-        if not rows:
-            return {
-                "formatted": f"No rows found for dataset '{dataset}', split '{split}' at offset {offset}",
-                "totalResults": 0,
-                "resultsShared": 0,
-                "isError": False,
-            }
-
-        # Format a summary of the rows
-        formatted_parts = [
-            f"Downloaded {len(rows)} rows from dataset '{dataset}'",
-            f"Split: {split}",
-            f"Offset: {offset}",
-        ]
-
-        if config:
-            formatted_parts.append(f"Config: {config}")
-
-        formatted_parts.append(f"\nFeatures: {', '.join([f.get('name', 'unknown') for f in features])}")
-        formatted_parts.append(f"\nTotal rows in response: {len(rows)}")
-
-        # Show first row as example
-        if rows:
-            first_row = rows[0].get("row", {})
-            formatted_parts.append(f"\nExample row (first row):")
-            for key, value in list(first_row.items())[:20]:  # Show up to 20 fields
-                value_str = str(value)
-                if len(value_str) > 200:
-                    value_str = value_str[:200] + "..."
-                formatted_parts.append(f"  - {key}: {value_str}")
-
-        formatted = "\n".join(formatted_parts)
-
-        return {
-            "formatted": formatted,
-            "totalResults": len(rows),
-            "resultsShared": len(rows),
-            "isError": False,
-        }
-
-    except httpx.HTTPStatusError as e:
-        return {
-            "formatted": f"HTTP error {e.response.status_code}: {str(e)}",
-            "totalResults": 0,
-            "resultsShared": 0,
-            "isError": True,
-        }
-    except Exception as e:
-        return {
-            "formatted": f"Failed to download rows: {str(e)}",
-            "totalResults": 0,
-            "resultsShared": 0,
-            "isError": True,
-        }
-
-
-DATASETS_SERVER_LIST_SPLITS_TOOL_SPEC = {
-    "name": "hf_datasets_list_splits",
-    # ...
-}
-
-
-DATASETS_SERVER_DOWNLOAD_ROWS_TOOL_SPEC = {
-    "name": "hf_datasets_download_rows",
-    "description": (
-        # ...
-        "- For downloading entire large datasets (use huggingface_hub or datasets library instead)\n"
-        "- When you need to process all data (use streaming or local download)\n\n"
-        "## Examples\n"
-        "{\n"
-        '  "dataset": "facebook/research-plan-gen",\n'
-        '  "split": "train",\n'
-        '  "config": "arxiv",\n'
-        '  "offset": 0,\n'
-        '  "length": 5\n'
-        "}\n\n"
-        "// Get next batch (rows 5-10)\n"
-        "{\n"
-        '  "dataset": "facebook/research-plan-gen",\n'
-        '  "split": "train",\n'
-        '  "offset": 5,\n'
-        '  "length": 5\n'
-        "}"
-    ),
-    "parameters": {
-        "type": "object",
-        "properties": {
-            "dataset": {
-                "type": "string",
-                "description": "Dataset ...",
-            },
-            "split": {
-                "type": "string",
-                "description": "Split name (e.g., 'train', 'test', 'validation'). Required.",
-            },
-            "config": {
-                "type": "string",
-                "description": "Config name ...",
-            },
-            "offset": {
-                "type": "integer",
-                "description": "...",
-                "default": 0,
-            },
-            "length": {
-                "type": "integer",
-                "description": "Number of rows to ...",
-                "default": 5,
-            },
-        },
-        "required": ["dataset", "split"],
-    },
-}
-
-
-async def hf_datasets_list_splits_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
-    """Handler for ..."""
-    try:
-        result = list_splits(dataset=arguments["dataset"])
-        return result["formatted"], not result.get("isError", False)
-    except Exception as e:
-        return f"Error: {str(e)}", False
-
-
-async def hf_datasets_download_rows_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
-    """Handler for downloading dataset rows"""
-    try:
-        result = download_rows(
-            dataset=arguments["dataset"],
-            split=arguments["split"],
-            config=arguments.get("config"),
-            offset=arguments.get("offset", 0),
-            length=arguments.get("length", 5),
-        )
-        return result["formatted"], not result.get("isError", False)
-    except Exception as e:
-        return f"Error: {str(e)}", False
+"""
+Dataset Inspection Tool - Comprehensive dataset analysis in one call
+
+Combines /is-valid, /splits, /info, /first-rows, and /parquet endpoints
+to provide everything needed for ML tasks in a single tool call.
+"""
+
+import asyncio
+import os
+from typing import Any
+
+import httpx
+
+from agent.tools.types import ToolResult
+
+BASE_URL = "https://datasets-server.huggingface.co"
+
+
+def _get_headers() -> dict:
+    """Get auth headers for private/gated datasets"""
+    token = os.environ.get("HF_TOKEN")
+    if token:
+        return {"Authorization": f"Bearer {token}"}
+    return {}
+
+
+async def inspect_dataset(
+    dataset: str,
+    config: str | None = None,
+    split: str | None = None,
+    sample_rows: int = 3,
+) -> ToolResult:
+    """
+    Get comprehensive dataset info in one call.
+    All API calls made in parallel for speed.
+    """
+    headers = _get_headers()
+    output_parts = []
+    errors = []
+
+    async with httpx.AsyncClient(timeout=15, headers=headers) as client:
+        # Phase 1: Parallel calls for structure info (no dependencies)
+        is_valid_task = client.get(f"{BASE_URL}/is-valid", params={"dataset": dataset})
+        splits_task = client.get(f"{BASE_URL}/splits", params={"dataset": dataset})
+        parquet_task = client.get(f"{BASE_URL}/parquet", params={"dataset": dataset})
+
+        results = await asyncio.gather(
+            is_valid_task,
+            splits_task,
+            parquet_task,
+            return_exceptions=True,
+        )
+
+        # Process is-valid
+        if not isinstance(results[0], Exception):
+            try:
+                output_parts.append(_format_status(results[0].json()))
+            except Exception as e:
+                errors.append(f"is-valid: {e}")
+
+        # Process splits and auto-detect config/split
+        configs = []
+        if not isinstance(results[1], Exception):
+            try:
+                splits_data = results[1].json()
+                configs = _extract_configs(splits_data)
+                if not config:
+                    config = configs[0]["name"] if configs else "default"
+                if not split:
+                    split = configs[0]["splits"][0] if configs else "train"
+                output_parts.append(_format_structure(configs))
+            except Exception as e:
+                errors.append(f"splits: {e}")
+
+        if not config:
+            config = "default"
+        if not split:
+            split = "train"
+
+        # Process parquet (will be added at the end)
+        parquet_section = None
+        if not isinstance(results[2], Exception):
+            try:
+                parquet_section = _format_parquet_files(results[2].json())
+            except Exception:
+                pass  # Silently skip if no parquet
+
+        # Phase 2: Parallel calls for content (depend on config/split)
+        info_task = client.get(
+            f"{BASE_URL}/info", params={"dataset": dataset, "config": config}
+        )
+        rows_task = client.get(
+            f"{BASE_URL}/first-rows",
+            params={"dataset": dataset, "config": config, "split": split},
+            timeout=30,
+        )
+
+        content_results = await asyncio.gather(
+            info_task,
+            rows_task,
+            return_exceptions=True,
+        )
+
+        # Process info (schema)
+        if not isinstance(content_results[0], Exception):
+            try:
+                output_parts.append(_format_schema(content_results[0].json(), config))
+            except Exception as e:
+                errors.append(f"info: {e}")
+
+        # Process sample rows
+        if not isinstance(content_results[1], Exception):
+            try:
+                output_parts.append(
+                    _format_samples(
+                        content_results[1].json(), config, split, sample_rows
+                    )
+                )
+            except Exception as e:
+                errors.append(f"rows: {e}")
+
+        # Add parquet section at the end if available
+        if parquet_section:
+            output_parts.append(parquet_section)
+
+    # Combine output
+    formatted = f"# {dataset}\n\n" + "\n\n".join(output_parts)
+    if errors:
+        formatted += f"\n\n**Warnings:** {'; '.join(errors)}"
+
+    return {
+        "formatted": formatted,
+        "totalResults": 1,
+        "resultsShared": 1,
+        "isError": len(output_parts) == 0,
+    }
+
+
+def _format_status(data: dict) -> str:
+    """Format /is-valid response as status line"""
+    available = [
+        k
+        for k in ["viewer", "preview", "search", "filter", "statistics"]
+        if data.get(k)
+    ]
+    if available:
+        return f"## Status\n✓ Valid ({', '.join(available)})"
+    return "## Status\n✗ Dataset may have issues"
+
+
+def _extract_configs(splits_data: dict) -> list[dict]:
+    """Group splits by config"""
+    configs: dict[str, dict] = {}
+    for s in splits_data.get("splits", []):
+        cfg = s.get("config", "default")
+        if cfg not in configs:
+            configs[cfg] = {"name": cfg, "splits": []}
+        configs[cfg]["splits"].append(s.get("split"))
+    return list(configs.values())
+
+
+def _format_structure(configs: list) -> str:
+    """Format splits as markdown table"""
+    lines = ["## Structure", "| Config | Split |", "|--------|-------|"]
+    for cfg in configs:
+        for split_name in cfg["splits"]:
+            lines.append(f"| {cfg['name']} | {split_name} |")
+    return "\n".join(lines)
+
+
+def _format_schema(info: dict, config: str) -> str:
+    """Extract features and format as table"""
+    features = info.get("dataset_info", {}).get("features", {})
+    lines = [f"## Schema ({config})", "| Column | Type |", "|--------|------|"]
+    for col_name, col_info in features.items():
+        col_type = _get_type_str(col_info)
+        lines.append(f"| {col_name} | {col_type} |")
+    return "\n".join(lines)
+
+
+def _get_type_str(col_info: dict) -> str:
+    """Convert feature info to readable type string"""
+    dtype = col_info.get("dtype") or col_info.get("_type", "unknown")
+    if col_info.get("_type") == "ClassLabel":
+        names = col_info.get("names", [])
+        if names and len(names) <= 5:
+            return f"ClassLabel ({', '.join(f'{n}={i}' for i, n in enumerate(names))})"
+        return f"ClassLabel ({len(names)} classes)"
+    return str(dtype)
+
+
+def _format_samples(rows_data: dict, config: str, split: str, limit: int) -> str:
+    """Format sample rows, truncate long values"""
+    rows = rows_data.get("rows", [])[:limit]
+    lines = [f"## Sample Rows ({config}/{split})"]
+
+    messages_col_data = None
+
+    for i, row_wrapper in enumerate(rows, 1):
+        row = row_wrapper.get("row", {})
+        lines.append(f"**Row {i}:**")
+        for key, val in row.items():
+            # Check for messages column and capture first one for format analysis
+            if key.lower() == "messages" and messages_col_data is None:
+                messages_col_data = val
+
+            val_str = str(val)
+            if len(val_str) > 150:
+                val_str = val_str[:150] + "..."
+            lines.append(f"- {key}: {val_str}")
+
+    # If we found a messages column, add format analysis
+    if messages_col_data is not None:
+        messages_format = _format_messages_structure(messages_col_data)
+        if messages_format:
+            lines.append("")
+            lines.append(messages_format)
+
+    return "\n".join(lines)
+
+
+def _format_messages_structure(messages_data: Any) -> str | None:
+    """
+    Analyze and format the structure of a messages column.
+    Common in chat/instruction datasets.
+    """
+    import json
+
+    # Parse if string
+    if isinstance(messages_data, str):
+        try:
+            messages_data = json.loads(messages_data)
+        except json.JSONDecodeError:
+            return None
+
+    if not isinstance(messages_data, list) or not messages_data:
+        return None
+
+    lines = ["## Messages Column Format"]
+
+    # Analyze message structure
+    roles_seen = set()
+    has_tool_calls = False
+    has_tool_results = False
+    message_keys = set()
+
+    for msg in messages_data:
+        if not isinstance(msg, dict):
+            continue
+
+        message_keys.update(msg.keys())
+
+        role = msg.get("role", "")
+        if role:
+            roles_seen.add(role)
+
+        if "tool_calls" in msg or "function_call" in msg:
+            has_tool_calls = True
+        if role in ("tool", "function") or msg.get("tool_call_id"):
+            has_tool_results = True
+
+    # Format the analysis
+    lines.append(
+        f"**Roles:** {', '.join(sorted(roles_seen)) if roles_seen else 'unknown'}"
+    )
+
+    # Show common message keys with presence indicators
+    common_keys = [
+        "role",
+        "content",
+        "tool_calls",
+        "tool_call_id",
+        "name",
+        "function_call",
+    ]
+    key_status = []
+    for key in common_keys:
+        if key in message_keys:
+            key_status.append(f"{key} ✓")
+        else:
+            key_status.append(f"{key} ✗")
+    lines.append(f"**Message keys:** {', '.join(key_status)}")
+
+    if has_tool_calls:
+        lines.append("**Tool calls:** ✓ Present")
+    if has_tool_results:
+        lines.append("**Tool results:** ✓ Present")
+
+    # Show example message structure
+    if messages_data and isinstance(messages_data[0], dict):
+        lines.append("")
+        lines.append("**Example message structure:**")
+        example = messages_data[0]
+        for key, val in example.items():
+            if key == "content":
+                val_preview = (
+                    str(val)[:100] + "..." if len(str(val)) > 100 else str(val)
+                )
+                lines.append(f"  - {key}: {val_preview}")
+            elif key == "tool_calls" and isinstance(val, list) and val:
+                lines.append(f"  - {key}: [{len(val)} tool call(s)]")
+                # Show first tool call structure
+                if isinstance(val[0], dict):
+                    tc = val[0]
+                    lines.append(f"    - type: {tc.get('type', 'function')}")
+                    if "function" in tc:
+                        lines.append(
+                            f"    - function.name: {tc['function'].get('name', '?')}"
+                        )
+                        lines.append("    - function.arguments: <json string>")
+            else:
+                lines.append(f"  - {key}: {val}")
+
+    return "\n".join(lines)
+
+
+def _format_parquet_files(data: dict) -> str | None:
+    """Format parquet file info, return None if no files"""
+    files = data.get("parquet_files", [])
+    if not files:
+        return None
+
+    # Group by config/split
+    groups: dict[str, dict] = {}
+    for f in files:
+        key = f"{f.get('config', 'default')}/{f.get('split', 'train')}"
+        if key not in groups:
+            groups[key] = {"count": 0, "size": 0}
+        groups[key]["count"] += 1
+        groups[key]["size"] += f.get("size", 0)
+
+    lines = ["## Files (Parquet)"]
+    for key, info in groups.items():
+        size_mb = info["size"] / (1024 * 1024)
+        lines.append(f"- {key}: {info['count']} file(s) ({size_mb:.1f} MB)")
+    return "\n".join(lines)
+
+
+# Tool specification
+HF_INSPECT_DATASET_TOOL_SPEC = {
+    "name": "hf_inspect_dataset",
+    "description": (
+        "Inspect a Hugging Face dataset comprehensively in one call.\n\n"
+        "## What you get\n"
+        "- Status check (validates dataset works without errors)\n"
+        "- All configs and splits\n"
+        "- Column names and types (schema)\n"
+        "- Sample rows to understand data format\n"
+        "- Parquet file structure and sizes\n\n"
+        "## CRITICAL\n"
+        "**Always inspect datasets before writing training code** to understand:\n"
+        "- Column names for your dataloader\n"
+        "- Data types and format\n"
+        "- Available splits (train/test/validation)\n\n"
+        "Supports private/gated datasets when HF_TOKEN is set.\n\n"
+        "## Examples\n"
+        '{"dataset": "stanfordnlp/imdb"}\n'
+        '{"dataset": "nyu-mll/glue", "config": "mrpc", "sample_rows": 5}\n'
+    ),
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "dataset": {
+                "type": "string",
+                "description": "Dataset ID in 'org/name' format (e.g., 'stanfordnlp/imdb')",
+            },
+            "config": {
+                "type": "string",
+                "description": "Config/subset name. Auto-detected if not specified.",
+            },
+            "split": {
+                "type": "string",
+                "description": "Split for sample rows. Auto-detected if not specified.",
+            },
+            "sample_rows": {
+                "type": "integer",
+                "description": "Number of sample rows to show (default: 3, max: 10)",
+                "default": 3,
+            },
+        },
+        "required": ["dataset"],
+    },
+}
+
+
+async def hf_inspect_dataset_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
+    """Handler for agent tool router"""
+    try:
+        result = await inspect_dataset(
+            dataset=arguments["dataset"],
+            config=arguments.get("config"),
+            split=arguments.get("split"),
+            sample_rows=min(arguments.get("sample_rows", 3), 10),
+        )
+        return result["formatted"], not result.get("isError", False)
+    except Exception as e:
+        return f"Error inspecting dataset: {str(e)}", False