Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Yoan Di Cosmo committed on
Commit ·
ce69c24
1
Parent(s): 693d9c0
limit in rows displaying
Browse files- .gitignore +1 -1
- agent/tools/dataset_tools.py +35 -4
.gitignore
CHANGED
|
@@ -16,4 +16,4 @@ wheels/
|
|
| 16 |
/logs
|
| 17 |
hf-agent-leaderboard/
|
| 18 |
.cursor/
|
| 19 |
-
session_logs/
|
|
|
|
| 16 |
/logs
|
| 17 |
hf-agent-leaderboard/
|
| 18 |
.cursor/
|
| 19 |
+
session_logs/
|
agent/tools/dataset_tools.py
CHANGED
|
@@ -169,12 +169,29 @@ def _extract_configs(splits_data: dict) -> list[SplitConfig]:
|
|
| 169 |
return list(configs.values())
|
| 170 |
|
| 171 |
|
| 172 |
-
def _format_structure(
|
|
|
|
|
|
|
| 173 |
"""Format configs and splits as a markdown table."""
|
| 174 |
lines = ["## Structure (configs & splits)", "| Config | Split |", "|--------|-------|"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
for cfg in configs:
|
| 176 |
for split_name in cfg["splits"]:
|
|
|
|
|
|
|
| 177 |
lines.append(f"| {cfg['name']} | {split_name} |")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
return "\n".join(lines)
|
| 179 |
|
| 180 |
|
|
@@ -332,8 +349,12 @@ def _format_messages_structure(messages_data: Any) -> str | None:
|
|
| 332 |
return "\n".join(lines)
|
| 333 |
|
| 334 |
|
| 335 |
-
def _format_parquet_files(data: dict) -> str | None:
|
| 336 |
-
"""Format parquet file info, return None if no files
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
files = data.get("parquet_files", [])
|
| 338 |
if not files:
|
| 339 |
return None
|
|
@@ -351,9 +372,19 @@ def _format_parquet_files(data: dict) -> str | None:
|
|
| 351 |
groups[key]["size"] += int(size)
|
| 352 |
|
| 353 |
lines = ["## Files (Parquet)"]
|
| 354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
size_mb = info["size"] / (1024 * 1024)
|
| 356 |
lines.append(f"- {key}: {info['count']} file(s) ({size_mb:.1f} MB)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
return "\n".join(lines)
|
| 358 |
|
| 359 |
|
|
|
|
| 169 |
return list(configs.values())
|
| 170 |
|
| 171 |
|
| 172 |
+
def _format_structure(
    configs: list[SplitConfig], max_rows: int = 10
) -> str:
    """Format configs and splits as a markdown table.

    One table row is emitted per (config, split) pair, in the order the
    configs and their splits are given. At most ``max_rows`` pairs are
    rendered; when pairs are left out, a final row reports how many were
    shown out of the total.

    Args:
        configs: Configs to render. Each entry is read through its
            ``"name"`` and ``"splits"`` keys.
        max_rows: Maximum number of config/split rows to render. Values
            below zero are treated as zero.

    Returns:
        The heading, table header and rows joined with newlines.
    """
    lines = ["## Structure (configs & splits)", "| Config | Split |", "|--------|-------|"]

    # Flatten to (config, split) pairs so the cap is applied in one place
    # instead of breaking out of two nested loops.
    pairs = [(cfg, split_name) for cfg in configs for split_name in cfg["splits"]]
    total_splits = len(pairs)
    shown = pairs[: max(max_rows, 0)]

    lines.extend(f"| {cfg['name']} | {split_name} |" for cfg, split_name in shown)

    if total_splits > len(shown):
        # Keep the notice inside the second cell: the table has two columns,
        # and a third cell would be dropped by GFM renderers.
        lines.append(
            f"| ... | ... (_showing {len(shown)} of {total_splits} config/split rows_) |"
        )

    return "\n".join(lines)
|
| 196 |
|
| 197 |
|
|
|
|
| 349 |
return "\n".join(lines)
|
| 350 |
|
| 351 |
|
| 352 |
+
def _format_parquet_files(data: dict, max_rows: int = 10) -> str | None:
|
| 353 |
+
"""Format parquet file info, return None if no files.
|
| 354 |
+
|
| 355 |
+
We cap the number of rendered lines to keep output manageable for
|
| 356 |
+
datasets with many parquet groups.
|
| 357 |
+
"""
|
| 358 |
files = data.get("parquet_files", [])
|
| 359 |
if not files:
|
| 360 |
return None
|
|
|
|
| 372 |
groups[key]["size"] += int(size)
|
| 373 |
|
| 374 |
lines = ["## Files (Parquet)"]
|
| 375 |
+
items = list(groups.items())
|
| 376 |
+
total_groups = len(items)
|
| 377 |
+
|
| 378 |
+
shown = 0
|
| 379 |
+
for key, info in items[:max_rows]:
|
| 380 |
size_mb = info["size"] / (1024 * 1024)
|
| 381 |
lines.append(f"- {key}: {info['count']} file(s) ({size_mb:.1f} MB)")
|
| 382 |
+
shown += 1
|
| 383 |
+
|
| 384 |
+
if total_groups > shown:
|
| 385 |
+
lines.append(
|
| 386 |
+
f"- ... (_showing {shown} of {total_groups} parquet groups_)"
|
| 387 |
+
)
|
| 388 |
return "\n".join(lines)
|
| 389 |
|
| 390 |
|