Yoan Di Cosmo committed on
Commit
414e120
·
1 Parent(s): 6d8949d

limit in rows displaying

Browse files
Files changed (2) hide show
  1. .gitignore +1 -1
  2. agent/tools/dataset_tools.py +35 -4
.gitignore CHANGED
@@ -16,4 +16,4 @@ wheels/
16
  /logs
17
  hf-agent-leaderboard/
18
  .cursor/
19
- session_logs/skills/
 
16
  /logs
17
  hf-agent-leaderboard/
18
  .cursor/
19
+ session_logs/
agent/tools/dataset_tools.py CHANGED
@@ -169,12 +169,29 @@ def _extract_configs(splits_data: dict) -> list[SplitConfig]:
169
  return list(configs.values())
170
 
171
 
172
- def _format_structure(configs: list[SplitConfig]) -> str:
 
 
173
  """Format configs and splits as a markdown table."""
174
  lines = ["## Structure (configs & splits)", "| Config | Split |", "|--------|-------|"]
 
 
 
 
175
  for cfg in configs:
176
  for split_name in cfg["splits"]:
 
 
177
  lines.append(f"| {cfg['name']} | {split_name} |")
 
 
 
 
 
 
 
 
 
178
  return "\n".join(lines)
179
 
180
 
@@ -332,8 +349,12 @@ def _format_messages_structure(messages_data: Any) -> str | None:
332
  return "\n".join(lines)
333
 
334
 
335
- def _format_parquet_files(data: dict) -> str | None:
336
- """Format parquet file info, return None if no files"""
 
 
 
 
337
  files = data.get("parquet_files", [])
338
  if not files:
339
  return None
@@ -351,9 +372,19 @@ def _format_parquet_files(data: dict) -> str | None:
351
  groups[key]["size"] += int(size)
352
 
353
  lines = ["## Files (Parquet)"]
354
- for key, info in groups.items():
 
 
 
 
355
  size_mb = info["size"] / (1024 * 1024)
356
  lines.append(f"- {key}: {info['count']} file(s) ({size_mb:.1f} MB)")
 
 
 
 
 
 
357
  return "\n".join(lines)
358
 
359
 
 
169
  return list(configs.values())
170
 
171
 
172
+ def _format_structure(
173
+ configs: list[SplitConfig], max_rows: int = 10
174
+ ) -> str:
175
  """Format configs and splits as a markdown table."""
176
  lines = ["## Structure (configs & splits)", "| Config | Split |", "|--------|-------|"]
177
+
178
+ total_splits = sum(len(cfg["splits"]) for cfg in configs)
179
+ added_rows = 0
180
+
181
  for cfg in configs:
182
  for split_name in cfg["splits"]:
183
+ if added_rows >= max_rows:
184
+ break
185
  lines.append(f"| {cfg['name']} | {split_name} |")
186
+ added_rows += 1
187
+ if added_rows >= max_rows:
188
+ break
189
+
190
+ if total_splits > added_rows:
191
+ lines.append(
192
+ f"| ... | ... | (_showing {added_rows} of {total_splits} config/split rows_) |"
193
+ )
194
+
195
  return "\n".join(lines)
196
 
197
 
 
349
  return "\n".join(lines)
350
 
351
 
352
+ def _format_parquet_files(data: dict, max_rows: int = 10) -> str | None:
353
+ """Format parquet file info, return None if no files.
354
+
355
+ We cap the number of rendered lines to keep output manageable for
356
+ datasets with many parquet groups.
357
+ """
358
  files = data.get("parquet_files", [])
359
  if not files:
360
  return None
 
372
  groups[key]["size"] += int(size)
373
 
374
  lines = ["## Files (Parquet)"]
375
+ items = list(groups.items())
376
+ total_groups = len(items)
377
+
378
+ shown = 0
379
+ for key, info in items[:max_rows]:
380
  size_mb = info["size"] / (1024 * 1024)
381
  lines.append(f"- {key}: {info['count']} file(s) ({size_mb:.1f} MB)")
382
+ shown += 1
383
+
384
+ if total_groups > shown:
385
+ lines.append(
386
+ f"- ... (_showing {shown} of {total_groups} parquet groups_)"
387
+ )
388
  return "\n".join(lines)
389
 
390