akseljoonas HF Staff commited on
Commit
c45ebae
·
1 Parent(s): a33baef

feat: use hf_agent tool descriptions, hardware flavors, and default env

Browse files
agent/tools/dataset_tools.py CHANGED
@@ -388,22 +388,15 @@ def _format_parquet_files(data: dict, max_rows: int = 10) -> str | None:
388
  HF_INSPECT_DATASET_TOOL_SPEC = {
389
  "name": "hf_inspect_dataset",
390
  "description": (
391
- "Inspect a Hugging Face dataset comprehensively in one call.\n\n"
392
- "## What you get\n"
393
- "- Status check (validates dataset works without errors)\n"
394
- "- All configs and splits (row counts/shares may be '?' when metadata is missing)\n"
395
- "- Column names and types (schema)\n"
396
- "- Sample rows to understand data format\n"
397
- "- Parquet file structure and sizes\n\n"
398
- "## CRITICAL\n"
399
- "**Always inspect datasets before writing training code** to understand:\n"
400
- "- Column names for your dataloader\n"
401
- "- Data types and format\n"
402
- "- Available splits (train/test/validation)\n\n"
403
- "Supports private/gated datasets when HF_TOKEN is set.\n\n"
404
- "## Examples\n"
405
- '{"dataset": "stanfordnlp/imdb"}\n'
406
- '{"dataset": "nyu-mll/glue", "config": "mrpc", "sample_rows": 5}\n'
407
  ),
408
  "parameters": {
409
  "type": "object",
 
388
  HF_INSPECT_DATASET_TOOL_SPEC = {
389
  "name": "hf_inspect_dataset",
390
  "description": (
391
+ "Inspect a HF dataset in one call: status, configs/splits, schema, sample rows, parquet info.\n\n"
392
+ "REQUIRED before any training job to verify dataset format matches training method:\n"
393
+ " SFT: needs 'messages', 'text', or 'prompt'/'completion'\n"
394
+ " DPO: needs 'prompt', 'chosen', 'rejected'\n"
395
+ " GRPO: needs 'prompt'\n"
396
+ "All datasets used for training have to be in conversational ChatML format to be compatible with HF libraries.\n"
397
+ "Training will fail with KeyError if columns don't match.\n\n"
398
+ "Also use to get example datapoints, understand column names, data types, and available splits before writing any data loading code. "
399
+ "Supports private/gated datasets when HF_TOKEN is set."
 
 
 
 
 
 
 
400
  ),
401
  "parameters": {
402
  "type": "object",
agent/tools/docs_tools.py CHANGED
@@ -845,17 +845,12 @@ DOC_ENDPOINTS = [
845
  EXPLORE_HF_DOCS_TOOL_SPEC = {
846
  "name": "explore_hf_docs",
847
  "description": (
848
- "Explore Hugging Face documentation structure and discover available pages with 200-character previews. "
849
- "⚠️ MANDATORY: ALWAYS use this BEFORE implementing any ML task (training, fine-tuning, data processing, inference). "
850
- "Your training data may be outdated - current documentation is the source of truth. "
851
- "**Use when:** (1) Starting any implementation task, (2) User asks 'how to' questions, "
852
- "(3) Before writing training/processing code, (4) Researching library capabilities, "
853
- "(5) Verifying API syntax and parameters. "
854
- "**Pattern:** explore (discover structure) β†’ fetch_hf_docs (get details) β†’ implement with researched approach. "
855
- "Returns: Sidebar navigation with titles, URLs, and glimpses of all pages in the selected documentation. "
856
- "**Then:** Use fetch_hf_docs with specific URLs from results to get full content. "
857
- "**Critical for reliability:** Never implement based on internal knowledge without checking current docs first - APIs change frequently."
858
- " By default returns the top 20 results; set max_results (max 50) to adjust."
859
  ),
860
  "parameters": {
861
  "type": "object",
@@ -928,16 +923,10 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
928
  HF_DOCS_FETCH_TOOL_SPEC = {
929
  "name": "fetch_hf_docs",
930
  "description": (
931
- "Fetch full markdown content of a specific HF documentation page. "
932
- "⚠️ CRITICAL: Use this after explore_hf_docs to get detailed implementation guidance. "
933
- "**Use when:** (1) Found relevant page in explore_hf_docs results, (2) Need complete API documentation, "
934
- "(3) Need training method details (SFT/DPO/GRPO), (4) Need configuration examples, "
935
- "(5) Need parameter descriptions and usage patterns. "
936
- "**Pattern:** explore_hf_docs (find relevant page) β†’ fetch_hf_docs (get full content) β†’ implement using documented approach. "
937
- "Provide full URL from explore_hf_docs results (e.g., 'https://huggingface.co/docs/trl/sft_trainer'). "
938
- "Returns: Complete markdown documentation with examples, parameters, and usage patterns. "
939
- "**For training tasks:** ALWAYS fetch trainer docs (SFTConfig, DPOConfig, etc.) before creating training scripts. "
940
- "**Critical for reliability:** This ensures you use current APIs and best practices."
941
  ),
942
  "parameters": {
943
  "type": "object",
 
845
  EXPLORE_HF_DOCS_TOOL_SPEC = {
846
  "name": "explore_hf_docs",
847
  "description": (
848
+ "Browse HF documentation structure β€” discover all available documentation with 200-char previews.\n\n"
849
+ "Use this to find relevant documentation and/or examples with detailed parameter docs and API reference. "
850
+ "To be used together with github_find_examples and github_read_file to find working examples and documentation.\n\n"
851
+ "Pattern: explore_hf_docs (find relevant pages) β†’ fetch_hf_docs (get full content).\n\n"
852
+ "For training tasks: fetch the trainer config docs (SFTConfig, DPOConfig, GRPOConfig) to verify parameter names. "
853
+ "Returns top 20 results by default; set max_results (max 50) to adjust."
 
 
 
 
 
854
  ),
855
  "parameters": {
856
  "type": "object",
 
923
  HF_DOCS_FETCH_TOOL_SPEC = {
924
  "name": "fetch_hf_docs",
925
  "description": (
926
+ "Fetch full markdown content of an HF documentation page. Use after explore_hf_docs.\n\n"
927
+ "Critical for finding documentation, e.g. current trainer configuration parameters (SFTConfig, DPOConfig, etc.). "
928
+ "Use for researching solutions and before writing training scripts. Your internal knowledge is outdated.\n\n"
929
+ "Provide the full URL from explore_hf_docs results. The .md extension is added automatically."
 
 
 
 
 
 
930
  ),
931
  "parameters": {
932
  "type": "object",
agent/tools/github_find_examples.py CHANGED
@@ -405,55 +405,16 @@ def find_examples(
405
  GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
406
  "name": "github_find_examples",
407
  "description": (
408
- "Discover working code examples, tutorials, scripts, and demos in GitHub repositories. "
409
- "⚠️ CRITICAL: ALWAYS use this BEFORE implementing ML tasks - find working reference code first. "
410
- "Your training data may be outdated; real repository examples show current best practices. "
411
- "**Use when:** (1) Starting any ML implementation (training, inference, evaluation), "
412
- "(2) User asks 'how to' questions about libraries, (3) Need reference implementations, "
413
- "(4) Exploring library capabilities, (5) Before writing training/processing scripts. "
414
- "**Pattern:** github_find_examples (discover) β†’ github_read_file (study code) β†’ implement with researched approach. "
415
- "Returns: List of example files (scripts/notebooks/tutorials) with paths and URLs, sorted by relevance. "
416
- "**Then:** Use github_read_file to read the actual implementation code. "
417
- "**Critical for reliability:** Real examples prevent outdated API usage and show proven patterns. "
418
- "## How it works\n\n"
419
- "1. Fetches all example files (examples/, scripts/, tutorials/, demos/, notebooks/, etc.) from repository\n"
420
- "2. If keyword provided, scores files against keyword using fuzzy matching\n"
421
- "3. Returns best matches sorted by relevance and pattern priority\n"
422
- "4. Provides copyable parameters for github_read_file tool\n\n"
423
- "## Examples\n\n"
424
- "<example>\n"
425
- "// ML Workflow Step: Find GRPO training examples before implementation\n"
426
- "// Task: Starting GRPO fine-tuning project, need reference implementation\n"
427
- "{\n"
428
- " keyword: 'grpo',\n"
429
- " repo: 'trl',\n"
430
- " org: 'huggingface'\n"
431
- "}\n"
432
- "// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n"
433
- "// Next step: github_read_file to study working implementation\n"
434
- "</example>\n\n"
435
- "<example>\n"
436
- "// ML Workflow Step: Discover all available training methods\n"
437
- "// Task: Exploring TRL training options before choosing approach\n"
438
- "{\n"
439
- " repo: 'trl',\n"
440
- " org: 'huggingface',\n"
441
- " max_results: 20\n"
442
- "}\n"
443
- "// Lists: SFT, DPO, GRPO, PPO, reward modeling examples\n"
444
- "// Helps user choose appropriate method\n"
445
- "</example>\n\n"
446
- "<example>\n"
447
- "// ML Workflow Step: Find LoRA fine-tuning examples\n"
448
- "// Task: Learning parameter-efficient fine-tuning patterns\n"
449
- "{\n"
450
- " keyword: 'lora',\n"
451
- " repo: 'peft',\n"
452
- " org: 'huggingface'\n"
453
- "}\n"
454
- "// Discovers LoRA configuration and training examples\n"
455
- "// Shows current PEFT API usage patterns\n"
456
- "</example>"
457
  ),
458
  "parameters": {
459
  "type": "object",
 
405
  GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
406
  "name": "github_find_examples",
407
  "description": (
408
+ "Find working example scripts in GitHub repositories (from a list of predetermined directories e.g. examples/, scripts/, tutorials/, etc.). "
409
+ "Uses fuzzy keyword matching.\n\n"
410
+ "MANDATORY before writing any ML training, fine-tuning, or inference code. "
411
+ "Your internal knowledge of library APIs is outdated β€” working examples show current API patterns.\n\n"
412
+ "Sequence: github_find_examples β†’ github_read_file (study the example) β†’ implement based on what you found.\n\n"
413
+ "Skip this only for: simple data queries, status checks, non-code tasks.\n\n"
414
+ "Examples:\n"
415
+ " {keyword: 'sft', repo: 'trl'} β†’ finds examples/scripts/sft.py\n"
416
+ " {keyword: 'grpo', repo: 'trl'} β†’ finds GRPO training examples\n"
417
+ " {repo: 'trl', max_results: 20} β†’ lists all available training method examples"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  ),
419
  "parameters": {
420
  "type": "object",
agent/tools/github_read_file.py CHANGED
@@ -250,59 +250,13 @@ def read_file(
250
  GITHUB_READ_FILE_TOOL_SPEC = {
251
  "name": "github_read_file",
252
  "description": (
253
- "Read file contents from GitHub repositories with line range support (default 300 lines). "
254
- "⚠️ CRITICAL: Use AFTER github_find_examples to study working implementation code. "
255
- "**Use when:** (1) Found example file via github_find_examples and need full code, "
256
- "(2) Need to read trainer class implementation, (3) Study configuration patterns, "
257
- "(4) Read specific code sections with line ranges, (5) Review code from specific branches/commits. "
258
- "**Pattern:** github_find_examples (discover files) β†’ github_read_file (read code) β†’ implement using researched patterns. "
259
- "Returns: File contents with line numbers, formatted for LLM reading. Auto-converts Jupyter notebooks to markdown. "
260
- "**Then:** Implement using patterns and APIs from the example code. "
261
- "**Critical for reliability:** Reading working examples prevents API errors and shows current best practices. "
262
  "Use line_start/line_end for large files (>300 lines) to read specific sections.\n\n"
263
- "## When to use this tool\n\n"
264
- "- When reading example code, trainer implementations, or configuration files\n"
265
- "- After github_find_examples returns file paths you want to study\n"
266
- "- When investigating specific code sections with line ranges\n"
267
- "- When reading from specific branches, tags, or commits (use ref parameter)\n\n"
268
- "## When NOT to use this tool\n\n"
269
- "- When you don't know exact file path (use github_find_examples or github_search_code first)\n"
270
- "- When searching for code patterns across repos (use github_search_code instead)\n\n"
271
- "## Examples\n\n"
272
- "<example>\n"
273
- "// ML Workflow Step: Read GRPO trainer class after finding via github_find_examples\n"
274
- "// Use case: Understand GRPOTrainer API, parameters, and methods\n"
275
- "{\n"
276
- " repo: 'huggingface/trl',\n"
277
- " path: 'trl/trainer/grpo_trainer.py',\n"
278
- " line_start: 1,\n"
279
- " line_end: 200\n"
280
- "}\n"
281
- "// Read class definition and constructor to understand current API\n"
282
- "// Shows: __init__ parameters, configuration, required arguments\n"
283
- "</example>\n\n"
284
- "<example>\n"
285
- "// ML Workflow Step: Study complete training script from examples\n"
286
- "// Use case: Learn end-to-end VLM fine-tuning workflow\n"
287
- "{\n"
288
- " repo: 'huggingface/trl',\n"
289
- " path: 'examples/scripts/grpo_vlm.py'\n"
290
- "}\n"
291
- "// Returns first 300 lines - shows full training setup\n"
292
- "// Use line_start/line_end if need to read more\n"
293
- "</example>\n\n"
294
- "<example>\n"
295
- "// ML Workflow Step: Check TrainingArguments configuration patterns\n"
296
- "// Use case: Learn how to structure training configs correctly\n"
297
- "{\n"
298
- " repo: 'huggingface/transformers',\n"
299
- " path: 'examples/pytorch/language-modeling/run_clm.py',\n"
300
- " line_start: 50,\n"
301
- " line_end: 150\n"
302
- "}\n"
303
- "// Read argument parsing and config setup section\n"
304
- "// Shows: current parameter names, default values, best practices\n"
305
- "</example>"
306
  ),
307
  "parameters": {
308
  "type": "object",
 
250
  GITHUB_READ_FILE_TOOL_SPEC = {
251
  "name": "github_read_file",
252
  "description": (
253
+ "Read file contents from GitHub repositories. Returns first 300 lines by default. "
254
+ "Auto-converts Jupyter notebooks to markdown.\n\n"
255
+ "Use AFTER github_find_examples to study the working implementation. "
256
+ "The purpose is to learn current API patterns β€” imports, trainer configs, dataset handling β€” "
257
+ "so your implementation uses correct, up-to-date code.\n\n"
 
 
 
 
258
  "Use line_start/line_end for large files (>300 lines) to read specific sections.\n\n"
259
+ "When NOT to use: when you don't know the file path (use github_find_examples first)."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  ),
261
  "parameters": {
262
  "type": "object",
agent/tools/jobs_tool.py CHANGED
@@ -29,38 +29,33 @@ from agent.tools.utilities import (
29
  )
30
 
31
  # Hardware flavors
32
- CPU_FLAVORS = ["cpu-basic", "cpu-upgrade", "cpu-performance", "cpu-xl"]
33
  GPU_FLAVORS = [
34
- "sprx8",
35
- "zero-a10g",
36
  "t4-small",
37
  "t4-medium",
38
- "l4x1",
39
- "l4x4",
40
- "l40sx1",
41
- "l40sx4",
42
- "l40sx8",
43
  "a10g-small",
44
  "a10g-large",
45
  "a10g-largex2",
46
  "a10g-largex4",
47
  "a100-large",
48
- "h100",
49
- "h100x8",
 
 
 
 
 
50
  ]
51
 
52
  # Detailed specs for display (vCPU/RAM/GPU VRAM)
53
- CPU_FLAVORS_DESC = (
54
- "cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB), cpu-performance, cpu-xl"
55
- )
56
  GPU_FLAVORS_DESC = (
57
  "t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
58
- "l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
59
- "l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB), "
60
- "a10g-small(4vCPU/14GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
61
  "a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
62
- "a100-large(12vCPU/142GB/GPU 80GB), h100(23vCPU/240GB/GPU 80GB), h100x8(184vCPU/1920GB/GPU 640GB), "
63
- "zero-a10g(dynamic alloc)"
 
64
  )
65
  SPECIALIZED_FLAVORS = ["inf2x6"]
66
  ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
@@ -122,6 +117,21 @@ def _filter_uv_install_output(logs: list[str]) -> list[str]:
122
  return logs
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def _add_environment_variables(
126
  params: Dict[str, Any] | None, user_token: str | None = None
127
  ) -> Dict[str, Any]:
@@ -509,7 +519,7 @@ class HfJobsTool:
509
  self.api.run_job,
510
  image=image,
511
  command=command,
512
- env=args.get("env"),
513
  secrets=_add_environment_variables(args.get("secrets"), self.hf_token),
514
  flavor=args.get("hardware_flavor", "cpu-basic"),
515
  timeout=args.get("timeout", "30m"),
@@ -741,7 +751,7 @@ To verify, call this tool with `{{"operation": "inspect", "job_id": "{job_id}"}}
741
  image=image,
742
  command=command,
743
  schedule=schedule,
744
- env=args.get("env"),
745
  secrets=_add_environment_variables(args.get("secrets"), self.hf_token),
746
  flavor=args.get("hardware_flavor", "cpu-basic"),
747
  timeout=args.get("timeout", "30m"),
@@ -901,56 +911,31 @@ To inspect, call this tool with `{{"operation": "scheduled inspect", "scheduled_
901
  HF_JOBS_TOOL_SPEC = {
902
  "name": "hf_jobs",
903
  "description": (
904
- "Execute Python scripts or Docker containers on HF cloud infrastructure (CPUs/GPUs) in one of two modes. "
905
- "\n\n"
906
- "**Two Modes (mutually exclusive):**\n"
907
- "1. Python mode: using 'script' arg (REQUIRED) + 'dependencies'\n"
908
- "2. Docker mode: using 'command' arg (REQUIRED) + 'image'\n\n"
909
- "🚨 **REQUIRED:** You MUST provide exactly ONE of: 'script' (Python code as string) OR 'command' (Docker command as array). "
910
- "They are mutually exclusive - provide one or the other, never both, never neither. "
911
- "Do NOT call with just {'operation': 'run'} - always include your code. Example: {'operation': 'run', 'script': 'import torch; print(torch.cuda.is_available())', 'dependencies': ['torch']} or {'operation': 'run', 'command': ['duckdb', '-c', 'select 1 + 2']', 'image': 'duckdb/duckdb'}\n\n"
912
- "⚠️ CRITICAL for reliability: (1) Jobs run ASYNC - provide monitoring URL immediately, don't poll; "
913
- "(2) Set timeout >30min (default too short - training needs 2-8h); "
914
- "(3) HF_TOKEN auto-loaded to secrets for Hub ops (push_to_hub, private repos); "
915
- "(4) Job storage EPHEMERAL - MUST push_to_hub() or ALL work is LOST. "
916
- "**Use when:** User wants cloud compute, training models, data processing, batch inference, GPU workloads, scheduled tasks. "
917
- "ALWAYS use this tool (βœ“), never bash 'hf jobs' commands (βœ—). Pass script content inline (βœ“), don't save to files unless requested (βœ—). "
918
- "\n\n"
919
- "**Operations:** run, ps, logs, inspect, cancel, scheduled run, scheduled ps, scheduled inspect, scheduled delete, scheduled suspend, scheduled resume. "
920
- "**Available Hardware (vCPU/RAM/GPU):**\n"
921
- f"β€’ CPU: {CPU_FLAVORS_DESC}\n"
922
- f"β€’ GPU: {GPU_FLAVORS_DESC}\n"
923
- " β—¦ Common: t4-small ($0.60/hr, demos/1-3B models), a10g-small ($1/hr), a10g-large ($2/hr, production 7-13B), a100-large ($4/hr, 30B+), h100 ($6/hr, 70B+)\n\n"
924
- "**After Submission Ground Rules:**\n"
925
- "βœ“ Return immediately with job ID and monitoring URL\n"
926
- "βœ“ Provide expected completion time and cost estimate\n"
927
- "βœ“ For training: Include Trackio dashboard URL\n"
928
- "βœ“ Note user can check status later\n"
929
- "βœ— DON'T poll logs automatically\n"
930
- "βœ— DON'T wait for completion\n"
931
- "βœ— DON'T check status unless user asks\n\n"
932
- "**For Training Tasks:**\n"
933
- "β€’ ALWAYS research TRL docs first: explore_hf_docs('trl') β†’ fetch_hf_docs(<trainer_url>)\n"
934
- "β€’ ALWAYS validate dataset format with hub_repo_details (SFT needs messages/text, DPO needs chosen/rejected)\n"
935
- "β€’ ALWAYS include Trackio monitoring in script (explore_hf_docs('trackio'))\n"
936
- "β€’ ALWAYS enable push_to_hub=True in training config\n"
937
- "β€’ Set timeout 2-8h for training (NOT default 30m)\n"
938
- "β€’ Confirm model/dataset choices with user before submitting\n\n"
939
- "**Examples:**\n\n"
940
- "**Training - Fine-tune LLM:**\n"
941
- "{'operation': 'run', 'script': '# Training script with TRL\\nfrom trl import SFTConfig, SFTTrainer\\nfrom transformers import AutoModelForCausalLM\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-4B\")\\n# ... researched implementation from docs ...\\ntrainer.train()\\ntrainer.push_to_hub(\"user-name/my-model\")', 'dependencies': ['transformers', 'trl', 'torch', 'datasets', 'trackio'], 'hardware_flavor': 'a10g-large', 'timeout': '4h'}\n\n"
942
- "**Data Processing:**\n"
943
- "{'operation': 'run', 'script': 'from datasets import load_dataset\\nds = load_dataset(\"data\")\\n# process...\\nds.push_to_hub(\"user/processed\")', 'dependencies': ['datasets', 'pandas'], 'hardware_flavor': 'cpu-upgrade', 'timeout': '2h'}\n\n"
944
- "**Scheduled Daily Job:**\n"
945
- "{'operation': 'scheduled run', 'schedule': '@daily', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'hardware_flavor': 'cpu-basic'}\n\n"
946
- "**Docker Mode:**\n"
947
- "{'operation': 'run', 'image': 'pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime', 'command': ['python', 'train.py', '--epochs', '10'], 'hardware_flavor': 'a100-large'}\n\n"
948
- "**Monitor Operations:**\n"
949
- "{'operation': 'ps'} - List all jobs\n"
950
- "{'operation': 'logs', 'job_id': 'xxx'} - Stream logs (only when user requests)\n"
951
- "{'operation': 'inspect', 'job_id': 'xxx'} - Get job details\n"
952
- "{'operation': 'cancel', 'job_id': 'xxx'} - Stop job\n\n"
953
- "⚠️ CRITICAL: Files created during execution are DELETED when job finishes. MUST push_to_hub() all outputs (models, datasets, artifacts) in script. For logs/scripts, use hf_private_repos after completion."
954
  ),
955
  "parameters": {
956
  "type": "object",
@@ -970,13 +955,8 @@ HF_JOBS_TOOL_SPEC = {
970
  "scheduled suspend",
971
  "scheduled resume",
972
  ],
973
- "description": (
974
- "Operation to execute. Valid values: [run, ps, logs, inspect, cancel, "
975
- "scheduled run, scheduled ps, scheduled inspect, scheduled delete, "
976
- "scheduled suspend, scheduled resume]"
977
- ),
978
  },
979
- # Python/UV specific parameters
980
  "script": {
981
  "type": "string",
982
  "description": (
@@ -988,44 +968,52 @@ HF_JOBS_TOOL_SPEC = {
988
  "dependencies": {
989
  "type": "array",
990
  "items": {"type": "string"},
991
- "description": "Pip packages to install. Example: ['trl', 'torch', 'datasets', 'transformers']. Only used with 'script'.",
 
 
 
 
992
  },
993
- # Docker specific parameters
994
  "image": {
995
  "type": "string",
996
- "description": "Docker image. Example: 'pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime'. Use with 'run'/'scheduled run'. Optional (auto-selected if not provided).",
997
  },
998
  "command": {
999
  "type": "array",
1000
  "items": {"type": "string"},
1001
- "description": "Command to execute as list. Example: ['python', 'train.py', '--epochs', '10']. Triggers Docker mode. Use with 'run'/'scheduled run'. Mutually exclusive with 'script'.",
1002
  },
1003
- # Hardware and environment
1004
  "hardware_flavor": {
1005
  "type": "string",
1006
- "description": f"Hardware type. Available CPU flavors: {CPU_FLAVORS}. Available GPU flavors: {GPU_FLAVORS}. Use with 'run'/'scheduled run'.",
 
 
 
 
1007
  },
1008
  "timeout": {
1009
  "type": "string",
1010
- "description": "Max runtime. Examples: '30m', '2h', '4h'. Default: '30m'. Important for long training jobs. Use with 'run'/'scheduled run'.",
 
 
 
 
1011
  },
1012
  "env": {
1013
  "type": "object",
1014
- "description": "Environment variables. Format: {'KEY': 'VALUE'}. HF_TOKEN is automatically included from your auth. Use with 'run'/'scheduled run'.",
1015
  },
1016
- # Job management parameters
1017
  "job_id": {
1018
  "type": "string",
1019
- "description": "Job ID to operate on. Required for: 'logs', 'inspect', 'cancel'.",
1020
  },
1021
- # Scheduled job parameters
1022
  "scheduled_job_id": {
1023
  "type": "string",
1024
- "description": "Scheduled job ID. Required for: 'scheduled inspect', 'scheduled delete', 'scheduled suspend', 'scheduled resume'.",
1025
  },
1026
  "schedule": {
1027
  "type": "string",
1028
- "description": "Schedule for recurring job. Presets: '@hourly', '@daily', '@weekly', '@monthly'. Cron: '0 9 * * 1' (Mon 9am). Required for: 'scheduled run'.",
1029
  },
1030
  },
1031
  "required": ["operation"],
 
29
  )
30
 
31
  # Hardware flavors
32
+ CPU_FLAVORS = ["cpu-basic", "cpu-upgrade"]
33
  GPU_FLAVORS = [
 
 
34
  "t4-small",
35
  "t4-medium",
 
 
 
 
 
36
  "a10g-small",
37
  "a10g-large",
38
  "a10g-largex2",
39
  "a10g-largex4",
40
  "a100-large",
41
+ "a100x4",
42
+ "a100x8",
43
+ "l4x1",
44
+ "l4x4",
45
+ "l40sx1",
46
+ "l40sx4",
47
+ "l40sx8",
48
  ]
49
 
50
  # Detailed specs for display (vCPU/RAM/GPU VRAM)
51
+ CPU_FLAVORS_DESC = "cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB)"
 
 
52
  GPU_FLAVORS_DESC = (
53
  "t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
54
+ "a10g-small(4vCPU/15GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
 
 
55
  "a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
56
+ "a100-large(12vCPU/142GB/GPU 80GB), a100x4(48vCPU/568GB/GPU 320GB), a100x8(96vCPU/1136GB/GPU 640GB), "
57
+ "l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
58
+ "l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB)"
59
  )
60
  SPECIALIZED_FLAVORS = ["inf2x6"]
61
  ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
 
117
  return logs
118
 
119
 
120
+ _DEFAULT_ENV = {
121
+ "HF_HUB_DISABLE_PROGRESS_BARS": "1",
122
+ "TQDM_DISABLE": "1",
123
+ "TRANSFORMERS_VERBOSITY": "warning",
124
+ "HF_HUB_ENABLE_HF_TRANSFER": "1",
125
+ }
126
+
127
+
128
+ def _add_default_env(params: Dict[str, Any] | None) -> Dict[str, Any]:
129
+ """Inject default env vars for clean, agent-friendly output."""
130
+ result = dict(_DEFAULT_ENV)
131
+ result.update(params or {}) # user-provided values override defaults
132
+ return result
133
+
134
+
135
  def _add_environment_variables(
136
  params: Dict[str, Any] | None, user_token: str | None = None
137
  ) -> Dict[str, Any]:
 
519
  self.api.run_job,
520
  image=image,
521
  command=command,
522
+ env=_add_default_env(args.get("env")),
523
  secrets=_add_environment_variables(args.get("secrets"), self.hf_token),
524
  flavor=args.get("hardware_flavor", "cpu-basic"),
525
  timeout=args.get("timeout", "30m"),
 
751
  image=image,
752
  command=command,
753
  schedule=schedule,
754
+ env=_add_default_env(args.get("env")),
755
  secrets=_add_environment_variables(args.get("secrets"), self.hf_token),
756
  flavor=args.get("hardware_flavor", "cpu-basic"),
757
  timeout=args.get("timeout", "30m"),
 
911
  HF_JOBS_TOOL_SPEC = {
912
  "name": "hf_jobs",
913
  "description": (
914
+ "Execute Python scripts or Docker containers on HF cloud infrastructure.\n\n"
915
+ "Two modes (mutually exclusive): Python mode (script + dependencies) or Docker mode (command + image). "
916
+ "Provide exactly ONE of 'script' or 'command'.\n\n"
917
+ "BEFORE submitting training/fine-tuning jobs:\n"
918
+ "- You MUST have called github_find_examples + github_read_file to find a working reference implementation. "
919
+ "Scripts based on your internal knowledge WILL use outdated APIs and fail.\n"
920
+ "- You MUST have validated dataset format via hf_inspect_dataset or hub_repo_details.\n"
921
+ "- Training config MUST include push_to_hub=True and hub_model_id. "
922
+ "Job storage is EPHEMERAL β€” all files are deleted when the job ends. Without push_to_hub, trained models are lost permanently.\n"
923
+ "- Include trackio monitoring and provide the dashboard URL to the user.\n\n"
924
+ "BATCH/ABLATION JOBS: Submit ONE job first. Check logs to confirm it starts training successfully. "
925
+ "Only then submit the remaining jobs. Never submit all at once β€” if there's a bug, all jobs fail.\n\n"
926
+ "Operations: run, ps, logs, inspect, cancel, scheduled run/ps/inspect/delete/suspend/resume.\n\n"
927
+ f"Hardware: CPU: {CPU_FLAVORS_DESC}. GPU: {GPU_FLAVORS_DESC}.\n"
928
+ "Common picks: t4-small ($0.60/hr, 1-3B), a10g-large ($2/hr, 7-13B), a100-large ($4/hr, 30B+), a100x4/a100x8 (multi-GPU, 70B+). "
929
+ "Note: a10g-small and a10g-large have the SAME 24GB GPU β€” the difference is CPU/RAM only.\n\n"
930
+ "OOM RECOVERY: When a training job fails with CUDA OOM:\n"
931
+ "1. Reduce per_device_train_batch_size and increase gradient_accumulation_steps proportionally (keep effective batch size identical)\n"
932
+ "2. Enable gradient_checkpointing=True\n"
933
+ "3. Upgrade to larger GPU (a10g→a100→a100x4/a100x8)\n"
934
+ "Do NOT switch training methods (e.g. full SFT to LoRA) or reduce max_length β€” those change what the user gets and require explicit approval.\n\n"
935
+ "Examples:\n"
936
+ "Training: {'operation': 'run', 'script': '/app/train.py', 'dependencies': ['transformers', 'trl', 'torch', 'datasets', 'trackio'], 'hardware_flavor': 'a100-large', 'timeout': '8h'}\n"
937
+ "Monitor: {'operation': 'ps'}, {'operation': 'logs', 'job_id': 'xxx'}, {'operation': 'cancel', 'job_id': 'xxx'}\n"
938
+ "Docker: {'operation': 'run', 'command': ['duckdb', '-c', 'select 1 + 2'], 'image': 'duckdb/duckdb', 'hardware_flavor': 'cpu-basic', 'timeout': '1h'}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
939
  ),
940
  "parameters": {
941
  "type": "object",
 
955
  "scheduled suspend",
956
  "scheduled resume",
957
  ],
958
+ "description": "Operation to execute.",
 
 
 
 
959
  },
 
960
  "script": {
961
  "type": "string",
962
  "description": (
 
968
  "dependencies": {
969
  "type": "array",
970
  "items": {"type": "string"},
971
+ "description": (
972
+ "Pip packages to install. Include ALL required packages. "
973
+ "Common training set: ['transformers', 'trl', 'torch', 'datasets', 'trackio', 'accelerate']. "
974
+ "Only used with 'script'."
975
+ ),
976
  },
 
977
  "image": {
978
  "type": "string",
979
+ "description": "Docker image. Optional β€” auto-selected if not provided. Use with 'command'.",
980
  },
981
  "command": {
982
  "type": "array",
983
  "items": {"type": "string"},
984
+ "description": "Command to execute as list. Triggers Docker mode. Mutually exclusive with 'script'.",
985
  },
 
986
  "hardware_flavor": {
987
  "type": "string",
988
+ "description": (
989
+ "Hardware type. Sizing guide: 1-3B params β†’ t4-small/a10g-small, "
990
+ "7-13B → a10g-large, 30B+ → a100-large, 70B+ → a100x4/a100x8. "
991
+ f"All options: CPU: {CPU_FLAVORS}. GPU: {GPU_FLAVORS}."
992
+ ),
993
  },
994
  "timeout": {
995
  "type": "string",
996
+ "description": (
997
+ "Maximum job runtime. MUST be >2h for any training job β€” default 30m kills training mid-run. "
998
+ "Guidelines: 1-3B models: 3-4h, 7-13B: 6-8h, 30B+: 12-24h. "
999
+ "Use 30m-1h only for quick data processing or inference tasks. Default: '30m'."
1000
+ ),
1001
  },
1002
  "env": {
1003
  "type": "object",
1004
+ "description": "Environment variables {'KEY': 'VALUE'}. HF_TOKEN is auto-included.",
1005
  },
 
1006
  "job_id": {
1007
  "type": "string",
1008
+ "description": "Job ID. Required for: logs, inspect, cancel.",
1009
  },
 
1010
  "scheduled_job_id": {
1011
  "type": "string",
1012
+ "description": "Scheduled job ID. Required for: scheduled inspect/delete/suspend/resume.",
1013
  },
1014
  "schedule": {
1015
  "type": "string",
1016
+ "description": "Cron schedule or preset (@hourly, @daily, @weekly, @monthly). Required for: scheduled run.",
1017
  },
1018
  },
1019
  "required": ["operation"],
agent/tools/plan_tool.py CHANGED
@@ -85,18 +85,11 @@ def get_current_plan() -> List[Dict[str, str]]:
85
  PLAN_TOOL_SPEC = {
86
  "name": "plan_tool",
87
  "description": (
88
- "Manage task planning and progress tracking with todo list (pending/in_progress/completed statuses). "
89
- "⚠️ CRITICAL: ALWAYS use for multi-step tasks (3+ steps) and MUST update frequently to show progress. "
90
- "**Use when:** (1) User provides multiple tasks, (2) Complex workflows (training, evaluation, data processing), "
91
- "(3) Tasks requiring multiple tool calls, (4) Need to communicate progress clearly to user, "
92
- "(5) Breaking down ambiguous requests into concrete steps. "
93
- "**Pattern:** Create plan at start β†’ Mark in_progress when starting task β†’ Mark completed immediately after finishing β†’ User sees clear progress. "
94
- "Each call replaces entire plan (full list required). "
95
- "**Critical for reliability:** Exactly ONE task in_progress at a time (not zero, not multiple). "
96
- "Mark tasks completed IMMEDIATELY after finishing - don't batch completions. "
97
- "**For long-running tasks:** Update plan after each major step to keep user informed. "
98
- "**Only mark completed when:** Task fully accomplished, no errors, all requirements met. "
99
- "Keep tasks pending if blocked/errors occur - create new task to resolve blockers."
100
  ),
101
  "parameters": {
102
  "type": "object",
 
85
  PLAN_TOOL_SPEC = {
86
  "name": "plan_tool",
87
  "description": (
88
+ "Track progress on multi-step tasks with a todo list (pending/in_progress/completed).\n\n"
89
+ "Use for tasks with 3+ steps. Each call replaces the entire plan (send full list).\n\n"
90
+ "Rules: exactly ONE task in_progress at a time. Mark completed immediately after finishing. "
91
+ "Only mark completed when the task fully succeeded β€” keep in_progress if there are errors. "
92
+ "Update frequently so the user sees progress."
 
 
 
 
 
 
 
93
  ),
94
  "parameters": {
95
  "type": "object",