Spaces: Running on CPU Upgrade

Commit 103298c · Parent(s): a644598

feat: use hf_agent tool descriptions, hardware flavors, and default env

Changed files:
- agent/tools/dataset_tools.py        +9  -16
- agent/tools/docs_tools.py           +10 -21
- agent/tools/github_find_examples.py +10 -49
- agent/tools/github_read_file.py     +6  -52
- agent/tools/jobs_tool.py            +77 -89
- agent/tools/plan_tool.py            +5  -12
agent/tools/dataset_tools.py
CHANGED
@@ -388,22 +388,15 @@ def _format_parquet_files(data: dict, max_rows: int = 10) -> str | None:
 HF_INSPECT_DATASET_TOOL_SPEC = {
     "name": "hf_inspect_dataset",
     "description": (
-        "Inspect a
-        "- Column names for your dataloader\n"
-        "- Data types and format\n"
-        "- Available splits (train/test/validation)\n\n"
-        "Supports private/gated datasets when HF_TOKEN is set.\n\n"
-        "## Examples\n"
-        '{"dataset": "stanfordnlp/imdb"}\n'
-        '{"dataset": "nyu-mll/glue", "config": "mrpc", "sample_rows": 5}\n'
+        "Inspect a HF dataset in one call: status, configs/splits, schema, sample rows, parquet info.\n\n"
+        "REQUIRED before any training job to verify dataset format matches training method:\n"
+        "  SFT: needs 'messages', 'text', or 'prompt'/'completion'\n"
+        "  DPO: needs 'prompt', 'chosen', 'rejected'\n"
+        "  GRPO: needs 'prompt'\n"
+        "All datasets used for training have to be in conversational ChatML format to be compatible with HF libraries.\n"
+        "Training will fail with KeyError if columns don't match.\n\n"
+        "Also use to get example datapoints, understand column names, data types, and available splits before writing any data loading code. "
+        "Supports private/gated datasets when HF_TOKEN is set."
     ),
     "parameters": {
         "type": "object",
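A minimal sketch of the conversational format the new description requires. This is not part of the commit: the row content is made up for illustration, and the "messages" role/content layout follows the TRL convention the description names.

from datasets import Dataset

# One SFT-ready row: a "messages" column holding role/content turns (ChatML-style).
rows = [
    {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris."},
        ]
    }
]

ds = Dataset.from_list(rows)
print(ds.column_names)  # ['messages'] -- the column the SFT check above looks for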
agent/tools/docs_tools.py
CHANGED
@@ -845,17 +845,12 @@ DOC_ENDPOINTS = [
 EXPLORE_HF_DOCS_TOOL_SPEC = {
     "name": "explore_hf_docs",
     "description": (
-        "**Pattern:** explore (discover structure) → fetch_hf_docs (get details) → implement with researched approach. "
-        "Returns: Sidebar navigation with titles, URLs, and glimpses of all pages in the selected documentation. "
-        "**Then:** Use fetch_hf_docs with specific URLs from results to get full content. "
-        "**Critical for reliability:** Never implement based on internal knowledge without checking current docs first - APIs change frequently."
-        " By default returns the top 20 results; set max_results (max 50) to adjust."
+        "Browse HF documentation structure - discover all available documentation with 200-char previews.\n\n"
+        "Use this to find relevant documentation and/or examples with detailed parameter docs and API reference. "
+        "To be used together with github_find_examples and github_read_file to find working examples and documentation.\n\n"
+        "Pattern: explore_hf_docs (find relevant pages) → fetch_hf_docs (get full content).\n\n"
+        "For training tasks: fetch the trainer config docs (SFTConfig, DPOConfig, GRPOConfig) to verify parameter names. "
+        "Returns top 20 results by default; set max_results (max 50) to adjust."
     ),
     "parameters": {
         "type": "object",

@@ -928,16 +923,10 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
 HF_DOCS_FETCH_TOOL_SPEC = {
     "name": "fetch_hf_docs",
     "description": (
-        "Fetch full markdown content of
-        "(5) Need parameter descriptions and usage patterns. "
-        "**Pattern:** explore_hf_docs (find relevant page) → fetch_hf_docs (get full content) → implement using documented approach. "
-        "Provide full URL from explore_hf_docs results (e.g., 'https://huggingface.co/docs/trl/sft_trainer'). "
-        "Returns: Complete markdown documentation with examples, parameters, and usage patterns. "
-        "**For training tasks:** ALWAYS fetch trainer docs (SFTConfig, DPOConfig, etc.) before creating training scripts. "
-        "**Critical for reliability:** This ensures you use current APIs and best practices."
+        "Fetch full markdown content of an HF documentation page. Use after explore_hf_docs.\n\n"
+        "Critical for finding documentation, e.g. current trainer configuration parameters (SFTConfig, DPOConfig, etc.). "
+        "Use for researching solutions and before writing training scripts. Your internal knowledge is outdated.\n\n"
+        "Provide the full URL from explore_hf_docs results. The .md extension is added automatically."
     ),
     "parameters": {
         "type": "object",
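A hedged sketch of the explore → fetch sequence the two descriptions above encode, written as tool-call payloads. The argument names are assumptions (the parameter schemas are truncated in this diff); the URL is the one cited in the old fetch_hf_docs description.

# Step 1: discover pages (argument names are illustrative, not from the spec).
explore_call = {"name": "explore_hf_docs", "arguments": {"query": "SFTConfig", "max_results": 20}}

# Step 2: fetch full content for a URL returned by step 1.
fetch_call = {"name": "fetch_hf_docs", "arguments": {"url": "https://huggingface.co/docs/trl/sft_trainer"}}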
agent/tools/github_find_examples.py
CHANGED
@@ -405,55 +405,16 @@ def find_examples(
 GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
     "name": "github_find_examples",
     "description": (
-        "## How it works\n\n"
-        "1. Fetches all example files (examples/, scripts/, tutorials/, demos/, notebooks/, etc.) from repository\n"
-        "2. If keyword provided, scores files against keyword using fuzzy matching\n"
-        "3. Returns best matches sorted by relevance and pattern priority\n"
-        "4. Provides copyable parameters for github_read_file tool\n\n"
-        "## Examples\n\n"
-        "<example>\n"
-        "// ML Workflow Step: Find GRPO training examples before implementation\n"
-        "// Task: Starting GRPO fine-tuning project, need reference implementation\n"
-        "{\n"
-        "  keyword: 'grpo',\n"
-        "  repo: 'trl',\n"
-        "  org: 'huggingface'\n"
-        "}\n"
-        "// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n"
-        "// Next step: github_read_file to study working implementation\n"
-        "</example>\n\n"
-        "<example>\n"
-        "// ML Workflow Step: Discover all available training methods\n"
-        "// Task: Exploring TRL training options before choosing approach\n"
-        "{\n"
-        "  repo: 'trl',\n"
-        "  org: 'huggingface',\n"
-        "  max_results: 20\n"
-        "}\n"
-        "// Lists: SFT, DPO, GRPO, PPO, reward modeling examples\n"
-        "// Helps user choose appropriate method\n"
-        "</example>\n\n"
-        "<example>\n"
-        "// ML Workflow Step: Find LoRA fine-tuning examples\n"
-        "// Task: Learning parameter-efficient fine-tuning patterns\n"
-        "{\n"
-        "  keyword: 'lora',\n"
-        "  repo: 'peft',\n"
-        "  org: 'huggingface'\n"
-        "}\n"
-        "// Discovers LoRA configuration and training examples\n"
-        "// Shows current PEFT API usage patterns\n"
-        "</example>"
+        "Find working example scripts in GitHub repositories (from a list of predetermined directories, e.g. examples/, scripts/, tutorials/, etc.). "
+        "Uses fuzzy keyword matching.\n\n"
+        "MANDATORY before writing any ML training, fine-tuning, or inference code. "
+        "Your internal knowledge of library APIs is outdated - working examples show current API patterns.\n\n"
+        "Sequence: github_find_examples → github_read_file (study the example) → implement based on what you found.\n\n"
+        "Skip this only for: simple data queries, status checks, non-code tasks.\n\n"
+        "Examples:\n"
+        "  {keyword: 'sft', repo: 'trl'} → finds examples/scripts/sft.py\n"
+        "  {keyword: 'grpo', repo: 'trl'} → finds GRPO training examples\n"
+        "  {repo: 'trl', max_results: 20} → lists all available training method examples"
     ),
     "parameters": {
         "type": "object",
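The removed "How it works" list mentions scoring files against the keyword with fuzzy matching. A minimal sketch of that idea, using only the standard library; this is an assumption for illustration, not the repo's actual scorer.

from difflib import SequenceMatcher

def score(path: str, keyword: str) -> float:
    # Compare the keyword against the filename, case-insensitively.
    name = path.rsplit("/", 1)[-1].lower()
    return SequenceMatcher(None, keyword.lower(), name).ratio()

paths = ["examples/scripts/sft.py", "examples/scripts/grpo_vlm.py"]
best = max(paths, key=lambda p: score(p, "grpo"))
print(best)  # examples/scripts/grpo_vlm.py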
agent/tools/github_read_file.py
CHANGED
@@ -250,59 +250,13 @@ def read_file(
 GITHUB_READ_FILE_TOOL_SPEC = {
     "name": "github_read_file",
     "description": (
-        "Read file contents from GitHub repositories
-        "**Pattern:** github_find_examples (discover files) → github_read_file (read code) → implement using researched patterns. "
-        "Returns: File contents with line numbers, formatted for LLM reading. Auto-converts Jupyter notebooks to markdown. "
-        "**Then:** Implement using patterns and APIs from the example code. "
-        "**Critical for reliability:** Reading working examples prevents API errors and shows current best practices. "
+        "Read file contents from GitHub repositories. Returns first 300 lines by default. "
+        "Auto-converts Jupyter notebooks to markdown.\n\n"
+        "Use AFTER github_find_examples to study the working implementation. "
+        "The purpose is to learn current API patterns - imports, trainer configs, dataset handling - "
+        "so your implementation uses correct, up-to-date code.\n\n"
         "Use line_start/line_end for large files (>300 lines) to read specific sections.\n\n"
-        "## When to use this tool\n\n"
-        "- When reading example code, trainer implementations, or configuration files\n"
-        "- After github_find_examples returns file paths you want to study\n"
-        "- When investigating specific code sections with line ranges\n"
-        "- When reading from specific branches, tags, or commits (use ref parameter)\n\n"
-        "## When NOT to use this tool\n\n"
-        "- When you don't know exact file path (use github_find_examples or github_search_code first)\n"
-        "- When searching for code patterns across repos (use github_search_code instead)\n\n"
-        "## Examples\n\n"
-        "<example>\n"
-        "// ML Workflow Step: Read GRPO trainer class after finding via github_find_examples\n"
-        "// Use case: Understand GRPOTrainer API, parameters, and methods\n"
-        "{\n"
-        "  repo: 'huggingface/trl',\n"
-        "  path: 'trl/trainer/grpo_trainer.py',\n"
-        "  line_start: 1,\n"
-        "  line_end: 200\n"
-        "}\n"
-        "// Read class definition and constructor to understand current API\n"
-        "// Shows: __init__ parameters, configuration, required arguments\n"
-        "</example>\n\n"
-        "<example>\n"
-        "// ML Workflow Step: Study complete training script from examples\n"
-        "// Use case: Learn end-to-end VLM fine-tuning workflow\n"
-        "{\n"
-        "  repo: 'huggingface/trl',\n"
-        "  path: 'examples/scripts/grpo_vlm.py'\n"
-        "}\n"
-        "// Returns first 300 lines - shows full training setup\n"
-        "// Use line_start/line_end if need to read more\n"
-        "</example>\n\n"
-        "<example>\n"
-        "// ML Workflow Step: Check TrainingArguments configuration patterns\n"
-        "// Use case: Learn how to structure training configs correctly\n"
-        "{\n"
-        "  repo: 'huggingface/transformers',\n"
-        "  path: 'examples/pytorch/language-modeling/run_clm.py',\n"
-        "  line_start: 50,\n"
-        "  line_end: 150\n"
-        "}\n"
-        "// Read argument parsing and config setup section\n"
-        "// Shows: current parameter names, default values, best practices\n"
-        "</example>"
+        "When NOT to use: when you don't know the file path (use github_find_examples first)."
     ),
     "parameters": {
         "type": "object",
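The line_start/line_end guidance above amounts to paging. Reusing the parameter names that appear in the removed examples, two back-to-back payloads reading a large file in 300-line windows might look like this (a sketch, not a documented recipe):

# Hypothetical successive tool payloads paging through a file >300 lines.
page_1 = {"repo": "huggingface/trl", "path": "trl/trainer/grpo_trainer.py", "line_start": 1, "line_end": 300}
page_2 = {"repo": "huggingface/trl", "path": "trl/trainer/grpo_trainer.py", "line_start": 301, "line_end": 600}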
agent/tools/jobs_tool.py
CHANGED
@@ -29,38 +29,33 @@ from agent.tools.utilities import (
 )
 
 # Hardware flavors
-CPU_FLAVORS = ["cpu-basic", "cpu-upgrade", "cpu-performance", "cpu-xl"]
+CPU_FLAVORS = ["cpu-basic", "cpu-upgrade"]
 GPU_FLAVORS = [
-    "sprx8",
-    "zero-a10g",
     "t4-small",
     "t4-medium",
-    "l4x1",
-    "l4x4",
-    "l40sx1",
-    "l40sx4",
-    "l40sx8",
     "a10g-small",
     "a10g-large",
     "a10g-largex2",
     "a10g-largex4",
     "a100-large",
+    "a100x4",
+    "a100x8",
+    "l4x1",
+    "l4x4",
+    "l40sx1",
+    "l40sx4",
+    "l40sx8",
 ]
 
 # Detailed specs for display (vCPU/RAM/GPU VRAM)
-CPU_FLAVORS_DESC = (
-    "cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB), cpu-performance, cpu-xl"
-)
+CPU_FLAVORS_DESC = "cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB)"
 GPU_FLAVORS_DESC = (
     "t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
-    "l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB), "
-    "a10g-small(4vCPU/14GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
+    "a10g-small(4vCPU/15GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
     "a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
-    "a100-large(12vCPU/142GB/GPU 80GB),
+    "a100-large(12vCPU/142GB/GPU 80GB), a100x4(48vCPU/568GB/GPU 320GB), a100x8(96vCPU/1136GB/GPU 640GB), "
+    "l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
+    "l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB)"
 )
 SPECIALIZED_FLAVORS = ["inf2x6"]
 ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS

@@ -122,6 +117,21 @@ def _filter_uv_install_output(logs: list[str]) -> list[str]:
     return logs
 
 
+_DEFAULT_ENV = {
+    "HF_HUB_DISABLE_PROGRESS_BARS": "1",
+    "TQDM_DISABLE": "1",
+    "TRANSFORMERS_VERBOSITY": "warning",
+    "HF_HUB_ENABLE_HF_TRANSFER": "1",
+}
+
+
+def _add_default_env(params: Dict[str, Any] | None) -> Dict[str, Any]:
+    """Inject default env vars for clean, agent-friendly output."""
+    result = dict(_DEFAULT_ENV)
+    result.update(params or {})  # user-provided values override defaults
+    return result
+
+
 def _add_environment_variables(
     params: Dict[str, Any] | None, user_token: str | None = None
 ) -> Dict[str, Any]:

@@ -509,7 +519,7 @@ class HfJobsTool:
             self.api.run_job,
             image=image,
             command=command,
-            env=args.get("env"),
+            env=_add_default_env(args.get("env")),
             secrets=_add_environment_variables(args.get("secrets"), self.hf_token),
             flavor=args.get("hardware_flavor", "cpu-basic"),
             timeout=args.get("timeout", "30m"),

@@ -741,7 +751,7 @@ To verify, call this tool with `{{"operation": "inspect", "job_id": "{job_id}"}}`
             image=image,
             command=command,
             schedule=schedule,
-            env=args.get("env"),
+            env=_add_default_env(args.get("env")),
             secrets=_add_environment_variables(args.get("secrets"), self.hf_token),
             flavor=args.get("hardware_flavor", "cpu-basic"),
             timeout=args.get("timeout", "30m"),
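The merge order in _add_default_env means user-supplied values win over the defaults, key by key. A quick check against the function as committed above (MY_FLAG is an arbitrary illustrative key):

# Defaults are applied first, then user values override per key.
merged = _add_default_env({"TQDM_DISABLE": "0", "MY_FLAG": "on"})
assert merged["TQDM_DISABLE"] == "0"               # user override wins
assert merged["HF_HUB_ENABLE_HF_TRANSFER"] == "1"  # default preserved
assert merged["MY_FLAG"] == "on"                   # extra user key kept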
@@ -901,56 +911,31 @@ To inspect, call this tool with `{{"operation": "scheduled inspect", "scheduled_job_id": "{scheduled_job_id}"}}`
 HF_JOBS_TOOL_SPEC = {
     "name": "hf_jobs",
     "description": (
-        "Execute Python scripts or Docker containers on HF cloud infrastructure
-        "❌ DON'T poll logs automatically\n"
-        "❌ DON'T wait for completion\n"
-        "❌ DON'T check status unless user asks\n\n"
-        "**For Training Tasks:**\n"
-        "• ALWAYS research TRL docs first: explore_hf_docs('trl') → fetch_hf_docs(<trainer_url>)\n"
-        "• ALWAYS validate dataset format with hub_repo_details (SFT needs messages/text, DPO needs chosen/rejected)\n"
-        "• ALWAYS include Trackio monitoring in script (explore_hf_docs('trackio'))\n"
-        "• ALWAYS enable push_to_hub=True in training config\n"
-        "• Set timeout 2-8h for training (NOT default 30m)\n"
-        "• Confirm model/dataset choices with user before submitting\n\n"
-        "**Examples:**\n\n"
-        "**Training - Fine-tune LLM:**\n"
-        "{'operation': 'run', 'script': '# Training script with TRL\\nfrom trl import SFTConfig, SFTTrainer\\nfrom transformers import AutoModelForCausalLM\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-4B\")\\n# ... researched implementation from docs ...\\ntrainer.train()\\ntrainer.push_to_hub(\"user-name/my-model\")', 'dependencies': ['transformers', 'trl', 'torch', 'datasets', 'trackio'], 'hardware_flavor': 'a10g-large', 'timeout': '4h'}\n\n"
-        "**Data Processing:**\n"
-        "{'operation': 'run', 'script': 'from datasets import load_dataset\\nds = load_dataset(\"data\")\\n# process...\\nds.push_to_hub(\"user/processed\")', 'dependencies': ['datasets', 'pandas'], 'hardware_flavor': 'cpu-upgrade', 'timeout': '2h'}\n\n"
-        "**Scheduled Daily Job:**\n"
-        "{'operation': 'scheduled run', 'schedule': '@daily', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'hardware_flavor': 'cpu-basic'}\n\n"
-        "**Docker Mode:**\n"
-        "{'operation': 'run', 'image': 'pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime', 'command': ['python', 'train.py', '--epochs', '10'], 'hardware_flavor': 'a100-large'}\n\n"
-        "**Monitor Operations:**\n"
-        "{'operation': 'ps'} - List all jobs\n"
-        "{'operation': 'logs', 'job_id': 'xxx'} - Stream logs (only when user requests)\n"
-        "{'operation': 'inspect', 'job_id': 'xxx'} - Get job details\n"
-        "{'operation': 'cancel', 'job_id': 'xxx'} - Stop job\n\n"
-        "⚠️ CRITICAL: Files created during execution are DELETED when job finishes. MUST push_to_hub() all outputs (models, datasets, artifacts) in script. For logs/scripts, use hf_private_repos after completion."
+        "Execute Python scripts or Docker containers on HF cloud infrastructure.\n\n"
+        "Two modes (mutually exclusive): Python mode (script + dependencies) or Docker mode (command + image). "
+        "Provide exactly ONE of 'script' or 'command'.\n\n"
+        "BEFORE submitting training/fine-tuning jobs:\n"
+        "- You MUST have called github_find_examples + github_read_file to find a working reference implementation. "
+        "Scripts based on your internal knowledge WILL use outdated APIs and fail.\n"
+        "- You MUST have validated dataset format via hf_inspect_dataset or hub_repo_details.\n"
+        "- Training config MUST include push_to_hub=True and hub_model_id. "
+        "Job storage is EPHEMERAL - all files are deleted when the job ends. Without push_to_hub, trained models are lost permanently.\n"
+        "- Include trackio monitoring and provide the dashboard URL to the user.\n\n"
+        "BATCH/ABLATION JOBS: Submit ONE job first. Check logs to confirm it starts training successfully. "
+        "Only then submit the remaining jobs. Never submit all at once - if there's a bug, all jobs fail.\n\n"
+        "Operations: run, ps, logs, inspect, cancel, scheduled run/ps/inspect/delete/suspend/resume.\n\n"
+        f"Hardware: CPU: {CPU_FLAVORS_DESC}. GPU: {GPU_FLAVORS_DESC}.\n"
+        "Common picks: t4-small ($0.60/hr, 1-3B), a10g-large ($2/hr, 7-13B), a100-large ($4/hr, 30B+), h100 ($6/hr, 70B+). "
+        "Note: a10g-small and a10g-large have the SAME 24GB GPU - the difference is CPU/RAM only.\n\n"
+        "OOM RECOVERY: When a training job fails with CUDA OOM:\n"
+        "1. Reduce per_device_train_batch_size and increase gradient_accumulation_steps proportionally (keep effective batch size identical)\n"
+        "2. Enable gradient_checkpointing=True\n"
+        "3. Upgrade to larger GPU (a10g → a100 → h100)\n"
+        "Do NOT switch training methods (e.g. full SFT to LoRA) or reduce max_length - those change what the user gets and require explicit approval.\n\n"
+        "Examples:\n"
+        "Training: {'operation': 'run', 'script': '/app/train.py', 'dependencies': ['transformers', 'trl', 'torch', 'datasets', 'trackio'], 'hardware_flavor': 'a100-large', 'timeout': '8h'}\n"
+        "Monitor: {'operation': 'ps'}, {'operation': 'logs', 'job_id': 'xxx'}, {'operation': 'cancel', 'job_id': 'xxx'}\n"
+        "Docker: {'operation': 'run', 'command': ['duckdb', '-c', 'select 1 + 2'], 'image': 'duckdb/duckdb', 'hardware_flavor': 'cpu-basic', 'timeout': '1h'}"
     ),
     "parameters": {
         "type": "object",

@@ -970,13 +955,8 @@ HF_JOBS_TOOL_SPEC = {
             "scheduled suspend",
             "scheduled resume",
         ],
-        "description": (
-            "Operation to execute. Valid values: [run, ps, logs, inspect, cancel, "
-            "scheduled run, scheduled ps, scheduled inspect, scheduled delete, "
-            "scheduled suspend, scheduled resume]"
-        ),
+        "description": "Operation to execute.",
     },
-    # Python/UV specific parameters
     "script": {
         "type": "string",
         "description": (

@@ -988,44 +968,52 @@ HF_JOBS_TOOL_SPEC = {
     "dependencies": {
         "type": "array",
         "items": {"type": "string"},
+        "description": (
+            "Pip packages to install. Include ALL required packages. "
+            "Common training set: ['transformers', 'trl', 'torch', 'datasets', 'trackio', 'accelerate']. "
+            "Only used with 'script'."
+        ),
     },
-    # Docker specific parameters
     "image": {
         "type": "string",
-        "description": "Docker image.
+        "description": "Docker image. Optional - auto-selected if not provided. Use with 'command'.",
     },
     "command": {
        "type": "array",
        "items": {"type": "string"},
-        "description": "Command to execute as list.
+        "description": "Command to execute as list. Triggers Docker mode. Mutually exclusive with 'script'.",
     },
-    # Hardware and environment
     "hardware_flavor": {
         "type": "string",
-        "description":
+        "description": (
+            "Hardware type. Sizing guide: 1-3B params → t4-small/a10g-small, "
+            "7-13B → a10g-large, 30B+ → a100-large, 70B+ → h100/h100x8. "
+            f"All options: CPU: {CPU_FLAVORS}. GPU: {GPU_FLAVORS}."
+        ),
     },
     "timeout": {
         "type": "string",
-        "description":
+        "description": (
+            "Maximum job runtime. MUST be >2h for any training job - default 30m kills training mid-run. "
+            "Guidelines: 1-3B models: 3-4h, 7-13B: 6-8h, 30B+: 12-24h. "
+            "Use 30m-1h only for quick data processing or inference tasks. Default: '30m'."
+        ),
     },
     "env": {
         "type": "object",
-        "description": "Environment variables
+        "description": "Environment variables {'KEY': 'VALUE'}. HF_TOKEN is auto-included.",
     },
-    # Job management parameters
     "job_id": {
         "type": "string",
-        "description": "Job ID
+        "description": "Job ID. Required for: logs, inspect, cancel.",
     },
-    # Scheduled job parameters
     "scheduled_job_id": {
         "type": "string",
-        "description": "Scheduled job ID. Required for:
+        "description": "Scheduled job ID. Required for: scheduled inspect/delete/suspend/resume.",
     },
     "schedule": {
         "type": "string",
-        "description": "
+        "description": "Cron schedule or preset (@hourly, @daily, @weekly, @monthly). Required for: scheduled run.",
     },
 },
 "required": ["operation"],
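The OOM recovery rule above keeps the effective batch size constant while cutting per-device memory. A minimal sketch of what that adjustment looks like in a TRL config (SFTConfig is the class the descriptions reference; the concrete numbers, output_dir, and hub_model_id are illustrative):

from trl import SFTConfig

# Before OOM: effective batch = 8 per device * 2 accumulation steps = 16.
config = SFTConfig(per_device_train_batch_size=8, gradient_accumulation_steps=2,
                   output_dir="out", push_to_hub=True, hub_model_id="user-name/my-model")

# After OOM: cut the per-device batch 4x and raise accumulation 4x,
# so the effective batch is still 2 * 8 = 16; checkpointing trades compute for memory.
config = SFTConfig(per_device_train_batch_size=2, gradient_accumulation_steps=8,
                   gradient_checkpointing=True,
                   output_dir="out", push_to_hub=True, hub_model_id="user-name/my-model")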
agent/tools/plan_tool.py
CHANGED
@@ -85,18 +85,11 @@ def get_current_plan() -> List[Dict[str, str]]:
 PLAN_TOOL_SPEC = {
     "name": "plan_tool",
     "description": (
-        "**Pattern:** Create plan at start → Mark in_progress when starting task → Mark completed immediately after finishing → User sees clear progress. "
-        "Each call replaces entire plan (full list required). "
-        "**Critical for reliability:** Exactly ONE task in_progress at a time (not zero, not multiple). "
-        "Mark tasks completed IMMEDIATELY after finishing - don't batch completions. "
-        "**For long-running tasks:** Update plan after each major step to keep user informed. "
-        "**Only mark completed when:** Task fully accomplished, no errors, all requirements met. "
-        "Keep tasks pending if blocked/errors occur - create new task to resolve blockers."
+        "Track progress on multi-step tasks with a todo list (pending/in_progress/completed).\n\n"
+        "Use for tasks with 3+ steps. Each call replaces the entire plan (send full list).\n\n"
+        "Rules: exactly ONE task in_progress at a time. Mark completed immediately after finishing. "
+        "Only mark completed when the task fully succeeded - keep in_progress if there are errors. "
+        "Update frequently so the user sees progress."
     ),
     "parameters": {
         "type": "object",