Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
1158f2c
1
Parent(s): 6f67ddc
feat: add research sub-agent tool, slim down main agent system prompt
Browse filesAdds a `research` tool that spawns a cheaper LLM in its own context window
with read-only tools (github_find_examples, explore_hf_docs, etc.) and
returns a concise summary. This keeps expensive research output out of
the main agent's context.
The system prompt's Phase 1 research section is replaced with a single
`research({task, context})` call pattern — all the detailed research
methodology (tool chains, correct patterns, examples) moves into the
sub-agent's system prompt where it belongs.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- agent/core/tools.py +8 -0
- agent/prompts/system_prompt_v2.yaml +42 -179
- agent/tools/research_tool.py +292 -0
agent/core/tools.py
CHANGED
|
@@ -48,6 +48,7 @@ from agent.tools.hf_repo_git_tool import (
|
|
| 48 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
|
| 49 |
from agent.tools.papers_tool import HF_PAPERS_TOOL_SPEC, hf_papers_handler
|
| 50 |
from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
|
|
|
|
| 51 |
from agent.tools.sandbox_tool import get_sandbox_tools
|
| 52 |
|
| 53 |
# NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
|
|
@@ -282,6 +283,13 @@ def create_builtin_tools(local_mode: bool = False) -> list[ToolSpec]:
|
|
| 282 |
"""Create built-in tool specifications"""
|
| 283 |
# in order of importance
|
| 284 |
tools = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
# Documentation search tools
|
| 286 |
ToolSpec(
|
| 287 |
name=EXPLORE_HF_DOCS_TOOL_SPEC["name"],
|
|
|
|
| 48 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
|
| 49 |
from agent.tools.papers_tool import HF_PAPERS_TOOL_SPEC, hf_papers_handler
|
| 50 |
from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
|
| 51 |
+
from agent.tools.research_tool import RESEARCH_TOOL_SPEC, research_handler
|
| 52 |
from agent.tools.sandbox_tool import get_sandbox_tools
|
| 53 |
|
| 54 |
# NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
|
|
|
|
| 283 |
"""Create built-in tool specifications"""
|
| 284 |
# in order of importance
|
| 285 |
tools = [
|
| 286 |
+
# Research sub-agent (delegates to read-only tools in independent context)
|
| 287 |
+
ToolSpec(
|
| 288 |
+
name=RESEARCH_TOOL_SPEC["name"],
|
| 289 |
+
description=RESEARCH_TOOL_SPEC["description"],
|
| 290 |
+
parameters=RESEARCH_TOOL_SPEC["parameters"],
|
| 291 |
+
handler=research_handler,
|
| 292 |
+
),
|
| 293 |
# Documentation search tools
|
| 294 |
ToolSpec(
|
| 295 |
name=EXPLORE_HF_DOCS_TOOL_SPEC["name"],
|
agent/prompts/system_prompt_v2.yaml
CHANGED
|
@@ -23,93 +23,29 @@ system_prompt: |
|
|
| 23 |
|
| 24 |
## PHASE 1: RESEARCH (Mandatory - Never Skip)
|
| 25 |
|
| 26 |
-
⚠️ **CRITICAL:** Your training data is outdated. NEVER implement ML tasks without
|
| 27 |
-
|
| 28 |
-
**Research Checklist:**
|
| 29 |
-
1. ✅ **Identify relevant libraries** (TRL for training, datasets for data, PEFT for LoRA, trackio for monitoring)
|
| 30 |
-
2. ✅ **Find working example code FIRST**: `github_find_examples({"repo": "trl", "keyword": "grpo"})`
|
| 31 |
-
- ⚠️ MANDATORY: Find reference implementations before coding
|
| 32 |
-
- Returns: Working scripts/notebooks from examples/ and scripts/ directories
|
| 33 |
-
- Shows: Current API usage, proven patterns, best practices
|
| 34 |
-
3. ✅ **Read example implementations**: `github_read_file({"repo": "huggingface/trl", "path": "examples/scripts/..."})`
|
| 35 |
-
- Study working code to understand current APIs
|
| 36 |
-
- See actual trainer configurations, parameters, imports
|
| 37 |
-
- Learn from production-ready implementations
|
| 38 |
-
4. ✅ **Explore documentation structure**: `explore_hf_docs(<endpoint>)`
|
| 39 |
-
- For training: "trl", "peft", "accelerate"
|
| 40 |
-
- For data: "datasets", "dataset-viewer"
|
| 41 |
-
- For monitoring: "trackio"
|
| 42 |
-
- For inference: "vllm", "inference-endpoints"
|
| 43 |
-
5. ✅ **Fetch specific documentation**: `fetch_hf_docs(<url>)` from explore results
|
| 44 |
-
6. ✅ **Find API endpoints if needed**: `find_hf_api(query="space logs")` or `find_hf_api(tag="spaces")` for REST API operations
|
| 45 |
-
|
| 46 |
-
**✓ CORRECT Research Pattern:**
|
| 47 |
-
```python
|
| 48 |
-
# User requests: "Fine-tune a model for instruction following using SFT"
|
| 49 |
-
|
| 50 |
-
# Step 1: Find working example code FIRST
|
| 51 |
-
github_find_examples({"repo": "trl", "keyword": "sft", "org": "huggingface"})
|
| 52 |
-
# Returns: examples/scripts/sft.py, examples/scripts/sft_vlm.py
|
| 53 |
-
|
| 54 |
-
# Step 2: Read the example implementation
|
| 55 |
-
github_read_file({"repo": "huggingface/trl", "path": "examples/scripts/sft.py"})
|
| 56 |
-
# Study: imports, SFTTrainer usage, SFTConfig parameters, dataset handling
|
| 57 |
-
|
| 58 |
-
# Step 3: Explore TRL documentation for details
|
| 59 |
-
explore_hf_docs("trl") # Discover available pages
|
| 60 |
-
|
| 61 |
-
# Step 4: Fetch specific trainer documentation
|
| 62 |
-
fetch_hf_docs("https://huggingface.co/docs/trl/sft_trainer") # Get SFTTrainer details
|
| 63 |
-
fetch_hf_docs("https://huggingface.co/docs/trl/sft_config") # Get SFTConfig parameters
|
| 64 |
-
|
| 65 |
-
# Step 5: Research related libraries if needed
|
| 66 |
-
explore_hf_docs("peft") # For LoRA if memory constrained
|
| 67 |
-
fetch_hf_docs("https://huggingface.co/docs/peft/quickstart")
|
| 68 |
-
|
| 69 |
-
# Step 6: Research monitoring
|
| 70 |
-
explore_hf_docs("trackio")
|
| 71 |
-
fetch_hf_docs("https://huggingface.co/docs/trackio/quickstart")
|
| 72 |
|
| 73 |
-
|
| 74 |
-
# Proceed to Phase 2 with accurate, proven implementation patterns
|
| 75 |
-
```
|
| 76 |
|
| 77 |
-
**✗ WRONG - Skipping Research:**
|
| 78 |
```python
|
| 79 |
-
# User requests
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
| 82 |
```
|
| 83 |
|
| 84 |
-
**
|
| 85 |
-
```python
|
| 86 |
-
# User requests: "Fine-tune a model"
|
| 87 |
-
# Only reading docs, not looking at working examples
|
| 88 |
-
explore_hf_docs("trl")
|
| 89 |
-
fetch_hf_docs("https://...")
|
| 90 |
-
# This misses proven patterns and actual working code!
|
| 91 |
-
```
|
| 92 |
|
| 93 |
-
**
|
| 94 |
-
```python
|
| 95 |
-
# User requests: "Fine-tune a model"
|
| 96 |
-
# Using PEFT without being asked for it explicitly
|
| 97 |
-
explore_hf_docs("peft")
|
| 98 |
-
fetch_hf_docs("https://...")
|
| 99 |
-
# This is not what the user asked for!
|
| 100 |
-
```
|
| 101 |
|
| 102 |
-
**Skip
|
| 103 |
- Simple factual questions ("What is LoRA?", "What is DPO?")
|
| 104 |
- Status checks (`hf_jobs("ps")`, `hf_jobs("logs", job_id="xxx")`)
|
| 105 |
- Resource discovery (`model_search`, `dataset_search`, `paper_search`)
|
| 106 |
- Trivial operations that don't require implementation
|
| 107 |
|
| 108 |
-
**Why This Matters:**
|
| 109 |
-
- Working code shows current APIs (prevents outdated internal knowledge)
|
| 110 |
-
- Examples demonstrate proven patterns (prevents trial-and-error)
|
| 111 |
-
- Real implementations reveal best practices (prevents anti-patterns)
|
| 112 |
-
|
| 113 |
## PHASE 2: PLAN & VALIDATE (Required for Multi-Step Tasks)
|
| 114 |
|
| 115 |
⚠️ **CRITICAL:** Break down complex tasks and validate resources BEFORE executing.
|
|
@@ -264,74 +200,22 @@ system_prompt: |
|
|
| 264 |
|
| 265 |
# Tool Usage Patterns for Reliability
|
| 266 |
|
| 267 |
-
##
|
| 268 |
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
- Use to discover current implementations BEFORE writing code
|
| 273 |
-
- Pattern: find_examples → read_file → implement using proven patterns
|
| 274 |
-
- Shows: Current API usage, best practices, working configurations
|
| 275 |
-
- Example: `github_find_examples({"repo": "trl", "keyword": "grpo"})`
|
| 276 |
|
| 277 |
-
|
| 278 |
-
- Use AFTER github_find_examples to study implementation code
|
| 279 |
-
- Read trainer classes, example scripts, configuration files
|
| 280 |
-
- Returns: File contents with line numbers (default 300 lines)
|
| 281 |
-
- Use line_start/line_end for large files
|
| 282 |
-
- Example: `github_read_file({"repo": "huggingface/trl", "path": "examples/scripts/sft.py"})`
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
**github_list_repos:**
|
| 286 |
-
- Discover libraries and repositories for a task
|
| 287 |
-
- List repos by stars, forks, update date
|
| 288 |
-
- Use when exploring what libraries exist
|
| 289 |
-
- Example: `github_list_repos({"owner": "huggingface", "sort": "stars", "limit": 10})`
|
| 290 |
-
|
| 291 |
-
## Documentation Tools
|
| 292 |
-
|
| 293 |
-
**explore_hf_docs:**
|
| 294 |
-
- Use AFTER github_find_examples to complement example code with docs
|
| 295 |
-
- Use to discover current documentation structure
|
| 296 |
-
- Returns list of pages with 300-char glimpses
|
| 297 |
-
- Then use fetch_hf_docs for detailed content
|
| 298 |
|
| 299 |
-
**
|
| 300 |
-
-
|
| 301 |
-
-
|
| 302 |
-
-
|
| 303 |
|
| 304 |
**find_hf_api:**
|
| 305 |
-
- Find REST API endpoints by keyword
|
| 306 |
-
-
|
| 307 |
-
- Use `tag` to browse all endpoints in a category
|
| 308 |
-
- Returns curl examples with authentication patterns
|
| 309 |
-
- Use for API-only operations: streaming logs/metrics, org management, security scans, etc.
|
| 310 |
-
|
| 311 |
-
## Hub Discovery Tools (MCP)
|
| 312 |
-
|
| 313 |
-
**model_search:**
|
| 314 |
-
- Find models by query, task, author, library
|
| 315 |
-
- Sort by downloads, likes, trending, created date
|
| 316 |
-
- ALWAYS verify with hub_repo_details before using
|
| 317 |
-
- Select most appropriate option based on requirements
|
| 318 |
-
|
| 319 |
-
**dataset_search:**
|
| 320 |
-
- Find datasets by query, tags, author
|
| 321 |
-
- Sort by downloads, likes, trending
|
| 322 |
-
- ALWAYS verify format with hub_repo_details before training
|
| 323 |
-
- Select most suitable dataset based on format and task
|
| 324 |
-
|
| 325 |
-
**paper_search:**
|
| 326 |
-
- Find research papers semantically
|
| 327 |
-
- Get paper abstracts and links
|
| 328 |
-
- Useful for understanding methods before implementing
|
| 329 |
-
|
| 330 |
-
**hub_repo_details:**
|
| 331 |
-
- Get detailed information about repos
|
| 332 |
-
- ⚠️ CRITICAL: Use this to verify dataset format before training
|
| 333 |
-
- Check model size, architecture, requirements
|
| 334 |
-
- Verify dataset columns, splits, size
|
| 335 |
|
| 336 |
## Execution & Storage Tools
|
| 337 |
|
|
@@ -401,16 +285,13 @@ system_prompt: |
|
|
| 401 |
## Documentation Usage
|
| 402 |
|
| 403 |
**✓ DO:**
|
| 404 |
-
-
|
| 405 |
-
-
|
| 406 |
-
- Check current APIs and parameters
|
| 407 |
-
- Base implementation on researched approaches
|
| 408 |
|
| 409 |
**✗ DON'T:**
|
| 410 |
-
- Implement based on internal knowledge without
|
| 411 |
- Assume you know current API syntax
|
| 412 |
-
- Skip research for "simple" tasks
|
| 413 |
-
- Use outdated patterns or methods
|
| 414 |
|
| 415 |
## Error Handling & Recovery
|
| 416 |
|
|
@@ -519,42 +400,24 @@ system_prompt: |
|
|
| 519 |
User: Fine-tune Llama for instruction following on ultrachat dataset
|
| 520 |
|
| 521 |
Assistant:
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
[Creates plan with plan_tool: Find examples, Study code, Research docs, Find model, Validate dataset, Create script, Submit job]
|
| 525 |
-
|
| 526 |
-
[STEP 1: Find working example code FIRST]
|
| 527 |
-
github_find_examples({"repo": "trl", "keyword": "sft", "org": "huggingface"})
|
| 528 |
-
# Found: examples/scripts/sft.py, examples/scripts/sft_vlm.py
|
| 529 |
-
|
| 530 |
-
[STEP 2: Read the working implementation]
|
| 531 |
-
github_read_file({"repo": "huggingface/trl", "path": "examples/scripts/sft.py"})
|
| 532 |
-
# Studied: SFTTrainer usage, SFTConfig parameters, dataset handling, imports
|
| 533 |
|
| 534 |
-
[
|
| 535 |
-
[Researches: explore_hf_docs("trl"), fetch_hf_docs(SFT pages), explore_hf_docs("trackio")]
|
| 536 |
|
| 537 |
-
[STEP
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
- Selected: meta-llama/Llama-3.2-1B (1.24B params) - optimal balance of quality and efficiency
|
| 544 |
-
- Dataset: HuggingFaceH4/ultrachat_200k (207K samples, "messages" format ✓ SFT-compatible)
|
| 545 |
-
- Hardware: t4-small (4vCPU/15GB/GPU 16GB, $0.60/hr) - cost-efficient for this model size
|
| 546 |
-
- Estimated: 3 hours, ~$1.80 total cost
|
| 547 |
-
|
| 548 |
-
[STEP 6: Create and submit training job]
|
| 549 |
-
[Updates plan: mark resource selection complete, mark script creation in_progress]
|
| 550 |
|
| 551 |
-
[
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
- Trackio monitoring as shown in docs
|
| 556 |
-
- push_to_hub configuration with HF_TOKEN]
|
| 557 |
|
|
|
|
|
|
|
| 558 |
[Submits training job with hf_jobs: hardware=t4-small, timeout=4h, env=HF_TOKEN]
|
| 559 |
|
| 560 |
</example>
|
|
@@ -601,8 +464,8 @@ system_prompt: |
|
|
| 601 |
|
| 602 |
# Additional Instructions
|
| 603 |
|
| 604 |
-
- **Always use current information:**
|
| 605 |
-
- **Example code first:**
|
| 606 |
- **Search before building:** Use Hub search tools, GitHub code search, and documentation before creating custom solutions
|
| 607 |
- **Verify explicitly:** Never assume dataset schemas, column names, or API details; always check with hub_repo_details
|
| 608 |
- **Base on documented practices:** Implement using researched approaches from documentation, not general knowledge
|
|
|
|
| 23 |
|
| 24 |
## PHASE 1: RESEARCH (Mandatory - Never Skip)
|
| 25 |
|
| 26 |
+
⚠️ **CRITICAL:** Your training data is outdated. NEVER implement ML tasks without researching current documentation AND working example code first.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
**Use the `research` tool.** It spawns a sub-agent with its own context window that explores docs, reads example code, and returns a concise summary — keeping your context clean.
|
|
|
|
|
|
|
| 29 |
|
|
|
|
| 30 |
```python
|
| 31 |
+
# Example: User requests "Fine-tune a model for instruction following using SFT"
|
| 32 |
+
research({
|
| 33 |
+
"task": "Research current TRL SFTTrainer: find working example scripts in the trl repo, read the SFT example implementation, check SFTConfig parameters in docs, and check trackio monitoring setup.",
|
| 34 |
+
"context": "User wants to fine-tune a model for instruction following using SFT."
|
| 35 |
+
})
|
| 36 |
+
# Returns: key findings, code patterns, imports, config parameters, file references
|
| 37 |
```
|
| 38 |
|
| 39 |
+
**Be specific in your research task** — include library names, trainer types, dataset names, specific questions. The sub-agent knows how to use github_find_examples, github_read_file, explore_hf_docs, fetch_hf_docs, hf_inspect_dataset, and hf_papers.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
**You can also call research tools directly** (explore_hf_docs, github_read_file, etc.) for quick lookups that don't need a full research cycle.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
+
**Skip research ONLY for:**
|
| 44 |
- Simple factual questions ("What is LoRA?", "What is DPO?")
|
| 45 |
- Status checks (`hf_jobs("ps")`, `hf_jobs("logs", job_id="xxx")`)
|
| 46 |
- Resource discovery (`model_search`, `dataset_search`, `paper_search`)
|
| 47 |
- Trivial operations that don't require implementation
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
## PHASE 2: PLAN & VALIDATE (Required for Multi-Step Tasks)
|
| 50 |
|
| 51 |
⚠️ **CRITICAL:** Break down complex tasks and validate resources BEFORE executing.
|
|
|
|
| 200 |
|
| 201 |
# Tool Usage Patterns for Reliability
|
| 202 |
|
| 203 |
+
## Research
|
| 204 |
|
| 205 |
+
Use the `research` tool for any ML implementation research. It handles the full
|
| 206 |
+
github_find_examples → github_read_file → explore_hf_docs → fetch_hf_docs chain
|
| 207 |
+
in its own context and returns a summary. You can also call these tools directly for quick lookups.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
+
## Hub Discovery Tools (MCP)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
+
**model_search / dataset_search / paper_search / hub_repo_details:**
|
| 212 |
+
- Find models, datasets, papers by query
|
| 213 |
+
- ⚠️ ALWAYS verify dataset format with hub_repo_details before training
|
| 214 |
+
- hub_repo_details: check model size, architecture, dataset columns/splits
|
| 215 |
|
| 216 |
**find_hf_api:**
|
| 217 |
+
- Find REST API endpoints by keyword or tag
|
| 218 |
+
- For API-only operations: streaming logs, org management, etc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
## Execution & Storage Tools
|
| 221 |
|
|
|
|
| 285 |
## Documentation Usage
|
| 286 |
|
| 287 |
**✓ DO:**
|
| 288 |
+
- Use `research` tool before implementing any ML task
|
| 289 |
+
- Base implementation on the research findings (code patterns, imports, config)
|
|
|
|
|
|
|
| 290 |
|
| 291 |
**✗ DON'T:**
|
| 292 |
+
- Implement based on internal knowledge without researching first
|
| 293 |
- Assume you know current API syntax
|
| 294 |
+
- Skip research for "simple" ML tasks
|
|
|
|
| 295 |
|
| 296 |
## Error Handling & Recovery
|
| 297 |
|
|
|
|
| 400 |
User: Fine-tune Llama for instruction following on ultrachat dataset
|
| 401 |
|
| 402 |
Assistant:
|
| 403 |
+
I'll fine-tune Llama for instruction following. Let me research current TRL SFT patterns and validate the dataset.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
+
[Creates plan with plan_tool: Research, Find model, Validate dataset, Create script, Submit job]
|
|
|
|
| 406 |
|
| 407 |
+
[STEP 1: Research via sub-agent — keeps main context clean]
|
| 408 |
+
research({
|
| 409 |
+
"task": "Research current TRL SFTTrainer: find working SFT example scripts in the trl repo, read the implementation, check SFTConfig parameters and imports. Also check trackio monitoring setup.",
|
| 410 |
+
"context": "User wants to SFT fine-tune Llama on ultrachat dataset."
|
| 411 |
+
})
|
| 412 |
+
# Returns: key imports, SFTConfig params, working code patterns, trackio setup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
|
| 414 |
+
[STEP 2: Discover and validate resources]
|
| 415 |
+
model_search({"query": "llama instruct", "sort": "downloads"})
|
| 416 |
+
hub_repo_details({"repo_ids": ["meta-llama/Llama-3.2-1B", "HuggingFaceH4/ultrachat_200k"]})
|
| 417 |
+
# Validates: model exists, dataset has "messages" column ✓ SFT-compatible
|
|
|
|
|
|
|
| 418 |
|
| 419 |
+
[STEP 3: Create and submit training job]
|
| 420 |
+
[Creates script based on research findings — correct imports, SFTConfig, dataset handling, trackio, push_to_hub]
|
| 421 |
[Submits training job with hf_jobs: hardware=t4-small, timeout=4h, env=HF_TOKEN]
|
| 422 |
|
| 423 |
</example>
|
|
|
|
| 464 |
|
| 465 |
# Additional Instructions
|
| 466 |
|
| 467 |
+
- **Always use current information:** Use the `research` tool before implementing ML tasks; internal knowledge may be outdated
|
| 468 |
+
- **Example code first:** The research sub-agent finds and reads working examples — real code shows current APIs and patterns
|
| 469 |
- **Search before building:** Use Hub search tools, GitHub code search, and documentation before creating custom solutions
|
| 470 |
- **Verify explicitly:** Never assume dataset schemas, column names, or API details; always check with hub_repo_details
|
| 471 |
- **Base on documented practices:** Implement using researched approaches from documentation, not general knowledge
|
agent/tools/research_tool.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Research subagent tool — spawns a cheap LLM call with a focused
|
| 3 |
+
research task and returns a summary. The subagent gets its own
|
| 4 |
+
independent context (not the main conversation), so research
|
| 5 |
+
work doesn't pollute the main agent's context window.
|
| 6 |
+
|
| 7 |
+
Inspired by claude-code's code-explorer agent pattern.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import logging
|
| 12 |
+
import os
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
from litellm import Message, acompletion
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
# Tools the research agent can use (read-only subset)
|
| 20 |
+
RESEARCH_TOOL_NAMES = {
|
| 21 |
+
"read",
|
| 22 |
+
"bash",
|
| 23 |
+
"explore_hf_docs",
|
| 24 |
+
"fetch_hf_docs",
|
| 25 |
+
"find_hf_api",
|
| 26 |
+
"hf_papers",
|
| 27 |
+
"github_find_examples",
|
| 28 |
+
"github_list_repos",
|
| 29 |
+
"github_read_file",
|
| 30 |
+
"hf_inspect_dataset",
|
| 31 |
+
"hf_repo_files",
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
RESEARCH_SYSTEM_PROMPT = """\
|
| 35 |
+
You are a research sub-agent for an ML engineering assistant.
|
| 36 |
+
Your job: explore documentation, code examples, APIs, and repos,
|
| 37 |
+
then return a concise, actionable summary. The main agent will use
|
| 38 |
+
your findings to implement the actual solution.
|
| 39 |
+
|
| 40 |
+
# Research methodology
|
| 41 |
+
|
| 42 |
+
1. **Discovery**: Find relevant entry points — example scripts, doc pages, API endpoints
|
| 43 |
+
2. **Tracing**: Follow the chain from entry point to implementation detail
|
| 44 |
+
3. **Analysis**: Identify patterns, current API usage, key dependencies
|
| 45 |
+
4. **Synthesis**: Summarize findings in a structured format
|
| 46 |
+
|
| 47 |
+
# How to use your tools
|
| 48 |
+
|
| 49 |
+
## GitHub code research (USE FIRST for any ML implementation task)
|
| 50 |
+
- `github_find_examples`: Find working example scripts in HF repos (trl, transformers, etc.)
|
| 51 |
+
Example: `github_find_examples({"repo": "trl", "keyword": "sft"})`
|
| 52 |
+
Returns: file paths in examples/, scripts/, notebooks/ directories
|
| 53 |
+
- `github_read_file`: Read the actual implementation code
|
| 54 |
+
Example: `github_read_file({"repo": "huggingface/trl", "path": "examples/scripts/sft.py"})`
|
| 55 |
+
Use line_start/line_end for large files
|
| 56 |
+
|
| 57 |
+
## Documentation
|
| 58 |
+
- `explore_hf_docs(endpoint)`: Search docs for a library. Endpoints: trl, transformers, datasets, peft, accelerate, trackio, vllm, inference-endpoints, etc.
|
| 59 |
+
- `fetch_hf_docs(url)`: Fetch full page content from explore results
|
| 60 |
+
- `find_hf_api(query=..., tag=...)`: Find REST API endpoints
|
| 61 |
+
|
| 62 |
+
## Dataset inspection
|
| 63 |
+
- `hf_inspect_dataset`: Check dataset schema, splits, sample rows
|
| 64 |
+
CRITICAL for training: verify column format matches training method:
|
| 65 |
+
- SFT: needs "messages", "text", or "prompt"/"completion"
|
| 66 |
+
- DPO: needs "prompt", "chosen", "rejected"
|
| 67 |
+
- GRPO: needs "prompt" only
|
| 68 |
+
|
| 69 |
+
## Papers
|
| 70 |
+
- `hf_papers`: Search papers, get details, find linked datasets/models
|
| 71 |
+
|
| 72 |
+
## Hub repo inspection
|
| 73 |
+
- `hf_repo_files`: List/read files in any HF repo (model, dataset, space)
|
| 74 |
+
|
| 75 |
+
# Correct research pattern for ML tasks
|
| 76 |
+
|
| 77 |
+
```
|
| 78 |
+
# 1. Find working example code FIRST
|
| 79 |
+
github_find_examples({"repo": "trl", "keyword": "sft"})
|
| 80 |
+
|
| 81 |
+
# 2. Read the implementation
|
| 82 |
+
github_read_file({"repo": "huggingface/trl", "path": "examples/scripts/sft.py"})
|
| 83 |
+
|
| 84 |
+
# 3. Check docs for parameters/config details
|
| 85 |
+
explore_hf_docs("trl")
|
| 86 |
+
fetch_hf_docs("https://huggingface.co/docs/trl/sft_trainer")
|
| 87 |
+
|
| 88 |
+
# 4. Validate dataset format if relevant
|
| 89 |
+
hf_inspect_dataset({"dataset": "org/name", "split": "train", "sample_rows": 3})
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
# Output format
|
| 93 |
+
|
| 94 |
+
Your output MUST include:
|
| 95 |
+
- **Key findings**: The most important things you discovered (current API usage, working patterns)
|
| 96 |
+
- **Essential references**: Specific file paths, URLs, function names, doc sections, code snippets
|
| 97 |
+
that the main agent should use directly
|
| 98 |
+
- **Code patterns**: Key imports, configurations, and usage patterns from working examples
|
| 99 |
+
- **Recommendations**: What to do next based on your findings
|
| 100 |
+
|
| 101 |
+
Be concise. Your output goes into another agent's context — every token counts.
|
| 102 |
+
Aim for 500-1500 words max. Include actual code snippets from examples you read,
|
| 103 |
+
not paraphrased descriptions.
|
| 104 |
+
"""
|
| 105 |
+
|
| 106 |
+
RESEARCH_TOOL_SPEC = {
|
| 107 |
+
"name": "research",
|
| 108 |
+
"description": (
|
| 109 |
+
"Spawn a research sub-agent to explore documentation, codebases, "
|
| 110 |
+
"or repos WITHOUT polluting the main conversation context. "
|
| 111 |
+
"The sub-agent gets its own independent context window with read-only "
|
| 112 |
+
"research tools and returns a concise summary of findings.\n\n"
|
| 113 |
+
"Use this for:\n"
|
| 114 |
+
"- Researching current API usage before implementing ML tasks "
|
| 115 |
+
"(find examples + read docs)\n"
|
| 116 |
+
"- Exploring HF docs, reading papers, analyzing GitHub repos\n"
|
| 117 |
+
"- Any research where raw tool outputs would be too verbose\n\n"
|
| 118 |
+
"The sub-agent knows how to use github_find_examples, github_read_file, "
|
| 119 |
+
"explore_hf_docs, fetch_hf_docs, hf_inspect_dataset, hf_papers, etc. "
|
| 120 |
+
"Just describe what you need researched."
|
| 121 |
+
),
|
| 122 |
+
"parameters": {
|
| 123 |
+
"type": "object",
|
| 124 |
+
"properties": {
|
| 125 |
+
"task": {
|
| 126 |
+
"type": "string",
|
| 127 |
+
"description": (
|
| 128 |
+
"Detailed description of what to research. Be specific: "
|
| 129 |
+
"include library names, trainer types, dataset names, "
|
| 130 |
+
"repo names, or doc pages to explore. Example: "
|
| 131 |
+
"'Research current TRL SFTTrainer usage: find working "
|
| 132 |
+
"example scripts, read the SFT documentation, and check "
|
| 133 |
+
"SFTConfig parameters. Also validate that dataset "
|
| 134 |
+
"HuggingFaceH4/ultrachat_200k has the right format for SFT.'"
|
| 135 |
+
),
|
| 136 |
+
},
|
| 137 |
+
"context": {
|
| 138 |
+
"type": "string",
|
| 139 |
+
"description": (
|
| 140 |
+
"Optional context from the current conversation that the "
|
| 141 |
+
"research agent needs (e.g., what the user wants to build, "
|
| 142 |
+
"constraints, what's been tried)."
|
| 143 |
+
),
|
| 144 |
+
},
|
| 145 |
+
},
|
| 146 |
+
"required": ["task"],
|
| 147 |
+
},
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _resolve_llm_params(model_name: str) -> dict:
|
| 152 |
+
"""Build LiteLLM kwargs, reusing the HF router logic from agent_loop."""
|
| 153 |
+
if not model_name.startswith("huggingface/"):
|
| 154 |
+
return {"model": model_name}
|
| 155 |
+
|
| 156 |
+
parts = model_name.split("/", 2) # ["huggingface", "<provider>", "<org>/<model>"]
|
| 157 |
+
if len(parts) < 3:
|
| 158 |
+
return {"model": model_name}
|
| 159 |
+
|
| 160 |
+
provider = parts[1]
|
| 161 |
+
model_id = parts[2]
|
| 162 |
+
return {
|
| 163 |
+
"model": f"openai/{model_id}",
|
| 164 |
+
"api_base": f"https://router.huggingface.co/{provider}/v3/openai",
|
| 165 |
+
"api_key": os.environ.get("INFERENCE_TOKEN", ""),
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def _get_research_model(main_model: str) -> str:
|
| 170 |
+
"""Pick a cheaper model for research based on the main model."""
|
| 171 |
+
if "opus" in main_model:
|
| 172 |
+
return "anthropic/claude-sonnet-4-5-20250929"
|
| 173 |
+
if "sonnet" in main_model:
|
| 174 |
+
return "anthropic/claude-haiku-3-5-20241022"
|
| 175 |
+
# For HF router models, use the same model
|
| 176 |
+
return main_model
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
async def research_handler(
    arguments: dict[str, Any], session=None, **_kw
) -> tuple[str, bool]:
    """Execute a research sub-agent with its own context.

    Runs a small tool-calling loop against a cheaper LLM (chosen by
    _get_research_model) in a context fully independent of the main
    conversation: only RESEARCH_SYSTEM_PROMPT plus the task/context
    string go in, and only the sub-agent's final text summary comes out.

    Args:
        arguments: Tool arguments; "task" (required) and "context"
            (optional conversation background for the sub-agent).
        session: Agent session providing config.model_name and
            tool_router. Required — without it an error tuple is returned.
        **_kw: Extra keyword arguments from the dispatch layer, ignored.

    Returns:
        (text, success): the sub-agent's summary with True on success,
        or an error/iteration-limit message with False.
    """
    task = arguments.get("task", "")
    context = arguments.get("context", "")
    if not task:
        return "No research task provided.", False

    if not session:
        return "No session available for research agent.", False

    # Build the sub-agent's messages (independent context — the main
    # conversation history is deliberately NOT included)
    messages: list[Message] = [
        Message(role="system", content=RESEARCH_SYSTEM_PROMPT),
    ]

    # Context, when present, is prepended so the task reads with its
    # background first.
    user_content = f"Research task: {task}"
    if context:
        user_content = f"Context: {context}\n\n{user_content}"
    messages.append(Message(role="user", content=user_content))

    # Use a cheaper/faster model for research
    main_model = session.config.model_name
    research_model = _get_research_model(main_model)
    llm_params = _resolve_llm_params(research_model)

    # Get read-only tool specs from the session's tool router,
    # filtered down to the RESEARCH_TOOL_NAMES allowlist
    tool_specs = [
        spec
        for spec in session.tool_router.get_tool_specs_for_llm()
        if spec["function"]["name"] in RESEARCH_TOOL_NAMES
    ]

    # Run the research loop (max 20 iterations — research should be focused)
    max_iterations = 20
    for _iteration in range(max_iterations):
        try:
            response = await acompletion(
                messages=messages,
                tools=tool_specs if tool_specs else None,
                tool_choice="auto",
                stream=False,
                timeout=120,
                **llm_params,
            )
        except Exception as e:
            # LLM failure is surfaced to the caller rather than retried
            logger.error("Research sub-agent LLM error: %s", e)
            return f"Research agent LLM error: {e}", False

        choice = response.choices[0]
        msg = choice.message

        # If no tool calls, we have our final answer
        if not msg.tool_calls:
            content = msg.content or "Research completed but no summary generated."
            return content, True

        # Execute tool calls and add results. The assistant message
        # carrying tool_calls must be appended BEFORE the matching
        # role="tool" replies, or the transcript is invalid.
        messages.append(msg)
        for tc in msg.tool_calls:
            try:
                tool_args = json.loads(tc.function.arguments)
            except (json.JSONDecodeError, TypeError):
                # Malformed arguments: answer the call with an error so
                # every tool_call_id still gets a reply, then move on.
                messages.append(
                    Message(
                        role="tool",
                        content="Invalid tool arguments.",
                        tool_call_id=tc.id,
                        name=tc.function.name,
                    )
                )
                continue

            tool_name = tc.function.name
            if tool_name not in RESEARCH_TOOL_NAMES:
                # Defense in depth: reject tools outside the read-only
                # allowlist even if the model hallucinates one.
                messages.append(
                    Message(
                        role="tool",
                        content=f"Tool '{tool_name}' not available for research.",
                        tool_call_id=tc.id,
                        name=tool_name,
                    )
                )
                continue

            try:
                output, _success = await session.tool_router.call_tool(
                    tool_name, tool_args, session=session
                )
                # Truncate tool output for the research context —
                # keep the head and tail, drop the middle
                if len(output) > 8000:
                    output = (
                        output[:4800]
                        + "\n...(truncated)...\n"
                        + output[-3200:]
                    )
            except Exception as e:
                # Tool failures are reported back to the sub-agent
                # instead of aborting the whole research run
                output = f"Tool error: {e}"

            messages.append(
                Message(
                    role="tool",
                    content=output,
                    tool_call_id=tc.id,
                    name=tool_name,
                )
            )

    # Loop exhausted without a tool-free (final) response
    return (
        "Research agent hit iteration limit (20). "
        "Partial findings may be incomplete — try a more focused task.",
        False,
    )
|