Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
improved search capabilities with github tools and system prompt updates
Browse files- agent/context_manager/manager.py +12 -0
- agent/core/agent_loop.py +8 -4
- agent/core/tools.py +47 -6
- agent/main.py +12 -6
- agent/prompts/system_prompt.yaml +18 -24
- agent/tools/__init__.py +24 -0
- agent/tools/github_find_examples.py +491 -0
- agent/tools/github_list_repos.py +281 -0
- agent/tools/github_read_file.py +336 -0
- agent/tools/github_search_code.py +453 -0
- agent/tools/jobs_tool.py +19 -5
- agent/tools/utilities.py +2 -2
- agent/tools/utils_tools.py +5 -8
- pyproject.toml +4 -0
- uv.lock +0 -0
agent/context_manager/manager.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
| 2 |
Context management for conversation history
|
| 3 |
"""
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Any
|
| 7 |
|
|
@@ -42,10 +44,20 @@ class ContextManager:
|
|
| 42 |
prompt_data = yaml.safe_load(f)
|
| 43 |
template_str = prompt_data.get("system_prompt", "")
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
template = Template(template_str)
|
| 46 |
return template.render(
|
| 47 |
tools=tool_specs,
|
| 48 |
num_tools=len(tool_specs),
|
|
|
|
|
|
|
|
|
|
| 49 |
)
|
| 50 |
|
| 51 |
def add_message(self, message: Message, token_count: int = None) -> None:
|
|
|
|
| 2 |
Context management for conversation history
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
import zoneinfo
|
| 6 |
+
from datetime import datetime
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Any
|
| 9 |
|
|
|
|
| 44 |
prompt_data = yaml.safe_load(f)
|
| 45 |
template_str = prompt_data.get("system_prompt", "")
|
| 46 |
|
| 47 |
+
# Get current date and time
|
| 48 |
+
tz = zoneinfo.ZoneInfo("Europe/Paris")
|
| 49 |
+
now = datetime.now(tz)
|
| 50 |
+
current_date = now.strftime("%d-%m-%Y")
|
| 51 |
+
current_time = now.strftime("%H:%M:%S.%f")[:-3]
|
| 52 |
+
current_timezone = f"{now.strftime('%Z')} (UTC{now.strftime('%z')[:3]}:{now.strftime('%z')[3:]})"
|
| 53 |
+
|
| 54 |
template = Template(template_str)
|
| 55 |
return template.render(
|
| 56 |
tools=tool_specs,
|
| 57 |
num_tools=len(tool_specs),
|
| 58 |
+
current_date=current_date,
|
| 59 |
+
current_time=current_time,
|
| 60 |
+
current_timezone=current_timezone,
|
| 61 |
)
|
| 62 |
|
| 63 |
def add_message(self, message: Message, token_count: int = None) -> None:
|
agent/core/agent_loop.py
CHANGED
|
@@ -25,9 +25,15 @@ def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
|
|
| 25 |
args = tool_args.get("args", {})
|
| 26 |
# Sometimes LLM passes args as string instead of dict
|
| 27 |
if isinstance(args, str):
|
| 28 |
-
return
|
|
|
|
|
|
|
|
|
|
| 29 |
if not isinstance(args, dict) and args is not None:
|
| 30 |
-
return
|
|
|
|
|
|
|
|
|
|
| 31 |
return True, None
|
| 32 |
|
| 33 |
|
|
@@ -38,8 +44,6 @@ def _needs_approval(tool_name: str, tool_args: dict) -> bool:
|
|
| 38 |
if not args_valid:
|
| 39 |
return False
|
| 40 |
|
| 41 |
-
args = tool_args.get("args", {})
|
| 42 |
-
|
| 43 |
if tool_name == "hf_jobs":
|
| 44 |
# Check if it's a run or uv operation
|
| 45 |
operation = tool_args.get("operation", "")
|
|
|
|
| 25 |
args = tool_args.get("args", {})
|
| 26 |
# Sometimes LLM passes args as string instead of dict
|
| 27 |
if isinstance(args, str):
|
| 28 |
+
return (
|
| 29 |
+
False,
|
| 30 |
+
f"Tool call error: 'args' must be a JSON object, not a string. You passed: {repr(args)}",
|
| 31 |
+
)
|
| 32 |
if not isinstance(args, dict) and args is not None:
|
| 33 |
+
return (
|
| 34 |
+
False,
|
| 35 |
+
f"Tool call error: 'args' must be a JSON object. You passed type: {type(args).__name__}",
|
| 36 |
+
)
|
| 37 |
return True, None
|
| 38 |
|
| 39 |
|
|
|
|
| 44 |
if not args_valid:
|
| 45 |
return False
|
| 46 |
|
|
|
|
|
|
|
| 47 |
if tool_name == "hf_jobs":
|
| 48 |
# Check if it's a run or uv operation
|
| 49 |
operation = tool_args.get("operation", "")
|
agent/core/tools.py
CHANGED
|
@@ -19,13 +19,27 @@ from agent.tools.docs_tools import (
|
|
| 19 |
explore_hf_docs_handler,
|
| 20 |
hf_docs_fetch_handler,
|
| 21 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
|
| 23 |
from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
|
| 24 |
from agent.tools.private_hf_repo_tools import (
|
| 25 |
PRIVATE_HF_REPO_TOOL_SPEC,
|
| 26 |
private_hf_repo_handler,
|
| 27 |
)
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# Suppress aiohttp deprecation warning
|
| 31 |
warnings.filterwarnings(
|
|
@@ -224,7 +238,7 @@ class ToolRouter:
|
|
| 224 |
def create_builtin_tools() -> list[ToolSpec]:
|
| 225 |
"""Create built-in tool specifications"""
|
| 226 |
print(
|
| 227 |
-
f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {
|
| 228 |
)
|
| 229 |
# in order of importance
|
| 230 |
return [
|
|
@@ -260,10 +274,37 @@ def create_builtin_tools() -> list[ToolSpec]:
|
|
| 260 |
parameters=PRIVATE_HF_REPO_TOOL_SPEC["parameters"],
|
| 261 |
handler=private_hf_repo_handler,
|
| 262 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
ToolSpec(
|
| 264 |
-
name=
|
| 265 |
-
description=
|
| 266 |
-
parameters=
|
| 267 |
-
handler=
|
| 268 |
),
|
| 269 |
]
|
|
|
|
| 19 |
explore_hf_docs_handler,
|
| 20 |
hf_docs_fetch_handler,
|
| 21 |
)
|
| 22 |
+
from agent.tools.github_find_examples import (
|
| 23 |
+
GITHUB_FIND_EXAMPLES_TOOL_SPEC,
|
| 24 |
+
github_find_examples_handler,
|
| 25 |
+
)
|
| 26 |
+
from agent.tools.github_list_repos import (
|
| 27 |
+
GITHUB_LIST_REPOS_TOOL_SPEC,
|
| 28 |
+
github_list_repos_handler,
|
| 29 |
+
)
|
| 30 |
+
from agent.tools.github_read_file import (
|
| 31 |
+
GITHUB_READ_FILE_TOOL_SPEC,
|
| 32 |
+
github_read_file_handler,
|
| 33 |
+
)
|
| 34 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
|
| 35 |
from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
|
| 36 |
from agent.tools.private_hf_repo_tools import (
|
| 37 |
PRIVATE_HF_REPO_TOOL_SPEC,
|
| 38 |
private_hf_repo_handler,
|
| 39 |
)
|
| 40 |
+
|
| 41 |
+
# NOTE: Utils tool disabled - date/time now loaded into system prompt at initialization
|
| 42 |
+
# from agent.tools.utils_tools import UTILS_TOOL_SPEC, utils_handler
|
| 43 |
|
| 44 |
# Suppress aiohttp deprecation warning
|
| 45 |
warnings.filterwarnings(
|
|
|
|
| 238 |
def create_builtin_tools() -> list[ToolSpec]:
|
| 239 |
"""Create built-in tool specifications"""
|
| 240 |
print(
|
| 241 |
+
f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {GITHUB_FIND_EXAMPLES_TOOL_SPEC['name']}, {GITHUB_LIST_REPOS_TOOL_SPEC['name']}, {GITHUB_READ_FILE_TOOL_SPEC['name']}"
|
| 242 |
)
|
| 243 |
# in order of importance
|
| 244 |
return [
|
|
|
|
| 274 |
parameters=PRIVATE_HF_REPO_TOOL_SPEC["parameters"],
|
| 275 |
handler=private_hf_repo_handler,
|
| 276 |
),
|
| 277 |
+
# NOTE: Utils tool disabled - date/time now loaded into system prompt at initialization (less tool calls=more reliablity)
|
| 278 |
+
# ToolSpec(
|
| 279 |
+
# name=UTILS_TOOL_SPEC["name"],
|
| 280 |
+
# description=UTILS_TOOL_SPEC["description"],
|
| 281 |
+
# parameters=UTILS_TOOL_SPEC["parameters"],
|
| 282 |
+
# handler=utils_handler,
|
| 283 |
+
# ),
|
| 284 |
+
# GitHub tools
|
| 285 |
+
# NOTE: Github search code tool disabled - a bit buggy
|
| 286 |
+
# ToolSpec(
|
| 287 |
+
# name=GITHUB_SEARCH_CODE_TOOL_SPEC["name"],
|
| 288 |
+
# description=GITHUB_SEARCH_CODE_TOOL_SPEC["description"],
|
| 289 |
+
# parameters=GITHUB_SEARCH_CODE_TOOL_SPEC["parameters"],
|
| 290 |
+
# handler=github_search_code_handler,
|
| 291 |
+
# ),
|
| 292 |
+
ToolSpec(
|
| 293 |
+
name=GITHUB_FIND_EXAMPLES_TOOL_SPEC["name"],
|
| 294 |
+
description=GITHUB_FIND_EXAMPLES_TOOL_SPEC["description"],
|
| 295 |
+
parameters=GITHUB_FIND_EXAMPLES_TOOL_SPEC["parameters"],
|
| 296 |
+
handler=github_find_examples_handler,
|
| 297 |
+
),
|
| 298 |
+
ToolSpec(
|
| 299 |
+
name=GITHUB_LIST_REPOS_TOOL_SPEC["name"],
|
| 300 |
+
description=GITHUB_LIST_REPOS_TOOL_SPEC["description"],
|
| 301 |
+
parameters=GITHUB_LIST_REPOS_TOOL_SPEC["parameters"],
|
| 302 |
+
handler=github_list_repos_handler,
|
| 303 |
+
),
|
| 304 |
ToolSpec(
|
| 305 |
+
name=GITHUB_READ_FILE_TOOL_SPEC["name"],
|
| 306 |
+
description=GITHUB_READ_FILE_TOOL_SPEC["description"],
|
| 307 |
+
parameters=GITHUB_READ_FILE_TOOL_SPEC["parameters"],
|
| 308 |
+
handler=github_read_file_handler,
|
| 309 |
),
|
| 310 |
]
|
agent/main.py
CHANGED
|
@@ -222,11 +222,15 @@ async def event_listener(
|
|
| 222 |
|
| 223 |
# Build repo URL
|
| 224 |
type_path = "" if repo_type == "model" else f"{repo_type}s"
|
| 225 |
-
repo_url =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
print(f"Repository: {repo_id}")
|
| 228 |
print(f"Type: {repo_type}")
|
| 229 |
-
print(
|
| 230 |
print(f"URL: {repo_url}")
|
| 231 |
|
| 232 |
# Show file preview for upload_file operation
|
|
@@ -237,9 +241,9 @@ async def event_listener(
|
|
| 237 |
|
| 238 |
if isinstance(file_content, str):
|
| 239 |
# Calculate metrics
|
| 240 |
-
all_lines = file_content.split(
|
| 241 |
line_count = len(all_lines)
|
| 242 |
-
size_bytes = len(file_content.encode(
|
| 243 |
size_kb = size_bytes / 1024
|
| 244 |
size_mb = size_kb / 1024
|
| 245 |
|
|
@@ -251,8 +255,10 @@ async def event_listener(
|
|
| 251 |
|
| 252 |
# Show preview
|
| 253 |
preview_lines = all_lines[:5]
|
| 254 |
-
preview =
|
| 255 |
-
print(
|
|
|
|
|
|
|
| 256 |
if len(all_lines) > 5:
|
| 257 |
print("...")
|
| 258 |
|
|
|
|
| 222 |
|
| 223 |
# Build repo URL
|
| 224 |
type_path = "" if repo_type == "model" else f"{repo_type}s"
|
| 225 |
+
repo_url = (
|
| 226 |
+
f"https://huggingface.co/{type_path}/{repo_id}".replace(
|
| 227 |
+
"//", "/"
|
| 228 |
+
)
|
| 229 |
+
)
|
| 230 |
|
| 231 |
print(f"Repository: {repo_id}")
|
| 232 |
print(f"Type: {repo_type}")
|
| 233 |
+
print("Private: Yes")
|
| 234 |
print(f"URL: {repo_url}")
|
| 235 |
|
| 236 |
# Show file preview for upload_file operation
|
|
|
|
| 241 |
|
| 242 |
if isinstance(file_content, str):
|
| 243 |
# Calculate metrics
|
| 244 |
+
all_lines = file_content.split("\n")
|
| 245 |
line_count = len(all_lines)
|
| 246 |
+
size_bytes = len(file_content.encode("utf-8"))
|
| 247 |
size_kb = size_bytes / 1024
|
| 248 |
size_mb = size_kb / 1024
|
| 249 |
|
|
|
|
| 255 |
|
| 256 |
# Show preview
|
| 257 |
preview_lines = all_lines[:5]
|
| 258 |
+
preview = "\n".join(preview_lines)
|
| 259 |
+
print(
|
| 260 |
+
f"Content preview (first 5 lines):\n{preview}"
|
| 261 |
+
)
|
| 262 |
if len(all_lines) > 5:
|
| 263 |
print("...")
|
| 264 |
|
agent/prompts/system_prompt.yaml
CHANGED
|
@@ -1,63 +1,57 @@
|
|
| 1 |
system_prompt: |
|
| 2 |
You are HF Agent, a powerful AI assistant for Machine Learning Engineering, particularly training Large Language Models. You have access to {{ num_tools }} tools for interacting with Hugging Face Hub and performing ML tasks.
|
| 3 |
-
|
|
|
|
|
|
|
| 4 |
# Task Approach
|
| 5 |
|
| 6 |
-
**CRITICAL:
|
| 7 |
|
| 8 |
For ANY implementation task (training, fine-tuning, inference, data processing, etc.):
|
| 9 |
1. **FIRST**: Search HF documentation to find the recommended approach
|
| 10 |
- This is MANDATORY before writing any code or making implementation decisions
|
| 11 |
- Use `explore_hf_docs` to discover documentation structure for relevant libraries (e.g., "trl", "transformers", "diffusers")
|
|
|
|
| 12 |
- Use `fetch_hf_docs` to retrieve full content from specific documentation pages
|
| 13 |
-
- Use `search_hf_api_endpoints` to find API endpoints with usage examples
|
| 14 |
- Research what libraries to use, find code examples, understand best practices
|
| 15 |
-
- Skip ONLY for simple factual questions (e.g., "What is LoRA?")
|
| 16 |
|
| 17 |
-
2. **THEN**: Formulate a plan based on research findings. Pass todos to the
|
| 18 |
|
| 19 |
3. **FINALLY**: Implement using researched approaches
|
| 20 |
- Search for relevant models/datasets on HF Hub
|
|
|
|
| 21 |
- Use all available tools to complete the task
|
| 22 |
-
-
|
| 23 |
-
-
|
| 24 |
|
| 25 |
# Autonomy / Subordinate trade-off.
|
| 26 |
|
| 27 |
Your main goal is to achieve what the user asked. For this:
|
| 28 |
-
1.
|
| 29 |
|
| 30 |
However !! :
|
| 31 |
1. Don't surprise the user with costly, irreversible, or strange actions without asking.
|
| 32 |
-
2. Don't be shy to ask questions if needed.
|
| 33 |
3. Don't be overly talkative, explaining everything after a task ended.
|
| 34 |
|
| 35 |
-
# Available Tools
|
| 36 |
-
|
| 37 |
-
You have access to the following categories of tools:
|
| 38 |
-
|
| 39 |
-
- Hugging Face Hub: Search and interact with models, datasets, papers, and documentation
|
| 40 |
-
- Spaces: Use and discover ML applications
|
| 41 |
-
- Jobs: Manage compute jobs for training and inference
|
| 42 |
-
- Image Generation: Generate and transform images
|
| 43 |
-
- Planning : a planning/to-do tool.
|
| 44 |
|
| 45 |
# Conventions
|
| 46 |
|
| 47 |
- **ALWAYS search documentation BEFORE implementing** any ML workflow (training, inference, data processing, etc.) - This is non-negotiable
|
| 48 |
-
- Use `explore_hf_docs`, `fetch_hf_docs`, and `search_hf_api_endpoints` to research the correct approach
|
| 49 |
-
- Never assume you know the correct library, method, or approach - you must verify with documentation first
|
| 50 |
- Base your implementation on researched best practices, not general knowledge or assumptions
|
| 51 |
- Always search Hugging Face Hub for existing resources before suggesting custom implementations
|
| 52 |
- Keep in mind that a space is a repo, so you can create a space directly by uploading files that way. Repos should also be used to store files permanently : post-execution, files from jobs are not available.
|
| 53 |
- To run jobs, you must always pass the whole content of the file to execute. No files are available on server. Your local files and distant files are entirely seperate scopes.
|
| 54 |
- The HF_TOKEN is automatically loaded from the environment variables.
|
| 55 |
-
-
|
| 56 |
- When referencing models, datasets, or papers, include direct links from search results
|
| 57 |
-
- Before processing any dataset: inspect its actual structure first using the
|
| 58 |
-
- Follow ML best practices: proper train/val/test splits, reproducibility, evaluation metrics
|
| 59 |
- Unless absolutely necessary, don't ask user for action. This does not apply to follow-up questions you have.
|
| 60 |
-
- For training tasks, consider compute requirements and choose appropriate hardware.
|
| 61 |
- Never expose or log API keys, tokens, or secrets. Do not assume keys or secrets are available. Only Hugging Face private resources are available.
|
| 62 |
|
| 63 |
# Communication Style
|
|
|
|
| 1 |
system_prompt: |
|
| 2 |
You are HF Agent, a powerful AI assistant for Machine Learning Engineering, particularly training Large Language Models. You have access to {{ num_tools }} tools for interacting with Hugging Face Hub and performing ML tasks.
|
| 3 |
+
|
| 4 |
+
_Current Time: **{{ current_date }} {{ current_time }} ({{ current_timezone }})**_
|
| 5 |
+
|
| 6 |
# Task Approach
|
| 7 |
|
| 8 |
+
**CRITICAL: You always research first, then implement. You only make implementations that are guided by examples, best practices, or documentation.**
|
| 9 |
|
| 10 |
For ANY implementation task (training, fine-tuning, inference, data processing, etc.):
|
| 11 |
1. **FIRST**: Search HF documentation to find the recommended approach
|
| 12 |
- This is MANDATORY before writing any code or making implementation decisions
|
| 13 |
- Use `explore_hf_docs` to discover documentation structure for relevant libraries (e.g., "trl", "transformers", "diffusers")
|
| 14 |
+
- Use `github_find_examples` and `github_read_file` to discover best-practices on these libraries to reuse.
|
| 15 |
- Use `fetch_hf_docs` to retrieve full content from specific documentation pages
|
| 16 |
+
- Use `search_hf_api_endpoints` to find API endpoints (e.g. spaces, models, datasets, discussions, users, orgs, papers etc.) with usage examples and curl examples.
|
| 17 |
- Research what libraries to use, find code examples, understand best practices
|
| 18 |
+
- Skip ONLY for simple factual questions (e.g., "What is LoRA?").
|
| 19 |
|
| 20 |
+
2. **THEN**: Formulate a plan based on research findings. Pass todos to the `plan_tool`. Update as progress is made.
|
| 21 |
|
| 22 |
3. **FINALLY**: Implement using researched approaches
|
| 23 |
- Search for relevant models/datasets on HF Hub
|
| 24 |
+
- Always validate data structure and format before using it (libraries need specific formats, see documentation).
|
| 25 |
- Use all available tools to complete the task
|
| 26 |
+
- Always leverage existing implementations and resources before creating new ones
|
| 27 |
+
- Use multiple independent tools concurrently for efficiency
|
| 28 |
|
| 29 |
# Autonomy / Subordinate trade-off.
|
| 30 |
|
| 31 |
Your main goal is to achieve what the user asked. For this:
|
| 32 |
+
1. Research, then take action, follow-up, launch jobs. Ask for as little action from the user as possible. Do not ask them to do things you could do via a script or tool.
|
| 33 |
|
| 34 |
However !! :
|
| 35 |
1. Don't surprise the user with costly, irreversible, or strange actions without asking.
|
| 36 |
+
2. Don't be shy to ask clarifying questions if needed.
|
| 37 |
3. Don't be overly talkative, explaining everything after a task ended.
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Conventions
|
| 41 |
|
| 42 |
- **ALWAYS search documentation BEFORE implementing** any ML workflow (training, inference, data processing, etc.) - This is non-negotiable
|
| 43 |
+
- Use `explore_hf_docs`, `github_find_examples`, `fetch_hf_docs`, and `search_hf_api_endpoints` to research the correct approach
|
| 44 |
+
- Never assume you know the correct library, method, or approach - you must verify with documentation first. Documentation is the ultimate source of truth.
|
| 45 |
- Base your implementation on researched best practices, not general knowledge or assumptions
|
| 46 |
- Always search Hugging Face Hub for existing resources before suggesting custom implementations
|
| 47 |
- Keep in mind that a space is a repo, so you can create a space directly by uploading files that way. Repos should also be used to store files permanently : post-execution, files from jobs are not available.
|
| 48 |
- To run jobs, you must always pass the whole content of the file to execute. No files are available on server. Your local files and distant files are entirely seperate scopes.
|
| 49 |
- The HF_TOKEN is automatically loaded from the environment variables.
|
|
|
|
| 50 |
- When referencing models, datasets, or papers, include direct links from search results
|
| 51 |
+
- Before processing any dataset: inspect its actual structure first using the `hub_repo_details` tool. Never assume column names, datarow structure, or format: verify them beforehand.
|
| 52 |
+
- Follow ML best practices: proper train/val/test splits, reproducibility, evaluation metrics, pushing to hub.
|
| 53 |
- Unless absolutely necessary, don't ask user for action. This does not apply to follow-up questions you have.
|
| 54 |
+
- For training tasks, consider compute requirements and choose appropriate hardware based on this formula: approx_VRAM_needed = N_params × bytes_per_param × 1.5.
|
| 55 |
- Never expose or log API keys, tokens, or secrets. Do not assume keys or secrets are available. Only Hugging Face private resources are available.
|
| 56 |
|
| 57 |
# Communication Style
|
agent/tools/__init__.py
CHANGED
|
@@ -2,6 +2,22 @@
|
|
| 2 |
Hugging Face tools for the agent
|
| 3 |
"""
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
|
| 6 |
from agent.tools.types import ToolResult
|
| 7 |
|
|
@@ -10,4 +26,12 @@ __all__ = [
|
|
| 10 |
"HF_JOBS_TOOL_SPEC",
|
| 11 |
"hf_jobs_handler",
|
| 12 |
"HfJobsTool",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
]
|
|
|
|
| 2 |
Hugging Face tools for the agent
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
from agent.tools.github_find_examples import (
|
| 6 |
+
GITHUB_FIND_EXAMPLES_TOOL_SPEC,
|
| 7 |
+
github_find_examples_handler,
|
| 8 |
+
)
|
| 9 |
+
from agent.tools.github_list_repos import (
|
| 10 |
+
GITHUB_LIST_REPOS_TOOL_SPEC,
|
| 11 |
+
github_list_repos_handler,
|
| 12 |
+
)
|
| 13 |
+
from agent.tools.github_read_file import (
|
| 14 |
+
GITHUB_READ_FILE_TOOL_SPEC,
|
| 15 |
+
github_read_file_handler,
|
| 16 |
+
)
|
| 17 |
+
from agent.tools.github_search_code import (
|
| 18 |
+
GITHUB_SEARCH_CODE_TOOL_SPEC,
|
| 19 |
+
github_search_code_handler,
|
| 20 |
+
)
|
| 21 |
from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
|
| 22 |
from agent.tools.types import ToolResult
|
| 23 |
|
|
|
|
| 26 |
"HF_JOBS_TOOL_SPEC",
|
| 27 |
"hf_jobs_handler",
|
| 28 |
"HfJobsTool",
|
| 29 |
+
"GITHUB_FIND_EXAMPLES_TOOL_SPEC",
|
| 30 |
+
"github_find_examples_handler",
|
| 31 |
+
"GITHUB_LIST_REPOS_TOOL_SPEC",
|
| 32 |
+
"github_list_repos_handler",
|
| 33 |
+
"GITHUB_READ_FILE_TOOL_SPEC",
|
| 34 |
+
"github_read_file_handler",
|
| 35 |
+
"GITHUB_SEARCH_CODE_TOOL_SPEC",
|
| 36 |
+
"github_search_code_handler",
|
| 37 |
]
|
agent/tools/github_find_examples.py
ADDED
|
@@ -0,0 +1,491 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub Find Examples Tool - Discover examples, tutorials, and guides for any library
|
| 3 |
+
|
| 4 |
+
Lists all files in a repository and performs deterministic keyword search.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from typing import Any, Dict, List
|
| 9 |
+
|
| 10 |
+
import requests
|
| 11 |
+
from thefuzz import fuzz
|
| 12 |
+
|
| 13 |
+
from agent.tools.types import ToolResult
|
| 14 |
+
|
| 15 |
+
# In order of priority (lower index = higher priority for sorting)
|
| 16 |
+
EXAMPLE_PATTERNS = [
|
| 17 |
+
"scripts",
|
| 18 |
+
# General example patterns (catch-all, lower priority)
|
| 19 |
+
"examples",
|
| 20 |
+
"example",
|
| 21 |
+
# Notebook patterns
|
| 22 |
+
"notebooks",
|
| 23 |
+
"notebook",
|
| 24 |
+
# Tutorial/learning patterns
|
| 25 |
+
"tutorials",
|
| 26 |
+
"tutorial",
|
| 27 |
+
"quickstart",
|
| 28 |
+
"walkthroughs",
|
| 29 |
+
"walkthrough",
|
| 30 |
+
# Cookbook/recipe patterns
|
| 31 |
+
"cookbook",
|
| 32 |
+
"cookbooks",
|
| 33 |
+
"recipes",
|
| 34 |
+
"recipe",
|
| 35 |
+
# Demo/sample patterns
|
| 36 |
+
"demos",
|
| 37 |
+
"demo",
|
| 38 |
+
"samples",
|
| 39 |
+
"sample",
|
| 40 |
+
# Other patterns
|
| 41 |
+
"guides",
|
| 42 |
+
"guide",
|
| 43 |
+
"getting-started",
|
| 44 |
+
"getting_started",
|
| 45 |
+
"playground",
|
| 46 |
+
"howto",
|
| 47 |
+
"how-to",
|
| 48 |
+
"use-cases",
|
| 49 |
+
"usecases",
|
| 50 |
+
"use_cases",
|
| 51 |
+
"sandbox",
|
| 52 |
+
"showcase",
|
| 53 |
+
]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _get_repo_tree(org: str, repo: str, token: str) -> tuple[List[Dict[str, Any]], str]:
|
| 57 |
+
"""Get all files in a repository recursively. Returns (files, error_message)"""
|
| 58 |
+
headers = {
|
| 59 |
+
"Accept": "application/vnd.github+json",
|
| 60 |
+
"X-GitHub-Api-Version": "2022-11-28",
|
| 61 |
+
"Authorization": f"Bearer {token}",
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
full_repo = f"{org}/{repo}"
|
| 65 |
+
|
| 66 |
+
# Get default branch
|
| 67 |
+
try:
|
| 68 |
+
response = requests.get(
|
| 69 |
+
f"https://api.github.com/repos/{full_repo}", headers=headers, timeout=10
|
| 70 |
+
)
|
| 71 |
+
if response.status_code == 404:
|
| 72 |
+
return [], "not_found"
|
| 73 |
+
if response.status_code != 200:
|
| 74 |
+
return [], f"API error: {response.status_code}"
|
| 75 |
+
|
| 76 |
+
repo_data = response.json()
|
| 77 |
+
default_branch = repo_data.get("default_branch", "main")
|
| 78 |
+
except Exception as e:
|
| 79 |
+
return [], f"Error fetching repo: {str(e)}"
|
| 80 |
+
|
| 81 |
+
# Get repository tree recursively
|
| 82 |
+
try:
|
| 83 |
+
response = requests.get(
|
| 84 |
+
f"https://api.github.com/repos/{full_repo}/git/trees/{default_branch}",
|
| 85 |
+
headers=headers,
|
| 86 |
+
params={"recursive": "1"},
|
| 87 |
+
timeout=30,
|
| 88 |
+
)
|
| 89 |
+
if response.status_code != 200:
|
| 90 |
+
return [], f"Error fetching tree: {response.status_code}"
|
| 91 |
+
|
| 92 |
+
data = response.json()
|
| 93 |
+
tree = data.get("tree", [])
|
| 94 |
+
|
| 95 |
+
# Filter to only include files (not directories)
|
| 96 |
+
files = [
|
| 97 |
+
{
|
| 98 |
+
"path": item["path"],
|
| 99 |
+
"ref": item["sha"],
|
| 100 |
+
"size": item.get("size", 0),
|
| 101 |
+
"url": f"https://github.com/{full_repo}/blob/{default_branch}/{item['path']}",
|
| 102 |
+
}
|
| 103 |
+
for item in tree
|
| 104 |
+
if item["type"] == "blob"
|
| 105 |
+
]
|
| 106 |
+
|
| 107 |
+
return files, ""
|
| 108 |
+
except Exception as e:
|
| 109 |
+
return [], f"Error processing tree: {str(e)}"
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _search_similar_repos(org: str, repo: str, token: str) -> List[Dict[str, Any]]:
|
| 113 |
+
"""Search for similar repository names in the organization"""
|
| 114 |
+
headers = {
|
| 115 |
+
"Accept": "application/vnd.github+json",
|
| 116 |
+
"X-GitHub-Api-Version": "2022-11-28",
|
| 117 |
+
"Authorization": f"Bearer {token}",
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
# Search for repos in the org with similar name
|
| 121 |
+
query = f"org:{org} {repo}"
|
| 122 |
+
|
| 123 |
+
try:
|
| 124 |
+
response = requests.get(
|
| 125 |
+
"https://api.github.com/search/repositories",
|
| 126 |
+
headers=headers,
|
| 127 |
+
params={"q": query, "sort": "stars", "order": "desc", "per_page": 10},
|
| 128 |
+
timeout=30,
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
if response.status_code != 200:
|
| 132 |
+
return []
|
| 133 |
+
|
| 134 |
+
data = response.json()
|
| 135 |
+
items = data.get("items", [])
|
| 136 |
+
|
| 137 |
+
return [
|
| 138 |
+
{
|
| 139 |
+
"name": item.get("name"),
|
| 140 |
+
"full_name": item.get("full_name"),
|
| 141 |
+
"description": item.get("description"),
|
| 142 |
+
"stars": item.get("stargazers_count", 0),
|
| 143 |
+
"url": item.get("html_url"),
|
| 144 |
+
}
|
| 145 |
+
for item in items
|
| 146 |
+
]
|
| 147 |
+
except Exception:
|
| 148 |
+
return []
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _score_against_example_patterns(file_path: str) -> int:
|
| 152 |
+
"""Score file against example patterns using token_set_ratio"""
|
| 153 |
+
scores = []
|
| 154 |
+
for pattern in EXAMPLE_PATTERNS:
|
| 155 |
+
score = fuzz.token_set_ratio(pattern.lower(), file_path.lower())
|
| 156 |
+
scores.append(score)
|
| 157 |
+
return max(scores) if scores else 0
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def _score_against_keyword(file_path: str, keyword: str) -> int:
|
| 161 |
+
"""Calculate fuzzy match score for a file path against a keyword"""
|
| 162 |
+
# Use partial_ratio for substring matching (good for paths)
|
| 163 |
+
# Also check token_set_ratio for word-level matching
|
| 164 |
+
partial_score = fuzz.partial_ratio(keyword.lower(), file_path.lower())
|
| 165 |
+
token_score = fuzz.token_set_ratio(keyword.lower(), file_path.lower())
|
| 166 |
+
|
| 167 |
+
# Return the higher of the two
|
| 168 |
+
return max(partial_score, token_score)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def _get_pattern_priority(file_path: str) -> tuple[int, int, int]:
|
| 172 |
+
"""
|
| 173 |
+
Get priority of a file path based on which example pattern directory it's in.
|
| 174 |
+
|
| 175 |
+
Returns: (in_examples_dir, pattern_priority, path_depth)
|
| 176 |
+
- in_examples_dir: 0 if in examples/ directory, 1 otherwise (lower is better)
|
| 177 |
+
- pattern_priority: Index in EXAMPLE_PATTERNS (lower is better), or 999 if no match
|
| 178 |
+
- path_depth: Number of path segments (lower is better)
|
| 179 |
+
|
| 180 |
+
Note: Prioritizes files in "examples/" directory first, then by most specific pattern match.
|
| 181 |
+
E.g., "examples/scripts/train.py" is better than "scripts/util.py"
|
| 182 |
+
"""
|
| 183 |
+
path_lower = file_path.lower()
|
| 184 |
+
path_parts = path_lower.split("/")
|
| 185 |
+
|
| 186 |
+
# Check if file is in examples/ directory (highest priority)
|
| 187 |
+
in_examples_dir = 0 if (path_parts[0] in ["examples", "example"]) else 1
|
| 188 |
+
|
| 189 |
+
# Find ALL matching patterns and use the best (lowest index) one
|
| 190 |
+
# But prefer deeper matches (more specific) over shallow ones
|
| 191 |
+
best_priority = 999
|
| 192 |
+
best_depth_at_match = -1
|
| 193 |
+
|
| 194 |
+
for i, pattern in enumerate(EXAMPLE_PATTERNS):
|
| 195 |
+
# Check if pattern appears as a directory component in the path
|
| 196 |
+
if pattern in path_parts:
|
| 197 |
+
# Find the depth where this pattern appears (rightmost occurrence)
|
| 198 |
+
depth = len(path_parts) - 1 - path_parts[::-1].index(pattern)
|
| 199 |
+
|
| 200 |
+
# Prefer deeper matches, or better priority if at same depth
|
| 201 |
+
if depth > best_depth_at_match or (
|
| 202 |
+
depth == best_depth_at_match and i < best_priority
|
| 203 |
+
):
|
| 204 |
+
best_priority = i
|
| 205 |
+
best_depth_at_match = depth
|
| 206 |
+
|
| 207 |
+
return (in_examples_dir, best_priority, len(path_parts))
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def _handle_repo_tree_errors(
    all_files: List[Dict[str, Any]],
    error: str,
    org: str,
    repo: str,
    token: str,
) -> ToolResult | None:
    """Handle errors from repo tree fetch. Returns ToolResult if error, None if OK."""
    if error == "not_found":
        # Repo missing: try to be helpful by suggesting close matches.
        suggestions = _search_similar_repos(org, repo, token)

        if not suggestions:
            return {
                "formatted": f"Repository '{org}/{repo}' not found and no similar repositories found.",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        # Render the similar-repo suggestions as a numbered list.
        output = [f"**Repository '{org}/{repo}' not found. Similar repositories:**\n"]
        for index, candidate in enumerate(suggestions, 1):
            output.append(f"{index}. **{candidate['full_name']}** (⭐ {candidate['stars']:,} stars)")
            if candidate["description"]:
                # Truncate long descriptions to keep the listing compact.
                description = candidate["description"]
                if len(description) > 100:
                    description = description[:100] + "..."
                output.append(f" {description}")
            output.append(f" {candidate['url']}\n")

        return {
            "formatted": "\n".join(output),
            "totalResults": len(suggestions),
            "resultsShared": len(suggestions),
            "isError": True,
        }

    if error:
        # Any other fetch error (rate limit, permissions, network, ...).
        return {
            "formatted": f"Error accessing repository '{org}/{repo}': {error}",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    if not all_files:
        # Fetch succeeded but the tree is empty.
        return {
            "formatted": f"No files found in repository '{org}/{repo}'",
            "totalResults": 0,
            "resultsShared": 0,
        }

    # No error: the caller may proceed with all_files.
    return None
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def find_examples(
    keyword: str = "",
    repo: str = "",
    org: str = "huggingface",
    max_results: int = 10,
    min_score: int = 80,
) -> ToolResult:
    """
    Find example files in a repository using fuzzy matching.

    Args:
        keyword: Keyword to fuzzy match against file paths (e.g., "grpo").
            If empty, all example files are returned ranked by pattern priority.
        repo: Repository name (e.g., "trl"). Required.
        org: GitHub organization (default: "huggingface")
        max_results: Maximum number of results (default: 10)
        min_score: Minimum fuzzy match score for the keyword filter
            (0-100, default: 80)

    Returns:
        ToolResult with matching files, or similar repos if repo not found
    """
    # NOTE: the docstring previously claimed defaults of 50/60, contradicting
    # the actual signature defaults (10/80). The tool-router handler supplies
    # its own defaults (50/60), so the signature defaults only apply to
    # direct Python callers.
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return {
            "formatted": "Error: GITHUB_TOKEN environment variable is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    if not repo:
        return {
            "formatted": "Error: repo parameter is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Get all files in the repository
    all_files, error = _get_repo_tree(org, repo, token)

    # Handle errors (not found, API errors, empty repo)
    if error_result := _handle_repo_tree_errors(all_files, error, org, repo, token):
        return error_result

    # Step 1: Filter files by example patterns (score >= 60)
    example_threshold = 60
    example_files = []
    for file in all_files:
        example_score = _score_against_example_patterns(file["path"])
        if example_score >= example_threshold:
            example_files.append({**file, "example_score": example_score})

    if not example_files:
        return {
            "formatted": f"No example files found in {org}/{repo} (no files match example patterns with score >= {example_threshold}).",
            "totalResults": 0,
            "resultsShared": 0,
        }

    # Step 2: If keyword provided, score and filter by keyword
    if keyword:
        scored_files = []
        for file in example_files:
            keyword_score = _score_against_keyword(file["path"], keyword)
            if keyword_score >= min_score:
                scored_files.append({**file, "score": keyword_score})

        if not scored_files:
            return {
                "formatted": f"No files found in {org}/{repo} matching keyword '{keyword}' (min score: {min_score}) among {len(example_files)} example files.",
                "totalResults": 0,
                "resultsShared": 0,
            }

        # Sort by keyword score (descending) for best matches first
        scored_files.sort(key=lambda x: x["score"], reverse=True)
    else:
        # No keyword: prioritize by pattern directory, then path depth
        scored_files = []
        for file in example_files:
            in_examples_dir, pattern_priority, path_depth = _get_pattern_priority(
                file["path"]
            )
            scored_files.append(
                {
                    **file,
                    "score": file["example_score"],
                    "in_examples_dir": in_examples_dir,
                    "pattern_priority": pattern_priority,
                    "path_depth": path_depth,
                }
            )

        if not scored_files:
            return {
                "formatted": f"No example files found in {org}/{repo}.",
                "totalResults": 0,
                "resultsShared": 0,
            }

        # Sort by: 1) files in examples/ dir first, 2) pattern priority (scripts > datasets > etc), 3) path depth, 4) path name
        scored_files.sort(
            key=lambda x: (
                x["in_examples_dir"],
                x["pattern_priority"],
                x["path_depth"],
                x["path"],
            )
        )

    # Limit results
    results = scored_files[:max_results]

    # Format output
    keyword_desc = f" matching '{keyword}'" if keyword else ""
    lines = [f"**Found {len(results)} example files in {org}/{repo}{keyword_desc}:**"]
    if len(scored_files) > max_results:
        lines[0] += f" (showing {max_results} of {len(scored_files)})"
    lines.append("")

    for i, file in enumerate(results, 1):
        lines.append(f"{i}. **{file['path']}**")
        lines.append(f" Size: {file['size']:,} bytes | Ref: {file['ref'][:7]}")
        lines.append(f" URL: {file['url']}")

        # Copyable parameters for read_file tool
        read_params = f"{{'repo': '{org}/{repo}', 'path': '{file['path']}'}}"
        lines.append(f" To read, use: {read_params}")
        lines.append("")

    return {
        "formatted": "\n".join(lines),
        "totalResults": len(results),
        "resultsShared": len(results),
    }
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
# Tool specification
|
| 405 |
+
GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
    "name": "github_find_examples",
    # BUG FIX: the original had a stray trailing comma after the first
    # description fragment, which turned the parenthesised expression into
    # a 2-tuple instead of one implicitly-concatenated string. Any consumer
    # expecting a string description (e.g. an LLM tool schema) would break.
    "description": (
        "Discover best practices, reusable scripts, tutorials, and demos for using "
        "a specific library or framework. This is an important step before "
        "implementing anything ML related. "
        "Use together with github_read_file tool.\n\n"
        "## When to use this tool\n\n"
        "- ALWAYS before implementing any training/inference/benchmarking or other ML related code or answering how-to question.\n"
        "- When exploring a new repository and need to understand how to use it\n"
        "## How it works\n\n"
        "1. Fetches all (examples, tutorials, demos, notebooks, scripts, etc.) from the repository\n"
        "2. If keyword provided, scores found files against the keyword using fuzzy matching\n"
        "3. Returns best matches sorted by relevance score\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Find GRPO/SFT/DPO/RLOO etc training examples\n"
        "// Task: Starting GRPO fine-tuning project, need reference implementations\n"
        "{\n"
        " keyword: 'grpo',\n"
        " repo: 'trl',\n"
        " org: 'huggingface'\n"
        "}\n"
        "// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n"
        "// Next step: Use github_read_file to study the implementation\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Discover all training examples in TRL\n"
        "// Task: Exploring available training methods before choosing approach\n"
        "{\n"
        " repo: 'trl',\n"
        " org: 'huggingface',\n"
        " max_results: 20\n"
        "}\n"
        "// Lists all example scripts: PPO, DPO, GRPO, reward modeling, etc.\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Find LoRA fine-tuning examples\n"
        "// Task: Learning parameter-efficient fine-tuning with PEFT\n"
        "{\n"
        " keyword: 'lora',\n"
        " repo: 'peft',\n"
        " org: 'huggingface'\n"
        "}\n"
        "// Discovers LoRA configuration and training examples\n"
        "</example>"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "keyword": {
                "type": "string",
                "description": "Keyword to fuzzy match against file paths (e.g., 'grpo', 'sft').",
            },
            "repo": {
                "type": "string",
                "description": "Repository name (e.g., 'trl', 'transformers'). Required.",
            },
            "org": {
                "type": "string",
                "description": "GitHub organization or username. Default: 'huggingface'.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return. Default: 50.",
            },
            "min_score": {
                "type": "integer",
                "description": "Minimum fuzzy match score (0-100). Default: 60.",
            },
        },
        "required": ["repo"],
    },
}
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
async def github_find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Handler for agent tool router"""
    try:
        # Router-level defaults: the tool spec advertises max_results=50 and
        # min_score=60, so supply those explicitly here.
        outcome = find_examples(
            keyword=arguments.get("keyword", ""),
            repo=arguments["repo"],
            org=arguments.get("org", "huggingface"),
            max_results=arguments.get("max_results", 50),
            min_score=arguments.get("min_score", 60),
        )
        succeeded = not outcome.get("isError", False)
        return outcome["formatted"], succeeded
    except Exception as exc:
        return f"Error finding examples: {str(exc)}", False
|
agent/tools/github_list_repos.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub List Repositories Tool - List and sort repositories for any user or organization
|
| 3 |
+
|
| 4 |
+
Efficiently discover repositories with flexible sorting options.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from typing import Any, Dict, Literal, Optional
|
| 9 |
+
|
| 10 |
+
import requests
|
| 11 |
+
|
| 12 |
+
from agent.tools.types import ToolResult
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def list_repos(
    owner: str,
    owner_type: Literal["user", "org"] = "org",
    sort: Literal["stars", "forks", "updated", "created"] = "stars",
    order: Literal["asc", "desc"] = "desc",
    limit: Optional[int] = 30,
) -> ToolResult:
    """
    List repositories for a user or organization using GitHub REST API.

    Args:
        owner: GitHub username or organization name
        owner_type: Whether the owner is a "user" or "org" (default: "org")
        sort: Sort field - "stars", "forks", "updated", or "created"
        order: Sort order - "asc" or "desc" (default: "desc")
        limit: Maximum number of repositories to return

    Returns:
        ToolResult with repository information
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return {
            "formatted": "Error: GITHUB_TOKEN environment variable is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    if owner_type == "org":
        url = f"https://api.github.com/orgs/{owner}/repos"
    else:
        url = f"https://api.github.com/users/{owner}/repos"

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    all_repos = []
    page = 1
    per_page = 100  # Maximum allowed by GitHub

    # Map our sort values to GitHub API sort values.
    # Note: GitHub list repos API doesn't support sorting by stars/forks;
    # for those we must fetch all repos and sort in memory.
    api_sort_map = {
        "created": "created",
        "updated": "updated",
        "stars": None,  # Not supported by list API
        "forks": None,  # Not supported by list API
    }

    api_sort = api_sort_map.get(sort)
    need_manual_sort = api_sort is None

    try:
        while True:
            params = {
                "page": page,
                "per_page": per_page,
            }

            # Only add sort/direction if API supports it
            if api_sort:
                params["sort"] = api_sort
                params["direction"] = order

            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=30,
            )

            if response.status_code == 403:
                error_data = response.json()
                return {
                    "formatted": f"GitHub API rate limit or permission error: {error_data.get('message', 'Unknown error')}",
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }

            if response.status_code != 200:
                error_msg = f"GitHub API error (status {response.status_code})"
                try:
                    error_data = response.json()
                    if "message" in error_data:
                        error_msg += f": {error_data['message']}"
                except Exception:
                    pass
                return {
                    "formatted": error_msg,
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }

            items = response.json()

            if not items:
                break

            for item in items:
                all_repos.append(
                    {
                        "name": item.get("name"),
                        "full_name": item.get("full_name"),
                        "description": item.get("description"),
                        "html_url": item.get("html_url"),
                        "language": item.get("language"),
                        "stars": item.get("stargazers_count", 0),
                        "forks": item.get("forks_count", 0),
                        "open_issues": item.get("open_issues_count", 0),
                        "topics": item.get("topics", []),
                        "updated_at": item.get("updated_at"),
                        "created_at": item.get("created_at"),
                    }
                )

            # Check if we got fewer results than requested (last page)
            if len(items) < per_page:
                break

            # BUG FIX: only stop early when the API already returned results
            # in the requested order. When sorting by stars/forks we sort in
            # memory, so stopping at `limit` repos would rank only the first
            # pages fetched and could miss the actual top-N repositories.
            if limit and not need_manual_sort and len(all_repos) >= limit:
                break

            page += 1

    except requests.exceptions.RequestException as e:
        return {
            "formatted": f"Failed to connect to GitHub API: {str(e)}",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Manual sorting if needed (for stars/forks)
    if need_manual_sort and all_repos:
        reverse = order == "desc"
        all_repos.sort(key=lambda x: x[sort], reverse=reverse)

    # Apply limit after sorting
    if limit:
        all_repos = all_repos[:limit]

    if not all_repos:
        return {
            "formatted": f"No repositories found for {owner_type} '{owner}'",
            "totalResults": 0,
            "resultsShared": 0,
        }

    # Format output
    lines = [f"**Found {len(all_repos)} repositories for {owner}:**\n"]

    for i, repo in enumerate(all_repos, 1):
        lines.append(f"{i}. **{repo['full_name']}**")
        lines.append(
            f" ⭐ {repo['stars']:,} stars | 🍴 {repo['forks']:,} forks | Language: {repo['language'] or 'N/A'}"
        )
        if repo["description"]:
            desc = (
                repo["description"][:100] + "..."
                if len(repo["description"]) > 100
                else repo["description"]
            )
            lines.append(f" {desc}")
        lines.append(f" URL: {repo['html_url']}")
        if repo["topics"]:
            lines.append(f" Topics: {', '.join(repo['topics'][:5])}")

        # Copyable parameters for other tools
        lines.append(f" Use in tools: {{'repo': '{repo['full_name']}'}}")
        lines.append("")

    return {
        "formatted": "\n".join(lines),
        "totalResults": len(all_repos),
        "resultsShared": len(all_repos),
    }
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# Tool specification
|
| 202 |
+
GITHUB_LIST_REPOS_TOOL_SPEC = {
    "name": "github_list_repos",
    # FIXES: the "When debugging" bullet was missing its trailing "\n", fusing
    # it with the next bullet into one line; "to looking up" was ungrammatical;
    # the limit description contradicted itself ("No limit if not specified.
    # Default: 30." — the handler applies a default of 30).
    "description": (
        "List and discover repositories for any GitHub user or organization with flexible sorting.\n\n"
        "Returns comprehensive repository information including stars, forks, language, topics, and direct URLs. "
        "Sorts by stars, forks, update date, or creation date.\n\n"
        "## When to use this tool\n\n"
        "- When you need to find libraries to use in your implementation, or to explore what repositories exist for a task.\n"
        "- When debugging an error, to look up whether others are having the same issues in repositories.\n"
        "- When finding the most popular or active projects for a user or org\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Discover HF libraries for RLHF/alignment\n"
        "// Use case: Find the right library for training with human feedback\n"
        "{\n"
        " owner: 'huggingface',\n"
        " owner_type: 'org',\n"
        " sort: 'stars',\n"
        " limit: 10\n"
        "}\n"
        "// Returns: transformers, trl, peft, accelerate, diffusers...\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Check for recently updated HF repos\n"
        "// Use case: Find actively maintained libraries with latest features\n"
        "{\n"
        " owner: 'huggingface',\n"
        " owner_type: 'org',\n"
        " sort: 'updated',\n"
        " order: 'desc',\n"
        " limit: 15\n"
        "}\n"
        "// Helps identify which repos have recent improvements/fixes\n"
        "</example>"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "owner": {
                "type": "string",
                "description": "GitHub username or organization name. Required.",
            },
            "owner_type": {
                "type": "string",
                "enum": ["user", "org"],
                "description": "Whether the owner is a 'user' or 'org'. Default: 'org'.",
            },
            "sort": {
                "type": "string",
                "enum": ["stars", "forks", "updated", "created"],
                "description": "Sort field. Options: 'stars', 'forks', 'updated', 'created'. Default: 'stars'.",
            },
            "order": {
                "type": "string",
                "enum": ["asc", "desc"],
                "description": "Sort order. Options: 'asc', 'desc'. Default: 'desc'.",
            },
            "limit": {
                "type": "integer",
                "description": "Maximum number of repositories to return. Default: 30.",
            },
        },
        "required": ["owner"],
    },
}
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
async def github_list_repos_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Handler for agent tool router"""
    try:
        result = list_repos(
            owner=arguments["owner"],
            owner_type=arguments.get("owner_type", "org"),
            sort=arguments.get("sort", "stars"),
            order=arguments.get("order", "desc"),
            # BUG FIX: `.get("limit")` previously passed None when the
            # argument was absent, which disabled the limit entirely and
            # contradicted both the tool spec ("Default: 30") and the
            # default declared in list_repos' own signature.
            limit=arguments.get("limit", 30),
        )
        return result["formatted"], not result.get("isError", False)
    except Exception as e:
        return f"Error listing repositories: {str(e)}", False
|
agent/tools/github_read_file.py
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub Read File Tool - Read file contents from any GitHub repository with line range support
|
| 3 |
+
|
| 4 |
+
Fetch exact file contents with metadata, supporting line ranges for efficient reading.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import base64
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from typing import Any, Dict, Optional
|
| 11 |
+
|
| 12 |
+
import nbformat
|
| 13 |
+
import requests
|
| 14 |
+
from nbconvert import MarkdownExporter
|
| 15 |
+
from nbconvert.preprocessors import ClearOutputPreprocessor, TagRemovePreprocessor
|
| 16 |
+
|
| 17 |
+
from agent.tools.types import ToolResult
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _convert_ipynb_to_markdown(content: str) -> str:
|
| 21 |
+
"""
|
| 22 |
+
Convert Jupyter notebook JSON to LLM-friendly Markdown.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
content: Raw notebook JSON string
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
Converted Markdown string
|
| 29 |
+
"""
|
| 30 |
+
try:
|
| 31 |
+
# Parse notebook JSON
|
| 32 |
+
nb_dict = json.loads(content)
|
| 33 |
+
|
| 34 |
+
# Normalize cell sources (can be string or list of strings)
|
| 35 |
+
if "cells" in nb_dict:
|
| 36 |
+
for cell in nb_dict["cells"]:
|
| 37 |
+
if "source" in cell and isinstance(cell["source"], list):
|
| 38 |
+
cell["source"] = "".join(cell["source"])
|
| 39 |
+
|
| 40 |
+
# Read notebook with explicit version
|
| 41 |
+
nb = nbformat.reads(json.dumps(nb_dict), as_version=4)
|
| 42 |
+
|
| 43 |
+
# Strip outputs for LLM readability (outputs can be noisy/large)
|
| 44 |
+
clear = ClearOutputPreprocessor()
|
| 45 |
+
nb, _ = clear.preprocess(nb, {})
|
| 46 |
+
|
| 47 |
+
# Optionally remove cells tagged with "hide" or similar
|
| 48 |
+
remove = TagRemovePreprocessor(
|
| 49 |
+
remove_cell_tags={"hide", "hidden", "remove"},
|
| 50 |
+
remove_input_tags=set(),
|
| 51 |
+
remove_all_outputs_tags=set(),
|
| 52 |
+
)
|
| 53 |
+
nb, _ = remove.preprocess(nb, {})
|
| 54 |
+
|
| 55 |
+
# Convert to markdown
|
| 56 |
+
exporter = MarkdownExporter()
|
| 57 |
+
markdown, _ = exporter.from_notebook_node(nb)
|
| 58 |
+
|
| 59 |
+
return markdown
|
| 60 |
+
|
| 61 |
+
except json.JSONDecodeError:
|
| 62 |
+
return content
|
| 63 |
+
except Exception:
|
| 64 |
+
return content
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def read_file(
    repo: str,
    path: str,
    ref: str = "HEAD",
    line_start: Optional[int] = None,
    line_end: Optional[int] = None,
) -> ToolResult:
    """
    Read file contents from a GitHub repository with line range support.

    Args:
        repo: Repository in format "owner/repo" (e.g., "github/github-mcp-server")
        path: Path to file in repository (e.g., "pkg/github/search.go")
        ref: Git reference - branch name, tag, or commit SHA (default: "HEAD")
        line_start: Starting line number (1-indexed, inclusive)
        line_end: Ending line number (1-indexed, inclusive)

    Returns:
        ToolResult with file contents and metadata; errors are reported in the
        result dict with isError=True, never raised to the caller.
    """
    # No `import base64` is visible in this module's import block; import it
    # locally so the base64 decode path below can never hit a NameError.
    # (Harmless if the module header also imports it.)
    import base64

    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return {
            "formatted": "Error: GITHUB_TOKEN environment variable is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Parse repo: must contain an owner segment.
    if "/" not in repo:
        return {
            "formatted": "Error: repo must be in format 'owner/repo'",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # split("/", 1) keeps repo names that themselves contain "/" intact.
    owner, repo_name = repo.split("/", 1)

    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    # Fetch file contents via the contents API.
    url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{path}"
    params = {}
    if ref and ref != "HEAD":
        params["ref"] = ref

    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 404:
            return {
                "formatted": f"File not found: {path} in {repo} (ref: {ref})",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        if response.status_code != 200:
            error_msg = f"GitHub API error (status {response.status_code})"
            try:
                error_data = response.json()
                if "message" in error_data:
                    error_msg += f": {error_data['message']}"
            except Exception:
                pass  # non-JSON error body: keep the status-only message
            return {
                "formatted": error_msg,
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        data = response.json()

        # The contents endpoint also answers for directories/symlinks; only
        # regular files are readable here.
        if data.get("type") != "file":
            return {
                "formatted": f"Path {path} is not a file (type: {data.get('type')})",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        # Decode content: the API returns base64 (possibly with embedded
        # newlines) for files up to ~1MB; larger files come back with an
        # empty "content" field and must be fetched as raw media.
        content_b64 = data.get("content", "")
        if content_b64:
            content_b64 = content_b64.replace("\n", "").replace(" ", "")
            content = base64.b64decode(content_b64).decode("utf-8", errors="replace")
        else:
            # For large files, fetch raw content with the raw media type.
            raw_headers = {
                "Accept": "application/vnd.github.raw",
                "X-GitHub-Api-Version": "2022-11-28",
                "Authorization": f"Bearer {token}",
            }
            raw_response = requests.get(
                url, headers=raw_headers, params=params, timeout=30
            )
            if raw_response.status_code != 200:
                return {
                    "formatted": "Failed to fetch file content",
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }
            content = raw_response.text

        # Notebooks are converted to Markdown for LLM readability.
        if path.lower().endswith(".ipynb"):
            content = _convert_ipynb_to_markdown(content)

        # Process line ranges.
        lines = content.split("\n")
        total_lines = len(lines)

        truncated = False

        if line_start is None and line_end is None:
            # No range specified: cap at 300 lines so huge files do not
            # flood the context window.
            if total_lines > 300:
                line_start = 1
                line_end = 300
                truncated = True
            else:
                line_start = 1
                line_end = total_lines
        else:
            # Partial range specified: fill in the missing bound.
            if line_start is None:
                line_start = 1
            if line_end is None:
                line_end = total_lines

        # Clamp to the file, then reject inverted ranges.
        line_start = max(1, line_start)
        line_end = min(total_lines, line_end)
        if line_start > line_end:
            return {
                "formatted": f"Invalid range: line_start ({line_start}) > line_end ({line_end})",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        # Extract the selected (1-indexed, inclusive) window.
        selected_lines = lines[line_start - 1 : line_end]
        selected_content = "\n".join(selected_lines)

        # Format output.
        lines_output = [f"**Reading file from repo: {repo}, path: {path}**"]

        if ref and ref != "HEAD":
            lines_output.append(f"Ref: {ref}")

        # Fixed: the bold marker was previously unbalanced ("**File content:"),
        # which broke Markdown rendering of everything after it.
        lines_output.append("\n**File content:**")
        lines_output.append("```")
        lines_output.append(selected_content)
        lines_output.append("```")
        if truncated:
            lines_output.append(
                f"Currently showing lines {line_start}-{line_end} out of {total_lines} total lines. Use line_start and line_end to view more lines."
            )
        return {
            "formatted": "\n".join(lines_output),
            "totalResults": 1,
            "resultsShared": 1,
        }

    except requests.exceptions.RequestException as e:
        return {
            "formatted": f"Failed to connect to GitHub API: {str(e)}",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# Tool specification
# Consumed by the agent tool router: "description" is LLM-facing usage
# guidance (prompt text), "parameters" is a JSON-Schema-style argument spec.
# NOTE: every string below is runtime prompt content — edit with care.
GITHUB_READ_FILE_TOOL_SPEC = {
    "name": "github_read_file",
    "description": (
        "Read file contents from any GitHub repository with line range support.\n\n"
        "Fetches exact file contents in the given line range (default 300 lines, use line_start/line_end adjust). \n\n"
        "## When to use this tool\n\n"
        "- When reading example code, implementations, or documentation on a specific github file\n"
        "- When you found a file via github_list_repos, or github_find_examples and need its contents\n"
        "- When investigating specific code sections with line ranges\n"
        "- When reading from specific branches, tags, or commits\n"
        "## When NOT to use this tool\n\n"
        "- When you don't know the exact file path beforehand (use github_search_code or github_find_examples first)\n\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Reading example code from for GRPO training with TRL\n"
        "// Use case: Read trainer class to understand API and methods\n"
        "{\n"
        " repo: 'huggingface/trl',\n"
        " path: 'trl/trainer/grpo_trainer.py',\n"
        " line_start: 1,\n"
        " line_end: 200\n"
        "}\n"
        "// Read class definition and constructor to understand parameters\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Study complete training script\n"
        "// Use case: Learn end-to-end VLM fine-tuning with GRPO\n"
        "{\n"
        " repo: 'huggingface/trl',\n"
        " path: 'examples/scripts/grpo_vlm.py'\n"
        "}\n"
        "// Returns first 300 lines of the file\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Check configuration patterns\n"
        "// Use case: Learn how to structure training configs\n"
        "{\n"
        " repo: 'huggingface/transformers',\n"
        " path: 'examples/pytorch/language-modeling/run_clm.py',\n"
        " line_start: 50,\n"
        " line_end: 150\n"
        "}\n"
        "// Read argument parsing and config setup section\n"
        "</example>"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "repo": {
                "type": "string",
                "description": "Repository in format 'owner/repo' (e.g., 'github/github-mcp-server'). Required.",
            },
            "path": {
                "type": "string",
                "description": "Path to file in repository (e.g., 'src/index.js'). Required.",
            },
            "ref": {
                "type": "string",
                "description": "Git reference - branch name, tag, or commit SHA. Default: 'HEAD'.",
            },
            "line_start": {
                "type": "integer",
                "description": "Starting line number (1-indexed, inclusive). Optional.",
            },
            "line_end": {
                "type": "integer",
                "description": "Ending line number (1-indexed, inclusive). Optional.",
            },
        },
        "required": ["repo", "path"],
    },
}
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
async def github_read_file_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapter between the agent tool router and read_file().

    Unpacks the router's argument dict, invokes the tool, and reports
    (formatted_text, ok_flag); any exception becomes a failure message.
    """
    try:
        call_kwargs = {
            "repo": arguments["repo"],
            "path": arguments["path"],
            "ref": arguments.get("ref", "HEAD"),
            "line_start": arguments.get("line_start"),
            "line_end": arguments.get("line_end"),
        }
        outcome = read_file(**call_kwargs)
        succeeded = not outcome.get("isError", False)
        return outcome["formatted"], succeeded
    except Exception as exc:
        return f"Error reading file: {str(exc)}", False
|
agent/tools/github_search_code.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub Code Search Tool - Search code across GitHub with intelligent filtering
|
| 3 |
+
|
| 4 |
+
Maps user-friendly patterns to GitHub's Code Search API capabilities.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import fnmatch
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
from typing import Any, Dict, Optional
|
| 11 |
+
|
| 12 |
+
import requests
|
| 13 |
+
|
| 14 |
+
from agent.tools.types import ToolResult
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _glob_match(text: str, pattern: str) -> bool:
|
| 18 |
+
"""Check if text matches glob pattern, supporting ** for multi-level paths"""
|
| 19 |
+
if "**" in pattern:
|
| 20 |
+
regex_pattern = pattern.replace("**", "<<<DOUBLESTAR>>>")
|
| 21 |
+
regex_pattern = fnmatch.translate(regex_pattern)
|
| 22 |
+
regex_pattern = regex_pattern.replace("<<<DOUBLESTAR>>>", ".*")
|
| 23 |
+
return re.match(regex_pattern, text) is not None
|
| 24 |
+
return fnmatch.fnmatch(text, pattern)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _parse_repo_filter(repo_pattern: str) -> tuple[Optional[str], Optional[str]]:
|
| 28 |
+
"""
|
| 29 |
+
Parse repository pattern into GitHub API filter and client-side glob pattern.
|
| 30 |
+
|
| 31 |
+
Returns: (api_filter, client_glob)
|
| 32 |
+
- api_filter: GitHub API filter string (e.g., "org:huggingface")
|
| 33 |
+
- client_glob: Pattern for client-side filtering (e.g., "huggingface/trl*")
|
| 34 |
+
|
| 35 |
+
Examples:
|
| 36 |
+
"huggingface/trl" → ("repo:huggingface/trl", None)
|
| 37 |
+
"huggingface/*" → ("org:huggingface", "huggingface/*")
|
| 38 |
+
"huggingface/trl*" → ("org:huggingface", "huggingface/trl*")
|
| 39 |
+
"huggingface" → ("org:huggingface", None)
|
| 40 |
+
"*/*" → (None, "*/*")
|
| 41 |
+
"""
|
| 42 |
+
if not repo_pattern:
|
| 43 |
+
return None, None
|
| 44 |
+
|
| 45 |
+
# Pattern: owner/repo (exact match)
|
| 46 |
+
if "/" in repo_pattern and "*" not in repo_pattern and "?" not in repo_pattern:
|
| 47 |
+
return f"repo:{repo_pattern}", None
|
| 48 |
+
|
| 49 |
+
# Pattern: owner/* or owner/prefix* (org + client filter)
|
| 50 |
+
if "/" in repo_pattern and ("*" in repo_pattern or "?" in repo_pattern):
|
| 51 |
+
org_name = repo_pattern.split("/")[0]
|
| 52 |
+
if "*" not in org_name and "?" not in org_name:
|
| 53 |
+
return f"org:{org_name}", repo_pattern
|
| 54 |
+
# Org name has wildcards - can't filter server-side
|
| 55 |
+
return None, repo_pattern
|
| 56 |
+
|
| 57 |
+
# Pattern: owner (just org name, no wildcards)
|
| 58 |
+
if "*" not in repo_pattern and "?" not in repo_pattern:
|
| 59 |
+
return f"org:{repo_pattern}", None
|
| 60 |
+
|
| 61 |
+
# Pattern: */* or other complex patterns (client-side only)
|
| 62 |
+
return None, repo_pattern
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _parse_path_filter(path_pattern: str) -> tuple[Optional[str], Optional[str]]:
|
| 66 |
+
"""
|
| 67 |
+
Parse path pattern into GitHub API filter and client-side glob pattern.
|
| 68 |
+
|
| 69 |
+
Returns: (api_filter, client_glob)
|
| 70 |
+
|
| 71 |
+
Examples:
|
| 72 |
+
"*.py" → ("extension:py", None)
|
| 73 |
+
"**/*.py" → ("extension:py", None)
|
| 74 |
+
"src/**/*.py" → ("extension:py", "src/**/*.py")
|
| 75 |
+
"test_*.py" → ("extension:py", "test_*.py")
|
| 76 |
+
"src/main.py" → ("path:src/main.py", None)
|
| 77 |
+
"""
|
| 78 |
+
if not path_pattern:
|
| 79 |
+
return None, None
|
| 80 |
+
|
| 81 |
+
# Exact path (no wildcards)
|
| 82 |
+
if "*" not in path_pattern and "?" not in path_pattern:
|
| 83 |
+
return f"path:{path_pattern}", None
|
| 84 |
+
|
| 85 |
+
# Extract extension if present
|
| 86 |
+
ext_match = re.search(r"\*\.(\w+)$", path_pattern)
|
| 87 |
+
if ext_match:
|
| 88 |
+
extension = ext_match.group(1)
|
| 89 |
+
api_filter = f"extension:{extension}"
|
| 90 |
+
|
| 91 |
+
# Check if there's a directory prefix that needs client-side filtering
|
| 92 |
+
# e.g., "src/**/*.py" needs client filter, "**/*.py" doesn't
|
| 93 |
+
if path_pattern in [f"*.{extension}", f"**/*.{extension}"]:
|
| 94 |
+
# Simple patterns - API filter is enough
|
| 95 |
+
return api_filter, None
|
| 96 |
+
else:
|
| 97 |
+
# Complex pattern - need client-side filter too
|
| 98 |
+
return api_filter, path_pattern
|
| 99 |
+
|
| 100 |
+
# Pattern like "test_*.py" or "README*" - use filename with client filter
|
| 101 |
+
# GitHub's filename: doesn't support wildcards, so we rely on client-side
|
| 102 |
+
if "/" not in path_pattern:
|
| 103 |
+
# Try to extract extension for API filtering
|
| 104 |
+
if "." in path_pattern:
|
| 105 |
+
parts = path_pattern.rsplit(".", 1)
|
| 106 |
+
if "*" not in parts[-1] and "?" not in parts[-1]:
|
| 107 |
+
# Extension is clean
|
| 108 |
+
return f"extension:{parts[-1]}", path_pattern
|
| 109 |
+
# No extension or complex - client-side only
|
| 110 |
+
return None, path_pattern
|
| 111 |
+
|
| 112 |
+
# Complex path pattern - client-side only
|
| 113 |
+
return None, path_pattern
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def search_code(
    query: str,
    repo_pattern: Optional[str] = None,
    path_pattern: Optional[str] = None,
    regex: bool = False,
    max_results: int = 20,
) -> ToolResult:
    """
    Search for code across GitHub with intelligent pattern matching.

    This tool intelligently maps user patterns to GitHub's Code Search API capabilities:

    Repository Patterns:
    - "owner/repo" → Searches exact repository
    - "owner/*" or "owner" → Searches all repos in organization
    - "*/*" → Searches all GitHub (no repo filter)
    - Wildcards trigger client-side filtering when needed

    Path Patterns:
    - "*.py" → Searches all Python files
    - "**/*.js" → Searches all JavaScript files (any directory)
    - "src/**/*.py" → Python files in src/ (uses client-side filtering)
    - "test_*.py" → Files matching pattern (client-side filtering)
    - "path/to/file.py" → Exact file path

    Args:
        query: Search term or pattern to find in code
        repo_pattern: Repository pattern (e.g., "huggingface/trl", "huggingface/*", "huggingface")
        path_pattern: File path pattern (e.g., "*.py", "src/**/*.js")
        regex: If True, treat query as regular expression
        max_results: Maximum number of results to return (default 20)

    Returns:
        ToolResult with code matches and snippets
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return {
            "formatted": "Error: GITHUB_TOKEN environment variable is required",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Build GitHub API query
    query_parts = []

    # Add search term
    if regex:
        # NOTE(review): /.../ regex delimiters — assumes the REST code-search
        # endpoint honors this syntax; verify against current GitHub docs.
        query_parts.append(f"/{query}/")
    else:
        # Quote multi-word queries so they search as a phrase.
        query_parts.append(f'"{query}"' if " " in query else query)

    # Parse repository filter (server-side qualifier + optional client glob)
    repo_api_filter, repo_client_glob = _parse_repo_filter(repo_pattern)
    if repo_api_filter:
        query_parts.append(repo_api_filter)

    # Parse path filter (same split: API qualifier + optional client glob)
    path_api_filter, path_client_glob = _parse_path_filter(path_pattern)
    if path_api_filter:
        query_parts.append(path_api_filter)

    github_query = " ".join(query_parts)

    headers = {
        # text-match media type makes the API return matching code fragments.
        "Accept": "application/vnd.github.text-match+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    all_matches = []
    page = 1
    # GitHub caps per_page at 100; smaller max_results shrinks the request.
    per_page = min(100, max_results)

    try:
        # Page through results until we have enough matches, the API runs
        # dry, or we have seen everything the API reports as available.
        while len(all_matches) < max_results:
            params = {
                "q": github_query,
                "page": page,
                "per_page": per_page,
            }

            response = requests.get(
                "https://api.github.com/search/code",
                headers=headers,
                params=params,
                timeout=30,
            )

            # 403 covers both rate limiting and missing scopes.
            if response.status_code == 403:
                error_data = response.json()
                return {
                    "formatted": f"GitHub API rate limit or permission error: {error_data.get('message', 'Unknown error')}",
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }

            if response.status_code != 200:
                error_msg = f"GitHub API error (status {response.status_code})"
                try:
                    error_data = response.json()
                    if "message" in error_data:
                        error_msg += f": {error_data['message']}"
                except Exception:
                    pass  # non-JSON error body: keep the status-only message
                return {
                    "formatted": error_msg,
                    "totalResults": 0,
                    "resultsShared": 0,
                    "isError": True,
                }

            data = response.json()
            items = data.get("items", [])

            if not items:
                break

            for item in items:
                repo_name = item.get("repository", {}).get("full_name", "unknown")
                file_path = item.get("path", "")
                sha = item.get("sha", "")

                # Apply client-side filtering for patterns the API qualifiers
                # could not fully express.
                if repo_client_glob and not _glob_match(repo_name, repo_client_glob):
                    continue
                if path_client_glob and not _glob_match(file_path, path_client_glob):
                    continue

                # Extract text matches (one result entry per fragment).
                text_matches = item.get("text_matches", [])
                if text_matches:
                    for text_match in text_matches:
                        fragment = text_match.get("fragment", "")
                        lines = fragment.split("\n")
                        # NOTE(review): line_end below is the fragment's
                        # non-empty line count, not a position in the file.
                        line_count = len([line for line in lines if line.strip()])

                        all_matches.append(
                            {
                                "repo": repo_name,
                                "path": file_path,
                                "ref": sha,
                                "line_start": 1,
                                "line_end": line_count,
                                "snippet": fragment.strip(),
                                "url": item.get("html_url", ""),
                            }
                        )
                else:
                    # No fragment returned: record the hit without a snippet.
                    all_matches.append(
                        {
                            "repo": repo_name,
                            "path": file_path,
                            "ref": sha,
                            "line_start": 1,
                            "line_end": 1,
                            "snippet": "(snippet not available)",
                            "url": item.get("html_url", ""),
                        }
                    )

            # Stop once we've consumed everything the API says exists.
            # NOTE(review): all_matches can hold several entries per item, so
            # this comparison against total_count (an item count) is loose —
            # the empty-items break above is the hard stop.
            if len(all_matches) >= data.get("total_count", 0):
                break

            page += 1

    except requests.exceptions.RequestException as e:
        return {
            "formatted": f"Failed to connect to GitHub API: {str(e)}",
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    # Trim to the caller's cap; fragments beyond it are discarded.
    results = all_matches[:max_results]

    if not results:
        return {
            "formatted": f"No code matches found for query: {query}",
            "totalResults": 0,
            "resultsShared": 0,
        }

    # Format output
    lines_output = [f"**Found {len(results)} code matches:**\n"]

    for i, match in enumerate(results, 1):
        lines_output.append(f"{i}. **{match['repo']}:{match['path']}**")
        lines_output.append(
            f" Lines: {match['line_start']}-{match['line_end']} | Ref: {match['ref'][:7]}"
        )
        lines_output.append(f" URL: {match['url']}")

        # Copyable parameters for read_file tool
        read_params = f"{{'repo': '{match['repo']}', 'path': '{match['path']}', 'ref': '{match['ref'][:7]}'}}"
        lines_output.append(f" To read, use: {read_params}")

        # Show snippet (first 5 lines)
        snippet_lines = match["snippet"].split("\n")[:5]
        if snippet_lines:
            lines_output.append(" ```")
            for line in snippet_lines:
                lines_output.append(f" {line}")
            if len(match["snippet"].split("\n")) > 5:
                lines_output.append(" ...")
            lines_output.append(" ```")
        lines_output.append("")

    return {
        "formatted": "\n".join(lines_output),
        "totalResults": len(results),
        "resultsShared": len(results),
    }
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# Tool specification
# Consumed by the agent tool router: "description" is LLM-facing usage
# guidance (prompt text), "parameters" is a JSON-Schema-style argument spec.
# NOTE: every string below is runtime prompt content — edit with care.
GITHUB_SEARCH_CODE_TOOL_SPEC = {
    "name": "github_search_code",
    "description": (
        "Search for code patterns across GitHub repositories with intelligent pattern matching.\n\n"
        "Searches for specific code patterns, functions, classes, or implementations across GitHub. "
        "Intelligently maps patterns to GitHub's Code Search API for efficient server-side filtering, "
        "with automatic client-side filtering for complex patterns. Returns code snippets with context.\n\n"
        "## When to use this tool\n\n"
        "- When searching for specific code patterns, functions, or classes across repositories\n"
        "- When looking for implementation examples of specific methods or APIs\n"
        "- When you need to find where specific code exists across multiple files or repos\n"
        "- When investigating how a feature is implemented in different repositories\n"
        "- When searching for TODO comments, specific patterns, or code structures\n"
        "- Use this for searching actual implementation code (not examples - use github_find_examples for those)\n\n"
        "## When NOT to use this tool\n\n"
        "- When looking for example files or tutorials (use github_find_examples instead)\n"
        "- When you already know the exact file path (use github_read_file directly)\n"
        "- When you need to list repositories (use github_list_repos instead)\n\n"
        "## Repository Patterns\n\n"
        "- **Exact repo**: `'huggingface/trl'` → Searches only that repository\n"
        "- **Organization**: `'huggingface'` or `'huggingface/*'` → All repos in organization\n"
        "- **All GitHub**: `'*/*'` or omit repo_pattern → Searches across all GitHub\n"
        "- **Wildcards**: `'huggingface/trl*'` → Automatic client-side filtering for complex patterns\n\n"
        "## Path Patterns\n\n"
        "- **Extension**: `'*.py'` or `'**/*.py'` → All Python files\n"
        "- **Directory**: `'src/**/*.js'` → JavaScript files in src/ directory (client-filtered)\n"
        "- **Pattern**: `'test_*.py'` → Files matching pattern (client-filtered)\n"
        "- **Exact path**: `'README.md'` → Specific file\n\n"
        "## How it works\n\n"
        "1. Parses repository and path patterns\n"
        "2. Converts to GitHub API filters when possible (server-side, fast)\n"
        "3. Falls back to client-side filtering for complex patterns\n"
        "4. Returns code snippets with line numbers, URLs, and file refs\n"
        "5. Results can be used directly with github_read_file tool\n\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Find how AutoModelForCausalLM is used\n"
        "// Use case: Learning best practices for loading LLMs in TRL\n"
        "{\n"
        " query: 'AutoModelForCausalLM.from_pretrained',\n"
        " repo_pattern: 'huggingface/trl',\n"
        " path_pattern: '*.py'\n"
        "}\n"
        "// Finds all model loading patterns with quantization, device_map, etc.\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Discover TrainingArguments configurations\n"
        "// Use case: Setting up training hyperparameters correctly\n"
        "{\n"
        " query: 'TrainingArguments',\n"
        " repo_pattern: 'huggingface/transformers',\n"
        " path_pattern: 'examples/**/*.py',\n"
        " max_results: 10\n"
        "}\n"
        "// Shows various TrainingArguments setups across different tasks\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Find dataset preprocessing patterns\n"
        "// Use case: Learning how to prepare data for instruction tuning\n"
        "{\n"
        " query: 'map(tokenize',\n"
        " repo_pattern: 'huggingface',\n"
        " path_pattern: '*.py'\n"
        "}\n"
        "// Discovers tokenization and dataset mapping patterns\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Find all Trainer class implementations\n"
        "// Use case: Understanding available trainer variants for different tasks\n"
        "{\n"
        " query: 'class \\\\w+Trainer\\\\(',\n"
        " repo_pattern: 'huggingface/trl',\n"
        " path_pattern: 'trl/trainer/**/*.py',\n"
        " regex: true\n"
        "}\n"
        "// Lists: GRPOTrainer, DPOTrainer, PPOTrainer, RewardTrainer, etc.\n"
        "</example>"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search term or pattern to find in code. Required.",
            },
            "repo_pattern": {
                "type": "string",
                "description": "Repository pattern: 'owner/repo' (exact), 'owner' (org), 'owner/*' (org with filter), '*/*' (all). Optional.",
            },
            "path_pattern": {
                "type": "string",
                "description": "File path pattern: '*.ext' (extension), 'dir/**/*.ext' (directory), 'pattern*.ext' (name pattern). Optional.",
            },
            "regex": {
                "type": "boolean",
                "description": "If true, treat query as regular expression. Default: false.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return. Default: 20.",
            },
        },
        "required": ["query"],
    },
}
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
async def github_search_code_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapter between the agent tool router and search_code().

    Unpacks the router's argument dict, invokes the tool, and reports
    (formatted_text, ok_flag); any exception becomes a failure message.
    """
    try:
        call_kwargs = {
            "query": arguments["query"],
            "repo_pattern": arguments.get("repo_pattern"),
            "path_pattern": arguments.get("path_pattern"),
            "regex": arguments.get("regex", False),
            "max_results": arguments.get("max_results", 20),
        }
        outcome = search_code(**call_kwargs)
        succeeded = not outcome.get("isError", False)
        return outcome["formatted"], succeeded
    except Exception as exc:
        return f"Error searching code: {str(exc)}", False
|
agent/tools/jobs_tool.py
CHANGED
|
@@ -40,6 +40,20 @@ GPU_FLAVORS = [
|
|
| 40 |
"h100",
|
| 41 |
"h100x8",
|
| 42 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
SPECIALIZED_FLAVORS = ["inf2x6"]
|
| 44 |
ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
|
| 45 |
|
|
@@ -741,12 +755,12 @@ HF_JOBS_TOOL_SPEC = {
|
|
| 741 |
"1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
|
| 742 |
"2. **Docker mode:** Provide 'image' + 'command' → full control\n"
|
| 743 |
"(script and command are mutually exclusive)\n\n"
|
| 744 |
-
"## Hardware:\n"
|
| 745 |
-
"CPU:
|
| 746 |
-
"GPU:
|
| 747 |
"## Examples:\n\n"
|
| 748 |
"**Fine-tune LLM and push to Hub:**\n"
|
| 749 |
-
"{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"
|
| 750 |
"**Generate dataset daily and upload:**\n"
|
| 751 |
"{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
|
| 752 |
"**Run custom training with Docker:**\n"
|
|
@@ -807,7 +821,7 @@ HF_JOBS_TOOL_SPEC = {
|
|
| 807 |
# Hardware and environment
|
| 808 |
"hardware_flavor": {
|
| 809 |
"type": "string",
|
| 810 |
-
"description": "Hardware type. CPU:
|
| 811 |
},
|
| 812 |
"timeout": {
|
| 813 |
"type": "string",
|
|
|
|
| 40 |
"h100",
|
| 41 |
"h100x8",
|
| 42 |
]
|
| 43 |
+
|
| 44 |
+
# Detailed specs for display (vCPU/RAM/GPU VRAM)
|
| 45 |
+
CPU_FLAVORS_DESC = (
|
| 46 |
+
"cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB), cpu-performance, cpu-xl"
|
| 47 |
+
)
|
| 48 |
+
GPU_FLAVORS_DESC = (
|
| 49 |
+
"t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
|
| 50 |
+
"l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
|
| 51 |
+
"l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB), "
|
| 52 |
+
"a10g-small(4vCPU/14GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
|
| 53 |
+
"a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
|
| 54 |
+
"a100-large(12vCPU/142GB/GPU 80GB), h100(23vCPU/240GB/GPU 80GB), h100x8(184vCPU/1920GB/GPU 640GB), "
|
| 55 |
+
"zero-a10g(dynamic alloc)"
|
| 56 |
+
)
|
| 57 |
SPECIALIZED_FLAVORS = ["inf2x6"]
|
| 58 |
ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
|
| 59 |
|
|
|
|
| 755 |
"1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
|
| 756 |
"2. **Docker mode:** Provide 'image' + 'command' → full control\n"
|
| 757 |
"(script and command are mutually exclusive)\n\n"
|
| 758 |
+
"## Available Hardware (vCPU/RAM/GPU):\n"
|
| 759 |
+
f"CPU: {CPU_FLAVORS_DESC}\n"
|
| 760 |
+
f"GPU: {GPU_FLAVORS_DESC}\n"
|
| 761 |
"## Examples:\n\n"
|
| 762 |
"**Fine-tune LLM and push to Hub:**\n"
|
| 763 |
+
"{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-4B-Thinking-2507\")\\n# ... training code ...\\nmodel.push_to_hub(\"user-name/my-finetuned-model\")', 'dependencies': ['transformers', 'torch', 'datasets'], 'hardware_flavor': 'a10g-large', 'timeout': '4h', 'env': {'CUSTOM_VAR': 'value'}}\n\n"
|
| 764 |
"**Generate dataset daily and upload:**\n"
|
| 765 |
"{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
|
| 766 |
"**Run custom training with Docker:**\n"
|
|
|
|
| 821 |
# Hardware and environment
|
| 822 |
"hardware_flavor": {
|
| 823 |
"type": "string",
|
| 824 |
+
"description": f"Hardware type. Available CPU flavors: {CPU_FLAVORS}. Available GPU flavors: {GPU_FLAVORS}. Use with 'run'/'scheduled run'.",
|
| 825 |
},
|
| 826 |
"timeout": {
|
| 827 |
"type": "string",
|
agent/tools/utilities.py
CHANGED
|
@@ -2,8 +2,10 @@
|
|
| 2 |
Utility functions for Hugging Face tools
|
| 3 |
|
| 4 |
Ported from: hf-mcp-server/packages/mcp/src/jobs/formatters.ts
|
|
|
|
| 5 |
"""
|
| 6 |
|
|
|
|
| 7 |
from datetime import datetime
|
| 8 |
from typing import Any, Dict, List, Optional
|
| 9 |
|
|
@@ -126,7 +128,6 @@ def format_scheduled_jobs_table(jobs: List[Dict[str, Any]]) -> str:
|
|
| 126 |
|
| 127 |
def format_job_details(jobs: Any) -> str:
|
| 128 |
"""Format job details as JSON in a markdown code block"""
|
| 129 |
-
import json
|
| 130 |
|
| 131 |
job_array = jobs if isinstance(jobs, list) else [jobs]
|
| 132 |
json_str = json.dumps(job_array, indent=2)
|
|
@@ -135,7 +136,6 @@ def format_job_details(jobs: Any) -> str:
|
|
| 135 |
|
| 136 |
def format_scheduled_job_details(jobs: Any) -> str:
|
| 137 |
"""Format scheduled job details as JSON in a markdown code block"""
|
| 138 |
-
import json
|
| 139 |
|
| 140 |
job_array = jobs if isinstance(jobs, list) else [jobs]
|
| 141 |
json_str = json.dumps(job_array, indent=2)
|
|
|
|
| 2 |
Utility functions for Hugging Face tools
|
| 3 |
|
| 4 |
Ported from: hf-mcp-server/packages/mcp/src/jobs/formatters.ts
|
| 5 |
+
Includes GPU memory validation for job submissions
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
import json
|
| 9 |
from datetime import datetime
|
| 10 |
from typing import Any, Dict, List, Optional
|
| 11 |
|
|
|
|
| 128 |
|
| 129 |
def format_job_details(jobs: Any) -> str:
|
| 130 |
"""Format job details as JSON in a markdown code block"""
|
|
|
|
| 131 |
|
| 132 |
job_array = jobs if isinstance(jobs, list) else [jobs]
|
| 133 |
json_str = json.dumps(job_array, indent=2)
|
|
|
|
| 136 |
|
| 137 |
def format_scheduled_job_details(jobs: Any) -> str:
|
| 138 |
"""Format scheduled job details as JSON in a markdown code block"""
|
|
|
|
| 139 |
|
| 140 |
job_array = jobs if isinstance(jobs, list) else [jobs]
|
| 141 |
json_str = json.dumps(job_array, indent=2)
|
agent/tools/utils_tools.py
CHANGED
|
@@ -4,14 +4,9 @@ Utils Tools - General utility operations
|
|
| 4 |
Provides system information like current date/time with timezone support.
|
| 5 |
"""
|
| 6 |
|
| 7 |
-
import
|
| 8 |
from datetime import datetime
|
| 9 |
-
from typing import Any, Dict, Literal
|
| 10 |
-
|
| 11 |
-
try:
|
| 12 |
-
import zoneinfo
|
| 13 |
-
except ImportError:
|
| 14 |
-
from backports import zoneinfo
|
| 15 |
|
| 16 |
from agent.tools.types import ToolResult
|
| 17 |
|
|
@@ -123,7 +118,9 @@ Common timezones: Europe/Paris, America/New_York, America/Los_Angeles, Asia/Toky
|
|
| 123 |
date_str = now.strftime("%d-%m-%Y")
|
| 124 |
|
| 125 |
# Format time as HH:MM:SS.mmm
|
| 126 |
-
time_str = now.strftime("%H:%M:%S.%f")[
|
|
|
|
|
|
|
| 127 |
|
| 128 |
# Get timezone abbreviation/offset
|
| 129 |
tz_offset = now.strftime("%z")
|
|
|
|
| 4 |
Provides system information like current date/time with timezone support.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
import zoneinfo
|
| 8 |
from datetime import datetime
|
| 9 |
+
from typing import Any, Dict, Literal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
from agent.tools.types import ToolResult
|
| 12 |
|
|
|
|
| 118 |
date_str = now.strftime("%d-%m-%Y")
|
| 119 |
|
| 120 |
# Format time as HH:MM:SS.mmm
|
| 121 |
+
time_str = now.strftime("%H:%M:%S.%f")[
|
| 122 |
+
:-3
|
| 123 |
+
] # Remove last 3 digits to keep only milliseconds
|
| 124 |
|
| 125 |
# Get timezone abbreviation/offset
|
| 126 |
tz_offset = now.strftime("%z")
|
pyproject.toml
CHANGED
|
@@ -23,4 +23,8 @@ dependencies = [
|
|
| 23 |
"prompt-toolkit>=3.0.0",
|
| 24 |
"ipykernel>=7.1.0",
|
| 25 |
"ipywidgets>=8.1.8",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
]
|
|
|
|
| 23 |
"prompt-toolkit>=3.0.0",
|
| 24 |
"ipykernel>=7.1.0",
|
| 25 |
"ipywidgets>=8.1.8",
|
| 26 |
+
"thefuzz>=0.22.1",
|
| 27 |
+
"nbconvert>=7.16.6",
|
| 28 |
+
"nbformat>=5.10.4",
|
| 29 |
+
"markitdown[all,docx,outlook,pdf,pptx,xls,xlsx]>=0.1.4",
|
| 30 |
]
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|