Aksel Joonas Reedi committed on
Commit
54196ee
·
2 Parent(s): 2e66e57ca6c6c4

Improved search capabilities with GitHub tools and system prompt updates

agent/context_manager/manager.py CHANGED
@@ -2,6 +2,8 @@
2
  Context management for conversation history
3
  """
4
 
 
 
5
  from pathlib import Path
6
  from typing import Any
7
 
@@ -42,10 +44,20 @@ class ContextManager:
42
  prompt_data = yaml.safe_load(f)
43
  template_str = prompt_data.get("system_prompt", "")
44
 
45
  template = Template(template_str)
46
  return template.render(
47
  tools=tool_specs,
48
  num_tools=len(tool_specs),
49
  )
50
 
51
  def add_message(self, message: Message, token_count: int = None) -> None:
 
2
  Context management for conversation history
3
  """
4
 
5
+ import zoneinfo
6
+ from datetime import datetime
7
  from pathlib import Path
8
  from typing import Any
9
 
 
44
  prompt_data = yaml.safe_load(f)
45
  template_str = prompt_data.get("system_prompt", "")
46
 
47
+ # Get current date and time
48
+ tz = zoneinfo.ZoneInfo("Europe/Paris")
49
+ now = datetime.now(tz)
50
+ current_date = now.strftime("%d-%m-%Y")
51
+ current_time = now.strftime("%H:%M:%S.%f")[:-3]
52
+ current_timezone = f"{now.strftime('%Z')} (UTC{now.strftime('%z')[:3]}:{now.strftime('%z')[3:]})"
53
+
54
  template = Template(template_str)
55
  return template.render(
56
  tools=tool_specs,
57
  num_tools=len(tool_specs),
58
+ current_date=current_date,
59
+ current_time=current_time,
60
+ current_timezone=current_timezone,
61
  )
62
 
63
  def add_message(self, message: Message, token_count: int = None) -> None:
agent/core/agent_loop.py CHANGED
@@ -25,9 +25,15 @@ def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
25
  args = tool_args.get("args", {})
26
  # Sometimes LLM passes args as string instead of dict
27
  if isinstance(args, str):
28
- return False, f"Tool call error: 'args' must be a JSON object, not a string. You passed: {repr(args)}"
29
  if not isinstance(args, dict) and args is not None:
30
- return False, f"Tool call error: 'args' must be a JSON object. You passed type: {type(args).__name__}"
31
  return True, None
32
 
33
 
@@ -38,8 +44,6 @@ def _needs_approval(tool_name: str, tool_args: dict) -> bool:
38
  if not args_valid:
39
  return False
40
 
41
- args = tool_args.get("args", {})
42
-
43
  if tool_name == "hf_jobs":
44
  # Check if it's a run or uv operation
45
  operation = tool_args.get("operation", "")
 
25
  args = tool_args.get("args", {})
26
  # Sometimes LLM passes args as string instead of dict
27
  if isinstance(args, str):
28
+ return (
29
+ False,
30
+ f"Tool call error: 'args' must be a JSON object, not a string. You passed: {repr(args)}",
31
+ )
32
  if not isinstance(args, dict) and args is not None:
33
+ return (
34
+ False,
35
+ f"Tool call error: 'args' must be a JSON object. You passed type: {type(args).__name__}",
36
+ )
37
  return True, None
38
 
39
 
 
44
  if not args_valid:
45
  return False
46
 
 
 
47
  if tool_name == "hf_jobs":
48
  # Check if it's a run or uv operation
49
  operation = tool_args.get("operation", "")
agent/core/tools.py CHANGED
@@ -19,13 +19,27 @@ from agent.tools.docs_tools import (
19
  explore_hf_docs_handler,
20
  hf_docs_fetch_handler,
21
  )
22
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
23
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
24
  from agent.tools.private_hf_repo_tools import (
25
  PRIVATE_HF_REPO_TOOL_SPEC,
26
  private_hf_repo_handler,
27
  )
28
- from agent.tools.utils_tools import UTILS_TOOL_SPEC, utils_handler
 
 
29
 
30
  # Suppress aiohttp deprecation warning
31
  warnings.filterwarnings(
@@ -224,7 +238,7 @@ class ToolRouter:
224
  def create_builtin_tools() -> list[ToolSpec]:
225
  """Create built-in tool specifications"""
226
  print(
227
- f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {UTILS_TOOL_SPEC['name']}"
228
  )
229
  # in order of importance
230
  return [
@@ -260,10 +274,37 @@ def create_builtin_tools() -> list[ToolSpec]:
260
  parameters=PRIVATE_HF_REPO_TOOL_SPEC["parameters"],
261
  handler=private_hf_repo_handler,
262
  ),
263
  ToolSpec(
264
- name=UTILS_TOOL_SPEC["name"],
265
- description=UTILS_TOOL_SPEC["description"],
266
- parameters=UTILS_TOOL_SPEC["parameters"],
267
- handler=utils_handler,
268
  ),
269
  ]
 
19
  explore_hf_docs_handler,
20
  hf_docs_fetch_handler,
21
  )
22
+ from agent.tools.github_find_examples import (
23
+ GITHUB_FIND_EXAMPLES_TOOL_SPEC,
24
+ github_find_examples_handler,
25
+ )
26
+ from agent.tools.github_list_repos import (
27
+ GITHUB_LIST_REPOS_TOOL_SPEC,
28
+ github_list_repos_handler,
29
+ )
30
+ from agent.tools.github_read_file import (
31
+ GITHUB_READ_FILE_TOOL_SPEC,
32
+ github_read_file_handler,
33
+ )
34
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
35
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
36
  from agent.tools.private_hf_repo_tools import (
37
  PRIVATE_HF_REPO_TOOL_SPEC,
38
  private_hf_repo_handler,
39
  )
40
+
41
+ # NOTE: Utils tool disabled - date/time now loaded into system prompt at initialization
42
+ # from agent.tools.utils_tools import UTILS_TOOL_SPEC, utils_handler
43
 
44
  # Suppress aiohttp deprecation warning
45
  warnings.filterwarnings(
 
238
  def create_builtin_tools() -> list[ToolSpec]:
239
  """Create built-in tool specifications"""
240
  print(
241
+ f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {GITHUB_FIND_EXAMPLES_TOOL_SPEC['name']}, {GITHUB_LIST_REPOS_TOOL_SPEC['name']}, {GITHUB_READ_FILE_TOOL_SPEC['name']}"
242
  )
243
  # in order of importance
244
  return [
 
274
  parameters=PRIVATE_HF_REPO_TOOL_SPEC["parameters"],
275
  handler=private_hf_repo_handler,
276
  ),
277
+ # NOTE: Utils tool disabled - date/time now loaded into system prompt at initialization (fewer tool calls = more reliability)
278
+ # ToolSpec(
279
+ # name=UTILS_TOOL_SPEC["name"],
280
+ # description=UTILS_TOOL_SPEC["description"],
281
+ # parameters=UTILS_TOOL_SPEC["parameters"],
282
+ # handler=utils_handler,
283
+ # ),
284
+ # GitHub tools
285
+ # NOTE: GitHub search-code tool disabled - a bit buggy
286
+ # ToolSpec(
287
+ # name=GITHUB_SEARCH_CODE_TOOL_SPEC["name"],
288
+ # description=GITHUB_SEARCH_CODE_TOOL_SPEC["description"],
289
+ # parameters=GITHUB_SEARCH_CODE_TOOL_SPEC["parameters"],
290
+ # handler=github_search_code_handler,
291
+ # ),
292
+ ToolSpec(
293
+ name=GITHUB_FIND_EXAMPLES_TOOL_SPEC["name"],
294
+ description=GITHUB_FIND_EXAMPLES_TOOL_SPEC["description"],
295
+ parameters=GITHUB_FIND_EXAMPLES_TOOL_SPEC["parameters"],
296
+ handler=github_find_examples_handler,
297
+ ),
298
+ ToolSpec(
299
+ name=GITHUB_LIST_REPOS_TOOL_SPEC["name"],
300
+ description=GITHUB_LIST_REPOS_TOOL_SPEC["description"],
301
+ parameters=GITHUB_LIST_REPOS_TOOL_SPEC["parameters"],
302
+ handler=github_list_repos_handler,
303
+ ),
304
  ToolSpec(
305
+ name=GITHUB_READ_FILE_TOOL_SPEC["name"],
306
+ description=GITHUB_READ_FILE_TOOL_SPEC["description"],
307
+ parameters=GITHUB_READ_FILE_TOOL_SPEC["parameters"],
308
+ handler=github_read_file_handler,
309
  ),
310
  ]
agent/main.py CHANGED
@@ -222,11 +222,15 @@ async def event_listener(
222
 
223
  # Build repo URL
224
  type_path = "" if repo_type == "model" else f"{repo_type}s"
225
- repo_url = f"https://huggingface.co/{type_path}/{repo_id}".replace("//", "/")
226
 
227
  print(f"Repository: {repo_id}")
228
  print(f"Type: {repo_type}")
229
- print(f"Private: Yes")
230
  print(f"URL: {repo_url}")
231
 
232
  # Show file preview for upload_file operation
@@ -237,9 +241,9 @@ async def event_listener(
237
 
238
  if isinstance(file_content, str):
239
  # Calculate metrics
240
- all_lines = file_content.split('\n')
241
  line_count = len(all_lines)
242
- size_bytes = len(file_content.encode('utf-8'))
243
  size_kb = size_bytes / 1024
244
  size_mb = size_kb / 1024
245
 
@@ -251,8 +255,10 @@ async def event_listener(
251
 
252
  # Show preview
253
  preview_lines = all_lines[:5]
254
- preview = '\n'.join(preview_lines)
255
- print(f"Content preview (first 5 lines):\n{preview}")
 
 
256
  if len(all_lines) > 5:
257
  print("...")
258
 
 
222
 
223
  # Build repo URL
224
  type_path = "" if repo_type == "model" else f"{repo_type}s"
225
+ repo_url = (
226
+ f"https://huggingface.co/{type_path}/{repo_id}".replace(
227
+ "//", "/"
228
+ )
229
+ )
230
 
231
  print(f"Repository: {repo_id}")
232
  print(f"Type: {repo_type}")
233
+ print("Private: Yes")
234
  print(f"URL: {repo_url}")
235
 
236
  # Show file preview for upload_file operation
 
241
 
242
  if isinstance(file_content, str):
243
  # Calculate metrics
244
+ all_lines = file_content.split("\n")
245
  line_count = len(all_lines)
246
+ size_bytes = len(file_content.encode("utf-8"))
247
  size_kb = size_bytes / 1024
248
  size_mb = size_kb / 1024
249
 
 
255
 
256
  # Show preview
257
  preview_lines = all_lines[:5]
258
+ preview = "\n".join(preview_lines)
259
+ print(
260
+ f"Content preview (first 5 lines):\n{preview}"
261
+ )
262
  if len(all_lines) > 5:
263
  print("...")
264
 
agent/prompts/system_prompt.yaml CHANGED
@@ -1,63 +1,57 @@
1
  system_prompt: |
2
  You are HF Agent, a powerful AI assistant for Machine Learning Engineering, particularly training Large Language Models. You have access to {{ num_tools }} tools for interacting with Hugging Face Hub and performing ML tasks.
3
-
 
 
4
  # Task Approach
5
 
6
- **CRITICAL: Research First, Then Implement**
7
 
8
  For ANY implementation task (training, fine-tuning, inference, data processing, etc.):
9
  1. **FIRST**: Search HF documentation to find the recommended approach
10
  - This is MANDATORY before writing any code or making implementation decisions
11
  - Use `explore_hf_docs` to discover documentation structure for relevant libraries (e.g., "trl", "transformers", "diffusers")
 
12
  - Use `fetch_hf_docs` to retrieve full content from specific documentation pages
13
- - Use `search_hf_api_endpoints` to find API endpoints with usage examples
14
  - Research what libraries to use, find code examples, understand best practices
15
- - Skip ONLY for simple factual questions (e.g., "What is LoRA?")
16
 
17
- 2. **THEN**: Formulate a plan based on research findings. Pass todos to the PlanTool. Update as progress is made.
18
 
19
  3. **FINALLY**: Implement using researched approaches
20
  - Search for relevant models/datasets on HF Hub
 
21
  - Use all available tools to complete the task
22
- - Leverage existing resources before creating new ones
23
- - Invoke multiple independent tools simultaneously for efficiency
24
 
25
  # Autonomy / Subordinate trade-off.
26
 
27
  Your main goal is to achieve what the user asked. For this:
28
- 1. Take action, follow-up, launch jobs. Ask for as little action from the user as possible. Do not ask them to do things you could do via a script.
29
 
30
  However !! :
31
  1. Don't surprise the user with costly, irreversible, or strange actions without asking.
32
- 2. Don't be shy to ask questions if needed.
33
  3. Don't be overly talkative, explaining everything after a task ended.
34
 
35
- # Available Tools
36
-
37
- You have access to the following categories of tools:
38
-
39
- - Hugging Face Hub: Search and interact with models, datasets, papers, and documentation
40
- - Spaces: Use and discover ML applications
41
- - Jobs: Manage compute jobs for training and inference
42
- - Image Generation: Generate and transform images
43
- - Planning : a planning/to-do tool.
44
 
45
  # Conventions
46
 
47
  - **ALWAYS search documentation BEFORE implementing** any ML workflow (training, inference, data processing, etc.) - This is non-negotiable
48
- - Use `explore_hf_docs`, `fetch_hf_docs`, and `search_hf_api_endpoints` to research the correct approach
49
- - Never assume you know the correct library, method, or approach - you must verify with documentation first
50
  - Base your implementation on researched best practices, not general knowledge or assumptions
51
  - Always search Hugging Face Hub for existing resources before suggesting custom implementations
52
  - Keep in mind that a Space is a repo, so you can create a Space directly by uploading files that way. Repos should also be used to store files permanently: post-execution, files from jobs are not available.
53
  - To run jobs, you must always pass the whole content of the file to execute. No files are available on the server. Your local files and remote files are entirely separate scopes.
54
  - The HF_TOKEN is automatically loaded from the environment variables.
55
- -
56
  - When referencing models, datasets, or papers, include direct links from search results
57
- - Before processing any dataset: inspect its actual structure first using the mcp__hf-mcp-server__hub_repo_details tool. Never assume column names: verify them beforehand.
58
- - Follow ML best practices: proper train/val/test splits, reproducibility, evaluation metrics
59
  - Unless absolutely necessary, don't ask user for action. This does not apply to follow-up questions you have.
60
- - For training tasks, consider compute requirements and choose appropriate hardware.
61
  - Never expose or log API keys, tokens, or secrets. Do not assume keys or secrets are available. Only Hugging Face private resources are available.
62
 
63
  # Communication Style
 
1
  system_prompt: |
2
  You are HF Agent, a powerful AI assistant for Machine Learning Engineering, particularly training Large Language Models. You have access to {{ num_tools }} tools for interacting with Hugging Face Hub and performing ML tasks.
3
+
4
+ _Current Time: **{{ current_date }} {{ current_time }} ({{ current_timezone }})**_
5
+
6
  # Task Approach
7
 
8
+ **CRITICAL: You always research first, then implement. You only write implementations guided by examples, best practices, or documentation.**
9
 
10
  For ANY implementation task (training, fine-tuning, inference, data processing, etc.):
11
  1. **FIRST**: Search HF documentation to find the recommended approach
12
  - This is MANDATORY before writing any code or making implementation decisions
13
  - Use `explore_hf_docs` to discover documentation structure for relevant libraries (e.g., "trl", "transformers", "diffusers")
14
+ - Use `github_find_examples` and `github_read_file` to discover best practices and reusable code for these libraries.
15
  - Use `fetch_hf_docs` to retrieve full content from specific documentation pages
16
+ - Use `search_hf_api_endpoints` to find API endpoints (e.g., spaces, models, datasets, discussions, users, orgs, papers) with usage and curl examples.
17
  - Research what libraries to use, find code examples, understand best practices
18
+ - Skip ONLY for simple factual questions (e.g., "What is LoRA?").
19
 
20
+ 2. **THEN**: Formulate a plan based on research findings. Pass todos to the `plan_tool`. Update as progress is made.
21
 
22
  3. **FINALLY**: Implement using researched approaches
23
  - Search for relevant models/datasets on HF Hub
24
+ - Always validate data structure and format before using it (libraries need specific formats, see documentation).
25
  - Use all available tools to complete the task
26
+ - Always leverage existing implementations and resources before creating new ones
27
+ - Use multiple independent tools concurrently for efficiency
28
 
29
  # Autonomy / Subordinate trade-off.
30
 
31
  Your main goal is to achieve what the user asked. For this:
32
+ 1. Research, then take action, follow up, launch jobs. Ask for as little action from the user as possible. Do not ask them to do things you could do via a script or tool.
33
 
34
  However:
35
  1. Don't surprise the user with costly, irreversible, or strange actions without asking.
36
+ 2. Don't be shy to ask clarifying questions if needed.
37
  3. Don't be overly talkative, explaining everything after a task ended.
38
 
39
 
40
  # Conventions
41
 
42
  - **ALWAYS search documentation BEFORE implementing** any ML workflow (training, inference, data processing, etc.) - This is non-negotiable
43
+ - Use `explore_hf_docs`, `github_find_examples`, `fetch_hf_docs`, and `search_hf_api_endpoints` to research the correct approach
44
+ - Never assume you know the correct library, method, or approach - you must verify with documentation first. Documentation is the ultimate source of truth.
45
  - Base your implementation on researched best practices, not general knowledge or assumptions
46
  - Always search Hugging Face Hub for existing resources before suggesting custom implementations
47
  - Keep in mind that a Space is a repo, so you can create a Space directly by uploading files that way. Repos should also be used to store files permanently: post-execution, files from jobs are not available.
48
  - To run jobs, you must always pass the whole content of the file to execute. No files are available on the server. Your local files and remote files are entirely separate scopes.
49
  - The HF_TOKEN is automatically loaded from the environment variables.
 
50
  - When referencing models, datasets, or papers, include direct links from search results
51
+ - Before processing any dataset: inspect its actual structure first using the `hub_repo_details` tool. Never assume column names, row structure, or format: verify them beforehand.
52
+ - Follow ML best practices: proper train/val/test splits, reproducibility, evaluation metrics, pushing to the Hub.
53
  - Unless absolutely necessary, don't ask user for action. This does not apply to follow-up questions you have.
54
+ - For training tasks, consider compute requirements and choose appropriate hardware based on this formula: approx_VRAM_needed = N_params × bytes_per_param × 1.5.
55
  - Never expose or log API keys, tokens, or secrets. Do not assume keys or secrets are available. Only Hugging Face private resources are available.
56
 
57
  # Communication Style
agent/tools/__init__.py CHANGED
@@ -2,6 +2,22 @@
2
  Hugging Face tools for the agent
3
  """
4
 
5
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
6
  from agent.tools.types import ToolResult
7
 
@@ -10,4 +26,12 @@ __all__ = [
10
  "HF_JOBS_TOOL_SPEC",
11
  "hf_jobs_handler",
12
  "HfJobsTool",
  ]
 
2
  Hugging Face tools for the agent
3
  """
4
 
5
+ from agent.tools.github_find_examples import (
6
+ GITHUB_FIND_EXAMPLES_TOOL_SPEC,
7
+ github_find_examples_handler,
8
+ )
9
+ from agent.tools.github_list_repos import (
10
+ GITHUB_LIST_REPOS_TOOL_SPEC,
11
+ github_list_repos_handler,
12
+ )
13
+ from agent.tools.github_read_file import (
14
+ GITHUB_READ_FILE_TOOL_SPEC,
15
+ github_read_file_handler,
16
+ )
17
+ from agent.tools.github_search_code import (
18
+ GITHUB_SEARCH_CODE_TOOL_SPEC,
19
+ github_search_code_handler,
20
+ )
21
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
22
  from agent.tools.types import ToolResult
23
 
 
26
  "HF_JOBS_TOOL_SPEC",
27
  "hf_jobs_handler",
28
  "HfJobsTool",
29
+ "GITHUB_FIND_EXAMPLES_TOOL_SPEC",
30
+ "github_find_examples_handler",
31
+ "GITHUB_LIST_REPOS_TOOL_SPEC",
32
+ "github_list_repos_handler",
33
+ "GITHUB_READ_FILE_TOOL_SPEC",
34
+ "github_read_file_handler",
35
+ "GITHUB_SEARCH_CODE_TOOL_SPEC",
36
+ "github_search_code_handler",
37
  ]
agent/tools/github_find_examples.py ADDED
@@ -0,0 +1,491 @@
1
+ """
2
+ GitHub Find Examples Tool - Discover examples, tutorials, and guides for any library
3
+
4
+ Lists all files in a repository and performs deterministic keyword search.
5
+ """
6
+
7
+ import os
8
+ from typing import Any, Dict, List
9
+
10
+ import requests
11
+ from thefuzz import fuzz
12
+
13
+ from agent.tools.types import ToolResult
14
+
15
+ # In order of priority (lower index = higher priority for sorting)
16
+ EXAMPLE_PATTERNS = [
17
+ "scripts",
18
+ # General example patterns (catch-all, lower priority)
19
+ "examples",
20
+ "example",
21
+ # Notebook patterns
22
+ "notebooks",
23
+ "notebook",
24
+ # Tutorial/learning patterns
25
+ "tutorials",
26
+ "tutorial",
27
+ "quickstart",
28
+ "walkthroughs",
29
+ "walkthrough",
30
+ # Cookbook/recipe patterns
31
+ "cookbook",
32
+ "cookbooks",
33
+ "recipes",
34
+ "recipe",
35
+ # Demo/sample patterns
36
+ "demos",
37
+ "demo",
38
+ "samples",
39
+ "sample",
40
+ # Other patterns
41
+ "guides",
42
+ "guide",
43
+ "getting-started",
44
+ "getting_started",
45
+ "playground",
46
+ "howto",
47
+ "how-to",
48
+ "use-cases",
49
+ "usecases",
50
+ "use_cases",
51
+ "sandbox",
52
+ "showcase",
53
+ ]
54
+
55
+
56
+ def _get_repo_tree(org: str, repo: str, token: str) -> tuple[List[Dict[str, Any]], str]:
57
+ """Get all files in a repository recursively. Returns (files, error_message)"""
58
+ headers = {
59
+ "Accept": "application/vnd.github+json",
60
+ "X-GitHub-Api-Version": "2022-11-28",
61
+ "Authorization": f"Bearer {token}",
62
+ }
63
+
64
+ full_repo = f"{org}/{repo}"
65
+
66
+ # Get default branch
67
+ try:
68
+ response = requests.get(
69
+ f"https://api.github.com/repos/{full_repo}", headers=headers, timeout=10
70
+ )
71
+ if response.status_code == 404:
72
+ return [], "not_found"
73
+ if response.status_code != 200:
74
+ return [], f"API error: {response.status_code}"
75
+
76
+ repo_data = response.json()
77
+ default_branch = repo_data.get("default_branch", "main")
78
+ except Exception as e:
79
+ return [], f"Error fetching repo: {str(e)}"
80
+
81
+ # Get repository tree recursively
82
+ try:
83
+ response = requests.get(
84
+ f"https://api.github.com/repos/{full_repo}/git/trees/{default_branch}",
85
+ headers=headers,
86
+ params={"recursive": "1"},
87
+ timeout=30,
88
+ )
89
+ if response.status_code != 200:
90
+ return [], f"Error fetching tree: {response.status_code}"
91
+
92
+ data = response.json()
93
+ tree = data.get("tree", [])
94
+
95
+ # Filter to only include files (not directories)
96
+ files = [
97
+ {
98
+ "path": item["path"],
99
+ "ref": item["sha"],
100
+ "size": item.get("size", 0),
101
+ "url": f"https://github.com/{full_repo}/blob/{default_branch}/{item['path']}",
102
+ }
103
+ for item in tree
104
+ if item["type"] == "blob"
105
+ ]
106
+
107
+ return files, ""
108
+ except Exception as e:
109
+ return [], f"Error processing tree: {str(e)}"
110
+
111
+
112
+ def _search_similar_repos(org: str, repo: str, token: str) -> List[Dict[str, Any]]:
113
+ """Search for similar repository names in the organization"""
114
+ headers = {
115
+ "Accept": "application/vnd.github+json",
116
+ "X-GitHub-Api-Version": "2022-11-28",
117
+ "Authorization": f"Bearer {token}",
118
+ }
119
+
120
+ # Search for repos in the org with similar name
121
+ query = f"org:{org} {repo}"
122
+
123
+ try:
124
+ response = requests.get(
125
+ "https://api.github.com/search/repositories",
126
+ headers=headers,
127
+ params={"q": query, "sort": "stars", "order": "desc", "per_page": 10},
128
+ timeout=30,
129
+ )
130
+
131
+ if response.status_code != 200:
132
+ return []
133
+
134
+ data = response.json()
135
+ items = data.get("items", [])
136
+
137
+ return [
138
+ {
139
+ "name": item.get("name"),
140
+ "full_name": item.get("full_name"),
141
+ "description": item.get("description"),
142
+ "stars": item.get("stargazers_count", 0),
143
+ "url": item.get("html_url"),
144
+ }
145
+ for item in items
146
+ ]
147
+ except Exception:
148
+ return []
149
+
150
+
151
+ def _score_against_example_patterns(file_path: str) -> int:
152
+ """Score file against example patterns using token_set_ratio"""
153
+ scores = []
154
+ for pattern in EXAMPLE_PATTERNS:
155
+ score = fuzz.token_set_ratio(pattern.lower(), file_path.lower())
156
+ scores.append(score)
157
+ return max(scores) if scores else 0
158
+
159
+
160
+ def _score_against_keyword(file_path: str, keyword: str) -> int:
161
+ """Calculate fuzzy match score for a file path against a keyword"""
162
+ # Use partial_ratio for substring matching (good for paths)
163
+ # Also check token_set_ratio for word-level matching
164
+ partial_score = fuzz.partial_ratio(keyword.lower(), file_path.lower())
165
+ token_score = fuzz.token_set_ratio(keyword.lower(), file_path.lower())
166
+
167
+ # Return the higher of the two
168
+ return max(partial_score, token_score)
169
+
170
+
171
+ def _get_pattern_priority(file_path: str) -> tuple[int, int, int]:
172
+ """
173
+ Get priority of a file path based on which example pattern directory it's in.
174
+
175
+ Returns: (in_examples_dir, pattern_priority, path_depth)
176
+ - in_examples_dir: 0 if in examples/ directory, 1 otherwise (lower is better)
177
+ - pattern_priority: Index in EXAMPLE_PATTERNS (lower is better), or 999 if no match
178
+ - path_depth: Number of path segments (lower is better)
179
+
180
+ Note: Prioritizes files in "examples/" directory first, then by most specific pattern match.
181
+ E.g., "examples/scripts/train.py" is better than "scripts/util.py"
182
+ """
183
+ path_lower = file_path.lower()
184
+ path_parts = path_lower.split("/")
185
+
186
+ # Check if file is in examples/ directory (highest priority)
187
+ in_examples_dir = 0 if (path_parts[0] in ["examples", "example"]) else 1
188
+
189
+ # Find ALL matching patterns and use the best (lowest index) one
190
+ # But prefer deeper matches (more specific) over shallow ones
191
+ best_priority = 999
192
+ best_depth_at_match = -1
193
+
194
+ for i, pattern in enumerate(EXAMPLE_PATTERNS):
195
+ # Check if pattern appears as a directory component in the path
196
+ if pattern in path_parts:
197
+ # Find the depth where this pattern appears (rightmost occurrence)
198
+ depth = len(path_parts) - 1 - path_parts[::-1].index(pattern)
199
+
200
+ # Prefer deeper matches, or better priority if at same depth
201
+ if depth > best_depth_at_match or (
202
+ depth == best_depth_at_match and i < best_priority
203
+ ):
204
+ best_priority = i
205
+ best_depth_at_match = depth
206
+
207
+ return (in_examples_dir, best_priority, len(path_parts))
208
+
209
+
210
+ def _handle_repo_tree_errors(
211
+ all_files: List[Dict[str, Any]],
212
+ error: str,
213
+ org: str,
214
+ repo: str,
215
+ token: str,
216
+ ) -> ToolResult | None:
217
+ """Handle errors from repo tree fetch. Returns ToolResult if error, None if OK."""
218
+ if error == "not_found":
219
+ similar_repos = _search_similar_repos(org, repo, token)
220
+
221
+ if not similar_repos:
222
+ return {
223
+ "formatted": f"Repository '{org}/{repo}' not found and no similar repositories found.",
224
+ "totalResults": 0,
225
+ "resultsShared": 0,
226
+ "isError": True,
227
+ }
228
+
229
+ # Format similar repos
230
+ lines = [f"**Repository '{org}/{repo}' not found. Similar repositories:**\n"]
231
+ for i, r in enumerate(similar_repos, 1):
232
+ lines.append(f"{i}. **{r['full_name']}** (⭐ {r['stars']:,} stars)")
233
+ if r["description"]:
234
+ desc = (
235
+ r["description"][:100] + "..."
236
+ if len(r["description"]) > 100
237
+ else r["description"]
238
+ )
239
+ lines.append(f" {desc}")
240
+ lines.append(f" {r['url']}\n")
241
+
242
+ return {
243
+ "formatted": "\n".join(lines),
244
+ "totalResults": len(similar_repos),
245
+ "resultsShared": len(similar_repos),
246
+ "isError": True,
247
+ }
248
+
249
+ if error:
250
+ return {
251
+ "formatted": f"Error accessing repository '{org}/{repo}': {error}",
252
+ "totalResults": 0,
253
+ "resultsShared": 0,
254
+ "isError": True,
255
+ }
256
+
257
+ if not all_files:
258
+ return {
259
+ "formatted": f"No files found in repository '{org}/{repo}'",
260
+ "totalResults": 0,
261
+ "resultsShared": 0,
262
+ }
263
+
264
+ return None
265
+
266
+
267
+ def find_examples(
268
+ keyword: str = "",
269
+ repo: str = "",
270
+ org: str = "huggingface",
271
+ max_results: int = 50,
272
+ min_score: int = 60,
273
+ ) -> ToolResult:
274
+ """
275
+ Find example files in a repository using fuzzy matching.
276
+
277
+ Args:
278
+ keyword: Keyword to fuzzy match against file paths (e.g., "grpo")
279
+ repo: Repository name (e.g., "trl")
280
+ org: GitHub organization (default: "huggingface")
281
+ max_results: Maximum number of results (default 50)
282
+ min_score: Minimum fuzzy match score (0-100, default 60)
283
+
284
+ Returns:
285
+ ToolResult with matching files, or similar repos if repo not found
286
+ """
287
+ token = os.environ.get("GITHUB_TOKEN")
288
+ if not token:
289
+ return {
290
+ "formatted": "Error: GITHUB_TOKEN environment variable is required",
291
+ "totalResults": 0,
292
+ "resultsShared": 0,
293
+ "isError": True,
294
+ }
295
+
296
+ if not repo:
297
+ return {
298
+ "formatted": "Error: repo parameter is required",
299
+ "totalResults": 0,
300
+ "resultsShared": 0,
301
+ "isError": True,
302
+ }
303
+
304
+ # Get all files in the repository
305
+ all_files, error = _get_repo_tree(org, repo, token)
306
+
307
+ # Handle errors (not found, API errors, empty repo)
308
+ if error_result := _handle_repo_tree_errors(all_files, error, org, repo, token):
309
+ return error_result
310
+
311
+ # Step 1: Filter files by example patterns (score >= 60)
312
+ example_threshold = 60
313
+ example_files = []
314
+ for file in all_files:
315
+ example_score = _score_against_example_patterns(file["path"])
316
+ if example_score >= example_threshold:
317
+ example_files.append({**file, "example_score": example_score})
318
+
319
+ if not example_files:
320
+ return {
321
+ "formatted": f"No example files found in {org}/{repo} (no files match example patterns with score >= {example_threshold}).",
322
+ "totalResults": 0,
323
+ "resultsShared": 0,
324
+ }
325
+
326
+ # Step 2: If keyword provided, score and filter by keyword
327
+ if keyword:
328
+ scored_files = []
329
+ for file in example_files:
330
+ keyword_score = _score_against_keyword(file["path"], keyword)
331
+ if keyword_score >= min_score:
332
+ scored_files.append({**file, "score": keyword_score})
333
+
334
+ if not scored_files:
335
+ return {
336
+ "formatted": f"No files found in {org}/{repo} matching keyword '{keyword}' (min score: {min_score}) among {len(example_files)} example files.",
337
+ "totalResults": 0,
338
+ "resultsShared": 0,
339
+ }
340
+
341
+ # Sort by keyword score (descending) for best matches first
342
+ scored_files.sort(key=lambda x: x["score"], reverse=True)
343
+ else:
344
+ # No keyword: prioritize by pattern directory, then path depth
345
+ scored_files = []
346
+ for file in example_files:
347
+ in_examples_dir, pattern_priority, path_depth = _get_pattern_priority(
348
+ file["path"]
349
+ )
350
+ scored_files.append(
351
+ {
352
+ **file,
353
+ "score": file["example_score"],
354
+ "in_examples_dir": in_examples_dir,
355
+ "pattern_priority": pattern_priority,
356
+ "path_depth": path_depth,
357
+ }
358
+ )
359
+
360
+ if not scored_files:
361
+ return {
362
+ "formatted": f"No example files found in {org}/{repo}.",
363
+ "totalResults": 0,
364
+ "resultsShared": 0,
365
+ }
366
+
367
+ # Sort by: 1) files in examples/ dir first, 2) pattern priority (scripts > datasets > etc), 3) path depth, 4) path name
368
+ scored_files.sort(
369
+ key=lambda x: (
370
+ x["in_examples_dir"],
371
+ x["pattern_priority"],
372
+ x["path_depth"],
373
+ x["path"],
374
+ )
375
+ )
376
+
377
+ # Limit results
378
+ results = scored_files[:max_results]
379
+
380
+ # Format output
381
+ keyword_desc = f" matching '{keyword}'" if keyword else ""
382
+ lines = [f"**Found {len(results)} example files in {org}/{repo}{keyword_desc}:**"]
383
+ if len(scored_files) > max_results:
384
+ lines[0] += f" (showing {max_results} of {len(scored_files)})"
385
+ lines.append("")
386
+
387
+ for i, file in enumerate(results, 1):
388
+ lines.append(f"{i}. **{file['path']}**")
389
+ lines.append(f" Size: {file['size']:,} bytes | Ref: {file['ref'][:7]}")
390
+ lines.append(f" URL: {file['url']}")
391
+
392
+ # Copyable parameters for read_file tool
393
+ read_params = f"{{'repo': '{org}/{repo}', 'path': '{file['path']}'}}"
394
+ lines.append(f" To read, use: {read_params}")
395
+ lines.append("")
396
+
397
+ return {
398
+ "formatted": "\n".join(lines),
399
+ "totalResults": len(results),
400
+ "resultsShared": len(results),
401
+ }
402
+
403
+
404
+ # Tool specification
405
+ GITHUB_FIND_EXAMPLES_TOOL_SPEC = {
406
+ "name": "github_find_examples",
407
+ "description": (
408
+ "Discover best practices, reusable scripts, tutorials, and demos for usinga specific library or framework. This is an important step before implementing anything ML related.",
409
+ "Use together with github_read_file tool.\n\n"
410
+ "## When to use this tool\n\n"
411
+ "- ALWAYS before implementing any training/inference/benchmarking or other ML related code or answering how-to question.\n"
412
+ "- When exploring a new repository and need to understand how to use it\n"
413
+ "## How it works\n\n"
414
+ "1. Fetches all (examples, tutorials, demos, notebooks, scripts, etc.) from the repository\n"
415
+ "2. If keyword provided, scores found files against the keyword using fuzzy matching\n"
416
+ "3. Returns best matches sorted by relevance score\n"
417
+ "## Examples\n\n"
418
+ "<example>\n"
419
+ "// ML Workflow Step: Find GRPO/SFT/DPO/RLOO etc training examples\n"
420
+ "// Task: Starting GRPO fine-tuning project, need reference implementations\n"
421
+ "{\n"
422
+ " keyword: 'grpo',\n"
423
+ " repo: 'trl',\n"
424
+ " org: 'huggingface'\n"
425
+ "}\n"
426
+ "// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n"
427
+ "// Next step: Use github_read_file to study the implementation\n"
428
+ "</example>\n\n"
429
+ "<example>\n"
430
+ "// ML Workflow Step: Discover all training examples in TRL\n"
431
+ "// Task: Exploring available training methods before choosing approach\n"
432
+ "{\n"
433
+ " repo: 'trl',\n"
434
+ " org: 'huggingface',\n"
435
+ " max_results: 20\n"
436
+ "}\n"
437
+ "// Lists all example scripts: PPO, DPO, GRPO, reward modeling, etc.\n"
438
+ "</example>\n\n"
439
+ "<example>\n"
440
+ "// ML Workflow Step: Find LoRA fine-tuning examples\n"
441
+ "// Task: Learning parameter-efficient fine-tuning with PEFT\n"
442
+ "{\n"
443
+ " keyword: 'lora',\n"
444
+ " repo: 'peft',\n"
445
+ " org: 'huggingface'\n"
446
+ "}\n"
447
+ "// Discovers LoRA configuration and training examples\n"
448
+ "</example>",
449
+ ),
450
+ "parameters": {
451
+ "type": "object",
452
+ "properties": {
453
+ "keyword": {
454
+ "type": "string",
455
+ "description": "Keyword to fuzzy match against file paths (e.g., 'grpo', 'sft').",
456
+ },
457
+ "repo": {
458
+ "type": "string",
459
+ "description": "Repository name (e.g., 'trl', 'transformers'). Required.",
460
+ },
461
+ "org": {
462
+ "type": "string",
463
+ "description": "GitHub organization or username. Default: 'huggingface'.",
464
+ },
465
+ "max_results": {
466
+ "type": "integer",
467
+ "description": "Maximum number of results to return. Default: 50.",
468
+ },
469
+ "min_score": {
470
+ "type": "integer",
471
+ "description": "Minimum fuzzy match score (0-100). Default: 60.",
472
+ },
473
+ },
474
+ "required": ["repo"],
475
+ },
476
+ }
477
+
478
+
479
+ async def github_find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
480
+ """Handler for agent tool router"""
481
+ try:
482
+ result = find_examples(
483
+ keyword=arguments.get("keyword", ""),
484
+ repo=arguments["repo"],
485
+ org=arguments.get("org", "huggingface"),
486
+ max_results=arguments.get("max_results", 50),
487
+ min_score=arguments.get("min_score", 60),
488
+ )
489
+ return result["formatted"], not result.get("isError", False)
490
+ except Exception as e:
491
+ return f"Error finding examples: {str(e)}", False
agent/tools/github_list_repos.py ADDED
@@ -0,0 +1,281 @@
1
+ """
2
+ GitHub List Repositories Tool - List and sort repositories for any user or organization
3
+
4
+ Efficiently discover repositories with flexible sorting options.
5
+ """
6
+
7
+ import os
8
+ from typing import Any, Dict, Literal, Optional
9
+
10
+ import requests
11
+
12
+ from agent.tools.types import ToolResult
13
+
14
+
15
+ def list_repos(
16
+ owner: str,
17
+ owner_type: Literal["user", "org"] = "org",
18
+ sort: Literal["stars", "forks", "updated", "created"] = "stars",
19
+ order: Literal["asc", "desc"] = "desc",
20
+ limit: Optional[int] = 30,
21
+ ) -> ToolResult:
22
+ """
23
+ List repositories for a user or organization using GitHub REST API.
24
+
25
+ Args:
26
+ owner: GitHub username or organization name
27
+ owner_type: Whether the owner is a "user" or "org" (default: "org")
28
+ sort: Sort field - "stars", "forks", "updated", or "created"
29
+ order: Sort order - "asc" or "desc" (default: "desc")
30
+ limit: Maximum number of repositories to return
31
+
32
+ Returns:
33
+ ToolResult with repository information
34
+ """
35
+ token = os.environ.get("GITHUB_TOKEN")
36
+ if not token:
37
+ return {
38
+ "formatted": "Error: GITHUB_TOKEN environment variable is required",
39
+ "totalResults": 0,
40
+ "resultsShared": 0,
41
+ "isError": True,
42
+ }
43
+
44
+ if owner_type == "org":
45
+ url = f"https://api.github.com/orgs/{owner}/repos"
46
+ else:
47
+ url = f"https://api.github.com/users/{owner}/repos"
48
+
49
+ headers = {
50
+ "Accept": "application/vnd.github+json",
51
+ "X-GitHub-Api-Version": "2022-11-28",
52
+ "Authorization": f"Bearer {token}",
53
+ }
54
+
55
+ all_repos = []
56
+ page = 1
57
+ per_page = 100 # Maximum allowed by GitHub
58
+
59
+ # Map our sort values to GitHub API sort values
60
+ # Note: GitHub list repos API doesn't support sorting by stars/forks
61
+ # We'll fetch all repos and sort in memory for those cases
62
+ api_sort_map = {
63
+ "created": "created",
64
+ "updated": "updated",
65
+ "stars": None, # Not supported by list API
66
+ "forks": None, # Not supported by list API
67
+ }
68
+
69
+ api_sort = api_sort_map.get(sort)
70
+ need_manual_sort = api_sort is None
71
+
72
+ try:
73
+ while True:
74
+ params = {
75
+ "page": page,
76
+ "per_page": per_page,
77
+ }
78
+
79
+ # Only add sort/direction if API supports it
80
+ if api_sort:
81
+ params["sort"] = api_sort
82
+ params["direction"] = order
83
+
84
+ response = requests.get(
85
+ url,
86
+ headers=headers,
87
+ params=params,
88
+ timeout=30,
89
+ )
90
+
91
+ if response.status_code == 403:
92
+ error_data = response.json()
93
+ return {
94
+ "formatted": f"GitHub API rate limit or permission error: {error_data.get('message', 'Unknown error')}",
95
+ "totalResults": 0,
96
+ "resultsShared": 0,
97
+ "isError": True,
98
+ }
99
+
100
+ if response.status_code != 200:
101
+ error_msg = f"GitHub API error (status {response.status_code})"
102
+ try:
103
+ error_data = response.json()
104
+ if "message" in error_data:
105
+ error_msg += f": {error_data['message']}"
106
+ except Exception:
107
+ pass
108
+ return {
109
+ "formatted": error_msg,
110
+ "totalResults": 0,
111
+ "resultsShared": 0,
112
+ "isError": True,
113
+ }
114
+
115
+ items = response.json()
116
+
117
+ if not items:
118
+ break
119
+
120
+ for item in items:
121
+ all_repos.append(
122
+ {
123
+ "name": item.get("name"),
124
+ "full_name": item.get("full_name"),
125
+ "description": item.get("description"),
126
+ "html_url": item.get("html_url"),
127
+ "language": item.get("language"),
128
+ "stars": item.get("stargazers_count", 0),
129
+ "forks": item.get("forks_count", 0),
130
+ "open_issues": item.get("open_issues_count", 0),
131
+ "topics": item.get("topics", []),
132
+ "updated_at": item.get("updated_at"),
133
+ "created_at": item.get("created_at"),
134
+ }
135
+ )
136
+
137
+ # Check if we got fewer results than requested (last page)
138
+ if len(items) < per_page:
139
+ break
140
+
141
+ # Stop if we have enough repos
142
+ if limit and len(all_repos) >= limit:
143
+ break
144
+
145
+ page += 1
146
+
147
+ except requests.exceptions.RequestException as e:
148
+ return {
149
+ "formatted": f"Failed to connect to GitHub API: {str(e)}",
150
+ "totalResults": 0,
151
+ "resultsShared": 0,
152
+ "isError": True,
153
+ }
154
+
155
+ # Manual sorting if needed (for stars/forks)
156
+ if need_manual_sort and all_repos:
157
+ reverse = order == "desc"
158
+ all_repos.sort(key=lambda x: x[sort], reverse=reverse)
159
+
160
+ # Apply limit after sorting
161
+ if limit:
162
+ all_repos = all_repos[:limit]
163
+
164
+ if not all_repos:
165
+ return {
166
+ "formatted": f"No repositories found for {owner_type} '{owner}'",
167
+ "totalResults": 0,
168
+ "resultsShared": 0,
169
+ }
170
+
171
+ # Format output
172
+ lines = [f"**Found {len(all_repos)} repositories for {owner}:**\n"]
173
+
174
+ for i, repo in enumerate(all_repos, 1):
175
+ lines.append(f"{i}. **{repo['full_name']}**")
176
+ lines.append(
177
+ f" ⭐ {repo['stars']:,} stars | 🍴 {repo['forks']:,} forks | Language: {repo['language'] or 'N/A'}"
178
+ )
179
+ if repo["description"]:
180
+ desc = (
181
+ repo["description"][:100] + "..."
182
+ if len(repo["description"]) > 100
183
+ else repo["description"]
184
+ )
185
+ lines.append(f" {desc}")
186
+ lines.append(f" URL: {repo['html_url']}")
187
+ if repo["topics"]:
188
+ lines.append(f" Topics: {', '.join(repo['topics'][:5])}")
189
+
190
+ # Copyable parameters for other tools
191
+ lines.append(f" Use in tools: {{'repo': '{repo['full_name']}'}}")
192
+ lines.append("")
193
+
194
+ return {
195
+ "formatted": "\n".join(lines),
196
+ "totalResults": len(all_repos),
197
+ "resultsShared": len(all_repos),
198
+ }
199
+
200
+
201
+ # Tool specification
202
+ GITHUB_LIST_REPOS_TOOL_SPEC = {
203
+ "name": "github_list_repos",
204
+ "description": (
205
+ "List and discover repositories for any GitHub user or organization with flexible sorting.\n\n"
206
+ "Returns comprehensive repository information including stars, forks, language, topics, and direct URLs. "
207
+ "Sorts by stars, forks, update date, or creation date.\n\n"
208
+ "## When to use this tool\n\n"
209
+ "- When you need to find libraries to use in your implementation, or to explore what repositories exist for a task.\n"
210
+ "- When debugging an error to looking up if others are having the same issues in repositories."
211
+ "- When finding the most popular or active projects for a user or org\n"
212
+ "## Examples\n\n"
213
+ "<example>\n"
214
+ "// ML Workflow Step: Discover HF libraries for RLHF/alignment\n"
215
+ "// Use case: Find the right library for training with human feedback\n"
216
+ "{\n"
217
+ " owner: 'huggingface',\n"
218
+ " owner_type: 'org',\n"
219
+ " sort: 'stars',\n"
220
+ " limit: 10\n"
221
+ "}\n"
222
+ "// Returns: transformers, trl, peft, accelerate, diffusers...\n"
223
+ "</example>\n\n"
224
+ "<example>\n"
225
+ "// ML Workflow Step: Check for recently updated HF repos\n"
226
+ "// Use case: Find actively maintained libraries with latest features\n"
227
+ "{\n"
228
+ " owner: 'huggingface',\n"
229
+ " owner_type: 'org',\n"
230
+ " sort: 'updated',\n"
231
+ " order: 'desc',\n"
232
+ " limit: 15\n"
233
+ "}\n"
234
+ "// Helps identify which repos have recent improvements/fixes\n"
235
+ "</example>"
236
+ ),
237
+ "parameters": {
238
+ "type": "object",
239
+ "properties": {
240
+ "owner": {
241
+ "type": "string",
242
+ "description": "GitHub username or organization name. Required.",
243
+ },
244
+ "owner_type": {
245
+ "type": "string",
246
+ "enum": ["user", "org"],
247
+ "description": "Whether the owner is a 'user' or 'org'. Default: 'org'.",
248
+ },
249
+ "sort": {
250
+ "type": "string",
251
+ "enum": ["stars", "forks", "updated", "created"],
252
+ "description": "Sort field. Options: 'stars', 'forks', 'updated', 'created'. Default: 'stars'.",
253
+ },
254
+ "order": {
255
+ "type": "string",
256
+ "enum": ["asc", "desc"],
257
+ "description": "Sort order. Options: 'asc', 'desc'. Default: 'desc'.",
258
+ },
259
+ "limit": {
260
+ "type": "integer",
261
+ "description": "Maximum number of repositories to return. No limit if not specified. Default: 30.",
262
+ },
263
+ },
264
+ "required": ["owner"],
265
+ },
266
+ }
267
+
268
+
269
+ async def github_list_repos_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
270
+ """Handler for agent tool router"""
271
+ try:
272
+ result = list_repos(
273
+ owner=arguments["owner"],
274
+ owner_type=arguments.get("owner_type", "org"),
275
+ sort=arguments.get("sort", "stars"),
276
+ order=arguments.get("order", "desc"),
277
+ limit=arguments.get("limit", 30),
278
+ )
279
+ return result["formatted"], not result.get("isError", False)
280
+ except Exception as e:
281
+ return f"Error listing repositories: {str(e)}", False
agent/tools/github_read_file.py ADDED
@@ -0,0 +1,336 @@
1
+ """
2
+ GitHub Read File Tool - Read file contents from any GitHub repository with line range support
3
+
4
+ Fetch exact file contents with metadata, supporting line ranges for efficient reading.
5
+ """
6
+
7
+ import base64
8
+ import json
9
+ import os
10
+ from typing import Any, Dict, Optional
11
+
12
+ import nbformat
13
+ import requests
14
+ from nbconvert import MarkdownExporter
15
+ from nbconvert.preprocessors import ClearOutputPreprocessor, TagRemovePreprocessor
16
+
17
+ from agent.tools.types import ToolResult
18
+
19
+
20
+ def _convert_ipynb_to_markdown(content: str) -> str:
21
+ """
22
+ Convert Jupyter notebook JSON to LLM-friendly Markdown.
23
+
24
+ Args:
25
+ content: Raw notebook JSON string
26
+
27
+ Returns:
28
+ Converted Markdown string
29
+ """
30
+ try:
31
+ # Parse notebook JSON
32
+ nb_dict = json.loads(content)
33
+
34
+ # Normalize cell sources (can be string or list of strings)
35
+ if "cells" in nb_dict:
36
+ for cell in nb_dict["cells"]:
37
+ if "source" in cell and isinstance(cell["source"], list):
38
+ cell["source"] = "".join(cell["source"])
39
+
40
+ # Read notebook with explicit version
41
+ nb = nbformat.reads(json.dumps(nb_dict), as_version=4)
42
+
43
+ # Strip outputs for LLM readability (outputs can be noisy/large)
44
+ clear = ClearOutputPreprocessor()
45
+ nb, _ = clear.preprocess(nb, {})
46
+
47
+ # Optionally remove cells tagged with "hide" or similar
48
+ remove = TagRemovePreprocessor(
49
+ remove_cell_tags={"hide", "hidden", "remove"},
50
+ remove_input_tags=set(),
51
+ remove_all_outputs_tags=set(),
52
+ )
53
+ nb, _ = remove.preprocess(nb, {})
54
+
55
+ # Convert to markdown
56
+ exporter = MarkdownExporter()
57
+ markdown, _ = exporter.from_notebook_node(nb)
58
+
59
+ return markdown
60
+
61
+ except json.JSONDecodeError:
62
+ return content
63
+ except Exception:
64
+ return content
65
+
66
+
67
+ def read_file(
68
+ repo: str,
69
+ path: str,
70
+ ref: str = "HEAD",
71
+ line_start: Optional[int] = None,
72
+ line_end: Optional[int] = None,
73
+ ) -> ToolResult:
74
+ """
75
+ Read file contents from a GitHub repository with line range support.
76
+
77
+ Args:
78
+ repo: Repository in format "owner/repo" (e.g., "github/github-mcp-server")
79
+ path: Path to file in repository (e.g., "pkg/github/search.go")
80
+ ref: Git reference - branch name, tag, or commit SHA (default: "HEAD")
81
+ line_start: Starting line number (1-indexed, inclusive)
82
+ line_end: Ending line number (1-indexed, inclusive)
83
+
84
+ Returns:
85
+ ToolResult with file contents and metadata
86
+ """
87
+ token = os.environ.get("GITHUB_TOKEN")
88
+ if not token:
89
+ return {
90
+ "formatted": "Error: GITHUB_TOKEN environment variable is required",
91
+ "totalResults": 0,
92
+ "resultsShared": 0,
93
+ "isError": True,
94
+ }
95
+
96
+ # Parse repo
97
+ if "/" not in repo:
98
+ return {
99
+ "formatted": "Error: repo must be in format 'owner/repo'",
100
+ "totalResults": 0,
101
+ "resultsShared": 0,
102
+ "isError": True,
103
+ }
104
+
105
+ owner, repo_name = repo.split("/", 1)
106
+
107
+ headers = {
108
+ "Accept": "application/vnd.github+json",
109
+ "X-GitHub-Api-Version": "2022-11-28",
110
+ "Authorization": f"Bearer {token}",
111
+ }
112
+
113
+ # Fetch file contents
114
+ url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{path}"
115
+ params = {}
116
+ if ref and ref != "HEAD":
117
+ params["ref"] = ref
118
+
119
+ try:
120
+ response = requests.get(url, headers=headers, params=params, timeout=30)
121
+
122
+ if response.status_code == 404:
123
+ return {
124
+ "formatted": f"File not found: {path} in {repo} (ref: {ref})",
125
+ "totalResults": 0,
126
+ "resultsShared": 0,
127
+ "isError": True,
128
+ }
129
+
130
+ if response.status_code != 200:
131
+ error_msg = f"GitHub API error (status {response.status_code})"
132
+ try:
133
+ error_data = response.json()
134
+ if "message" in error_data:
135
+ error_msg += f": {error_data['message']}"
136
+ except Exception:
137
+ pass
138
+ return {
139
+ "formatted": error_msg,
140
+ "totalResults": 0,
141
+ "resultsShared": 0,
142
+ "isError": True,
143
+ }
144
+
145
+ data = response.json()
146
+
147
+ # Check if it's a file
148
+ if data.get("type") != "file":
149
+ return {
150
+ "formatted": f"Path {path} is not a file (type: {data.get('type')})",
151
+ "totalResults": 0,
152
+ "resultsShared": 0,
153
+ "isError": True,
154
+ }
155
+
156
+ # Decode content
157
+ content_b64 = data.get("content", "")
158
+ if content_b64:
159
+ content_b64 = content_b64.replace("\n", "").replace(" ", "")
160
+ content = base64.b64decode(content_b64).decode("utf-8", errors="replace")
161
+ else:
162
+ # For large files, fetch raw content
163
+ raw_headers = {
164
+ "Accept": "application/vnd.github.raw",
165
+ "X-GitHub-Api-Version": "2022-11-28",
166
+ "Authorization": f"Bearer {token}",
167
+ }
168
+ raw_response = requests.get(
169
+ url, headers=raw_headers, params=params, timeout=30
170
+ )
171
+ if raw_response.status_code != 200:
172
+ return {
173
+ "formatted": "Failed to fetch file content",
174
+ "totalResults": 0,
175
+ "resultsShared": 0,
176
+ "isError": True,
177
+ }
178
+ content = raw_response.text
179
+
180
+ if path.lower().endswith(".ipynb"):
181
+ content = _convert_ipynb_to_markdown(content)
182
+
183
+ # Process line ranges
184
+ lines = content.split("\n")
185
+ total_lines = len(lines)
186
+
187
+ truncated = False
188
+
189
+ if line_start is None and line_end is None:
190
+ # No range specified
191
+ if total_lines > 300:
192
+ line_start = 1
193
+ line_end = 300
194
+ truncated = True
195
+ else:
196
+ line_start = 1
197
+ line_end = total_lines
198
+ else:
199
+ # Range specified
200
+ if line_start is None:
201
+ line_start = 1
202
+ if line_end is None:
203
+ line_end = total_lines
204
+
205
+ # Validate range
206
+ line_start = max(1, line_start)
207
+ line_end = min(total_lines, line_end)
208
+ if line_start > line_end:
209
+ return {
210
+ "formatted": f"Invalid range: line_start ({line_start}) > line_end ({line_end})",
211
+ "totalResults": 0,
212
+ "resultsShared": 0,
213
+ "isError": True,
214
+ }
215
+
216
+ # Extract lines
217
+ selected_lines = lines[line_start - 1 : line_end]
218
+ selected_content = "\n".join(selected_lines)
219
+
220
+ # Format output
221
+ lines_output = [f"**Reading file from repo: {repo}, path: {path}**"]
222
+
223
+ if ref and ref != "HEAD":
224
+ lines_output.append(f"Ref: {ref}")
225
+
226
+ lines_output.append("\n**File content:**")
227
+ lines_output.append("```")
228
+ lines_output.append(selected_content)
229
+ lines_output.append("```")
230
+ if truncated:
231
+ lines_output.append(
232
+ f"Currently showing lines {line_start}-{line_end} out of {total_lines} total lines. Use line_start and line_end to view more lines."
233
+ )
234
+ return {
235
+ "formatted": "\n".join(lines_output),
236
+ "totalResults": 1,
237
+ "resultsShared": 1,
238
+ }
239
+
240
+ except requests.exceptions.RequestException as e:
241
+ return {
242
+ "formatted": f"Failed to connect to GitHub API: {str(e)}",
243
+ "totalResults": 0,
244
+ "resultsShared": 0,
245
+ "isError": True,
246
+ }
247
+
248
+
249
+ # Tool specification
250
+ GITHUB_READ_FILE_TOOL_SPEC = {
251
+ "name": "github_read_file",
252
+ "description": (
253
+ "Read file contents from any GitHub repository with line range support.\n\n"
254
+ "Fetches exact file contents in the given line range (default 300 lines, use line_start/line_end adjust). \n\n"
255
+ "## When to use this tool\n\n"
256
+ "- When reading example code, implementations, or documentation on a specific github file\n"
257
+ "- When you found a file via github_list_repos, or github_find_examples and need its contents\n"
258
+ "- When investigating specific code sections with line ranges\n"
259
+ "- When reading from specific branches, tags, or commits\n"
260
+ "## When NOT to use this tool\n\n"
261
+ "- When you don't know the exact file path beforehand (use github_search_code or github_find_examples first)\n\n"
262
+ "## Examples\n\n"
263
+ "<example>\n"
264
+ "// ML Workflow Step: Reading example code from for GRPO training with TRL\n"
265
+ "// Use case: Read trainer class to understand API and methods\n"
266
+ "{\n"
267
+ " repo: 'huggingface/trl',\n"
268
+ " path: 'trl/trainer/grpo_trainer.py',\n"
269
+ " line_start: 1,\n"
270
+ " line_end: 200\n"
271
+ "}\n"
272
+ "// Read class definition and constructor to understand parameters\n"
273
+ "</example>\n\n"
274
+ "<example>\n"
275
+ "// ML Workflow Step: Study complete training script\n"
276
+ "// Use case: Learn end-to-end VLM fine-tuning with GRPO\n"
277
+ "{\n"
278
+ " repo: 'huggingface/trl',\n"
279
+ " path: 'examples/scripts/grpo_vlm.py'\n"
280
+ "}\n"
281
+ "// Returns first 300 lines of the file\n"
282
+ "</example>\n\n"
283
+ "<example>\n"
284
+ "// ML Workflow Step: Check configuration patterns\n"
285
+ "// Use case: Learn how to structure training configs\n"
286
+ "{\n"
287
+ " repo: 'huggingface/transformers',\n"
288
+ " path: 'examples/pytorch/language-modeling/run_clm.py',\n"
289
+ " line_start: 50,\n"
290
+ " line_end: 150\n"
291
+ "}\n"
292
+ "// Read argument parsing and config setup section\n"
293
+ "</example>"
294
+ ),
295
+ "parameters": {
296
+ "type": "object",
297
+ "properties": {
298
+ "repo": {
299
+ "type": "string",
300
+ "description": "Repository in format 'owner/repo' (e.g., 'github/github-mcp-server'). Required.",
301
+ },
302
+ "path": {
303
+ "type": "string",
304
+ "description": "Path to file in repository (e.g., 'src/index.js'). Required.",
305
+ },
306
+ "ref": {
307
+ "type": "string",
308
+ "description": "Git reference - branch name, tag, or commit SHA. Default: 'HEAD'.",
309
+ },
310
+ "line_start": {
311
+ "type": "integer",
312
+ "description": "Starting line number (1-indexed, inclusive). Optional.",
313
+ },
314
+ "line_end": {
315
+ "type": "integer",
316
+ "description": "Ending line number (1-indexed, inclusive). Optional.",
317
+ },
318
+ },
319
+ "required": ["repo", "path"],
320
+ },
321
+ }
322
+
323
+
324
+ async def github_read_file_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
325
+ """Handler for agent tool router"""
326
+ try:
327
+ result = read_file(
328
+ repo=arguments["repo"],
329
+ path=arguments["path"],
330
+ ref=arguments.get("ref", "HEAD"),
331
+ line_start=arguments.get("line_start"),
332
+ line_end=arguments.get("line_end"),
333
+ )
334
+ return result["formatted"], not result.get("isError", False)
335
+ except Exception as e:
336
+ return f"Error reading file: {str(e)}", False
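A minimal driver for the handler above, as a sketch — the `agent.tools.github_read_file` module path is assumed from this PR's layout, and the call mirrors the first example in the tool spec:

```python
# Sketch: exercising github_read_file_handler from an async entry point.
# Assumes this module is importable as agent.tools.github_read_file.
import asyncio

from agent.tools.github_read_file import github_read_file_handler


async def main() -> None:
    text, ok = await github_read_file_handler(
        {
            "repo": "huggingface/trl",
            "path": "trl/trainer/grpo_trainer.py",
            "line_start": 1,
            "line_end": 200,
        }
    )
    print("success:", ok)  # False if read_file flagged isError
    print(text[:400])      # formatted markdown output

asyncio.run(main())
```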
agent/tools/github_search_code.py ADDED
@@ -0,0 +1,453 @@
1
+ """
2
+ GitHub Code Search Tool - Search code across GitHub with intelligent filtering
3
+
4
+ Maps user-friendly patterns to GitHub's Code Search API capabilities.
5
+ """
6
+
7
+ import fnmatch
8
+ import os
9
+ import re
10
+ from typing import Any, Dict, Optional
11
+
12
+ import requests
13
+
14
+ from agent.tools.types import ToolResult
15
+
16
+
17
+ def _glob_match(text: str, pattern: str) -> bool:
18
+ """Check if text matches glob pattern, supporting ** for multi-level paths"""
19
+ if "**" in pattern:
20
+ regex_pattern = pattern.replace("**", "<<<DOUBLESTAR>>>")
21
+ regex_pattern = fnmatch.translate(regex_pattern)
22
+ regex_pattern = regex_pattern.replace("<<<DOUBLESTAR>>>", ".*")
23
+ return re.match(regex_pattern, text) is not None
24
+ return fnmatch.fnmatch(text, pattern)
25
+
26
+
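A few doctest-style checks of `_glob_match` as defined above (note that `fnmatch`-style `*` is not path-aware, so `**` is mapped to `.*` explicitly):

```python
# Sketch: expected behavior of _glob_match.
assert _glob_match("trl/trainer/grpo_trainer.py", "trl/**/*.py")  # ** spans directories
assert _glob_match("huggingface/trl", "huggingface/*")            # plain fnmatch path
assert not _glob_match("google/jax", "huggingface/*")
```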
27
+ def _parse_repo_filter(repo_pattern: str) -> tuple[Optional[str], Optional[str]]:
28
+ """
29
+ Parse repository pattern into GitHub API filter and client-side glob pattern.
30
+
31
+ Returns: (api_filter, client_glob)
32
+ - api_filter: GitHub API filter string (e.g., "org:huggingface")
33
+ - client_glob: Pattern for client-side filtering (e.g., "huggingface/trl*")
34
+
35
+ Examples:
36
+ "huggingface/trl" → ("repo:huggingface/trl", None)
37
+ "huggingface/*" → ("org:huggingface", "huggingface/*")
38
+ "huggingface/trl*" → ("org:huggingface", "huggingface/trl*")
39
+ "huggingface" → ("org:huggingface", None)
40
+ "*/*" → (None, "*/*")
41
+ """
42
+ if not repo_pattern:
43
+ return None, None
44
+
45
+ # Pattern: owner/repo (exact match)
46
+ if "/" in repo_pattern and "*" not in repo_pattern and "?" not in repo_pattern:
47
+ return f"repo:{repo_pattern}", None
48
+
49
+ # Pattern: owner/* or owner/prefix* (org + client filter)
50
+ if "/" in repo_pattern and ("*" in repo_pattern or "?" in repo_pattern):
51
+ org_name = repo_pattern.split("/")[0]
52
+ if "*" not in org_name and "?" not in org_name:
53
+ return f"org:{org_name}", repo_pattern
54
+ # Org name has wildcards - can't filter server-side
55
+ return None, repo_pattern
56
+
57
+ # Pattern: owner (just org name, no wildcards)
58
+ if "*" not in repo_pattern and "?" not in repo_pattern:
59
+ return f"org:{repo_pattern}", None
60
+
61
+ # Pattern: */* or other complex patterns (client-side only)
62
+ return None, repo_pattern
63
+
64
+
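The docstring's mapping, restated as executable checks (a sketch; tuples are `(api_filter, client_glob)`):

```python
# Sketch: _parse_repo_filter outputs for the documented patterns.
assert _parse_repo_filter("huggingface/trl") == ("repo:huggingface/trl", None)
assert _parse_repo_filter("huggingface/*") == ("org:huggingface", "huggingface/*")
assert _parse_repo_filter("huggingface") == ("org:huggingface", None)
assert _parse_repo_filter("*/*") == (None, "*/*")
```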
65
+ def _parse_path_filter(path_pattern: str) -> tuple[Optional[str], Optional[str]]:
66
+ """
67
+ Parse path pattern into GitHub API filter and client-side glob pattern.
68
+
69
+ Returns: (api_filter, client_glob)
70
+
71
+ Examples:
72
+ "*.py" → ("extension:py", None)
73
+ "**/*.py" → ("extension:py", None)
74
+ "src/**/*.py" → ("extension:py", "src/**/*.py")
75
+ "test_*.py" → ("extension:py", "test_*.py")
76
+ "src/main.py" → ("path:src/main.py", None)
77
+ """
78
+ if not path_pattern:
79
+ return None, None
80
+
81
+ # Exact path (no wildcards)
82
+ if "*" not in path_pattern and "?" not in path_pattern:
83
+ return f"path:{path_pattern}", None
84
+
85
+ # Extract extension if present
86
+ ext_match = re.search(r"\*\.(\w+)$", path_pattern)
87
+ if ext_match:
88
+ extension = ext_match.group(1)
89
+ api_filter = f"extension:{extension}"
90
+
91
+ # Check if there's a directory prefix that needs client-side filtering
92
+ # e.g., "src/**/*.py" needs client filter, "**/*.py" doesn't
93
+ if path_pattern in [f"*.{extension}", f"**/*.{extension}"]:
94
+ # Simple patterns - API filter is enough
95
+ return api_filter, None
96
+ else:
97
+ # Complex pattern - need client-side filter too
98
+ return api_filter, path_pattern
99
+
100
+ # Pattern like "test_*.py" or "README*" - use filename with client filter
101
+ # GitHub's filename: doesn't support wildcards, so we rely on client-side
102
+ if "/" not in path_pattern:
103
+ # Try to extract extension for API filtering
104
+ if "." in path_pattern:
105
+ parts = path_pattern.rsplit(".", 1)
106
+ if "*" not in parts[-1] and "?" not in parts[-1]:
107
+ # Extension is clean
108
+ return f"extension:{parts[-1]}", path_pattern
109
+ # No extension or complex - client-side only
110
+ return None, path_pattern
111
+
112
+ # Complex path pattern - client-side only
113
+ return None, path_pattern
114
+
115
+
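And the same for `_parse_path_filter` — extension filters go server-side, while directory structure falls back to the client-side glob:

```python
# Sketch: _parse_path_filter outputs for the documented patterns.
assert _parse_path_filter("**/*.py") == ("extension:py", None)
assert _parse_path_filter("src/**/*.py") == ("extension:py", "src/**/*.py")
assert _parse_path_filter("test_*.py") == ("extension:py", "test_*.py")
assert _parse_path_filter("src/main.py") == ("path:src/main.py", None)
```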
116
+ def search_code(
117
+ query: str,
118
+ repo_pattern: Optional[str] = None,
119
+ path_pattern: Optional[str] = None,
120
+ regex: bool = False,
121
+ max_results: int = 20,
122
+ ) -> ToolResult:
123
+ """
124
+ Search for code across GitHub with intelligent pattern matching.
125
+
126
+ This tool intelligently maps user patterns to GitHub's Code Search API capabilities:
127
+
128
+ Repository Patterns:
129
+ - "owner/repo" → Searches exact repository
130
+ - "owner/*" or "owner" → Searches all repos in organization
131
+ - "*/*" → Searches all GitHub (no repo filter)
132
+ - Wildcards trigger client-side filtering when needed
133
+
134
+ Path Patterns:
135
+ - "*.py" → Searches all Python files
136
+ - "**/*.js" → Searches all JavaScript files (any directory)
137
+ - "src/**/*.py" → Python files in src/ (uses client-side filtering)
138
+ - "test_*.py" → Files matching pattern (client-side filtering)
139
+ - "path/to/file.py" → Exact file path
140
+
141
+ Args:
142
+ query: Search term or pattern to find in code
143
+ repo_pattern: Repository pattern (e.g., "huggingface/trl", "huggingface/*", "huggingface")
144
+ path_pattern: File path pattern (e.g., "*.py", "src/**/*.js")
145
+ regex: If True, treat query as regular expression
146
+ max_results: Maximum number of results to return (default 20)
147
+
148
+ Returns:
149
+ ToolResult with code matches and snippets
150
+ """
151
+ token = os.environ.get("GITHUB_TOKEN")
152
+ if not token:
153
+ return {
154
+ "formatted": "Error: GITHUB_TOKEN environment variable is required",
155
+ "totalResults": 0,
156
+ "resultsShared": 0,
157
+ "isError": True,
158
+ }
159
+
160
+ # Build GitHub API query
161
+ query_parts = []
162
+
163
+ # Add search term
164
+ if regex:
165
+ query_parts.append(f"/{query}/")
166
+ else:
167
+ query_parts.append(f'"{query}"' if " " in query else query)
168
+
169
+ # Parse repository filter
170
+ repo_api_filter, repo_client_glob = _parse_repo_filter(repo_pattern)
171
+ if repo_api_filter:
172
+ query_parts.append(repo_api_filter)
173
+
174
+ # Parse path filter
175
+ path_api_filter, path_client_glob = _parse_path_filter(path_pattern)
176
+ if path_api_filter:
177
+ query_parts.append(path_api_filter)
178
+
179
+ github_query = " ".join(query_parts)
180
+
181
+ headers = {
182
+ "Accept": "application/vnd.github.text-match+json",
183
+ "X-GitHub-Api-Version": "2022-11-28",
184
+ "Authorization": f"Bearer {token}",
185
+ }
186
+
187
+ all_matches = []
188
+ page = 1
189
+ per_page = min(100, max_results)
190
+
191
+ try:
192
+ while len(all_matches) < max_results:
193
+ params = {
194
+ "q": github_query,
195
+ "page": page,
196
+ "per_page": per_page,
197
+ }
198
+
199
+ response = requests.get(
200
+ "https://api.github.com/search/code",
201
+ headers=headers,
202
+ params=params,
203
+ timeout=30,
204
+ )
205
+
206
+ if response.status_code == 403:
207
+ error_data = response.json()
208
+ return {
209
+ "formatted": f"GitHub API rate limit or permission error: {error_data.get('message', 'Unknown error')}",
210
+ "totalResults": 0,
211
+ "resultsShared": 0,
212
+ "isError": True,
213
+ }
214
+
215
+ if response.status_code != 200:
216
+ error_msg = f"GitHub API error (status {response.status_code})"
217
+ try:
218
+ error_data = response.json()
219
+ if "message" in error_data:
220
+ error_msg += f": {error_data['message']}"
221
+ except Exception:
222
+ pass
223
+ return {
224
+ "formatted": error_msg,
225
+ "totalResults": 0,
226
+ "resultsShared": 0,
227
+ "isError": True,
228
+ }
229
+
230
+ data = response.json()
231
+ items = data.get("items", [])
232
+
233
+ if not items:
234
+ break
235
+
236
+ for item in items:
237
+ repo_name = item.get("repository", {}).get("full_name", "unknown")
238
+ file_path = item.get("path", "")
239
+ sha = item.get("sha", "")
240
+
241
+ # Apply client-side filtering
242
+ if repo_client_glob and not _glob_match(repo_name, repo_client_glob):
243
+ continue
244
+ if path_client_glob and not _glob_match(file_path, path_client_glob):
245
+ continue
246
+
247
+ # Extract text matches
248
+ text_matches = item.get("text_matches", [])
249
+ if text_matches:
250
+ for text_match in text_matches:
251
+ fragment = text_match.get("fragment", "")
252
+ lines = fragment.split("\n")
253
+ line_count = len([line for line in lines if line.strip()])
254
+
255
+ all_matches.append(
256
+ {
257
+ "repo": repo_name,
258
+ "path": file_path,
259
+ "ref": sha,
260
+ "line_start": 1,
261
+ "line_end": line_count,
262
+ "snippet": fragment.strip(),
263
+ "url": item.get("html_url", ""),
264
+ }
265
+ )
266
+ else:
267
+ all_matches.append(
268
+ {
269
+ "repo": repo_name,
270
+ "path": file_path,
271
+ "ref": sha,
272
+ "line_start": 1,
273
+ "line_end": 1,
274
+ "snippet": "(snippet not available)",
275
+ "url": item.get("html_url", ""),
276
+ }
277
+ )
278
+
279
+ if len(all_matches) >= data.get("total_count", 0):
280
+ break
281
+
282
+ page += 1
283
+
284
+ except requests.exceptions.RequestException as e:
285
+ return {
286
+ "formatted": f"Failed to connect to GitHub API: {str(e)}",
287
+ "totalResults": 0,
288
+ "resultsShared": 0,
289
+ "isError": True,
290
+ }
291
+
292
+ results = all_matches[:max_results]
293
+
294
+ if not results:
295
+ return {
296
+ "formatted": f"No code matches found for query: {query}",
297
+ "totalResults": 0,
298
+ "resultsShared": 0,
299
+ }
300
+
301
+ # Format output
302
+ lines_output = [f"**Found {len(results)} code matches:**\n"]
303
+
304
+ for i, match in enumerate(results, 1):
305
+ lines_output.append(f"{i}. **{match['repo']}:{match['path']}**")
306
+ lines_output.append(
307
+ f" Lines: {match['line_start']}-{match['line_end']} | Ref: {match['ref'][:7]}"
308
+ )
309
+ lines_output.append(f" URL: {match['url']}")
310
+
311
+ # Copyable parameters for read_file tool
312
+ read_params = f"{{'repo': '{match['repo']}', 'path': '{match['path']}', 'ref': '{match['ref'][:7]}'}}"
313
+ lines_output.append(f" To read, use: {read_params}")
314
+
315
+ # Show snippet (first 5 lines)
316
+ snippet_lines = match["snippet"].split("\n")[:5]
317
+ if snippet_lines:
318
+ lines_output.append(" ```")
319
+ for line in snippet_lines:
320
+ lines_output.append(f" {line}")
321
+ if len(match["snippet"].split("\n")) > 5:
322
+ lines_output.append(" ...")
323
+ lines_output.append(" ```")
324
+ lines_output.append("")
325
+
326
+ return {
327
+ "formatted": "\n".join(lines_output),
328
+ "totalResults": len(results),
329
+ "resultsShared": len(results),
330
+ }
331
+
332
+
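For orientation, what the code above assembles for a typical call before any network I/O (a sketch reusing the helpers defined above):

```python
# Sketch: query assembly as performed inside search_code.
query = "TrainingArguments"  # no spaces -> left unquoted
repo_api, repo_glob = _parse_repo_filter("huggingface/*")
path_api, path_glob = _parse_path_filter("src/**/*.py")

github_query = " ".join([query] + [f for f in (repo_api, path_api) if f])
print(github_query)  # TrainingArguments org:huggingface extension:py
# repo_glob / path_glob ("huggingface/*", "src/**/*.py") are then applied
# client-side to each returned item.
```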
333
+ # Tool specification
334
+ GITHUB_SEARCH_CODE_TOOL_SPEC = {
335
+ "name": "github_search_code",
336
+ "description": (
337
+ "Search for code patterns across GitHub repositories with intelligent pattern matching.\n\n"
338
+ "Searches for specific code patterns, functions, classes, or implementations across GitHub. "
339
+ "Intelligently maps patterns to GitHub's Code Search API for efficient server-side filtering, "
340
+ "with automatic client-side filtering for complex patterns. Returns code snippets with context.\n\n"
341
+ "## When to use this tool\n\n"
342
+ "- When searching for specific code patterns, functions, or classes across repositories\n"
343
+ "- When looking for implementation examples of specific methods or APIs\n"
344
+ "- When you need to find where specific code exists across multiple files or repos\n"
345
+ "- When investigating how a feature is implemented in different repositories\n"
346
+ "- When searching for TODO comments, specific patterns, or code structures\n"
347
+ "- Use this for searching actual implementation code (not examples - use github_find_examples for those)\n\n"
348
+ "## When NOT to use this tool\n\n"
349
+ "- When looking for example files or tutorials (use github_find_examples instead)\n"
350
+ "- When you already know the exact file path (use github_read_file directly)\n"
351
+ "- When you need to list repositories (use github_list_repos instead)\n\n"
352
+ "## Repository Patterns\n\n"
353
+ "- **Exact repo**: `'huggingface/trl'` → Searches only that repository\n"
354
+ "- **Organization**: `'huggingface'` or `'huggingface/*'` → All repos in organization\n"
355
+ "- **All GitHub**: `'*/*'` or omit repo_pattern → Searches across all GitHub\n"
356
+ "- **Wildcards**: `'huggingface/trl*'` → Automatic client-side filtering for complex patterns\n\n"
357
+ "## Path Patterns\n\n"
358
+ "- **Extension**: `'*.py'` or `'**/*.py'` → All Python files\n"
359
+ "- **Directory**: `'src/**/*.js'` → JavaScript files in src/ directory (client-filtered)\n"
360
+ "- **Pattern**: `'test_*.py'` → Files matching pattern (client-filtered)\n"
361
+ "- **Exact path**: `'README.md'` → Specific file\n\n"
362
+ "## How it works\n\n"
363
+ "1. Parses repository and path patterns\n"
364
+ "2. Converts to GitHub API filters when possible (server-side, fast)\n"
365
+ "3. Falls back to client-side filtering for complex patterns\n"
366
+ "4. Returns code snippets with line numbers, URLs, and file refs\n"
367
+ "5. Results can be used directly with github_read_file tool\n\n"
368
+ "## Examples\n\n"
369
+ "<example>\n"
370
+ "// ML Workflow Step: Find how AutoModelForCausalLM is used\n"
371
+ "// Use case: Learning best practices for loading LLMs in TRL\n"
372
+ "{\n"
373
+ " query: 'AutoModelForCausalLM.from_pretrained',\n"
374
+ " repo_pattern: 'huggingface/trl',\n"
375
+ " path_pattern: '*.py'\n"
376
+ "}\n"
377
+ "// Finds all model loading patterns with quantization, device_map, etc.\n"
378
+ "</example>\n\n"
379
+ "<example>\n"
380
+ "// ML Workflow Step: Discover TrainingArguments configurations\n"
381
+ "// Use case: Setting up training hyperparameters correctly\n"
382
+ "{\n"
383
+ " query: 'TrainingArguments',\n"
384
+ " repo_pattern: 'huggingface/transformers',\n"
385
+ " path_pattern: 'examples/**/*.py',\n"
386
+ " max_results: 10\n"
387
+ "}\n"
388
+ "// Shows various TrainingArguments setups across different tasks\n"
389
+ "</example>\n\n"
390
+ "<example>\n"
391
+ "// ML Workflow Step: Find dataset preprocessing patterns\n"
392
+ "// Use case: Learning how to prepare data for instruction tuning\n"
393
+ "{\n"
394
+ " query: 'map(tokenize',\n"
395
+ " repo_pattern: 'huggingface',\n"
396
+ " path_pattern: '*.py'\n"
397
+ "}\n"
398
+ "// Discovers tokenization and dataset mapping patterns\n"
399
+ "</example>\n\n"
400
+ "<example>\n"
401
+ "// ML Workflow Step: Find all Trainer class implementations\n"
402
+ "// Use case: Understanding available trainer variants for different tasks\n"
403
+ "{\n"
404
+ " query: 'class \\\\w+Trainer\\\\(',\n"
405
+ " repo_pattern: 'huggingface/trl',\n"
406
+ " path_pattern: 'trl/trainer/**/*.py',\n"
407
+ " regex: true\n"
408
+ "}\n"
409
+ "// Lists: GRPOTrainer, DPOTrainer, PPOTrainer, RewardTrainer, etc.\n"
410
+ "</example>"
411
+ ),
412
+ "parameters": {
413
+ "type": "object",
414
+ "properties": {
415
+ "query": {
416
+ "type": "string",
417
+ "description": "Search term or pattern to find in code. Required.",
418
+ },
419
+ "repo_pattern": {
420
+ "type": "string",
421
+ "description": "Repository pattern: 'owner/repo' (exact), 'owner' (org), 'owner/*' (org with filter), '*/*' (all). Optional.",
422
+ },
423
+ "path_pattern": {
424
+ "type": "string",
425
+ "description": "File path pattern: '*.ext' (extension), 'dir/**/*.ext' (directory), 'pattern*.ext' (name pattern). Optional.",
426
+ },
427
+ "regex": {
428
+ "type": "boolean",
429
+ "description": "If true, treat query as regular expression. Default: false.",
430
+ },
431
+ "max_results": {
432
+ "type": "integer",
433
+ "description": "Maximum number of results to return. Default: 20.",
434
+ },
435
+ },
436
+ "required": ["query"],
437
+ },
438
+ }
439
+
440
+
441
+ async def github_search_code_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
442
+ """Handler for agent tool router"""
443
+ try:
444
+ result = search_code(
445
+ query=arguments["query"],
446
+ repo_pattern=arguments.get("repo_pattern"),
447
+ path_pattern=arguments.get("path_pattern"),
448
+ regex=arguments.get("regex", False),
449
+ max_results=arguments.get("max_results", 20),
450
+ )
451
+ return result["formatted"], not result.get("isError", False)
452
+ except Exception as e:
453
+ return f"Error searching code: {str(e)}", False
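A matching driver sketch for the search handler (module path assumed from this PR's layout; `GITHUB_TOKEN` must be a real token — the value below is a placeholder):

```python
# Sketch: calling github_search_code_handler end to end.
import asyncio
import os

from agent.tools.github_search_code import github_search_code_handler

os.environ.setdefault("GITHUB_TOKEN", "<your-token>")  # placeholder, required by search_code


async def main() -> None:
    text, ok = await github_search_code_handler(
        {
            "query": "GRPOTrainer",
            "repo_pattern": "huggingface/trl",
            "path_pattern": "*.py",
            "max_results": 5,
        }
    )
    print("success:", ok)
    print(text)

asyncio.run(main())
```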
agent/tools/jobs_tool.py CHANGED
@@ -40,6 +40,20 @@ GPU_FLAVORS = [
40
  "h100",
41
  "h100x8",
42
  ]
43
  SPECIALIZED_FLAVORS = ["inf2x6"]
44
  ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
45
 
@@ -741,12 +755,12 @@ HF_JOBS_TOOL_SPEC = {
741
  "1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
742
  "2. **Docker mode:** Provide 'image' + 'command' → full control\n"
743
  "(script and command are mutually exclusive)\n\n"
744
- "## Hardware:\n"
745
- "CPU: cpu-basic (default), cpu-upgrade, cpu-performance, cpu-xl\n"
746
- "GPU: t4-small, t4-medium, l4x1, a10g-small, a10g-large, a100-large, h100\n\n"
747
  "## Examples:\n\n"
748
  "**Fine-tune LLM and push to Hub:**\n"
749
- "{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"gpt2\")\\n# ... training code ...\\nmodel.push_to_hub(\"user-name/my-finetuned-model\")', 'dependencies': ['transformers', 'torch', 'datasets'], 'hardware_flavor': 'a10g-large', 'timeout': '4h', 'env': {'CUSTOM_VAR': 'value'}}\n\n"
750
  "**Generate dataset daily and upload:**\n"
751
  "{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
752
  "**Run custom training with Docker:**\n"
@@ -807,7 +821,7 @@ HF_JOBS_TOOL_SPEC = {
807
  # Hardware and environment
808
  "hardware_flavor": {
809
  "type": "string",
810
- "description": "Hardware type. CPU: cpu-basic (default), cpu-upgrade, cpu-performance, cpu-xl. GPU: t4-small, t4-medium, l4x1, a10g-small, a10g-large, a100-large, h100. Use with 'run'/'scheduled run'.",
811
  },
812
  "timeout": {
813
  "type": "string",
 
40
  "h100",
41
  "h100x8",
42
  ]
43
+
44
+ # Detailed specs for display (vCPU/RAM/GPU VRAM)
45
+ CPU_FLAVORS_DESC = (
46
+ "cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB), cpu-performance, cpu-xl"
47
+ )
48
+ GPU_FLAVORS_DESC = (
49
+ "t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
50
+ "l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
51
+ "l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB), "
52
+ "a10g-small(4vCPU/14GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
53
+ "a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
54
+ "a100-large(12vCPU/142GB/GPU 80GB), h100(23vCPU/240GB/GPU 80GB), h100x8(184vCPU/1920GB/GPU 640GB), "
55
+ "zero-a10g(dynamic alloc)"
56
+ )
57
  SPECIALIZED_FLAVORS = ["inf2x6"]
58
  ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
59
 
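Since the new `*_DESC` strings and the flavor lists live at module level, they are baked into the tool spec once at import time; a sketch of how `ALL_FLAVORS` could back a validation step (illustrative only, not part of this diff):

```python
# Sketch: rejecting an unknown flavor before submitting a job.
requested = "a10g-large"
if requested not in ALL_FLAVORS:
    raise ValueError(
        f"Unknown hardware_flavor {requested!r}; expected one of {ALL_FLAVORS}"
    )
```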
 
755
  "1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
756
  "2. **Docker mode:** Provide 'image' + 'command' → full control\n"
757
  "(script and command are mutually exclusive)\n\n"
758
+ "## Available Hardware (vCPU/RAM/GPU):\n"
759
+ f"CPU: {CPU_FLAVORS_DESC}\n"
760
+ f"GPU: {GPU_FLAVORS_DESC}\n\n"
761
  "## Examples:\n\n"
762
  "**Fine-tune LLM and push to Hub:**\n"
763
+ "{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-4B-Thinking-2507\")\\n# ... training code ...\\nmodel.push_to_hub(\"user-name/my-finetuned-model\")', 'dependencies': ['transformers', 'torch', 'datasets'], 'hardware_flavor': 'a10g-large', 'timeout': '4h', 'env': {'CUSTOM_VAR': 'value'}}\n\n"
764
  "**Generate dataset daily and upload:**\n"
765
  "{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
766
  "**Run custom training with Docker:**\n"
 
821
  # Hardware and environment
822
  "hardware_flavor": {
823
  "type": "string",
824
+ "description": f"Hardware type. Available CPU flavors: {CPU_FLAVORS}. Available GPU flavors: {GPU_FLAVORS}. Use with 'run'/'scheduled run'.",
825
  },
826
  "timeout": {
827
  "type": "string",
agent/tools/utilities.py CHANGED
@@ -2,8 +2,10 @@
2
  Utility functions for Hugging Face tools
3
 
4
  Ported from: hf-mcp-server/packages/mcp/src/jobs/formatters.ts
 
5
  """
6
 
 
7
  from datetime import datetime
8
  from typing import Any, Dict, List, Optional
9
 
@@ -126,7 +128,6 @@ def format_scheduled_jobs_table(jobs: List[Dict[str, Any]]) -> str:
126
 
127
  def format_job_details(jobs: Any) -> str:
128
  """Format job details as JSON in a markdown code block"""
129
- import json
130
 
131
  job_array = jobs if isinstance(jobs, list) else [jobs]
132
  json_str = json.dumps(job_array, indent=2)
@@ -135,7 +136,6 @@ def format_job_details(jobs: Any) -> str:
135
 
136
  def format_scheduled_job_details(jobs: Any) -> str:
137
  """Format scheduled job details as JSON in a markdown code block"""
138
- import json
139
 
140
  job_array = jobs if isinstance(jobs, list) else [jobs]
141
  json_str = json.dumps(job_array, indent=2)
 
2
  Utility functions for Hugging Face tools
3
 
4
  Ported from: hf-mcp-server/packages/mcp/src/jobs/formatters.ts
5
+ Includes GPU memory validation for job submissions
6
  """
7
 
8
+ import json
9
  from datetime import datetime
10
  from typing import Any, Dict, List, Optional
11
 
 
128
 
129
  def format_job_details(jobs: Any) -> str:
130
  """Format job details as JSON in a markdown code block"""
 
131
 
132
  job_array = jobs if isinstance(jobs, list) else [jobs]
133
  json_str = json.dumps(job_array, indent=2)
 
136
 
137
  def format_scheduled_job_details(jobs: Any) -> str:
138
  """Format scheduled job details as JSON in a markdown code block"""
 
139
 
140
  job_array = jobs if isinstance(jobs, list) else [jobs]
141
  json_str = json.dumps(job_array, indent=2)
agent/tools/utils_tools.py CHANGED
@@ -4,14 +4,9 @@ Utils Tools - General utility operations
4
  Provides system information like current date/time with timezone support.
5
  """
6
 
7
- import asyncio
8
  from datetime import datetime
9
- from typing import Any, Dict, Literal, Optional
10
-
11
- try:
12
- import zoneinfo
13
- except ImportError:
14
- from backports import zoneinfo
15
 
16
  from agent.tools.types import ToolResult
17
 
@@ -123,7 +118,9 @@ Common timezones: Europe/Paris, America/New_York, America/Los_Angeles, Asia/Toky
123
  date_str = now.strftime("%d-%m-%Y")
124
 
125
  # Format time as HH:MM:SS.mmm
126
- time_str = now.strftime("%H:%M:%S.%f")[:-3] # Remove last 3 digits to keep only milliseconds
 
 
127
 
128
  # Get timezone abbreviation/offset
129
  tz_offset = now.strftime("%z")
 
4
  Provides system information like current date/time with timezone support.
5
  """
6
 
7
+ import zoneinfo
8
  from datetime import datetime
9
+ from typing import Any, Dict, Literal
10
 
11
  from agent.tools.types import ToolResult
12
 
 
118
  date_str = now.strftime("%d-%m-%Y")
119
 
120
  # Format time as HH:MM:SS.mmm
121
+ time_str = now.strftime("%H:%M:%S.%f")[
122
+ :-3
123
+ ] # Remove last 3 digits to keep only milliseconds
124
 
125
  # Get timezone abbreviation/offset
126
  tz_offset = now.strftime("%z")
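The `[:-3]` slice relies on `%f` always rendering six microsecond digits, so dropping the last three leaves milliseconds; a self-contained check (Europe/Paris is just an example zone):

```python
# Sketch: %f is fixed-width (6 digits), so [:-3] truncates to milliseconds.
import zoneinfo
from datetime import datetime

now = datetime(2024, 5, 17, 9, 30, 12, 345678, tzinfo=zoneinfo.ZoneInfo("Europe/Paris"))
print(now.strftime("%H:%M:%S.%f"))       # 09:30:12.345678
print(now.strftime("%H:%M:%S.%f")[:-3])  # 09:30:12.345
```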
pyproject.toml CHANGED
@@ -23,4 +23,8 @@ dependencies = [
23
  "prompt-toolkit>=3.0.0",
24
  "ipykernel>=7.1.0",
25
  "ipywidgets>=8.1.8",
26
  ]
 
23
  "prompt-toolkit>=3.0.0",
24
  "ipykernel>=7.1.0",
25
  "ipywidgets>=8.1.8",
26
+ "thefuzz>=0.22.1",
27
+ "nbconvert>=7.16.6",
28
+ "nbformat>=5.10.4",
29
+ "markitdown[all,docx,outlook,pdf,pptx,xls,xlsx]>=0.1.4",
30
  ]
uv.lock CHANGED
The diff for this file is too large to render. See raw diff