""" GitHub Find Examples Tool - Discover examples, tutorials, and guides for any library Lists all files in a repository and performs deterministic keyword search. """ import os from typing import Any, Dict, List import requests from thefuzz import fuzz from agent.tools.types import ToolResult # In order of priority (lower index = higher priority for sorting) EXAMPLE_PATTERNS = [ "scripts", # General example patterns (catch-all, lower priority) "examples", "example", # Notebook patterns "notebooks", "notebook", # Tutorial/learning patterns "tutorials", "tutorial", "quickstart", "walkthroughs", "walkthrough", # Cookbook/recipe patterns "cookbook", "cookbooks", "recipes", "recipe", # Demo/sample patterns "demos", "demo", "samples", "sample", # Other patterns "guides", "guide", "getting-started", "getting_started", "playground", "howto", "how-to", "use-cases", "usecases", "use_cases", "sandbox", "showcase", ] def _get_repo_tree(org: str, repo: str, token: str) -> tuple[List[Dict[str, Any]], str]: """Get all files in a repository recursively. Returns (files, error_message)""" headers = { "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28", "Authorization": f"Bearer {token}", } full_repo = f"{org}/{repo}" # Get default branch try: response = requests.get( f"https://api.github.com/repos/{full_repo}", headers=headers, timeout=10 ) if response.status_code == 404: return [], "not_found" if response.status_code != 200: return [], f"API error: {response.status_code}" repo_data = response.json() default_branch = repo_data.get("default_branch", "main") except Exception as e: return [], f"Error fetching repo: {str(e)}" # Get repository tree recursively try: response = requests.get( f"https://api.github.com/repos/{full_repo}/git/trees/{default_branch}", headers=headers, params={"recursive": "1"}, timeout=30, ) if response.status_code != 200: return [], f"Error fetching tree: {response.status_code}" data = response.json() tree = data.get("tree", []) # Filter to only include files (not directories) files = [ { "path": item["path"], "ref": item["sha"], "size": item.get("size", 0), "url": f"https://github.com/{full_repo}/blob/{default_branch}/{item['path']}", } for item in tree if item["type"] == "blob" ] return files, "" except Exception as e: return [], f"Error processing tree: {str(e)}" def _search_similar_repos(org: str, repo: str, token: str) -> List[Dict[str, Any]]: """Search for similar repository names in the organization""" headers = { "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28", "Authorization": f"Bearer {token}", } # Search for repos in the org with similar name query = f"org:{org} {repo}" try: response = requests.get( "https://api.github.com/search/repositories", headers=headers, params={"q": query, "sort": "stars", "order": "desc", "per_page": 10}, timeout=30, ) if response.status_code != 200: return [] data = response.json() items = data.get("items", []) return [ { "name": item.get("name"), "full_name": item.get("full_name"), "description": item.get("description"), "stars": item.get("stargazers_count", 0), "url": item.get("html_url"), } for item in items ] except Exception: return [] def _score_against_example_patterns(file_path: str) -> int: """Score file against example patterns using token_set_ratio""" scores = [] for pattern in EXAMPLE_PATTERNS: score = fuzz.token_set_ratio(pattern.lower(), file_path.lower()) scores.append(score) return max(scores) if scores else 0 def _score_against_keyword(file_path: str, keyword: str) -> int: """Calculate fuzzy match score for a file path against a keyword""" # Use partial_ratio for substring matching (good for paths) # Also check token_set_ratio for word-level matching partial_score = fuzz.partial_ratio(keyword.lower(), file_path.lower()) token_score = fuzz.token_set_ratio(keyword.lower(), file_path.lower()) # Return the higher of the two return max(partial_score, token_score) def _get_pattern_priority(file_path: str) -> tuple[int, int, int]: """ Get priority of a file path based on which example pattern directory it's in. Returns: (in_examples_dir, pattern_priority, path_depth) - in_examples_dir: 0 if in examples/ directory, 1 otherwise (lower is better) - pattern_priority: Index in EXAMPLE_PATTERNS (lower is better), or 999 if no match - path_depth: Number of path segments (lower is better) Note: Prioritizes files in "examples/" directory first, then by most specific pattern match. E.g., "examples/scripts/train.py" is better than "scripts/util.py" """ path_lower = file_path.lower() path_parts = path_lower.split("/") # Check if file is in examples/ directory (highest priority) in_examples_dir = 0 if (path_parts[0] in ["examples", "example"]) else 1 # Find ALL matching patterns and use the best (lowest index) one # But prefer deeper matches (more specific) over shallow ones best_priority = 999 best_depth_at_match = -1 for i, pattern in enumerate(EXAMPLE_PATTERNS): # Check if pattern appears as a directory component in the path if pattern in path_parts: # Find the depth where this pattern appears (rightmost occurrence) depth = len(path_parts) - 1 - path_parts[::-1].index(pattern) # Prefer deeper matches, or better priority if at same depth if depth > best_depth_at_match or ( depth == best_depth_at_match and i < best_priority ): best_priority = i best_depth_at_match = depth return (in_examples_dir, best_priority, len(path_parts)) def _handle_repo_tree_errors( all_files: List[Dict[str, Any]], error: str, org: str, repo: str, token: str, ) -> ToolResult | None: """Handle errors from repo tree fetch. Returns ToolResult if error, None if OK.""" if error == "not_found": similar_repos = _search_similar_repos(org, repo, token) if not similar_repos: return { "formatted": f"Repository '{org}/{repo}' not found and no similar repositories found.", "totalResults": 0, "resultsShared": 0, "isError": True, } # Format similar repos lines = [f"**Repository '{org}/{repo}' not found. Similar repositories:**\n"] for i, r in enumerate(similar_repos, 1): lines.append(f"{i}. **{r['full_name']}** (⭐ {r['stars']:,} stars)") if r["description"]: desc = ( r["description"][:100] + "..." if len(r["description"]) > 100 else r["description"] ) lines.append(f" {desc}") lines.append(f" {r['url']}\n") return { "formatted": "\n".join(lines), "totalResults": len(similar_repos), "resultsShared": len(similar_repos), "isError": True, } if error: return { "formatted": f"Error accessing repository '{org}/{repo}': {error}", "totalResults": 0, "resultsShared": 0, "isError": True, } if not all_files: return { "formatted": f"No files found in repository '{org}/{repo}'", "totalResults": 0, "resultsShared": 0, } return None def find_examples( keyword: str = "", repo: str = "", org: str = "huggingface", max_results: int = 10, min_score: int = 80, ) -> ToolResult: """ Find example files in a repository using fuzzy matching. Args: keyword: Keyword to fuzzy match against file paths (e.g., "grpo") repo: Repository name (e.g., "trl") org: GitHub organization (default: "huggingface") max_results: Maximum number of results (default 50) min_score: Minimum fuzzy match score (0-100, default 60) Returns: ToolResult with matching files, or similar repos if repo not found """ token = os.environ.get("GITHUB_TOKEN") if not token: return { "formatted": "Error: GITHUB_TOKEN environment variable is required", "totalResults": 0, "resultsShared": 0, "isError": True, } if not repo: return { "formatted": "Error: repo parameter is required", "totalResults": 0, "resultsShared": 0, "isError": True, } # Get all files in the repository all_files, error = _get_repo_tree(org, repo, token) # Handle errors (not found, API errors, empty repo) if error_result := _handle_repo_tree_errors(all_files, error, org, repo, token): return error_result # Step 1: Filter files by example patterns (score >= 60) example_threshold = 60 example_files = [] for file in all_files: example_score = _score_against_example_patterns(file["path"]) if example_score >= example_threshold: example_files.append({**file, "example_score": example_score}) if not example_files: return { "formatted": f"No example files found in {org}/{repo} (no files match example patterns with score >= {example_threshold}).", "totalResults": 0, "resultsShared": 0, } # Step 2: If keyword provided, score and filter by keyword if keyword: scored_files = [] for file in example_files: keyword_score = _score_against_keyword(file["path"], keyword) if keyword_score >= min_score: scored_files.append({**file, "score": keyword_score}) if not scored_files: return { "formatted": f"No files found in {org}/{repo} matching keyword '{keyword}' (min score: {min_score}) among {len(example_files)} example files.", "totalResults": 0, "resultsShared": 0, } # Sort by keyword score (descending) for best matches first scored_files.sort(key=lambda x: x["score"], reverse=True) else: # No keyword: prioritize by pattern directory, then path depth scored_files = [] for file in example_files: in_examples_dir, pattern_priority, path_depth = _get_pattern_priority( file["path"] ) scored_files.append( { **file, "score": file["example_score"], "in_examples_dir": in_examples_dir, "pattern_priority": pattern_priority, "path_depth": path_depth, } ) if not scored_files: return { "formatted": f"No example files found in {org}/{repo}.", "totalResults": 0, "resultsShared": 0, } # Sort by: 1) files in examples/ dir first, 2) pattern priority (scripts > datasets > etc), 3) path depth, 4) path name scored_files.sort( key=lambda x: ( x["in_examples_dir"], x["pattern_priority"], x["path_depth"], x["path"], ) ) # Limit results results = scored_files[:max_results] # Format output keyword_desc = f" matching '{keyword}'" if keyword else "" lines = [f"**Found {len(results)} example files in {org}/{repo}{keyword_desc}:**"] if len(scored_files) > max_results: lines[0] += f" (showing {max_results} of {len(scored_files)})" lines.append("") for i, file in enumerate(results, 1): lines.append(f"{i}. **{file['path']}**") lines.append(f" Size: {file['size']:,} bytes | Ref: {file['ref'][:7]}") lines.append(f" URL: {file['url']}") # Copyable parameters for read_file tool read_params = f"{{'repo': '{org}/{repo}', 'path': '{file['path']}'}}" lines.append(f" To read, use: {read_params}") lines.append("") return { "formatted": "\n".join(lines), "totalResults": len(results), "resultsShared": len(results), } # Tool specification GITHUB_FIND_EXAMPLES_TOOL_SPEC = { "name": "github_find_examples", "description": "Discover best practices, reusable scripts, tutorials, and demos for using a specific library or framework. This is an important step before implementing anything ML related. " "Use together with github_read_file tool.\n\n" "## When to use this tool\n\n" "- ALWAYS before implementing any training/inference/benchmarking or other ML related code or answering how-to question.\n" "- When exploring a new repository and need to understand how to use it\n" "## How it works\n\n" "1. Fetches all (examples, tutorials, demos, notebooks, scripts, etc.) from the repository\n" "2. If keyword provided, scores found files against the keyword using fuzzy matching\n" "3. Returns best matches sorted by relevance score\n" "## Examples\n\n" "\n" "// ML Workflow Step: Find GRPO/SFT/DPO/RLOO etc training examples\n" "// Task: Starting GRPO fine-tuning project, need reference implementations\n" "{\n" " keyword: 'grpo',\n" " repo: 'trl',\n" " org: 'huggingface'\n" "}\n" "// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n" "// Next step: Use github_read_file to study the implementation\n" "\n\n" "\n" "// ML Workflow Step: Discover all training examples in TRL\n" "// Task: Exploring available training methods before choosing approach\n" "{\n" " repo: 'trl',\n" " org: 'huggingface',\n" " max_results: 20\n" "}\n" "// Lists all example scripts: PPO, DPO, GRPO, reward modeling, etc.\n" "\n\n" "\n" "// ML Workflow Step: Find LoRA fine-tuning examples\n" "// Task: Learning parameter-efficient fine-tuning with PEFT\n" "{\n" " keyword: 'lora',\n" " repo: 'peft',\n" " org: 'huggingface'\n" "}\n" "// Discovers LoRA configuration and training examples\n" "", "parameters": { "type": "object", "properties": { "keyword": { "type": "string", "description": "Keyword to fuzzy match against file paths (e.g., 'grpo', 'sft').", }, "repo": { "type": "string", "description": "Repository name (e.g., 'trl', 'transformers'). Required.", }, "org": { "type": "string", "description": "GitHub organization or username. Default: 'huggingface'.", }, "max_results": { "type": "integer", "description": "Maximum number of results to return. Default: 50.", }, "min_score": { "type": "integer", "description": "Minimum fuzzy match score (0-100). Default: 60.", }, }, "required": ["repo"], }, } async def github_find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]: """Handler for agent tool router""" try: result = find_examples( keyword=arguments.get("keyword", ""), repo=arguments["repo"], org=arguments.get("org", "huggingface"), max_results=arguments.get("max_results", 50), min_score=arguments.get("min_score", 60), ) return result["formatted"], not result.get("isError", False) except Exception as e: return f"Error finding examples: {str(e)}", False