Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """ | |
| GitHub Find Examples Tool | |
| Finds examples, guides, and tutorials for a library using deterministic queries and heuristics. | |
| """ | |
| import asyncio | |
| import math | |
| import os | |
| from dataclasses import asdict, dataclass | |
| from datetime import datetime, timedelta | |
| from typing import Any, Dict, List, Optional | |
| try: | |
| import requests | |
| except ImportError: | |
| raise ImportError( | |
| "requests library is required. Install with: pip install requests" | |
| ) | |
| from agent.tools.types import ToolResult | |
| class Example: | |
| """An example file with metadata and relevance score.""" | |
| repo: str | |
| path: str | |
| ref: str | |
| url: str | |
| score: float | |
| reason: str | |
| repo_stars: int | |
| repo_updated: str | |
| file_size: int | |
| def to_dict(self): | |
| return asdict(self) | |
| class GitHubAPIError(Exception): | |
| """Raised when GitHub API returns an error.""" | |
| pass | |
| # Path-based scoring weights | |
| PATH_SCORES = { | |
| "README.md": 100, | |
| "readme.md": 100, | |
| "docs/": 80, | |
| "doc/": 80, | |
| "examples/": 90, | |
| "example/": 90, | |
| "notebooks/": 70, | |
| "notebook/": 70, | |
| "tutorials/": 85, | |
| "tutorial/": 85, | |
| "guides/": 85, | |
| "guide/": 85, | |
| "tests/": 40, | |
| "test/": 40, | |
| "demos/": 75, | |
| "demo/": 75, | |
| "samples/": 75, | |
| "sample/": 75, | |
| } | |
| # Content-based scoring keywords | |
| CONTENT_KEYWORDS = { | |
| 'if __name__ == "__main__"': 50, | |
| "if __name__ == '__main__'": 50, | |
| "quickstart": 60, | |
| "quick start": 60, | |
| "getting started": 60, | |
| "tutorial": 50, | |
| "example usage": 55, | |
| "usage example": 55, | |
| "how to use": 45, | |
| "basic example": 50, | |
| "simple example": 50, | |
| } | |
| # File extension preferences | |
| PREFERRED_EXTENSIONS = { | |
| ".py": 10, | |
| ".ipynb": 15, | |
| ".md": 20, | |
| ".rst": 10, | |
| ".js": 10, | |
| ".ts": 10, | |
| ".go": 10, | |
| ".java": 10, | |
| ".cpp": 10, | |
| ".c": 10, | |
| } | |
| def _get_github_token() -> str: | |
| """Get GitHub token from environment.""" | |
| token = os.environ.get("GITHUB_TOKEN") | |
| if not token: | |
| raise GitHubAPIError( | |
| "GITHUB_TOKEN environment variable is required. " | |
| "Set it with: export GITHUB_TOKEN=your_token_here" | |
| ) | |
| return token | |
| def _execute_search(query: str, token: str, limit: int = 20) -> List[Dict[str, Any]]: | |
| """Execute a GitHub code search query.""" | |
| headers = { | |
| "Accept": "application/vnd.github.text-match+json", | |
| "X-GitHub-Api-Version": "2022-11-28", | |
| "Authorization": f"Bearer {token}", | |
| } | |
| results = [] | |
| page = 1 | |
| per_page = min(100, limit) | |
| try: | |
| while len(results) < limit: | |
| params = {"q": query, "per_page": per_page, "page": page} | |
| url = "https://api.github.com/search/code" | |
| response = requests.get(url, headers=headers, params=params, timeout=30) | |
| if response.status_code != 200: | |
| break | |
| data = response.json() | |
| items = data.get("items", []) | |
| if not items: | |
| break | |
| for item in items: | |
| results.append( | |
| { | |
| "repo": item.get("repository", {}).get("full_name", ""), | |
| "path": item.get("path", ""), | |
| "sha": item.get("sha", ""), | |
| "url": item.get("html_url", ""), | |
| "size": item.get("size", 0), | |
| "text_matches": item.get("text_matches", []), | |
| } | |
| ) | |
| if len(results) >= limit or len(items) < per_page: | |
| break | |
| page += 1 | |
| except Exception: | |
| pass | |
| return results[:limit] | |
| def _fetch_repo_metadata(repos: List[str], token: str) -> Dict[str, Dict[str, Any]]: | |
| """Fetch metadata for repositories.""" | |
| headers = { | |
| "Accept": "application/vnd.github+json", | |
| "X-GitHub-Api-Version": "2022-11-28", | |
| "Authorization": f"Bearer {token}", | |
| } | |
| metadata = {} | |
| for repo in repos: | |
| try: | |
| url = f"https://api.github.com/repos/{repo}" | |
| response = requests.get(url, headers=headers, timeout=10) | |
| if response.status_code == 200: | |
| data = response.json() | |
| metadata[repo] = { | |
| "stars": data.get("stargazers_count", 0), | |
| "updated_at": data.get("updated_at", ""), | |
| "description": data.get("description", ""), | |
| } | |
| except: | |
| continue | |
| return metadata | |
| def _score_and_rank( | |
| results: List[Dict[str, Any]], library: str, token: str | |
| ) -> List[Example]: | |
| """Score results based on heuristics and rank them.""" | |
| repos = list(set(r["repo"] for r in results)) | |
| repo_metadata = _fetch_repo_metadata(repos, token) | |
| scored_examples = [] | |
| for result in results: | |
| repo = result["repo"] | |
| path = result["path"] | |
| score = 0.0 | |
| reasons = [] | |
| # Path-based scoring | |
| path_lower = path.lower() | |
| for pattern, points in PATH_SCORES.items(): | |
| if pattern.lower() in path_lower: | |
| score += points | |
| reasons.append(f"in {pattern}") | |
| break | |
| # File extension scoring | |
| for ext, points in PREFERRED_EXTENSIONS.items(): | |
| if path_lower.endswith(ext): | |
| score += points | |
| break | |
| # Content-based scoring | |
| text_content = "" | |
| for match in result.get("text_matches", []): | |
| text_content += match.get("fragment", "").lower() + " " | |
| for keyword, points in CONTENT_KEYWORDS.items(): | |
| if keyword.lower() in text_content: | |
| score += points | |
| reasons.append(f"contains '{keyword}'") | |
| # Repo-based scoring | |
| metadata = repo_metadata.get(repo, {}) | |
| stars = metadata.get("stars", 0) | |
| updated = metadata.get("updated_at", "") | |
| if stars > 0: | |
| star_score = math.log10(stars + 1) * 10 | |
| score += star_score | |
| # Recency bonus | |
| if updated: | |
| try: | |
| updated_date = datetime.fromisoformat(updated.replace("Z", "+00:00")) | |
| if datetime.now(updated_date.tzinfo) - updated_date < timedelta( | |
| days=180 | |
| ): | |
| score += 20 | |
| reasons.append("recently updated") | |
| except: | |
| pass | |
| # Filename quality | |
| filename = path.split("/")[-1].lower() | |
| if any( | |
| word in filename | |
| for word in ["example", "tutorial", "guide", "quickstart", "demo"] | |
| ): | |
| score += 30 | |
| reasons.append("descriptive filename") | |
| # Size penalty | |
| if result["size"] > 100000: | |
| score *= 0.5 | |
| reasons.append("large file") | |
| example = Example( | |
| repo=repo, | |
| path=path, | |
| ref=result["sha"], | |
| url=result["url"], | |
| score=score, | |
| reason=", ".join(reasons) if reasons else "matches library", | |
| repo_stars=stars, | |
| repo_updated=updated, | |
| file_size=result["size"], | |
| ) | |
| scored_examples.append(example) | |
| scored_examples.sort(key=lambda x: x.score, reverse=True) | |
| return scored_examples | |
| def _search_by_path( | |
| library: str, org: str, repo_scope: Optional[str], token: str | |
| ) -> List[Dict[str, Any]]: | |
| """Search for library in example/tutorial/docs directories.""" | |
| results = [] | |
| path_patterns = [ | |
| "examples/", | |
| "example/", | |
| "docs/", | |
| "tutorials/", | |
| "notebooks/", | |
| "guides/", | |
| ] | |
| for path in path_patterns: | |
| query_parts = [f"org:{org}", f"{library}", f"path:{path}"] | |
| if repo_scope: | |
| query_parts[0] = f"repo:{org}/{repo_scope}" | |
| query = " ".join(query_parts) | |
| results.extend(_execute_search(query, token, limit=20)) | |
| return results | |
| def _search_by_content( | |
| library: str, org: str, repo_scope: Optional[str], token: str | |
| ) -> List[Dict[str, Any]]: | |
| """Search for library with specific content patterns.""" | |
| results = [] | |
| content_patterns = [ | |
| f"{library} if __name__", | |
| f"{library} quickstart", | |
| f"{library} tutorial", | |
| f"{library} usage example", | |
| ] | |
| for pattern in content_patterns: | |
| query_parts = [f"org:{org}", pattern] | |
| if repo_scope: | |
| query_parts[0] = f"repo:{org}/{repo_scope}" | |
| query = " ".join(query_parts) | |
| results.extend(_execute_search(query, token, limit=15)) | |
| return results | |
| def _search_readmes( | |
| library: str, org: str, repo_scope: Optional[str], token: str | |
| ) -> List[Dict[str, Any]]: | |
| """Search for library mentions in README files.""" | |
| query_parts = [f"org:{org}", f"{library}", "filename:README"] | |
| if repo_scope: | |
| query_parts[0] = f"repo:{org}/{repo_scope}" | |
| query = " ".join(query_parts) | |
| return _execute_search(query, token, limit=20) | |
| def find_examples( | |
| library: str, | |
| org: str = "huggingface", | |
| repo_scope: Optional[str] = None, | |
| max_results: int = 10, | |
| ) -> List[Example]: | |
| """ | |
| Find examples, guides, and tutorials for a library using deterministic queries. | |
| Uses a playbook of smart searches and heuristics to find canonical examples: | |
| - Prefers README.md, docs/**, examples/**, notebooks/**, tests/** | |
| - Prefers files with if __name__ == "__main__", "quickstart", "tutorial" | |
| - Prefers repos with higher stars and more recent updates | |
| Args: | |
| library: Library name to search for (e.g., "transformers", "torch") | |
| org: GitHub organization to search in (default: "huggingface") | |
| repo_scope: Optional specific repository (e.g., "transformers") | |
| max_results: Maximum number of results to return (default: 10) | |
| Returns: | |
| List of Example objects, ranked by relevance score | |
| """ | |
| token = _get_github_token() | |
| all_results = [] | |
| all_results.extend(_search_by_path(library, org, repo_scope, token)) | |
| all_results.extend(_search_by_content(library, org, repo_scope, token)) | |
| all_results.extend(_search_readmes(library, org, repo_scope, token)) | |
| # Deduplicate | |
| seen = set() | |
| unique_results = [] | |
| for result in all_results: | |
| key = (result["repo"], result["path"]) | |
| if key not in seen: | |
| seen.add(key) | |
| unique_results.append(result) | |
| scored_examples = _score_and_rank(unique_results, library, token) | |
| return scored_examples[:max_results] | |
| async def _async_call(func, *args, **kwargs): | |
| """Wrap synchronous calls for async context.""" | |
| return await asyncio.to_thread(func, *args, **kwargs) | |
| def _format_examples_table(examples: List[Example]) -> str: | |
| """Format examples as a markdown table.""" | |
| if not examples: | |
| return "No examples found." | |
| lines = [ | |
| "| Rank | File | Score | Stars | Reason |", | |
| "|------|------|-------|-------|--------|", | |
| ] | |
| for i, ex in enumerate(examples, 1): | |
| file_path = f"{ex.repo}/{ex.path}" | |
| if len(file_path) > 60: | |
| file_path = file_path[:57] + "..." | |
| reason = ex.reason if len(ex.reason) < 40 else ex.reason[:37] + "..." | |
| lines.append( | |
| f"| {i} | {file_path} | {ex.score:.1f} | {ex.repo_stars:,} | {reason} |" | |
| ) | |
| return "\n".join(lines) | |
| class FindExamplesTool: | |
| """Tool for finding examples and tutorials for libraries.""" | |
| async def execute(self, params: Dict[str, Any]) -> ToolResult: | |
| """Execute find_examples operation.""" | |
| library = params.get("library") | |
| if not library: | |
| return { | |
| "formatted": "Error: 'library' parameter is required", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| "isError": True, | |
| } | |
| org = params.get("org", "huggingface") | |
| repo_scope = params.get("repo_scope") | |
| max_results = params.get("max_results", 10) | |
| try: | |
| examples = await _async_call( | |
| find_examples, | |
| library=library, | |
| org=org, | |
| repo_scope=repo_scope, | |
| max_results=max_results, | |
| ) | |
| if not examples: | |
| return { | |
| "formatted": f"No examples found for '{library}' in {org}", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| } | |
| table = _format_examples_table(examples) | |
| response = f"**Found {len(examples)} examples for '{library}' in {org}:**\n\n{table}" | |
| # Add URLs and suggest using read_file | |
| response += "\n\n**Top examples (use read_file to view):**\n" | |
| for i, ex in enumerate(examples[:3], 1): | |
| response += f"{i}. [{ex.repo}/{ex.path}]({ex.url})\n" | |
| response += f" Use: read_file(repo='{ex.repo}', path='{ex.path}')\n" | |
| return { | |
| "formatted": response, | |
| "totalResults": len(examples), | |
| "resultsShared": len(examples), | |
| } | |
| except GitHubAPIError as e: | |
| return { | |
| "formatted": f"GitHub API Error: {str(e)}", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| "isError": True, | |
| } | |
| except Exception as e: | |
| return { | |
| "formatted": f"Error: {str(e)}", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| "isError": True, | |
| } | |
| # Tool specification | |
| FIND_EXAMPLES_TOOL_SPEC = { | |
| "name": "find_examples", | |
| "description": ( | |
| "Find examples, guides, and tutorials for a library using deterministic queries and heuristics.\n\n" | |
| "Uses best practices retrieval without semantic search:\n" | |
| "- Prefers README.md, docs/**, examples/**, notebooks/**, tests/**\n" | |
| "- Prefers files with if __name__ == '__main__', 'quickstart', 'tutorial', 'usage'\n" | |
| "- Prefers repos with higher stars and more recent updates\n\n" | |
| "Returns a ranked list of canonical example files.\n\n" | |
| "Examples:\n" | |
| "- Find transformers examples: {'library': 'transformers', 'org': 'huggingface', 'max_results': 5}\n" | |
| "- Find torch examples in specific repo: {'library': 'torch', 'org': 'pytorch', 'repo_scope': 'examples'}\n\n" | |
| "Use read_file tool to view the content of returned files.\n\n" | |
| ), | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "library": { | |
| "type": "string", | |
| "description": "Library name to search for (e.g., 'transformers', 'torch', 'react')", | |
| }, | |
| "org": { | |
| "type": "string", | |
| "description": "GitHub organization to search in (default: 'huggingface')", | |
| }, | |
| "repo_scope": { | |
| "type": "string", | |
| "description": "Optional specific repository to search within", | |
| }, | |
| "max_results": { | |
| "type": "integer", | |
| "description": "Maximum number of results to return (default: 10)", | |
| }, | |
| }, | |
| "required": ["library"], | |
| }, | |
| } | |
| async def find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]: | |
| """Handler for agent tool router.""" | |
| try: | |
| tool = FindExamplesTool() | |
| result = await tool.execute(arguments) | |
| return result["formatted"], not result.get("isError", False) | |
| except Exception as e: | |
| return f"Error executing find_examples: {str(e)}", False | |