# Source: ml-intern/agent/tools/github_find_examples.py
# Commit ccbe2d2 ("poc github tools") by akseljoonas; file size 15.7 kB.
"""
GitHub Find Examples Tool
Finds examples, guides, and tutorials for a library using deterministic queries and heuristics.
"""
import asyncio
import logging
import math
import os
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

try:
    import requests
except ImportError:
    raise ImportError(
        "requests library is required. Install with: pip install requests"
    )

from agent.tools.types import ToolResult
@dataclass
class Example:
    """One ranked search hit: where the file lives plus why it was ranked."""

    repo: str  # repository full name as reported by the search API
    path: str  # file path inside the repository
    ref: str  # SHA reported for the file by the search API
    url: str  # browser (HTML) URL of the file
    score: float  # accumulated heuristic relevance score
    reason: str  # comma-separated explanation of the score
    repo_stars: int  # stargazer count of the repository
    repo_updated: str  # repository "updated_at" timestamp string
    file_size: int  # file size as reported by the search result

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary keyed by field name."""
        return asdict(self)
class GitHubAPIError(Exception):
    """Error type for GitHub API problems.

    In this module it is raised when the ``GITHUB_TOKEN`` environment
    variable is missing.
    """
# Path-based scoring weights.
# Each key is a directory or file-name pattern; the value is the bonus added
# to a result whose path contains that pattern. Matching is case-insensitive
# and at most one entry is applied per result.
PATH_SCORES = {
    "README.md": 100,
    "readme.md": 100,
    "docs/": 80,
    "doc/": 80,
    "examples/": 90,
    "example/": 90,
    "notebooks/": 70,
    "notebook/": 70,
    "tutorials/": 85,
    "tutorial/": 85,
    "guides/": 85,
    "guide/": 85,
    "tests/": 40,
    "test/": 40,
    "demos/": 75,
    "demo/": 75,
    "samples/": 75,
    "sample/": 75,
}
# Content-based scoring keywords.
# Each keyword found (case-insensitively) in the text-match fragments of a
# search result adds its points to the score; bonuses are cumulative.
CONTENT_KEYWORDS = {
    'if __name__ == "__main__"': 50,
    "if __name__ == '__main__'": 50,
    "quickstart": 60,
    "quick start": 60,
    "getting started": 60,
    "tutorial": 50,
    "example usage": 55,
    "usage example": 55,
    "how to use": 45,
    "basic example": 50,
    "simple example": 50,
}
# File extension preferences.
# Bonus added when the lower-cased path ends with the given extension; only
# the first matching extension is applied.
PREFERRED_EXTENSIONS = {
    ".py": 10,
    ".ipynb": 15,
    ".md": 20,
    ".rst": 10,
    ".js": 10,
    ".ts": 10,
    ".go": 10,
    ".java": 10,
    ".cpp": 10,
    ".c": 10,
}
def _get_github_token() -> str:
"""Get GitHub token from environment."""
token = os.environ.get("GITHUB_TOKEN")
if not token:
raise GitHubAPIError(
"GITHUB_TOKEN environment variable is required. "
"Set it with: export GITHUB_TOKEN=your_token_here"
)
return token
def _execute_search(query: str, token: str, limit: int = 20) -> List[Dict[str, Any]]:
"""Execute a GitHub code search query."""
headers = {
"Accept": "application/vnd.github.text-match+json",
"X-GitHub-Api-Version": "2022-11-28",
"Authorization": f"Bearer {token}",
}
results = []
page = 1
per_page = min(100, limit)
try:
while len(results) < limit:
params = {"q": query, "per_page": per_page, "page": page}
url = "https://api.github.com/search/code"
response = requests.get(url, headers=headers, params=params, timeout=30)
if response.status_code != 200:
break
data = response.json()
items = data.get("items", [])
if not items:
break
for item in items:
results.append(
{
"repo": item.get("repository", {}).get("full_name", ""),
"path": item.get("path", ""),
"sha": item.get("sha", ""),
"url": item.get("html_url", ""),
"size": item.get("size", 0),
"text_matches": item.get("text_matches", []),
}
)
if len(results) >= limit or len(items) < per_page:
break
page += 1
except Exception:
pass
return results[:limit]
def _fetch_repo_metadata(repos: List[str], token: str) -> Dict[str, Dict[str, Any]]:
"""Fetch metadata for repositories."""
headers = {
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
"Authorization": f"Bearer {token}",
}
metadata = {}
for repo in repos:
try:
url = f"https://api.github.com/repos/{repo}"
response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 200:
data = response.json()
metadata[repo] = {
"stars": data.get("stargazers_count", 0),
"updated_at": data.get("updated_at", ""),
"description": data.get("description", ""),
}
except:
continue
return metadata
def _best_path_bonus(path_lower: str) -> Optional[tuple]:
    """Return ``(pattern, points)`` of the highest-scoring PATH_SCORES match, or None."""
    matches = [
        (pattern, points)
        for pattern, points in PATH_SCORES.items()
        if pattern.lower() in path_lower
    ]
    if not matches:
        return None
    # max() keeps the first entry on ties, so equal scores resolve in dict order.
    return max(matches, key=lambda entry: entry[1])


def _is_recently_updated(updated: str, max_age_days: int = 180) -> bool:
    """Return True when the ISO-8601 timestamp is less than ``max_age_days`` old."""
    if not updated:
        return False
    try:
        # fromisoformat() on older Pythons rejects a trailing "Z".
        updated_at = datetime.fromisoformat(updated.replace("Z", "+00:00"))
    except (AttributeError, ValueError):
        # Not a string, or not a parseable timestamp: no recency bonus.
        return False
    age = datetime.now(updated_at.tzinfo) - updated_at
    return age < timedelta(days=max_age_days)


def _score_and_rank(
    results: List[Dict[str, Any]], library: str, token: str
) -> List[Example]:
    """Score search results with heuristics and return them best-first.

    The score of a result is the sum of:
      * the best matching PATH_SCORES bonus (highest points among all
        patterns contained in the path, not merely the first one listed),
      * the first matching PREFERRED_EXTENSIONS bonus,
      * every CONTENT_KEYWORDS bonus found in the text-match fragments,
      * ``log10(stars + 1) * 10`` for repositories with stars,
      * 20 points when the repository was updated within the last 180 days,
      * 30 points when the file name contains a descriptive word.
    Results larger than 100000 bytes have their total score halved.

    Args:
        results: Search hits with the keys ``repo``, ``path``, ``sha``,
            ``url``, ``size`` and ``text_matches``.
        library: Library name. Currently unused by the scoring; kept so the
            signature stays stable for callers.
        token: GitHub token used to fetch repository metadata.

    Returns:
        ``Example`` objects sorted by descending score.
    """
    repos = list({result["repo"] for result in results})
    repo_metadata = _fetch_repo_metadata(repos, token)

    descriptive_words = ("example", "tutorial", "guide", "quickstart", "demo")
    scored_examples = []

    for result in results:
        repo = result["repo"]
        path = result["path"]
        path_lower = path.lower()
        score = 0.0
        reasons = []

        # Path-based scoring: apply only the single best matching pattern.
        best_path = _best_path_bonus(path_lower)
        if best_path is not None:
            pattern, points = best_path
            score += points
            reasons.append(f"in {pattern}")

        # File extension scoring: first matching extension only.
        for ext, points in PREFERRED_EXTENSIONS.items():
            if path_lower.endswith(ext):
                score += points
                break

        # Content-based scoring over all returned text fragments.
        text_content = " ".join(
            (match.get("fragment") or "").lower()
            for match in result.get("text_matches") or []
        )
        for keyword, points in CONTENT_KEYWORDS.items():
            if keyword.lower() in text_content:
                score += points
                reasons.append(f"contains '{keyword}'")

        # Repo-based scoring; repositories without metadata get no bonus.
        metadata = repo_metadata.get(repo, {})
        stars = metadata.get("stars") or 0
        updated = metadata.get("updated_at", "")
        if stars > 0:
            score += math.log10(stars + 1) * 10

        # Recency bonus.
        if _is_recently_updated(updated):
            score += 20
            reasons.append("recently updated")

        # Filename quality.
        filename = path.split("/")[-1].lower()
        if any(word in filename for word in descriptive_words):
            score += 30
            reasons.append("descriptive filename")

        # Size penalty.
        size = result.get("size") or 0
        if size > 100000:
            score *= 0.5
            reasons.append("large file")

        scored_examples.append(
            Example(
                repo=repo,
                path=path,
                ref=result["sha"],
                url=result["url"],
                score=score,
                reason=", ".join(reasons) if reasons else "matches library",
                repo_stars=stars,
                repo_updated=updated,
                file_size=size,
            )
        )

    scored_examples.sort(key=lambda example: example.score, reverse=True)
    return scored_examples
def _search_by_path(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search for the library inside example, docs, tutorial and guide directories.

    Runs one code-search query per directory pattern (20 hits each), scoped
    to ``repo:{org}/{repo_scope}`` when a repository is given and to
    ``org:{org}`` otherwise, and returns all hits concatenated.
    """
    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
    directories = (
        "examples/",
        "example/",
        "docs/",
        "tutorials/",
        "notebooks/",
        "guides/",
    )

    hits: List[Dict[str, Any]] = []
    for directory in directories:
        search_query = f"{scope} {library} path:{directory}"
        hits.extend(_execute_search(search_query, token, limit=20))
    return hits
def _search_by_content(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search for the library alongside phrases typical of runnable examples.

    Runs one code-search query per phrase (15 hits each), scoped to
    ``repo:{org}/{repo_scope}`` when a repository is given and to
    ``org:{org}`` otherwise, and returns all hits concatenated.
    """
    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
    phrases = (
        f"{library} if __name__",
        f"{library} quickstart",
        f"{library} tutorial",
        f"{library} usage example",
    )

    hits: List[Dict[str, Any]] = []
    for phrase in phrases:
        search_query = f"{scope} {phrase}"
        hits.extend(_execute_search(search_query, token, limit=15))
    return hits
def _search_readmes(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search README files for mentions of the library (up to 20 hits).

    The query is scoped to ``repo:{org}/{repo_scope}`` when a repository is
    given and to ``org:{org}`` otherwise.
    """
    if repo_scope:
        scope = f"repo:{org}/{repo_scope}"
    else:
        scope = f"org:{org}"
    search_query = f"{scope} {library} filename:README"
    return _execute_search(search_query, token, limit=20)
def find_examples(
    library: str,
    org: str = "huggingface",
    repo_scope: Optional[str] = None,
    max_results: int = 10,
) -> List[Example]:
    """Find examples, guides, and tutorials for a library using deterministic queries.

    Runs three fixed search strategies in order (by directory path, by
    content phrase, and in README files), removes duplicate files, scores
    the remaining hits with heuristics and returns the best ones.

    Args:
        library: Library name to search for (e.g., "transformers", "torch").
        org: GitHub organization to search in (default: "huggingface").
        repo_scope: Optional specific repository (e.g., "transformers").
        max_results: Maximum number of results to return (default: 10).

    Returns:
        List of Example objects, ranked by relevance score.

    Raises:
        GitHubAPIError: If no GitHub token is configured.
    """
    token = _get_github_token()

    strategies = (_search_by_path, _search_by_content, _search_readmes)
    collected = [
        hit
        for strategy in strategies
        for hit in strategy(library, org, repo_scope, token)
    ]

    # Keep the first occurrence of every (repo, path) pair; dicts preserve
    # insertion order, so the original hit order is retained.
    first_seen: Dict[Any, Dict[str, Any]] = {}
    for hit in collected:
        first_seen.setdefault((hit["repo"], hit["path"]), hit)

    ranked = _score_and_rank(list(first_seen.values()), library, token)
    return ranked[:max_results]
async def _async_call(func, *args, **kwargs):
"""Wrap synchronous calls for async context."""
return await asyncio.to_thread(func, *args, **kwargs)
def _format_examples_table(examples: List[Example]) -> str:
"""Format examples as a markdown table."""
if not examples:
return "No examples found."
lines = [
"| Rank | File | Score | Stars | Reason |",
"|------|------|-------|-------|--------|",
]
for i, ex in enumerate(examples, 1):
file_path = f"{ex.repo}/{ex.path}"
if len(file_path) > 60:
file_path = file_path[:57] + "..."
reason = ex.reason if len(ex.reason) < 40 else ex.reason[:37] + "..."
lines.append(
f"| {i} | {file_path} | {ex.score:.1f} | {ex.repo_stars:,} | {reason} |"
)
return "\n".join(lines)
class FindExamplesTool:
    """Tool for finding examples and tutorials for libraries."""

    @staticmethod
    def _error(message: str) -> ToolResult:
        """Build the error-shaped result dictionary for ``message``."""
        return {
            "formatted": message,
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Execute the find_examples operation.

        Args:
            params: Tool arguments. ``library`` is required. ``org``,
                ``repo_scope`` and ``max_results`` are optional; a missing or
                null value falls back to the default ("huggingface", no
                repository scope, and 10 respectively).

        Returns:
            A dict with ``formatted`` (markdown text), ``totalResults`` and
            ``resultsShared``; error results additionally carry
            ``isError: True``.
        """
        library = params.get("library")
        if not library:
            return self._error("Error: 'library' parameter is required")

        # Callers may pass explicit nulls for optional arguments; treat them
        # like missing keys instead of searching "org:None" or slicing [:None].
        org = params.get("org") or "huggingface"
        repo_scope = params.get("repo_scope") or None

        raw_max_results = params.get("max_results")
        if raw_max_results is None:
            max_results = 10
        else:
            try:
                max_results = int(raw_max_results)
            except (TypeError, ValueError):
                return self._error(
                    f"Error: 'max_results' must be a positive integer, got {raw_max_results!r}"
                )
        if max_results < 1:
            return self._error(
                f"Error: 'max_results' must be a positive integer, got {raw_max_results!r}"
            )

        try:
            examples = await _async_call(
                find_examples,
                library=library,
                org=org,
                repo_scope=repo_scope,
                max_results=max_results,
            )
            if not examples:
                return {
                    "formatted": f"No examples found for '{library}' in {org}",
                    "totalResults": 0,
                    "resultsShared": 0,
                }

            table = _format_examples_table(examples)
            parts = [
                f"**Found {len(examples)} examples for '{library}' in {org}:**\n\n{table}",
                # Add URLs and suggest using read_file
                "\n\n**Top examples (use read_file to view):**\n",
            ]
            for i, ex in enumerate(examples[:3], 1):
                parts.append(f"{i}. [{ex.repo}/{ex.path}]({ex.url})\n")
                parts.append(
                    f"   Use: read_file(repo='{ex.repo}', path='{ex.path}')\n"
                )

            return {
                "formatted": "".join(parts),
                "totalResults": len(examples),
                "resultsShared": len(examples),
            }
        except GitHubAPIError as e:
            return self._error(f"GitHub API Error: {e}")
        except Exception as e:
            # Tool boundary: report any failure as an error result, not a raise.
            return self._error(f"Error: {e}")
# Tool specification for find_examples: the tool name, a human-readable
# description, and a JSON-Schema-style description of the accepted arguments.
# Only "library" is required; the defaults mentioned in the descriptions are
# applied by the tool implementation, not by this schema.
# NOTE(review): presumably consumed by the agent's tool registry — the
# consumer is not visible in this file.
FIND_EXAMPLES_TOOL_SPEC = {
    "name": "find_examples",
    "description": (
        "Find examples, guides, and tutorials for a library using deterministic queries and heuristics.\n\n"
        "Uses best practices retrieval without semantic search:\n"
        "- Prefers README.md, docs/**, examples/**, notebooks/**, tests/**\n"
        "- Prefers files with if __name__ == '__main__', 'quickstart', 'tutorial', 'usage'\n"
        "- Prefers repos with higher stars and more recent updates\n\n"
        "Returns a ranked list of canonical example files.\n\n"
        "Examples:\n"
        "- Find transformers examples: {'library': 'transformers', 'org': 'huggingface', 'max_results': 5}\n"
        "- Find torch examples in specific repo: {'library': 'torch', 'org': 'pytorch', 'repo_scope': 'examples'}\n\n"
        "Use read_file tool to view the content of returned files.\n\n"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "library": {
                "type": "string",
                "description": "Library name to search for (e.g., 'transformers', 'torch', 'react')",
            },
            "org": {
                "type": "string",
                "description": "GitHub organization to search in (default: 'huggingface')",
            },
            "repo_scope": {
                "type": "string",
                "description": "Optional specific repository to search within",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return (default: 10)",
            },
        },
        "required": ["library"],
    },
}
async def find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapter for the agent tool router.

    Runs ``FindExamplesTool`` with ``arguments`` and returns a pair of the
    formatted text and a success flag (False when the result is marked as an
    error or when executing the tool raises).
    """
    try:
        outcome = await FindExamplesTool().execute(arguments)
        text = outcome["formatted"]
        succeeded = not outcome.get("isError", False)
        return text, succeeded
    except Exception as exc:
        return f"Error executing find_examples: {str(exc)}", False