Spaces:

smolagents
/

ml-intern

Running on CPU Upgrade

App Files Files Community

ml-intern / agent /tools /github_find_examples.py

akseljoonas

poc github tools

ccbe2d2 5 months ago

raw

history blame

15.7 kB

	"""
	GitHub Find Examples Tool

	Finds examples, guides, and tutorials for a library using deterministic queries and heuristics.
	"""

	import asyncio
	import logging
	import math
	import os
	from dataclasses import asdict, dataclass
	from datetime import datetime, timedelta
	from typing import Any, Dict, List, Optional

	try:
	import requests
	except ImportError:
	raise ImportError(
	"requests library is required. Install with: pip install requests"
	)

	from agent.tools.types import ToolResult


	@dataclass
	class Example:
	    """A candidate example file together with its ranking metadata.

	    ``score`` is the heuristic relevance value and ``reason`` is a
	    human-readable summary of the heuristics that contributed to it.
	    """

	    repo: str  # repository full name as reported by the search
	    path: str  # file path inside the repository
	    ref: str  # filled from the search hit's "sha" field
	    url: str  # HTML URL of the file
	    score: float
	    reason: str
	    repo_stars: int
	    repo_updated: str  # repository "updated_at" value, "" when unknown
	    file_size: int

	    def to_dict(self) -> Dict[str, Any]:
	        """Return the example as a plain dictionary keyed by field name."""
	        as_mapping = asdict(self)
	        return as_mapping


	class GitHubAPIError(Exception):
	    """Signals a GitHub API problem, such as a missing ``GITHUB_TOKEN``."""


	# Path-based scoring weights.
	# `_score_and_rank` lower-cases both the pattern and the file path, tests the
	# pattern as a substring of the path, and stops at the FIRST pattern that
	# matches in this dict's insertion order. Consequences:
	#   - entry order matters (e.g. "docs/" is tried before "examples/");
	#   - "README.md" and "readme.md" are equivalent after lower-casing.
	PATH_SCORES = {
	    "README.md": 100,
	    "readme.md": 100,
	    "docs/": 80,
	    "doc/": 80,
	    "examples/": 90,
	    "example/": 90,
	    "notebooks/": 70,
	    "notebook/": 70,
	    "tutorials/": 85,
	    "tutorial/": 85,
	    "guides/": 85,
	    "guide/": 85,
	    "tests/": 40,
	    "test/": 40,
	    "demos/": 75,
	    "demo/": 75,
	    "samples/": 75,
	    "sample/": 75,
	}

	# Content-based scoring keywords.
	# Each keyword is searched (case-insensitively) in the concatenated text-match
	# fragments of a search hit; unlike the path table, EVERY keyword that matches
	# adds its points to the score.
	CONTENT_KEYWORDS = {
	    'if __name__ == "__main__"': 50,
	    "if __name__ == '__main__'": 50,
	    "quickstart": 60,
	    "quick start": 60,
	    "getting started": 60,
	    "tutorial": 50,
	    "example usage": 55,
	    "usage example": 55,
	    "how to use": 45,
	    "basic example": 50,
	    "simple example": 50,
	}

	# File extension preferences.
	# Bonus points for the first extension the lower-cased path ends with.
	PREFERRED_EXTENSIONS = {
	    ".py": 10,
	    ".ipynb": 15,
	    ".md": 20,
	    ".rst": 10,
	    ".js": 10,
	    ".ts": 10,
	    ".go": 10,
	    ".java": 10,
	    ".cpp": 10,
	    ".c": 10,
	}


	def _get_github_token() -> str:
	"""Get GitHub token from environment."""
	token = os.environ.get("GITHUB_TOKEN")
	if not token:
	raise GitHubAPIError(
	"GITHUB_TOKEN environment variable is required. "
	"Set it with: export GITHUB_TOKEN=your_token_here"
	)
	return token


	def _execute_search(query: str, token: str, limit: int = 20) -> List[Dict[str, Any]]:
	"""Execute a GitHub code search query."""
	headers = {
	"Accept": "application/vnd.github.text-match+json",
	"X-GitHub-Api-Version": "2022-11-28",
	"Authorization": f"Bearer {token}",
	}

	results = []
	page = 1
	per_page = min(100, limit)

	try:
	while len(results) < limit:
	params = {"q": query, "per_page": per_page, "page": page}
	url = "https://api.github.com/search/code"
	response = requests.get(url, headers=headers, params=params, timeout=30)

	if response.status_code != 200:
	break

	data = response.json()
	items = data.get("items", [])

	if not items:
	break

	for item in items:
	results.append(
	{
	"repo": item.get("repository", {}).get("full_name", ""),
	"path": item.get("path", ""),
	"sha": item.get("sha", ""),
	"url": item.get("html_url", ""),
	"size": item.get("size", 0),
	"text_matches": item.get("text_matches", []),
	}
	)

	if len(results) >= limit or len(items) < per_page:
	break

	page += 1

	except Exception:
	pass

	return results[:limit]


	def _fetch_repo_metadata(repos: List[str], token: str) -> Dict[str, Dict[str, Any]]:
	"""Fetch metadata for repositories."""
	headers = {
	"Accept": "application/vnd.github+json",
	"X-GitHub-Api-Version": "2022-11-28",
	"Authorization": f"Bearer {token}",
	}

	metadata = {}

	for repo in repos:
	try:
	url = f"https://api.github.com/repos/{repo}"
	response = requests.get(url, headers=headers, timeout=10)

	if response.status_code == 200:
	data = response.json()
	metadata[repo] = {
	"stars": data.get("stargazers_count", 0),
	"updated_at": data.get("updated_at", ""),
	"description": data.get("description", ""),
	}
	except:
	continue

	return metadata


	def _is_recently_updated(updated: str, max_age_days: int = 180) -> bool:
	    """Return True when ISO-8601 timestamp ``updated`` is under ``max_age_days`` old."""
	    try:
	        # Rewrite a trailing "Z" as an explicit UTC offset before parsing.
	        updated_date = datetime.fromisoformat(updated.replace("Z", "+00:00"))
	        # Use the timestamp's own tzinfo so naive and aware values never mix.
	        age = datetime.now(updated_date.tzinfo) - updated_date
	    except (ValueError, TypeError, AttributeError):
	        # Unparseable or non-string timestamp: simply no recency bonus.
	        return False
	    return age < timedelta(days=max_age_days)


	def _score_and_rank(
	    results: List[Dict[str, Any]], library: str, token: str
	) -> List[Example]:
	    """Score search hits with path, content and repository heuristics, best first.

	    Score components, in order of application:
	      * first matching ``PATH_SCORES`` pattern (substring, case-insensitive);
	      * first matching ``PREFERRED_EXTENSIONS`` suffix;
	      * every ``CONTENT_KEYWORDS`` entry found in the hit's text-match fragments;
	      * ``log10(stars + 1) * 10`` for repositories with stars;
	      * +20 when the repository was updated within the last 180 days;
	      * +30 for a descriptive file name;
	      * the total is halved for files larger than 100000 bytes.

	    Args:
	        results: Search hits with the keys ``repo``, ``path``, ``sha``,
	            ``url``, ``size`` and optionally ``text_matches``.
	        library: Library name; currently not used by the scoring itself.
	        token: GitHub token used to fetch repository metadata.

	    Returns:
	        ``Example`` objects sorted by descending score (ties keep input order).
	    """
	    # dict.fromkeys de-duplicates while keeping a deterministic request order.
	    repos = list(dict.fromkeys(r["repo"] for r in results))
	    repo_metadata = _fetch_repo_metadata(repos, token)

	    scored_examples = []

	    for result in results:
	        repo = result["repo"]
	        path = result["path"]
	        path_lower = path.lower()

	        score = 0.0
	        reasons = []

	        # Path-based scoring: only the first matching pattern counts.
	        for pattern, points in PATH_SCORES.items():
	            if pattern.lower() in path_lower:
	                score += points
	                reasons.append(f"in {pattern}")
	                break

	        # File extension scoring.
	        for ext, points in PREFERRED_EXTENSIONS.items():
	            if path_lower.endswith(ext):
	                score += points
	                break

	        # Content-based scoring over all text-match fragments of the hit.
	        text_content = " ".join(
	            (match.get("fragment") or "").lower()
	            for match in result.get("text_matches", [])
	        )
	        for keyword, points in CONTENT_KEYWORDS.items():
	            if keyword.lower() in text_content:
	                score += points
	                reasons.append(f"contains '{keyword}'")

	        # Repo-based scoring; missing or null metadata counts as 0 stars / "".
	        metadata = repo_metadata.get(repo, {})
	        stars = metadata.get("stars") or 0
	        updated = metadata.get("updated_at") or ""

	        if stars > 0:
	            score += math.log10(stars + 1) * 10

	        # Recency bonus.
	        if updated and _is_recently_updated(updated):
	            score += 20
	            reasons.append("recently updated")

	        # Filename quality.
	        filename = path.split("/")[-1].lower()
	        if any(
	            word in filename
	            for word in ["example", "tutorial", "guide", "quickstart", "demo"]
	        ):
	            score += 30
	            reasons.append("descriptive filename")

	        # Size penalty.
	        if result["size"] > 100000:
	            score *= 0.5
	            reasons.append("large file")

	        scored_examples.append(
	            Example(
	                repo=repo,
	                path=path,
	                ref=result["sha"],
	                url=result["url"],
	                score=score,
	                reason=", ".join(reasons) if reasons else "matches library",
	                repo_stars=stars,
	                repo_updated=updated,
	                file_size=result["size"],
	            )
	        )

	    scored_examples.sort(key=lambda x: x.score, reverse=True)
	    return scored_examples


	def _search_by_path(
	    library: str, org: str, repo_scope: Optional[str], token: str
	) -> List[Dict[str, Any]]:
	    """Collect hits for ``library`` inside example, docs and tutorial directories.

	    One code search (up to 20 hits) is issued per directory pattern, scoped
	    to ``repo:{org}/{repo_scope}`` when ``repo_scope`` is given and to
	    ``org:{org}`` otherwise.
	    """
	    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
	    directories = (
	        "examples/",
	        "example/",
	        "docs/",
	        "tutorials/",
	        "notebooks/",
	        "guides/",
	    )

	    hits: List[Dict[str, Any]] = []
	    for directory in directories:
	        query = f"{scope} {library} path:{directory}"
	        hits.extend(_execute_search(query, token, limit=20))
	    return hits


	def _search_by_content(
	    library: str, org: str, repo_scope: Optional[str], token: str
	) -> List[Dict[str, Any]]:
	    """Collect hits where ``library`` appears next to example-like phrases.

	    One code search (up to 15 hits) is issued per phrase, scoped to
	    ``repo:{org}/{repo_scope}`` when ``repo_scope`` is given and to
	    ``org:{org}`` otherwise.
	    """
	    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
	    phrases = ("if __name__", "quickstart", "tutorial", "usage example")

	    hits: List[Dict[str, Any]] = []
	    for phrase in phrases:
	        query = f"{scope} {library} {phrase}"
	        hits.extend(_execute_search(query, token, limit=15))
	    return hits


	def _search_readmes(
	    library: str, org: str, repo_scope: Optional[str], token: str
	) -> List[Dict[str, Any]]:
	    """Return up to 20 hits for ``library`` in files whose name matches README.

	    The search is scoped to ``repo:{org}/{repo_scope}`` when ``repo_scope``
	    is given and to ``org:{org}`` otherwise.
	    """
	    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
	    return _execute_search(f"{scope} {library} filename:README", token, limit=20)


	def find_examples(
	    library: str,
	    org: str = "huggingface",
	    repo_scope: Optional[str] = None,
	    max_results: int = 10,
	) -> List[Example]:
	    """
	    Find examples, guides, and tutorials for a library using deterministic queries.

	    Runs three groups of code searches (by directory, by content phrase and
	    in README files), de-duplicates the hits by ``(repo, path)`` and ranks
	    them with ``_score_and_rank``:
	    - Prefers README.md, docs/, examples/, notebooks/, tests/
	    - Prefers files with if __name__ == "__main__", "quickstart", "tutorial"
	    - Prefers repos with higher stars and more recent updates

	    Args:
	        library: Library name to search for (e.g., "transformers", "torch")
	        org: GitHub organization to search in (default: "huggingface")
	        repo_scope: Optional specific repository (e.g., "transformers")
	        max_results: Maximum number of results to return (default: 10).
	            Zero or a negative value returns an empty list without issuing
	            any search; ``None`` is tolerated and means "no cap".

	    Returns:
	        List of Example objects, ranked by relevance score

	    Raises:
	        GitHubAPIError: If no GitHub token is configured.
	    """
	    token = _get_github_token()

	    # A negative value used to slice from the wrong end of the ranking, and
	    # zero still triggered every search; both now short-circuit.
	    if max_results is not None and max_results <= 0:
	        return []

	    all_results = []
	    all_results.extend(_search_by_path(library, org, repo_scope, token))
	    all_results.extend(_search_by_content(library, org, repo_scope, token))
	    all_results.extend(_search_readmes(library, org, repo_scope, token))

	    # De-duplicate by (repo, path), keeping the first occurrence and its order.
	    unique_by_key: Dict[Any, Dict[str, Any]] = {}
	    for result in all_results:
	        unique_by_key.setdefault((result["repo"], result["path"]), result)
	    unique_results = list(unique_by_key.values())

	    scored_examples = _score_and_rank(unique_results, library, token)
	    return scored_examples[:max_results]


	async def _async_call(func, args, *kwargs):
	"""Wrap synchronous calls for async context."""
	return await asyncio.to_thread(func, args, *kwargs)


	def _format_examples_table(examples: List[Example]) -> str:
	"""Format examples as a markdown table."""
	if not examples:
	return "No examples found."

	lines = [
	"\| Rank \| File \| Score \| Stars \| Reason \|",
	"\|------\|------\|-------\|-------\|--------\|",
	]

	for i, ex in enumerate(examples, 1):
	file_path = f"{ex.repo}/{ex.path}"
	if len(file_path) > 60:
	file_path = file_path[:57] + "..."
	reason = ex.reason if len(ex.reason) < 40 else ex.reason[:37] + "..."
	lines.append(
	f"\| {i} \| {file_path} \| {ex.score:.1f} \| {ex.repo_stars:,} \| {reason} \|"
	)

	return "\n".join(lines)


	class FindExamplesTool:
	    """Agent-facing wrapper that runs ``find_examples`` and renders the outcome."""

	    async def execute(self, params: Dict[str, Any]) -> ToolResult:
	        """Run the example search described by ``params``.

	        Recognised keys are ``library`` (required), ``org`` (default
	        "huggingface"), ``repo_scope`` and ``max_results`` (default 10).

	        Returns a dict with ``formatted``, ``totalResults`` and
	        ``resultsShared``; failures additionally carry ``isError: True``.
	        Exceptions from the search are converted into such an error result.
	        """

	        def failure(message: str) -> Dict[str, Any]:
	            # Shared shape of every error payload.
	            return {
	                "formatted": message,
	                "totalResults": 0,
	                "resultsShared": 0,
	                "isError": True,
	            }

	        library = params.get("library")
	        if not library:
	            return failure("Error: 'library' parameter is required")

	        org = params.get("org", "huggingface")
	        repo_scope = params.get("repo_scope")
	        max_results = params.get("max_results", 10)

	        try:
	            examples = await _async_call(
	                find_examples,
	                library=library,
	                org=org,
	                repo_scope=repo_scope,
	                max_results=max_results,
	            )

	            if not examples:
	                return {
	                    "formatted": f"No examples found for '{library}' in {org}",
	                    "totalResults": 0,
	                    "resultsShared": 0,
	                }

	            count = len(examples)
	            table = _format_examples_table(examples)

	            # Header and table first, then the top three hits with a ready-made
	            # read_file call for each.
	            parts = [
	                f"Found {count} examples for '{library}' in {org}:\n\n{table}",
	                "\n\nTop examples (use read_file to view):\n",
	            ]
	            for rank, ex in enumerate(examples[:3], 1):
	                parts.append(f"{rank}. [{ex.repo}/{ex.path}]({ex.url})\n")
	                parts.append(f" Use: read_file(repo='{ex.repo}', path='{ex.path}')\n")

	            return {
	                "formatted": "".join(parts),
	                "totalResults": count,
	                "resultsShared": count,
	            }

	        except GitHubAPIError as e:
	            return failure(f"GitHub API Error: {str(e)}")
	        except Exception as e:
	            return failure(f"Error: {str(e)}")


	# Tool specification for `find_examples`: the tool name, a natural-language
	# description, and a JSON-schema-style "parameters" object in which only
	# "library" is required. The optional "org", "repo_scope" and "max_results"
	# entries are the same keys that FindExamplesTool.execute reads.
	# NOTE(review): presumably registered with the agent's tool router alongside
	# find_examples_handler — confirm against the registration site.
	FIND_EXAMPLES_TOOL_SPEC = {
	    "name": "find_examples",
	    "description": (
	        "Find examples, guides, and tutorials for a library using deterministic queries and heuristics.\n\n"
	        "Uses best practices retrieval without semantic search:\n"
	        "- Prefers README.md, docs/, examples/, notebooks/, tests/\n"
	        "- Prefers files with if __name__ == '__main__', 'quickstart', 'tutorial', 'usage'\n"
	        "- Prefers repos with higher stars and more recent updates\n\n"
	        "Returns a ranked list of canonical example files.\n\n"
	        "Examples:\n"
	        "- Find transformers examples: {'library': 'transformers', 'org': 'huggingface', 'max_results': 5}\n"
	        "- Find torch examples in specific repo: {'library': 'torch', 'org': 'pytorch', 'repo_scope': 'examples'}\n\n"
	        "Use read_file tool to view the content of returned files.\n\n"
	    ),
	    "parameters": {
	        "type": "object",
	        "properties": {
	            "library": {
	                "type": "string",
	                "description": "Library name to search for (e.g., 'transformers', 'torch', 'react')",
	            },
	            "org": {
	                "type": "string",
	                "description": "GitHub organization to search in (default: 'huggingface')",
	            },
	            "repo_scope": {
	                "type": "string",
	                "description": "Optional specific repository to search within",
	            },
	            "max_results": {
	                "type": "integer",
	                "description": "Maximum number of results to return (default: 10)",
	            },
	        },
	        "required": ["library"],
	    },
	}


	async def find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
	    """Adapt ``FindExamplesTool`` to the ``(text, success)`` handler shape.

	    Returns the tool's formatted text together with ``True`` unless the
	    result is flagged ``isError``. Any exception raised while running the
	    tool is reported as an error message with ``False``.
	    """
	    try:
	        outcome = await FindExamplesTool().execute(arguments)
	        text = outcome["formatted"]
	        failed = outcome.get("isError", False)
	        return text, not failed
	    except Exception as e:
	        return f"Error executing find_examples: {str(e)}", False