Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
7c7cb7b
1
Parent(s): 712aaa7
whoosh impl for docs search
Browse files- agent/tools/docs_tools.py +286 -55
- pyproject.toml +1 -0
agent/tools/docs_tools.py
CHANGED
|
@@ -9,10 +9,53 @@ from typing import Any
|
|
| 9 |
|
| 10 |
import httpx
|
| 11 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Cache for OpenAPI spec to avoid repeated fetches
|
| 14 |
_openapi_spec_cache: dict[str, Any] | None = None
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
async def _fetch_html_page(hf_token: str, endpoint: str) -> str:
|
| 18 |
"""Fetch the HTML page for a given endpoint"""
|
|
@@ -52,7 +95,7 @@ def _parse_sidebar_navigation(html_content: str) -> list[dict[str, str]]:
|
|
| 52 |
async def _fetch_single_glimpse(
|
| 53 |
client: httpx.AsyncClient, hf_token: str, item: dict[str, str]
|
| 54 |
) -> dict[str, str]:
|
| 55 |
-
"""Fetch a glimpse
|
| 56 |
md_url = f"{item['url']}.md"
|
| 57 |
headers = {"Authorization": f"Bearer {hf_token}"}
|
| 58 |
|
|
@@ -60,9 +103,10 @@ async def _fetch_single_glimpse(
|
|
| 60 |
response = await client.get(md_url, headers=headers)
|
| 61 |
response.raise_for_status()
|
| 62 |
|
| 63 |
-
content = response.text
|
| 64 |
-
|
| 65 |
-
|
|
|
|
| 66 |
glimpse += "..."
|
| 67 |
|
| 68 |
return {
|
|
@@ -70,6 +114,7 @@ async def _fetch_single_glimpse(
|
|
| 70 |
"url": item["url"],
|
| 71 |
"md_url": md_url,
|
| 72 |
"glimpse": glimpse,
|
|
|
|
| 73 |
}
|
| 74 |
except Exception as e:
|
| 75 |
return {
|
|
@@ -77,6 +122,7 @@ async def _fetch_single_glimpse(
|
|
| 77 |
"url": item["url"],
|
| 78 |
"md_url": md_url,
|
| 79 |
"glimpse": f"[Could not fetch glimpse: {str(e)[:50]}]",
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
|
|
@@ -92,39 +138,225 @@ async def _fetch_all_glimpses(
|
|
| 92 |
return list(result_items)
|
| 93 |
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
def _format_exploration_results(
|
| 96 |
-
endpoint: str,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
) -> str:
|
| 98 |
"""Format the exploration results as a readable string"""
|
| 99 |
base_url = "https://huggingface.co/docs"
|
| 100 |
url = f"{base_url}/{endpoint}"
|
| 101 |
result = f"Documentation structure for: {url}\n\n"
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
for i, item in enumerate(result_items, 1):
|
| 105 |
result += f"{i}. **{item['title']}**\n"
|
| 106 |
result += f" URL: {item['url']}\n"
|
|
|
|
|
|
|
|
|
|
| 107 |
result += f" Glimpse: {item['glimpse']}\n\n"
|
| 108 |
|
| 109 |
return result
|
| 110 |
|
| 111 |
|
| 112 |
-
async def explore_hf_docs(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
"""Main function to explore documentation structure"""
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
|
| 120 |
-
if
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
return result
|
| 130 |
|
|
@@ -140,6 +372,8 @@ async def explore_hf_docs_handler(arguments: dict[str, Any]) -> tuple[str, bool]
|
|
| 140 |
Tuple of (structured_navigation_with_glimpses, success)
|
| 141 |
"""
|
| 142 |
endpoint = arguments.get("endpoint", "")
|
|
|
|
|
|
|
| 143 |
|
| 144 |
if not endpoint:
|
| 145 |
return "Error: No endpoint provided", False
|
|
@@ -153,7 +387,20 @@ async def explore_hf_docs_handler(arguments: dict[str, Any]) -> tuple[str, bool]
|
|
| 153 |
endpoint = endpoint.lstrip("/")
|
| 154 |
|
| 155 |
try:
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
return result, True
|
| 158 |
|
| 159 |
except httpx.HTTPStatusError as e:
|
|
@@ -509,7 +756,7 @@ async def hf_docs_fetch_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
|
|
| 509 |
EXPLORE_HF_DOCS_TOOL_SPEC = {
|
| 510 |
"name": "explore_hf_docs",
|
| 511 |
"description": (
|
| 512 |
-
"Explore Hugging Face documentation structure and discover available pages with
|
| 513 |
"β οΈ MANDATORY: ALWAYS use this BEFORE implementing any ML task (training, fine-tuning, data processing, inference). "
|
| 514 |
"Your training data may be outdated - current documentation is the source of truth. "
|
| 515 |
"**Use when:** (1) Starting any implementation task, (2) User asks 'how to' questions, "
|
|
@@ -519,6 +766,7 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
|
|
| 519 |
"Returns: Sidebar navigation with titles, URLs, and glimpses of all pages in the selected documentation. "
|
| 520 |
"**Then:** Use fetch_hf_docs with specific URLs from results to get full content. "
|
| 521 |
"**Critical for reliability:** Never implement based on internal knowledge without checking current docs first - APIs change frequently."
|
|
|
|
| 522 |
),
|
| 523 |
"parameters": {
|
| 524 |
"type": "object",
|
|
@@ -541,19 +789,8 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
|
|
| 541 |
"peft",
|
| 542 |
"accelerate",
|
| 543 |
"optimum",
|
| 544 |
-
"optimum-habana",
|
| 545 |
-
"optimum-neuron",
|
| 546 |
-
"optimum-intel",
|
| 547 |
-
"optimum-executorch",
|
| 548 |
-
"optimum-tpu",
|
| 549 |
"tokenizers",
|
| 550 |
-
"
|
| 551 |
-
"robotics-course",
|
| 552 |
-
"mcp-course",
|
| 553 |
-
"smol-course",
|
| 554 |
-
"agents-course",
|
| 555 |
-
"deep-rl-course",
|
| 556 |
-
"computer-vision-course",
|
| 557 |
"evaluate",
|
| 558 |
"tasks",
|
| 559 |
"dataset-viewer",
|
|
@@ -564,16 +801,11 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
|
|
| 564 |
"safetensors",
|
| 565 |
"tgi",
|
| 566 |
"setfit",
|
| 567 |
-
"audio-course",
|
| 568 |
"lerobot",
|
| 569 |
"autotrain",
|
| 570 |
"tei",
|
| 571 |
"bitsandbytes",
|
| 572 |
-
"cookbook",
|
| 573 |
"sentence_transformers",
|
| 574 |
-
"ml-games-course",
|
| 575 |
-
"diffusion-course",
|
| 576 |
-
"ml-for-3d-course",
|
| 577 |
"chat-ui",
|
| 578 |
"leaderboards",
|
| 579 |
"lighteval",
|
|
@@ -585,6 +817,7 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
|
|
| 585 |
],
|
| 586 |
"description": (
|
| 587 |
"The documentation endpoint to explore. Each endpoint corresponds to a major section of the Hugging Face documentation:\n\n"
|
|
|
|
| 588 |
"β’ hub β Find answers to questions about models/datasets/spaces, auth, versioning, metadata.\n"
|
| 589 |
"β’ transformers β Core model library: architectures, configs, tokenizers, training & inference APIs.\n"
|
| 590 |
"β’ diffusers β Diffusion pipelines, schedulers, fine-tuning, training, and deployment patterns.\n"
|
|
@@ -599,20 +832,8 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
|
|
| 599 |
"β’ inference-endpoints β Managed, scalable model deployments on HF infrastructure.\n"
|
| 600 |
"β’ peft β Parameter-efficient fine-tuning methods (LoRA, adapters, etc.).\n"
|
| 601 |
"β’ accelerate β Hardware-agnostic, distributed and mixed-precision training orchestration.\n"
|
| 602 |
-
"β’ optimum β Hardware-aware optimization and model export tooling.\n"
|
| 603 |
-
"β’ optimum-habana β Training and inference on Habana Gaudi accelerators.\n"
|
| 604 |
-
"β’ optimum-neuron β Optimization workflows for AWS Inferentia/Trainium.\n"
|
| 605 |
-
"β’ optimum-intel β Intel CPU/GPU optimizations (OpenVINO, IPEX).\n"
|
| 606 |
-
"β’ optimum-executorch β Exporting models to ExecuTorch for edge/mobile.\n"
|
| 607 |
-
"β’ optimum-tpu β TPU-specific training and optimization paths.\n"
|
| 608 |
"β’ tokenizers β Fast tokenizer internals, training, and low-level APIs.\n"
|
| 609 |
-
"β’ llm-course β End-to-end LLM concepts, training, and deployment.\n"
|
| 610 |
-
"β’ robotics-course β Learning-based robotics foundations.\n"
|
| 611 |
-
"β’ mcp-course β Model Context Protocol concepts and usage.\n"
|
| 612 |
-
"β’ smol-course β Small-model and efficiency-focused workflows.\n"
|
| 613 |
-
"β’ agents-course β Tool-using, planning, and multi-step agent design.\n"
|
| 614 |
-
"β’ deep-rl-course β Deep reinforcement learning foundations.\n"
|
| 615 |
-
"β’ computer-vision-course β Vision models, datasets, and pipelines.\n"
|
| 616 |
"β’ evaluate β Metrics, evaluation workflows, and training-loop integration.\n"
|
| 617 |
"β’ tasks β Canonical task definitions and model categorization.\n"
|
| 618 |
"β’ dataset-viewer β Dataset preview, streaming views, and viewer internals.\n"
|
|
@@ -623,16 +844,11 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
|
|
| 623 |
"β’ safetensors β Safe, fast tensor serialization format.\n"
|
| 624 |
"β’ tgi β High-throughput text generation server for LLMs.\n"
|
| 625 |
"β’ setfit β Few-shot text classification via sentence embeddings.\n"
|
| 626 |
-
"β’ audio-course β Speech and audio models, datasets, and tasks.\n"
|
| 627 |
"β’ lerobot β Robotics datasets, policies, and learning workflows.\n"
|
| 628 |
"β’ autotrain β No/low-code model training on Hugging Face.\n"
|
| 629 |
"β’ tei β Optimized inference server for embedding workloads.\n"
|
| 630 |
"β’ bitsandbytes β Quantization and memory-efficient optimizers.\n"
|
| 631 |
-
"β’ cookbook β Practical, task-oriented recipes across the ecosystem.\n"
|
| 632 |
"β’ sentence_transformers β Embedding models, training recipes, similarity/search workflows.\n"
|
| 633 |
-
"β’ ml-games-course β Game-based ML and reinforcement learning experiments.\n"
|
| 634 |
-
"β’ diffusion-course β Diffusion model theory and hands-on practice.\n"
|
| 635 |
-
"β’ ml-for-3d-course β 3D representations, models, and learning techniques.\n"
|
| 636 |
"β’ chat-ui β Reference chat interfaces for LLM deployment.\n"
|
| 637 |
"β’ leaderboards β Evaluation leaderboards and submission mechanics.\n"
|
| 638 |
"β’ lighteval β Lightweight, reproducible LLM evaluation framework.\n"
|
|
@@ -643,6 +859,21 @@ EXPLORE_HF_DOCS_TOOL_SPEC = {
|
|
| 643 |
"β’ google-cloud β GCP deployment and serving workflows.\n"
|
| 644 |
),
|
| 645 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 646 |
},
|
| 647 |
"required": ["endpoint"],
|
| 648 |
},
|
|
|
|
| 9 |
|
| 10 |
import httpx
|
| 11 |
from bs4 import BeautifulSoup
|
| 12 |
+
from whoosh.analysis import StemmingAnalyzer
|
| 13 |
+
from whoosh.fields import ID, TEXT, Schema
|
| 14 |
+
from whoosh.filedb.filestore import RamStorage
|
| 15 |
+
from whoosh.qparser import MultifieldParser, OrGroup
|
| 16 |
|
| 17 |
# Cache for OpenAPI spec to avoid repeated fetches
|
| 18 |
_openapi_spec_cache: dict[str, Any] | None = None
|
| 19 |
|
| 20 |
+
# Simple in-memory caches for docs and search indexes
|
| 21 |
+
_DOCS_CACHE: dict[str, list[dict[str, str]]] = {}
|
| 22 |
+
_INDEX_CACHE: dict[str, tuple[Any, MultifieldParser]] = {}
|
| 23 |
+
_CACHE_LOCK = asyncio.Lock()
|
| 24 |
+
|
| 25 |
+
# Result limiting defaults
|
| 26 |
+
DEFAULT_MAX_RESULTS = 20
|
| 27 |
+
MAX_RESULTS_CAP = 50
|
| 28 |
+
|
| 29 |
+
# High-level endpoints that bundle related documentation sections
|
| 30 |
+
COMPOSITE_ENDPOINTS: dict[str, list[str]] = {
|
| 31 |
+
"optimum": [
|
| 32 |
+
"optimum",
|
| 33 |
+
"optimum-habana",
|
| 34 |
+
"optimum-neuron",
|
| 35 |
+
"optimum-intel",
|
| 36 |
+
"optimum-executorch",
|
| 37 |
+
"optimum-tpu",
|
| 38 |
+
],
|
| 39 |
+
"courses": [
|
| 40 |
+
"llm-course",
|
| 41 |
+
"robotics-course",
|
| 42 |
+
"mcp-course",
|
| 43 |
+
"smol-course",
|
| 44 |
+
"agents-course",
|
| 45 |
+
"deep-rl-course",
|
| 46 |
+
"computer-vision-course",
|
| 47 |
+
"audio-course",
|
| 48 |
+
"ml-games-course",
|
| 49 |
+
"diffusion-course",
|
| 50 |
+
"ml-for-3d-course",
|
| 51 |
+
"cookbook",
|
| 52 |
+
],
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _expand_endpoint(endpoint: str) -> list[str]:
    """Resolve an endpoint name to the documentation sections it covers.

    Names registered in ``COMPOSITE_ENDPOINTS`` resolve to their member list;
    any other name resolves to a one-element list holding the name itself.
    """
    members = COMPOSITE_ENDPOINTS.get(endpoint)
    if members is None:
        return [endpoint]
    return members
|
| 58 |
+
|
| 59 |
|
| 60 |
async def _fetch_html_page(hf_token: str, endpoint: str) -> str:
|
| 61 |
"""Fetch the HTML page for a given endpoint"""
|
|
|
|
| 95 |
async def _fetch_single_glimpse(
|
| 96 |
client: httpx.AsyncClient, hf_token: str, item: dict[str, str]
|
| 97 |
) -> dict[str, str]:
|
| 98 |
+
"""Fetch a short glimpse for a single page"""
|
| 99 |
md_url = f"{item['url']}.md"
|
| 100 |
headers = {"Authorization": f"Bearer {hf_token}"}
|
| 101 |
|
|
|
|
| 103 |
response = await client.get(md_url, headers=headers)
|
| 104 |
response.raise_for_status()
|
| 105 |
|
| 106 |
+
content = response.text.strip()
|
| 107 |
+
snippet_length = 200
|
| 108 |
+
glimpse = content[:snippet_length].strip()
|
| 109 |
+
if len(content) > snippet_length:
|
| 110 |
glimpse += "..."
|
| 111 |
|
| 112 |
return {
|
|
|
|
| 114 |
"url": item["url"],
|
| 115 |
"md_url": md_url,
|
| 116 |
"glimpse": glimpse,
|
| 117 |
+
"content": content,
|
| 118 |
}
|
| 119 |
except Exception as e:
|
| 120 |
return {
|
|
|
|
| 122 |
"url": item["url"],
|
| 123 |
"md_url": md_url,
|
| 124 |
"glimpse": f"[Could not fetch glimpse: {str(e)[:50]}]",
|
| 125 |
+
"content": "",
|
| 126 |
}
|
| 127 |
|
| 128 |
|
|
|
|
| 138 |
return list(result_items)
|
| 139 |
|
| 140 |
|
| 141 |
+
async def _load_single_endpoint(hf_token: str, endpoint: str) -> list[dict[str, str]]:
    """Load the documentation entries of one (non-composite) endpoint.

    Fetches the endpoint's HTML page, parses its sidebar navigation, fetches
    the per-page glimpses, and tags every entry with the endpoint name under
    the ``"section"`` key.

    Raises:
        ValueError: If the sidebar parse yields no navigation entries.
    """
    page_html = await _fetch_html_page(hf_token, endpoint)
    nav_entries = _parse_sidebar_navigation(page_html)
    if not nav_entries:
        raise ValueError(f"No navigation links found for endpoint '{endpoint}'")

    pages = await _fetch_all_glimpses(hf_token, nav_entries)
    for page in pages:
        # Record which section the page came from so composite listings
        # can still attribute each entry.
        page["section"] = endpoint
    return pages
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
async def _get_docs(hf_token: str, endpoint: str) -> list[dict[str, str]]:
    """Return docs for a single endpoint or an expanded composite, with caching.

    The combined result is cached in ``_DOCS_CACHE`` under ``endpoint``.
    Individual members of a composite are cached under their own names so
    they can be reused by later lookups.

    A composite may list itself as one of its members (``"optimum"`` does).
    For such a member the cache key is reserved for the *combined* result, so
    the per-member cache is bypassed for it. Otherwise a failure while
    loading a later member would leave only the base section stored under the
    composite's key, and the next call would return it as if it were the
    complete composite.

    Args:
        hf_token: Token forwarded to the page fetchers.
        endpoint: Endpoint name, either a plain section or a composite.

    Returns:
        The list of documentation entries for every section the endpoint
        expands to, in member order.

    Raises:
        Whatever ``_load_single_endpoint`` raises for a member that cannot be
        loaded; nothing is cached under ``endpoint`` in that case.
    """
    async with _CACHE_LOCK:
        cached = _DOCS_CACHE.get(endpoint)
    if cached is not None:
        return cached

    members = _expand_endpoint(endpoint)
    is_composite = members != [endpoint]

    docs: list[dict[str, str]] = []
    for member in members:
        # True when this member's cache key is the composite's own key.
        shares_composite_key = is_composite and member == endpoint

        member_docs = None
        if not shares_composite_key:
            async with _CACHE_LOCK:
                member_docs = _DOCS_CACHE.get(member)

        if member_docs is None:
            member_docs = await _load_single_endpoint(hf_token, member)
            if not shares_composite_key:
                async with _CACHE_LOCK:
                    _DOCS_CACHE[member] = member_docs

        docs.extend(member_docs)

    # Only publish the combined list once every member loaded successfully.
    async with _CACHE_LOCK:
        _DOCS_CACHE[endpoint] = docs
    return docs
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
async def _ensure_index(
    endpoint: str, docs: list[dict[str, str]]
) -> tuple[Any, MultifieldParser]:
    """Return the (index, parser) pair for *endpoint*, building it on first use.

    The pair is looked up in ``_INDEX_CACHE``; on a miss a Whoosh index is
    created in a ``RamStorage`` from *docs* and stored back in the cache.
    The parser searches the ``title`` and ``content`` fields, OR-ing the
    query terms, with ``title`` boosted to twice the weight of ``content``.
    """
    async with _CACHE_LOCK:
        existing = _INDEX_CACHE.get(endpoint)
    if existing is not None:
        return existing

    stemmer = StemmingAnalyzer()
    schema = Schema(
        title=TEXT(stored=True, analyzer=stemmer),
        url=ID(stored=True, unique=True),
        md_url=ID(stored=True),
        section=ID(stored=True),
        glimpse=TEXT(stored=True, analyzer=stemmer),
        # Full page text is searchable but not stored in the index.
        content=TEXT(stored=False, analyzer=stemmer),
    )
    index = RamStorage().create_index(schema)

    writer = index.writer()
    for entry in docs:
        fields = {
            name: entry.get(name, "")
            for name in ("title", "url", "md_url", "glimpse", "content")
        }
        # Entries without an explicit section fall back to the endpoint name.
        fields["section"] = entry.get("section", endpoint)
        writer.add_document(**fields)
    writer.commit()

    parser = MultifieldParser(
        ["title", "content"],
        schema=schema,
        fieldboosts={"title": 2.0, "content": 1.0},
        group=OrGroup,
    )

    built = (index, parser)
    async with _CACHE_LOCK:
        _INDEX_CACHE[endpoint] = built
    return built
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
async def _search_docs(
    endpoint: str,
    docs: list[dict[str, str]],
    query: str,
    limit: int | None,
) -> tuple[list[dict[str, Any]], str | None]:
    """
    Search the documentation entries of *endpoint* with Whoosh.

    Returns a ``(results, fallback_message)`` pair. ``results`` holds one dict
    per hit (title, url, md_url, section, glimpse, and the score rounded to two
    decimals). When the query cannot be parsed or nothing matches, ``results``
    is empty and ``fallback_message`` explains why, so the caller can surface
    it to the user; otherwise ``fallback_message`` is ``None``.
    """
    index, parser = await _ensure_index(endpoint, docs)

    try:
        parsed_query = parser.parse(query)
    except Exception:
        return (
            [],
            "Query contained unsupported syntax; showing default ordering instead.",
        )

    # Stored fields must be read while the searcher is still open.
    with index.searcher() as searcher:
        hits = searcher.search(parsed_query, limit=limit or None)
        matches: list[dict[str, Any]] = [
            {
                "title": hit["title"],
                "url": hit["url"],
                "md_url": hit.get("md_url", ""),
                "section": hit.get("section", endpoint),
                "glimpse": hit["glimpse"],
                "score": round(hit.score, 2),
            }
            for hit in hits
        ]

    if matches:
        return matches, None
    return [], "No strong matches found; showing default ordering instead."
|
| 260 |
+
|
| 261 |
+
|
| 262 |
def _format_exploration_results(
|
| 263 |
+
endpoint: str,
|
| 264 |
+
result_items: list[dict[str, str]],
|
| 265 |
+
total_items: int,
|
| 266 |
+
query: str | None = None,
|
| 267 |
+
fallback_message: str | None = None,
|
| 268 |
) -> str:
|
| 269 |
"""Format the exploration results as a readable string"""
|
| 270 |
base_url = "https://huggingface.co/docs"
|
| 271 |
url = f"{base_url}/{endpoint}"
|
| 272 |
result = f"Documentation structure for: {url}\n\n"
|
| 273 |
+
|
| 274 |
+
if query:
|
| 275 |
+
result += (
|
| 276 |
+
f"Query: '{query}' β showing {len(result_items)} result(s)"
|
| 277 |
+
f" out of {total_items} pages"
|
| 278 |
+
)
|
| 279 |
+
if fallback_message:
|
| 280 |
+
result += f" ({fallback_message})"
|
| 281 |
+
result += "\n\n"
|
| 282 |
+
else:
|
| 283 |
+
result += (
|
| 284 |
+
f"Found {len(result_items)} page(s) (total available: {total_items}).\n\n"
|
| 285 |
+
)
|
| 286 |
|
| 287 |
for i, item in enumerate(result_items, 1):
|
| 288 |
result += f"{i}. **{item['title']}**\n"
|
| 289 |
result += f" URL: {item['url']}\n"
|
| 290 |
+
result += f" Section: {item.get('section', endpoint)}\n"
|
| 291 |
+
if query and "score" in item:
|
| 292 |
+
result += f" Relevance score: {item['score']:.2f}\n"
|
| 293 |
result += f" Glimpse: {item['glimpse']}\n\n"
|
| 294 |
|
| 295 |
return result
|
| 296 |
|
| 297 |
|
| 298 |
+
async def explore_hf_docs(
    hf_token: str,
    endpoint: str,
    query: str | None = None,
    max_results: int | None = None,
) -> str:
    """Main function to explore documentation structure.

    Loads the entries for *endpoint*, optionally ranks them against *query*,
    limits the number of entries, and returns the formatted listing.

    Args:
        hf_token: Token forwarded to the documentation fetchers.
        endpoint: Documentation endpoint (plain or composite) to explore.
        query: Optional search query. When it yields no results the entries
            are listed in their default order instead.
        max_results: Optional result limit. ``None`` selects
            ``DEFAULT_MAX_RESULTS``; values above ``MAX_RESULTS_CAP`` are
            clamped to the cap.

    Returns:
        The formatted listing, or an error/notice string when ``max_results``
        is not positive or no entries are available.
    """
    cached_items = await _get_docs(hf_token, endpoint)
    total_count = len(cached_items)

    # Work out the effective limit and any note explaining it.
    limit_note: str | None = None
    if max_results is None:
        limit = DEFAULT_MAX_RESULTS
        limit_note = f"Showing top {DEFAULT_MAX_RESULTS} results (set max_results to adjust)."
    elif max_results <= 0:
        return "Error: max_results must be greater than zero."
    else:
        limit = max_results
        if limit > MAX_RESULTS_CAP:
            limit_note = (
                f"Requested {limit} results but showing top {MAX_RESULTS_CAP} (maximum allowed)."
            )
            limit = MAX_RESULTS_CAP

    fallback_message: str | None = None
    selected_items: list[dict[str, Any]] = []
    if query:
        selected_items, fallback_message = await _search_docs(
            endpoint,
            cached_items,
            query,
            limit,
        )
    if not selected_items:
        # No query, or the search came back empty: default ordering.
        selected_items = cached_items[:limit]

    if not selected_items:
        return f"No documentation entries available for endpoint '{endpoint}'."

    pieces = [piece for piece in (fallback_message, limit_note) if piece]
    note = "; ".join(pieces) if pieces else None

    return _format_exploration_results(
        endpoint,
        selected_items,
        total_items=total_count,
        query=query,
        fallback_message=note,
    )
|
| 362 |
|
|
|
|
| 372 |
Tuple of (structured_navigation_with_glimpses, success)
|
| 373 |
"""
|
| 374 |
endpoint = arguments.get("endpoint", "")
|
| 375 |
+
query = arguments.get("query")
|
| 376 |
+
max_results = arguments.get("max_results")
|
| 377 |
|
| 378 |
if not endpoint:
|
| 379 |
return "Error: No endpoint provided", False
|
|
|
|
| 387 |
endpoint = endpoint.lstrip("/")
|
| 388 |
|
| 389 |
try:
|
| 390 |
+
try:
|
| 391 |
+
max_results_int = int(max_results) if max_results is not None else None
|
| 392 |
+
except (TypeError, ValueError):
|
| 393 |
+
return "Error: max_results must be an integer", False
|
| 394 |
+
|
| 395 |
+
if max_results_int is not None and max_results_int <= 0:
|
| 396 |
+
return "Error: max_results must be greater than zero", False
|
| 397 |
+
|
| 398 |
+
result = await explore_hf_docs(
|
| 399 |
+
hf_token,
|
| 400 |
+
endpoint,
|
| 401 |
+
query=query.strip() if isinstance(query, str) and query.strip() else None,
|
| 402 |
+
max_results=max_results_int,
|
| 403 |
+
)
|
| 404 |
return result, True
|
| 405 |
|
| 406 |
except httpx.HTTPStatusError as e:
|
|
|
|
| 756 |
EXPLORE_HF_DOCS_TOOL_SPEC = {
|
| 757 |
"name": "explore_hf_docs",
|
| 758 |
"description": (
|
| 759 |
+
"Explore Hugging Face documentation structure and discover available pages with 200-character previews. "
|
| 760 |
"β οΈ MANDATORY: ALWAYS use this BEFORE implementing any ML task (training, fine-tuning, data processing, inference). "
|
| 761 |
"Your training data may be outdated - current documentation is the source of truth. "
|
| 762 |
"**Use when:** (1) Starting any implementation task, (2) User asks 'how to' questions, "
|
|
|
|
| 766 |
"Returns: Sidebar navigation with titles, URLs, and glimpses of all pages in the selected documentation. "
|
| 767 |
"**Then:** Use fetch_hf_docs with specific URLs from results to get full content. "
|
| 768 |
"**Critical for reliability:** Never implement based on internal knowledge without checking current docs first - APIs change frequently."
|
| 769 |
+
" By default returns the top 20 results; set max_results (max 50) to adjust."
|
| 770 |
),
|
| 771 |
"parameters": {
|
| 772 |
"type": "object",
|
|
|
|
| 789 |
"peft",
|
| 790 |
"accelerate",
|
| 791 |
"optimum",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
"tokenizers",
|
| 793 |
+
"courses",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 794 |
"evaluate",
|
| 795 |
"tasks",
|
| 796 |
"dataset-viewer",
|
|
|
|
| 801 |
"safetensors",
|
| 802 |
"tgi",
|
| 803 |
"setfit",
|
|
|
|
| 804 |
"lerobot",
|
| 805 |
"autotrain",
|
| 806 |
"tei",
|
| 807 |
"bitsandbytes",
|
|
|
|
| 808 |
"sentence_transformers",
|
|
|
|
|
|
|
|
|
|
| 809 |
"chat-ui",
|
| 810 |
"leaderboards",
|
| 811 |
"lighteval",
|
|
|
|
| 817 |
],
|
| 818 |
"description": (
|
| 819 |
"The documentation endpoint to explore. Each endpoint corresponds to a major section of the Hugging Face documentation:\n\n"
|
| 820 |
+
"β’ courses β All Hugging Face courses (LLM, robotics, MCP, smol (llm training), agents, deep RL, computer vision, games, diffusion, 3D, audio) and the cookbook recipes. Probably the best place for examples.\n"
|
| 821 |
"β’ hub β Find answers to questions about models/datasets/spaces, auth, versioning, metadata.\n"
|
| 822 |
"β’ transformers β Core model library: architectures, configs, tokenizers, training & inference APIs.\n"
|
| 823 |
"β’ diffusers β Diffusion pipelines, schedulers, fine-tuning, training, and deployment patterns.\n"
|
|
|
|
| 832 |
"β’ inference-endpoints β Managed, scalable model deployments on HF infrastructure.\n"
|
| 833 |
"β’ peft β Parameter-efficient fine-tuning methods (LoRA, adapters, etc.).\n"
|
| 834 |
"β’ accelerate β Hardware-agnostic, distributed and mixed-precision training orchestration.\n"
|
| 835 |
+
"β’ optimum β Hardware-aware optimization and model export tooling, including Habana, Neuron, Intel, ExecuTorch, and TPU variants.\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 836 |
"β’ tokenizers β Fast tokenizer internals, training, and low-level APIs.\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 837 |
"β’ evaluate β Metrics, evaluation workflows, and training-loop integration.\n"
|
| 838 |
"β’ tasks β Canonical task definitions and model categorization.\n"
|
| 839 |
"β’ dataset-viewer β Dataset preview, streaming views, and viewer internals.\n"
|
|
|
|
| 844 |
"β’ safetensors β Safe, fast tensor serialization format.\n"
|
| 845 |
"β’ tgi β High-throughput text generation server for LLMs.\n"
|
| 846 |
"β’ setfit β Few-shot text classification via sentence embeddings.\n"
|
|
|
|
| 847 |
"β’ lerobot β Robotics datasets, policies, and learning workflows.\n"
|
| 848 |
"β’ autotrain β No/low-code model training on Hugging Face.\n"
|
| 849 |
"β’ tei β Optimized inference server for embedding workloads.\n"
|
| 850 |
"β’ bitsandbytes β Quantization and memory-efficient optimizers.\n"
|
|
|
|
| 851 |
"β’ sentence_transformers β Embedding models, training recipes, similarity/search workflows.\n"
|
|
|
|
|
|
|
|
|
|
| 852 |
"β’ chat-ui β Reference chat interfaces for LLM deployment.\n"
|
| 853 |
"β’ leaderboards β Evaluation leaderboards and submission mechanics.\n"
|
| 854 |
"β’ lighteval β Lightweight, reproducible LLM evaluation framework.\n"
|
|
|
|
| 859 |
"β’ google-cloud β GCP deployment and serving workflows.\n"
|
| 860 |
),
|
| 861 |
},
|
| 862 |
+
"query": {
|
| 863 |
+
"type": "string",
|
| 864 |
+
"description": (
|
| 865 |
+
"Optional keyword query to rank and filter documentation pages. Fuzzy matching is used "
|
| 866 |
+
"against titles, URLs, and glimpses to surface the most relevant content."
|
| 867 |
+
),
|
| 868 |
+
},
|
| 869 |
+
"max_results": {
|
| 870 |
+
"type": "integer",
|
| 871 |
+
"description": (
|
| 872 |
+
"Optional cap on number of results to return. Defaults to 20 when omitted and cannot exceed 50."
|
| 873 |
+
),
|
| 874 |
+
"minimum": 1,
|
| 875 |
+
"maximum": 50,
|
| 876 |
+
},
|
| 877 |
},
|
| 878 |
"required": ["endpoint"],
|
| 879 |
},
|
pyproject.toml
CHANGED
|
@@ -24,6 +24,7 @@ agent = [
|
|
| 24 |
"nbconvert>=7.16.6",
|
| 25 |
"nbformat>=5.10.4",
|
| 26 |
"datasets>=4.3.0", # For session logging to HF datasets
|
|
|
|
| 27 |
]
|
| 28 |
|
| 29 |
# Evaluation/benchmarking dependencies
|
|
|
|
| 24 |
"nbconvert>=7.16.6",
|
| 25 |
"nbformat>=5.10.4",
|
| 26 |
"datasets>=4.3.0", # For session logging to HF datasets
|
| 27 |
+
"whoosh>=2.7.4",
|
| 28 |
]
|
| 29 |
|
| 30 |
# Evaluation/benchmarking dependencies
|