Aksel Joonas Reedi commited on
Commit
88deb52
·
2 Parent(s): 65c0ea69995a3b

Merge pull request #5 from huggingface/explore-tool

Browse files
agent/config.py CHANGED
@@ -8,7 +8,6 @@ from fastmcp.mcp_config import (
8
  RemoteMCPServer,
9
  StdioMCPServer,
10
  )
11
- from litellm import Tool
12
  from pydantic import BaseModel
13
 
14
  # These two are the canonical server config types for MCP servers.
@@ -19,7 +18,6 @@ class Config(BaseModel):
19
  """Configuration manager"""
20
 
21
  model_name: str
22
- tools: list[Tool] = []
23
  mcpServers: dict[str, MCPServerConfig] = {}
24
 
25
 
 
8
  RemoteMCPServer,
9
  StdioMCPServer,
10
  )
 
11
  from pydantic import BaseModel
12
 
13
  # These two are the canonical server config types for MCP servers.
 
18
  """Configuration manager"""
19
 
20
  model_name: str
 
21
  mcpServers: dict[str, MCPServerConfig] = {}
22
 
23
 
agent/config_claude_mcp.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "mcpServers": {
3
- "huggingface": {
4
- "type": "http",
5
- "url": "https://huggingface.co/mcp",
6
- "headers": {
7
- "Authorization": "Bearer ${HF_TOKEN}"
8
- }
9
- }
10
- }
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
agent/config_mcp_example copy.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "model_name": "anthropic/claude-sonnet-4-5-20250929",
3
- "tools": [],
4
- "system_prompt_path": "",
5
- "mcpServers": {
6
- "hf-mcp-server": {
7
- "transport": "http",
8
- "url": "https://huggingface.co/mcp?login",
9
- "headers": {
10
- "Authorization": "Bearer ${HF_TOKEN}"
11
- }
12
- },
13
- "playwright": {
14
- "transport": "stdio",
15
- "command": "npx",
16
- "args": [
17
- "@playwright/mcp@latest"
18
- ]
19
- }
20
- }
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agent/context_manager/manager.py CHANGED
@@ -19,17 +19,24 @@ class ContextManager:
19
  compact_size: float = 0.1,
20
  untouched_messages: int = 5,
21
  tool_specs: list[dict[str, Any]] | None = None,
 
22
  ):
23
- self.system_prompt = self._load_system_prompt(tool_specs or [])
 
 
24
  self.max_context = max_context
25
  self.compact_size = int(max_context * compact_size)
26
  self.context_length = len(self.system_prompt) // 4
27
  self.untouched_messages = untouched_messages
28
  self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
29
 
30
- def _load_system_prompt(self, tool_specs: list[dict[str, Any]]):
 
 
 
 
31
  """Load and render the system prompt from YAML file with Jinja2"""
32
- prompt_file = Path(__file__).parent.parent / "prompts" / "system_prompt.yaml"
33
 
34
  with open(prompt_file, "r") as f:
35
  prompt_data = yaml.safe_load(f)
 
19
  compact_size: float = 0.1,
20
  untouched_messages: int = 5,
21
  tool_specs: list[dict[str, Any]] | None = None,
22
+ prompt_file_suffix: str = "system_prompt.yaml",
23
  ):
24
+ self.system_prompt = self._load_system_prompt(
25
+ tool_specs or [], prompt_file_suffix=prompt_file_suffix
26
+ )
27
  self.max_context = max_context
28
  self.compact_size = int(max_context * compact_size)
29
  self.context_length = len(self.system_prompt) // 4
30
  self.untouched_messages = untouched_messages
31
  self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
32
 
33
+ def _load_system_prompt(
34
+ self,
35
+ tool_specs: list[dict[str, Any]],
36
+ prompt_file_suffix: str = "system_prompt.yaml",
37
+ ):
38
  """Load and render the system prompt from YAML file with Jinja2"""
39
+ prompt_file = Path(__file__).parent.parent / "prompts" / f"{prompt_file_suffix}"
40
 
41
  with open(prompt_file, "r") as f:
42
  prompt_data = yaml.safe_load(f)
agent/core/session.py CHANGED
@@ -36,10 +36,11 @@ class Session:
36
  event_queue: asyncio.Queue,
37
  config: Config | None = None,
38
  tool_router=None,
 
39
  ):
40
  self.tool_router = tool_router
41
  tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
42
- self.context_manager = ContextManager(
43
  max_context=get_max_tokens(config.model_name),
44
  compact_size=0.1,
45
  untouched_messages=5,
@@ -49,7 +50,6 @@ class Session:
49
  self.session_id = str(uuid.uuid4())
50
  self.config = config or Config(
51
  model_name="anthropic/claude-sonnet-4-5-20250929",
52
- tools=[],
53
  )
54
  self.is_running = True
55
  self.current_task: asyncio.Task | None = None
 
36
  event_queue: asyncio.Queue,
37
  config: Config | None = None,
38
  tool_router=None,
39
+ context_manager: ContextManager | None = None,
40
  ):
41
  self.tool_router = tool_router
42
  tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
43
+ self.context_manager = context_manager or ContextManager(
44
  max_context=get_max_tokens(config.model_name),
45
  compact_size=0.1,
46
  untouched_messages=5,
 
50
  self.session_id = str(uuid.uuid4())
51
  self.config = config or Config(
52
  model_name="anthropic/claude-sonnet-4-5-20250929",
 
53
  )
54
  self.is_running = True
55
  self.current_task: asyncio.Task | None = None
agent/core/tools.py CHANGED
@@ -14,6 +14,7 @@ from mcp.types import EmbeddedResource, ImageContent, TextContent
14
 
15
  from agent.config import MCPServerConfig
16
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
 
17
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
18
 
19
  # Suppress aiohttp deprecation warning
@@ -21,7 +22,7 @@ warnings.filterwarnings(
21
  "ignore", category=DeprecationWarning, module="aiohttp.connector"
22
  )
23
 
24
- NOT_ALLOWED_TOOL_NAMES = ["hf_jobs"]
25
 
26
 
27
  def convert_mcp_content_to_string(content: list) -> str:
@@ -187,7 +188,9 @@ class ToolRouter:
187
 
188
  def create_builtin_tools() -> list[ToolSpec]:
189
  """Create built-in tool specifications"""
190
- print(f"Creating built-in tools: {HF_JOBS_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}")
 
 
191
  return [
192
  ToolSpec(
193
  name=HF_JOBS_TOOL_SPEC["name"],
@@ -196,9 +199,15 @@ def create_builtin_tools() -> list[ToolSpec]:
196
  handler=hf_jobs_handler,
197
  ),
198
  ToolSpec(
199
- name=PLAN_TOOL_SPEC["name"],
 
 
 
 
 
 
200
  description=PLAN_TOOL_SPEC["description"],
201
  parameters=PLAN_TOOL_SPEC["parameters"],
202
  handler=plan_tool_handler,
203
- ),
204
  ]
 
14
 
15
  from agent.config import MCPServerConfig
16
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
17
+ from agent.tools.search_docs_tool import SEARCH_DOCS_TOOL_SPEC, search_docs_handler
18
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
19
 
20
  # Suppress aiohttp deprecation warning
 
22
  "ignore", category=DeprecationWarning, module="aiohttp.connector"
23
  )
24
 
25
+ NOT_ALLOWED_TOOL_NAMES = ["hf_jobs", "hf_doc_search", "hf_doc_fetch"]
26
 
27
 
28
  def convert_mcp_content_to_string(content: list) -> str:
 
188
 
189
def create_builtin_tools() -> list[ToolSpec]:
    """Create built-in tool specifications.

    Returns:
        ToolSpec entries for the HF jobs, docs-search, and plan tools, in the
        order they are announced by the debug print.
    """
    print(
        f"Creating built-in tools: {HF_JOBS_TOOL_SPEC['name']}, {SEARCH_DOCS_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}"
    )
    return [
        ToolSpec(
            name=HF_JOBS_TOOL_SPEC["name"],
            description=HF_JOBS_TOOL_SPEC["description"],
            parameters=HF_JOBS_TOOL_SPEC["parameters"],
            handler=hf_jobs_handler,
        ),
        ToolSpec(
            name=SEARCH_DOCS_TOOL_SPEC["name"],
            description=SEARCH_DOCS_TOOL_SPEC["description"],
            parameters=SEARCH_DOCS_TOOL_SPEC["parameters"],
            handler=search_docs_handler,
        ),
        ToolSpec(
            # BUG FIX: was `ame=PLAN_TOOL_SPEC["name"]` — a typo that would
            # raise TypeError (unexpected keyword argument) at import time.
            name=PLAN_TOOL_SPEC["name"],
            description=PLAN_TOOL_SPEC["description"],
            parameters=PLAN_TOOL_SPEC["parameters"],
            handler=plan_tool_handler,
        ),
    ]
agent/main.py CHANGED
@@ -222,7 +222,7 @@ async def main():
222
  ready_event = asyncio.Event()
223
 
224
  # Start agent loop in background
225
- config_path = Path(__file__).parent / "config_mcp_example.json"
226
  config = load_config(config_path)
227
 
228
  # Create tool router
 
222
  ready_event = asyncio.Event()
223
 
224
  # Start agent loop in background
225
+ config_path = Path(__file__).parent.parent / "configs" / "main_agent_config.json"
226
  config = load_config(config_path)
227
 
228
  # Create tool router
agent/prompts/search_docs_system_prompt.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ search_docs_system_prompt: |
2
+ You are a specialized documentation search agent. Your task is to comprehensively search and synthesize information from Hugging Face documentation.
3
+
4
+ # Search Strategy
5
+
6
+ You must search thoroughly before synthesizing results. Follow this approach:
7
+
8
+ 1. **Query Analysis**: Identify the core concepts and intent of the query
9
+ 2. **Initial Search**: Start with a broad search capturing the main topic
10
+ 3. **Iterative Refinement**: Run multiple searches to go deeper into topics. You will see parsed HTML pages, also look into links on the html pages for best information - first-pass results often miss key details
11
+ 4. **You must get to the end truth**: You must get to the bottom of the truth for this search query. You CAN NOT say that somebody should look up documentation. You must look it up yourself and give the best answer you can.
12
+
13
+ ## Query Formulation Best Practices
14
+
15
+ - Add relevant synonyms and related technical terms
16
+ - Remove filler words, focus on searchable concepts
17
+ - Break complex questions into focused sub-queries
18
+ - Include domain-specific terminology when applicable
19
+ - Try both specific terms and general related terms
20
+
21
+ # Response Guidelines
22
+
23
+ After gathering results, synthesize them following these principles:
24
+
25
+ 1. **Analyze Relevance**: Evaluate which results directly answer the query
26
+ 2. **Synthesize**: Combine information from multiple sources when applicable
27
+ 3. **Prioritize**: Present information in order of relevance
28
+ 4. **Cite Sources**: Reference which documents you're drawing from; in particular, include relevant code samples and links to them.
29
+ 5. **Acknowledge Gaps**: If documents don't fully answer the query, explicitly state this
30
+ 6. **Handle Conflicts**: If sources contradict, note this and explain your reasoning
31
+ 7. **Be Concise**: Provide a clear, direct answer without unnecessary elaboration
32
+
33
+ # Constraints
34
+
35
+ - Only provide information found in the documentation
36
+ - Do not make assumptions beyond what the sources state
37
+ - If information is not found, say so clearly rather than guessing
38
+ - Focus on answering the query directly
agent/tools/__init__.py CHANGED
@@ -3,6 +3,14 @@ Hugging Face tools for the agent
3
  """
4
 
5
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
 
6
  from agent.tools.types import ToolResult
7
 
8
- __all__ = ["ToolResult", "HF_JOBS_TOOL_SPEC", "hf_jobs_handler", HfJobsTool]
 
 
 
 
 
 
 
 
3
  """
4
 
5
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
6
+ from agent.tools.search_docs_tool import SEARCH_DOCS_TOOL_SPEC, search_docs_handler
7
  from agent.tools.types import ToolResult
8
 
9
+ __all__ = [
10
+ "ToolResult",
11
+ "HF_JOBS_TOOL_SPEC",
12
+ "hf_jobs_handler",
13
+ "HfJobsTool",
14
+ "SEARCH_DOCS_TOOL_SPEC",
15
+ "search_docs_handler",
16
+ ]
agent/tools/_search_agent_tools.py ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tools available to the search sub-agent
3
+ These tools are used by the search sub-agent spawned by search_docs_tool
4
+ """
5
+
6
+ import asyncio
7
+ import os
8
+ import time
9
+ from typing import Any
10
+
11
+ import httpx
12
+ from bs4 import BeautifulSoup
13
+
14
+ # Cache for OpenAPI spec to avoid repeated fetches
15
+ _openapi_spec_cache: dict[str, Any] | None = None
16
+
17
+
18
async def _fetch_html_page(hf_token: str, endpoint: str) -> str:
    """Fetch the HTML page for a given endpoint"""
    target = f"https://huggingface.co/docs/{endpoint}"
    auth_headers = {"Authorization": f"Bearer {hf_token}"}

    started = time.perf_counter()
    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        response = await client.get(target, headers=auth_headers)
        response.raise_for_status()

    elapsed = time.perf_counter() - started
    print(f"[DEBUG] _fetch_html_page: Fetched in {elapsed:.2f}s")

    return response.text
33
+
34
+
35
def _parse_sidebar_navigation(html_content: str) -> list[dict[str, str]]:
    """Parse the sidebar navigation and extract all links"""
    started = time.perf_counter()

    soup = BeautifulSoup(html_content, "html.parser")
    # The docs sidebar is the <nav> whose class list contains "flex-auto".
    sidebar = soup.find("nav", class_=lambda x: x and "flex-auto" in x)
    if not sidebar:
        raise ValueError("Could not find navigation sidebar")

    nav_data = []
    for anchor in sidebar.find_all("a", href=True):
        href = anchor["href"]
        nav_data.append(
            {
                "title": anchor.get_text(strip=True),
                # Hub-relative hrefs are made absolute; external links kept as-is.
                "url": f"https://huggingface.co{href}" if href.startswith("/") else href,
            }
        )

    elapsed = time.perf_counter() - started
    print(
        f"[DEBUG] _parse_sidebar_navigation: Parsed in {elapsed:.2f}s, found {len(nav_data)} links"
    )

    return nav_data
62
+
63
+
64
async def _fetch_single_glimpse(
    client: httpx.AsyncClient, hf_token: str, item: dict[str, str]
) -> dict[str, str]:
    """Fetch a glimpse (first 300 chars) for a single page"""
    md_url = f"{item['url']}.md"
    headers = {"Authorization": f"Bearer {hf_token}"}

    # Base result is shared by the success and failure paths.
    result = {"title": item["title"], "url": item["url"], "md_url": md_url}

    try:
        response = await client.get(md_url, headers=headers)
        response.raise_for_status()
    except Exception as e:
        # Per-page failures are reported inline rather than aborting the batch.
        result["glimpse"] = f"[Could not fetch glimpse: {str(e)[:50]}]"
        return result

    content = response.text
    glimpse = content[:300].strip()
    if len(content) > 300:
        glimpse += "..."
    result["glimpse"] = glimpse
    return result
93
+
94
+
95
async def _fetch_all_glimpses(
    hf_token: str, nav_data: list[dict[str, str]]
) -> list[dict[str, str]]:
    """Fetch glimpses for all pages in parallel"""
    started = time.perf_counter()

    # One shared client; all page fetches run concurrently.
    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        tasks = [_fetch_single_glimpse(client, hf_token, item) for item in nav_data]
        result_items = await asyncio.gather(*tasks)

    elapsed = time.perf_counter() - started
    print(
        f"[DEBUG] _fetch_all_glimpses: Fetched {len(result_items)} glimpses in {elapsed:.2f}s"
    )

    return list(result_items)
112
+
113
+
114
+ def _format_exploration_results(
115
+ endpoint: str, result_items: list[dict[str, str]]
116
+ ) -> str:
117
+ """Format the exploration results as a readable string"""
118
+ base_url = "https://huggingface.co/docs"
119
+ url = f"{base_url}/{endpoint}"
120
+ result = f"Documentation structure for: {url}\n\n"
121
+ result += f"Found {len(result_items)} pages:\n\n"
122
+
123
+ for i, item in enumerate(result_items, 1):
124
+ result += f"{i}. **{item['title']}**\n"
125
+ result += f" URL: {item['url']}\n"
126
+ result += f" Glimpse: {item['glimpse']}\n\n"
127
+
128
+ return result
129
+
130
+
131
async def explore_hf_docs(hf_token: str, endpoint: str) -> str:
    """Main function to explore documentation structure"""
    started = time.perf_counter()
    print(f"[DEBUG] explore_hf_docs: Starting for endpoint '{endpoint}'")

    # Fetch the docs landing page and pull every sidebar link out of it.
    nav_data = _parse_sidebar_navigation(await _fetch_html_page(hf_token, endpoint))
    if not nav_data:
        raise ValueError(f"No navigation links found for endpoint '{endpoint}'")

    # Grab a short preview of every linked page concurrently, then render.
    formatted = _format_exploration_results(
        endpoint, await _fetch_all_glimpses(hf_token, nav_data)
    )

    print(f"[DEBUG] explore_hf_docs: Total time {time.perf_counter() - started:.2f}s")
    return formatted
155
+
156
+
157
async def explore_hf_docs_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
    """
    Explore the documentation structure for a given endpoint by parsing the sidebar navigation

    Args:
        arguments: Dictionary with 'endpoint' parameter (e.g., 'trl', 'transformers', etc.)

    Returns:
        Tuple of (structured_navigation_with_glimpses, success)
    """
    endpoint = arguments.get("endpoint", "")
    if not endpoint:
        return "Error: No endpoint provided", False

    # Auth token comes from the environment, never from tool arguments.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return "Error: HF_TOKEN environment variable not set", False

    try:
        return await explore_hf_docs(hf_token, endpoint.lstrip("/")), True
    except httpx.HTTPStatusError as e:
        return (
            f"HTTP error: {e.response.status_code} - {e.response.text[:200]}",
            False,
        )
    except httpx.RequestError as e:
        return f"Request error: {str(e)}", False
    except ValueError as e:
        return f"Error: {str(e)}", False
    except Exception as e:
        return f"Unexpected error: {str(e)}", False
195
+
196
+
197
async def _fetch_openapi_spec() -> dict[str, Any]:
    """Fetch and cache the HuggingFace OpenAPI specification"""
    global _openapi_spec_cache

    # Serve from the module-level cache after the first successful fetch.
    if _openapi_spec_cache is not None:
        print("[DEBUG] _fetch_openapi_spec: Using cached spec")
        return _openapi_spec_cache

    started = time.perf_counter()
    print("[DEBUG] _fetch_openapi_spec: Fetching from API")

    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        response = await client.get("https://huggingface.co/.well-known/openapi.json")
        response.raise_for_status()

    _openapi_spec_cache = response.json()

    elapsed = time.perf_counter() - started
    print(f"[DEBUG] _fetch_openapi_spec: Fetched and cached in {elapsed:.2f}s")

    return _openapi_spec_cache
221
+
222
+
223
+ def _extract_all_tags(spec: dict[str, Any]) -> list[str]:
224
+ """Extract all unique tags from the OpenAPI spec"""
225
+ tags = set()
226
+
227
+ # Get tags from the tags section
228
+ for tag_obj in spec.get("tags", []):
229
+ if "name" in tag_obj:
230
+ tags.add(tag_obj["name"])
231
+
232
+ # Also get tags from paths (in case some aren't in the tags section)
233
+ for path, path_item in spec.get("paths", {}).items():
234
+ for method, operation in path_item.items():
235
+ if method in ["get", "post", "put", "delete", "patch", "head", "options"]:
236
+ for tag in operation.get("tags", []):
237
+ tags.add(tag)
238
+
239
+ return sorted(list(tags))
240
+
241
+
242
+ def _search_openapi_by_tag(spec: dict[str, Any], tag: str) -> list[dict[str, Any]]:
243
+ """Search for API endpoints with a specific tag"""
244
+ results = []
245
+ paths = spec.get("paths", {})
246
+ servers = spec.get("servers", [])
247
+ base_url = (
248
+ servers[0].get("url", "https://huggingface.co")
249
+ if servers
250
+ else "https://huggingface.co"
251
+ )
252
+
253
+ for path, path_item in paths.items():
254
+ for method, operation in path_item.items():
255
+ if method not in [
256
+ "get",
257
+ "post",
258
+ "put",
259
+ "delete",
260
+ "patch",
261
+ "head",
262
+ "options",
263
+ ]:
264
+ continue
265
+
266
+ operation_tags = operation.get("tags", [])
267
+ if tag in operation_tags:
268
+ # Extract parameters
269
+ parameters = operation.get("parameters", [])
270
+ request_body = operation.get("requestBody", {})
271
+ responses = operation.get("responses", {})
272
+
273
+ results.append(
274
+ {
275
+ "path": path,
276
+ "method": method.upper(),
277
+ "operationId": operation.get("operationId", ""),
278
+ "summary": operation.get("summary", ""),
279
+ "description": operation.get("description", ""),
280
+ "parameters": parameters,
281
+ "request_body": request_body,
282
+ "responses": responses,
283
+ "base_url": base_url,
284
+ }
285
+ )
286
+
287
+ return results
288
+
289
+
290
+ def _generate_curl_example(endpoint: dict[str, Any]) -> str:
291
+ """Generate a curl command example for an endpoint"""
292
+ method = endpoint["method"]
293
+ path = endpoint["path"]
294
+ base_url = endpoint["base_url"]
295
+
296
+ # Build the full URL with example path parameters
297
+ full_path = path
298
+ for param in endpoint.get("parameters", []):
299
+ if param.get("in") == "path" and param.get("required"):
300
+ param_name = param["name"]
301
+ example = param.get(
302
+ "example", param.get("schema", {}).get("example", f"<{param_name}>")
303
+ )
304
+ full_path = full_path.replace(f"{{{param_name}}}", str(example))
305
+
306
+ curl = f"curl -X {method} \\\n '{base_url}{full_path}'"
307
+
308
+ # Add query parameters if any
309
+ query_params = [p for p in endpoint.get("parameters", []) if p.get("in") == "query"]
310
+ if query_params and query_params[0].get("required"):
311
+ param = query_params[0]
312
+ example = param.get("example", param.get("schema", {}).get("example", "value"))
313
+ curl += f"?{param['name']}={example}"
314
+
315
+ # Add headers
316
+ curl += " \\\n -H 'Authorization: Bearer $HF_TOKEN'"
317
+
318
+ # Add request body if applicable
319
+ if method in ["POST", "PUT", "PATCH"] and endpoint.get("request_body"):
320
+ content = endpoint["request_body"].get("content", {})
321
+ if "application/json" in content:
322
+ curl += " \\\n -H 'Content-Type: application/json'"
323
+ schema = content["application/json"].get("schema", {})
324
+ example = schema.get("example", "{}")
325
+ if isinstance(example, dict):
326
+ import json
327
+
328
+ example = json.dumps(example, indent=2)
329
+ curl += f" \\\n -d '{example}'"
330
+
331
+ return curl
332
+
333
+
334
+ def _format_parameters(parameters: list[dict[str, Any]]) -> str:
335
+ """Format parameter information from OpenAPI spec"""
336
+ if not parameters:
337
+ return ""
338
+
339
+ # Group parameters by type
340
+ path_params = [p for p in parameters if p.get("in") == "path"]
341
+ query_params = [p for p in parameters if p.get("in") == "query"]
342
+ header_params = [p for p in parameters if p.get("in") == "header"]
343
+
344
+ output = []
345
+
346
+ if path_params:
347
+ output.append("**Path Parameters:**")
348
+ for param in path_params:
349
+ name = param.get("name", "")
350
+ required = " (required)" if param.get("required") else " (optional)"
351
+ description = param.get("description", "")
352
+ param_type = param.get("schema", {}).get("type", "string")
353
+ example = param.get("example") or param.get("schema", {}).get("example", "")
354
+
355
+ output.append(f"- `{name}` ({param_type}){required}: {description}")
356
+ if example:
357
+ output.append(f" Example: `{example}`")
358
+
359
+ if query_params:
360
+ if output:
361
+ output.append("")
362
+ output.append("**Query Parameters:**")
363
+ for param in query_params:
364
+ name = param.get("name", "")
365
+ required = " (required)" if param.get("required") else " (optional)"
366
+ description = param.get("description", "")
367
+ param_type = param.get("schema", {}).get("type", "string")
368
+ example = param.get("example") or param.get("schema", {}).get("example", "")
369
+
370
+ output.append(f"- `{name}` ({param_type}){required}: {description}")
371
+ if example:
372
+ output.append(f" Example: `{example}`")
373
+
374
+ if header_params:
375
+ if output:
376
+ output.append("")
377
+ output.append("**Header Parameters:**")
378
+ for param in header_params:
379
+ name = param.get("name", "")
380
+ required = " (required)" if param.get("required") else " (optional)"
381
+ description = param.get("description", "")
382
+
383
+ output.append(f"- `{name}`{required}: {description}")
384
+
385
+ return "\n".join(output)
386
+
387
+
388
+ def _format_response_info(responses: dict[str, Any]) -> str:
389
+ """Format response information from OpenAPI spec"""
390
+ if not responses:
391
+ return "No response information available"
392
+
393
+ output = []
394
+ for status_code, response_obj in list(responses.items())[
395
+ :3
396
+ ]: # Show first 3 status codes
397
+ desc = response_obj.get("description", "")
398
+ output.append(f"- **{status_code}**: {desc}")
399
+
400
+ content = response_obj.get("content", {})
401
+ if "application/json" in content:
402
+ schema = content["application/json"].get("schema", {})
403
+ if "type" in schema:
404
+ output.append(f" Returns: {schema.get('type', 'object')}")
405
+
406
+ return "\n".join(output)
407
+
408
+
409
def _format_openapi_results(results: list[dict[str, Any]], tag: str) -> str:
    """Format OpenAPI search results as markdown with curl examples"""
    if not results:
        return f"No API endpoints found with tag '{tag}'"

    parts = [
        f"# API Endpoints for tag: `{tag}`\n\n",
        f"Found {len(results)} endpoint(s)\n\n",
        "---\n\n",
    ]

    for i, endpoint in enumerate(results, 1):
        parts.append(f"## {i}. {endpoint['method']} {endpoint['path']}\n\n")

        if endpoint["summary"]:
            parts.append(f"**Summary:** {endpoint['summary']}\n\n")

        if endpoint["description"]:
            # Long descriptions are truncated to keep the listing readable.
            desc = endpoint["description"][:300]
            if len(endpoint["description"]) > 300:
                desc += "..."
            parts.append(f"**Description:** {desc}\n\n")

        params_info = _format_parameters(endpoint.get("parameters", []))
        if params_info:
            parts.append(params_info + "\n\n")

        parts.append("**Usage:**\n```bash\n")
        parts.append(_generate_curl_example(endpoint))
        parts.append("\n```\n\n")

        parts.append("**Returns:**\n")
        parts.append(_format_response_info(endpoint["responses"]))
        parts.append("\n\n")

        parts.append("---\n\n")

    return "".join(parts)
448
+
449
+
450
async def search_openapi_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
    """
    Search the HuggingFace OpenAPI specification by tag

    Args:
        arguments: Dictionary with 'tag' parameter

    Returns:
        Tuple of (search_results, success)
    """
    started = time.perf_counter()
    tag = arguments.get("tag", "")
    print(f"[DEBUG] search_openapi: Starting for tag '{tag}'")

    if not tag:
        return "Error: No tag provided", False

    try:
        # Spec is cached at module level after the first fetch.
        spec = await _fetch_openapi_spec()
        formatted = _format_openapi_results(_search_openapi_by_tag(spec, tag), tag)

        print(f"[DEBUG] search_openapi: Total time {time.perf_counter() - started:.2f}s")
        return formatted, True
    except httpx.HTTPStatusError as e:
        return f"HTTP error fetching OpenAPI spec: {e.response.status_code}", False
    except httpx.RequestError as e:
        return f"Request error: {str(e)}", False
    except Exception as e:
        return f"Error searching OpenAPI spec: {str(e)}", False
488
+
489
+
490
async def hf_docs_fetch_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
    """
    Fetch full documentation content from a specific HF docs page

    Args:
        arguments: Dictionary with 'url' parameter (full URL to the doc page)

    Returns:
        Tuple of (full_markdown_content, success)
    """
    started = time.perf_counter()
    url = arguments.get("url", "")
    print(f"[DEBUG] fetch_hf_docs: Starting for URL '{url}'")

    if not url:
        return "Error: No URL provided", False

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        return (
            "Error: HF_TOKEN environment variable not set",
            False,
        )

    # The Hub serves raw markdown when ".md" is appended to a docs URL.
    if not url.endswith(".md"):
        url = f"{url}.md"

    try:
        fetch_started = time.perf_counter()
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            response = await client.get(
                url, headers={"Authorization": f"Bearer {hf_token}"}
            )
            response.raise_for_status()

        content = response.text
        print(
            f"[DEBUG] fetch_hf_docs: Fetched {len(content) / 1024:.1f}KB in {time.perf_counter() - fetch_started:.2f}s"
        )

        print(f"[DEBUG] fetch_hf_docs: Total time {time.perf_counter() - started:.2f}s")
        # Return the markdown content directly, prefixed with its source URL.
        return f"Documentation from: {url}\n\n{content}", True
    except httpx.HTTPStatusError as e:
        return (
            f"HTTP error fetching {url}: {e.response.status_code} - {e.response.text[:200]}",
            False,
        )
    except httpx.RequestError as e:
        return f"Request error fetching {url}: {str(e)}", False
    except Exception as e:
        return f"Error fetching documentation: {str(e)}", False
554
+
555
+
556
# Tool specifications for the search sub-agent

# Static LLM tool spec for the docs-exploration tool.
# NOTE(review): the "endpoint" enum below presumably mirrors the endpoints
# accepted by explore_hf_docs_handler — confirm and keep the two in sync
# whenever a documentation section is added or removed.
EXPLORE_HF_DOCS_TOOL_SPEC = {
    "name": "explore_hf_docs",
    "description": (
        "Explore the Hugging Face documentation at a glance. "
        "Select an endpoint from the available options and get a list of all documentation pages "
        "with their titles, URLs, and a 300-character glimpse of each page. "
        "Use this to discover what documentation is available before fetching specific pages."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "endpoint": {
                "type": "string",
                # One enum value per major section of hf.co/docs.
                "enum": [
                    "hub",
                    "transformers",
                    "diffusers",
                    "datasets",
                    "gradio",
                    "trackio",
                    "smolagents",
                    "huggingface_hub",
                    "huggingface.js",
                    "transformers.js",
                    "inference-providers",
                    "inference-endpoints",
                    "peft",
                    "accelerate",
                    "optimum",
                    "optimum-habana",
                    "optimum-neuron",
                    "optimum-intel",
                    "optimum-executorch",
                    "optimum-tpu",
                    "tokenizers",
                    "llm-course",
                    "robotics-course",
                    "mcp-course",
                    "smol-course",
                    "agents-course",
                    "deep-rl-course",
                    "computer-vision-course",
                    "evaluate",
                    "tasks",
                    "dataset-viewer",
                    "trl",
                    "simulate",
                    "sagemaker",
                    "timm",
                    "safetensors",
                    "tgi",
                    "setfit",
                    "audio-course",
                    "lerobot",
                    "autotrain",
                    "tei",
                    "bitsandbytes",
                    "cookbook",
                    "sentence_transformers",
                    "ml-games-course",
                    "diffusion-course",
                    "ml-for-3d-course",
                    "chat-ui",
                    "leaderboards",
                    "lighteval",
                    "argilla",
                    "distilabel",
                    "microsoft-azure",
                    "kernels",
                    "google-cloud",
                ],
                # Per-endpoint one-line summaries shown to the model so it can
                # pick the right section without an extra round-trip.
                "description": (
                    "The documentation endpoint to explore. Each endpoint corresponds to a major section of the Hugging Face documentation:\n\n"
                    "• hub — Find answers to questions about models/datasets/spaces, auth, versioning, metadata.\n"
                    "• transformers — Core model library: architectures, configs, tokenizers, training & inference APIs.\n"
                    "• diffusers — Diffusion pipelines, schedulers, fine-tuning, training, and deployment patterns.\n"
                    "• datasets — Dataset loading, streaming, processing, Arrow format, Hub integration.\n"
                    "• gradio — UI components and demos for interacting with ML models.\n"
                    "• trackio — Experiment tracking, metrics logging, and run comparison.\n"
                    "• smolagents — Lightweight agent abstractions and tool-using patterns.\n"
                    "• huggingface_hub — Python client for Hub operations (auth, upload/download, repo management).\n"
                    "• huggingface.js — JS/TS client for Hub APIs in browser and Node.\n"
                    "• transformers.js — Run Transformer models in browser/Node via WebGPU/WASM.\n"
                    "• inference-providers — Unified interface for third-party inference backends.\n"
                    "• inference-endpoints — Managed, scalable model deployments on HF infrastructure.\n"
                    "• peft — Parameter-efficient fine-tuning methods (LoRA, adapters, etc.).\n"
                    "• accelerate — Hardware-agnostic, distributed and mixed-precision training orchestration.\n"
                    "• optimum — Hardware-aware optimization and model export tooling.\n"
                    "• optimum-habana — Training and inference on Habana Gaudi accelerators.\n"
                    "• optimum-neuron — Optimization workflows for AWS Inferentia/Trainium.\n"
                    "• optimum-intel — Intel CPU/GPU optimizations (OpenVINO, IPEX).\n"
                    "• optimum-executorch — Exporting models to ExecuTorch for edge/mobile.\n"
                    "• optimum-tpu — TPU-specific training and optimization paths.\n"
                    "• tokenizers — Fast tokenizer internals, training, and low-level APIs.\n"
                    "• llm-course — End-to-end LLM concepts, training, and deployment.\n"
                    "• robotics-course — Learning-based robotics foundations.\n"
                    "• mcp-course — Model Context Protocol concepts and usage.\n"
                    "• smol-course — Small-model and efficiency-focused workflows.\n"
                    "• agents-course — Tool-using, planning, and multi-step agent design.\n"
                    "• deep-rl-course — Deep reinforcement learning foundations.\n"
                    "• computer-vision-course — Vision models, datasets, and pipelines.\n"
                    "• evaluate — Metrics, evaluation workflows, and training-loop integration.\n"
                    "• tasks — Canonical task definitions and model categorization.\n"
                    "• dataset-viewer — Dataset preview, streaming views, and viewer internals.\n"
                    "• trl — RLHF, DPO, PPO, and SFT utilities for LLMs.\n"
                    "• simulate — Experimental simulation tools and workflows.\n"
                    "• sagemaker — Deploying Hugging Face models on AWS SageMaker.\n"
                    "• timm — Image model zoo and utilities via HF integrations.\n"
                    "• safetensors — Safe, fast tensor serialization format.\n"
                    "• tgi — High-throughput text generation server for LLMs.\n"
                    "• setfit — Few-shot text classification via sentence embeddings.\n"
                    "• audio-course — Speech and audio models, datasets, and tasks.\n"
                    "• lerobot — Robotics datasets, policies, and learning workflows.\n"
                    "• autotrain — No/low-code model training on Hugging Face.\n"
                    "• tei — Optimized inference server for embedding workloads.\n"
                    "• bitsandbytes — Quantization and memory-efficient optimizers.\n"
                    "• cookbook — Practical, task-oriented recipes across the ecosystem.\n"
                    "• sentence_transformers — Embedding models, training recipes, similarity/search workflows.\n"
                    "• ml-games-course — Game-based ML and reinforcement learning experiments.\n"
                    "• diffusion-course — Diffusion model theory and hands-on practice.\n"
                    "• ml-for-3d-course — 3D representations, models, and learning techniques.\n"
                    "• chat-ui — Reference chat interfaces for LLM deployment.\n"
                    "• leaderboards — Evaluation leaderboards and submission mechanics.\n"
                    "• lighteval — Lightweight, reproducible LLM evaluation framework.\n"
                    "• argilla — Data annotation, feedback, and human-in-the-loop workflows.\n"
                    "• distilabel — Synthetic data generation and distillation pipelines.\n"
                    "• microsoft-azure — Azure deployment and integration guides.\n"
                    "• kernels — Lightweight execution environments and notebook-style workflows.\n"
                    "• google-cloud — GCP deployment and serving workflows.\n"
                ),
            },
        },
        "required": ["endpoint"],
    },
}
693
+
694
# Static LLM tool spec for fetching one documentation page in full.
# Pairs with fetch_hf_docs handler above; explore_hf_docs is the discovery step.
HF_DOCS_FETCH_TOOL_SPEC = {
    "name": "fetch_hf_docs",
    "description": (
        "Fetch the full content of a specific HF documentation page. "
        "Provide the full URL to the doc page (e.g., from explore_hf_docs results). "
        "Returns the complete markdown content of that page. "
        "Use explore_hf_docs first to discover available pages."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "url": {
                "type": "string",
                "description": (
                    "The full URL to the documentation page. "
                    "Example: 'https://huggingface.co/docs/trl/dpo_trainer' "
                    "The .md extension will be added automatically if not present."
                ),
            },
        },
        "required": ["url"],
    },
}
717
+
718
+
719
async def _get_api_search_tool_spec() -> dict[str, Any]:
    """Build the OpenAPI-search tool spec with its tag enum populated at runtime.

    The set of valid tags lives in the live HuggingFace OpenAPI specification,
    so this helper must be awaited: it fetches the spec and extracts every tag
    before the spec dict can be assembled.
    """
    openapi_spec = await _fetch_openapi_spec()
    available_tags = _extract_all_tags(openapi_spec)

    # The only parameter is the tag; its enum is the dynamic part of the spec.
    tag_property = {
        "type": "string",
        "enum": available_tags,
        "description": (
            "The API tag to search for. Each tag groups related API endpoints. "
        ),
    }

    return {
        "name": "search_hf_api_endpoints",
        "description": (
            "Search the HuggingFace OpenAPI specification by tag to find related API endpoints. "
            "Returns all endpoints with the specified tag including curl examples showing how to use them. "
            "Each result includes the endpoint path, summary, usage example with curl, and response information."
        ),
        "parameters": {
            "type": "object",
            "properties": {"tag": tag_property},
            "required": ["tag"],
        },
    }
agent/tools/search_docs_tool.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Search documentation tool that spawns a sub-agent
3
+ The sub-agent has its own agent loop and set of specialized search tools
4
+ """
5
+
6
+ import asyncio
7
+ from typing import Any
8
+
9
+ from litellm.utils import get_max_tokens
10
+
11
+ from agent.core.session import Session
12
+
13
+
14
async def create_search_tool_router(github_mcp_config: dict[str, Any] | None = None):
    """
    Create a ToolRouter instance for the search sub-agent.

    Async because the OpenAPI tool needs to fetch and parse the live spec at
    initialization (via make_search_agent_tools).

    Args:
        github_mcp_config: Optional GitHub MCP server configuration, in the
            shape expected by fastmcp's ``Client`` (a mapping of server name
            to server config).

    Returns:
        A fully initialized SearchDocsToolRouter (subclass of ToolRouter)
        with the built-in search tools registered. GitHub MCP tools are
        registered later, when the router's async context is entered.
    """
    # Import at runtime to avoid circular dependency
    from fastmcp import Client

    from agent.core.tools import ToolRouter

    # Allow-list of GitHub MCP tools: read-only search/browse operations only,
    # so the sub-agent cannot mutate repositories.
    ALLOWED_GITHUB_TOOLS = {
        "list_pull_requests",
        "list_issues",
        "search_code",
        "search_issues",
        "search_repositories",
        "search_users",
        "get_pull_request_status",
        "get_pull_request_reviews",
        "get_pull_request",
        "get_issue",
        "get_file_contents",
    }

    class SearchDocsToolRouter(ToolRouter):
        """Specialized ToolRouter for the search sub-agent."""

        def __init__(self, github_mcp_config: dict[str, Any] | None = None):
            # NOTE(review): intentionally does not call super().__init__();
            # presumably mirrors the base ToolRouter's attributes — confirm
            # against agent.core.tools.ToolRouter.
            self.tools: dict[str, Any] = {}
            self.mcp_servers: dict[str, dict[str, Any]] = {}
            self._mcp_initialized = False

            # Initialize MCP client with GitHub server if provided; without a
            # config the router runs with only the built-in search tools.
            if github_mcp_config:
                self.mcp_client = Client({"mcpServers": github_mcp_config})
            else:
                self.mcp_client = None

        async def initialize_tools(self):
            """Register the built-in search tools (explore/fetch/OpenAPI search)."""
            tools = await make_search_agent_tools()
            for tool in tools:
                self.register_tool(tool)

        async def register_mcp_tools(self) -> None:
            """Register only the allow-listed GitHub MCP tools."""
            if self.mcp_client is None:
                return

            tools = await self.mcp_client.list_tools()
            for tool in tools:
                # Skip anything outside the read-only allow-list above.
                if tool.name in ALLOWED_GITHUB_TOOLS:
                    print(f"Registering GitHub MCP Tool: {tool.name}")
                    from agent.core.tools import ToolSpec

                    # handler=None: MCP tools are dispatched through the MCP
                    # client rather than a local handler function.
                    self.register_tool(
                        ToolSpec(
                            name=tool.name,
                            description=tool.description,
                            parameters=tool.inputSchema,
                            handler=None,
                        )
                    )

    router = SearchDocsToolRouter(github_mcp_config)
    await router.initialize_tools()
    return router
86
+
87
+
88
async def search_docs_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
    """
    Handler that spawns a sub-agent to perform a comprehensive docs search.

    Builds a fresh session (own event queue, config, tool router, and context
    manager), runs the sub-agent loop to completion, and returns its final
    answer. All failures are caught and reported as an error string rather
    than raised, matching the (message, success) tool-handler contract.

    Args:
        arguments: dictionary with a 'query' parameter.

    Returns:
        Tuple of (search_results, success).
    """
    query = arguments.get("query", "")

    if not query:
        return "Error: No search query provided", False

    try:
        # Import at runtime to avoid circular dependency
        from pathlib import Path

        from agent.config import load_config
        from agent.context_manager.manager import ContextManager
        from agent.core.agent_loop import Handlers

        # The sub-agent gets its own event queue, isolated from the parent's.
        sub_event_queue = asyncio.Queue()

        # Load the search agent's own config file (includes the GitHub MCP
        # server entry); path is relative to the repository root.
        search_agent_config_path = (
            Path(__file__).parent.parent.parent / "configs" / "_subagent_config_search_agent.json"
        )
        search_agent_config = load_config(search_agent_config_path)

        # Extract the GitHub MCP server config, if one is declared, and
        # re-wrap it in the {name: config} shape the router expects.
        github_mcp_config = None
        if (
            search_agent_config.mcpServers
            and "github" in search_agent_config.mcpServers
        ):
            github_server = search_agent_config.mcpServers["github"]
            github_mcp_config = {"github": github_server.model_dump()}

        # Specialized tool router carrying the search tools + GitHub MCP tools.
        search_tool_router = await create_search_tool_router(github_mcp_config)

        async with search_tool_router:
            # The sub-agent's system prompt is loaded from the YAML file in the
            # prompts folder via ContextManager's prompt_file_suffix parameter.
            # This is intentional and works as-is — do not change it.
            # NOTE: MCP tools are registered during __aenter__, so tool specs
            # must be retrieved AFTER entering the router's context.
            sub_session = Session(
                event_queue=sub_event_queue,
                config=search_agent_config,
                tool_router=search_tool_router,
                context_manager=ContextManager(
                    tool_specs=search_tool_router.get_tool_specs_for_llm(),
                    max_context=get_max_tokens(search_agent_config.model_name),
                    compact_size=0.1,
                    untouched_messages=5,
                    prompt_file_suffix="search_docs_system_prompt.yaml",
                ),
            )

            # Run the sub-agent loop to completion (bounded at 30 iterations).
            result = await Handlers.run_agent(
                session=sub_session, text=query, max_iterations=30
            )

            # Relay the sub-agent's final message back to the calling agent.
            if result:
                return f"Search Results:\n\n{result}", True
            else:
                return "Search completed but no results were generated", False
    except Exception as e:
        # Tool handlers must not raise: surface the failure as a tool error.
        return f"Error in search_docs tool: {str(e)}", False
162
+
163
+
164
# Tool specification to be used by the main agent.
# Exposes the whole search sub-agent as a single "search_docs" tool; the
# handler is search_docs_handler above.
SEARCH_DOCS_TOOL_SPEC = {
    "name": "search_docs",
    "description": (
        "Intelligently search HF documentation for libraries, repositories, and best practices with an agent that has access to: explore_hf_docs, fetch_hf_docs, search_hf_api_endpoints. "
        "The agent acts like your personal search assistant. "
        "Using the search agent is necessary to give the best quality answer to the user's question. Most questions require a search to get the best information on code examples.\n\n"
        "WHEN TO USE THIS TOOL:\n"
        " - When searching for high-level concepts like 'how to do GRPO training on a model?' or 'best way to do inference on a trained model?'\n"
        " - When you need to get code examples for intricate ML code patterns like training loops, inference pipelines, data processing, etc.\n\n"
        "USAGE GUIDELINES:\n"
        " 1. Launch multiple agents concurrently for better performance.\n"
        " 2. Be specific in your query - include exact terminology, expected file locations, or code patterns.\n"
        " 3. Use the query as if you were talking to another engineer. Bad: logger impl Good: where is the logger implemented, we're trying to find out how to log to files.\n"
        " 4. Make sure to formulate the query in such a way that the agent knows when it's done or has found the result."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": (
                    "The search query describing to the agent what it should do. Be "
                    "specific and include technical terms, file types, or expected "
                    "code patterns to help the agent find relevant code. Formulate "
                    "the query in a way that makes it clear to the agent when it "
                    "has found the right thing."
                ),
            },
        },
        "required": ["query"],
    },
}
197
+
198
+
199
async def make_search_agent_tools():
    """Assemble the ToolSpec list for the search sub-agent.

    Async because the OpenAPI tool spec is populated from the live
    specification at call time.
    """
    # Import at runtime to avoid circular dependency
    from agent.core.tools import ToolSpec
    from agent.tools._search_agent_tools import (
        EXPLORE_HF_DOCS_TOOL_SPEC,
        HF_DOCS_FETCH_TOOL_SPEC,
        _get_api_search_tool_spec,
        explore_hf_docs_handler,
        hf_docs_fetch_handler,
        search_openapi_handler,
    )

    # The OpenAPI search spec carries a dynamically populated tag enum.
    api_search_spec = await _get_api_search_tool_spec()

    # Pair each spec dict with its handler, then build ToolSpecs uniformly.
    spec_handler_pairs = [
        (EXPLORE_HF_DOCS_TOOL_SPEC, explore_hf_docs_handler),
        (HF_DOCS_FETCH_TOOL_SPEC, hf_docs_fetch_handler),
        (api_search_spec, search_openapi_handler),
    ]
    return [
        ToolSpec(
            name=spec["name"],
            description=spec["description"],
            parameters=spec["parameters"],
            handler=handler,
        )
        for spec, handler in spec_handler_pairs
    ]
configs/_subagent_config_search_agent.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "anthropic/claude-haiku-4-5",
3
+ "mcpServers": {
4
+ "github": {
5
+ "transport": "http",
6
+ "url": "https://api.githubcopilot.com/mcp/",
7
+ "headers": {
8
+ "Authorization": "Bearer ${GITHUB_TOKEN}"
9
+ }
10
+ }
11
+ }
12
+ }
agent/config_mcp_example.json → configs/main_agent_config.json RENAMED
@@ -1,7 +1,5 @@
1
  {
2
  "model_name": "anthropic/claude-sonnet-4-5-20250929",
3
- "tools": [],
4
- "system_prompt_path": "",
5
  "mcpServers": {
6
  "hf-mcp-server": {
7
  "transport": "http",
 
1
  {
2
  "model_name": "anthropic/claude-sonnet-4-5-20250929",
 
 
3
  "mcpServers": {
4
  "hf-mcp-server": {
5
  "transport": "http",
pyproject.toml CHANGED
@@ -20,4 +20,5 @@ dependencies = [
20
  "transformers>=2.3.0",
21
  "torch>=2.9.1",
22
  "pytest>=9.0.2",
 
23
  ]
 
20
  "transformers>=2.3.0",
21
  "torch>=2.9.1",
22
  "pytest>=9.0.2",
23
+ "trafilatura>=2.0.0",
24
  ]
run_search_agent.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Standalone test script for the search sub-agent
3
Run with: uv run python run_search_agent.py
4
+ """
5
+
6
+ import asyncio
7
+
8
+ from litellm.utils import get_max_tokens
9
+
10
+ from agent.config import Config
11
+ from agent.context_manager.manager import ContextManager
12
+ from agent.core.agent_loop import Handlers
13
+ from agent.core.session import Session
14
+ from agent.tools.search_docs_tool import create_search_tool_router
15
+
16
+
17
async def test_search_agent(query: str):
    """Run the search sub-agent once against *query*, streaming its events.

    Builds a standalone session (queue, config, router, context manager),
    runs the agent loop while a concurrent monitor prints every event, and
    finally prints the agent's result. Network access and API credentials
    are required — this is an interactive smoke test, not a unit test.

    Args:
        query: The natural-language search request to hand to the sub-agent.
    """
    print(f"Testing search agent with query: {query}\n")
    print("=" * 60)

    # Dedicated event queue so the monitor below sees only this run's events.
    sub_event_queue = asyncio.Queue()

    # Router with the built-in search tools (no GitHub MCP config here).
    search_tool_router = await create_search_tool_router()

    # Minimal config: only the model name is needed for this test.
    sub_config = Config(
        model_name="anthropic/claude-haiku-4-5",
    )

    # NOTE: MCP tools register on __aenter__; here tool specs are taken before
    # entering the router context, so only the built-in tools are advertised.
    sub_session = Session(
        event_queue=sub_event_queue,
        config=sub_config,
        tool_router=search_tool_router,
        context_manager=ContextManager(
            tool_specs=search_tool_router.get_tool_specs_for_llm(),
            max_context=get_max_tokens(sub_config.model_name),
            compact_size=0.1,
            untouched_messages=5,
            prompt_file_suffix="search_docs_system_prompt.yaml",
        ),
    )

    # Event listener that prints what the sub-agent is doing, until the
    # turn completes or an unexpected error occurs.
    async def event_monitor():
        while True:
            try:
                event = await asyncio.wait_for(sub_event_queue.get(), timeout=1.0)

                if event.event_type == "assistant_message":
                    content = event.data.get("content", "") if event.data else ""
                    if content:
                        print(f"\n🤖 Sub-agent: {content}\n")

                elif event.event_type == "tool_call":
                    tool_name = event.data.get("tool", "") if event.data else ""
                    arguments = event.data.get("arguments", {}) if event.data else {}
                    print(f"🔧 Tool call: {tool_name}")
                    print(f"   Args: {arguments}")

                elif event.event_type == "tool_output":
                    output = event.data.get("output", "") if event.data else ""
                    success = event.data.get("success", False) if event.data else False
                    status = "✅" if success else "❌"

                    print(f"{status} Tool output: {output}\n")

                elif event.event_type == "turn_complete":
                    print("✅ Sub-agent turn complete")
                    break

            except asyncio.TimeoutError:
                # No event within 1s — keep polling while the agent runs.
                continue
            except Exception as e:
                print(f"⚠️ Event error: {e}")
                break

    # Run the sub-agent and the event monitor concurrently.
    async with search_tool_router:
        monitor_task = asyncio.create_task(event_monitor())

        result = await Handlers.run_agent(
            session=sub_session, text=query, max_iterations=30
        )

        # Give the monitor up to 5s to drain remaining events and exit.
        await asyncio.wait_for(monitor_task, timeout=5.0)

    print("\n" + "=" * 60)
    print("FINAL RESULT:")
    print("=" * 60)
    if result:
        print(result)
    else:
        print("No result returned")
    print("=" * 60)
101
+
102
+
103
async def main():
    """Drive the interactive test: run each sample query through the sub-agent."""
    print("🧪 Search Sub-Agent Test\n")

    # Sample queries; the commented-out ones are alternative scenarios.
    test_queries = [
        # "Explore the TRL documentation structure and find information about DPO trainer",
        # "is there a way to get the logs from a served huggingface space",
        # "How do I train GLM4.7 with a GRPO training loop with trl with llm judge as a reward model for training on hle?"
        "can i stream logs through the api for a served huggingface space",
    ]

    total = len(test_queries)
    banner = "=" * 60

    for test_number, test_query in enumerate(test_queries, 1):
        print(f"\n{banner}")
        print(f"TEST {test_number}/{total}")
        print(f"{banner}\n")

        try:
            await test_search_agent(test_query)
        except Exception as exc:
            print(f"\n❌ Test failed: {exc}")
            import traceback

            traceback.print_exc()

        # Pause between queries so output can be inspected before moving on.
        if test_number < total:
            print("\n\nPress Enter to continue to next test...")
            input()
131
+
132
+
133
# Script entry point: run all sample queries, handling Ctrl-C gracefully and
# printing a full traceback for any other failure.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl-C mid-run is expected during interactive testing.
        print("\n\n⚠️ Test interrupted")
    except Exception as e:
        # Last-resort handler so a crash prints its traceback instead of
        # dying silently.
        print(f"\n❌ Error: {e}")
        import traceback

        traceback.print_exc()
uv.lock CHANGED
@@ -230,6 +230,15 @@ wheels = [
230
  { url = "https://files.pythonhosted.org/packages/f8/aa/5082412d1ee302e9e7d80b6949bc4d2a8fa1149aaab610c5fc24709605d6/authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a", size = 243608, upload-time = "2025-10-02T13:36:07.637Z" },
231
  ]
232
 
 
 
 
 
 
 
 
 
 
233
  [[package]]
234
  name = "beartype"
235
  version = "0.22.6"
@@ -433,6 +442,20 @@ wheels = [
433
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
434
  ]
435
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  [[package]]
437
  name = "cryptography"
438
  version = "46.0.3"
@@ -529,6 +552,21 @@ wheels = [
529
  { url = "https://files.pythonhosted.org/packages/3b/5e/6f8d874366788ad5d549e9ba258037d974dda6e004843be1bda794571701/datasets-4.4.1-py3-none-any.whl", hash = "sha256:c1163de5211e42546079ab355cc0250c7e6db16eb209ac5ac6252f801f596c44", size = 511591, upload-time = "2025-11-05T16:00:36.365Z" },
530
  ]
531
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  [[package]]
533
  name = "debugpy"
534
  version = "1.8.17"
@@ -898,6 +936,7 @@ dependencies = [
898
  { name = "requests" },
899
  { name = "tenacity" },
900
  { name = "torch" },
 
901
  { name = "transformers" },
902
  ]
903
 
@@ -917,6 +956,7 @@ requires-dist = [
917
  { name = "requests", specifier = ">=2.32.5" },
918
  { name = "tenacity", specifier = ">=8.0.0" },
919
  { name = "torch", specifier = ">=2.9.1" },
 
920
  { name = "transformers", specifier = ">=2.3.0" },
921
  ]
922
 
@@ -949,6 +989,22 @@ wheels = [
949
  { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" },
950
  ]
951
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
952
  [[package]]
953
  name = "httpcore"
954
  version = "1.0.9"
@@ -1391,6 +1447,18 @@ wheels = [
1391
  { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
1392
  ]
1393
 
 
 
 
 
 
 
 
 
 
 
 
 
1394
  [[package]]
1395
  name = "keyring"
1396
  version = "25.7.0"
@@ -1497,6 +1565,103 @@ all = [
1497
  { name = "opentelemetry-instrumentation-weaviate" },
1498
  ]
1499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1500
  [[package]]
1501
  name = "markdown-it-py"
1502
  version = "4.0.0"
@@ -3699,6 +3864,15 @@ wheels = [
3699
  { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
3700
  ]
3701
 
 
 
 
 
 
 
 
 
 
3702
  [[package]]
3703
  name = "tokenizers"
3704
  version = "0.22.1"
@@ -3788,6 +3962,24 @@ wheels = [
3788
  { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
3789
  ]
3790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3791
  [[package]]
3792
  name = "transformers"
3793
  version = "2.3.0"
@@ -3861,6 +4053,18 @@ wheels = [
3861
  { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
3862
  ]
3863
 
 
 
 
 
 
 
 
 
 
 
 
 
3864
  [[package]]
3865
  name = "uc-micro-py"
3866
  version = "1.0.3"
 
230
  { url = "https://files.pythonhosted.org/packages/f8/aa/5082412d1ee302e9e7d80b6949bc4d2a8fa1149aaab610c5fc24709605d6/authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a", size = 243608, upload-time = "2025-10-02T13:36:07.637Z" },
231
  ]
232
 
233
+ [[package]]
234
+ name = "babel"
235
+ version = "2.17.0"
236
+ source = { registry = "https://pypi.org/simple" }
237
+ sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" }
238
+ wheels = [
239
+ { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" },
240
+ ]
241
+
242
  [[package]]
243
  name = "beartype"
244
  version = "0.22.6"
 
442
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
443
  ]
444
 
445
+ [[package]]
446
+ name = "courlan"
447
+ version = "1.3.2"
448
+ source = { registry = "https://pypi.org/simple" }
449
+ dependencies = [
450
+ { name = "babel" },
451
+ { name = "tld" },
452
+ { name = "urllib3" },
453
+ ]
454
+ sdist = { url = "https://files.pythonhosted.org/packages/6f/54/6d6ceeff4bed42e7a10d6064d35ee43a810e7b3e8beb4abeae8cff4713ae/courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190", size = 206382, upload-time = "2024-10-29T16:40:20.994Z" }
455
+ wheels = [
456
+ { url = "https://files.pythonhosted.org/packages/8e/ca/6a667ccbe649856dcd3458bab80b016681b274399d6211187c6ab969fc50/courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be", size = 33848, upload-time = "2024-10-29T16:40:18.325Z" },
457
+ ]
458
+
459
  [[package]]
460
  name = "cryptography"
461
  version = "46.0.3"
 
552
  { url = "https://files.pythonhosted.org/packages/3b/5e/6f8d874366788ad5d549e9ba258037d974dda6e004843be1bda794571701/datasets-4.4.1-py3-none-any.whl", hash = "sha256:c1163de5211e42546079ab355cc0250c7e6db16eb209ac5ac6252f801f596c44", size = 511591, upload-time = "2025-11-05T16:00:36.365Z" },
553
  ]
554
 
555
+ [[package]]
556
+ name = "dateparser"
557
+ version = "1.2.2"
558
+ source = { registry = "https://pypi.org/simple" }
559
+ dependencies = [
560
+ { name = "python-dateutil" },
561
+ { name = "pytz" },
562
+ { name = "regex" },
563
+ { name = "tzlocal" },
564
+ ]
565
+ sdist = { url = "https://files.pythonhosted.org/packages/a9/30/064144f0df1749e7bb5faaa7f52b007d7c2d08ec08fed8411aba87207f68/dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7", size = 329840, upload-time = "2025-06-26T09:29:23.211Z" }
566
+ wheels = [
567
+ { url = "https://files.pythonhosted.org/packages/87/22/f020c047ae1346613db9322638186468238bcfa8849b4668a22b97faad65/dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482", size = 315453, upload-time = "2025-06-26T09:29:21.412Z" },
568
+ ]
569
+
570
  [[package]]
571
  name = "debugpy"
572
  version = "1.8.17"
 
936
  { name = "requests" },
937
  { name = "tenacity" },
938
  { name = "torch" },
939
+ { name = "trafilatura" },
940
  { name = "transformers" },
941
  ]
942
 
 
956
  { name = "requests", specifier = ">=2.32.5" },
957
  { name = "tenacity", specifier = ">=8.0.0" },
958
  { name = "torch", specifier = ">=2.9.1" },
959
+ { name = "trafilatura", specifier = ">=2.0.0" },
960
  { name = "transformers", specifier = ">=2.3.0" },
961
  ]
962
 
 
989
  { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" },
990
  ]
991
 
992
+ [[package]]
993
+ name = "htmldate"
994
+ version = "1.9.4"
995
+ source = { registry = "https://pypi.org/simple" }
996
+ dependencies = [
997
+ { name = "charset-normalizer" },
998
+ { name = "dateparser" },
999
+ { name = "lxml" },
1000
+ { name = "python-dateutil" },
1001
+ { name = "urllib3" },
1002
+ ]
1003
+ sdist = { url = "https://files.pythonhosted.org/packages/9d/10/ead9dabc999f353c3aa5d0dc0835b1e355215a5ecb489a7f4ef2ddad5e33/htmldate-1.9.4.tar.gz", hash = "sha256:1129063e02dd0354b74264de71e950c0c3fcee191178321418ccad2074cc8ed0", size = 44690, upload-time = "2025-11-04T17:46:44.983Z" }
1004
+ wheels = [
1005
+ { url = "https://files.pythonhosted.org/packages/a1/bd/adfcdaaad5805c0c5156aeefd64c1e868c05e9c1cd6fd21751f168cd88c7/htmldate-1.9.4-py3-none-any.whl", hash = "sha256:1b94bcc4e08232a5b692159903acf95548b6a7492dddca5bb123d89d6325921c", size = 31558, upload-time = "2025-11-04T17:46:43.258Z" },
1006
+ ]
1007
+
1008
  [[package]]
1009
  name = "httpcore"
1010
  version = "1.0.9"
 
1447
  { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
1448
  ]
1449
 
1450
+ [[package]]
1451
+ name = "justext"
1452
+ version = "3.0.2"
1453
+ source = { registry = "https://pypi.org/simple" }
1454
+ dependencies = [
1455
+ { name = "lxml", extra = ["html-clean"] },
1456
+ ]
1457
+ sdist = { url = "https://files.pythonhosted.org/packages/49/f3/45890c1b314f0d04e19c1c83d534e611513150939a7cf039664d9ab1e649/justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05", size = 828521, upload-time = "2025-02-25T20:21:49.934Z" }
1458
+ wheels = [
1459
+ { url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" },
1460
+ ]
1461
+
1462
  [[package]]
1463
  name = "keyring"
1464
  version = "25.7.0"
 
1565
  { name = "opentelemetry-instrumentation-weaviate" },
1566
  ]
1567
 
1568
+ [[package]]
1569
+ name = "lxml"
1570
+ version = "6.0.2"
1571
+ source = { registry = "https://pypi.org/simple" }
1572
+ sdist = { url = "https://files.pythonhosted.org/packages/aa/88/262177de60548e5a2bfc46ad28232c9e9cbde697bd94132aeb80364675cb/lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62", size = 4073426, upload-time = "2025-09-22T04:04:59.287Z" }
1573
+ wheels = [
1574
+ { url = "https://files.pythonhosted.org/packages/f3/c8/8ff2bc6b920c84355146cd1ab7d181bc543b89241cfb1ebee824a7c81457/lxml-6.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a59f5448ba2ceccd06995c95ea59a7674a10de0810f2ce90c9006f3cbc044456", size = 8661887, upload-time = "2025-09-22T04:01:17.265Z" },
1575
+ { url = "https://files.pythonhosted.org/packages/37/6f/9aae1008083bb501ef63284220ce81638332f9ccbfa53765b2b7502203cf/lxml-6.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8113639f3296706fbac34a30813929e29247718e88173ad849f57ca59754924", size = 4667818, upload-time = "2025-09-22T04:01:19.688Z" },
1576
+ { url = "https://files.pythonhosted.org/packages/f1/ca/31fb37f99f37f1536c133476674c10b577e409c0a624384147653e38baf2/lxml-6.0.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a8bef9b9825fa8bc816a6e641bb67219489229ebc648be422af695f6e7a4fa7f", size = 4950807, upload-time = "2025-09-22T04:01:21.487Z" },
1577
+ { url = "https://files.pythonhosted.org/packages/da/87/f6cb9442e4bada8aab5ae7e1046264f62fdbeaa6e3f6211b93f4c0dd97f1/lxml-6.0.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:65ea18d710fd14e0186c2f973dc60bb52039a275f82d3c44a0e42b43440ea534", size = 5109179, upload-time = "2025-09-22T04:01:23.32Z" },
1578
+ { url = "https://files.pythonhosted.org/packages/c8/20/a7760713e65888db79bbae4f6146a6ae5c04e4a204a3c48896c408cd6ed2/lxml-6.0.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c371aa98126a0d4c739ca93ceffa0fd7a5d732e3ac66a46e74339acd4d334564", size = 5023044, upload-time = "2025-09-22T04:01:25.118Z" },
1579
+ { url = "https://files.pythonhosted.org/packages/a2/b0/7e64e0460fcb36471899f75831509098f3fd7cd02a3833ac517433cb4f8f/lxml-6.0.2-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:700efd30c0fa1a3581d80a748157397559396090a51d306ea59a70020223d16f", size = 5359685, upload-time = "2025-09-22T04:01:27.398Z" },
1580
+ { url = "https://files.pythonhosted.org/packages/b9/e1/e5df362e9ca4e2f48ed6411bd4b3a0ae737cc842e96877f5bf9428055ab4/lxml-6.0.2-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c33e66d44fe60e72397b487ee92e01da0d09ba2d66df8eae42d77b6d06e5eba0", size = 5654127, upload-time = "2025-09-22T04:01:29.629Z" },
1581
+ { url = "https://files.pythonhosted.org/packages/c6/d1/232b3309a02d60f11e71857778bfcd4acbdb86c07db8260caf7d008b08f8/lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90a345bbeaf9d0587a3aaffb7006aa39ccb6ff0e96a57286c0cb2fd1520ea192", size = 5253958, upload-time = "2025-09-22T04:01:31.535Z" },
1582
+ { url = "https://files.pythonhosted.org/packages/35/35/d955a070994725c4f7d80583a96cab9c107c57a125b20bb5f708fe941011/lxml-6.0.2-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:064fdadaf7a21af3ed1dcaa106b854077fbeada827c18f72aec9346847cd65d0", size = 4711541, upload-time = "2025-09-22T04:01:33.801Z" },
1583
+ { url = "https://files.pythonhosted.org/packages/1e/be/667d17363b38a78c4bd63cfd4b4632029fd68d2c2dc81f25ce9eb5224dd5/lxml-6.0.2-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fbc74f42c3525ac4ffa4b89cbdd00057b6196bcefe8bce794abd42d33a018092", size = 5267426, upload-time = "2025-09-22T04:01:35.639Z" },
1584
+ { url = "https://files.pythonhosted.org/packages/ea/47/62c70aa4a1c26569bc958c9ca86af2bb4e1f614e8c04fb2989833874f7ae/lxml-6.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6ddff43f702905a4e32bc24f3f2e2edfe0f8fde3277d481bffb709a4cced7a1f", size = 5064917, upload-time = "2025-09-22T04:01:37.448Z" },
1585
+ { url = "https://files.pythonhosted.org/packages/bd/55/6ceddaca353ebd0f1908ef712c597f8570cc9c58130dbb89903198e441fd/lxml-6.0.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6da5185951d72e6f5352166e3da7b0dc27aa70bd1090b0eb3f7f7212b53f1bb8", size = 4788795, upload-time = "2025-09-22T04:01:39.165Z" },
1586
+ { url = "https://files.pythonhosted.org/packages/cf/e8/fd63e15da5e3fd4c2146f8bbb3c14e94ab850589beab88e547b2dbce22e1/lxml-6.0.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:57a86e1ebb4020a38d295c04fc79603c7899e0df71588043eb218722dabc087f", size = 5676759, upload-time = "2025-09-22T04:01:41.506Z" },
1587
+ { url = "https://files.pythonhosted.org/packages/76/47/b3ec58dc5c374697f5ba37412cd2728f427d056315d124dd4b61da381877/lxml-6.0.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2047d8234fe735ab77802ce5f2297e410ff40f5238aec569ad7c8e163d7b19a6", size = 5255666, upload-time = "2025-09-22T04:01:43.363Z" },
1588
+ { url = "https://files.pythonhosted.org/packages/19/93/03ba725df4c3d72afd9596eef4a37a837ce8e4806010569bedfcd2cb68fd/lxml-6.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f91fd2b2ea15a6800c8e24418c0775a1694eefc011392da73bc6cef2623b322", size = 5277989, upload-time = "2025-09-22T04:01:45.215Z" },
1589
+ { url = "https://files.pythonhosted.org/packages/c6/80/c06de80bfce881d0ad738576f243911fccf992687ae09fd80b734712b39c/lxml-6.0.2-cp312-cp312-win32.whl", hash = "sha256:3ae2ce7d6fedfb3414a2b6c5e20b249c4c607f72cb8d2bb7cc9c6ec7c6f4e849", size = 3611456, upload-time = "2025-09-22T04:01:48.243Z" },
1590
+ { url = "https://files.pythonhosted.org/packages/f7/d7/0cdfb6c3e30893463fb3d1e52bc5f5f99684a03c29a0b6b605cfae879cd5/lxml-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:72c87e5ee4e58a8354fb9c7c84cbf95a1c8236c127a5d1b7683f04bed8361e1f", size = 4011793, upload-time = "2025-09-22T04:01:50.042Z" },
1591
+ { url = "https://files.pythonhosted.org/packages/ea/7b/93c73c67db235931527301ed3785f849c78991e2e34f3fd9a6663ffda4c5/lxml-6.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:61cb10eeb95570153e0c0e554f58df92ecf5109f75eacad4a95baa709e26c3d6", size = 3672836, upload-time = "2025-09-22T04:01:52.145Z" },
1592
+ { url = "https://files.pythonhosted.org/packages/53/fd/4e8f0540608977aea078bf6d79f128e0e2c2bba8af1acf775c30baa70460/lxml-6.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77", size = 8648494, upload-time = "2025-09-22T04:01:54.242Z" },
1593
+ { url = "https://files.pythonhosted.org/packages/5d/f4/2a94a3d3dfd6c6b433501b8d470a1960a20ecce93245cf2db1706adf6c19/lxml-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f", size = 4661146, upload-time = "2025-09-22T04:01:56.282Z" },
1594
+ { url = "https://files.pythonhosted.org/packages/25/2e/4efa677fa6b322013035d38016f6ae859d06cac67437ca7dc708a6af7028/lxml-6.0.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452", size = 4946932, upload-time = "2025-09-22T04:01:58.989Z" },
1595
+ { url = "https://files.pythonhosted.org/packages/ce/0f/526e78a6d38d109fdbaa5049c62e1d32fdd70c75fb61c4eadf3045d3d124/lxml-6.0.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048", size = 5100060, upload-time = "2025-09-22T04:02:00.812Z" },
1596
+ { url = "https://files.pythonhosted.org/packages/81/76/99de58d81fa702cc0ea7edae4f4640416c2062813a00ff24bd70ac1d9c9b/lxml-6.0.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df", size = 5019000, upload-time = "2025-09-22T04:02:02.671Z" },
1597
+ { url = "https://files.pythonhosted.org/packages/b5/35/9e57d25482bc9a9882cb0037fdb9cc18f4b79d85df94fa9d2a89562f1d25/lxml-6.0.2-cp313-cp313-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1", size = 5348496, upload-time = "2025-09-22T04:02:04.904Z" },
1598
+ { url = "https://files.pythonhosted.org/packages/a6/8e/cb99bd0b83ccc3e8f0f528e9aa1f7a9965dfec08c617070c5db8d63a87ce/lxml-6.0.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916", size = 5643779, upload-time = "2025-09-22T04:02:06.689Z" },
1599
+ { url = "https://files.pythonhosted.org/packages/d0/34/9e591954939276bb679b73773836c6684c22e56d05980e31d52a9a8deb18/lxml-6.0.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd", size = 5244072, upload-time = "2025-09-22T04:02:08.587Z" },
1600
+ { url = "https://files.pythonhosted.org/packages/8d/27/b29ff065f9aaca443ee377aff699714fcbffb371b4fce5ac4ca759e436d5/lxml-6.0.2-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6", size = 4718675, upload-time = "2025-09-22T04:02:10.783Z" },
1601
+ { url = "https://files.pythonhosted.org/packages/2b/9f/f756f9c2cd27caa1a6ef8c32ae47aadea697f5c2c6d07b0dae133c244fbe/lxml-6.0.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a", size = 5255171, upload-time = "2025-09-22T04:02:12.631Z" },
1602
+ { url = "https://files.pythonhosted.org/packages/61/46/bb85ea42d2cb1bd8395484fd72f38e3389611aa496ac7772da9205bbda0e/lxml-6.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679", size = 5057175, upload-time = "2025-09-22T04:02:14.718Z" },
1603
+ { url = "https://files.pythonhosted.org/packages/95/0c/443fc476dcc8e41577f0af70458c50fe299a97bb6b7505bb1ae09aa7f9ac/lxml-6.0.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659", size = 4785688, upload-time = "2025-09-22T04:02:16.957Z" },
1604
+ { url = "https://files.pythonhosted.org/packages/48/78/6ef0b359d45bb9697bc5a626e1992fa5d27aa3f8004b137b2314793b50a0/lxml-6.0.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484", size = 5660655, upload-time = "2025-09-22T04:02:18.815Z" },
1605
+ { url = "https://files.pythonhosted.org/packages/ff/ea/e1d33808f386bc1339d08c0dcada6e4712d4ed8e93fcad5f057070b7988a/lxml-6.0.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2", size = 5247695, upload-time = "2025-09-22T04:02:20.593Z" },
1606
+ { url = "https://files.pythonhosted.org/packages/4f/47/eba75dfd8183673725255247a603b4ad606f4ae657b60c6c145b381697da/lxml-6.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314", size = 5269841, upload-time = "2025-09-22T04:02:22.489Z" },
1607
+ { url = "https://files.pythonhosted.org/packages/76/04/5c5e2b8577bc936e219becb2e98cdb1aca14a4921a12995b9d0c523502ae/lxml-6.0.2-cp313-cp313-win32.whl", hash = "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2", size = 3610700, upload-time = "2025-09-22T04:02:24.465Z" },
1608
+ { url = "https://files.pythonhosted.org/packages/fe/0a/4643ccc6bb8b143e9f9640aa54e38255f9d3b45feb2cbe7ae2ca47e8782e/lxml-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7", size = 4010347, upload-time = "2025-09-22T04:02:26.286Z" },
1609
+ { url = "https://files.pythonhosted.org/packages/31/ef/dcf1d29c3f530577f61e5fe2f1bd72929acf779953668a8a47a479ae6f26/lxml-6.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf", size = 3671248, upload-time = "2025-09-22T04:02:27.918Z" },
1610
+ { url = "https://files.pythonhosted.org/packages/03/15/d4a377b385ab693ce97b472fe0c77c2b16ec79590e688b3ccc71fba19884/lxml-6.0.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe", size = 8659801, upload-time = "2025-09-22T04:02:30.113Z" },
1611
+ { url = "https://files.pythonhosted.org/packages/c8/e8/c128e37589463668794d503afaeb003987373c5f94d667124ffd8078bbd9/lxml-6.0.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d", size = 4659403, upload-time = "2025-09-22T04:02:32.119Z" },
1612
+ { url = "https://files.pythonhosted.org/packages/00/ce/74903904339decdf7da7847bb5741fc98a5451b42fc419a86c0c13d26fe2/lxml-6.0.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d", size = 4966974, upload-time = "2025-09-22T04:02:34.155Z" },
1613
+ { url = "https://files.pythonhosted.org/packages/1f/d3/131dec79ce61c5567fecf82515bd9bc36395df42501b50f7f7f3bd065df0/lxml-6.0.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5", size = 5102953, upload-time = "2025-09-22T04:02:36.054Z" },
1614
+ { url = "https://files.pythonhosted.org/packages/3a/ea/a43ba9bb750d4ffdd885f2cd333572f5bb900cd2408b67fdda07e85978a0/lxml-6.0.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0", size = 5055054, upload-time = "2025-09-22T04:02:38.154Z" },
1615
+ { url = "https://files.pythonhosted.org/packages/60/23/6885b451636ae286c34628f70a7ed1fcc759f8d9ad382d132e1c8d3d9bfd/lxml-6.0.2-cp314-cp314-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba", size = 5352421, upload-time = "2025-09-22T04:02:40.413Z" },
1616
+ { url = "https://files.pythonhosted.org/packages/48/5b/fc2ddfc94ddbe3eebb8e9af6e3fd65e2feba4967f6a4e9683875c394c2d8/lxml-6.0.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0", size = 5673684, upload-time = "2025-09-22T04:02:42.288Z" },
1617
+ { url = "https://files.pythonhosted.org/packages/29/9c/47293c58cc91769130fbf85531280e8cc7868f7fbb6d92f4670071b9cb3e/lxml-6.0.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d", size = 5252463, upload-time = "2025-09-22T04:02:44.165Z" },
1618
+ { url = "https://files.pythonhosted.org/packages/9b/da/ba6eceb830c762b48e711ded880d7e3e89fc6c7323e587c36540b6b23c6b/lxml-6.0.2-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37", size = 4698437, upload-time = "2025-09-22T04:02:46.524Z" },
1619
+ { url = "https://files.pythonhosted.org/packages/a5/24/7be3f82cb7990b89118d944b619e53c656c97dc89c28cfb143fdb7cd6f4d/lxml-6.0.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9", size = 5269890, upload-time = "2025-09-22T04:02:48.812Z" },
1620
+ { url = "https://files.pythonhosted.org/packages/1b/bd/dcfb9ea1e16c665efd7538fc5d5c34071276ce9220e234217682e7d2c4a5/lxml-6.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917", size = 5097185, upload-time = "2025-09-22T04:02:50.746Z" },
1621
+ { url = "https://files.pythonhosted.org/packages/21/04/a60b0ff9314736316f28316b694bccbbabe100f8483ad83852d77fc7468e/lxml-6.0.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f", size = 4745895, upload-time = "2025-09-22T04:02:52.968Z" },
1622
+ { url = "https://files.pythonhosted.org/packages/d6/bd/7d54bd1846e5a310d9c715921c5faa71cf5c0853372adf78aee70c8d7aa2/lxml-6.0.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8", size = 5695246, upload-time = "2025-09-22T04:02:54.798Z" },
1623
+ { url = "https://files.pythonhosted.org/packages/fd/32/5643d6ab947bc371da21323acb2a6e603cedbe71cb4c99c8254289ab6f4e/lxml-6.0.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a", size = 5260797, upload-time = "2025-09-22T04:02:57.058Z" },
1624
+ { url = "https://files.pythonhosted.org/packages/33/da/34c1ec4cff1eea7d0b4cd44af8411806ed943141804ac9c5d565302afb78/lxml-6.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c", size = 5277404, upload-time = "2025-09-22T04:02:58.966Z" },
1625
+ { url = "https://files.pythonhosted.org/packages/82/57/4eca3e31e54dc89e2c3507e1cd411074a17565fa5ffc437c4ae0a00d439e/lxml-6.0.2-cp314-cp314-win32.whl", hash = "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b", size = 3670072, upload-time = "2025-09-22T04:03:38.05Z" },
1626
+ { url = "https://files.pythonhosted.org/packages/e3/e0/c96cf13eccd20c9421ba910304dae0f619724dcf1702864fd59dd386404d/lxml-6.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed", size = 4080617, upload-time = "2025-09-22T04:03:39.835Z" },
1627
+ { url = "https://files.pythonhosted.org/packages/d5/5d/b3f03e22b3d38d6f188ef044900a9b29b2fe0aebb94625ce9fe244011d34/lxml-6.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8", size = 3754930, upload-time = "2025-09-22T04:03:41.565Z" },
1628
+ { url = "https://files.pythonhosted.org/packages/5e/5c/42c2c4c03554580708fc738d13414801f340c04c3eff90d8d2d227145275/lxml-6.0.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d", size = 8910380, upload-time = "2025-09-22T04:03:01.645Z" },
1629
+ { url = "https://files.pythonhosted.org/packages/bf/4f/12df843e3e10d18d468a7557058f8d3733e8b6e12401f30b1ef29360740f/lxml-6.0.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba", size = 4775632, upload-time = "2025-09-22T04:03:03.814Z" },
1630
+ { url = "https://files.pythonhosted.org/packages/e4/0c/9dc31e6c2d0d418483cbcb469d1f5a582a1cd00a1f4081953d44051f3c50/lxml-6.0.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601", size = 4975171, upload-time = "2025-09-22T04:03:05.651Z" },
1631
+ { url = "https://files.pythonhosted.org/packages/e7/2b/9b870c6ca24c841bdd887504808f0417aa9d8d564114689266f19ddf29c8/lxml-6.0.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed", size = 5110109, upload-time = "2025-09-22T04:03:07.452Z" },
1632
+ { url = "https://files.pythonhosted.org/packages/bf/0c/4f5f2a4dd319a178912751564471355d9019e220c20d7db3fb8307ed8582/lxml-6.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37", size = 5041061, upload-time = "2025-09-22T04:03:09.297Z" },
1633
+ { url = "https://files.pythonhosted.org/packages/12/64/554eed290365267671fe001a20d72d14f468ae4e6acef1e179b039436967/lxml-6.0.2-cp314-cp314t-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338", size = 5306233, upload-time = "2025-09-22T04:03:11.651Z" },
1634
+ { url = "https://files.pythonhosted.org/packages/7a/31/1d748aa275e71802ad9722df32a7a35034246b42c0ecdd8235412c3396ef/lxml-6.0.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9", size = 5604739, upload-time = "2025-09-22T04:03:13.592Z" },
1635
+ { url = "https://files.pythonhosted.org/packages/8f/41/2c11916bcac09ed561adccacceaedd2bf0e0b25b297ea92aab99fd03d0fa/lxml-6.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd", size = 5225119, upload-time = "2025-09-22T04:03:15.408Z" },
1636
+ { url = "https://files.pythonhosted.org/packages/99/05/4e5c2873d8f17aa018e6afde417c80cc5d0c33be4854cce3ef5670c49367/lxml-6.0.2-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d", size = 4633665, upload-time = "2025-09-22T04:03:17.262Z" },
1637
+ { url = "https://files.pythonhosted.org/packages/0f/c9/dcc2da1bebd6275cdc723b515f93edf548b82f36a5458cca3578bc899332/lxml-6.0.2-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9", size = 5234997, upload-time = "2025-09-22T04:03:19.14Z" },
1638
+ { url = "https://files.pythonhosted.org/packages/9c/e2/5172e4e7468afca64a37b81dba152fc5d90e30f9c83c7c3213d6a02a5ce4/lxml-6.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e", size = 5090957, upload-time = "2025-09-22T04:03:21.436Z" },
1639
+ { url = "https://files.pythonhosted.org/packages/a5/b3/15461fd3e5cd4ddcb7938b87fc20b14ab113b92312fc97afe65cd7c85de1/lxml-6.0.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d", size = 4764372, upload-time = "2025-09-22T04:03:23.27Z" },
1640
+ { url = "https://files.pythonhosted.org/packages/05/33/f310b987c8bf9e61c4dd8e8035c416bd3230098f5e3cfa69fc4232de7059/lxml-6.0.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec", size = 5634653, upload-time = "2025-09-22T04:03:25.767Z" },
1641
+ { url = "https://files.pythonhosted.org/packages/70/ff/51c80e75e0bc9382158133bdcf4e339b5886c6ee2418b5199b3f1a61ed6d/lxml-6.0.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272", size = 5233795, upload-time = "2025-09-22T04:03:27.62Z" },
1642
+ { url = "https://files.pythonhosted.org/packages/56/4d/4856e897df0d588789dd844dbed9d91782c4ef0b327f96ce53c807e13128/lxml-6.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f", size = 5257023, upload-time = "2025-09-22T04:03:30.056Z" },
1643
+ { url = "https://files.pythonhosted.org/packages/0f/85/86766dfebfa87bea0ab78e9ff7a4b4b45225df4b4d3b8cc3c03c5cd68464/lxml-6.0.2-cp314-cp314t-win32.whl", hash = "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312", size = 3911420, upload-time = "2025-09-22T04:03:32.198Z" },
1644
+ { url = "https://files.pythonhosted.org/packages/fe/1a/b248b355834c8e32614650b8008c69ffeb0ceb149c793961dd8c0b991bb3/lxml-6.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca", size = 4406837, upload-time = "2025-09-22T04:03:34.027Z" },
1645
+ { url = "https://files.pythonhosted.org/packages/92/aa/df863bcc39c5e0946263454aba394de8a9084dbaff8ad143846b0d844739/lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c", size = 3822205, upload-time = "2025-09-22T04:03:36.249Z" },
1646
+ ]
1647
+
1648
+ [package.optional-dependencies]
1649
+ html-clean = [
1650
+ { name = "lxml-html-clean" },
1651
+ ]
1652
+
1653
+ [[package]]
1654
+ name = "lxml-html-clean"
1655
+ version = "0.4.3"
1656
+ source = { registry = "https://pypi.org/simple" }
1657
+ dependencies = [
1658
+ { name = "lxml" },
1659
+ ]
1660
+ sdist = { url = "https://files.pythonhosted.org/packages/d9/cb/c9c5bb2a9c47292e236a808dd233a03531f53b626f36259dcd32b49c76da/lxml_html_clean-0.4.3.tar.gz", hash = "sha256:c9df91925b00f836c807beab127aac82575110eacff54d0a75187914f1bd9d8c", size = 21498, upload-time = "2025-10-02T20:49:24.895Z" }
1661
+ wheels = [
1662
+ { url = "https://files.pythonhosted.org/packages/10/4a/63a9540e3ca73709f4200564a737d63a4c8c9c4dd032bab8535f507c190a/lxml_html_clean-0.4.3-py3-none-any.whl", hash = "sha256:63fd7b0b9c3a2e4176611c2ca5d61c4c07ffca2de76c14059a81a2825833731e", size = 14177, upload-time = "2025-10-02T20:49:23.749Z" },
1663
+ ]
1664
+
1665
  [[package]]
1666
  name = "markdown-it-py"
1667
  version = "4.0.0"
 
3864
  { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
3865
  ]
3866
 
3867
+ [[package]]
3868
+ name = "tld"
3869
+ version = "0.13.1"
3870
+ source = { registry = "https://pypi.org/simple" }
3871
+ sdist = { url = "https://files.pythonhosted.org/packages/df/a1/5723b07a70c1841a80afc9ac572fdf53488306848d844cd70519391b0d26/tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350", size = 462000, upload-time = "2025-05-21T22:18:29.341Z" }
3872
+ wheels = [
3873
+ { url = "https://files.pythonhosted.org/packages/dc/70/b2f38360c3fc4bc9b5e8ef429e1fde63749144ac583c2dbdf7e21e27a9ad/tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c", size = 274718, upload-time = "2025-05-21T22:18:25.811Z" },
3874
+ ]
3875
+
3876
  [[package]]
3877
  name = "tokenizers"
3878
  version = "0.22.1"
 
3962
  { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
3963
  ]
3964
 
3965
+ [[package]]
3966
+ name = "trafilatura"
3967
+ version = "2.0.0"
3968
+ source = { registry = "https://pypi.org/simple" }
3969
+ dependencies = [
3970
+ { name = "certifi" },
3971
+ { name = "charset-normalizer" },
3972
+ { name = "courlan" },
3973
+ { name = "htmldate" },
3974
+ { name = "justext" },
3975
+ { name = "lxml" },
3976
+ { name = "urllib3" },
3977
+ ]
3978
+ sdist = { url = "https://files.pythonhosted.org/packages/06/25/e3ebeefdebfdfae8c4a4396f5a6ea51fc6fa0831d63ce338e5090a8003dc/trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247", size = 253404, upload-time = "2024-12-03T15:23:24.16Z" }
3979
+ wheels = [
3980
+ { url = "https://files.pythonhosted.org/packages/8a/b6/097367f180b6383a3581ca1b86fcae284e52075fa941d1232df35293363c/trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d", size = 132557, upload-time = "2024-12-03T15:23:21.41Z" },
3981
+ ]
3982
+
3983
  [[package]]
3984
  name = "transformers"
3985
  version = "2.3.0"
 
4053
  { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
4054
  ]
4055
 
4056
+ [[package]]
4057
+ name = "tzlocal"
4058
+ version = "5.3.1"
4059
+ source = { registry = "https://pypi.org/simple" }
4060
+ dependencies = [
4061
+ { name = "tzdata", marker = "sys_platform == 'win32'" },
4062
+ ]
4063
+ sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload-time = "2025-03-05T21:17:41.549Z" }
4064
+ wheels = [
4065
+ { url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload-time = "2025-03-05T21:17:39.857Z" },
4066
+ ]
4067
+
4068
  [[package]]
4069
  name = "uc-micro-py"
4070
  version = "1.0.3"