akseljoonas HF Staff commited on
Commit
5e8489d
·
1 Parent(s): f29c8f5

added the explore subagent, quality ok but not perf

Browse files
agent/config.py CHANGED
@@ -8,7 +8,6 @@ from fastmcp.mcp_config import (
8
  RemoteMCPServer,
9
  StdioMCPServer,
10
  )
11
- from litellm import Tool
12
  from pydantic import BaseModel
13
 
14
  # These two are the canonical server config types for MCP servers.
@@ -19,7 +18,6 @@ class Config(BaseModel):
19
  """Configuration manager"""
20
 
21
  model_name: str
22
- tools: list[Tool] = []
23
  mcpServers: dict[str, MCPServerConfig] = {}
24
 
25
 
 
8
  RemoteMCPServer,
9
  StdioMCPServer,
10
  )
 
11
  from pydantic import BaseModel
12
 
13
  # These two are the canonical server config types for MCP servers.
 
18
  """Configuration manager"""
19
 
20
  model_name: str
 
21
  mcpServers: dict[str, MCPServerConfig] = {}
22
 
23
 
agent/config_mcp_example copy.json CHANGED
@@ -1,7 +1,5 @@
1
  {
2
  "model_name": "anthropic/claude-sonnet-4-5-20250929",
3
- "tools": [],
4
- "system_prompt_path": "",
5
  "mcpServers": {
6
  "hf-mcp-server": {
7
  "transport": "http",
 
1
  {
2
  "model_name": "anthropic/claude-sonnet-4-5-20250929",
 
 
3
  "mcpServers": {
4
  "hf-mcp-server": {
5
  "transport": "http",
agent/config_mcp_example.json CHANGED
@@ -1,7 +1,5 @@
1
  {
2
  "model_name": "anthropic/claude-sonnet-4-5-20250929",
3
- "tools": [],
4
- "system_prompt_path": "",
5
  "mcpServers": {
6
  "hf-mcp-server": {
7
  "transport": "http",
 
1
  {
2
  "model_name": "anthropic/claude-sonnet-4-5-20250929",
 
 
3
  "mcpServers": {
4
  "hf-mcp-server": {
5
  "transport": "http",
agent/context_manager/manager.py CHANGED
@@ -19,17 +19,24 @@ class ContextManager:
19
  compact_size: float = 0.1,
20
  untouched_messages: int = 5,
21
  tool_specs: list[dict[str, Any]] | None = None,
 
22
  ):
23
- self.system_prompt = self._load_system_prompt(tool_specs or [])
 
 
24
  self.max_context = max_context
25
  self.compact_size = int(max_context * compact_size)
26
  self.context_length = len(self.system_prompt) // 4
27
  self.untouched_messages = untouched_messages
28
  self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
29
 
30
- def _load_system_prompt(self, tool_specs: list[dict[str, Any]]):
 
 
 
 
31
  """Load and render the system prompt from YAML file with Jinja2"""
32
- prompt_file = Path(__file__).parent.parent / "prompts" / "system_prompt.yaml"
33
 
34
  with open(prompt_file, "r") as f:
35
  prompt_data = yaml.safe_load(f)
 
19
  compact_size: float = 0.1,
20
  untouched_messages: int = 5,
21
  tool_specs: list[dict[str, Any]] | None = None,
22
+ prompt_file_suffix: str = "system_prompt.yaml",
23
  ):
24
+ self.system_prompt = self._load_system_prompt(
25
+ tool_specs or [], prompt_file_suffix="system_prompt.yaml"
26
+ )
27
  self.max_context = max_context
28
  self.compact_size = int(max_context * compact_size)
29
  self.context_length = len(self.system_prompt) // 4
30
  self.untouched_messages = untouched_messages
31
  self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
32
 
33
+ def _load_system_prompt(
34
+ self,
35
+ tool_specs: list[dict[str, Any]],
36
+ prompt_file_suffix: str = "system_prompt.yaml",
37
+ ):
38
  """Load and render the system prompt from YAML file with Jinja2"""
39
+ prompt_file = Path(__file__).parent.parent / "prompts" / f"{prompt_file_suffix}"
40
 
41
  with open(prompt_file, "r") as f:
42
  prompt_data = yaml.safe_load(f)
agent/core/session.py CHANGED
@@ -36,10 +36,11 @@ class Session:
36
  event_queue: asyncio.Queue,
37
  config: Config | None = None,
38
  tool_router=None,
 
39
  ):
40
  self.tool_router = tool_router
41
  tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
42
- self.context_manager = ContextManager(
43
  max_context=get_max_tokens(config.model_name),
44
  compact_size=0.1,
45
  untouched_messages=5,
@@ -49,7 +50,6 @@ class Session:
49
  self.session_id = str(uuid.uuid4())
50
  self.config = config or Config(
51
  model_name="anthropic/claude-sonnet-4-5-20250929",
52
- tools=[],
53
  )
54
  self.is_running = True
55
  self.current_task: asyncio.Task | None = None
 
36
  event_queue: asyncio.Queue,
37
  config: Config | None = None,
38
  tool_router=None,
39
+ context_manager: ContextManager | None = None,
40
  ):
41
  self.tool_router = tool_router
42
  tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
43
+ self.context_manager = context_manager or ContextManager(
44
  max_context=get_max_tokens(config.model_name),
45
  compact_size=0.1,
46
  untouched_messages=5,
 
50
  self.session_id = str(uuid.uuid4())
51
  self.config = config or Config(
52
  model_name="anthropic/claude-sonnet-4-5-20250929",
 
53
  )
54
  self.is_running = True
55
  self.current_task: asyncio.Task | None = None
agent/core/tools.py CHANGED
@@ -14,13 +14,14 @@ from mcp.types import EmbeddedResource, ImageContent, TextContent
14
 
15
  from agent.config import MCPServerConfig
16
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
 
17
 
18
  # Suppress aiohttp deprecation warning
19
  warnings.filterwarnings(
20
  "ignore", category=DeprecationWarning, module="aiohttp.connector"
21
  )
22
 
23
- NOT_ALLOWED_TOOL_NAMES = ["hf_jobs"]
24
 
25
 
26
  def convert_mcp_content_to_string(content: list) -> str:
@@ -186,7 +187,9 @@ class ToolRouter:
186
 
187
  def create_builtin_tools() -> list[ToolSpec]:
188
  """Create built-in tool specifications"""
189
- print(f"Creating built-in tools: {HF_JOBS_TOOL_SPEC['name']}")
 
 
190
  return [
191
  ToolSpec(
192
  name=HF_JOBS_TOOL_SPEC["name"],
@@ -194,4 +197,10 @@ def create_builtin_tools() -> list[ToolSpec]:
194
  parameters=HF_JOBS_TOOL_SPEC["parameters"],
195
  handler=hf_jobs_handler,
196
  ),
 
 
 
 
 
 
197
  ]
 
14
 
15
  from agent.config import MCPServerConfig
16
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
17
+ from agent.tools.search_docs_tool import SEARCH_DOCS_TOOL_SPEC, search_docs_handler
18
 
19
  # Suppress aiohttp deprecation warning
20
  warnings.filterwarnings(
21
  "ignore", category=DeprecationWarning, module="aiohttp.connector"
22
  )
23
 
24
+ NOT_ALLOWED_TOOL_NAMES = ["hf_jobs", "hf_doc_search", "hf_doc_fetch"]
25
 
26
 
27
  def convert_mcp_content_to_string(content: list) -> str:
 
187
 
188
  def create_builtin_tools() -> list[ToolSpec]:
189
  """Create built-in tool specifications"""
190
+ print(
191
+ f"Creating built-in tools: {HF_JOBS_TOOL_SPEC['name']}, {SEARCH_DOCS_TOOL_SPEC['name']}"
192
+ )
193
  return [
194
  ToolSpec(
195
  name=HF_JOBS_TOOL_SPEC["name"],
 
197
  parameters=HF_JOBS_TOOL_SPEC["parameters"],
198
  handler=hf_jobs_handler,
199
  ),
200
+ ToolSpec(
201
+ name=SEARCH_DOCS_TOOL_SPEC["name"],
202
+ description=SEARCH_DOCS_TOOL_SPEC["description"],
203
+ parameters=SEARCH_DOCS_TOOL_SPEC["parameters"],
204
+ handler=search_docs_handler,
205
+ ),
206
  ]
agent/prompts/search_docs_system_prompt.yaml ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ search_docs_system_prompt: |
2
+ You are a specialized documentation search agent. Your task is to comprehensively search and synthesize information from Hugging Face documentation.
3
+
4
+ # Search Strategy
5
+
6
+ You must search thoroughly before synthesizing results. Follow this approach:
7
+
8
+ 1. **Query Analysis**: Identify the core concepts and intent of the query
9
+ 2. **Initial Search**: Start with a broad search capturing the main topic
10
+ 3. **Iterative Refinement**: Run multiple searches to go deeper into each topic. You will see parsed HTML pages; also follow the links on those pages to find the best information, because first-pass results often miss key details.
11
+ 4. **Get to the ground truth**: You must get to the bottom of this search query. You MUST NOT tell the user to look something up in the documentation; look it up yourself and give the best answer you can.
12
+
13
+ ## Query Formulation Best Practices
14
+
15
+ - Add relevant synonyms and related technical terms
16
+ - Remove filler words, focus on searchable concepts
17
+ - Break complex questions into focused sub-queries
18
+ - Include domain-specific terminology when applicable
19
+ - Try both specific terms and general related terms
20
+
21
+ # Hugging Face Docs structure
22
+
23
+ - id: hub
24
+ url: /docs/hub
25
+ category: Hub & Client Libraries
26
+ docs on: Hub fundamentals — repos, models/datasets/spaces, auth, versioning, metadata.
27
+
28
+ - id: transformers
29
+ url: /docs/transformers
30
+ category: Core ML Libraries
31
+ docs on: Core model library — architectures, configs, tokenizers, training & inference APIs.
32
+
33
+ - id: diffusers
34
+ url: /docs/diffusers
35
+ category: Core ML Libraries
36
+ docs on: Diffusion pipelines, schedulers, fine-tuning, training, and deployment patterns.
37
+
38
+ - id: datasets
39
+ url: /docs/datasets
40
+ category: Core ML Libraries
41
+ docs on: Dataset loading, streaming, processing, Arrow format, Hub integration.
42
+
43
+ - id: gradio
44
+ url: https://www.gradio.app/docs/
45
+ category: Collaboration & Extras
46
+ docs on: UI components and demos for interacting with ML models.
47
+
48
+ - id: trackio
49
+ url: /docs/trackio
50
+ category: Collaboration & Extras
51
+ docs on: Experiment tracking, metrics logging, and run comparison.
52
+
53
+ - id: smolagents
54
+ url: /docs/smolagents
55
+ category: Collaboration & Extras
56
+ docs on: Lightweight agent abstractions and tool-using patterns.
57
+
58
+ - id: huggingface_hub
59
+ url: /docs/huggingface_hub
60
+ category: Hub & Client Libraries
61
+ docs on: Python client for Hub operations (auth, upload/download, repo management).
62
+
63
+ - id: huggingface.js
64
+ url: /docs/huggingface.js
65
+ category: Hub & Client Libraries
66
+ docs on: JS/TS client for Hub APIs in browser and Node.
67
+
68
+ - id: transformers.js
69
+ url: /docs/transformers.js
70
+ category: Core ML Libraries
71
+ docs on: Run Transformer models in browser/Node via WebGPU/WASM.
72
+
73
+ - id: inference-providers
74
+ url: /docs/inference-providers
75
+ category: Deployment & Inference
76
+ docs on: Unified interface for third-party inference backends.
77
+
78
+ - id: inference-endpoints
79
+ url: /docs/inference-endpoints
80
+ category: Deployment & Inference
81
+ docs on: Managed, scalable model deployments on HF infrastructure.
82
+
83
+ - id: peft
84
+ url: /docs/peft
85
+ category: Training & Optimization
86
+ docs on: Parameter-efficient fine-tuning methods (LoRA, adapters, etc.).
87
+
88
+ - id: accelerate
89
+ url: /docs/accelerate
90
+ category: Training & Optimization
91
+ docs on: Hardware-agnostic, distributed and mixed-precision training orchestration.
92
+
93
+ - id: optimum
94
+ url: /docs/optimum
95
+ category: Training & Optimization
96
+ docs on: Hardware-aware optimization and model export tooling.
97
+
98
+ - id: optimum-habana
99
+ url: /docs/optimum-habana
100
+ category: —
101
+ docs on: Training and inference on Habana Gaudi accelerators.
102
+
103
+ - id: optimum-neuron
104
+ url: /docs/optimum-neuron
105
+ category: Training & Optimization
106
+ docs on: Optimization workflows for AWS Inferentia/Trainium.
107
+
108
+ - id: optimum-intel
109
+ url: /docs/optimum-intel
110
+ category: —
111
+ docs on: Intel CPU/GPU optimizations (OpenVINO, IPEX).
112
+
113
+ - id: optimum-executorch
114
+ url: /docs/optimum-executorch
115
+ category: Training & Optimization
116
+ docs on: Exporting models to ExecuTorch for edge/mobile.
117
+
118
+ - id: optimum-tpu
119
+ url: /docs/optimum-tpu
120
+ category: Training & Optimization
121
+ docs on: TPU-specific training and optimization paths.
122
+
123
+ - id: tokenizers
124
+ url: /docs/tokenizers
125
+ category: Core ML Libraries
126
+ docs on: Fast tokenizer internals, training, and low-level APIs.
127
+
128
+ - id: llm-course
129
+ url: /learn/llm-course
130
+ category: —
131
+ docs on: End-to-end LLM concepts, training, and deployment.
132
+
133
+ - id: robotics-course
134
+ url: /learn/robotics-course
135
+ category: —
136
+ docs on: Learning-based robotics foundations.
137
+
138
+ - id: mcp-course
139
+ url: /learn/mcp-course
140
+ category: —
141
+ docs on: Model Context Protocol concepts and usage.
142
+
143
+ - id: smol-course
144
+ url: /learn/smol-course
145
+ category: —
146
+ docs on: Small-model and efficiency-focused workflows.
147
+
148
+ - id: agents-course
149
+ url: /learn/agents-course
150
+ category: —
151
+ docs on: Tool-using, planning, and multi-step agent design.
152
+
153
+ - id: deep-rl-course
154
+ url: /learn/deep-rl-course
155
+ category: —
156
+ docs on: Deep reinforcement learning foundations.
157
+
158
+ - id: computer-vision-course
159
+ url: /learn/computer-vision-course
160
+ category: —
161
+ docs on: Vision models, datasets, and pipelines.
162
+
163
+ - id: evaluate
164
+ url: /docs/evaluate
165
+ category: Core ML Libraries
166
+ docs on: Metrics, evaluation workflows, and training-loop integration.
167
+
168
+ - id: tasks
169
+ url: /tasks
170
+ category: Hub & Client Libraries
171
+ docs on: Canonical task definitions and model categorization.
172
+
173
+ - id: dataset-viewer
174
+ url: /docs/dataset-viewer
175
+ category: Hub & Client Libraries
176
+ docs on: Dataset preview, streaming views, and viewer internals.
177
+
178
+ - id: trl
179
+ url: /docs/trl
180
+ category: Training & Optimization
181
+ docs on: RLHF, DPO, PPO, and SFT utilities for LLMs.
182
+
183
+ - id: simulate
184
+ url: /docs/simulate
185
+ category: —
186
+ docs on: Experimental simulation tools and workflows.
187
+
188
+ - id: sagemaker
189
+ url: /docs/sagemaker
190
+ category: Deployment & Inference
191
+ docs on: Deploying Hugging Face models on AWS SageMaker.
192
+
193
+ - id: timm
194
+ url: /docs/timm
195
+ category: Core ML Libraries
196
+ docs on: Image model zoo and utilities via HF integrations.
197
+
198
+ - id: safetensors
199
+ url: /docs/safetensors
200
+ category: Training & Optimization
201
+ docs on: Safe, fast tensor serialization format.
202
+
203
+ - id: tgi
204
+ url: /docs/text-generation-inference
205
+ category: Deployment & Inference
206
+ docs on: High-throughput text generation server for LLMs.
207
+
208
+ - id: setfit
209
+ url: /docs/setfit
210
+ category: —
211
+ docs on: Few-shot text classification via sentence embeddings.
212
+
213
+ - id: audio-course
214
+ url: /learn/audio-course
215
+ category: —
216
+ docs on: Speech and audio models, datasets, and tasks.
217
+
218
+ - id: lerobot
219
+ url: /docs/lerobot
220
+ category: Collaboration & Extras
221
+ docs on: Robotics datasets, policies, and learning workflows.
222
+
223
+ - id: autotrain
224
+ url: /docs/autotrain
225
+ category: Collaboration & Extras
226
+ docs on: No/low-code model training on Hugging Face.
227
+
228
+ - id: tei
229
+ url: /docs/text-embeddings-inference
230
+ category: Deployment & Inference
231
+ docs on: Optimized inference server for embedding workloads.
232
+
233
+ - id: bitsandbytes
234
+ url: /docs/bitsandbytes
235
+ category: Training & Optimization
236
+ docs on: Quantization and memory-efficient optimizers.
237
+
238
+ - id: cookbook
239
+ url: /learn/cookbook
240
+ category: —
241
+ docs on: Practical, task-oriented recipes across the ecosystem.
242
+
243
+ - id: sentence_transformers
244
+ url: https://sbert.net/
245
+ category: Core ML Libraries
246
+ docs on: Embedding models, training recipes, similarity/search workflows.
247
+
248
+ - id: ml-games-course
249
+ url: /learn/ml-games-course
250
+ category: —
251
+ docs on: Game-based ML and reinforcement learning experiments.
252
+
253
+ - id: diffusion-course
254
+ url: /learn/diffusion-course
255
+ category: —
256
+ docs on: Diffusion model theory and hands-on practice.
257
+
258
+ - id: ml-for-3d-course
259
+ url: /learn/ml-for-3d-course
260
+ category: —
261
+ docs on: 3D representations, models, and learning techniques.
262
+
263
+ - id: chat-ui
264
+ url: /docs/chat-ui
265
+ category: Collaboration & Extras
266
+ docs on: Reference chat interfaces for LLM deployment.
267
+
268
+ - id: leaderboards
269
+ url: /docs/leaderboards
270
+ category: Collaboration & Extras
271
+ docs on: Evaluation leaderboards and submission mechanics.
272
+
273
+ - id: lighteval
274
+ url: /docs/lighteval
275
+ category: Training & Optimization
276
+ docs on: Lightweight, reproducible LLM evaluation framework.
277
+
278
+ - id: argilla
279
+ url: https://argilla-io.github.io/argilla/
280
+ category: Collaboration & Extras
281
+ docs on: Data annotation, feedback, and human-in-the-loop workflows.
282
+
283
+ - id: distilabel
284
+ url: https://distilabel.argilla.io/
285
+ category: Collaboration & Extras
286
+ docs on: Synthetic data generation and distillation pipelines.
287
+
288
+ - id: microsoft-azure
289
+ url: /docs/microsoft-azure
290
+ category: Deployment & Inference
291
+ docs on: Azure deployment and integration guides.
292
+
293
+ - id: kernels
294
+ url: /docs/kernels
295
+ category: Core ML Libraries
296
+ docs on: Lightweight execution environments and notebook-style workflows.
297
+
298
+ - id: google-cloud
299
+ url: /docs/google-cloud
300
+ category: Deployment & Inference
301
+ docs on: GCP deployment and serving workflows.
302
+
303
+ # Response Guidelines
304
+
305
+ After gathering results, synthesize them following these principles:
306
+
307
+ 1. **Analyze Relevance**: Evaluate which results directly answer the query
308
+ 2. **Synthesize**: Combine information from multiple sources when applicable
309
+ 3. **Prioritize**: Present information in order of relevance
310
+ 4. **Cite Sources**: Reference the documents you are drawing from; in particular, include relevant code samples and links to them.
311
+ 5. **Acknowledge Gaps**: If documents don't fully answer the query, explicitly state this
312
+ 6. **Handle Conflicts**: If sources contradict, note this and explain your reasoning
313
+ 7. **Be Concise**: Provide a clear, direct answer without unnecessary elaboration
314
+
315
+ # Constraints
316
+
317
+ - Only provide information found in the documentation
318
+ - Do not make assumptions beyond what the sources state
319
+ - If information is not found, say so clearly rather than guessing
320
+ - Focus on answering the query directly
agent/tools/__init__.py CHANGED
@@ -3,6 +3,14 @@ Hugging Face tools for the agent
3
  """
4
 
5
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
 
6
  from agent.tools.types import ToolResult
7
 
8
- __all__ = ["ToolResult", "HF_JOBS_TOOL_SPEC", "hf_jobs_handler", HfJobsTool]
 
 
 
 
 
 
 
 
3
  """
4
 
5
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
6
+ from agent.tools.search_docs_tool import SEARCH_DOCS_TOOL_SPEC, search_docs_handler
7
  from agent.tools.types import ToolResult
8
 
9
+ __all__ = [
10
+ "ToolResult",
11
+ "HF_JOBS_TOOL_SPEC",
12
+ "hf_jobs_handler",
13
+ "HfJobsTool",
14
+ "SEARCH_DOCS_TOOL_SPEC",
15
+ "search_docs_handler",
16
+ ]
agent/tools/_search_agent_tools.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tools available to the search sub-agent
3
+ These tools are used by the search sub-agent spawned by search_docs_tool
4
+ """
5
+
6
+ import os
7
+ from typing import Any
8
+
9
+ import httpx
10
+ from bs4 import BeautifulSoup
11
+
12
+
13
async def explore_docs_structure_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
    """
    Explore the documentation structure for a given endpoint by parsing the sidebar navigation

    The endpoint's main HTML page is fetched from https://huggingface.co/docs,
    the sidebar ``<nav>`` is parsed for links, and the first 200 characters of
    each linked page's markdown are fetched concurrently as a "glimpse". Only
    the first 20 sidebar links are included in the output; the header states
    when the list has been truncated.

    Args:
        arguments: Dictionary with 'endpoint' parameter (e.g., 'trl', 'transformers', etc.)

    Returns:
        Tuple of (structured_navigation_with_glimpses, success)
    """
    # Function-scope imports: these stdlib modules are not imported at the
    # top of this file.
    import asyncio
    from urllib.parse import urlsplit

    # Cap on per-page glimpse requests, to avoid too many requests.
    max_glimpses = 20

    endpoint = arguments.get("endpoint", "")

    if not endpoint:
        return "Error: No endpoint provided", False

    # Get HF token from environment
    hf_token = os.environ.get("HF_TOKEN")

    if not hf_token:
        return "Error: HF_TOKEN environment variable not set", False

    # Build the URL for the main page (without .md to get HTML with navigation).
    # A leading slash or a redundant "docs/" prefix is tolerated.
    base_url = "https://huggingface.co/docs"
    endpoint = endpoint.lstrip("/").removeprefix("docs/")
    url = f"{base_url}/{endpoint}"

    def headers_for(target_url: str) -> dict[str, str]:
        """Return the bearer header only for huggingface.co hosts.

        Sidebar links may point at external sites; the HF token must not be
        sent to them.
        """
        host = urlsplit(target_url).hostname or ""
        if host == "huggingface.co" or host.endswith(".huggingface.co"):
            return {"Authorization": f"Bearer {hf_token}"}
        return {}

    try:
        # One client is shared by the index request and all glimpse requests.
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            # Fetch the main HTML page
            response = await client.get(url, headers=headers_for(url))
            response.raise_for_status()

            # Parse the sidebar navigation with BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")

            # Find the sidebar nav (contains flex-auto class)
            sidebar = soup.find("nav", class_=lambda x: x and "flex-auto" in x)

            if not sidebar:
                return (
                    f"Error: Could not find navigation sidebar on {url}. "
                    "The page structure might be different.",
                    False,
                )

            # Extract all links from the sidebar, making root-relative URLs absolute
            nav_data = []
            for link in sidebar.find_all("a", href=True):
                href = link["href"]
                if href.startswith("/"):
                    page_url = f"https://huggingface.co{href}"
                else:
                    page_url = href
                nav_data.append({"title": link.get_text(strip=True), "url": page_url})

            if not nav_data:
                return f"No navigation links found in sidebar at {url}", False

            async def fetch_glimpse(item: dict[str, str]) -> dict[str, str]:
                """Fetch one page's markdown and return its title, URL and glimpse."""
                md_url = f"{item['url']}.md"
                try:
                    md_response = await client.get(md_url, headers=headers_for(md_url))
                    md_response.raise_for_status()

                    content = md_response.text
                    # Get first 200 characters as glimpse
                    glimpse = content[:200].strip()
                    if len(content) > 200:
                        glimpse += "..."
                except Exception as e:
                    # Best effort: a failed glimpse must not fail the whole listing
                    glimpse = f"[Could not fetch glimpse: {str(e)[:50]}]"
                return {"title": item["title"], "url": item["url"], "glimpse": glimpse}

            # Fetch the glimpses concurrently; gather() keeps the sidebar order.
            result_items = await asyncio.gather(
                *(fetch_glimpse(item) for item in nav_data[:max_glimpses])
            )

        # Format the results nicely
        parts = [f"Documentation structure for: {url}\n\n"]
        if len(nav_data) > len(result_items):
            # Be explicit about truncation so the caller knows more pages exist.
            parts.append(
                f"Found {len(nav_data)} pages (showing the first {len(result_items)}):\n\n"
            )
        else:
            parts.append(f"Found {len(result_items)} pages:\n\n")

        for i, item in enumerate(result_items, 1):
            parts.append(f"{i}. **{item['title']}**\n")
            parts.append(f" URL: {item['url']}\n")
            parts.append(f" Glimpse: {item['glimpse']}\n\n")

        return "".join(parts), True

    except httpx.HTTPStatusError as e:
        return (
            f"HTTP error fetching {url}: {e.response.status_code} - {e.response.text[:200]}",
            False,
        )
    except httpx.RequestError as e:
        return f"Request error fetching {url}: {str(e)}", False
    except Exception as e:
        return f"Error exploring docs structure: {str(e)}", False
137
+
138
+
139
async def hf_docs_fetch_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
    """
    Fetch full documentation content from a specific HF docs page

    Root-relative URLs such as '/docs/trl/dpo_trainer' are resolved against
    https://huggingface.co. The '.md' extension is added to the URL path, so
    a '?query' is preserved and a '#fragment' is dropped. The HF token is
    only sent to huggingface.co hosts.

    Args:
        arguments: Dictionary with 'url' parameter (full URL to the doc page)

    Returns:
        Tuple of (full_markdown_content, success)
    """
    # Function-scope import: urllib.parse is not imported at the top of this file.
    from urllib.parse import urlsplit, urlunsplit

    url = arguments.get("url", "")

    if not url:
        return "Error: No URL provided", False

    # Get HF token from environment
    hf_token = os.environ.get("HF_TOKEN")

    if not hf_token:
        return (
            "Error: HF_TOKEN environment variable not set",
            False,
        )

    # Accept root-relative doc paths by resolving them against the HF site.
    if url.startswith("/"):
        url = f"https://huggingface.co{url}"

    try:
        parts = urlsplit(url)
    except ValueError as e:
        return f"Error: Unsupported URL '{url}': {str(e)}", False

    if parts.scheme not in ("http", "https") or not parts.hostname:
        return (
            f"Error: Unsupported URL '{url}'. Provide a full http(s) URL to the doc page.",
            False,
        )

    # Add .md extension if not already present. It is added to the path
    # (not the raw string) so a fragment or query cannot end up before it.
    path = parts.path.rstrip("/")
    if not path.endswith(".md"):
        path = f"{path}.md"
    url = urlunsplit((parts.scheme, parts.netloc, path, parts.query, ""))

    # The URL comes from the model, so only send the token to HF hosts.
    host = parts.hostname
    if host == "huggingface.co" or host.endswith(".huggingface.co"):
        headers = {"Authorization": f"Bearer {hf_token}"}
    else:
        headers = {}

    try:
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            response = await client.get(url, headers=headers)
            response.raise_for_status()

        content = response.text

        # Return the markdown content directly
        result = f"Documentation from: {url}\n\n{content}"
        return result, True

    except httpx.HTTPStatusError as e:
        return (
            f"HTTP error fetching {url}: {e.response.status_code} - {e.response.text[:200]}",
            False,
        )
    except httpx.RequestError as e:
        return f"Request error fetching {url}: {str(e)}", False
    except Exception as e:
        return f"Error fetching documentation: {str(e)}", False
190
+
191
+
192
+ # Tool specifications for the search sub-agent
193
+
194
# Spec for the sub-agent's "explore_docs_structure" tool: name, description
# and a JSON-schema-style "parameters" object with one required string
# argument, "endpoint".
EXPLORE_DOCS_STRUCTURE_TOOL_SPEC = dict(
    name="explore_docs_structure",
    description=(
        "Explore the structure of HF documentation by parsing the sidebar navigation. "
        "Provide an endpoint (e.g., 'trl', 'transformers', 'datasets') and get a list of all "
        "documentation pages with their titles, URLs, and a 200-character glimpse of each page. "
        "Use this to discover what documentation is available before fetching specific pages."
    ),
    parameters=dict(
        type="object",
        properties=dict(
            endpoint=dict(
                type="string",
                description=(
                    "The documentation endpoint to explore (e.g., 'trl', 'transformers', 'hub'). "
                    "Do not include '/docs/' or leading slashes."
                ),
            ),
        ),
        required=["endpoint"],
    ),
)
216
+
217
# Spec for the sub-agent's "fetch_hf_docs" tool: name, description and a
# JSON-schema-style "parameters" object with one required string argument,
# "url".
HF_DOCS_FETCH_TOOL_SPEC = dict(
    name="fetch_hf_docs",
    description=(
        "Fetch the full content of a specific HF documentation page. "
        "Provide the full URL to the doc page (e.g., from explore_docs_structure results). "
        "Returns the complete markdown content of that page. "
        "Use explore_docs_structure first to discover available pages."
    ),
    parameters=dict(
        type="object",
        properties=dict(
            url=dict(
                type="string",
                description=(
                    "The full URL to the documentation page. "
                    "Example: 'https://huggingface.co/docs/trl/dpo_trainer' "
                    "The .md extension will be added automatically if not present."
                ),
            ),
        ),
        required=["url"],
    ),
)
agent/tools/search_docs_tool.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Search documentation tool that spawns a sub-agent
3
+ The sub-agent has its own agent loop and set of specialized search tools
4
+ """
5
+
6
+ import asyncio
7
+ from typing import Any
8
+
9
+ from litellm.utils import get_max_tokens
10
+
11
+ from agent.config import Config
12
+ from agent.core.session import Session
13
+
14
+
15
def create_search_tool_router():
    """Build and return the ToolRouter used by the search sub-agent.

    The router is a ToolRouter subclass whose constructor starts with empty
    tool and MCP-server tables and then registers every tool returned by
    make_search_agent_tools().
    """
    # Deferred import: agent.core.tools imports this module, so importing it
    # at module level would be circular.
    from agent.core.tools import ToolRouter

    class SearchDocsToolRouter(ToolRouter):
        """ToolRouter restricted to the search sub-agent's own tools"""

        def __init__(self):
            # Router state is set directly here; ToolRouter.__init__ is not called.
            self.tools: dict[str, Any] = {}
            self.mcp_servers: dict[str, dict[str, Any]] = {}
            self._mcp_initialized = False
            self.mcp_client = None

            for search_tool in make_search_agent_tools():
                self.register_tool(search_tool)

    router = SearchDocsToolRouter()
    return router
33
+
34
+
35
async def search_docs_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
    """
    Handler that spawns a sub-agent to perform comprehensive doc search

    A dedicated Session is created with its own event queue, the search tool
    router and a ContextManager, then the agent loop is run on the query for
    up to 30 iterations. Any exception raised while setting up or running the
    sub-agent is reported as a failed tool result rather than propagated.

    Args:
        arguments: dictionary with 'query' parameter

    Returns:
        Tuple of (search_results, success)
    """
    query = arguments.get("query", "")

    # A missing, empty or whitespace-only query counts as "no query", so a
    # blank string does not start a sub-agent run.
    if not query or not str(query).strip():
        return "Error: No search query provided", False

    try:
        # Import at runtime to avoid circular dependency
        from agent.context_manager.manager import ContextManager
        from agent.core.agent_loop import Handlers

        # Queue that receives the sub-agent's events; it is not read here.
        sub_event_queue = asyncio.Queue()

        # Create specialized tool router for search
        search_tool_router = create_search_tool_router()

        # The sub-agent runs on a hard-coded model, independent of whatever
        # model the main agent is configured with.
        sub_config = Config(
            model_name="anthropic/claude-haiku-4-5",
        )

        # The sub-agent's system prompt is meant to be selected through
        # ContextManager's prompt_file_suffix parameter.
        # NOTE(review): ContextManager.__init__ in this commit passes the
        # literal "system_prompt.yaml" to _load_system_prompt instead of its
        # prompt_file_suffix argument, so the search prompt may not actually
        # be loaded - confirm and fix there if so.
        sub_session = Session(
            event_queue=sub_event_queue,
            config=sub_config,
            tool_router=search_tool_router,
            context_manager=ContextManager(
                tool_specs=search_tool_router.get_tool_specs_for_llm(),
                max_context=get_max_tokens(sub_config.model_name),
                compact_size=0.1,
                untouched_messages=5,
                prompt_file_suffix="search_docs_system_prompt.yaml",
            ),
        )

        async with search_tool_router:
            # Run the sub-agent
            result = await Handlers.run_agent(
                session=sub_session, text=query, max_iterations=30
            )

        # Return the final result, or report that nothing was produced
        if result:
            return f"Search Results:\n\n{result}", True
        return "Search completed but no results were generated", False
    except Exception as e:
        # Tool boundary: report the failure to the calling agent instead of raising
        return f"Error in search_docs tool: {str(e)}", False
95
+
96
+
97
+ # Tool specification to be used by the main agent
98
# Spec for the main agent's "search_docs" tool. The description names the
# tools the sub-agent is actually given by make_search_agent_tools()
# (explore_docs_structure and fetch_hf_docs), so the main agent is not told
# about tools that do not exist.
SEARCH_DOCS_TOOL_SPEC = {
    "name": "search_docs",
    "description": (
        "Intelligently search HF documentation for libraries, repositories, and best practices with an agent that has access to: explore_docs_structure, fetch_hf_docs. "
        "The agent acts like your personal search assistant. "
        "Using the search agent is necessary to give the best quality answer to the user's question. Most questions require a search to get the best information on code examples.\n\n"
        "WHEN TO USE THIS TOOL:\n"
        " - When searching for high-level concepts like 'how to do GRPO training on a model?' or 'best way to do inference on a trained model?'\n"
        " - When you need to get code examples for intricate ML code patterns like training loops, inference pipelines, data processing, etc.\n\n"
        "USAGE GUIDELINES:\n"
        " 1. Launch multiple agents concurrently for better performance.\n"
        " 2. Be specific in your query - include exact terminology, expected file locations, or code patterns.\n"
        " 3. Use the query as if you were talking to another engineer. Bad: logger impl Good: where is the logger implemented, we're trying to find out how to log to files.\n"
        " 4. Make sure to formulate the query in such a way that the agent knows when it's done or has found the result."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": (
                    "The search query describing to the agent what it should do. Be "
                    "specific and include technical terms, file types, or expected "
                    "code patterns to help the agent find relevant code. Formulate "
                    "the query in a way that makes it clear to the agent when it "
                    "has found the right thing."
                ),
            },
        },
        "required": ["query"],
    },
}
130
+
131
+
132
+
133
+
134
def make_search_agent_tools():
    """
    Build the ToolSpec list handed to the search sub-agent.

    Returns two ToolSpec objects, in order: the docs-structure explorer and
    the docs page fetcher, each pairing its spec dict with its async handler.
    """
    # Deferred imports to avoid a circular dependency
    from agent.core.tools import ToolSpec
    from agent.tools._search_agent_tools import (
        EXPLORE_DOCS_STRUCTURE_TOOL_SPEC,
        HF_DOCS_FETCH_TOOL_SPEC,
        explore_docs_structure_handler,
        hf_docs_fetch_handler,
    )

    # (spec dict, handler) pairs, in the order the tools are exposed
    spec_handler_pairs = (
        (EXPLORE_DOCS_STRUCTURE_TOOL_SPEC, explore_docs_structure_handler),
        (HF_DOCS_FETCH_TOOL_SPEC, hf_docs_fetch_handler),
    )

    return [
        ToolSpec(
            name=spec["name"],
            description=spec["description"],
            parameters=spec["parameters"],
            handler=handler,
        )
        for spec, handler in spec_handler_pairs
    ]
pyproject.toml CHANGED
@@ -20,4 +20,5 @@ dependencies = [
20
  "transformers>=2.3.0",
21
  "torch>=2.9.1",
22
  "pytest>=9.0.2",
 
23
  ]
 
20
  "transformers>=2.3.0",
21
  "torch>=2.9.1",
22
  "pytest>=9.0.2",
23
+ "trafilatura>=2.0.0",
24
  ]
uv.lock CHANGED
@@ -230,6 +230,15 @@ wheels = [
230
  { url = "https://files.pythonhosted.org/packages/f8/aa/5082412d1ee302e9e7d80b6949bc4d2a8fa1149aaab610c5fc24709605d6/authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a", size = 243608, upload-time = "2025-10-02T13:36:07.637Z" },
231
  ]
232
 
 
 
 
 
 
 
 
 
 
233
  [[package]]
234
  name = "beartype"
235
  version = "0.22.6"
@@ -433,6 +442,20 @@ wheels = [
433
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
434
  ]
435
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  [[package]]
437
  name = "cryptography"
438
  version = "46.0.3"
@@ -529,6 +552,21 @@ wheels = [
529
  { url = "https://files.pythonhosted.org/packages/3b/5e/6f8d874366788ad5d549e9ba258037d974dda6e004843be1bda794571701/datasets-4.4.1-py3-none-any.whl", hash = "sha256:c1163de5211e42546079ab355cc0250c7e6db16eb209ac5ac6252f801f596c44", size = 511591, upload-time = "2025-11-05T16:00:36.365Z" },
530
  ]
531
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  [[package]]
533
  name = "debugpy"
534
  version = "1.8.17"
@@ -898,6 +936,7 @@ dependencies = [
898
  { name = "requests" },
899
  { name = "tenacity" },
900
  { name = "torch" },
 
901
  { name = "transformers" },
902
  ]
903
 
@@ -917,6 +956,7 @@ requires-dist = [
917
  { name = "requests", specifier = ">=2.32.5" },
918
  { name = "tenacity", specifier = ">=8.0.0" },
919
  { name = "torch", specifier = ">=2.9.1" },
 
920
  { name = "transformers", specifier = ">=2.3.0" },
921
  ]
922
 
@@ -949,6 +989,22 @@ wheels = [
949
  { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" },
950
  ]
951
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
952
  [[package]]
953
  name = "httpcore"
954
  version = "1.0.9"
@@ -1391,6 +1447,18 @@ wheels = [
1391
  { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
1392
  ]
1393
 
 
 
 
 
 
 
 
 
 
 
 
 
1394
  [[package]]
1395
  name = "keyring"
1396
  version = "25.7.0"
@@ -1497,6 +1565,103 @@ all = [
1497
  { name = "opentelemetry-instrumentation-weaviate" },
1498
  ]
1499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1500
  [[package]]
1501
  name = "markdown-it-py"
1502
  version = "4.0.0"
@@ -3699,6 +3864,15 @@ wheels = [
3699
  { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
3700
  ]
3701
 
 
 
 
 
 
 
 
 
 
3702
  [[package]]
3703
  name = "tokenizers"
3704
  version = "0.22.1"
@@ -3788,6 +3962,24 @@ wheels = [
3788
  { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
3789
  ]
3790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3791
  [[package]]
3792
  name = "transformers"
3793
  version = "2.3.0"
@@ -3861,6 +4053,18 @@ wheels = [
3861
  { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
3862
  ]
3863
 
 
 
 
 
 
 
 
 
 
 
 
 
3864
  [[package]]
3865
  name = "uc-micro-py"
3866
  version = "1.0.3"
 
230
  { url = "https://files.pythonhosted.org/packages/f8/aa/5082412d1ee302e9e7d80b6949bc4d2a8fa1149aaab610c5fc24709605d6/authlib-1.6.5-py2.py3-none-any.whl", hash = "sha256:3e0e0507807f842b02175507bdee8957a1d5707fd4afb17c32fb43fee90b6e3a", size = 243608, upload-time = "2025-10-02T13:36:07.637Z" },
231
  ]
232
 
233
+ [[package]]
234
+ name = "babel"
235
+ version = "2.17.0"
236
+ source = { registry = "https://pypi.org/simple" }
237
+ sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" }
238
+ wheels = [
239
+ { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" },
240
+ ]
241
+
242
  [[package]]
243
  name = "beartype"
244
  version = "0.22.6"
 
442
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
443
  ]
444
 
445
+ [[package]]
446
+ name = "courlan"
447
+ version = "1.3.2"
448
+ source = { registry = "https://pypi.org/simple" }
449
+ dependencies = [
450
+ { name = "babel" },
451
+ { name = "tld" },
452
+ { name = "urllib3" },
453
+ ]
454
+ sdist = { url = "https://files.pythonhosted.org/packages/6f/54/6d6ceeff4bed42e7a10d6064d35ee43a810e7b3e8beb4abeae8cff4713ae/courlan-1.3.2.tar.gz", hash = "sha256:0b66f4db3a9c39a6e22dd247c72cfaa57d68ea660e94bb2c84ec7db8712af190", size = 206382, upload-time = "2024-10-29T16:40:20.994Z" }
455
+ wheels = [
456
+ { url = "https://files.pythonhosted.org/packages/8e/ca/6a667ccbe649856dcd3458bab80b016681b274399d6211187c6ab969fc50/courlan-1.3.2-py3-none-any.whl", hash = "sha256:d0dab52cf5b5b1000ee2839fbc2837e93b2514d3cb5bb61ae158a55b7a04c6be", size = 33848, upload-time = "2024-10-29T16:40:18.325Z" },
457
+ ]
458
+
459
  [[package]]
460
  name = "cryptography"
461
  version = "46.0.3"
 
552
  { url = "https://files.pythonhosted.org/packages/3b/5e/6f8d874366788ad5d549e9ba258037d974dda6e004843be1bda794571701/datasets-4.4.1-py3-none-any.whl", hash = "sha256:c1163de5211e42546079ab355cc0250c7e6db16eb209ac5ac6252f801f596c44", size = 511591, upload-time = "2025-11-05T16:00:36.365Z" },
553
  ]
554
 
555
+ [[package]]
556
+ name = "dateparser"
557
+ version = "1.2.2"
558
+ source = { registry = "https://pypi.org/simple" }
559
+ dependencies = [
560
+ { name = "python-dateutil" },
561
+ { name = "pytz" },
562
+ { name = "regex" },
563
+ { name = "tzlocal" },
564
+ ]
565
+ sdist = { url = "https://files.pythonhosted.org/packages/a9/30/064144f0df1749e7bb5faaa7f52b007d7c2d08ec08fed8411aba87207f68/dateparser-1.2.2.tar.gz", hash = "sha256:986316f17cb8cdc23ea8ce563027c5ef12fc725b6fb1d137c14ca08777c5ecf7", size = 329840, upload-time = "2025-06-26T09:29:23.211Z" }
566
+ wheels = [
567
+ { url = "https://files.pythonhosted.org/packages/87/22/f020c047ae1346613db9322638186468238bcfa8849b4668a22b97faad65/dateparser-1.2.2-py3-none-any.whl", hash = "sha256:5a5d7211a09013499867547023a2a0c91d5a27d15dd4dbcea676ea9fe66f2482", size = 315453, upload-time = "2025-06-26T09:29:21.412Z" },
568
+ ]
569
+
570
  [[package]]
571
  name = "debugpy"
572
  version = "1.8.17"
 
936
  { name = "requests" },
937
  { name = "tenacity" },
938
  { name = "torch" },
939
+ { name = "trafilatura" },
940
  { name = "transformers" },
941
  ]
942
 
 
956
  { name = "requests", specifier = ">=2.32.5" },
957
  { name = "tenacity", specifier = ">=8.0.0" },
958
  { name = "torch", specifier = ">=2.9.1" },
959
+ { name = "trafilatura", specifier = ">=2.0.0" },
960
  { name = "transformers", specifier = ">=2.3.0" },
961
  ]
962
 
 
989
  { url = "https://files.pythonhosted.org/packages/cb/44/870d44b30e1dcfb6a65932e3e1506c103a8a5aea9103c337e7a53180322c/hf_xet-1.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:e6584a52253f72c9f52f9e549d5895ca7a471608495c4ecaa6cc73dba2b24d69", size = 2905735, upload-time = "2025-10-24T19:04:35.928Z" },
990
  ]
991
 
992
+ [[package]]
993
+ name = "htmldate"
994
+ version = "1.9.4"
995
+ source = { registry = "https://pypi.org/simple" }
996
+ dependencies = [
997
+ { name = "charset-normalizer" },
998
+ { name = "dateparser" },
999
+ { name = "lxml" },
1000
+ { name = "python-dateutil" },
1001
+ { name = "urllib3" },
1002
+ ]
1003
+ sdist = { url = "https://files.pythonhosted.org/packages/9d/10/ead9dabc999f353c3aa5d0dc0835b1e355215a5ecb489a7f4ef2ddad5e33/htmldate-1.9.4.tar.gz", hash = "sha256:1129063e02dd0354b74264de71e950c0c3fcee191178321418ccad2074cc8ed0", size = 44690, upload-time = "2025-11-04T17:46:44.983Z" }
1004
+ wheels = [
1005
+ { url = "https://files.pythonhosted.org/packages/a1/bd/adfcdaaad5805c0c5156aeefd64c1e868c05e9c1cd6fd21751f168cd88c7/htmldate-1.9.4-py3-none-any.whl", hash = "sha256:1b94bcc4e08232a5b692159903acf95548b6a7492dddca5bb123d89d6325921c", size = 31558, upload-time = "2025-11-04T17:46:43.258Z" },
1006
+ ]
1007
+
1008
  [[package]]
1009
  name = "httpcore"
1010
  version = "1.0.9"
 
1447
  { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
1448
  ]
1449
 
1450
+ [[package]]
1451
+ name = "justext"
1452
+ version = "3.0.2"
1453
+ source = { registry = "https://pypi.org/simple" }
1454
+ dependencies = [
1455
+ { name = "lxml", extra = ["html-clean"] },
1456
+ ]
1457
+ sdist = { url = "https://files.pythonhosted.org/packages/49/f3/45890c1b314f0d04e19c1c83d534e611513150939a7cf039664d9ab1e649/justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05", size = 828521, upload-time = "2025-02-25T20:21:49.934Z" }
1458
+ wheels = [
1459
+ { url = "https://files.pythonhosted.org/packages/f2/ac/52f4e86d1924a7fc05af3aeb34488570eccc39b4af90530dd6acecdf16b5/justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7", size = 837940, upload-time = "2025-02-25T20:21:44.179Z" },
1460
+ ]
1461
+
1462
  [[package]]
1463
  name = "keyring"
1464
  version = "25.7.0"
 
1565
  { name = "opentelemetry-instrumentation-weaviate" },
1566
  ]
1567
 
1568
+ [[package]]
1569
+ name = "lxml"
1570
+ version = "6.0.2"
1571
+ source = { registry = "https://pypi.org/simple" }
1572
+ sdist = { url = "https://files.pythonhosted.org/packages/aa/88/262177de60548e5a2bfc46ad28232c9e9cbde697bd94132aeb80364675cb/lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62", size = 4073426, upload-time = "2025-09-22T04:04:59.287Z" }
1573
+ wheels = [
1574
+ { url = "https://files.pythonhosted.org/packages/f3/c8/8ff2bc6b920c84355146cd1ab7d181bc543b89241cfb1ebee824a7c81457/lxml-6.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a59f5448ba2ceccd06995c95ea59a7674a10de0810f2ce90c9006f3cbc044456", size = 8661887, upload-time = "2025-09-22T04:01:17.265Z" },
1575
+ { url = "https://files.pythonhosted.org/packages/37/6f/9aae1008083bb501ef63284220ce81638332f9ccbfa53765b2b7502203cf/lxml-6.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8113639f3296706fbac34a30813929e29247718e88173ad849f57ca59754924", size = 4667818, upload-time = "2025-09-22T04:01:19.688Z" },
1576
+ { url = "https://files.pythonhosted.org/packages/f1/ca/31fb37f99f37f1536c133476674c10b577e409c0a624384147653e38baf2/lxml-6.0.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a8bef9b9825fa8bc816a6e641bb67219489229ebc648be422af695f6e7a4fa7f", size = 4950807, upload-time = "2025-09-22T04:01:21.487Z" },
1577
+ { url = "https://files.pythonhosted.org/packages/da/87/f6cb9442e4bada8aab5ae7e1046264f62fdbeaa6e3f6211b93f4c0dd97f1/lxml-6.0.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:65ea18d710fd14e0186c2f973dc60bb52039a275f82d3c44a0e42b43440ea534", size = 5109179, upload-time = "2025-09-22T04:01:23.32Z" },
1578
+ { url = "https://files.pythonhosted.org/packages/c8/20/a7760713e65888db79bbae4f6146a6ae5c04e4a204a3c48896c408cd6ed2/lxml-6.0.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c371aa98126a0d4c739ca93ceffa0fd7a5d732e3ac66a46e74339acd4d334564", size = 5023044, upload-time = "2025-09-22T04:01:25.118Z" },
1579
+ { url = "https://files.pythonhosted.org/packages/a2/b0/7e64e0460fcb36471899f75831509098f3fd7cd02a3833ac517433cb4f8f/lxml-6.0.2-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:700efd30c0fa1a3581d80a748157397559396090a51d306ea59a70020223d16f", size = 5359685, upload-time = "2025-09-22T04:01:27.398Z" },
1580
+ { url = "https://files.pythonhosted.org/packages/b9/e1/e5df362e9ca4e2f48ed6411bd4b3a0ae737cc842e96877f5bf9428055ab4/lxml-6.0.2-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c33e66d44fe60e72397b487ee92e01da0d09ba2d66df8eae42d77b6d06e5eba0", size = 5654127, upload-time = "2025-09-22T04:01:29.629Z" },
1581
+ { url = "https://files.pythonhosted.org/packages/c6/d1/232b3309a02d60f11e71857778bfcd4acbdb86c07db8260caf7d008b08f8/lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90a345bbeaf9d0587a3aaffb7006aa39ccb6ff0e96a57286c0cb2fd1520ea192", size = 5253958, upload-time = "2025-09-22T04:01:31.535Z" },
1582
+ { url = "https://files.pythonhosted.org/packages/35/35/d955a070994725c4f7d80583a96cab9c107c57a125b20bb5f708fe941011/lxml-6.0.2-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:064fdadaf7a21af3ed1dcaa106b854077fbeada827c18f72aec9346847cd65d0", size = 4711541, upload-time = "2025-09-22T04:01:33.801Z" },
1583
+ { url = "https://files.pythonhosted.org/packages/1e/be/667d17363b38a78c4bd63cfd4b4632029fd68d2c2dc81f25ce9eb5224dd5/lxml-6.0.2-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fbc74f42c3525ac4ffa4b89cbdd00057b6196bcefe8bce794abd42d33a018092", size = 5267426, upload-time = "2025-09-22T04:01:35.639Z" },
1584
+ { url = "https://files.pythonhosted.org/packages/ea/47/62c70aa4a1c26569bc958c9ca86af2bb4e1f614e8c04fb2989833874f7ae/lxml-6.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6ddff43f702905a4e32bc24f3f2e2edfe0f8fde3277d481bffb709a4cced7a1f", size = 5064917, upload-time = "2025-09-22T04:01:37.448Z" },
1585
+ { url = "https://files.pythonhosted.org/packages/bd/55/6ceddaca353ebd0f1908ef712c597f8570cc9c58130dbb89903198e441fd/lxml-6.0.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6da5185951d72e6f5352166e3da7b0dc27aa70bd1090b0eb3f7f7212b53f1bb8", size = 4788795, upload-time = "2025-09-22T04:01:39.165Z" },
1586
+ { url = "https://files.pythonhosted.org/packages/cf/e8/fd63e15da5e3fd4c2146f8bbb3c14e94ab850589beab88e547b2dbce22e1/lxml-6.0.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:57a86e1ebb4020a38d295c04fc79603c7899e0df71588043eb218722dabc087f", size = 5676759, upload-time = "2025-09-22T04:01:41.506Z" },
1587
+ { url = "https://files.pythonhosted.org/packages/76/47/b3ec58dc5c374697f5ba37412cd2728f427d056315d124dd4b61da381877/lxml-6.0.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2047d8234fe735ab77802ce5f2297e410ff40f5238aec569ad7c8e163d7b19a6", size = 5255666, upload-time = "2025-09-22T04:01:43.363Z" },
1588
+ { url = "https://files.pythonhosted.org/packages/19/93/03ba725df4c3d72afd9596eef4a37a837ce8e4806010569bedfcd2cb68fd/lxml-6.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f91fd2b2ea15a6800c8e24418c0775a1694eefc011392da73bc6cef2623b322", size = 5277989, upload-time = "2025-09-22T04:01:45.215Z" },
1589
+ { url = "https://files.pythonhosted.org/packages/c6/80/c06de80bfce881d0ad738576f243911fccf992687ae09fd80b734712b39c/lxml-6.0.2-cp312-cp312-win32.whl", hash = "sha256:3ae2ce7d6fedfb3414a2b6c5e20b249c4c607f72cb8d2bb7cc9c6ec7c6f4e849", size = 3611456, upload-time = "2025-09-22T04:01:48.243Z" },
1590
+ { url = "https://files.pythonhosted.org/packages/f7/d7/0cdfb6c3e30893463fb3d1e52bc5f5f99684a03c29a0b6b605cfae879cd5/lxml-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:72c87e5ee4e58a8354fb9c7c84cbf95a1c8236c127a5d1b7683f04bed8361e1f", size = 4011793, upload-time = "2025-09-22T04:01:50.042Z" },
1591
+ { url = "https://files.pythonhosted.org/packages/ea/7b/93c73c67db235931527301ed3785f849c78991e2e34f3fd9a6663ffda4c5/lxml-6.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:61cb10eeb95570153e0c0e554f58df92ecf5109f75eacad4a95baa709e26c3d6", size = 3672836, upload-time = "2025-09-22T04:01:52.145Z" },
1592
+ { url = "https://files.pythonhosted.org/packages/53/fd/4e8f0540608977aea078bf6d79f128e0e2c2bba8af1acf775c30baa70460/lxml-6.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77", size = 8648494, upload-time = "2025-09-22T04:01:54.242Z" },
1593
+ { url = "https://files.pythonhosted.org/packages/5d/f4/2a94a3d3dfd6c6b433501b8d470a1960a20ecce93245cf2db1706adf6c19/lxml-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f", size = 4661146, upload-time = "2025-09-22T04:01:56.282Z" },
1594
+ { url = "https://files.pythonhosted.org/packages/25/2e/4efa677fa6b322013035d38016f6ae859d06cac67437ca7dc708a6af7028/lxml-6.0.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452", size = 4946932, upload-time = "2025-09-22T04:01:58.989Z" },
1595
+ { url = "https://files.pythonhosted.org/packages/ce/0f/526e78a6d38d109fdbaa5049c62e1d32fdd70c75fb61c4eadf3045d3d124/lxml-6.0.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048", size = 5100060, upload-time = "2025-09-22T04:02:00.812Z" },
1596
+ { url = "https://files.pythonhosted.org/packages/81/76/99de58d81fa702cc0ea7edae4f4640416c2062813a00ff24bd70ac1d9c9b/lxml-6.0.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df", size = 5019000, upload-time = "2025-09-22T04:02:02.671Z" },
1597
+ { url = "https://files.pythonhosted.org/packages/b5/35/9e57d25482bc9a9882cb0037fdb9cc18f4b79d85df94fa9d2a89562f1d25/lxml-6.0.2-cp313-cp313-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1", size = 5348496, upload-time = "2025-09-22T04:02:04.904Z" },
1598
+ { url = "https://files.pythonhosted.org/packages/a6/8e/cb99bd0b83ccc3e8f0f528e9aa1f7a9965dfec08c617070c5db8d63a87ce/lxml-6.0.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916", size = 5643779, upload-time = "2025-09-22T04:02:06.689Z" },
1599
+ { url = "https://files.pythonhosted.org/packages/d0/34/9e591954939276bb679b73773836c6684c22e56d05980e31d52a9a8deb18/lxml-6.0.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd", size = 5244072, upload-time = "2025-09-22T04:02:08.587Z" },
1600
+ { url = "https://files.pythonhosted.org/packages/8d/27/b29ff065f9aaca443ee377aff699714fcbffb371b4fce5ac4ca759e436d5/lxml-6.0.2-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6", size = 4718675, upload-time = "2025-09-22T04:02:10.783Z" },
1601
+ { url = "https://files.pythonhosted.org/packages/2b/9f/f756f9c2cd27caa1a6ef8c32ae47aadea697f5c2c6d07b0dae133c244fbe/lxml-6.0.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a", size = 5255171, upload-time = "2025-09-22T04:02:12.631Z" },
1602
+ { url = "https://files.pythonhosted.org/packages/61/46/bb85ea42d2cb1bd8395484fd72f38e3389611aa496ac7772da9205bbda0e/lxml-6.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679", size = 5057175, upload-time = "2025-09-22T04:02:14.718Z" },
1603
+ { url = "https://files.pythonhosted.org/packages/95/0c/443fc476dcc8e41577f0af70458c50fe299a97bb6b7505bb1ae09aa7f9ac/lxml-6.0.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659", size = 4785688, upload-time = "2025-09-22T04:02:16.957Z" },
1604
+ { url = "https://files.pythonhosted.org/packages/48/78/6ef0b359d45bb9697bc5a626e1992fa5d27aa3f8004b137b2314793b50a0/lxml-6.0.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484", size = 5660655, upload-time = "2025-09-22T04:02:18.815Z" },
1605
+ { url = "https://files.pythonhosted.org/packages/ff/ea/e1d33808f386bc1339d08c0dcada6e4712d4ed8e93fcad5f057070b7988a/lxml-6.0.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2", size = 5247695, upload-time = "2025-09-22T04:02:20.593Z" },
1606
+ { url = "https://files.pythonhosted.org/packages/4f/47/eba75dfd8183673725255247a603b4ad606f4ae657b60c6c145b381697da/lxml-6.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314", size = 5269841, upload-time = "2025-09-22T04:02:22.489Z" },
1607
+ { url = "https://files.pythonhosted.org/packages/76/04/5c5e2b8577bc936e219becb2e98cdb1aca14a4921a12995b9d0c523502ae/lxml-6.0.2-cp313-cp313-win32.whl", hash = "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2", size = 3610700, upload-time = "2025-09-22T04:02:24.465Z" },
1608
+ { url = "https://files.pythonhosted.org/packages/fe/0a/4643ccc6bb8b143e9f9640aa54e38255f9d3b45feb2cbe7ae2ca47e8782e/lxml-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7", size = 4010347, upload-time = "2025-09-22T04:02:26.286Z" },
1609
+ { url = "https://files.pythonhosted.org/packages/31/ef/dcf1d29c3f530577f61e5fe2f1bd72929acf779953668a8a47a479ae6f26/lxml-6.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf", size = 3671248, upload-time = "2025-09-22T04:02:27.918Z" },
1610
+ { url = "https://files.pythonhosted.org/packages/03/15/d4a377b385ab693ce97b472fe0c77c2b16ec79590e688b3ccc71fba19884/lxml-6.0.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe", size = 8659801, upload-time = "2025-09-22T04:02:30.113Z" },
1611
+ { url = "https://files.pythonhosted.org/packages/c8/e8/c128e37589463668794d503afaeb003987373c5f94d667124ffd8078bbd9/lxml-6.0.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d", size = 4659403, upload-time = "2025-09-22T04:02:32.119Z" },
1612
+ { url = "https://files.pythonhosted.org/packages/00/ce/74903904339decdf7da7847bb5741fc98a5451b42fc419a86c0c13d26fe2/lxml-6.0.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d", size = 4966974, upload-time = "2025-09-22T04:02:34.155Z" },
1613
+ { url = "https://files.pythonhosted.org/packages/1f/d3/131dec79ce61c5567fecf82515bd9bc36395df42501b50f7f7f3bd065df0/lxml-6.0.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5", size = 5102953, upload-time = "2025-09-22T04:02:36.054Z" },
1614
+ { url = "https://files.pythonhosted.org/packages/3a/ea/a43ba9bb750d4ffdd885f2cd333572f5bb900cd2408b67fdda07e85978a0/lxml-6.0.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0", size = 5055054, upload-time = "2025-09-22T04:02:38.154Z" },
1615
+ { url = "https://files.pythonhosted.org/packages/60/23/6885b451636ae286c34628f70a7ed1fcc759f8d9ad382d132e1c8d3d9bfd/lxml-6.0.2-cp314-cp314-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba", size = 5352421, upload-time = "2025-09-22T04:02:40.413Z" },
1616
+ { url = "https://files.pythonhosted.org/packages/48/5b/fc2ddfc94ddbe3eebb8e9af6e3fd65e2feba4967f6a4e9683875c394c2d8/lxml-6.0.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0", size = 5673684, upload-time = "2025-09-22T04:02:42.288Z" },
1617
+ { url = "https://files.pythonhosted.org/packages/29/9c/47293c58cc91769130fbf85531280e8cc7868f7fbb6d92f4670071b9cb3e/lxml-6.0.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d", size = 5252463, upload-time = "2025-09-22T04:02:44.165Z" },
1618
+ { url = "https://files.pythonhosted.org/packages/9b/da/ba6eceb830c762b48e711ded880d7e3e89fc6c7323e587c36540b6b23c6b/lxml-6.0.2-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37", size = 4698437, upload-time = "2025-09-22T04:02:46.524Z" },
1619
+ { url = "https://files.pythonhosted.org/packages/a5/24/7be3f82cb7990b89118d944b619e53c656c97dc89c28cfb143fdb7cd6f4d/lxml-6.0.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9", size = 5269890, upload-time = "2025-09-22T04:02:48.812Z" },
1620
+ { url = "https://files.pythonhosted.org/packages/1b/bd/dcfb9ea1e16c665efd7538fc5d5c34071276ce9220e234217682e7d2c4a5/lxml-6.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917", size = 5097185, upload-time = "2025-09-22T04:02:50.746Z" },
1621
+ { url = "https://files.pythonhosted.org/packages/21/04/a60b0ff9314736316f28316b694bccbbabe100f8483ad83852d77fc7468e/lxml-6.0.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f", size = 4745895, upload-time = "2025-09-22T04:02:52.968Z" },
1622
+ { url = "https://files.pythonhosted.org/packages/d6/bd/7d54bd1846e5a310d9c715921c5faa71cf5c0853372adf78aee70c8d7aa2/lxml-6.0.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8", size = 5695246, upload-time = "2025-09-22T04:02:54.798Z" },
1623
+ { url = "https://files.pythonhosted.org/packages/fd/32/5643d6ab947bc371da21323acb2a6e603cedbe71cb4c99c8254289ab6f4e/lxml-6.0.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a", size = 5260797, upload-time = "2025-09-22T04:02:57.058Z" },
1624
+ { url = "https://files.pythonhosted.org/packages/33/da/34c1ec4cff1eea7d0b4cd44af8411806ed943141804ac9c5d565302afb78/lxml-6.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c", size = 5277404, upload-time = "2025-09-22T04:02:58.966Z" },
1625
+ { url = "https://files.pythonhosted.org/packages/82/57/4eca3e31e54dc89e2c3507e1cd411074a17565fa5ffc437c4ae0a00d439e/lxml-6.0.2-cp314-cp314-win32.whl", hash = "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b", size = 3670072, upload-time = "2025-09-22T04:03:38.05Z" },
1626
+ { url = "https://files.pythonhosted.org/packages/e3/e0/c96cf13eccd20c9421ba910304dae0f619724dcf1702864fd59dd386404d/lxml-6.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed", size = 4080617, upload-time = "2025-09-22T04:03:39.835Z" },
1627
+ { url = "https://files.pythonhosted.org/packages/d5/5d/b3f03e22b3d38d6f188ef044900a9b29b2fe0aebb94625ce9fe244011d34/lxml-6.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8", size = 3754930, upload-time = "2025-09-22T04:03:41.565Z" },
1628
+ { url = "https://files.pythonhosted.org/packages/5e/5c/42c2c4c03554580708fc738d13414801f340c04c3eff90d8d2d227145275/lxml-6.0.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d", size = 8910380, upload-time = "2025-09-22T04:03:01.645Z" },
1629
+ { url = "https://files.pythonhosted.org/packages/bf/4f/12df843e3e10d18d468a7557058f8d3733e8b6e12401f30b1ef29360740f/lxml-6.0.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba", size = 4775632, upload-time = "2025-09-22T04:03:03.814Z" },
1630
+ { url = "https://files.pythonhosted.org/packages/e4/0c/9dc31e6c2d0d418483cbcb469d1f5a582a1cd00a1f4081953d44051f3c50/lxml-6.0.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601", size = 4975171, upload-time = "2025-09-22T04:03:05.651Z" },
1631
+ { url = "https://files.pythonhosted.org/packages/e7/2b/9b870c6ca24c841bdd887504808f0417aa9d8d564114689266f19ddf29c8/lxml-6.0.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed", size = 5110109, upload-time = "2025-09-22T04:03:07.452Z" },
1632
+ { url = "https://files.pythonhosted.org/packages/bf/0c/4f5f2a4dd319a178912751564471355d9019e220c20d7db3fb8307ed8582/lxml-6.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37", size = 5041061, upload-time = "2025-09-22T04:03:09.297Z" },
1633
+ { url = "https://files.pythonhosted.org/packages/12/64/554eed290365267671fe001a20d72d14f468ae4e6acef1e179b039436967/lxml-6.0.2-cp314-cp314t-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338", size = 5306233, upload-time = "2025-09-22T04:03:11.651Z" },
1634
+ { url = "https://files.pythonhosted.org/packages/7a/31/1d748aa275e71802ad9722df32a7a35034246b42c0ecdd8235412c3396ef/lxml-6.0.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9", size = 5604739, upload-time = "2025-09-22T04:03:13.592Z" },
1635
+ { url = "https://files.pythonhosted.org/packages/8f/41/2c11916bcac09ed561adccacceaedd2bf0e0b25b297ea92aab99fd03d0fa/lxml-6.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd", size = 5225119, upload-time = "2025-09-22T04:03:15.408Z" },
1636
+ { url = "https://files.pythonhosted.org/packages/99/05/4e5c2873d8f17aa018e6afde417c80cc5d0c33be4854cce3ef5670c49367/lxml-6.0.2-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d", size = 4633665, upload-time = "2025-09-22T04:03:17.262Z" },
1637
+ { url = "https://files.pythonhosted.org/packages/0f/c9/dcc2da1bebd6275cdc723b515f93edf548b82f36a5458cca3578bc899332/lxml-6.0.2-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9", size = 5234997, upload-time = "2025-09-22T04:03:19.14Z" },
1638
+ { url = "https://files.pythonhosted.org/packages/9c/e2/5172e4e7468afca64a37b81dba152fc5d90e30f9c83c7c3213d6a02a5ce4/lxml-6.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e", size = 5090957, upload-time = "2025-09-22T04:03:21.436Z" },
1639
+ { url = "https://files.pythonhosted.org/packages/a5/b3/15461fd3e5cd4ddcb7938b87fc20b14ab113b92312fc97afe65cd7c85de1/lxml-6.0.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d", size = 4764372, upload-time = "2025-09-22T04:03:23.27Z" },
1640
+ { url = "https://files.pythonhosted.org/packages/05/33/f310b987c8bf9e61c4dd8e8035c416bd3230098f5e3cfa69fc4232de7059/lxml-6.0.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec", size = 5634653, upload-time = "2025-09-22T04:03:25.767Z" },
1641
+ { url = "https://files.pythonhosted.org/packages/70/ff/51c80e75e0bc9382158133bdcf4e339b5886c6ee2418b5199b3f1a61ed6d/lxml-6.0.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272", size = 5233795, upload-time = "2025-09-22T04:03:27.62Z" },
1642
+ { url = "https://files.pythonhosted.org/packages/56/4d/4856e897df0d588789dd844dbed9d91782c4ef0b327f96ce53c807e13128/lxml-6.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f", size = 5257023, upload-time = "2025-09-22T04:03:30.056Z" },
1643
+ { url = "https://files.pythonhosted.org/packages/0f/85/86766dfebfa87bea0ab78e9ff7a4b4b45225df4b4d3b8cc3c03c5cd68464/lxml-6.0.2-cp314-cp314t-win32.whl", hash = "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312", size = 3911420, upload-time = "2025-09-22T04:03:32.198Z" },
1644
+ { url = "https://files.pythonhosted.org/packages/fe/1a/b248b355834c8e32614650b8008c69ffeb0ceb149c793961dd8c0b991bb3/lxml-6.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca", size = 4406837, upload-time = "2025-09-22T04:03:34.027Z" },
1645
+ { url = "https://files.pythonhosted.org/packages/92/aa/df863bcc39c5e0946263454aba394de8a9084dbaff8ad143846b0d844739/lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c", size = 3822205, upload-time = "2025-09-22T04:03:36.249Z" },
1646
+ ]
1647
+
1648
+ [package.optional-dependencies]
1649
+ html-clean = [
1650
+ { name = "lxml-html-clean" },
1651
+ ]
1652
+
1653
+ [[package]]
1654
+ name = "lxml-html-clean"
1655
+ version = "0.4.3"
1656
+ source = { registry = "https://pypi.org/simple" }
1657
+ dependencies = [
1658
+ { name = "lxml" },
1659
+ ]
1660
+ sdist = { url = "https://files.pythonhosted.org/packages/d9/cb/c9c5bb2a9c47292e236a808dd233a03531f53b626f36259dcd32b49c76da/lxml_html_clean-0.4.3.tar.gz", hash = "sha256:c9df91925b00f836c807beab127aac82575110eacff54d0a75187914f1bd9d8c", size = 21498, upload-time = "2025-10-02T20:49:24.895Z" }
1661
+ wheels = [
1662
+ { url = "https://files.pythonhosted.org/packages/10/4a/63a9540e3ca73709f4200564a737d63a4c8c9c4dd032bab8535f507c190a/lxml_html_clean-0.4.3-py3-none-any.whl", hash = "sha256:63fd7b0b9c3a2e4176611c2ca5d61c4c07ffca2de76c14059a81a2825833731e", size = 14177, upload-time = "2025-10-02T20:49:23.749Z" },
1663
+ ]
1664
+
1665
  [[package]]
1666
  name = "markdown-it-py"
1667
  version = "4.0.0"
 
3864
  { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
3865
  ]
3866
 
3867
+ [[package]]
3868
+ name = "tld"
3869
+ version = "0.13.1"
3870
+ source = { registry = "https://pypi.org/simple" }
3871
+ sdist = { url = "https://files.pythonhosted.org/packages/df/a1/5723b07a70c1841a80afc9ac572fdf53488306848d844cd70519391b0d26/tld-0.13.1.tar.gz", hash = "sha256:75ec00936cbcf564f67361c41713363440b6c4ef0f0c1592b5b0fbe72c17a350", size = 462000, upload-time = "2025-05-21T22:18:29.341Z" }
3872
+ wheels = [
3873
+ { url = "https://files.pythonhosted.org/packages/dc/70/b2f38360c3fc4bc9b5e8ef429e1fde63749144ac583c2dbdf7e21e27a9ad/tld-0.13.1-py2.py3-none-any.whl", hash = "sha256:a2d35109433ac83486ddf87e3c4539ab2c5c2478230e5d9c060a18af4b03aa7c", size = 274718, upload-time = "2025-05-21T22:18:25.811Z" },
3874
+ ]
3875
+
3876
  [[package]]
3877
  name = "tokenizers"
3878
  version = "0.22.1"
 
3962
  { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
3963
  ]
3964
 
3965
+ [[package]]
3966
+ name = "trafilatura"
3967
+ version = "2.0.0"
3968
+ source = { registry = "https://pypi.org/simple" }
3969
+ dependencies = [
3970
+ { name = "certifi" },
3971
+ { name = "charset-normalizer" },
3972
+ { name = "courlan" },
3973
+ { name = "htmldate" },
3974
+ { name = "justext" },
3975
+ { name = "lxml" },
3976
+ { name = "urllib3" },
3977
+ ]
3978
+ sdist = { url = "https://files.pythonhosted.org/packages/06/25/e3ebeefdebfdfae8c4a4396f5a6ea51fc6fa0831d63ce338e5090a8003dc/trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247", size = 253404, upload-time = "2024-12-03T15:23:24.16Z" }
3979
+ wheels = [
3980
+ { url = "https://files.pythonhosted.org/packages/8a/b6/097367f180b6383a3581ca1b86fcae284e52075fa941d1232df35293363c/trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d", size = 132557, upload-time = "2024-12-03T15:23:21.41Z" },
3981
+ ]
3982
+
3983
  [[package]]
3984
  name = "transformers"
3985
  version = "2.3.0"
 
4053
  { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
4054
  ]
4055
 
4056
+ [[package]]
4057
+ name = "tzlocal"
4058
+ version = "5.3.1"
4059
+ source = { registry = "https://pypi.org/simple" }
4060
+ dependencies = [
4061
+ { name = "tzdata", marker = "sys_platform == 'win32'" },
4062
+ ]
4063
+ sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload-time = "2025-03-05T21:17:41.549Z" }
4064
+ wheels = [
4065
+ { url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload-time = "2025-03-05T21:17:39.857Z" },
4066
+ ]
4067
+
4068
  [[package]]
4069
  name = "uc-micro-py"
4070
  version = "1.0.3"