"""
Web Search Module for RAG-Anything using Tavily API

Provides intelligent web search capabilities to augment RAG with real-time information.

Features:
- Tavily API integration for high-quality search results
- Context-aware search query generation
- Result filtering and ranking
- Hybrid RAG + Web search mode

Author: RAG-Anything Team
Version: 1.0.0
"""

import os
import asyncio
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Tavily is an optional dependency: record whether it could be imported so
# that WebSearcher can fail with a clear message at construction time.
try:
    from tavily import TavilyClient, AsyncTavilyClient
    TAVILY_AVAILABLE = True
except ImportError:
    # Keep both names bound so that a later reference yields None instead of
    # an unrelated NameError when the package is missing.
    TavilyClient = None
    AsyncTavilyClient = None
    TAVILY_AVAILABLE = False
    logger.warning("Tavily not installed. Install with: pip install tavily-python")


class WebSearcher:
    """Web search integration using Tavily API.

    Holds an ``AsyncTavilyClient`` plus default search settings, and provides
    helpers that render a search result dictionary as plain text for RAG
    context or for an LLM prompt.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        max_results: int = 5,
        search_depth: str = "advanced",
        include_raw_content: bool = True,
    ):
        """
        Initialize web searcher

        Args:
            api_key: Tavily API key (from env if not provided)
            max_results: Maximum number of search results to return
            search_depth: "basic" or "advanced" (advanced is more thorough)
            include_raw_content: Whether to include full page content

        Raises:
            ImportError: If the tavily package could not be imported.
            ValueError: If no API key is passed and TAVILY_API_KEY is unset.
        """
        if not TAVILY_AVAILABLE:
            raise ImportError("Tavily is not installed. Install with: pip install tavily-python")

        # An explicit argument wins; otherwise fall back to the environment.
        self.api_key = api_key or os.getenv("TAVILY_API_KEY")
        if not self.api_key:
            raise ValueError("Tavily API key not found. Set TAVILY_API_KEY environment variable.")

        self.max_results = max_results
        self.search_depth = search_depth
        self.include_raw_content = include_raw_content

        self.client = AsyncTavilyClient(api_key=self.api_key)

        logger.info(f"WebSearcher initialized (max_results={max_results}, depth={search_depth})")

    async def search(
        self,
        query: str,
        max_results: Optional[int] = None,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        search_depth: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Perform web search

        Any exception raised while searching is logged and reported through
        the returned dictionary instead of being propagated.

        Args:
            query: Search query
            max_results: Override default max results
            include_domains: Only search these domains
            exclude_domains: Exclude these domains
            search_depth: Override default search depth

        Returns:
            Dictionary with keys "query", "results", "answer" and
            "search_metadata". On failure "results" is empty and an "error"
            key (also mirrored inside "search_metadata") holds the message.
        """
        # Resolved before the try block so the error path can report it too.
        effective_depth = search_depth or self.search_depth

        try:
            logger.info(f"Searching web: {query[:100]}...")

            search_params = {
                "query": query,
                "max_results": max_results or self.max_results,
                "search_depth": effective_depth,
                "include_raw_content": self.include_raw_content,
            }

            # Domain filters are only sent when the caller supplied them.
            if include_domains:
                search_params["include_domains"] = include_domains
            if exclude_domains:
                search_params["exclude_domains"] = exclude_domains

            response = await self.client.search(**search_params)

            # Treat a missing or null "results" entry as an empty result list.
            hits = response.get("results") or []
            results = {
                "query": query,
                "results": hits,
                "answer": response.get("answer", ""),
                "search_metadata": {
                    "total_results": len(hits),
                    "search_depth": effective_depth,
                    "timestamp": datetime.now().isoformat(),
                },
            }

            logger.info(f"Web search complete: {len(results['results'])} results found")
            return results

        except Exception as e:
            # Best-effort boundary: callers get an empty result set plus the
            # error text rather than an exception.
            logger.error(f"Web search error: {e}", exc_info=True)
            return {
                "query": query,
                "results": [],
                "answer": "",
                "error": str(e),
                "search_metadata": {
                    "total_results": 0,
                    "search_depth": effective_depth,
                    "error": str(e),
                    "timestamp": datetime.now().isoformat(),
                },
            }

    async def search_with_context(
        self,
        query: str,
        context: Optional[str] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Search with additional context to refine query

        The context, when given, is appended to the query separated by a
        single space; no other rewriting is done.

        Args:
            query: Base search query
            context: Additional context to help refine search
            **kwargs: Additional search parameters, forwarded to search()

        Returns:
            Search results dictionary
        """
        enhanced_query = f"{query} {context}" if context else query
        return await self.search(enhanced_query, **kwargs)

    def format_results_for_rag(self, search_results: Dict[str, Any]) -> str:
        """
        Format web search results for RAG context

        Each source is rendered with title, URL, a content snippet of at most
        500 characters and, when a score is present, its relevance. An
        ellipsis is appended only when the content was actually cut.

        Args:
            search_results: Results from search()

        Returns:
            Formatted string for RAG context
        """
        sources = search_results.get("results")
        if not sources:
            return "No web search results found."

        formatted = ["=== Web Search Results ===\n"]

        if search_results.get("answer"):
            formatted.append(f"Quick Answer: {search_results['answer']}\n")

        for idx, result in enumerate(sources, 1):
            formatted.append(f"\n[Source {idx}] {result.get('title', 'Untitled')}")
            formatted.append(f"URL: {result.get('url', 'N/A')}")

            # A missing, None or empty content field falls back to a
            # placeholder instead of failing on slicing.
            snippet = result.get("content") or "No content"
            if len(snippet) > 500:
                snippet = snippet[:500] + "..."
            formatted.append(f"Content: {snippet}")

            # Compare against None so that a score of 0 is still reported.
            score = result.get("score")
            if score is not None:
                formatted.append(f"Relevance: {score:.2f}")

        formatted.append(f"\n=== End of Web Results ({len(sources)} sources) ===")
        return "\n".join(formatted)

    def format_results_for_llm(self, search_results: Dict[str, Any]) -> str:
        """
        Format web search results optimally for LLM processing

        Produces a Markdown-style listing: an optional summary section
        followed by one entry per source.

        Args:
            search_results: Results from search()

        Returns:
            Structured string optimized for LLM comprehension
        """
        sources = search_results.get("results")
        if not sources:
            return "No relevant web search results were found for this query."

        formatted = []

        if search_results.get("answer"):
            formatted.append("### AI-Generated Summary:")
            formatted.append(search_results['answer'])
            formatted.append("")

        formatted.append("### Detailed Sources:")
        formatted.append("")

        for idx, result in enumerate(sources, 1):
            formatted.append(f"**Source {idx}: {result.get('title', 'Untitled')}**")
            formatted.append(f"- URL: {result.get('url', 'N/A')}")
            formatted.append(f"- Published: {result.get('published_date', 'Unknown date')}")

            # Prefer the raw page text (capped at 2000 characters) when it is
            # longer than the snippet; None values count as empty text.
            content = result.get('content') or ''
            raw_content = result.get('raw_content')
            if raw_content and len(raw_content) > len(content):
                content = raw_content[:2000]

            formatted.append(f"- Content: {content}")

            # Compare against None so that a score of 0 is still reported.
            score = result.get("score")
            if score is not None:
                formatted.append(f"- Relevance Score: {score:.2%}")

            formatted.append("")

        formatted.append(f"*Total sources: {len(sources)}*")
        return "\n".join(formatted)

    async def hybrid_search(
        self,
        query: str,
        rag_results: Optional[str] = None,
        combine_results: bool = True,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Hybrid search: combine RAG results with web search

        Args:
            query: Search query
            rag_results: Results from RAG system
            combine_results: Whether to combine RAG and web results
            **kwargs: Additional search parameters, forwarded to search()

        Returns:
            When combine_results is False, the dictionary returned by
            search(). Otherwise a dictionary with keys "query",
            "combined_context", "rag_results", "web_results" and "metadata".
        """
        web_results = await self.search(query, **kwargs)

        if not combine_results:
            return web_results

        combined_context = []

        # Knowledge-base text, when supplied, is placed ahead of the web block.
        if rag_results:
            combined_context.append("=== Knowledge Base Results ===")
            combined_context.append(rag_results)
            combined_context.append("")

        combined_context.append(self.format_results_for_rag(web_results))

        return {
            "query": query,
            "combined_context": "\n".join(combined_context),
            "rag_results": rag_results,
            "web_results": web_results,
            "metadata": {
                "has_rag_results": bool(rag_results),
                "has_web_results": len(web_results.get("results", [])) > 0,
                "timestamp": datetime.now().isoformat(),
            },
        }


def create_web_searcher(api_key: Optional[str] = None, **kwargs) -> WebSearcher:
    """
    Factory function to create a web searcher

    Args:
        api_key: Tavily API key
        **kwargs: Additional WebSearcher parameters, forwarded unchanged to
            the WebSearcher constructor

    Returns:
        Configured WebSearcher instance
    """
    return WebSearcher(api_key=api_key, **kwargs)