File size: 2,868 Bytes
f866820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Hybrid retrieval combining semantic and keyword search.

Provides better recall by leveraging both:
- Semantic search: conceptual similarity
- Keyword search: exact term matching
"""

import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

from src.retrieval.retriever import query_pinecone
from src.retrieval.keyword_search import keyword_search, hybrid_score_chunks


@dataclass
class HybridSearchResult:
    """Outcome of a single ``hybrid_search`` call.

    Attributes:
        chunks: The final chunk dictionaries handed back to the caller.
        semantic_count: Number of chunks the semantic backend supplied
            before any merging or truncation.
        keyword_count: Number of chunks the keyword backend supplied
            before any merging or truncation.
        strategy: Label for the retrieval mode that produced ``chunks``;
            ``hybrid_search`` sets it to ``"hybrid"``, ``"semantic_only"``,
            ``"keyword_only"`` or ``"none"``.
    """

    # Field order is part of the positional constructor signature.
    chunks: List[Dict[str, Any]]
    semantic_count: int
    keyword_count: int
    strategy: str


def hybrid_search(
    query: str,
    top_k: int = 10,
    semantic_weight: float = 0.7,
    keyword_weight: float = 0.3,
    fetch_k: Optional[int] = None,
    chunks_path: str = "data/chunks.jsonl"
) -> HybridSearchResult:
    """
    Perform hybrid search combining semantic and keyword retrieval.

    Both backends are queried on a best-effort basis: if one of them raises,
    the failure is logged as a warning and the search continues with
    whatever the other backend returned.

    Args:
        query: Search query
        top_k: Final number of results to return
        semantic_weight: Weight for semantic search results (0-1); only
            used when both backends return results
        keyword_weight: Weight for keyword search results (0-1); only
            used when both backends return results
        fetch_k: Number to fetch from each source (default: 2x top_k)
        chunks_path: Path to chunks file for BM25

    Returns:
        HybridSearchResult with combined chunks and metadata. ``strategy``
        is "hybrid" when both backends produced results, "semantic_only" or
        "keyword_only" when just one did, and "none" when neither did.
    """
    logger = logging.getLogger(__name__)

    if fetch_k is None:
        fetch_k = top_k * 2

    semantic_chunks = []
    keyword_chunks = []

    # 1. Semantic search via Pinecone
    try:
        semantic_results = query_pinecone(query, top_k=fetch_k)
        # Ensure chunks have text field from metadata if not present.
        # The chunk dicts are updated in place.
        for chunk in semantic_results:
            if "text" not in chunk and "metadata" in chunk:
                metadata = chunk["metadata"]
                # A null metadata entry must not discard every other result.
                chunk["text"] = (
                    metadata.get("text", "") if metadata is not None else ""
                )
        semantic_chunks = semantic_results
    except Exception:
        # Deliberate best-effort: fall back to keyword results, but leave a
        # trace so an outage is distinguishable from "no matches".
        logger.warning(
            "Semantic search failed; continuing without semantic results",
            exc_info=True,
        )
        semantic_chunks = []

    # 2. Keyword search via BM25
    try:
        keyword_result = keyword_search(query, top_k=fetch_k, chunks_path=chunks_path)
        # Normalise a missing chunk list so len() below cannot fail.
        keyword_chunks = keyword_result.chunks or []
    except Exception:
        logger.warning(
            "Keyword search failed; continuing without keyword results",
            exc_info=True,
        )
        keyword_chunks = []

    # 3. Determine strategy based on what succeeded
    if semantic_chunks and keyword_chunks:
        strategy = "hybrid"
        combined = hybrid_score_chunks(
            semantic_chunks=semantic_chunks,
            keyword_chunks=keyword_chunks,
            semantic_weight=semantic_weight,
            keyword_weight=keyword_weight,
            top_k=top_k
        )
    elif semantic_chunks:
        strategy = "semantic_only"
        combined = semantic_chunks[:top_k]
    elif keyword_chunks:
        strategy = "keyword_only"
        combined = keyword_chunks[:top_k]
    else:
        strategy = "none"
        combined = []

    return HybridSearchResult(
        chunks=combined,
        semantic_count=len(semantic_chunks),
        keyword_count=len(keyword_chunks),
        strategy=strategy
    )