Subhadip007 committed on
Commit
f780124
·
1 Parent(s): daafb32

feat: retrieval optimization pipeline complete

Browse files

- BM25 sparse index: 15,664 documents, 39.3MB
- Hybrid retrieval: RRF fusion (dense 0.7 + sparse 0.3)
- Cross-encoder re-ranking: ms-marco-MiniLM-L-6-v2
- Diversity filter: max 2 chunks per paper
- Fixed Qdrant Range filter: publication_year as integer field
- CE score range: 4.3-8.3 (strong relevance signal)
- Query latency: 3-17s (first query loads models, subsequent ~4s)

.vscode/settings.json CHANGED
@@ -1,2 +1,5 @@
1
  {
 
 
 
2
  }
 
1
  {
2
+ "cSpell.words": [
3
+ "reranked"
4
+ ]
5
  }
src/retrieval/__init__.py ADDED
File without changes
src/retrieval/hybrid_retriever.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid retriever combining dense (Qdrant) and sparse (BM25) search.
3
+
4
+ RECIPROCAL RANK FUSION (RRF) EXPLAINED:
5
+
6
+ Instead of trying to normalize scores across two completely different
7
+ scoring systems (cosine similarity vs BM25 score), RRF uses RANKS.
8
+
9
+ For each result, we compute:
10
+ RRF_score = 1 / (k + rank_in_dense_results)
11
+ + 1 / (k + rank_in_bm25_results)
12
+
13
+ Where k=60 is a constant that dampens the impact of very high ranks.
14
+
15
+ Example:
16
+ Chunk A: rank 1 in dense, rank 3 in BM25
17
+ RRF = 1/(60+1) + 1/(60+3) = 0.0164 + 0.0159 = 0.0323
18
+
19
+ Chunk B: rank 2 in dense, not in BM25
20
+ RRF = 1/(60+2) + 0 = 0.0161
21
+
22
+ Chunk C: rank 5 in dense, rank 1 in BM25
23
+ RRF = 1/(60+5) + 1/(60+1) = 0.0154 + 0.0164 = 0.0317
24
+
25
+ Chunk A wins - it ranked highly in BOTH systems.
26
+ Chunk C is second - it was top in BM25 and decent in dense.
27
+
28
+ WHY RRF OVER SCORE NORMALIZATION:
29
+ BM25 scores range 0-15 typically.
30
+ Cosine similarity scores range 0-1.
31
+ Normalizing these to the same scale requires knowing
32
+ the distribution of each, which changes per query.
33
+ RRF sidesteps this entirely by using ranks.
34
+
35
+ This is why RRF is the industry standard for hybrid search.
36
+ """
37
+
38
+ from typing import Optional
39
+ import numpy as np
40
+
41
+
42
+ from src.vectorstore.qdrant_store import QdrantStore
43
+ from src.vectorstore.bm25_store import BM25Store
44
+ from src.embeddings.embedding_model import EmbeddingModel
45
+ from src.utils.logger import get_logger
46
+ from config.settings import TOP_K_RETRIEVAL
47
+
48
+
49
+ logger = get_logger(__name__)
50
+
51
+ # RRF constant - 60 is the standard value from the original paper
52
+ RRF_K = 60
53
+
54
+
55
+
56
class HybridRetriever:
    """
    Combines dense vector search and BM25 keyword search
    using Reciprocal Rank Fusion (RRF) for score merging.
    """

    def __init__(
        self,
        qdrant_store: QdrantStore,
        bm25_store: BM25Store,
        embedding_model: EmbeddingModel,
    ):
        self.qdrant = qdrant_store
        self.bm25 = bm25_store
        self.embedder = embedding_model

    def retrieve(
        self,
        query: str,
        top_k: int = TOP_K_RETRIEVAL,
        filter_category: Optional[str] = None,
        filter_year_gte: Optional[int] = None,
        dense_weight: float = 0.7,
        sparse_weight: float = 0.3,
    ) -> list[dict]:
        """
        Hybrid retrieval with RRF fusion.

        Args:
            query: User's raw query string
            top_k: Final number of results to return
            filter_category: ArXiv category filter (e.g. "cs.LG")
            filter_year_gte: Only papers from this year onwards
            dense_weight: Weight for dense retrieval in fusion (0-1)
            sparse_weight: Weight for BM25 retrieval in fusion (0-1)

        Returns:
            List of result dicts sorted by RRF score (best first).
            Each dict carries chunk_id, text, "rrf_score" and
            "retrieval": "hybrid".

        WHY dense_weight = 0.7, sparse_weight = 0.3:
            Research papers use technical language where semantic
            understanding (dense) matters more than exact keyword
            matching (sparse). For a code search system, you'd
            flip these weights.
        """

        # -------------- Step 1: Dense retrieval --------------
        query_vector = self.embedder.embed_query(query)
        dense_results = self.qdrant.search(
            query_vector=query_vector,
            top_k=top_k * 2,  # Retrieve more for fusion
            filter_category=filter_category,
            filter_year_gte=filter_year_gte,
        )

        # -------------- Step 2: Sparse (BM25) retrieval --------------
        sparse_results = self.bm25.search(query, top_k=top_k * 2)

        # -------------- Step 3: Build chunk_id -> full data lookup --------------
        # FIX: index BOTH result sets. The original only indexed dense
        # results, so a chunk surfaced exclusively by BM25 was emitted in
        # the final list with no chunk_id/text payload at all (empty dict
        # merge), which downstream re-ranking then scored as empty text.
        chunk_data: dict[str, dict] = {}
        for r in dense_results:
            if r["chunk_id"] not in chunk_data:
                chunk_data[r["chunk_id"]] = {
                    "chunk_id": r["chunk_id"],
                    "text": r["text"],
                    "score": 0.0,
                }
        for r in sparse_results:
            if r["chunk_id"] not in chunk_data:
                chunk_data[r["chunk_id"]] = {
                    "chunk_id": r["chunk_id"],
                    "text": r["text"],
                    "score": 0.0,
                }

        # -------------- Step 4: Compute weighted RRF score --------------
        # rank is 0-based from enumerate, hence the "+ 1" to get 1-based
        # ranks as in the RRF formula 1 / (k + rank).
        rrf_scores: dict[str, float] = {}

        for rank, result in enumerate(dense_results):
            cid = result["chunk_id"]
            rrf_scores[cid] = rrf_scores.get(cid, 0.0) + dense_weight * (1.0 / (RRF_K + rank + 1))

        for rank, result in enumerate(sparse_results):
            cid = result["chunk_id"]
            rrf_scores[cid] = rrf_scores.get(cid, 0.0) + sparse_weight * (1.0 / (RRF_K + rank + 1))

        # -------------- Step 5: Sort by RRF score, best first --------------
        sorted_ids = sorted(rrf_scores, key=rrf_scores.get, reverse=True)

        # -------------- Step 6: Build final results --------------
        final_results = []
        for cid in sorted_ids[:top_k]:
            data = chunk_data.get(cid, {})
            final_results.append(
                {
                    **data,
                    "rrf_score": round(rrf_scores[cid], 6),
                    "retrieval": "hybrid",
                }
            )

        logger.debug(
            f"Hybrid retrieval: {len(dense_results)} dense + "
            f"{len(sparse_results)} sparse -> {len(final_results)} merged"
        )

        return final_results
src/retrieval/reranker.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cross-encoder re-ranking for improved retrieval precision.
3
+
4
+ THE DIFFERENCE BETWEEN BI-ENCODER AND CROSS-ENCODER:
5
+
6
+ Bi-encoder (what BGE does):
7
+ embed(query) → vector_q
8
+ embed(chunk) → vector_c
9
+ score = cosine(vector_q, vector_c)
10
+
11
+ Query and chunk are embedded INDEPENDENTLY.
12
+ Fast (vectors pre-computed), but loses interaction signal.
13
+
14
+ Cross-encoder (what we use for re-ranking):
15
+ score = model(query + [SEP] + chunk)
16
+
17
+ Query and chunk are processed TOGETHER by the model.
18
+ The model can see how query tokens relate to chunk tokens.
19
+ Slower (cannot pre-compute), but much more accurate.
20
+
21
+ THE TWO-STAGE PATTERN:
22
+ Stage 1 (Retrieval): Bi-encoder -> top-20 candidates (fast, approximate)
23
+ Stage 2 (Re-ranking): Cross-encoder -> re-score top-20 (slow, accurate)
24
+
25
+ We only run the expensive cross-encoder on 20 candidates,
26
+ not all 15,664 chunks. This gives us accuracy without
27
+ paying the full cost for every chunk.
28
+
29
+ MODEL: cross-encoder/ms-marco-MiniLM-L-6-v2
30
+ - Trained on MS MARCO passage retrieval dataset (500K+ queries)
31
+ - MiniLM architecture: fast on CPU
32
+ - Output: relevance score (-inf to +inf, higher = more relevant)
33
+ - Size: ~80MB
34
+ """
35
+
36
+ import logging
37
+ logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
38
+
39
+ from sentence_transformers import CrossEncoder
40
+ from src.utils.logger import get_logger
41
+
42
+ logger = get_logger(__name__)
43
+
44
+ RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
45
+
46
+
47
class CrossEncoderReranker:
    """
    Re-ranks retrieved chunks using a cross-encoder model.

    The model is lazy-loaded on first use so constructing the reranker
    is cheap; the first rerank() call pays the model-load cost.
    """

    def __init__(self, model_name: str = RERANKER_MODEL):
        self._model = None
        self._model_name = model_name
        logger.info(f"CrossEncoderReranker initialized: {model_name}")

    @property
    def model(self) -> CrossEncoder:
        """Lazy-load cross-encoder model."""
        if self._model is None:
            logger.info(f"Loading cross-encoder: {self._model_name}")
            self._model = CrossEncoder(
                self._model_name,
                max_length = 512  # Max tokens for query+chunk combined
            )
            logger.info("Cross-encoder loaded")

        return self._model

    def rerank(
        self,
        query: str,
        results: list[dict],
        top_k: int = 5
    ) -> list[dict]:
        """
        Re-rank a list of retrieved chunks using cross-encoder scoring.

        Args:
            query: Original user query
            results: List of retrieved chunk dicts (from hybrid retriever).
                NOTE: each dict is mutated in place — a "ce_score" key
                is added.
            top_k: How many top results to return after re-ranking

        Returns:
            Top-k results sorted by cross-encoder relevance score
            (descending). Empty list if `results` is empty.
        """

        if not results:
            return []

        # Build (query, chunk_text) pairs for batch scoring
        pairs = [
            (query, r.get("text", ""))
            for r in results
        ]

        # Score all pairs in one batch.
        # predict() returns a numpy array of relevance scores.
        scores = self.model.predict(
            pairs,
            show_progress_bar = False,
            batch_size = 32,
        )

        # Attach cross-encoder score to each result
        for result, score in zip(results, scores):
            result["ce_score"] = round(float(score), 4)

        # Sort by cross-encoder score (descending)
        reranked = sorted(results, key = lambda x: x["ce_score"], reverse = True)

        # FIX: the original nested double quotes inside a double-quoted
        # f-string (reranked[-1]["ce_score"]), which is a SyntaxError on
        # every Python version before 3.12. Single quotes are portable.
        logger.debug(
            f"Re-ranked {len(results)} -> top-{top_k}. "
            f"Score range: [{reranked[-1]['ce_score']:.2f}, "
            f"{reranked[0]['ce_score']:.2f}]"
        )

        return reranked[:top_k]
137
+
138
+
139
+
140
def diversity_filter(results: list[dict], max_per_paper: int = 2) -> list[dict]:
    """
    Cap how many chunks any single paper may contribute to the results.

    Given a relevance-sorted result list, keep at most `max_per_paper`
    entries per paper, preserving the original ordering. Results without
    a "paper_id" key are grouped together under "unknown".

    Args:
        results: Result dicts, already sorted by relevance (best first)
        max_per_paper: Upper bound on chunks from one paper

    Returns:
        Filtered list in the same relative order.

    WHY THIS MATTERS FOR USER EXPERIENCE:
        Without the cap, a single highly relevant paper can fill the
        whole top-k. Spreading the slots across several papers gives
        the user multiple perspectives and broader coverage.
    """

    per_paper_counts: dict[str, int] = {}
    kept: list[dict] = []

    for entry in results:
        pid = entry.get("paper_id", "unknown")
        used = per_paper_counts.get(pid, 0)
        if used >= max_per_paper:
            continue
        per_paper_counts[pid] = used + 1
        kept.append(entry)

    return kept
src/retrieval/retrieval_pipeline.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Orchestrates the full retrieval pipeline:
3
+ 1. Hybrid retrieval (dense + BM25)
4
+ 2. Cross-encoder re-ranking
5
+ 3. Diversity filtering
6
+
7
+ This is the component that the RAG pipeline (Phase 9) will call.
8
+ It takes a query string and returns the best chunks.
9
+ """
10
+
11
+ from typing import Optional
12
+
13
+
14
+ from src.retrieval.hybrid_retriever import HybridRetriever
15
+ from src.retrieval.reranker import CrossEncoderReranker, diversity_filter
16
+ from src.vectorstore.qdrant_store import QdrantStore
17
+ from src.vectorstore.bm25_store import BM25Store
18
+ from src.embeddings.embedding_model import EmbeddingModel
19
+ from src.utils.logger import get_logger
20
+ from config.settings import TOP_K_RETRIEVAL, TOP_K_RERANK
21
+
22
+
23
+ logger = get_logger(__name__)
24
+
25
+
26
+
27
+
28
class RetrievalPipeline:
    """
    Full retrieval pipeline with hybrid search + re-ranking.

    Usage:
        pipeline = RetrievalPipeline()
        results = pipeline.retrieve("how does LoRA fine-tuning work?")
        for r in results:
            print(r["title"], r["ce_score"], r["text"][:100])
    """

    def __init__(self):
        """Wire up the dense store, sparse store, embedder and re-ranker."""
        logger.info("Initializing RetrievalPipeline...")

        dense_store = QdrantStore()
        embedder = EmbeddingModel()

        # BM25 index: load the persisted one, or build it on first run.
        sparse_store = BM25Store()
        if not sparse_store.load():
            logger.info("BM25 index not found - building now...")
            sparse_store.build_index()

        self.hybrid_retriever = HybridRetriever(
            qdrant_store=dense_store,
            bm25_store=sparse_store,
            embedding_model=embedder,
        )
        self.reranker = CrossEncoderReranker()

        logger.info("RetrievalPipeline ready")

    def retrieve(
        self,
        query: str,
        top_k_final: int = TOP_K_RERANK,
        filter_category: Optional[str] = None,
        filter_year_gte: Optional[int] = None,
    ) -> list[dict]:
        """
        Full retrieval: hybrid search → re-rank → diversity filter.

        Args:
            query: User's natural language question
            top_k_final: Number of chunks to return
            filter_category: ArXiv category filter
            filter_year_gte: Year filter

        Returns:
            List of top chunks with all metadata and scores
        """
        logger.debug(f"Retrieving for query: '{query[:60]}'")

        # Stage 1: wide hybrid candidate pool (dense + BM25, RRF-fused)
        hybrid_hits = self.hybrid_retriever.retrieve(
            query=query,
            top_k=TOP_K_RETRIEVAL * 2,
            filter_category=filter_category,
            filter_year_gte=filter_year_gte,
        )

        if not hybrid_hits:
            logger.warning(f"No candidates found for query: {query}")
            return []

        # Stage 2: cross-encoder re-scoring. Keep twice the final count
        # so the diversity filter still has enough to choose from.
        scored = self.reranker.rerank(
            query=query,
            results=hybrid_hits,
            top_k=top_k_final * 2,
        )

        # Stage 3: spread slots across papers (max 2 chunks each),
        # then cut down to the requested size.
        deduped = diversity_filter(scored, max_per_paper=2)
        top_chunks = deduped[:top_k_final]

        logger.debug(
            f"Pipeline: {len(hybrid_hits)} candidates -> "
            f"{len(scored)} reranked -> "
            f"{len(top_chunks)} final"
        )

        return top_chunks
src/vectorstore/bm25_store.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BM25 sparse retrieval index for keyword-based search.
3
+
4
+ BM25 (Best Match 25) is the gold standard keyword search algorithm.
5
+ It powers Elasticsearch, Solr, and was the backbone of Google Search
6
+ before neural methods. It rewards:
7
+ - Term frequency: how often the query word appears in the chunk
8
+ - Inverse document frequency: rare words are more discriminative
9
+ - Document length normalization: prevents long chunks from dominating
10
+
11
+ WHY WE NEED THIS ALONGSIDE VECTOR SEARCH:
12
+ Query: "what is LoRA fine-tuning?"
13
+
14
+ Vector search: finds chunks about "parameter-efficient training"
15
+ (semantically related but may miss the exact acronym)
16
+
17
+ BM25: finds chunks containing the EXACT token "LoRA"
18
+ (exact match, regardless of semantic similarity)
19
+
20
+ Hybrid: finds chunks that are BOTH semantically relevant
21
+ AND contain the keyword - best of both worlds.
22
+ """
23
+
24
+ from copyreg import pickle
25
+ import json
26
+ import pickle
27
+ import re
28
+ from pathlib import Path
29
+
30
+ import numpy as np
31
+ from rank_bm25 import BM25Okapi
32
+
33
+ from src.utils.logger import get_logger
34
+ from config.settings import CHUNKS_DIR, EMBEDDINGS_DIR
35
+
36
+ logger = get_logger(__name__)
37
+
38
+
39
+ # Where we persist the BM25 index
40
+ BM25_INDEX_PATH = EMBEDDINGS_DIR / "bm25_index.pkl"
41
+
42
+
43
+
44
def tokenize(text: str) -> list[str]:
    """
    Lowercase `text` and split it into BM25 tokens.

    A token is a run of letters/digits, optionally joined by hyphens,
    so "fine-tuning" survives as one token while punctuation is dropped.

    WHY NOT USE NLTK/SPACY:
        For BM25 in a RAG pipeline, simple regex tokenization is
        sufficient and avoids heavy dependencies; the retrieval-quality
        difference is minimal.
    """
    return re.findall(r'[a-z0-9]+(?:-[a-z0-9]+)*', text.lower())
61
+
62
+
63
+
64
class BM25Store:
    """
    Manages a BM25 index over all chunk texts.

    The index is built once and persisted to disk as a pickle file.
    Loading from pickle is near-instant vs rebuilding from scratch.
    """

    def __init__(self):
        # FIX: the original annotation was `BM25Okapi` while assigning
        # None (wrong, and evaluated at runtime). The quoted optional
        # annotation is accurate and lazily evaluated.
        self.bm25: "BM25Okapi | None" = None
        self.chunk_ids: list[str] = []
        self.texts: list[str] = []

    def build_index(self) -> None:
        """
        Build the BM25 index from all chunk files.

        Loads every *_semantic.json file under CHUNKS_DIR, tokenizes
        the chunk texts, builds the BM25 index and persists it to disk.

        Raises:
            RuntimeError: if no chunk files/texts are found — BM25Okapi
                on an empty corpus fails with a confusing internal error.
        """
        logger.info("Building BM25 index from chunk files...")

        chunk_ids: list[str] = []
        texts: list[str] = []

        for cf in CHUNKS_DIR.glob("*_semantic.json"):
            with open(cf, "r", encoding='utf-8') as f:
                chunks = json.load(f)

            for chunk in chunks:
                chunk_ids.append(chunk["chunk_id"])
                texts.append(chunk["text"])

        # FIX: fail loudly on an empty corpus instead of letting
        # BM25Okapi crash with an opaque error.
        if not texts:
            raise RuntimeError(
                f"No chunks found in {CHUNKS_DIR} - cannot build BM25 index"
            )

        logger.info(f"Tokenizing {len(texts):,} chunks...")

        # BM25Okapi expects a list of token lists, one per document
        tokenized_corpus = [tokenize(text) for text in texts]

        # Build the standard Okapi BM25 variant over the corpus
        self.bm25 = BM25Okapi(tokenized_corpus)
        self.chunk_ids = chunk_ids
        self.texts = texts

        logger.info(f"BM25 index built: {len(chunk_ids):,} documents")

        # Persist to disk
        self._save()

    def _save(self) -> None:
        """Save index to disk using pickle."""
        data = {
            "bm25": self.bm25,
            "chunk_ids": self.chunk_ids,
            "texts": self.texts,
        }

        with open(BM25_INDEX_PATH, "wb") as f:
            pickle.dump(data, f)
        size_mb = BM25_INDEX_PATH.stat().st_size / 1024 / 1024
        logger.info(f"BM25 index saved: {BM25_INDEX_PATH} ({size_mb:.1f} MB)")

    def load(self) -> bool:
        """
        Load the index from disk.

        Returns:
            True if loaded, False if no index file exists.

        NOTE: pickle.load is only safe here because the file is written
        by our own _save(); never unpickle data from an untrusted source.
        """
        if not BM25_INDEX_PATH.exists():
            logger.info("No BM25 index found on disk")
            return False

        logger.info("Loading BM25 index from disk...")
        with open(BM25_INDEX_PATH, "rb") as f:
            data = pickle.load(f)

        self.bm25 = data["bm25"]
        self.chunk_ids = data["chunk_ids"]
        self.texts = data["texts"]

        logger.info(f"BM25 index loaded: {len(self.chunk_ids):,} documents")
        return True

    def search(self, query: str, top_k: int = 20) -> list[dict]:
        """
        Search the BM25 index with a text query.

        Args:
            query: Raw query string (NOT embedded - BM25 uses tokens)
            top_k: Number of top results to return

        Returns:
            List of dicts with chunk_id, bm25_score, text — zero-score
            documents (no keyword overlap) are dropped, so fewer than
            top_k entries may be returned.

        Raises:
            RuntimeError: if neither build_index() nor load() has run.
        """
        if self.bm25 is None:
            raise RuntimeError("BM25 index not loaded. Call build_index() or load() first.")

        query_tokens = tokenize(query)

        if not query_tokens:
            return []

        # get_scores returns an array of shape (n_documents,)
        # with the BM25 score for each document
        scores = self.bm25.get_scores(query_tokens)

        # Indices of the top-k scores (argsort ascending, take last k, reverse)
        top_indices = np.argsort(scores)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            score = float(scores[idx])
            if score <= 0:
                # Skip zero-score results - no keyword overlap at all
                continue
            results.append(
                {
                    "chunk_id": self.chunk_ids[idx],
                    "bm25_score": round(score, 4),
                    "text": self.texts[idx],
                }
            )

        return results
src/vectorstore/qdrant_store.py CHANGED
@@ -175,7 +175,8 @@ class QdrantStore:
175
  payload = {
176
  # Store ALL metadata in payload for retrieval
177
  **metadata[i],
178
- "text": texts[i], # Inlcude chunk text
 
179
  }
180
  )
181
  points.append(point)
@@ -286,13 +287,12 @@ class QdrantStore:
286
  )
287
 
288
  if year_gte:
289
- # published_date is stored as "YYYY-MM-DD" string
290
- # We filter by string comparison: "2024-01-01" <= date
291
- # This works because ISO date strings sort lexicographically
292
  conditions.append(
293
  FieldCondition(
294
- key = "published_date",
295
- range = Range(gte = f"{year_gte}-01-01")
296
  )
297
  )
298
 
 
175
  payload = {
176
  # Store ALL metadata in payload for retrieval
177
  **metadata[i],
178
+ "text": texts[i], # Include chunk text
179
+ "publication_year": int(metadata[i].get("published_date", "0000")[:4]),
180
  }
181
  )
182
  points.append(point)
 
287
  )
288
 
289
  if year_gte:
290
+ # publication_year is stored as an integer (e.g. 2026)
291
+ # Range(gte=year_gte) filters to papers from that year onwards
 
292
  conditions.append(
293
  FieldCondition(
294
+ key = "publication_year",
295
+ range = Range(gte = year_gte)
296
  )
297
  )
298
 
test_retrieval.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test the full retrieval pipeline: hybrid search + re-ranking + diversity.
3
+ Compare it against pure dense search to show the improvement.
4
+ """
5
+
6
+ import time
7
+ from src.utils.logger import setup_logger, get_logger
8
+ from src.retrieval.retrieval_pipeline import RetrievalPipeline
9
+ from src.vectorstore.qdrant_store import QdrantStore
10
+ from src.embeddings.embedding_model import EmbeddingModel
11
+
12
+ setup_logger()
13
+ logger = get_logger(__name__)
14
+
15
+
16
+ def test_pipeline(pipeline: RetrievalPipeline, query: str):
17
+ print(f"\n{'='*60}")
18
+ print(f"QUERY: {query}")
19
+ print(f"{'='*60}")
20
+
21
+ start = time.time()
22
+ results = pipeline.retrieve(query, top_k_final=5)
23
+ elapsed = time.time() - start
24
+
25
+ print(f"Retrieved {len(results)} results in {elapsed:.2f}s\n")
26
+
27
+ for i, r in enumerate(results):
28
+ print(f"[{i+1}] CE Score: {r.get('ce_score', 'N/A'):>7} | "
29
+ f"RRF: {r.get('rrf_score', 'N/A'):.4f}")
30
+ print(f" {r.get('title','')[:65]}...")
31
+ print(f" {r.get('text','')[:120].replace(chr(10),' ')}...")
32
+ print()
33
+
34
+
35
def main():
    """Exercise the retrieval pipeline on a few representative queries."""
    logger.info("Initializing full retrieval pipeline...")
    pipeline = RetrievalPipeline()

    # Unfiltered queries: conceptual, keyword-heavy (BM25 keyword
    # advantage), and comparison-style.
    for query in (
        "how does self-attention mechanism work in transformers",
        "LoRA low-rank adaptation fine-tuning",
        "reinforcement learning reward shaping techniques",
    ):
        test_pipeline(pipeline, query)

    # Final check: retrieval with a publication-year filter applied.
    print(f"\n{'='*60}")
    print("FILTERED: 'graph neural networks' (2026 only)")
    print(f"{'='*60}")

    filtered_hits = pipeline.retrieve(
        "graph neural networks",
        filter_year_gte=2026,
        top_k_final=3,
    )

    for idx, hit in enumerate(filtered_hits):
        print(
            f"[{idx+1}] {hit.get('published_date', 'N/A')} | "
            f"CE: {hit.get('ce_score','N/A'):>6} | "
            f"{hit.get('title','')[:55]}..."
        )

    logger.info("\n✅ Retrieval pipeline test complete")
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()