Spaces:

dtufail
/

nuremberg-scholar

Sleeping

App Files Files Community

dtufail committed on Mar 11

Commit

8168ff0

verified ·

1 Parent(s): 9f50407

Upload retriever.py with huggingface_hub

Browse files

Files changed (1) hide show

retriever.py +647 -0

retriever.py ADDED Viewed

	@@ -0,0 +1,647 @@

+"""
+retriever.py  —  Nuremberg Scholar Hybrid Retriever (HuggingFace Spaces / ZeroGPU)
+====================================================================================
+Changes from local/SageMaker version:
+  - index_dir parameter         : Retriever accepts an explicit path instead of
+                                  hardcoded Path("output/index"). On Spaces this
+                                  comes from snapshot_download(); locally it falls
+                                  back to the default ./output/index/.
+  - CPU-first model loading     : QueryEncoder and Reranker load to CPU at init.
+                                  rag.py moves them to CUDA inside the @spaces.GPU
+                                  window and back to CPU after. rag.py also updates
+                                  the .device attribute on QueryEncoder and Reranker,
+                                  but encode()/rerank() resolve the device from the
+                                  model's parameters on every call, so they run on
+                                  whichever device the model currently sits on.
+  - dtype= replaces torch_dtype : fixes the transformers deprecation warning.
+  - CLI smoke test preserved    : `python retriever.py --query "..." ` still works
+                                  for local testing; it auto-detects CUDA availability.
+Pipeline (paper-backed):
+  1. Query encoding  : BGE-M3 dense (1024d) + sparse (lexical weights)
+  2. Dense retrieval : FAISS FlatIP top-N  (cosine via L2-norm + inner product)
+  3. Sparse retrieval: dot-product over CSR sparse matrix, top-N
+  4. RRF fusion      : k=60, merge dense+sparse ranked lists -> top-K candidates
+  5. Reranking       : bge-reranker-v2-m3 cross-encoder -> sigmoid scores -> top-K_final
+  6. Return          : list of ranked Result objects with metadata + scores
+Design decisions from literature:
+  - RRF k=60: industry standard, robust across domains (Cormack et al. 2009)
+  - Dense N=100, Sparse N=100 -> RRF top-25 -> rerank to top-5
+    (two-stage funnel: high recall first, high precision second)
+  - BGE-M3 paper recommends dense+sparse hybrid for long-document corpus;
+    sparse alone outperforms dense by ~10 NDCG points on long docs (MLDR)
+  - bge-reranker-v2-m3 is the official reranker pairing for bge-m3 embeddings
+  - Scores sigmoid-mapped to [0,1] for interpretability at generation time
+  - No query instruction prefix needed for BGE-M3 (unlike BGE v1.5)
+Usage:
+    from retriever import Retriever
+    r = Retriever()
+    results = r.retrieve("What did Goring say about the Luftwaffe?", top_k=5)
+    for res in results:
+        print(res)
+    # CLI smoke test
+    python retriever.py --query "crimes against humanity Article 6c"
+    python retriever.py --query "Ohlendorf Einsatzgruppen" --top-k 3 --no-rerank
+    python retriever.py --query "London Agreement 1945" --dense-only
+"""
+import json
+import time
+import argparse
+from pathlib import Path
+from dataclasses import dataclass, field
+from typing import Optional
+# ── Defaults ──────────────────────────────────────────────────────────────────
+DEFAULT_INDEX_DIR = Path("output/index")
+EMBED_MODEL   = "BAAI/bge-m3"
+RERANK_MODEL  = "BAAI/bge-reranker-v2-m3"
+EMBED_DIM     = 1024
+RRF_K         = 60       # Cormack et al. 2009 — robust standard
+DENSE_N       = 100      # candidates from dense retrieval
+SPARSE_N      = 100      # candidates from sparse retrieval
+RERANK_INPUT  = 25       # max chunks sent to reranker (post-RRF)
+DEFAULT_TOP_K = 5        # final chunks returned to generator
+MAX_Q_TOKENS  = 512      # query max tokens (queries are short)
+# ── Result dataclass ──────────────────────────────────────────────────────────
@dataclass
class Result:
    """
    One retrieved chunk: its metadata plus the scores gathered along the pipeline.

    The metadata fields are copied from the chunk's metadata record; the score
    fields are filled in by the retrieval stages and keep their defaults when a
    stage did not run.
    """
    chunk_id:     str
    body:         str
    collection:   str
    date_iso:     Optional[str]
    speaker:      Optional[str]
    source_url:   Optional[str]
    page_number:  Optional[int]
    slug:         Optional[str]
    # Scores
    dense_rank:   Optional[int]   = None   # 1-based rank in the dense list, None if absent
    sparse_rank:  Optional[int]   = None   # 1-based rank in the sparse list, None if absent
    rrf_score:    float           = 0.0
    rerank_score: Optional[float] = None   # sigmoid [0,1], None if bypassed

    def __str__(self) -> str:
        """Return a short multi-line summary with a preview of at most 200 body characters."""
        rerank = f"  rerank={self.rerank_score:.4f}" if self.rerank_score is not None else ""
        # Only a missing page is unknown; page 0 is a real value and must be shown.
        page = "?" if self.page_number is None else self.page_number
        preview = self.body[:200]
        # Mark the preview as cut only when text was actually dropped.
        if len(self.body) > 200:
            preview += "..."
        return (
            f"[{self.collection}] {self.date_iso or '?'}  {self.slug or ''}\n"
            f"  speaker={self.speaker or '-'}  page={page}\n"
            f"  rrf={self.rrf_score:.5f}{rerank}\n"
            f"  {preview}"
        )
+# ── BGE-M3 query encoder ──────────────────────────────────────────────────────
+UNUSED_TOKENS = [0, 1, 2]   # <s>, <pad>, </s>
class QueryEncoder:
    """
    Encodes a query into:
      dense_vec      : np.ndarray (1024,)  L2-normalised
      sparse_weights : dict {token_str: score}
    sparse_linear is the BGE-M3 sparse head (expected Linear(1024, 1)) — a
    scalar weight per token position. The weights are scattered onto the
    input_ids vocab positions via scatter_reduce("amax").
    ZeroGPU note:
      Loads to the requested device at init (CPU on Spaces). The caller may
      later move self.model to CUDA inside the @spaces.GPU window. encode()
      does NOT rely on self.device: it resolves the device from the model's
      parameters on every call and re-aligns sparse_linear (device and dtype)
      to the model, so it runs wherever the model currently sits.
    """
    def __init__(self, model_name: str, device: str = "cpu"):
        """
        Load tokenizer, backbone and sparse head for `model_name`.

        Downloads `sparse_linear.pt` from the model repo via hf_hub_download.
        Weights are fp16 on any non-CPU device and fp32 on CPU.
        """
        import torch
        import torch.nn as nn
        from transformers import AutoTokenizer, AutoModel
        from huggingface_hub import hf_hub_download
        self.device     = torch.device(device)
        self.torch      = torch
        # Decide on the parsed device type so spellings such as "cpu:0" still count as CPU.
        self.fp16       = self.device.type != "cpu"
        self.tokenizer  = AutoTokenizer.from_pretrained(model_name)
        self.vocab_size = self.tokenizer.vocab_size   # 250002
        # dtype= replaces deprecated torch_dtype=
        self.model = AutoModel.from_pretrained(
            model_name,
            dtype=torch.float16 if self.fp16 else torch.float32,
        )
        self.model.to(self.device)
        self.model.eval()
        sparse_path        = hf_hub_download(repo_id=model_name, filename="sparse_linear.pt")
        raw                = torch.load(sparse_path, map_location="cpu", weights_only=True)
        # Layer dimensions are read from the checkpoint rather than hard-coded.
        in_f, out_f        = raw["weight"].shape[1], raw["weight"].shape[0]
        self.sparse_linear = nn.Linear(in_f, out_f, bias=True)
        self.sparse_linear.load_state_dict(raw, strict=True)
        if self.fp16:
            self.sparse_linear = self.sparse_linear.half()
        self.sparse_linear.to(self.device)
        self.sparse_linear.eval()

    def encode(self, query: str) -> dict:
        """
        Encode a query string.

        Returns {"dense_vec": float32 np.ndarray, "sparse_weights": {token: weight}}.
        All tensors are placed on the device the model parameters currently
        live on, so this works on both CPU and CUDA regardless of self.device.
        """
        import torch
        import torch.nn.functional as F
        # Resolve the current device from the model parameters — this handles
        # the case where the caller moved self.model but did not update self.device.
        device = next(self.model.parameters()).device
        enc = self.tokenizer(
            [query],
            padding=True,
            truncation=True,
            max_length=MAX_Q_TOKENS,
            return_tensors="pt",
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out         = self.model(**enc, return_dict=True)
            last_hidden = out.last_hidden_state
            # Dense vector = L2-normalised hidden state of the first token.
            dense    = F.normalize(last_hidden[:, 0, :].float(), p=2, dim=-1)
            dense_np = dense.cpu().numpy().astype("float32")[0]
            # The sparse head may lag behind the backbone if only self.model
            # was moved or cast by the caller. Align device AND dtype, otherwise
            # the linear layer raises on mismatched inputs.
            head_param = next(self.sparse_linear.parameters())
            if head_param.device != device or head_param.dtype != last_hidden.dtype:
                self.sparse_linear.to(device=device, dtype=last_hidden.dtype)
            token_weights = torch.relu(
                self.sparse_linear(last_hidden)
            ).squeeze(-1).float()
        sparse_emb = torch.zeros(
            1, self.vocab_size, dtype=torch.float32, device=device
        )
        # Keep the maximum weight for each vocab id that occurs in the query.
        sparse_emb = sparse_emb.scatter_reduce(
            dim=1,
            index=enc["input_ids"],
            src=token_weights,
            reduce="amax",
            include_self=False,
        )
        # Special tokens carry no lexical signal.
        for uid in UNUSED_TOKENS:
            if uid < self.vocab_size:
                sparse_emb[0, uid] = 0.0
        nonzero = sparse_emb[0].nonzero(as_tuple=True)[0].tolist()
        scores  = sparse_emb[0][nonzero].cpu().tolist()
        sparse  = {}
        for tid, score in zip(nonzero, scores):
            if score <= 0:
                continue
            # Keys are decoded token strings; ids that decode to the same
            # stripped string overwrite each other (last id wins).
            tok = self.tokenizer.decode([tid]).strip()
            if tok:
                sparse[tok] = round(float(score), 4)
        return {"dense_vec": dense_np, "sparse_weights": sparse}
+# ── Reranker ──────────────────────────────────────────────────────────────────
class Reranker:
    """
    Cross-encoder reranker (bge-reranker-v2-m3).

    Raw logits are passed through a sigmoid so every score lies in [0,1],
    per the HF model card recommendation.
    ZeroGPU note:
      Follows the same CPU-first pattern as QueryEncoder: the caller may move
      self.model to CUDA later, and rerank() looks up the device from the
      model parameters on each call.
    """
    def __init__(self, model_name: str, device: str = "cpu"):
        """Load the tokenizer and sequence-classification model onto `device`."""
        import torch
        from transformers import AutoTokenizer, AutoModelForSequenceClassification

        self.device = torch.device(device)
        self.torch = torch
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # fp32 on CPU, fp16 elsewhere; dtype= replaces deprecated torch_dtype=
        weights_dtype = torch.float32 if device == "cpu" else torch.float16
        cross_encoder = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            dtype=weights_dtype,
        )
        cross_encoder.to(self.device)
        cross_encoder.eval()
        self.model = cross_encoder

    def rerank(self, query: str, candidates: list[Result],
               batch_size: int = 32) -> list[Result]:
        """
        Score every candidate against `query` and return them best-first.

        Each candidate's rerank_score is set in place (rounded to 6 decimals);
        the returned list is a new list sorted by that score, descending.
        """
        import torch

        # Follow the model to whichever device it currently lives on.
        target = next(self.model.parameters()).device
        text_pairs = [[query, cand.body] for cand in candidates]
        probabilities = []
        for start in range(0, len(text_pairs), batch_size):
            window = text_pairs[start:start + batch_size]
            encoded = self.tokenizer(
                window,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            model_inputs = {name: tensor.to(target) for name, tensor in encoded.items()}
            with torch.no_grad():
                flat_logits = self.model(**model_inputs, return_dict=True).logits.view(-1).float()
            probabilities += torch.sigmoid(flat_logits).cpu().tolist()

        for cand, prob in zip(candidates, probabilities):
            cand.rerank_score = round(prob, 6)

        ordered = list(candidates)
        ordered.sort(key=lambda cand: cand.rerank_score, reverse=True)
        return ordered
+# ── Sparse index ──────────────────────────────────────────────────────────────
class SparseIndex:
    """
    CSR sparse matrix index over sparse.jsonl.
    Layout: matrix shape (num_tokens, num_chunks), float32.
      rows    = tokens  (indexed via token_to_row dict)
      columns = chunks  (position among the non-blank records of the file,
                         i.e. the same position as in self.chunk_ids)
      values  = BGE-M3 sparse weights
    Query:
      1. Build a 1-row CSR query vector from query token weights.
      2. query_vec @ matrix  ->  dense (num_chunks,) score array.
      3. np.argpartition for top-n, argsort only the top slice.
    Why CSR vs dict-of-lists (figures reported by the original author):
      - RAM  : 54 MB vs 608 MB  (-554 MB measured on this corpus)
      - Query: single scipy sparse matmul vs Python loop over posting lists
      - Load : ~4s vs ~24s
    query() signature is identical to the old implementation.
    """
    def __init__(self, sparse_path: Path):
        """
        Build the index from a JSONL file.

        Each non-blank line must be a JSON object with a "chunk_id" key and an
        optional "weights" mapping of token -> weight.
        """
        import numpy as np
        from scipy.sparse import csr_matrix
        print(f"  Loading sparse index from {sparse_path}...")
        t0 = time.time()
        token_to_row: dict[str, int] = {}
        chunk_ids:    list[str]      = []
        rows:         list[int]      = []
        cols:         list[int]      = []
        data:         list[float]    = []
        with sparse_path.open(encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                # Column = position among accepted records, NOT the physical
                # line number: blank lines are skipped, and counting them would
                # shift every later chunk out of step with chunk_ids (and can
                # push a column past the matrix width).
                chunk_idx = len(chunk_ids)
                chunk_ids.append(obj["chunk_id"])
                for token, weight in obj.get("weights", {}).items():
                    # First sighting of a token allocates the next free row.
                    rows.append(token_to_row.setdefault(token, len(token_to_row)))
                    cols.append(chunk_idx)
                    data.append(weight)
        num_tokens = len(token_to_row)
        num_chunks = len(chunk_ids)
        self.matrix = csr_matrix(
            (
                np.array(data,  dtype=np.float32),
                (np.array(rows, dtype=np.int32),
                 np.array(cols, dtype=np.int32)),
            ),
            shape=(num_tokens, num_chunks),
        )
        self.token_to_row = token_to_row
        self.chunk_ids    = chunk_ids
        elapsed = time.time() - t0
        ram_mb  = (self.matrix.data.nbytes
                   + self.matrix.indices.nbytes
                   + self.matrix.indptr.nbytes) / 1024**2
        print(f"  Sparse index: {num_chunks:,} chunks, "
              f"{num_tokens:,} unique tokens, "
              f"{self.matrix.nnz:,} nnz  "
              f"({elapsed:.1f}s, {ram_mb:.1f} MB CSR)")

    def query(self, sparse_weights: dict[str, float],
              top_n: int) -> list[tuple[int, float]]:
        """
        Returns list of (chunk_idx, score) sorted descending, length <= top_n.

        Only chunks with a strictly positive score are returned. Query tokens
        unknown to the index are ignored; an empty query, a query with no
        known tokens, or top_n <= 0 yields [].
        Identical signature to the old dict-of-lists implementation.
        """
        import numpy as np
        from scipy.sparse import csr_matrix
        # top_n == 0 must short-circuit: a [-0:] slice would select everything.
        if not sparse_weights or top_n <= 0:
            return []
        q_cols, q_data = [], []
        for token, weight in sparse_weights.items():
            row = self.token_to_row.get(token)
            if row is not None:
                q_cols.append(row)
                q_data.append(weight)
        if not q_data:
            return []
        num_tokens = self.matrix.shape[0]
        q_vec = csr_matrix(
            (np.array(q_data, dtype=np.float32),
             (np.zeros(len(q_data), dtype=np.int32),
              np.array(q_cols, dtype=np.int32))),
            shape=(1, num_tokens),
        )
        # (1, num_tokens) @ (num_tokens, num_chunks) -> (1, num_chunks)
        # toarray() yields a plain ndarray, which is then flattened.
        scores = (q_vec @ self.matrix).toarray().ravel()  # (num_chunks,)
        if top_n >= len(scores):
            top_indices = np.argsort(scores)[::-1]
        else:
            # Partial selection first, then order only the selected slice.
            top_indices = np.argpartition(scores, -top_n)[-top_n:]
            top_indices = top_indices[np.argsort(scores[top_indices])[::-1]]
        return [
            (int(idx), float(scores[idx]))
            for idx in top_indices
            if float(scores[idx]) > 0
        ]
+# ── RRF fusion ────────────────────────────────────────────────────────────────
def reciprocal_rank_fusion(
    dense_ranked:  list[tuple[int, float]],
    sparse_ranked: list[tuple[int, float]],
    k: int = RRF_K,
) -> list[tuple[int, float]]:
    """
    Merge two ranked lists with Reciprocal Rank Fusion.

    Every list a chunk appears in contributes 1 / (k + rank), with ranks
    counted from 1; the retrieval scores in the input tuples are ignored.
    Returns (chunk_idx, rrf_score) pairs, highest fused score first.
    """
    fused_scores: dict[int, float] = {}
    # Dense contributions are accumulated before sparse ones.
    for ranking in (dense_ranked, sparse_ranked):
        position = 0
        for chunk_idx, _score in ranking:
            position += 1
            contribution = 1.0 / (k + position)
            if chunk_idx in fused_scores:
                fused_scores[chunk_idx] += contribution
            else:
                fused_scores[chunk_idx] = 0.0 + contribution
    ordered = list(fused_scores.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered
+# ── Main Retriever ────────────────────────────────────────────────────────────
class Retriever:
    """
    Full hybrid retrieval pipeline.
    Parameters
    ----------
    index_dir     : Path to directory containing dense.faiss, metadata.jsonl,
                    sparse.jsonl. Defaults to ./output/index/ for local dev.
                    On Spaces, rag.py passes the snapshot_download() cache path.
    device        : "cuda" / "cpu". On Spaces this is "cpu" at init time;
                    rag.py moves models to CUDA inside the @spaces.GPU window.
    dense_n       : candidates from FAISS  (default 100)
    sparse_n      : candidates from sparse index (default 100)
    rerank_input  : max chunks sent to reranker (default 25)
    top_k         : final results returned (default 5)
    use_reranker  : bool (default True)
    dense_only    : skip sparse + RRF, just return FAISS top-k (baseline mode)
    """
    def __init__(
        self,
        index_dir:    Optional[str] = None,
        device:       str  = "cpu",
        dense_n:      int  = DENSE_N,
        sparse_n:     int  = SPARSE_N,
        rerank_input: int  = RERANK_INPUT,
        top_k:        int  = DEFAULT_TOP_K,
        use_reranker: bool = True,
        dense_only:   bool = False,
    ):
        """
        Load the FAISS index, metadata, sparse index and models.

        Raises FileNotFoundError if a required index file is missing
        (sparse.jsonl is only required when dense_only is False).
        """
        import faiss
        # Resolve index directory
        idx_dir = Path(index_dir) if index_dir else DEFAULT_INDEX_DIR
        dense_file  = idx_dir / "dense.faiss"
        sparse_file = idx_dir / "sparse.jsonl"
        meta_file   = idx_dir / "metadata.jsonl"
        self.device       = device
        self.dense_n      = dense_n
        self.sparse_n     = sparse_n
        self.rerank_input = rerank_input
        self.top_k        = top_k
        self.use_reranker = use_reranker
        self.dense_only   = dense_only
        # Fail fast on every required file before any expensive load starts.
        if not dense_file.exists():
            raise FileNotFoundError(f"Dense index not found: {dense_file}")
        if not meta_file.exists():
            raise FileNotFoundError(f"Metadata not found: {meta_file}")
        if not dense_only and not sparse_file.exists():
            raise FileNotFoundError(f"Sparse index not found: {sparse_file}")
        print(f"  Loading FAISS index from {idx_dir}...")
        self.faiss_index = faiss.read_index(str(dense_file))
        print(f"  FAISS: {self.faiss_index.ntotal:,} vectors")
        print(f"  Loading metadata...")
        self.metadata: list[dict] = []
        with meta_file.open(encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    self.metadata.append(json.loads(line))
        print(f"  Metadata: {len(self.metadata):,} records")
        # FAISS row i is looked up as self.metadata[i]; a count mismatch means
        # some hits will be dropped or paired with the wrong record.
        if self.faiss_index.ntotal != len(self.metadata):
            print(f"  WARNING: FAISS has {self.faiss_index.ntotal:,} vectors but "
                  f"metadata has {len(self.metadata):,} records -- rows may be misaligned")
        self.chunk_id_to_idx = {m["chunk_id"]: i for i, m in enumerate(self.metadata)}
        if not dense_only:
            self.sparse_index = SparseIndex(sparse_file)
        else:
            self.sparse_index = None
        print(f"  Loading query encoder ({EMBED_MODEL})...")
        self.encoder = QueryEncoder(EMBED_MODEL, device)
        self.reranker = None
        if use_reranker:
            print(f"  Loading reranker ({RERANK_MODEL})...")
            self.reranker = Reranker(RERANK_MODEL, device)
        print(f"\n  Retriever ready  "
              f"device={device}  index={idx_dir}  "
              f"dense_n={dense_n}  sparse_n={sparse_n}  "
              f"rerank={use_reranker}  top_k={top_k}\n")

    def retrieve(self, query: str, top_k: Optional[int] = None) -> list[Result]:
        """
        Run the pipeline for `query` and return at most `top_k` results.

        top_k falls back to self.top_k when None (or 0). In hybrid mode the
        fused list is cut to rerank_input before reranking, so at most
        min(top_k, rerank_input) results come back. In dense_only mode the
        Result.rrf_score field carries the raw FAISS score, not an RRF score.
        """
        top_k = top_k or self.top_k
        t0    = time.time()
        # ── 1. Encode query ───────────────────────────────────────────────────
        encoded   = self.encoder.encode(query)
        dense_vec = encoded["dense_vec"]
        sparse_w  = encoded["sparse_weights"]
        # ── 2. Dense retrieval (FAISS) ────────────────────────────────────────
        q_vec            = dense_vec.reshape(1, -1).astype("float32")
        scores, indices  = self.faiss_index.search(q_vec, self.dense_n)
        # Negative ids are dropped (assumed to be FAISS padding for missing hits).
        dense_ranked     = [
            (int(idx), float(score))
            for idx, score in zip(indices[0], scores[0])
            if idx >= 0
        ]
        if self.dense_only:
            results = self._build_results(
                dense_ranked[:top_k],
                dense_ranked=dense_ranked,
                sparse_ranked=[],
            )
            if self.use_reranker and self.reranker:
                results = self.reranker.rerank(query, results)
            return results[:top_k]
        # ── 3. Sparse retrieval ───────────────────────────────────────────────
        sparse_ranked = self.sparse_index.query(sparse_w, self.sparse_n)
        # ── 4. RRF fusion ─────────────────────────────────────────────────────
        fused = reciprocal_rank_fusion(dense_ranked, sparse_ranked, k=RRF_K)
        fused = fused[:self.rerank_input]
        # ── 5. Build Result objects (same helper as the dense-only path) ──────
        candidates = self._build_results(
            fused,
            dense_ranked=dense_ranked,
            sparse_ranked=sparse_ranked,
        )
        # ── 6. Rerank ─────────────────────────────────────────────────────────
        if self.use_reranker and self.reranker and candidates:
            candidates = self.reranker.rerank(query, candidates)
        elapsed = time.time() - t0
        print(f"  Retrieved {len(candidates[:top_k])} results in {elapsed:.2f}s  "
              f"(dense={len(dense_ranked)} sparse={len(sparse_ranked)} "
              f"fused={len(fused)} reranked={self.use_reranker})")
        return candidates[:top_k]

    def _build_results(self, ranked, dense_ranked, sparse_ranked) -> list[Result]:
        """
        Turn (chunk_idx, score) pairs into Result objects, preserving order.

        `ranked` supplies the order and the value stored in rrf_score;
        `dense_ranked` / `sparse_ranked` are only used to look up each chunk's
        1-based rank in those lists (None when absent). Indices beyond the
        metadata list are skipped.
        """
        dense_rank_map  = {idx: r+1 for r, (idx, _) in enumerate(dense_ranked)}
        sparse_rank_map = {idx: r+1 for r, (idx, _) in enumerate(sparse_ranked)}
        results = []
        for chunk_idx, rrf_score in ranked:
            if chunk_idx >= len(self.metadata):
                continue
            m = self.metadata[chunk_idx]
            results.append(Result(
                chunk_id    = m.get("chunk_id", ""),
                body        = m.get("body", ""),
                collection  = m.get("collection", ""),
                date_iso    = m.get("date_iso"),
                speaker     = m.get("speaker"),
                source_url  = m.get("source_url"),
                page_number = m.get("page_number"),
                slug        = m.get("slug"),
                dense_rank  = dense_rank_map.get(chunk_idx),
                sparse_rank = sparse_rank_map.get(chunk_idx),
                rrf_score   = rrf_score,
            ))
        return results
+# ── CLI smoke test ────────────────────────────────────────────────────────────
def main():
    """
    Command-line smoke test: build a Retriever, run one query, print the hits.

    A requested "cuda" device is downgraded to "cpu" when torch cannot be
    imported or reports that CUDA is unavailable.
    """
    parser = argparse.ArgumentParser(description="Nuremberg Scholar -- Retriever smoke test")
    parser.add_argument("--query",      required=True)
    parser.add_argument("--top-k",      type=int, default=DEFAULT_TOP_K)
    parser.add_argument("--device",     default="cuda")
    parser.add_argument("--no-rerank",  action="store_true")
    parser.add_argument("--dense-only", action="store_true")
    parser.add_argument("--dense-n",    type=int, default=DENSE_N)
    parser.add_argument("--sparse-n",   type=int, default=SPARSE_N)
    parser.add_argument("--index-dir",  default=None,
                        help="Path to index directory (default: ./output/index/)")
    opts = parser.parse_args()

    # Fall back to CPU whenever CUDA was requested but cannot be used.
    device = opts.device
    if device == "cuda":
        try:
            import torch
            cuda_usable = torch.cuda.is_available()
        except ImportError:
            cuda_usable = False
        if not cuda_usable:
            device = "cpu"

    rule = "=" * 60
    print("\nNuremberg Scholar -- Retriever")
    print(rule)
    retriever = Retriever(
        index_dir=opts.index_dir,
        device=device,
        dense_n=opts.dense_n,
        sparse_n=opts.sparse_n,
        top_k=opts.top_k,
        use_reranker=not opts.no_rerank,
        dense_only=opts.dense_only,
    )
    print(f"\nQuery: {opts.query}\n")
    hits = retriever.retrieve(opts.query, top_k=opts.top_k)
    print(f"\n{rule}")
    print(f"Top {len(hits)} results:")
    print(f"{rule}\n")
    for position, hit in enumerate(hits, start=1):
        print(f"  -- Result {position} --")
        print(f"  {hit}\n")


if __name__ == "__main__":
    main()