researchpilot-api / reindex_light.py
Subhadip007's picture
feat: ResearchPilot v2 — Next.js frontend, 358k embeddings pipeline, LaTeX math rendering, dual-GPU Kaggle scaling
99cac84
"""
Re-index Qdrant with a LIMITED number of chunks (for low-RAM machines).
Your full 358k embeddings stay on disk untouched.
This only controls how many get loaded into the Qdrant search index.
Usage:
    python reindex_light.py                 (default: 100,000 chunks)
    python reindex_light.py --limit 50000
"""
import sys
import json
import numpy as np
from pathlib import Path
from src.utils.logger import setup_logger, get_logger
from src.vectorstore.qdrant_store import QdrantStore
from src.embeddings.embedding_cache import EmbeddingCache
from config.settings import CHUNKS_DIR
# Initialise logging once at import time, then create this module's logger.
# NOTE(review): setup_logger/get_logger are project helpers imported from
# src.utils.logger; what they configure is not visible from this file.
setup_logger()
logger = get_logger(__name__)
def _parse_limit(argv=None, default=100_000):
    """Return the chunk limit requested on the command line.

    Args:
        argv: Argument list to parse, without the program name. ``None``
            means ``sys.argv[1:]``.
        default: Limit used when ``--limit`` is not supplied.

    Returns:
        The chunk limit as a positive integer.

    Raises:
        SystemExit: Raised by ``argparse`` (exit status 2) when ``--limit``
            has no value, is not an integer, or is not positive.
    """
    import argparse  # local import: only needed for command-line parsing

    parser = argparse.ArgumentParser(
        description="Re-index Qdrant with a limited number of chunks.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=default,
        metavar="N",
        help="maximum number of chunks to index (default: %(default)s)",
    )
    # parse_known_args keeps the script tolerant of unrelated arguments,
    # which the previous hand-rolled sys.argv scan simply ignored.
    args, _unknown = parser.parse_known_args(argv)
    if args.limit <= 0:
        parser.error(f"--limit must be a positive integer, got {args.limit}")
    return args.limit


def _load_chunks(chunk_files, limit):
    """Read at most *limit* chunks from the given JSON chunk files.

    A file may hold either a dict with a ``"chunks"`` entry or a bare list of
    chunk dicts; any other top-level shape is skipped with a warning.

    Args:
        chunk_files: Iterable of paths to JSON files, read in the given order.
        limit: Maximum number of chunks to collect across all files.

    Returns:
        A ``(chunk_ids, texts, metadata)`` tuple of three parallel lists.
        Each metadata entry is the chunk dict without its ``'text'`` key.

    Raises:
        KeyError: If a chunk lacks ``'chunk_id'`` or ``'text'``.
    """
    chunk_ids, texts, metadata = [], [], []
    for path in chunk_files:
        if len(chunk_ids) >= limit:
            break
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        # Handle both on-disk formats.
        if isinstance(raw, dict) and "chunks" in raw:
            chunk_list = raw["chunks"]
        elif isinstance(raw, list):
            chunk_list = raw
        else:
            # Previously skipped silently; make the skip visible.
            logger.warning(f"Skipping {path}: unrecognized chunk file format")
            continue

        for chunk in chunk_list:
            if len(chunk_ids) >= limit:
                break
            chunk_ids.append(chunk["chunk_id"])
            texts.append(chunk["text"])
            metadata.append({k: v for k, v in chunk.items() if k != "text"})
    return chunk_ids, texts, metadata


def main(argv=None):
    """Rebuild the Qdrant collection from a limited number of chunks.

    Steps: read up to ``--limit`` chunks from the ``*_semantic.json`` files in
    ``CHUNKS_DIR``, pair each chunk with its row in the embedding cache,
    recreate the Qdrant collection, and index the matched chunks.

    The existing collection is only recreated after at least one chunk has
    been matched to an embedding, so a misconfigured run cannot leave the
    search index empty.

    Args:
        argv: Optional argument list (without the program name); defaults to
            ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid ``--limit`` input, when no chunks are found on
            disk, or when none of the loaded chunks has a cached embedding.
    """
    limit = _parse_limit(argv)

    banner = "=" * 60
    print(banner)
    print(" LIGHTWEIGHT RE-INDEXER (RAM-safe)")
    print(f" Chunk limit: {limit:,}")
    print(f"{banner}\n")

    # Step 1: Load chunk files from disk (only up to limit)
    print("Step 1: Loading chunk files...")
    chunk_files = sorted(CHUNKS_DIR.glob("*_semantic.json"))
    print(f" Found {len(chunk_files)} chunk files on disk")
    chunk_ids, texts, metadata = _load_chunks(chunk_files, limit)
    print(f" Loaded {len(chunk_ids):,} chunks (limit: {limit:,})\n")
    if not chunk_ids:
        sys.exit(
            f"ERROR: no chunks could be loaded from {CHUNKS_DIR}; "
            "existing Qdrant collection left untouched."
        )

    # Step 2: Load embeddings and match to chunks
    print("Step 2: Loading embedding cache...")
    cache = EmbeddingCache()
    cache.load()
    embeddings_matrix, cached_ids = cache.get_all()
    print(f" Cache has {len(cached_ids):,} embeddings")

    # Map each cached chunk id to its row in the embedding matrix.
    id_to_row = {cid: row for row, cid in enumerate(cached_ids)}

    # Keep only chunks that have an embedding, preserving load order.
    final_chunk_ids, final_texts, final_metadata, embed_indices = [], [], [], []
    for cid, text, meta in zip(chunk_ids, texts, metadata):
        row = id_to_row.get(cid)
        if row is None:
            continue
        final_chunk_ids.append(cid)
        final_texts.append(text)
        final_metadata.append(meta)
        embed_indices.append(row)
    print(f" Matched {len(final_chunk_ids):,} chunks with embeddings\n")

    if not final_chunk_ids:
        # Bail out BEFORE the destructive recreate below.
        sys.exit(
            "ERROR: none of the loaded chunks has a cached embedding; "
            "existing Qdrant collection left untouched."
        )

    final_embeddings = embeddings_matrix[embed_indices]
    # Assumes a 2-D (n_chunks, dim) array — TODO confirm against
    # EmbeddingCache.get_all(); replaces the previously hard-coded 768.
    embedding_dim = final_embeddings.shape[1]

    # Drop local references to the full cache before indexing to lower peak
    # memory. NOTE(review): this only frees the full matrix if list indexing
    # above returned a copy (true for NumPy arrays) — confirm the return type.
    del cache, embeddings_matrix, cached_ids, id_to_row

    # Step 3: Recreate Qdrant collection
    print("Step 3: Rebuilding Qdrant collection...")
    store = QdrantStore()
    store.create_collection(recreate=True)

    # Step 4: Index
    print(f"Step 4: Indexing {len(final_chunk_ids):,} chunks into Qdrant...")
    total = store.index_chunks(
        embeddings=final_embeddings,
        chunk_ids=final_chunk_ids,
        metadata=final_metadata,
        texts=final_texts,
    )

    print(f"\n{banner}")
    print(" ✅ INDEXING COMPLETE")
    print(f" Chunks indexed: {total:,}")
    print(f" Collection: {store.get_collection_info()}")
    # 4 bytes per component, i.e. float32 vectors as in the original estimate.
    print(f" RAM usage: ~{total * embedding_dim * 4 / 1e6:.0f} MB (vectors only)")
    print(banner)
    print("\n 👉 Now run: python run_api.py")
# Script entry point: run the re-indexer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()