Spaces:

Subhadip007
/

researchpilot-api

Running

App Files Files Community

researchpilot-api / reindex_light.py

Subhadip007

feat: ResearchPilot v2 — Next.js frontend, 358k embeddings pipeline, LaTeX math rendering, dual-GPU Kaggle scaling

99cac84 10 days ago

raw

history blame contribute delete

3.59 kB

	"""
	Re-index Qdrant with a LIMITED number of chunks (for low-RAM machines).

	Your full 358k embeddings stay on disk untouched.
	This only controls how many get loaded into the Qdrant search index.

	Usage:
	python reindex_light.py (default: 100,000 chunks)
	python reindex_light.py --limit 50000
	"""

	import sys
	import json
	import numpy as np
	from pathlib import Path

	from src.utils.logger import setup_logger, get_logger
	from src.vectorstore.qdrant_store import QdrantStore
	from src.embeddings.embedding_cache import EmbeddingCache
	from config.settings import CHUNKS_DIR

	# Initialise logging first, then create this module's logger.
	# NOTE(review): setup_logger/get_logger are project helpers whose behaviour is
	# not visible here; `logger` is not referenced anywhere else in the visible code.
	setup_logger()
	logger = get_logger(__name__)


	_DEFAULT_CHUNK_LIMIT = 100_000
	_FALLBACK_EMBEDDING_DIM = 768  # width assumed by the original RAM estimate


	def _parse_limit(argv=None):
	    """Return the chunk limit requested via ``--limit``.

	    Args:
	        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

	    Returns:
	        The positive integer limit, or ``_DEFAULT_CHUNK_LIMIT`` when the
	        flag is absent.

	    Raises:
	        SystemExit: With status 2 when the value is missing, is not an
	            integer, or is not positive.
	    """
	    # Imported locally so this block needs no new file-level import.
	    import argparse

	    parser = argparse.ArgumentParser(
	        description="Re-index Qdrant with a limited number of chunks.",
	        allow_abbrev=False,
	    )
	    parser.add_argument(
	        "--limit",
	        type=int,
	        default=_DEFAULT_CHUNK_LIMIT,
	        help="maximum number of chunks to index (default: %(default)s)",
	    )
	    # Unknown arguments are tolerated, as the previous hand-rolled parsing did.
	    args, _unknown = parser.parse_known_args(argv)
	    if args.limit <= 0:
	        parser.error(f"--limit must be a positive integer, got {args.limit}")
	    return args.limit


	def _load_chunks(limit):
	    """Read chunk records from ``CHUNKS_DIR`` until ``limit`` are collected.

	    Files matching ``*_semantic.json`` are visited in sorted order. A file
	    may hold either a dict with a ``"chunks"`` key or a bare list of chunk
	    dicts; any other top-level shape is reported and skipped.

	    Args:
	        limit: Maximum number of chunks to return.

	    Returns:
	        Three parallel lists ``(chunk_ids, texts, metadata)``; each metadata
	        entry is the chunk dict without its ``"text"`` key.
	    """
	    chunk_ids = []
	    texts = []
	    metadata = []

	    chunk_files = sorted(CHUNKS_DIR.glob("*_semantic.json"))
	    print(f" Found {len(chunk_files)} chunk files on disk")

	    for chunk_file in chunk_files:
	        if len(chunk_ids) >= limit:
	            break

	        with open(chunk_file, "r", encoding="utf-8") as f:
	            raw = json.load(f)

	        # Handle both on-disk formats.
	        if isinstance(raw, dict) and "chunks" in raw:
	            chunk_list = raw["chunks"]
	        elif isinstance(raw, list):
	            chunk_list = raw
	        else:
	            # Previously skipped silently; say so, so missing data is visible.
	            print(f" Skipping {chunk_file}: unrecognized chunk file format")
	            continue

	        for chunk in chunk_list:
	            if len(chunk_ids) >= limit:
	                break
	            chunk_ids.append(chunk["chunk_id"])
	            texts.append(chunk["text"])
	            metadata.append({k: v for k, v in chunk.items() if k != "text"})

	    return chunk_ids, texts, metadata


	def main(argv=None):
	    """Rebuild the Qdrant collection from a limited number of chunks.

	    Steps: load up to ``--limit`` chunks from disk, pair each with its row
	    in the embedding cache, recreate the Qdrant collection, and index the
	    matched chunks. Chunks without a cached embedding are dropped, so fewer
	    than ``--limit`` chunks may end up indexed.

	    Args:
	        argv: Optional argument list (for programmatic use); ``None`` reads
	            the process command line.

	    Raises:
	        SystemExit: On an invalid ``--limit`` (status 2), or when no chunk
	            has a cached embedding (status 1). In the latter case the
	            existing collection is left untouched instead of being
	            replaced by an empty one.
	    """
	    limit = _parse_limit(argv)

	    banner = "=" * 60
	    print(banner)
	    print(" LIGHTWEIGHT RE-INDEXER (RAM-safe)")
	    print(f" Chunk limit: {limit:,}")
	    print(f"{banner}\n")

	    # Step 1: Load chunk files from disk (only up to limit)
	    print("Step 1: Loading chunk files...")
	    chunk_ids, texts, metadata = _load_chunks(limit)
	    print(f" Loaded {len(chunk_ids):,} chunks (limit: {limit:,})\n")

	    # Step 2: Load embeddings and match to chunks
	    print("Step 2: Loading embedding cache...")
	    cache = EmbeddingCache()
	    cache.load()
	    embeddings_matrix, cached_ids = cache.get_all()
	    print(f" Cache has {len(cached_ids):,} embeddings")

	    # Map each cached id to its row in the embedding matrix.
	    id_to_row = {cid: row for row, cid in enumerate(cached_ids)}

	    # (chunk position, embedding row) for every chunk that has an embedding.
	    matched = [
	        (i, id_to_row[cid])
	        for i, cid in enumerate(chunk_ids)
	        if cid in id_to_row
	    ]
	    print(f" Matched {len(matched):,} chunks with embeddings\n")

	    if not matched:
	        # Recreating the collection now would wipe a working index and
	        # leave it empty, so stop before the destructive step.
	        sys.exit(
	            "No chunks matched a cached embedding; "
	            "existing Qdrant collection left untouched."
	        )

	    embed_rows = [row for _, row in matched]
	    final_chunk_ids = [chunk_ids[i] for i, _ in matched]
	    final_texts = [texts[i] for i, _ in matched]
	    final_metadata = [metadata[i] for i, _ in matched]
	    # Indexing with a list of rows: assumes a NumPy-style matrix — TODO confirm.
	    final_embeddings = embeddings_matrix[embed_rows]

	    # Step 3: Recreate Qdrant collection
	    print("Step 3: Rebuilding Qdrant collection...")
	    store = QdrantStore()
	    store.create_collection(recreate=True)

	    # Step 4: Index
	    print(f"Step 4: Indexing {len(final_chunk_ids):,} chunks into Qdrant...")
	    total = store.index_chunks(
	        embeddings=final_embeddings,
	        chunk_ids=final_chunk_ids,
	        metadata=final_metadata,
	        texts=final_texts,
	    )

	    # Use the real embedding width when the matrix exposes a 2-D shape; the
	    # previous estimate hard-coded 768. The 4 bytes per component (float32)
	    # is kept from the original estimate.
	    try:
	        dim = final_embeddings.shape[1]
	    except (AttributeError, IndexError):
	        dim = _FALLBACK_EMBEDDING_DIM

	    print(f"\n{banner}")
	    print(" ✅ INDEXING COMPLETE")
	    print(f" Chunks indexed: {total:,}")
	    print(f" Collection: {store.get_collection_info()}")
	    print(f" RAM usage: ~{total * dim * 4 / 1e6:.0f} MB (vectors only)")
	    print(banner)
	    print("\n 👉 Now run: python run_api.py")


	# Script entry point: run the re-indexer only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	    main()