researchpilot-api / merge_embeddings.py
Subhadip007's picture
feat: ResearchPilot v2 — Next.js frontend, 358k embeddings pipeline, LaTeX math rendering, dual-GPU Kaggle scaling
99cac84
"""
Merge old (local) and new (Kaggle) embeddings into a single set.
This script:
1. Loads existing embeddings.npy + chunk_ids.npy (your ~51k chunks)
2. Loads new_embeddings.npy + new_chunk_ids.npy (from Kaggle batch)
3. Deduplicates by chunk_id
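4. Saves the merged result back as embeddings.npy + chunk_ids.npy and rewrites embedding_index.json
5. Backs up the originals first (into data/embeddings/backup_before_merge/), before anything is overwritten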
Run from project root:
python merge_embeddings.py
"""
import json
import shutil
import numpy as np
from pathlib import Path
# Directory holding every embedding artifact. The path is relative, so the
# script has to be launched from the project root.
EMBEDDINGS_DIR = Path("data/embeddings")

# File paths
# "old" = the existing local set, which is also where the merged result is
# written back; "new" = the batch to be merged in.
old_emb_file = EMBEDDINGS_DIR / "embeddings.npy"
old_ids_file = EMBEDDINGS_DIR / "chunk_ids.npy"
new_emb_file = EMBEDDINGS_DIR / "new_embeddings.npy"
new_ids_file = EMBEDDINGS_DIR / "new_chunk_ids.npy"

# Backup dir
backup_dir = EMBEDDINGS_DIR / "backup_before_merge"
# parents=True: without it a missing data/embeddings directory raises
# FileNotFoundError right here at import time, before main() gets the chance
# to print its own "Missing file" report.
backup_dir.mkdir(parents=True, exist_ok=True)
def _alignment_error(label, embeddings, ids):
    """Return a message if *embeddings* and *ids* are inconsistent, else None."""
    if embeddings.ndim != 2:
        return f"{label} embeddings must be a 2-D array, got shape {embeddings.shape}"
    if embeddings.shape[0] != len(ids):
        return (
            f"{label} embeddings have {embeddings.shape[0]:,} rows "
            f"but {len(ids):,} chunk IDs"
        )
    return None


def _select_unseen(known_ids, candidate_ids):
    """Return positions in *candidate_ids* whose ID has not been seen yet.

    An ID counts as seen if it is in *known_ids* or occurred earlier in
    *candidate_ids*, so repeats inside the candidate batch are dropped too
    (the first occurrence wins).
    """
    seen = set(known_ids)
    keep = []
    for pos, cid in enumerate(candidate_ids):
        if cid in seen:
            continue
        seen.add(cid)
        keep.append(pos)
    return keep


def main():
    """Merge the new embedding batch into the existing embedding set.

    Reads the four ``.npy`` files named by the module-level path constants,
    drops every new chunk whose ID already exists (either in the old set or
    earlier in the new batch), backs up the old files into ``backup_dir`` and
    then overwrites the old files with the merged result. Finally rewrites
    ``embedding_index.json`` inside ``EMBEDDINGS_DIR``.

    Returns:
        None. If an input file is missing, or the inputs are inconsistent
        (not 2-D, row count differs from ID count, or old/new dimensions
        differ), a message is printed and the function returns before
        anything is written.
    """
    print("=" * 60)
    print(" RESEARCHPILOT β€” EMBEDDING MERGE TOOL")
    print("=" * 60)

    # ── Step 1: Validate files exist ──
    for f in [old_emb_file, old_ids_file, new_emb_file, new_ids_file]:
        if not f.exists():
            print(f"❌ Missing file: {f}")
            return
    print("βœ… All required files found.\n")

    # NOTE(security): allow_pickle=True unpickles arbitrary objects, which can
    # execute code. It is needed because the IDs are stored as an object
    # array (see the save step below); only run this on files you produced.

    # ── Step 2: Load old embeddings ──
    print("Loading OLD embeddings...")
    old_embeddings = np.load(str(old_emb_file))
    # .tolist() yields native Python values, so the ID sample written to the
    # JSON index is always serialisable.
    old_ids = np.load(str(old_ids_file), allow_pickle=True).tolist()
    problem = _alignment_error("Old", old_embeddings, old_ids)
    if problem:
        print(f"❌ {problem}")
        return
    print(f" Old: {old_embeddings.shape[0]:,} chunks, dim={old_embeddings.shape[1]}")

    # ── Step 3: Load new embeddings ──
    print("Loading NEW embeddings (from Kaggle)...")
    new_embeddings = np.load(str(new_emb_file))
    new_ids = np.load(str(new_ids_file), allow_pickle=True).tolist()
    problem = _alignment_error("New", new_embeddings, new_ids)
    if problem:
        print(f"❌ {problem}")
        return
    print(f" New: {new_embeddings.shape[0]:,} chunks, dim={new_embeddings.shape[1]}")

    # Rows of different width cannot be stacked; report it before touching
    # any file instead of failing inside np.vstack.
    if old_embeddings.shape[1] != new_embeddings.shape[1]:
        print(
            f"❌ Embedding dimension mismatch: old dim={old_embeddings.shape[1]}, "
            f"new dim={new_embeddings.shape[1]}"
        )
        return

    # ── Step 4: Deduplicate ──
    print("\nDeduplicating...")
    keep_indices = _select_unseen(old_ids, new_ids)
    # Explicit integer dtype keeps the fancy index valid when nothing is kept.
    unique_new_embeddings = new_embeddings[np.asarray(keep_indices, dtype=np.intp)]
    unique_new_ids = [new_ids[i] for i in keep_indices]
    duplicates_removed = len(new_ids) - len(unique_new_ids)
    print(f" Duplicates skipped: {duplicates_removed}")
    print(f" Unique new chunks: {len(unique_new_ids):,}")

    # ── Step 5: Merge ──
    print("\nMerging...")
    merged_embeddings = np.vstack([old_embeddings, unique_new_embeddings])
    merged_ids = old_ids + unique_new_ids
    print(f" MERGED TOTAL: {merged_embeddings.shape[0]:,} chunks")

    # ── Step 6: Backup originals ──
    print("\nBacking up originals...")
    # Create the backup directory here so the copy cannot fail on a missing
    # target directory.
    backup_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy2(old_emb_file, backup_dir / "embeddings_old.npy")
    shutil.copy2(old_ids_file, backup_dir / "chunk_ids_old.npy")
    print(f" Backed up to: {backup_dir}")

    # ── Step 7: Save merged files ──
    print("\nSaving merged embeddings...")
    np.save(str(old_emb_file), merged_embeddings)
    np.save(str(old_ids_file), np.array(merged_ids, dtype=object))

    # Update the index file
    index = {
        "total_embeddings": len(merged_ids),
        "embedding_dimension": int(merged_embeddings.shape[1]),
        "model_name": "BAAI/bge-base-en-v1.5",
        "chunk_id_sample": merged_ids[:5],
    }
    with open(EMBEDDINGS_DIR / "embedding_index.json", "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)
    print(f" βœ… embeddings.npy β†’ {merged_embeddings.shape}")
    print(f" βœ… chunk_ids.npy β†’ {len(merged_ids):,} IDs")
    print(" βœ… embedding_index.json updated")

    # ── Summary ──
    size_mb = (EMBEDDINGS_DIR / "embeddings.npy").stat().st_size / 1e6
    print(f"\n{'=' * 60}")
    print(" MERGE COMPLETE!")
    print(f" Old: {len(old_ids):,} chunks")
    print(f" + New: {len(unique_new_ids):,} chunks")
    print(f" = Total: {len(merged_ids):,} chunks")
    print(f" File size: {size_mb:.0f} MB")
    print(f"{'=' * 60}")
    print("\nπŸ‘‰ Now run: python run_indexing.py --recreate")
# Run the merge only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()