"""
Merge old (local) and new (Kaggle) embeddings into a single set.

This script:
  1. Loads existing embeddings.npy + chunk_ids.npy  (your ~51k chunks)
  2. Loads new_embeddings.npy + new_chunk_ids.npy  (from Kaggle batch)
  3. Deduplicates by chunk_id
  4. Saves the merged result back as embeddings.npy + chunk_ids.npy
  5. Backs up the originals first

Run from project root:
    python merge_embeddings.py
"""

import json
import shutil
import numpy as np
from pathlib import Path

EMBEDDINGS_DIR = Path("data/embeddings")

# File paths
old_emb_file  = EMBEDDINGS_DIR / "embeddings.npy"
old_ids_file  = EMBEDDINGS_DIR / "chunk_ids.npy"
new_emb_file  = EMBEDDINGS_DIR / "new_embeddings.npy"
new_ids_file  = EMBEDDINGS_DIR / "new_chunk_ids.npy"

# Backup dir
backup_dir = EMBEDDINGS_DIR / "backup_before_merge"
backup_dir.mkdir(exist_ok=True)


def main():
    print("=" * 60)
    print("  RESEARCHPILOT — EMBEDDING MERGE TOOL")
    print("=" * 60)

    # ── Step 1: Validate files exist ──
    for f in [old_emb_file, old_ids_file, new_emb_file, new_ids_file]:
        if not f.exists():
            print(f"❌ Missing file: {f}")
            return
    print("✅ All required files found.\n")

    # ── Step 2: Load old embeddings ──
    print("Loading OLD embeddings...")
    old_embeddings = np.load(str(old_emb_file))
    old_ids = list(np.load(str(old_ids_file), allow_pickle=True))
    print(f"   Old: {old_embeddings.shape[0]:,} chunks, dim={old_embeddings.shape[1]}")

    # ── Step 3: Load new embeddings  ──
    print("Loading NEW embeddings (from Kaggle)...")
    new_embeddings = np.load(str(new_emb_file))
    new_ids = list(np.load(str(new_ids_file), allow_pickle=True))
    print(f"   New: {new_embeddings.shape[0]:,} chunks, dim={new_embeddings.shape[1]}")

    # ── Step 4: Deduplicate ──
    print("\nDeduplicating...")
    old_id_set = set(old_ids)
    keep_indices = []
    for i, cid in enumerate(new_ids):
        if cid not in old_id_set:
            keep_indices.append(i)

    unique_new_embeddings = new_embeddings[keep_indices]
    unique_new_ids = [new_ids[i] for i in keep_indices]
    duplicates_removed = len(new_ids) - len(unique_new_ids)
    print(f"   Duplicates skipped: {duplicates_removed}")
    print(f"   Unique new chunks:  {len(unique_new_ids):,}")

    # ── Step 5: Merge ──
    print("\nMerging...")
    merged_embeddings = np.vstack([old_embeddings, unique_new_embeddings])
    merged_ids = old_ids + unique_new_ids
    print(f"   MERGED TOTAL: {merged_embeddings.shape[0]:,} chunks")

    # ── Step 6: Backup originals ──
    print("\nBacking up originals...")
    shutil.copy2(old_emb_file, backup_dir / "embeddings_old.npy")
    shutil.copy2(old_ids_file, backup_dir / "chunk_ids_old.npy")
    print(f"   Backed up to: {backup_dir}")

    # ── Step 7: Save merged files ──
    print("\nSaving merged embeddings...")
    np.save(str(old_emb_file), merged_embeddings)
    np.save(str(old_ids_file), np.array(merged_ids, dtype=object))

    # Update the index file
    index = {
        "total_embeddings": len(merged_ids),
        "embedding_dimension": int(merged_embeddings.shape[1]),
        "model_name": "BAAI/bge-base-en-v1.5",
        "chunk_id_sample": merged_ids[:5],
    }
    with open(EMBEDDINGS_DIR / "embedding_index.json", "w") as f:
        json.dump(index, f, indent=2)

    print(f"   ✅ embeddings.npy  → {merged_embeddings.shape}")
    print(f"   ✅ chunk_ids.npy   → {len(merged_ids):,} IDs")
    print(f"   ✅ embedding_index.json updated")

    # ── Summary ──
    size_mb = (EMBEDDINGS_DIR / "embeddings.npy").stat().st_size / 1e6
    print(f"\n{'=' * 60}")
    print(f"  MERGE COMPLETE!")
    print(f"  Old:    {len(old_ids):,} chunks")
    print(f"  + New:  {len(unique_new_ids):,} chunks")
    print(f"  = Total: {len(merged_ids):,} chunks")
    print(f"  File size: {size_mb:.0f} MB")
    print(f"{'=' * 60}")
    print(f"\n👉 Now run:  python run_indexing.py --recreate")


if __name__ == "__main__":
    main()