# repair_iqra_index.py — one-off maintenance script: rebuild the FAISS vector
# index for a website from its saved chunk metadata JSON.
# (Originally committed as part of a Hugging Face deploy: commit ac90985.)
import asyncio
import json
import os
import sys
import numpy as np
# Setup paths: make the parent "server" directory importable so the
# app.services package resolves when this script is run directly from scripts/.
server_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(server_dir)
from app.services.vector_db import VectorDB
from app.services.vector_operations import VectorOperations
async def repair_index(website_id: int = 22):
    """Rebuild the FAISS index for *website_id* from its saved chunk metadata.

    Reads ``vector_db/metadata_<website_id>.json``, re-embeds every chunk's
    text as a passage, deletes the stale in-memory/on-disk index, and saves a
    freshly built index in its place.

    Args:
        website_id: ID of the website whose index is being repaired.
            Defaults to 22 (the original hard-coded target of this script).

    Returns:
        None. Exits early (with a printed message) if the metadata file is
        missing or no embeddings could be generated.
    """
    metadata_path = os.path.join(server_dir, "vector_db", f"metadata_{website_id}.json")
    index_path = os.path.join(server_dir, "vector_db", f"index_{website_id}.faiss")

    print(f"Repairing index for website {website_id}...")

    if not os.path.exists(metadata_path):
        print(f"Error: Metadata not found at {metadata_path}")
        return

    # Explicit UTF-8: the metadata JSON may contain non-ASCII chunk text and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata_list = json.load(f)

    print(f"Found {len(metadata_list)} chunks in metadata.")

    new_embeddings = []
    for i, meta in enumerate(metadata_list):
        # Metadata schema has varied over time: newer entries use 'text',
        # older ones 'content'.
        text = meta.get('text', meta.get('content', ''))
        if not text:
            print(f"Warning: Empty text in chunk {i}")
            continue
        if i % 5 == 0:
            print(f"Embedding chunk {i}/{len(metadata_list)}...")
        # Use is_query=False to apply "passage: " prefix for indexing
        emb = await VectorOperations.get_embedding(text, is_query=False)
        new_embeddings.append(emb)

    if not new_embeddings:
        print("No embeddings generated.")
        return

    print("Saving to new FAISS index...")
    vdb = VectorDB()
    # Delete old index to ensure a fresh start in memory cache
    vdb.delete_index(website_id)
    # Re-initialize for adding; FAISS expects float32 vectors.
    vdb.add_vectors(np.array(new_embeddings, dtype=np.float32), metadata_list, website_id)
    vdb.save(website_id)
    print(f"✓ Repair complete. New index saved to {index_path}")
# Script entry point: drive the async repair through a fresh event loop.
if __name__ == "__main__":
    asyncio.run(repair_index())