# Source: customeragent-api/server/scripts/populate_vectordb.py
# Uploaded by: anasraza526
# Commit: ac90985 ("Clean deploy to Hugging Face")
import asyncio
import numpy as np
from app.core.database import SessionLocal
from app.models.website import WebsiteContent
from app.services.vector_db import VectorDB
from app.services.vector_operations import VectorOperations
async def populate_vectordb(website_id: int = 8) -> None:
    """Build and save the vector index for one website's stored content.

    Loads every ``WebsiteContent`` row for ``website_id``, splits each row's
    text into chunks, requests an embedding for every retained chunk, and
    hands the embeddings plus per-chunk metadata to a fresh ``VectorDB``
    which is then saved under ``website_id``.

    Args:
        website_id: Identifier of the website whose content is indexed.
            Defaults to 8, the value this script was originally hard-coded
            to use.

    Returns:
        None. Progress is reported on stdout. The function returns early
        (without touching the VectorDB) when no content rows exist or when
        no chunk survives the minimum-length filter.
    """
    print(f"Populating VectorDB for Website {website_id}...")

    texts = []
    metadata = []

    db = SessionLocal()
    try:
        # Get content
        contents = (
            db.query(WebsiteContent)
            .filter(WebsiteContent.website_id == website_id)
            .all()
        )
        print(f"Found {len(contents)} content records")

        if not contents:
            print("No content found!")
            return

        # Prepare data while the session is still open, so ORM attribute
        # access never happens on detached instances.
        for record in contents:
            # A row without text has nothing to chunk; skip it instead of
            # passing None/"" to the chunker.
            if not record.content:
                continue

            chunks = VectorOperations.chunk_text(record.content, max_tokens=500)
            for i, chunk in enumerate(chunks):
                # Drop very short fragments (50 characters or fewer after
                # stripping whitespace).
                if len(chunk.strip()) > 50:
                    texts.append(chunk)
                    metadata.append({
                        "page_url": record.page_url,
                        "content": chunk,
                        # The chunk index is its position in the unfiltered
                        # chunk list, so ids stay stable even when short
                        # chunks are dropped.
                        "chunk_id": f"{record.id}_{i}",
                    })
    finally:
        # Always release the session, including on the early return above
        # and on any exception; it is not needed for the embedding phase.
        db.close()

    print(f"Created {len(texts)} chunks")
    if not texts:
        return

    # Generate embeddings one at a time, in order, so embeddings[i] always
    # corresponds to metadata[i].
    print("Generating embeddings...")
    embeddings = []
    for i, text in enumerate(texts):
        if i % 10 == 0:
            print(f"Processing chunk {i}/{len(texts)}")
        embedding = await VectorOperations.get_embedding(text)
        embeddings.append(embedding)

    # Save to VectorDB
    print("Saving to VectorDB...")
    vector_db = VectorDB()
    embeddings_array = np.array(embeddings, dtype=np.float32)
    vector_db.add_vectors(embeddings_array, metadata)
    vector_db.save(website_id)
    print("✓ VectorDB populated successfully!")
if __name__ == "__main__":
    # Script entry point: build the coroutine for the default website and
    # drive it to completion on a fresh event loop.
    job = populate_vectordb()
    asyncio.run(job)