# customeragent-api/tests/verify_integrated_data.py
# anasraza526 - commit ac90985: "Clean deploy to Hugging Face"
import asyncio
import os
import sys
# Disable tokenizers parallelism for stability.
# This is set before the app imports below so the variable is already in the
# environment by the time those modules load.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Add the ../server directory (resolved relative to this file) to the import path.
# Presumably that is where the `app` package imported below lives - TODO confirm.
sys.path.append(os.path.join(os.path.dirname(__file__), '../server'))
from app.services.medical_retriever import get_medical_retriever
from app.services.vector_operations import VectorOperations
async def verify_data() -> bool:
    """Check that the medical retriever's index is populated and searchable.

    Patches ``VectorOperations.get_embedding`` with a lightweight embedding,
    lists every indexed document with its source, then runs a fixed set of
    probe queries through ``retriever.search_medical`` and prints the top hit
    for each one.

    A probe whose top hit comes from an unexpected source only produces a
    warning; a probe that returns no results at all marks the run as failed.

    Returns:
        True when every probe query returned at least one result,
        False otherwise.
    """
    print("🔍 VERIFYING INTEGRATED HEALTHCARE DATA...")

    # SYSTEM STABILITY: Use same safe embedding as ingestion for verification
    async def safe_get_embedding(text: str):
        return VectorOperations._simple_embedding(text).tolist()

    # Patch it globally for this script, before the retriever is created.
    # Wrapped in staticmethod so the replacement behaves the same whether it
    # is called on the class (as below) or on an instance; a bare function
    # would receive the instance as ``text`` in the latter case.
    VectorOperations.get_embedding = staticmethod(safe_get_embedding)

    retriever = get_medical_retriever()

    print(f"\n📊 TOTAL DOCUMENTS INDEXED: {len(retriever.documents)}")
    metadata = retriever.doc_metadata
    for i, doc in enumerate(retriever.documents):
        # Metadata may be shorter than the document list; fall back to "Unknown".
        source = metadata[i].get('source') if i < len(metadata) else "Unknown"
        print(f" [{i}] Source: {source} | Text: {doc[:80]}...")

    # (query text, source the top hit is expected to come from)
    test_queries = [
        ("treatment for dummyitis", "MedQuAD"),
        ("is X effective", "PubMedQA"),
        ("what is flu", "WHO/CDC"),
    ]

    all_passed = True
    for query_text, expected_source in test_queries:
        print(f"\n🔎 Searching for: '{query_text}'...")
        embedding = await VectorOperations.get_embedding(query_text)
        results = retriever.search_medical(query_text, embedding, top_k=3)

        if not results:
            print(f"❌ No results found for '{query_text}'")
            all_passed = False
            continue

        match = results[0]
        found_source = match.get('source')
        # Missing or None fields must not crash the report, so normalise
        # them to printable values before slicing/formatting.
        text = match.get('text') or ''
        answer = match.get('answer', text) or ''
        confidence = match.get('confidence')
        confidence_str = f"{confidence:.4f}" if confidence is not None else "N/A"

        print("✅ Found Match!")
        print(f" Source: {found_source}")
        print(f" Retrieved Text: {text[:100]}...")
        print(f" Answer: {answer[:100]}...")
        print(f" Confidence: {confidence_str}")

        if found_source != expected_source:
            print(f"⚠️ Warning: Source mismatch. Expected {expected_source}, got {found_source}")

    if all_passed:
        print("\n🎉 VERIFICATION SUCCESS: All datasets are correctly integrated and searchable!")
    else:
        print("\n❌ VERIFICATION FAILED: Some data could not be retrieved.")
    return all_passed


if __name__ == "__main__":
    # Propagate the outcome through the process exit status so shell and CI
    # callers can detect a failed verification (0 = success, 1 = failure).
    sys.exit(0 if asyncio.run(verify_data()) else 1)