Spaces:
Runtime error
Runtime error
| import asyncio | |
| import os | |
| import sys | |
# Disable tokenizers parallelism for stability.
# NOTE(review): presumably this has to be set before the app imports below
# pull in any tokenizer-backed module -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Add server directory to path (presumably so the ``app.*`` imports below
# resolve). The location is made absolute and normalised so the entry does not
# depend on the current working directory or on ``__file__`` being relative.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), '../server'))
)
| from app.services.medical_retriever import get_medical_retriever | |
| from app.services.vector_operations import VectorOperations | |
async def verify_data():
    """Check that the integrated healthcare data is indexed and searchable.

    Prints every document held by the medical retriever together with its
    source, then runs one sample query per expected dataset through
    ``retriever.search_medical`` and prints the top hit.

    Side effects:
        Replaces ``VectorOperations.get_embedding`` for the rest of the
        process with a wrapper around ``VectorOperations._simple_embedding``.

    Returns:
        bool: ``True`` when every sample query produced at least one result,
        ``False`` otherwise. A top hit whose source differs from the expected
        one is reported as a warning only and does not fail the run.
    """
    print("π VERIFYING INTEGRATED HEALTHCARE DATA...")

    # SYSTEM STABILITY: Use same safe embedding as ingestion for verification
    async def safe_get_embedding(text: str):
        return VectorOperations._simple_embedding(text).tolist()

    # Patch it globally for this script, before the retriever is created.
    # ``staticmethod`` keeps the replacement's only argument equal to the text
    # whether it is looked up on the class or on an instance; a bare function
    # assigned to the class would receive the instance as ``text``.
    # NOTE(review): how the retriever itself calls get_embedding is not
    # visible here -- confirm.
    VectorOperations.get_embedding = staticmethod(safe_get_embedding)

    retriever = get_medical_retriever()
    print(f"\nπ TOTAL DOCUMENTS INDEXED: {len(retriever.documents)}")
    for i, doc in enumerate(retriever.documents):
        # The metadata list may be shorter than the document list.
        if i < len(retriever.doc_metadata):
            source = retriever.doc_metadata[i].get('source')
        else:
            source = "Unknown"
        print(f" [{i}] Source: {source} | Text: {doc[:80]}...")

    # (query text, source the top hit is expected to come from)
    test_queries = [
        ("treatment for dummyitis", "MedQuAD"),
        ("is X effective", "PubMedQA"),
        ("what is flu", "WHO/CDC"),
    ]

    all_passed = True
    for query_text, expected_source in test_queries:
        print(f"\nπ Searching for: '{query_text}'...")
        embedding = await VectorOperations.get_embedding(query_text)
        results = retriever.search_medical(query_text, embedding, top_k=3)

        if not results:
            print(f"β No results found for '{query_text}'")
            all_passed = False
            continue

        match = results[0]
        found_source = match.get('source')

        # Missing or None fields must not abort the whole verification run.
        text = match.get('text') or ''
        answer = match.get('answer')
        if answer is None:
            answer = text
        confidence = match.get('confidence')
        confidence_text = "n/a" if confidence is None else f"{confidence:.4f}"

        print("β Found Match!")
        print(f" Source: {found_source}")
        print(f" Retrieved Text: {text[:100]}...")
        print(f" Answer: {answer[:100]}...")
        print(f" Confidence: {confidence_text}")

        # A mismatch is informational only; it does not flip ``all_passed``.
        if found_source != expected_source:
            print(f"β οΈ Warning: Source mismatch. Expected {expected_source}, got {found_source}")

    if all_passed:
        print("\nπ VERIFICATION SUCCESS: All datasets are correctly integrated and searchable!")
    else:
        print("\nβ VERIFICATION FAILED: Some data could not be retrieved.")
    return all_passed
# Script entry point: run the async verification on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(verify_data())