| """ |
| Test retrieval from vector database |
| Validates that semantic search is working correctly |
| """ |
|
|
| import logging |
| from typing import List, Tuple |
|
|
| from .embeddings import EmbeddingGenerator |
| from .vector_db import LegalVectorDB |
| from .config import LOG_LEVEL, LOG_FORMAT |
|
|
# Module-level logger for this script. The root logger is configured with the
# level and format imported from the package's config module.
logger = logging.getLogger(__name__)
logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)
|
|
|
|
def test_query(
    query: str,
    vector_db: LegalVectorDB,
    embedder: EmbeddingGenerator,
    n_results: int = 3,
    preview_chars: int = 200,
) -> None:
    """
    Run a single query against the vector database and print the matches.

    Args:
        query: Query string
        vector_db: Vector database instance; must provide
            ``query_with_embedding(embedding, n_results=...)``
        embedder: Embedding generator instance; must provide
            ``generate_embedding(text)``
        n_results: Number of results to retrieve
        preview_chars: Maximum number of characters of each matched
            document to print
    """
    separator = "=" * 80
    print(f"\n{separator}")
    print(f"Query: {query}")
    print(separator)

    query_embedding = embedder.generate_embedding(query)
    # Array-like embeddings expose tolist(); any other sequence is copied
    # into a plain list so the database always receives a list.
    if hasattr(query_embedding, "tolist"):
        embedding_list = query_embedding.tolist()
    else:
        embedding_list = list(query_embedding)

    results = vector_db.query_with_embedding(embedding_list, n_results=n_results)

    # Results are read as results[key][0]; presumably there is one inner list
    # per submitted query -- confirm against LegalVectorDB. A missing or empty
    # outer list is treated as "no results" instead of raising.
    documents = (results["documents"] or [[]])[0]
    if not documents:
        print("No results found!")
        return

    matches = zip(documents, results["metadatas"][0], results["distances"][0])
    for rank, (doc, metadata, distance) in enumerate(matches, 1):
        # A chunk stored without metadata may come back as None.
        metadata = metadata or {}
        # Only show an ellipsis when the text was actually cut off.
        ellipsis = "..." if len(doc) > preview_chars else ""

        print(f"\nResult {rank} (Distance: {distance:.4f}):")
        print(f" Source: {metadata.get('source_file', 'N/A')}")
        print(f" Section: {metadata.get('article_section', 'N/A')}")
        print(f" Words: {metadata.get('word_count', 'N/A')}")
        print(f" Text preview: {doc[:preview_chars]}{ellipsis}")
        print("-" * 80)


# The name matches pytest's default ``test_*`` pattern, but the parameters are
# plain arguments rather than fixtures; opt out of collection so a pytest run
# that imports this module does not report a spurious "fixture not found" error.
test_query.__test__ = False
|
|
|
|
def main() -> int:
    """
    Run a fixed set of sample queries against the vector database.

    Loads the embedding model and the vector database, stops early when the
    database holds no chunks, and otherwise prints the top matches for each
    sample query.

    Returns:
        0 when every query ran without raising; 1 when the database is empty
        or any step raised an exception.
    """
    banner = "=" * 80
    print(banner)
    print("Testing Vector Database Retrieval")
    print(banner)

    try:
        print("\nInitializing embedding model and vector database...")
        embedder = EmbeddingGenerator()
        vector_db = LegalVectorDB()

        db_count = vector_db.get_count()
        print(f"✓ Embedding model loaded: {embedder.model_name}")
        print(f"✓ Vector database loaded: {db_count} chunks indexed")

        if db_count == 0:
            print("\n✗ Error: Vector database is empty!")
            print("Please run 'python -m module_a.build_vector_db' first")
            return 1

        # Mix of full natural-language questions and short keyword phrases.
        test_queries = [
            "I am a single mother, how to get citizenship for my child?",
            "Can daughters inherit property like sons?",
            "What documents needed for marriage registration?",
            "citizenship through mother",
            "right to equality",
            "fundamental rights of citizens",
        ]

        print(f"\nRunning {len(test_queries)} test queries...")

        for query in test_queries:
            test_query(query, vector_db, embedder, n_results=3)

        print("\n" + banner)
        print("Retrieval Testing Complete!")
        print(banner)
        print("\n✓ All test queries executed successfully")
        print("✓ Vector database is working correctly")
        print("\nNext step: Integrate with LLM for Step 4 (RAG chain)")

        return 0

    except Exception as e:
        # Top-level boundary of the script: log the traceback, show a short
        # message, and turn the failure into a non-zero exit status.
        logger.exception("Testing failed: %s", e)
        print(f"\n✗ Testing failed: {e}")
        return 1
|
|
|
|
if __name__ == "__main__":
    # exit() is injected by the site module for interactive sessions and is
    # missing under ``python -S`` or in frozen builds; raising SystemExit
    # propagates main()'s status code without relying on it.
    raise SystemExit(main())
|
|