Spaces:

m-ahmad-official
/

backend

Running

App Files Files Community

m-ahmad-official committed on Feb 18

Commit

e88ac9f

1 Parent(s): 09b3403

update

Browse files

Files changed (1) hide show

retrieve.py +396 -70

retrieve.py CHANGED Viewed

@@ -1,104 +1,430 @@
 """
-Retrieval module for RAG Book Chatbot.
-Handles vector search using Qdrant and Cohere embeddings.
 """
 import logging
-from typing import List, Dict, Any, Optional
 logger = logging.getLogger(__name__)
 def search(
     query_text: str,
-    cohere_client: Any,
-    qdrant_client: Any,
     collection_name: str,
     top_k: int = 5,
 ) -> List[Dict[str, Any]]:
     """
-    Search for relevant chunks in Qdrant using Cohere embeddings.
     Args:
-        query_text: User's question or search query
-        cohere_client: Initialized Cohere client
-        qdrant_client: Initialized Qdrant client
-        collection_name: Name of the Qdrant collection
-        top_k: Number of results to return (default: 5)
     Returns:
-        List of search results with scores and metadata
     """
     try:
-        # Generate embedding for the query
-        logger.info(f"Generating embedding for query: {query_text[:100]}...")
-        embedding_response = cohere_client.embed(
-            texts=[query_text],
-            model="embed-english-v3.0",
-            input_type="search_query",
         )
-        # In Cohere V2, embeddings are returned as a list-like object directly
-        # The response.embeddings is iterable, and we want the first element
-        try:
-            # First try: if embeddings is directly a list of embeddings
-            if isinstance(embedding_response.embeddings, list):
-                query_embedding = list(embedding_response.embeddings[0])
-            else:
-                # Convert to list if it's an iterable object
-                embeddings_list = [e for e in embedding_response.embeddings]
-                query_embedding = list(embeddings_list[0])
-        # Search in Qdrant
-        logger.info(f"Searching Qdrant collection: {collection_name}")
-        search_results = qdrant_client.query_points(
-            collection_name=collection_name,
-            query=query_embedding,
-            limit=top_k,
         )
-        logger.info(f"Found {len(search_results.points)} results")
-        # Format results
-        results = []
-        for hit in search_results.points:
-            results.append(
-                {
-                    "id": hit.id,
-                    "score": hit.score,
-                    "payload": hit.payload,
-                }
-            )
-        return results
-    except Exception as e:
-        logger.error(
-            f"Search failed for query '{query_text[:100]}...': {type(e).__name__}: {e}",
-            exc_info=True,
         )
-        raise
-def validate_results(results: List[Dict[str, Any]]) -> float:
-    """
-    Validate that results have required metadata.
-    Args:
-        results: List of search results
-    Returns:
-        Percentage of results with complete metadata (0-1)
-    """
-    if not results:
-        return 1.0
-    required_fields = {"url", "chunk_index", "text"}
-    valid_count = 0
-    for result in results:
-        payload = result.get("payload", {})
-        if all(field in payload for field in required_fields):
-            valid_count += 1
-    return valid_count / len(results)

 """
+Retrieval pipeline for RAG validation.
+This module provides functions to:
+- Convert search queries to embeddings using Cohere
+- Perform similarity search against Qdrant collection
+- Format and return results with metadata
 """
+import argparse
+import json
+import sys
+import time
 import logging
+from pathlib import Path
+from typing import List, Dict, Any
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent))
+import cohere
+from qdrant_client import QdrantClient
+# Import from existing modules
+import config
+import utils
+from logging_config import setup_logging
+# Initialize logger
 logger = logging.getLogger(__name__)
+# Custom exceptions
+class ConfigurationError(Exception):
+    """Signal that one or more required configuration values are absent."""
+class CollectionNotFoundError(Exception):
+    """Signal that the requested Qdrant collection is not present."""
+class DimensionMismatchError(Exception):
+    """Signal that the collection's vector size differs from the expected one."""
+class APIError(Exception):
+    """Wrap a failure reported by a Cohere or Qdrant API call."""
+def validate_config(cfg: dict) -> None:
+    """Ensure every mandatory setting in cfg holds a truthy value.
+    Args:
+        cfg: Mapping of configuration keys to their values.
+    Raises:
+        ConfigurationError: If any required key is absent or falsy.
+    """
+    absent = []
+    for name in ("cohere_api_key", "qdrant_url", "qdrant_api_key"):
+        if not cfg.get(name):
+            absent.append(name)
+    # Guard clause: nothing to report when every key is populated.
+    if not absent:
+        return
+    raise ConfigurationError(
+        f"Missing required environment variables: {', '.join(absent)}"
+    )
+def init_clients(cfg: dict):
+    """Build the Cohere and Qdrant clients from configuration values.
+    Args:
+        cfg: Configuration mapping; reads "cohere_api_key", "qdrant_url"
+            and "qdrant_api_key".
+    Returns:
+        A (cohere_client, qdrant_client) tuple.
+    """
+    return (
+        cohere.ClientV2(api_key=cfg["cohere_api_key"]),
+        QdrantClient(url=cfg["qdrant_url"], api_key=cfg["qdrant_api_key"]),
+    )
+def check_collection(
+    qdrant_client: QdrantClient,
+    collection_name: str,
+    expected_vector_size: int = 1024,
+) -> Dict[str, Any]:
+    """Verify the collection exists and stores vectors of the expected size.
+    Args:
+        qdrant_client: Client whose get_collection method is called.
+        collection_name: Name of the collection to inspect.
+        expected_vector_size: Required vector dimension (default: 1024).
+    Returns:
+        Dict with "exists", "vector_size" and "points_count" keys.
+    Raises:
+        CollectionNotFoundError: If the lookup error message contains "not found".
+        DimensionMismatchError: If the vector size is unavailable or differs
+            from expected_vector_size.
+    """
+    try:
+        info = qdrant_client.get_collection(collection_name)
+    except Exception as e:
+        # The lookup failure is recognised by its message text; any other
+        # error propagates unchanged.
+        if "not found" in str(e).lower():
+            # Chain the original error so the traceback keeps the root cause.
+            raise CollectionNotFoundError(
+                f"Collection '{collection_name}' does not exist"
+            ) from e
+        raise
+    # NOTE(review): the vectors config is presumably a mapping (named vectors)
+    # or None for some collections; those expose no .size and are reported as
+    # a mismatch instead of crashing with AttributeError - confirm with Qdrant docs.
+    vector_size = getattr(info.config.params.vectors, "size", None)
+    if vector_size != expected_vector_size:
+        raise DimensionMismatchError(
+            f"Expected vector size {expected_vector_size} but got {vector_size}"
+        )
+    return {
+        "exists": True,
+        "vector_size": vector_size,
+        "points_count": info.points_count,
+    }
+def embed_query(text: str, cohere_client: cohere.ClientV2) -> List[float]:
+    """Generate the embedding vector for a search query using Cohere.
+    Args:
+        text: Query text to embed.
+        cohere_client: Client whose embed method is called.
+    Returns:
+        The first entry of response.embeddings.float_.
+    Raises:
+        APIError: If the embed call or the response extraction fails; the
+            original exception is attached as the cause.
+    """
+    try:
+        response = cohere_client.embed(
+            texts=[text],
+            model="embed-english-v3.0",
+            input_type="search_query",
+            # Ask for float vectors explicitly so embeddings.float_ is populated.
+            embedding_types=["float"],
+        )
+        # Extract embedding from response.embeddings.float_
+        embedding = response.embeddings.float_[0]
+        return embedding
+    except Exception as e:
+        logger.error(f"Failed to generate embedding: {e}")
+        # Chain the cause so the underlying SDK error stays in the traceback.
+        raise APIError(f"Cohere embedding failed: {e}") from e
+def validate_metadata_completeness(results: List[Dict[str, Any]]) -> float:
+    """
+    Check metadata completeness in search results.
+    Args:
+        results: List of search results, each a dict with a "payload" entry
+    Returns:
+        Percentage (0-100) of results with complete metadata:
+        - url present and non-empty
+        - text present with length ≥ 10
+        - at least one of title or section non-empty
+        Returns 0.0 when results is empty.
+    """
+    if not results:
+        return 0.0
+    def _non_blank(value: Any) -> bool:
+        # Only real strings with visible characters count as present;
+        # non-string values no longer crash on .strip().
+        return isinstance(value, str) and bool(value.strip())
+    complete = 0
+    total = len(results)
+    for result in results:
+        # A result may carry payload=None instead of a dict; treat it as empty.
+        payload = result.get("payload") or {}
+        text = payload.get("text")
+        # Check completeness criteria
+        url_ok = _non_blank(payload.get("url"))
+        text_ok = isinstance(text, str) and len(text) >= 10
+        title_section_ok = _non_blank(payload.get("title")) or _non_blank(
+            payload.get("section")
+        )
+        if url_ok and text_ok and title_section_ok:
+            complete += 1
+    percentage = (complete / total) * 100
+    logger.debug(f"Metadata completeness: {complete}/{total} = {percentage:.1f}%")
+    return percentage
+def validate_chunk_sequencing(results: List[Dict[str, Any]]) -> bool:
+    """
+    Verify that chunk_index values are properly assigned: integers >= 0 and unique per URL.
+    Note: Since search may return only a subset of chunks for a URL, we cannot
+    verify full sequential continuity (0,1,2,3...). Instead we check:
+    - All chunk_index values are integers >= 0 (booleans are rejected)
+    - No duplicate chunk_index for the same URL in the result set
+    Args:
+        results: List of search results
+    Returns:
+        True if chunk indices are valid, False otherwise
+    """
+    # Group by URL
+    url_chunks: Dict[Any, List[Any]] = {}
+    for result in results:
+        # A result may carry payload=None instead of a dict; treat it as empty.
+        payload = result.get("payload") or {}
+        url_chunks.setdefault(payload.get("url", ""), []).append(
+            payload.get("chunk_index")
+        )
+    # Check each URL's chunks are valid
+    for url, indices in url_chunks.items():
+        # All indices must be integers >= 0
+        for idx in indices:
+            # bool is a subclass of int, so True/False must be excluded explicitly.
+            if isinstance(idx, bool) or not isinstance(idx, int) or idx < 0:
+                logger.debug(
+                    f"Invalid chunk_index for {url}: {idx} (must be non-negative integer)"
+                )
+                return False
+        # Check for duplicates (within this URL's results)
+        if len(set(indices)) != len(indices):
+            logger.debug(f"Duplicate chunk_index for {url}: {indices}")
+            return False
+    logger.debug(f"Chunk indexing valid for {len(url_chunks)} URLs")
+    return True
 def search(
     query_text: str,
+    cohere_client: cohere.ClientV2,
+    qdrant_client: QdrantClient,
     collection_name: str,
     top_k: int = 5,
 ) -> List[Dict[str, Any]]:
     """
+    Convert query to embedding and retrieve top-K relevant chunks.
     Args:
+        query_text: User's search query (non-empty, ≤1000 chars after stripping)
+        cohere_client: Cohere client handed to embed_query
+        qdrant_client: Qdrant client whose query_points method is called
+        collection_name: Name of the Qdrant collection to search
+        top_k: Number of results to return (1-100)
     Returns:
+        List of search results with id (str), score (float), and payload
+    Raises:
+        ValueError: If query_text is blank or too long, or top_k is out of range
+        APIError: If the Qdrant query fails (original error attached as cause)
+        Exception: Embedding errors are re-raised as received from
+            utils.retry_with_backoff
     """
+    # Validate inputs
+    if not query_text or not query_text.strip():
+        raise ValueError("Query text must be non-empty")
+    query_text = query_text.strip()
+    if len(query_text) > 1000:
+        raise ValueError("Query text must be ≤ 1000 characters")
+    if top_k < 1 or top_k > 100:
+        raise ValueError("top_k must be between 1 and 100")
+    logger.info(f"Embedding query: '{query_text[:100]}...' (top_k={top_k})")
+    # perf_counter is monotonic, so logged durations are immune to clock changes.
+    start_time = time.perf_counter()
+    # Generate query embedding with retry
     try:
+        embedding = utils.retry_with_backoff(
+            lambda: embed_query(query_text, cohere_client),
+            max_retries=3,
+            base_delay=1.0,
+            max_delay=10.0,
         )
+        embed_time = time.perf_counter() - start_time
+        logger.debug(
+            f"Generated embedding in {embed_time:.2f}s, dimension: {len(embedding)}"
+        )
+    except Exception as e:
+        logger.error(f"Failed to embed query: {e}")
+        raise
+    # Search Qdrant with retry
+    try:
+        search_start = time.perf_counter()
+        response = utils.retry_with_backoff(
+            lambda: qdrant_client.query_points(
+                collection_name=collection_name,
+                query=embedding,
+                limit=top_k,
+                with_payload=True,
+                with_vectors=False,
+            ),
+            max_retries=3,
+            base_delay=1.0,
+            max_delay=10.0,
         )
+        results = response.points
+        search_time = time.perf_counter() - search_start
+        logger.info(
+            f"Search completed in {search_time:.2f}s, returned {len(results)} results"
+        )
+    except Exception as e:
+        logger.error(f"Search failed: {e}")
+        # Chain the cause so the underlying client error stays in the traceback.
+        raise APIError(f"Qdrant search failed: {e}") from e
+    # Format results
+    formatted = [
+        {
+            "id": str(result.id),
+            "score": float(result.score),
+            "payload": result.payload,
+        }
+        for result in results
+    ]
+    total_time = time.perf_counter() - start_time
+    logger.info(f"Total query time: {total_time:.2f}s")
+    return formatted
+def format_results(
+    results: List[Dict[str, Any]], query: str, latency_ms: int
+) -> Dict[str, Any]:
+    """Assemble the JSON-ready envelope for a finished search.
+    Args:
+        results: Search results to embed under "results".
+        query: Query text echoed back under "query".
+        latency_ms: Latency value stored under metadata "latency_ms".
+    Returns:
+        Dict with "query", "timestamp" (UTC, formatted %Y-%m-%dT%H:%M:%SZ),
+        "results" and "metadata" keys; metadata "collection" is None.
+    """
+    utc_stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+    summary = {
+        "total_results": len(results),
+        "collection": None,  # Will be filled by main
+        "latency_ms": latency_ms,
+    }
+    return {
+        "query": query,
+        "timestamp": utc_stamp,
+        "results": results,
+        "metadata": summary,
+    }
+def main() -> int:
+    """CLI entrypoint for retrieval.
+    Parses arguments, loads config, checks the collection, runs the search,
+    optionally validates result metadata, and emits the JSON report to
+    --output or stdout.
+    Returns:
+        0 on success, 2 on ValueError (input validation), 1 on any other
+        handled error. A missing --query exits through parser.error.
+    """
+    parser = argparse.ArgumentParser(
+        description="Retrieve relevant chunks from Qdrant using Cohere embeddings"
+    )
+    parser.add_argument("--query", type=str, help="Search query text")
+    parser.add_argument(
+        "--top-k", type=int, default=5, help="Number of results to return (default: 5)"
+    )
+    parser.add_argument("--output", type=str, help="Output file path (default: stdout)")
+    parser.add_argument(
+        "--config",
+        type=str,
+        default=".env",
+        help="Path to .env config file (default: .env)",
+    )
+    parser.add_argument(
+        "--validate-metadata",
+        action="store_true",
+        help="Run metadata validation on search results (requires --query)",
+    )
+    args = parser.parse_args()
+    # Validate query argument up front so a missing query fails before any
+    # log file is created or any API client is contacted.
+    if not args.query:
+        parser.error("--query is required")
+    # Setup logging
+    log_file = "retrieve.log"
+    setup_logging(log_file=log_file, console_level="INFO")
+    logger.info("=== Retrieval Pipeline Started ===")
     try:
+        # Load config
+        # NOTE(review): args.config is only logged here; config.get_config()
+        # is called without it - confirm whether the path should be passed.
+        logger.info(f"Loading config from {args.config}")
+        cfg = config.get_config()
+        validate_config(cfg)
+        # Initialize clients
+        logger.info("Initializing Cohere and Qdrant clients")
+        cohere_client, qdrant_client = init_clients(cfg)
+        # Check collection
+        # Report a missing collection name as a configuration problem rather
+        # than letting a KeyError surface as an "Unexpected error".
+        collection_name = cfg.get("qdrant_collection")
+        if not collection_name:
+            raise ConfigurationError(
+                "Missing required environment variables: qdrant_collection"
+            )
+        logger.info(f"Checking collection '{collection_name}'")
+        coll_info = check_collection(qdrant_client, collection_name)
+        logger.info(
+            f"Collection OK: vector_size={coll_info['vector_size']}, points={coll_info['points_count']}"
         )
+        # Perform search
+        results = search(
+            query_text=args.query,
+            cohere_client=cohere_client,
+            qdrant_client=qdrant_client,
+            collection_name=collection_name,
+            top_k=args.top_k,
+        )
+        # Perform metadata validation if requested
+        metadata_validation = None
+        if args.validate_metadata:
+            completeness = validate_metadata_completeness(results)
+            sequencing = validate_chunk_sequencing(results)
+            metadata_validation = {
+                "completeness_pct": round(completeness, 2),
+                "sequencing_valid": sequencing,
+                "pass": completeness >= 98.0 and sequencing,
+            }
+            logger.info(f"Metadata completeness: {completeness:.1f}%")
+            logger.info(f"Chunk sequencing: {'VALID' if sequencing else 'INVALID'}")
+            logger.info(
+                f"Validation result: {'PASS' if metadata_validation['pass'] else 'FAIL'}"
+            )
+        # Format output
+        output = {
+            "query": args.query,
+            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+            "results": results,
+            "metadata": {
+                "total_results": len(results),
+                "collection": collection_name,
+                "vector_size": coll_info["vector_size"],
+                "points_count": coll_info["points_count"],
+            },
+        }
+        if metadata_validation:
+            output["metadata_validation"] = metadata_validation
+        # Output JSON
+        json_output = json.dumps(output, indent=2)
+        if args.output:
+            # Explicit encoding keeps the file independent of the platform default.
+            with open(args.output, "w", encoding="utf-8") as f:
+                f.write(json_output)
+            logger.info(f"Results written to {args.output}")
+        else:
+            print(json_output)
+        logger.info("=== Retrieval Pipeline Completed Successfully ===")
+        return 0
+    except ValueError as ve:
+        logger.error(f"Validation error: {ve}")
+        print(f"ERROR: {ve}", file=sys.stderr)
+        return 2
+    except ConfigurationError as ce:
+        logger.error(f"Configuration error: {ce}")
+        print(f"ERROR: {ce}", file=sys.stderr)
+        return 1
+    except CollectionNotFoundError as cnfe:
+        logger.error(f"Collection error: {cnfe}")
+        print(f"ERROR: {cnfe}", file=sys.stderr)
+        return 1
+    except DimensionMismatchError as dme:
+        logger.error(f"Dimension error: {dme}")
+        print(f"ERROR: {dme}", file=sys.stderr)
+        return 1
+    except APIError as api_err:
+        logger.error(f"API error: {api_err}")
+        print(f"ERROR: {api_err}", file=sys.stderr)
+        return 1
+    except Exception as e:
+        # Top-level boundary: log the traceback and convert to an exit code.
+        logger.exception(f"Unexpected error: {e}")
+        print(f"ERROR: Unexpected error: {e}", file=sys.stderr)
+        return 1
+# Run the CLI only when executed as a script; main()'s return value becomes
+# the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())