| """ |
| Integration Tests for RAG Pipeline |
| |
| Tests the full RAG workflow: |
| - Vector store operations |
| - Embedding generation |
| - Document retrieval |
| - Answer generation |
| """ |
|
|
import importlib.util
import json
from pathlib import Path
from unittest.mock import MagicMock, Mock, patch

import pytest
|
|
|
|
class TestVectorStore:
    """Test vector store functionality."""

    def test_vector_store_config(self):
        """Test VectorStoreConfig creation."""
        from src.rag.store import VectorStoreConfig

        config = VectorStoreConfig(
            collection_name="test_collection",
            default_top_k=10,
            similarity_threshold=0.8,
        )

        assert config.collection_name == "test_collection"
        assert config.default_top_k == 10
        # Also pin the threshold we passed in, not just two of three fields.
        assert config.similarity_threshold == 0.8

    def test_vector_search_result(self):
        """Test VectorSearchResult model."""
        from src.rag.store import VectorSearchResult

        result = VectorSearchResult(
            chunk_id="chunk_1",
            document_id="doc_1",
            text="Sample text",
            metadata={"page": 0},
            similarity=0.85,
            page=0,
            chunk_type="text",
        )

        assert result.similarity == 0.85
        assert result.chunk_id == "chunk_1"

    # BUG FIX: the original decorator was
    #     @pytest.mark.skipif(
    #         not pytest.importorskip("chromadb", ...), reason=...)
    # pytest.importorskip() raises Skipped *immediately* when the module is
    # missing — and because the decorator argument is evaluated at module
    # import time, that skipped the WHOLE test module, not just this test.
    # When chromadb IS installed, `not <module>` is always False, so the
    # condition never meant anything. find_spec() probes availability
    # without importing or raising.
    @pytest.mark.skipif(
        importlib.util.find_spec("chromadb") is None,
        reason="ChromaDB not installed",
    )
    def test_chromadb_store_creation(self, tmp_path):
        """Test ChromaDB store creation."""
        from src.rag.store import ChromaVectorStore, VectorStoreConfig

        config = VectorStoreConfig(
            persist_directory=str(tmp_path / "vectorstore"),
            collection_name="test_collection",
        )

        # A freshly created store must start empty.
        store = ChromaVectorStore(config)
        assert store.count() == 0
|
|
|
class TestEmbeddings:
    """Test embedding functionality."""

    def test_embedding_config(self):
        """Test EmbeddingConfig creation."""
        from src.rag.embeddings import EmbeddingConfig

        cfg = EmbeddingConfig(
            adapter_type="ollama",
            ollama_model="nomic-embed-text",
            batch_size=16,
        )
        assert cfg.adapter_type == "ollama"
        assert cfg.batch_size == 16

    def test_embedding_cache_creation(self, tmp_path):
        """Test EmbeddingCache creation."""
        from src.rag.embeddings import EmbeddingCache

        # The cache directory must exist as soon as the cache is built.
        assert EmbeddingCache(str(tmp_path), "test_model").cache_dir.exists()

    def test_embedding_cache_operations(self, tmp_path):
        """Test EmbeddingCache get/put operations."""
        from src.rag.embeddings import EmbeddingCache

        cache = EmbeddingCache(str(tmp_path), "test_model")

        text = "Hello world"
        vector = [0.1, 0.2, 0.3, 0.4]

        # Round-trip: what we put is exactly what we get back.
        cache.put(text, vector)
        assert cache.get(text) == vector

    def test_ollama_embedding_dimensions(self):
        """Test OllamaEmbedding model dimensions mapping."""
        from src.rag.embeddings import OllamaEmbedding

        dims = OllamaEmbedding.MODEL_DIMENSIONS
        assert dims["nomic-embed-text"] == 768
        assert dims["mxbai-embed-large"] == 1024
|
|
|
|
class TestRetriever:
    """Test retriever functionality."""

    def test_retriever_config(self):
        """Test RetrieverConfig creation."""
        from src.rag.retriever import RetrieverConfig

        cfg = RetrieverConfig(
            default_top_k=10,
            similarity_threshold=0.75,
            include_evidence=True,
        )
        assert cfg.default_top_k == 10
        assert cfg.include_evidence is True

    def test_retrieved_chunk(self):
        """Test RetrievedChunk model."""
        from src.rag.retriever import RetrievedChunk

        # Build a minimal chunk and confirm the field round-trips.
        fields = {
            "chunk_id": "chunk_1",
            "document_id": "doc_1",
            "text": "Sample retrieved text",
            "similarity": 0.9,
            "page": 0,
            "chunk_type": "text",
        }
        assert RetrievedChunk(**fields).similarity == 0.9
|
|
|
|
class TestGenerator:
    """Test generator functionality."""

    def test_generator_config(self):
        """Test GeneratorConfig creation."""
        from src.rag.generator import GeneratorConfig

        cfg = GeneratorConfig(
            llm_provider="ollama",
            ollama_model="llama3.2:3b",
            temperature=0.1,
            require_citations=True,
        )
        assert cfg.llm_provider == "ollama"
        assert cfg.require_citations is True

    def test_citation_model(self):
        """Test Citation model."""
        from src.rag.generator import Citation

        cite = Citation(
            index=1,
            chunk_id="chunk_1",
            page=0,
            text_snippet="Sample snippet",
            confidence=0.85,
        )
        assert cite.index == 1
        assert cite.confidence == 0.85

    def test_generated_answer_model(self):
        """Test GeneratedAnswer model."""
        from src.rag.generator import Citation, GeneratedAnswer

        evidence = Citation(
            index=1,
            chunk_id="chunk_1",
            page=0,
            text_snippet="Evidence text",
            confidence=0.9,
        )
        answer = GeneratedAnswer(
            answer="This is the generated answer.",
            citations=[evidence],
            confidence=0.85,
            abstained=False,
            num_chunks_used=3,
            query="What is the answer?",
        )

        assert answer.answer == "This is the generated answer."
        assert len(answer.citations) == 1
        assert answer.abstained is False

    def test_abstention(self):
        """Test abstention behavior."""
        from src.rag.generator import GeneratedAnswer

        # An abstained answer carries no citations and records a reason.
        abstained = GeneratedAnswer(
            answer="I cannot provide a confident answer.",
            citations=[],
            confidence=0.3,
            abstained=True,
            abstain_reason="Low confidence",
            num_chunks_used=2,
            query="Complex question",
        )

        assert abstained.abstained is True
        assert abstained.abstain_reason == "Low confidence"
|
|
|
|
class TestIndexer:
    """Test indexer functionality."""

    def test_indexer_config(self):
        """Test IndexerConfig creation."""
        from src.rag.indexer import IndexerConfig

        cfg = IndexerConfig(
            batch_size=64,
            include_bbox=True,
            skip_empty_chunks=True,
        )
        assert cfg.batch_size == 64

    def test_indexing_result(self):
        """Test IndexingResult model."""
        from src.rag.indexer import IndexingResult

        outcome = IndexingResult(
            document_id="doc_1",
            source_path="/path/to/doc.pdf",
            num_chunks_indexed=10,
            num_chunks_skipped=2,
            success=True,
        )

        assert outcome.success is True
        assert outcome.num_chunks_indexed == 10
|
|
|
|
class TestRAGIntegration:
    """Integration tests for full RAG pipeline."""

    @pytest.fixture
    def mock_chunks(self):
        """Create mock document chunks with decreasing similarity."""
        from src.rag.retriever import RetrievedChunk

        chunks = []
        for idx in range(3):
            chunks.append(
                RetrievedChunk(
                    chunk_id=f"chunk_{idx}",
                    document_id="doc_1",
                    text=f"This is sample text from chunk {idx}.",
                    similarity=0.9 - (idx * 0.1),
                    page=idx,
                    chunk_type="text",
                )
            )
        return chunks

    def test_context_building(self, mock_chunks):
        """Test building context from chunks."""
        from src.rag.retriever import DocumentRetriever

        context = DocumentRetriever().build_context(
            mock_chunks, include_metadata=True
        )

        # Chunk text must survive into the context, and metadata mode
        # must include page labels.
        assert "chunk 0" in context.lower()
        assert "Page 1" in context

    def test_citation_extraction(self):
        """Test citation extraction from text."""
        from src.rag.generator import GroundedGenerator
        from src.rag.retriever import RetrievedChunk

        generator = GroundedGenerator()

        first = RetrievedChunk(
            chunk_id="chunk_1",
            document_id="doc_1",
            text="First chunk content",
            similarity=0.9,
            page=0,
        )
        second = RetrievedChunk(
            chunk_id="chunk_2",
            document_id="doc_1",
            text="Second chunk content",
            similarity=0.85,
            page=1,
        )

        # Exercises the private extraction helper directly: bracketed
        # markers [1] and [2] should map onto the two chunks in order.
        citations = generator._extract_citations(
            "The answer is based on [1] and [2].", [first, second]
        )

        assert len(citations) == 2
        assert citations[0].index == 1
        assert citations[1].index == 2
|