"""Tests for LSHTokenMatcher and FAISSContextIndex - v2.0 deduplication components."""

import importlib.util

import numpy as np
import pytest

from apohara_context_forge.dedup.faiss_index import FAISSContextIndex, FAISSMatch
from apohara_context_forge.dedup.lsh_engine import LSHTokenMatcher, TokenBlockMatch

# Skip the whole module when faiss-cpu is not installed.
#
# Fixed: the previous guard used `__import__('importlib').util.find_spec(...)`,
# but `import importlib` does not reliably populate the `importlib.util`
# submodule attribute — it only worked when something else had already imported
# it. An explicit `import importlib.util` (above) is the documented idiom.
#
# NOTE(review): if `faiss_index.py` imports faiss at module level, the project
# imports above will fail during collection *before* this mark applies —
# `pytest.importorskip("faiss")` ahead of those imports would be safer. Confirm
# against the dedup package before changing.
pytestmark = pytest.mark.skipif(
    importlib.util.find_spec("faiss") is None,
    reason="faiss-cpu not installed — run: pip install faiss-cpu",
)


@pytest.fixture
def lsh_matcher():
    """Create a fresh LSHTokenMatcher for each test."""
    return LSHTokenMatcher()


@pytest.fixture
def faiss_index():
    """Create a fresh FAISSContextIndex for each test."""
    return FAISSContextIndex(dim=384)


class TestLSHTokenMatcher:
    """Tests for LSHTokenMatcher - token-level SimHash matching."""

    @pytest.mark.asyncio
    async def test_index_prompt(self, lsh_matcher):
        """Index a prompt, verify blocks are stored."""
        # Need >= block_size (16) tokens after tokenization. The Qwen3 BPE
        # collapses common English words to one token each, so a short
        # sentence may yield <16 tokens. Use a longer prompt to guarantee
        # at least one full block.
        text = (
            "This is a test prompt that should produce multiple token blocks "
            "for indexing across various transformer architectures including "
            "GPT, Llama, Qwen, and Mistral families on AMD MI300X with ROCm."
        )
        hashes = await lsh_matcher.index_prompt("agent1", text)

        # Verify blocks were indexed
        assert isinstance(hashes, list)

        # Check stats reflect the indexing
        stats = await lsh_matcher.stats()
        assert stats["total_blocks"] >= 1
        assert stats["total_agents"] == 1
        assert "agent1" in lsh_matcher._agent_blocks

    @pytest.mark.asyncio
    async def test_find_reusable_blocks(self, lsh_matcher):
        """Index one prompt, find matches in another with similar tokens."""
        # Index a prompt for agent1
        text1 = "You are a helpful assistant. You provide accurate and detailed responses."
        await lsh_matcher.index_prompt("agent1", text1)

        # Index another prompt for agent2 with identical beginning
        text2 = "You are a helpful assistant. Tell me about quantum physics."
        await lsh_matcher.index_prompt("agent2", text2)

        # Find reusable blocks in a new prompt with same prefix
        text3 = "You are a helpful assistant. What is machine learning?"
        matches = await lsh_matcher.find_reusable_blocks(text3)

        # Should find some matches since the prefix is the same
        assert isinstance(matches, list)
        # Matches should be sorted by hamming distance (best first)
        if len(matches) > 1:
            assert matches[0].hamming_distance <= matches[1].hamming_distance

    @pytest.mark.asyncio
    async def test_find_reusable_blocks_exclude_agent(self, lsh_matcher):
        """Verify exclude_agent parameter filters correctly."""
        text1 = "You are a helpful assistant. This is agent1's unique content here."
        await lsh_matcher.index_prompt("agent1", text1)

        text2 = "You are a helpful assistant. This is agent2's unique content here."
        await lsh_matcher.index_prompt("agent2", text2)

        # Search excluding agent1
        text3 = "You are a helpful assistant. This is agent1's unique content here."
        matches = await lsh_matcher.find_reusable_blocks(text3, exclude_agent="agent1")

        # Should not find any matches from agent1
        for match in matches:
            assert match.cached_agent_id != "agent1"

    @pytest.mark.asyncio
    async def test_get_shared_prefix_hash(self, lsh_matcher):
        """Compute stable hash of shared prefix."""
        text = "This is a test prompt for hashing."
        hash1 = await lsh_matcher.get_shared_prefix_hash(text)
        hash2 = await lsh_matcher.get_shared_prefix_hash(text)

        # Same text should produce same hash
        assert hash1 == hash2
        assert isinstance(hash1, str)
        assert len(hash1) == 32  # First 32 chars of SHA256

    @pytest.mark.asyncio
    async def test_get_shared_prefix_hash_different_texts(self, lsh_matcher):
        """Different texts should produce different hashes."""
        text1 = "Hello world"
        text2 = "Goodbye world"

        hash1 = await lsh_matcher.get_shared_prefix_hash(text1)
        hash2 = await lsh_matcher.get_shared_prefix_hash(text2)

        assert hash1 != hash2

    @pytest.mark.asyncio
    async def test_lsh_stats(self, lsh_matcher):
        """Verify index statistics."""
        text = "This is a test prompt that should produce multiple token blocks."
        await lsh_matcher.index_prompt("agent1", text)
        await lsh_matcher.index_prompt("agent2", text)

        stats = await lsh_matcher.stats()
        assert "total_blocks" in stats
        assert "total_agents" in stats
        assert "block_size" in stats
        assert "hash_bits" in stats
        assert "hamming_threshold" in stats
        assert stats["total_agents"] == 2
        assert stats["block_size"] == 16
        assert stats["hash_bits"] == 64

    @pytest.mark.asyncio
    async def test_clear_agent(self, lsh_matcher):
        """Remove all blocks for an agent."""
        text = "This is a test prompt for clearing agent blocks."
        await lsh_matcher.index_prompt("agent1", text)

        stats_before = await lsh_matcher.stats()
        assert stats_before["total_agents"] == 1

        removed_count = await lsh_matcher.clear_agent("agent1")
        assert removed_count >= 0

        stats_after = await lsh_matcher.stats()
        assert stats_after["total_agents"] == 0
        assert stats_after["total_blocks"] == 0

    @pytest.mark.asyncio
    async def test_clear_agent_not_found(self, lsh_matcher):
        """Clearing non-existent agent returns 0."""
        removed = await lsh_matcher.clear_agent("nonexistent")
        assert removed == 0


class TestFAISSContextIndex:
    """Tests for FAISSContextIndex - approximate nearest neighbor search."""

    @pytest.mark.asyncio
    async def test_add_and_search(self, faiss_index):
        """Add embeddings, search, verify matches above threshold."""
        # Add two agents with embeddings
        emb1 = np.random.randn(384).astype(np.float32)
        emb1 = emb1 / np.linalg.norm(emb1)  # Normalize
        emb2 = np.random.randn(384).astype(np.float32)
        emb2 = emb2 / np.linalg.norm(emb2)

        idx1 = await faiss_index.add("agent1", emb1.tolist())
        idx2 = await faiss_index.add("agent2", emb2.tolist())
        assert idx1 == 0
        assert idx2 == 1

        # Search with nearly identical query
        query = emb1.tolist()  # Same as agent1's embedding
        matches = await faiss_index.search(query, k=10, threshold=0.85)

        assert isinstance(matches, list)
        assert len(matches) >= 1
        # Best match should be agent1 (highest similarity to itself)
        best = matches[0]
        assert isinstance(best, FAISSMatch)
        assert best.agent_id == "agent1"
        assert best.similarity > 0.99

    @pytest.mark.asyncio
    async def test_search_with_threshold(self, faiss_index):
        """Verify threshold filtering works."""
        # Add an agent
        emb = np.random.randn(384).astype(np.float32)
        emb = emb / np.linalg.norm(emb)
        await faiss_index.add("agent1", emb.tolist())

        # Search with very different query
        random_query = np.random.randn(384).astype(np.float32)
        random_query = random_query / np.linalg.norm(random_query)

        # High threshold should filter out dissimilar results
        matches = await faiss_index.search(random_query.tolist(), k=5, threshold=0.99)

        # Should either be empty or only contain very high similarity matches
        for match in matches:
            assert match.similarity >= 0.99

    @pytest.mark.asyncio
    async def test_search_returns_sorted_by_similarity(self, faiss_index):
        """Verify results are sorted by descending similarity."""
        # Add multiple agents with different embeddings
        for i in range(5):
            emb = np.random.randn(384).astype(np.float32)
            emb = emb / np.linalg.norm(emb)
            await faiss_index.add(f"agent{i}", emb.tolist())

        # Search
        query = np.random.randn(384).astype(np.float32)
        query = query / np.linalg.norm(query)
        matches = await faiss_index.search(query, k=5, threshold=0.0)

        # Should be sorted by similarity descending
        if len(matches) > 1:
            for i in range(len(matches) - 1):
                assert matches[i].similarity >= matches[i + 1].similarity

    @pytest.mark.asyncio
    async def test_remove(self, faiss_index):
        """Remove agent from index."""
        emb = np.random.randn(384).astype(np.float32)
        emb = emb / np.linalg.norm(emb)
        await faiss_index.add("agent1", emb.tolist())

        assert faiss_index.size == 1

        removed = await faiss_index.remove("agent1")
        assert removed is True

        # Size stays the same (FAISS limitation), but agent should not be found
        assert faiss_index.size == 1

    @pytest.mark.asyncio
    async def test_remove_not_found(self, faiss_index):
        """Removing non-existent agent returns False."""
        removed = await faiss_index.remove("nonexistent")
        assert removed is False

    @pytest.mark.asyncio
    async def test_size(self, faiss_index):
        """Verify index size tracking."""
        assert faiss_index.size == 0

        emb = np.random.randn(384).astype(np.float32)
        emb = emb / np.linalg.norm(emb)
        await faiss_index.add("agent1", emb.tolist())
        assert faiss_index.size == 1

        await faiss_index.add("agent2", emb.tolist())
        assert faiss_index.size == 2

        await faiss_index.remove("agent1")
        assert faiss_index.size == 2  # FAISS doesn't actually remove

    @pytest.mark.asyncio
    async def test_multiple_searches(self, faiss_index):
        """Verify multiple searches work correctly."""
        # Add multiple agents
        embeddings = []
        for i in range(3):
            emb = np.random.randn(384).astype(np.float32)
            emb = emb / np.linalg.norm(emb)
            embeddings.append(emb)
            await faiss_index.add(f"agent{i}", emb.tolist())

        # Multiple searches should all work
        for emb in embeddings:
            matches = await faiss_index.search(emb.tolist(), k=3, threshold=0.5)
            assert len(matches) >= 1


class TestTokenBlockMatch:
    """Tests for TokenBlockMatch dataclass."""

    def test_token_block_match_creation(self):
        """Verify TokenBlockMatch has all required fields."""
        match = TokenBlockMatch(
            block_index=0,
            cached_block_hash=12345,
            hamming_distance=2,
            reuse_confidence=0.97,
            cached_agent_id="agent1",
        )
        assert match.block_index == 0
        assert match.cached_block_hash == 12345
        assert match.hamming_distance == 2
        assert match.reuse_confidence == 0.97
        assert match.cached_agent_id == "agent1"


class TestFAISSMatch:
    """Tests for FAISSMatch dataclass."""

    def test_faiss_match_creation(self):
        """Verify FAISSMatch has all required fields."""
        match = FAISSMatch(
            agent_id="agent1",
            similarity=0.95,
            index_position=5,
        )
        assert match.agent_id == "agent1"
        assert match.similarity == 0.95
        assert match.index_position == 5