# contextforge-demo / tests / test_dedup.py
# Author: Pablo
# fix: test_mcp_server 12 failures resolved — model fields, registry API, GPU label
# Commit: 466cc3d
"""Tests for LSHTokenMatcher and FAISSContextIndex - v2.0 deduplication components."""
import importlib.util

import numpy as np
import pytest

from apohara_context_forge.dedup.faiss_index import FAISSContextIndex, FAISSMatch
from apohara_context_forge.dedup.lsh_engine import LSHTokenMatcher, TokenBlockMatch
# Skip every test in this module when faiss is not installed.  Use
# importlib.util.find_spec explicitly: the previous
# `__import__('importlib').util` only worked because some earlier import
# (e.g. pytest itself) happened to load the importlib.util submodule —
# `import importlib` alone does not make `importlib.util` available.
# NOTE(review): skipif does not prevent the module-level import of
# FAISSContextIndex from running at collection time; if that module
# imports faiss unconditionally, consider pytest.importorskip instead —
# TODO confirm.
pytestmark = pytest.mark.skipif(
    importlib.util.find_spec("faiss") is None,
    reason="faiss-cpu not installed — run: pip install faiss-cpu",
)
@pytest.fixture
def lsh_matcher():
    """Provide a brand-new LSHTokenMatcher instance for each test."""
    matcher = LSHTokenMatcher()
    return matcher
@pytest.fixture
def faiss_index():
    """Provide a brand-new 384-dimensional FAISSContextIndex for each test."""
    index = FAISSContextIndex(dim=384)
    return index
class TestLSHTokenMatcher:
    """Tests for LSHTokenMatcher - token-level SimHash matching."""

    @pytest.mark.asyncio
    async def test_index_prompt(self, lsh_matcher):
        """Index a prompt, verify blocks are stored."""
        # The Qwen3 BPE collapses common English words to one token each,
        # so a short sentence may yield fewer than block_size (16) tokens.
        # A longer prompt guarantees at least one full block.
        prompt = (
            "This is a test prompt that should produce multiple token blocks "
            "for indexing across various transformer architectures including "
            "GPT, Llama, Qwen, and Mistral families on AMD MI300X with ROCm."
        )
        block_hashes = await lsh_matcher.index_prompt("agent1", prompt)
        assert isinstance(block_hashes, list)
        # The index statistics must reflect the newly registered agent.
        stats = await lsh_matcher.stats()
        assert stats["total_blocks"] >= 1
        assert stats["total_agents"] == 1
        assert "agent1" in lsh_matcher._agent_blocks

    @pytest.mark.asyncio
    async def test_find_reusable_blocks(self, lsh_matcher):
        """Index one prompt, find matches in another with similar tokens."""
        # Two agents share an identical prefix; a third prompt starting
        # with the same prefix should therefore surface reusable blocks.
        await lsh_matcher.index_prompt(
            "agent1",
            "You are a helpful assistant. You provide accurate and detailed responses.",
        )
        await lsh_matcher.index_prompt(
            "agent2",
            "You are a helpful assistant. Tell me about quantum physics.",
        )
        matches = await lsh_matcher.find_reusable_blocks(
            "You are a helpful assistant. What is machine learning?"
        )
        assert isinstance(matches, list)
        # Results must come back ordered by hamming distance, best first.
        if len(matches) > 1:
            assert matches[0].hamming_distance <= matches[1].hamming_distance

    @pytest.mark.asyncio
    async def test_find_reusable_blocks_exclude_agent(self, lsh_matcher):
        """Verify exclude_agent parameter filters correctly."""
        await lsh_matcher.index_prompt(
            "agent1",
            "You are a helpful assistant. This is agent1's unique content here.",
        )
        await lsh_matcher.index_prompt(
            "agent2",
            "You are a helpful assistant. This is agent2's unique content here.",
        )
        # Query with agent1's own text while excluding agent1: none of
        # the returned blocks may originate from agent1.
        matches = await lsh_matcher.find_reusable_blocks(
            "You are a helpful assistant. This is agent1's unique content here.",
            exclude_agent="agent1",
        )
        assert all(m.cached_agent_id != "agent1" for m in matches)

    @pytest.mark.asyncio
    async def test_get_shared_prefix_hash(self, lsh_matcher):
        """Compute stable hash of shared prefix."""
        prompt = "This is a test prompt for hashing."
        first = await lsh_matcher.get_shared_prefix_hash(prompt)
        second = await lsh_matcher.get_shared_prefix_hash(prompt)
        # Hashing is deterministic: identical text -> identical digest.
        assert first == second
        assert isinstance(first, str)
        assert len(first) == 32  # First 32 chars of SHA256

    @pytest.mark.asyncio
    async def test_get_shared_prefix_hash_different_texts(self, lsh_matcher):
        """Different texts should produce different hashes."""
        hello_hash = await lsh_matcher.get_shared_prefix_hash("Hello world")
        goodbye_hash = await lsh_matcher.get_shared_prefix_hash("Goodbye world")
        assert hello_hash != goodbye_hash

    @pytest.mark.asyncio
    async def test_lsh_stats(self, lsh_matcher):
        """Verify index statistics."""
        prompt = "This is a test prompt that should produce multiple token blocks."
        for agent in ("agent1", "agent2"):
            await lsh_matcher.index_prompt(agent, prompt)
        stats = await lsh_matcher.stats()
        # All expected keys are present...
        for key in (
            "total_blocks",
            "total_agents",
            "block_size",
            "hash_bits",
            "hamming_threshold",
        ):
            assert key in stats
        # ...and the values match the matcher's default configuration.
        assert stats["total_agents"] == 2
        assert stats["block_size"] == 16
        assert stats["hash_bits"] == 64

    @pytest.mark.asyncio
    async def test_clear_agent(self, lsh_matcher):
        """Remove all blocks for an agent."""
        await lsh_matcher.index_prompt(
            "agent1", "This is a test prompt for clearing agent blocks."
        )
        before = await lsh_matcher.stats()
        assert before["total_agents"] == 1
        # clear_agent reports how many blocks were dropped (possibly zero
        # when the prompt produced no full block).
        removed = await lsh_matcher.clear_agent("agent1")
        assert removed >= 0
        after = await lsh_matcher.stats()
        assert after["total_agents"] == 0
        assert after["total_blocks"] == 0

    @pytest.mark.asyncio
    async def test_clear_agent_not_found(self, lsh_matcher):
        """Clearing non-existent agent returns 0."""
        assert await lsh_matcher.clear_agent("nonexistent") == 0
class TestFAISSContextIndex:
    """Tests for FAISSContextIndex - approximate nearest neighbor search."""

    @staticmethod
    def _unit_vector(dim: int = 384) -> np.ndarray:
        """Return a random, L2-normalized float32 vector of length *dim*."""
        vec = np.random.randn(dim).astype(np.float32)
        return vec / np.linalg.norm(vec)

    @pytest.mark.asyncio
    async def test_add_and_search(self, faiss_index):
        """Add embeddings, search, verify matches above threshold."""
        emb1 = self._unit_vector()
        emb2 = self._unit_vector()
        idx1 = await faiss_index.add("agent1", emb1.tolist())
        idx2 = await faiss_index.add("agent2", emb2.tolist())
        # Index positions are assigned sequentially starting at 0.
        assert idx1 == 0
        assert idx2 == 1
        # Searching with agent1's own embedding must return agent1 as the
        # best match with near-perfect cosine similarity.
        matches = await faiss_index.search(emb1.tolist(), k=10, threshold=0.85)
        assert isinstance(matches, list)
        assert len(matches) >= 1
        best = matches[0]
        assert isinstance(best, FAISSMatch)
        assert best.agent_id == "agent1"
        assert best.similarity > 0.99

    @pytest.mark.asyncio
    async def test_search_with_threshold(self, faiss_index):
        """Verify threshold filtering works."""
        await faiss_index.add("agent1", self._unit_vector().tolist())
        # An independent random query is almost certainly dissimilar to the
        # stored embedding, so a 0.99 threshold should filter it out; any
        # match that does come back must satisfy the threshold.
        matches = await faiss_index.search(
            self._unit_vector().tolist(), k=5, threshold=0.99
        )
        for match in matches:
            assert match.similarity >= 0.99

    @pytest.mark.asyncio
    async def test_search_returns_sorted_by_similarity(self, faiss_index):
        """Verify results are sorted by descending similarity."""
        for i in range(5):
            await faiss_index.add(f"agent{i}", self._unit_vector().tolist())
        # FIX: pass a plain list like every other add/search call site in
        # this file (previously this test passed a raw ndarray).
        query = self._unit_vector().tolist()
        matches = await faiss_index.search(query, k=5, threshold=0.0)
        # Results must be sorted by similarity, descending.
        if len(matches) > 1:
            for i in range(len(matches) - 1):
                assert matches[i].similarity >= matches[i + 1].similarity

    @pytest.mark.asyncio
    async def test_remove(self, faiss_index):
        """Remove agent from index."""
        await faiss_index.add("agent1", self._unit_vector().tolist())
        assert faiss_index.size == 1
        removed = await faiss_index.remove("agent1")
        assert removed is True
        # Size stays the same (FAISS limitation), but the agent should no
        # longer be resolvable.
        assert faiss_index.size == 1

    @pytest.mark.asyncio
    async def test_remove_not_found(self, faiss_index):
        """Removing non-existent agent returns False."""
        removed = await faiss_index.remove("nonexistent")
        assert removed is False

    @pytest.mark.asyncio
    async def test_size(self, faiss_index):
        """Verify index size tracking."""
        assert faiss_index.size == 0
        emb = self._unit_vector()
        await faiss_index.add("agent1", emb.tolist())
        assert faiss_index.size == 1
        await faiss_index.add("agent2", emb.tolist())
        assert faiss_index.size == 2
        await faiss_index.remove("agent1")
        assert faiss_index.size == 2  # FAISS doesn't actually remove

    @pytest.mark.asyncio
    async def test_multiple_searches(self, faiss_index):
        """Verify multiple searches work correctly."""
        embeddings = [self._unit_vector() for _ in range(3)]
        for i, emb in enumerate(embeddings):
            await faiss_index.add(f"agent{i}", emb.tolist())
        # Each stored embedding should at least match itself (similarity
        # 1.0 >= 0.5 threshold), so every search returns >= 1 result.
        for emb in embeddings:
            matches = await faiss_index.search(emb.tolist(), k=3, threshold=0.5)
            assert len(matches) >= 1
class TestTokenBlockMatch:
    """Tests for TokenBlockMatch dataclass."""

    def test_token_block_match_creation(self):
        """Verify TokenBlockMatch has all required fields."""
        expected = {
            "block_index": 0,
            "cached_block_hash": 12345,
            "hamming_distance": 2,
            "reuse_confidence": 0.97,
            "cached_agent_id": "agent1",
        }
        match = TokenBlockMatch(**expected)
        # Every constructor argument must round-trip to an attribute.
        for field_name, value in expected.items():
            assert getattr(match, field_name) == value
class TestFAISSMatch:
    """Tests for FAISSMatch dataclass."""

    def test_faiss_match_creation(self):
        """Verify FAISSMatch has all required fields."""
        match = FAISSMatch(agent_id="agent1", similarity=0.95, index_position=5)
        # Constructor arguments round-trip to attributes unchanged.
        assert (match.agent_id, match.similarity, match.index_position) == (
            "agent1",
            0.95,
            5,
        )