Pablo
fix: test_mcp_server 12 failures resolved — model fields, registry API, GPU label
466cc3d
"""ContextRegistry v3.0 - Wired to LSH + FAISS + VRAMAwareCache.
Replaces the old Python-loop dedup and static TTLCache with:
- LSHTokenMatcher: SimHash on actual Qwen3 token IDs, PagedAttention block alignment
- FAISSContextIndex: O(log n) ANN search vs O(n) linear scan
- VRAMAwareCache: 5-mode LRU/LFU hybrid with VRAM-pressure-responsive eviction
Dependency injection - no hardcoded imports of stale modules.
"""
import asyncio
import hashlib
import logging
from dataclasses import dataclass, field
from typing import Any, Optional
from apohara_context_forge.dedup.faiss_index import FAISSContextIndex, FAISSMatch
from apohara_context_forge.dedup.lsh_engine import LSHTokenMatcher, TokenBlockMatch
from apohara_context_forge.embeddings.embedding_engine import EmbeddingEngine
from apohara_context_forge.kv_offset.anchor_pool import AnchorPool
from apohara_context_forge.metrics.prometheus_metrics import (
cache_hits,
cache_misses,
cache_registry_size,
cache_evictions_total,
)
from apohara_context_forge.models import ContextEntry, ContextMatch
from apohara_context_forge.registry.vram_aware_cache import VRAMAwareCache
from apohara_context_forge.token_counter import TokenCounter
logger = logging.getLogger(__name__)
# vLLM PagedAttention block size
VLLM_BLOCK_SIZE = 16
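# LSH hashes are computed per 16-token block so a detected match maps 1:1 onto
# a vLLM KV-cache page; e.g. a 1,000-token prompt spans ceil(1000 / 16) = 63
# reusable blocks (assumption: the deployment keeps vLLM's default block size).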
@dataclass
class SharedContextResult:
"""Result of get_shared_context() - contains reusable blocks with metadata."""
agent_id: str
shared_blocks: list[TokenBlockMatch]
faiss_matches: list[FAISSMatch]
total_tokens_saved: int
reuse_confidence: float # 0.0-1.0 weighted by hamming distance
offset_hints: dict[str, list[float]] = field(default_factory=dict) # agent_id -> offset vector
@dataclass
class RegisteredAgent:
"""Internal record of a registered agent."""
agent_id: str
system_prompt: str
role_prompt: str
token_count: int
block_hashes: list[int] # LSH block hashes for this agent
class ContextRegistry:
"""
Production-grade context registry with LSH + FAISS + VRAM-aware cache.
Usage:
registry = ContextRegistry(
lsh_matcher=LSHTokenMatcher(),
vram_cache=VRAMAwareCache(max_token_budget=50_000_000),
            faiss_index=FAISSContextIndex(dim=512),  # must match EmbeddingEngine output dim
)
await registry.start()
# Register agents with shared system prompt
await registry.register_agent("agent1", system_prompt, "retriever role")
await registry.register_agent("agent2", system_prompt, "summarizer role")
# Query for reusable context across agents
result = await registry.get_shared_context(["agent1", "agent2"])
await registry.stop()
Key design decisions:
- Dependency injection for all core components (testable, swappable)
- LSH operates on token IDs, not text - aligns to vLLM PagedAttention blocks
- FAISS provides ANN candidates; LSH filters for actual token-level reuse
- VRAMAwareCache manages eviction based on real GPU memory pressure
"""
def __init__(
self,
lsh_matcher: Optional[LSHTokenMatcher] = None,
vram_cache: Optional[VRAMAwareCache] = None,
faiss_index: Optional[FAISSContextIndex] = None,
token_counter: Optional[TokenCounter] = None,
anchor_pool: Optional[AnchorPool] = None,
vram_budget_tokens: int = 50_000_000,
block_size: int = VLLM_BLOCK_SIZE,
hamming_threshold: int = 8,
faiss_nlist: int = 100,
dedup: Any = None,
):
# Dependency injection with lazy defaults
self._lsh = lsh_matcher or LSHTokenMatcher(
block_size=block_size,
hamming_threshold=hamming_threshold,
)
self._vram_cache = vram_cache or VRAMAwareCache(max_token_budget=vram_budget_tokens)
# FAISS index dim must match the EmbeddingEngine output dimension
# (we instantiate EmbeddingEngine with dim=512 in register_agent).
# A 384-dim default crashes faiss.IndexFlatIP.add() at runtime —
# see the cascade of test_integration failures pre-fix.
self._faiss = faiss_index or FAISSContextIndex(dim=512)
self._token_counter = token_counter or TokenCounter.get()
self._anchor_pool = anchor_pool or AnchorPool()
self._embedding_engine: Optional[EmbeddingEngine] = None
self._block_size = block_size
        # `dedup` is a hermetic-test escape hatch: injecting a lightweight
        # engine here lets tests bypass the LSH+FAISS+ANN heavy path. The
        # engine only needs `embed`, `similarity`, `find_shared_prefix`, and
        # `count_prefix_tokens`; see FakeDedupEngine in
        # tests/test_mcp_server.py for the contract.
self._dedup = dedup
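        # A rough duck-typed sketch of that contract (signatures here are an
        # assumption for illustration; the authoritative fixture is
        # FakeDedupEngine in tests/test_mcp_server.py):
        #
        #     class FakeDedupEngine:
        #         def embed(self, text: str) -> list[float]: ...
        #         def similarity(self, a: list[float], b: list[float]) -> float: ...
        #         def find_shared_prefix(self, a: str, b: str) -> str: ...
        #         def count_prefix_tokens(self, prefix: str) -> int: ...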
# Lightweight in-memory store for `register(agent_id, context)`. This
# is independent from `register_agent(...)` (which exercises the full
# KV-aware pipeline) — it backs the simple MCP /tools/register_context
# endpoint and the test_full_flow scenario.
self._simple_entries: dict[str, ContextEntry] = {}
# Internal state
self._agents: dict[str, RegisteredAgent] = {}
self._system_prompt_hash: Optional[str] = None
self._lock = asyncio.Lock()
self._started = False
async def start(self) -> None:
"""Start background VRAM monitor and cache."""
if self._started:
return
await self._vram_cache.start()
self._started = True
logger.info("ContextRegistry started with LSH+FAISS+VRAM cache")
async def stop(self) -> None:
"""Stop background monitoring and flush cache."""
if not self._started:
return
await self._vram_cache.stop()
self._started = False
logger.info("ContextRegistry stopped")
async def register_agent(
self,
agent_id: str,
system_prompt: str,
role_prompt: str,
) -> ContextEntry:
"""
Register an agent with tokenization and LSH indexing.
Args:
agent_id: Unique agent identifier
system_prompt: Shared system prompt (must be byte-identical across agents)
role_prompt: Agent-specific role/instruction text
Returns:
ContextEntry with accurate token count
"""
        loop = asyncio.get_running_loop()
# Tokenize full context
full_context = f"{system_prompt}\n\n{role_prompt}"
token_ids = await loop.run_in_executor(
None, self._token_counter.encode, full_context
)
token_count = len(token_ids)
# Index system prompt for LSH (critical for prefix caching)
system_block_hashes = await self._lsh.index_prompt(
f"{agent_id}:system",
system_prompt
)
# Index full prompt for cross-agent dedup
full_block_hashes = await self._lsh.index_prompt(
agent_id,
full_context
)
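        # Two LSH entries on purpose: the "<agent_id>:system" entry lets agents
        # that share only the system prompt still hit on those prefix blocks,
        # while the full-context entry also catches overlap in the role prompts.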
# Generate real embedding via EmbeddingEngine (replaces pseudo-embedding)
if self._embedding_engine is None:
self._embedding_engine = await EmbeddingEngine.get_instance(dim=512, use_onnx=True)
embedding = await self._embedding_engine.encode(full_context)
# Update AnchorPool — use embedding as kv_offset_approx until
# LMCacheConnectorV1 bridge (TASK-007) provides real KV offset vectors
await self._anchor_pool.update_pool(
token_ids=token_ids,
agent_id=agent_id,
real_kv_offset=embedding.copy(),
neighbor_prefix_offset=None, # populated by TASK-007
)
# Store in VRAM-aware cache
cache_key = f"context:{agent_id}"
cache_value = {
"system_prompt": system_prompt,
"role_prompt": role_prompt,
"full_context": full_context,
"token_ids": token_ids,
}
stored = await self._vram_cache.set(
cache_key,
cache_value,
token_count=token_count,
)
if not stored:
logger.warning(f"VRAM cache blocked registration for {agent_id}")
# Add to FAISS index for ANN search
# Use real embedding from EmbeddingEngine (replaces pseudo-embedding)
await self._faiss.add(agent_id, embedding.tolist())
# Track registered agent
async with self._lock:
# Validate system prompt consistency (byte-identical for vLLM prefix caching)
if self._system_prompt_hash is None:
self._system_prompt_hash = self._sha256_prefix(system_prompt)
else:
incoming_hash = self._sha256_prefix(system_prompt)
if incoming_hash != self._system_prompt_hash:
logger.warning(
f"Agent {agent_id} has DIFFERENT system prompt hash. "
f"vLLM prefix caching will NOT work. "
f"Expected {self._system_prompt_hash[:16]}, got {incoming_hash[:16]}"
)
self._agents[agent_id] = RegisteredAgent(
agent_id=agent_id,
system_prompt=system_prompt,
role_prompt=role_prompt,
token_count=token_count,
block_hashes=full_block_hashes,
)
logger.debug(f"Registered agent {agent_id}, tokens={token_count}, blocks={len(full_block_hashes)}")
return ContextEntry(
agent_id=agent_id,
context=full_context,
token_count=token_count,
compressed_token_count=None,
ttl_seconds=0, # VRAM cache handles TTL
)
async def get_shared_context(
self,
agent_ids: list[str],
target_agent_id: Optional[str] = None,
) -> list[SharedContextResult]:
"""
Query for reusable context across multiple agents.
Uses FAISS ANN to find candidate matches, then LSH to validate
actual token-level reuse at PagedAttention block granularity.
Args:
agent_ids: Agents whose context to search
target_agent_id: Optional target for offset hints
Returns:
List of SharedContextResult sorted by reuse confidence
"""
if len(agent_ids) < 2:
return []
# Gather all registered agents
agents_to_search = []
async with self._lock:
for aid in agent_ids:
if aid in self._agents:
agents_to_search.append(self._agents[aid])
if not agents_to_search:
return []
results: list[SharedContextResult] = []
# For each agent, find matches in other agents
for agent in agents_to_search:
# Get full context for LSH matching
cache_key = f"context:{agent.agent_id}"
cache_val = await self._vram_cache.get(cache_key)
if not cache_val:
continue
full_context = cache_val["full_context"]
system_prompt = cache_val["system_prompt"]
# Find reusable blocks via LSH
matches = await self._lsh.find_reusable_blocks(
full_context,
exclude_agent=agent.agent_id,
)
# Filter matches by hamming threshold and compute confidence
valid_matches = []
total_hamming = 0
for match in matches:
if match.hamming_distance <= self._lsh._hamming_threshold:
valid_matches.append(match)
total_hamming += match.hamming_distance
if not valid_matches:
cache_misses.labels(agent_id=agent.agent_id).inc()
continue
avg_hamming = total_hamming / len(valid_matches)
reuse_confidence = 1.0 - (avg_hamming / self._lsh._hash_bits)
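            # Worked example (assuming a 64-bit SimHash): avg_hamming = 4.0
            # yields reuse_confidence = 1.0 - 4 / 64 = 0.9375.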
# Get FAISS ANN candidates for the system prompt
# Use real embedding from EmbeddingEngine (replaces pseudo-embedding)
if self._embedding_engine is None:
self._embedding_engine = await EmbeddingEngine.get_instance(dim=512, use_onnx=True)
system_embedding = await self._embedding_engine.encode(system_prompt)
faiss_matches = await self._faiss.search(
system_embedding.tolist(),
k=5,
threshold=0.7,
)
# Compute total tokens saved
            # Each validated match covers one PagedAttention block of reusable tokens.
            tokens_saved = len(valid_matches) * self._block_size
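            # e.g. 12 validated block matches * 16 tokens per block = 192
            # tokens of KV cache that need not be recomputed.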
# AnchorPool shareability prediction
is_shareable = await self._anchor_pool.predict_shareable(
token_ids=cache_val["token_ids"],
target_agent_id=target_agent_id or agent.agent_id,
)
offset_vector = None
if is_shareable:
offset_result = await self._anchor_pool.approximate_offset(
token_ids=cache_val["token_ids"],
target_agent_id=target_agent_id or agent.agent_id,
)
if offset_result is not None:
offset_vector = offset_result.placeholder_offset
            # Populate offset_hints (previously this field was always left empty).
result = SharedContextResult(
agent_id=agent.agent_id,
shared_blocks=valid_matches,
faiss_matches=faiss_matches,
total_tokens_saved=tokens_saved,
reuse_confidence=reuse_confidence,
)
if offset_vector is not None:
result.offset_hints[agent.agent_id] = offset_vector.tolist()
results.append(result)
cache_hits.labels(
agent_id=agent.agent_id,
segment_type="system_prompt",
).inc()
# Sort by reuse confidence descending
results.sort(key=lambda r: r.reuse_confidence, reverse=True)
return results
async def get_agent_context(self, agent_id: str) -> Optional[str]:
"""Get the full context for an agent."""
cache_key = f"context:{agent_id}"
cache_val = await self._vram_cache.get(cache_key)
if cache_val:
return cache_val["full_context"]
return None
async def register(self, agent_id: str, context: str) -> ContextEntry:
"""Lightweight register used by the MCP /tools/register_context endpoint.
This is intentionally separate from `register_agent(...)`, which also
indexes the system prompt for cross-agent KV reuse. The MCP endpoint
deals with single opaque contexts, so we tokenize via TokenCounter,
keep a `ContextEntry` in `_simple_entries`, and stop there.
"""
from datetime import datetime as _dt, timedelta as _td, timezone as _tz
        loop = asyncio.get_running_loop()
try:
token_count = await loop.run_in_executor(
None, self._token_counter.count, context
)
except Exception:
token_count = max(1, len(context.split()))
now = _dt.now(_tz.utc)
entry = ContextEntry(
agent_id=agent_id,
context=context,
token_count=token_count,
created_at=now,
expires_at=now + _td(seconds=300),
)
async with self._lock:
self._simple_entries[agent_id] = entry
return entry
async def clear(self) -> None:
"""Drop all simple-register state. Called by the MCP server lifespan
on shutdown so a fresh process starts from a clean registry. We do
NOT touch LSH/FAISS here — those have their own lifecycle hooks."""
async with self._lock:
self._simple_entries.clear()
async def clear_agent(self, agent_id: str) -> bool:
"""Clear an agent's context from all stores."""
async with self._lock:
if agent_id not in self._agents:
return False
# Remove from LSH
await self._lsh.clear_agent(agent_id)
await self._lsh.clear_agent(f"{agent_id}:system")
# Remove from FAISS
await self._faiss.remove(agent_id)
# Remove from VRAM cache
cache_key = f"context:{agent_id}"
await self._vram_cache.delete(cache_key)
# Remove from agents dict
async with self._lock:
del self._agents[agent_id]
cache_evictions_total.labels(reason="manual").inc()
return True
async def get_all_agents(self) -> list[str]:
"""Get list of all registered agent IDs."""
async with self._lock:
return list(self._agents.keys())
async def get_vram_mode(self) -> str:
"""Get current VRAM eviction mode."""
return self._vram_cache.mode.value
async def get_vram_pressure(self) -> float:
"""Get current VRAM pressure (0.0-1.0)."""
return self._vram_cache._vram.get_pressure()
    @staticmethod
    def _sha256_prefix(text: str) -> str:
        """SHA256 of text for prefix validation (hashlib imported at module scope)."""
        return hashlib.sha256(text.encode()).hexdigest()
@property
def lsh_matcher(self) -> LSHTokenMatcher:
"""Direct access to LSH matcher for advanced queries."""
return self._lsh
@property
def faiss_index(self) -> FAISSContextIndex:
"""Direct access to FAISS index for advanced queries."""
return self._faiss
@property
def vram_cache(self) -> VRAMAwareCache:
"""Direct access to VRAM cache for advanced queries."""
return self._vram_cache
@property
def registry_size(self) -> int:
"""Number of registered agents."""
return len(self._agents)
@property
def is_started(self) -> bool:
"""Whether the registry is running."""
return self._started
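
if __name__ == "__main__":
    # Minimal smoke-test sketch of the usage pattern from the class docstring.
    # Assumption: the default components (Qwen3 tokenizer, ONNX EmbeddingEngine,
    # FAISS, VRAM monitor) can initialize on this host; inject fakes otherwise.
    async def _demo() -> None:
        registry = ContextRegistry()
        await registry.start()
        shared_system = "You are one of several cooperating analysis agents."
        await registry.register_agent("agent1", shared_system, "retriever role")
        await registry.register_agent("agent2", shared_system, "summarizer role")
        for result in await registry.get_shared_context(["agent1", "agent2"]):
            print(result.agent_id, result.total_tokens_saved, f"{result.reuse_confidence:.3f}")
        await registry.stop()

    asyncio.run(_demo())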