"""FastAPI MCP-compatible server exposing ContextForge tools.

The server uses a FastAPI lifespan to construct the heavy components once
(`ContextRegistry`, `ContextCompressor`, `CompressionCoordinator`,
`MetricsCollector`, `VLLMClient`) and stores them on `app.state`. Endpoints
read these via the dependency-getter functions defined below; tests
override the same getters via `app.dependency_overrides` so endpoint logic
runs against fakes without ever entering the lifespan.
Important contracts:
- /health returns the metrics-supplied GPU label, never the request body.
- Endpoints log only metadata (agent_id, lengths) — never the raw context —
so request payloads cannot leak via stdout/stderr.
"""
from __future__ import annotations
import asyncio
import logging
from contextlib import asynccontextmanager
from typing import Any, AsyncIterator
from fastapi import Depends, FastAPI, Request
from fastapi.responses import JSONResponse
from apohara_context_forge.config import settings
from apohara_context_forge.compression.compressor import ContextCompressor
from apohara_context_forge.compression.coordinator import CompressionCoordinator
from apohara_context_forge.metrics.collector import MetricsCollector
from apohara_context_forge.models import (
CompressionDecision,
ContextEntry,
ContextMatch,
ContextRegistration,
Degradation,
MetricsSnapshot,
OptimizedContextRequest,
)
from apohara_context_forge.registry.context_registry import ContextRegistry
from apohara_context_forge.serving.vllm_client import VLLMClient
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Lifespan — constructs heavy components once and tears them down on shutdown.
# ---------------------------------------------------------------------------
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Build the shared components on ``app.state``; release them on shutdown.

    Startup constructs, in order, the registry, the compressor, the
    coordinator (wired to that registry and compressor), the metrics
    collector and the vLLM client, then logs the configured host/port and
    vLLM endpoint/model.

    Tests bypass the production heavy path either by NOT entering the
    `with TestClient(app) as client:` context (so this lifespan never fires)
    or by monkeypatching the constructor classes referenced by name on this
    module before entering the context.

    Shutdown is best-effort: ``registry.clear`` and ``vllm.aclose`` are each
    called only if the attribute exists, may be synchronous or asynchronous,
    and any exception they raise is logged as a warning rather than
    propagated.
    """
    # Function-scope import: only the teardown path needs it.
    import inspect

    app.state.registry = ContextRegistry()
    app.state.compressor = ContextCompressor()
    app.state.coordinator = CompressionCoordinator(
        registry=app.state.registry,
        compressor=app.state.compressor,
    )
    app.state.metrics = MetricsCollector()
    app.state.vllm = VLLMClient()
    logger.info(
        "ContextForge started on %s:%s (vLLM %s, model %s)",
        settings.contextforge_host,
        settings.contextforge_port,
        settings.vllm_base_url,
        settings.vllm_model,
    )
    try:
        yield
    finally:
        # Best-effort teardown — never let cleanup errors mask the original
        # request error during shutdown.
        for state_attr, hook_name in (("registry", "clear"), ("vllm", "aclose")):
            owner = getattr(app.state, state_attr)
            hook = getattr(owner, hook_name, None)
            if hook is None:
                continue
            try:
                outcome = hook()
                # Await only when the hook is actually asynchronous; awaiting
                # the plain return value of a sync hook would raise TypeError
                # and be misreported as a teardown failure.
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception as exc:
                logger.warning("%s.%s() failed: %s", state_attr, hook_name, exc)
# The ASGI application. `lifespan` is registered so that entering the app's
# lifespan context populates `app.state.*`.
app = FastAPI(title="ContextForge", version="0.1.0", lifespan=lifespan)
# Module-level globals kept for callers that import the server outside a
# lifespan-managed TestClient (e.g. ad-hoc REPL probes). Endpoints prefer
# `request.app.state.*` via the dependency getters below.
# These are the fallback values the getters return when the corresponding
# attribute has not been set on `app.state`.
# NOTE(review): `registry` and `metrics` are constructed at import time and
# are separate instances from the ones the lifespan stores on `app.state`.
registry = ContextRegistry()
metrics = MetricsCollector()
# No compressor/coordinator is built at import time, so outside the lifespan
# (and without a dependency override) the getters hand back None for these.
compressor: ContextCompressor | None = None
coordinator: CompressionCoordinator | None = None
# ---------------------------------------------------------------------------
# Dependency getters — keys for app.dependency_overrides in tests.
# ---------------------------------------------------------------------------
def get_registry(request: Request) -> ContextRegistry:
    """Return the registry to use for this request.

    Prefers the ``registry`` attribute on ``request.app.state``; when that
    attribute is not set, the module-level ``registry`` is returned instead.
    """
    state = request.app.state
    try:
        return state.registry
    except AttributeError:
        return registry
def get_metrics(request: Request) -> MetricsCollector:
    """Return the metrics collector to use for this request.

    Prefers the ``metrics`` attribute on ``request.app.state``; when that
    attribute is not set, the module-level ``metrics`` is returned instead.
    """
    state = request.app.state
    try:
        return state.metrics
    except AttributeError:
        return metrics
def get_compressor(request: Request) -> Any:
    """Return the compressor to use for this request.

    Prefers the ``compressor`` attribute on ``request.app.state``; when that
    attribute is not set, the module-level ``compressor`` is returned, which
    may be ``None``.
    """
    state = request.app.state
    try:
        return state.compressor
    except AttributeError:
        return compressor
def get_coordinator(request: Request) -> Any:
    """Return the coordinator to use for this request.

    Prefers the ``coordinator`` attribute on ``request.app.state``; when that
    attribute is not set, the module-level ``coordinator`` is returned, which
    may be ``None``.
    """
    state = request.app.state
    try:
        return state.coordinator
    except AttributeError:
        return coordinator
# ---------------------------------------------------------------------------
# /health — never raises. Reports {"status": "ok"|"degraded", "gpu": <label>}.
# ---------------------------------------------------------------------------
@app.get("/health")
async def health_check(metrics: MetricsCollector = Depends(get_metrics)) -> dict:
    """Report liveness together with the collector-supplied GPU label.

    Returns:
        ``{"status": "ok", "gpu": <label>}`` when the collector resolves a
        label, or ``{"status": "degraded", "gpu": "unknown"}`` when resolving
        it raises. The handler itself never raises.
    """
    try:
        label = metrics._resolve_gpu_label()
    except Exception as exc:
        # Anything failing here is a soft-degrade — clients keep polling.
        # Record the exception class so the degraded state is diagnosable;
        # like the other endpoints, log metadata only.
        logger.warning("health_check degraded: %s", type(exc).__name__)
        return {"status": "degraded", "gpu": "unknown"}
    return {"status": "ok", "gpu": label}
# ---------------------------------------------------------------------------
# /tools/register_context
# ---------------------------------------------------------------------------
@app.post("/tools/register_context", response_model=ContextEntry)
async def register_context(
    registration: ContextRegistration,
    registry: ContextRegistry = Depends(get_registry),
    metrics: MetricsCollector = Depends(get_metrics),
) -> ContextEntry:
    """Store an agent's context in the registry and return the new entry.

    Body validation is strict: a missing field, an empty agent_id, or extra
    fields all yield 422 (handled by Pydantic).
    """
    agent_id = registration.agent_id
    context = registration.context

    # Metadata only — the raw context must NEVER reach the logs
    # (sentinel-leakage test).
    logger.info("register_context agent_id=%s ctx_len=%d", agent_id, len(context))

    entry = await registry.register(agent_id, context)

    # This simple endpoint performs no cross-agent dedup, hence the constant
    # `matched=False`. The richer pipeline path uses registry.register_agent
    # and reports its own match telemetry.
    metrics.record_register(False)
    return entry
# ---------------------------------------------------------------------------
# /tools/get_optimized_context
# ---------------------------------------------------------------------------
def _passthrough_decision(context: str) -> CompressionDecision:
    """Build the fallback decision sent with HTTP 503 when the coordinator fails.

    The given context is echoed back unchanged as both the final and the
    compressed context, every token counter is zero, and the rationale is
    ``"coordinator_unavailable"``, so callers receive a structured payload and
    can re-issue or fall back to the original context themselves.
    """
    fields = {
        "strategy": "passthrough",
        "final_context": context,
        "compressed_context": context,
        "shared_prefix": "",
        "original_tokens": 0,
        "final_tokens": 0,
        "tokens_saved": 0,
        "rationale": "coordinator_unavailable",
        "savings_pct": 0.0,
    }
    return CompressionDecision(**fields)
@app.post("/tools/get_optimized_context")
async def get_optimized_context(
    request: OptimizedContextRequest,
    coordinator: Any = Depends(get_coordinator),
    metrics: MetricsCollector = Depends(get_metrics),
):
    """Ask the coordinator for a compression decision and return it.

    If the coordinator raises, respond with HTTP 503 carrying a passthrough
    decision body — a structured response rather than a 500 stack trace — and
    do NOT call metrics.record_decision.
    """
    agent_id = request.agent_id
    context = request.context

    logger.info(
        "get_optimized_context agent_id=%s ctx_len=%d", agent_id, len(context)
    )

    try:
        decision = await coordinator.decide(agent_id, context)
    except Exception as exc:
        # Log the error class only, never the body: the sentinel-leakage test
        # asserts no log record contains the original context string.
        logger.warning(
            "coordinator.decide failed for agent_id=%s: %s",
            agent_id,
            type(exc).__name__,
        )
        payload = _passthrough_decision(context).model_dump(mode="json")
        return JSONResponse(status_code=503, content=payload)
    else:
        metrics.record_decision(decision)
        return decision
# ---------------------------------------------------------------------------
# /metrics/snapshot
# ---------------------------------------------------------------------------
@app.get("/metrics/snapshot", response_model=MetricsSnapshot)
async def metrics_snapshot_endpoint(
    metrics: MetricsCollector = Depends(get_metrics),
    compressor: Any = Depends(get_compressor),
) -> MetricsSnapshot:
    """Return the collector's aggregate snapshot, enriched with compressor info.

    `current_model` and `degradations` are read from the compressor and
    forwarded to the collector, which doesn't itself know about compressor
    identity. A missing or falsy model name falls back to
    "xlm-roberta-large"; missing or falsy degradations become an empty list.
    """
    model_name = getattr(compressor, "current_model", None)
    if not model_name:
        model_name = "xlm-roberta-large"

    reported = getattr(compressor, "degradations", [])
    degradation_list = list(reported) if reported else []

    snapshot = await metrics.snapshot(
        current_compressor_model=model_name,
        compressor_degradations=degradation_list,
    )
    return snapshot
# ---------------------------------------------------------------------------
# Root
# ---------------------------------------------------------------------------
@app.get("/")
async def root() -> dict:
    """Return static service metadata: name, version, description, docs path."""
    return dict(
        service="ContextForge",
        version="0.1.0",
        description="The shared context compiler for multi-agent LLM systems",
        docs="/docs",
    )
# ---------------------------------------------------------------------------
# Background metrics loop — opt-in helper for production runs.
# ---------------------------------------------------------------------------
async def metrics_loop(
    interval_s: float = 30,
    collector: MetricsCollector | None = None,
) -> None:
    """Periodically log a one-line metrics summary; runs until cancelled.

    Args:
        interval_s: Seconds to sleep before each snapshot. Defaults to 30.
        collector: Collector to snapshot. When omitted, each iteration uses
            ``app.state.metrics`` if it is set and otherwise the module-level
            ``metrics`` — the same resolution order as ``get_metrics`` — so
            the loop reports on the collector the endpoints record into.

    Any exception raised while sleeping, snapshotting or logging is logged
    as an error and the loop continues.
    """
    while True:
        try:
            await asyncio.sleep(interval_s)
            # Resolve per iteration: the lifespan may install its collector
            # on app.state after this task has already been started.
            active = collector
            if active is None:
                active = getattr(app.state, "metrics", metrics)
            snap = await active.snapshot()
            logger.info(
                "Metrics: VRAM=%.1fGB TTFT=%.1fms Dedup=%.1f%%",
                snap.vram_used_gb,
                snap.ttft_ms,
                snap.dedup_rate,
            )
        except Exception as exc:
            logger.error("Metrics collection error: %s", exc)
|