Spaces:

specimba
/

nexus-os-space

Running

App Files Files Community

specimba committed on 8 days ago

Commit

baea714

verified ·

1 Parent(s): 3412619

Copy nexus_os_v2/chimera_router.py from dataset for module imports

Browse files

Files changed (1) hide show

nexus_os_v2/chimera_router.py +386 -0

nexus_os_v2/chimera_router.py ADDED Viewed

	@@ -0,0 +1,386 @@

+"""
+ChimeraRouter v2 — Hybrid Cloud+Local Inference Orchestrator
+with QWAVE Budget Allocation and TWAVE Thermodynamic Control
+Pipeline:
+  1. Sulphur Prompt Enhancer → classify intent + complexity
+  2. QWAVE Budget Allocator → local vs cloud decision
+  3. Model Selection → pick best model from registry
+  4. TWAVE Tracker → initialize thermodynamic state
+  5. Retrieval → Pinecone + Milvus + ERNIE (CK-PLUG coupling)
+  6. Generation Loop → Ollama (local) or Cloud API
+  7. Reflection → TWAVE triggers → grounding boost or model fallback
+  8. Output → response + per_token_debug telemetry
+"""
+from typing import List, Dict, Optional, Any, Tuple
+from dataclasses import dataclass, field
+from enum import Enum
+from .model_registry import REGISTRY, SULPHUR, get, by_tier, by_cap, Tier, Capability, ModelProfile
+from .sulphur_enhancer import SulphurEnhancer, MockSulphurEnhancer, EnhancedPrompt
+from .twave_tracker import TWAVETracker, TokenState, StochasticResonance
+from .ckplug_retriever import CKPLUGCoupling, get_preset_epsilon
+from .pinecone_client import PineconeRetriever, MockPineconeRetriever
+from .milvus_client import MilvusRetriever, MockMilvusRetriever
+from .ernie_adapter import ERNIEAdapter, MockERNIEAdapter
+class RoutingDecision(Enum):
+    """Labels for routing outcomes; each member's value is a stable lowercase string."""
+    # NOTE(review): presumably LOCAL_OLLAMA / CLOUD_API mirror the two generation
+    # backends used by the router — confirm against callers.
+    LOCAL_OLLAMA = "local_ollama"
+    CLOUD_API = "cloud_api"
+    FALLBACK = "fallback"            # All tiers exhausted
+    REFLECTION = "reflection"        # TWAVE triggered re-grounding
+@dataclass
+class QWAVEBudget:
+    """Quality-Wave budget allocation per request.
+    Groups the per-request limits and feature switches handed to the router.
+    Every field has a default, so ``QWAVEBudget()`` is a usable budget.
+    """
+    # Numeric limits
+    max_tokens: int = 4096
+    target_latency_ms: float = 2000.0
+    vram_budget_gb: float = 8.0      # User's local GPU VRAM
+    cloud_budget_cents: float = 5.0  # Per-request cloud budget (cents)
+    # Policy switches
+    allow_cloud: bool = True
+    allow_uncensored: bool = True
+    # Hard requirements (all off by default)
+    require_vision: bool = False
+    require_safety: bool = False
+    require_tools: bool = False
+@dataclass
+class RouterResult:
+    """Complete routing result with telemetry.
+    The first five fields are required; the remaining telemetry fields default
+    to empty/zero values so a result can be built before they are known.
+    """
+    # Required: which model ran, on what prompt, and what it produced
+    selected_model: str
+    model_profile: ModelProfile
+    tier: str
+    enhanced_prompt: str
+    response: str
+    # Optional telemetry
+    token_states: List[TokenState] = field(default_factory=list)
+    reflection_count: int = 0
+    grounding_score: float = 0.0
+    hallucination_risk: float = 0.0
+    latency_ms: float = 0.0
+    tokens_generated: int = 0
+    # Free-form diagnostics; a fresh dict per instance
+    debug: Dict[str, Any] = field(default_factory=dict)
+class ChimeraRouter:
+    """
+    Production router for NEXUS OS v2.
+    Integrates all subsystems into a unified inference pipeline.
+    """
+    def __init__(
+        self,
+        sulphur: Optional[Any] = None,           # SulphurEnhancer or mock
+        pinecone: Optional[Any] = None,          # PineconeRetriever or mock
+        milvus: Optional[Any] = None,            # MilvusRetriever or mock
+        ernie: Optional[Any] = None,             # ERNIEAdapter or mock
+        twave: Optional[TWAVETracker] = None,
+        ollama_host: str = "http://localhost:11434",
+        default_budget: Optional[QWAVEBudget] = None,
+    ):
+        """
+        Wire up the router's subsystems.
+        Args:
+            sulphur: Prompt enhancer; a MockSulphurEnhancer is created when None.
+            pinecone: Pinecone retriever; a MockPineconeRetriever is created when None.
+            milvus: Milvus retriever; a MockMilvusRetriever is created when None.
+            ernie: ERNIE adapter; a MockERNIEAdapter is created when None.
+            twave: Tracker stored on the instance; a default TWAVETracker when None.
+            ollama_host: Base URL of the Ollama server used for local generation.
+            default_budget: Budget used when a call supplies none; QWAVEBudget() when None.
+        """
+        # Test against None instead of truthiness: an injected subsystem that
+        # happens to be falsy (e.g. defines __len__ or __bool__) must not be
+        # silently replaced by a mock.
+        self.sulphur = MockSulphurEnhancer() if sulphur is None else sulphur
+        self.pinecone = MockPineconeRetriever() if pinecone is None else pinecone
+        self.milvus = MockMilvusRetriever() if milvus is None else milvus
+        self.ernie = MockERNIEAdapter() if ernie is None else ernie
+        self.twave = TWAVETracker() if twave is None else twave
+        self.ollama_host = ollama_host
+        self.default_budget = QWAVEBudget() if default_budget is None else default_budget
+    def _enhance(self, prompt: str) -> EnhancedPrompt:
+        """Step 1: pass the raw prompt to the configured Sulphur enhancer and return its result."""
+        enhancer = self.sulphur
+        return enhancer.enhance(prompt)
+    def _select_model(self, enhanced: EnhancedPrompt, budget: QWAVEBudget) -> Tuple[str, ModelProfile]:
+        """
+        Step 2-3: QWAVE budget allocation + model selection.
+        Maps intent tags to required capabilities, derives preferred tiers from
+        the complexity score, filters tier models by capability, exclusion and
+        VRAM constraints, then scores the survivors and returns the best one.
+        Args:
+            enhanced: Enhanced prompt; ``intent_tags`` and ``complexity_score`` are read.
+            budget: Per-request constraints (VRAM limit and the cloud, uncensored,
+                safety and vision switches).
+        Returns:
+            (model_id, profile), where model_id is the REGISTRY key of the profile.
+        Raises:
+            RuntimeError: If no model satisfies the constraints, or the chosen
+                profile cannot be found in REGISTRY.
+        """
+        # Tag → capability table, built once instead of once per tag.
+        cap_map = {
+            "coding": Capability.CODING,
+            "reasoning": Capability.REASONING,
+            "vision": Capability.VISION,
+            "creative": Capability.INSTRUCT,  # Creative uses instruct-capable models
+            "factual": Capability.REASONING,
+            "safety": Capability.SAFETY,
+            "fast": Capability.FAST,
+            "long_context": Capability.LONG_CONTEXT,
+        }
+        # Tags are matched case-insensitively everywhere in this method.
+        tags = [tag.lower() for tag in enhanced.intent_tags]
+        required_caps = [cap_map[tag] for tag in tags if tag in cap_map]
+        # The budget can demand vision even when the prompt was not tagged for it.
+        if budget.require_vision and Capability.VISION not in required_caps:
+            required_caps.append(Capability.VISION)
+        # NOTE(review): budget.require_tools is not enforced here; no matching
+        # capability is visible in this file — confirm against model_registry.
+        # Filter by safety/uncensored requirements
+        if budget.require_safety:
+            # Exclude abliterated/unchained models
+            exclude = [Capability.ABLITERATED, Capability.UNCHAINED]
+        elif budget.allow_uncensored:
+            exclude = []
+        else:
+            exclude = [Capability.ABLITERATED]
+        # Determine tier from complexity + VRAM budget
+        if enhanced.complexity_score > 0.8 and budget.allow_cloud:
+            # High complexity → cloud frontier or largest local
+            preferred_tiers = [Tier.CLOUD_API, Tier.LOCAL_48GB, Tier.LOCAL_24GB]
+        elif enhanced.complexity_score > 0.6:
+            preferred_tiers = [Tier.LOCAL_24GB, Tier.LOCAL_16GB, Tier.CLOUD_API]
+        elif enhanced.complexity_score > 0.4:
+            preferred_tiers = [Tier.LOCAL_16GB, Tier.LOCAL_8GB]
+        else:
+            preferred_tiers = [Tier.LOCAL_8GB]
+        # A budget that forbids cloud must never end up on a cloud tier.
+        if not budget.allow_cloud:
+            preferred_tiers = [t for t in preferred_tiers if t != Tier.CLOUD_API]
+        def permitted(m: ModelProfile) -> bool:
+            # True when the model carries none of the excluded capabilities.
+            return not any(c in m.capabilities for c in exclude)
+        def fits_vram(m: ModelProfile) -> bool:
+            return m.size_gb <= budget.vram_budget_gb
+        # Build candidate list: capability match, exclusions, VRAM (local only)
+        candidates = [
+            m
+            for tier in preferred_tiers
+            for m in by_tier(tier)
+            if all(c in m.capabilities for c in required_caps)
+            and permitted(m)
+            and (tier == Tier.CLOUD_API or fits_vram(m))
+        ]
+        if not candidates:
+            # Fallback: drop the tier/capability preferences, but keep the
+            # exclusions so a safety-restricted request cannot land on an
+            # excluded model.
+            candidates = [
+                m for m in REGISTRY.values()
+                if m.tier != Tier.CLOUD_API and permitted(m) and fits_vram(m)
+            ]
+            if not candidates and budget.allow_cloud:
+                candidates = [m for m in by_tier(Tier.CLOUD_API) if permitted(m)]
+        if not candidates:
+            raise RuntimeError("No models available for this request. Check VRAM budget or enable cloud.")
+        wants_fast = "fast" in tags
+        # Score candidates: prefer higher params for complex, lower for fast
+        def score_model(m: ModelProfile) -> float:
+            s = 0.0
+            # Capability match bonus
+            for cap in required_caps:
+                if cap in m.capabilities:
+                    s += 10.0
+            # Complexity alignment
+            if enhanced.complexity_score > 0.7:
+                s += m.params_b * 2.0  # Bigger models for complex
+            else:
+                s += (10.0 - m.params_b) * 0.5  # Smaller for simple
+            # Speed bonus
+            if wants_fast and Capability.FAST in m.capabilities:
+                s += 5.0
+            # VRAM efficiency (prefer smaller if equal)
+            s -= m.size_gb * 0.1
+            return s
+        # max() keeps the first of equally scored candidates, matching a
+        # stable descending sort followed by taking element 0.
+        best = max(candidates, key=score_model)
+        # Find registry key
+        for k, v in REGISTRY.items():
+            if v == best:
+                return k, best
+        raise RuntimeError("Model selected but not found in registry.")
+    def _retrieve(self, query: str) -> Dict[str, Any]:
+        """
+        Step 5: Multi-source retrieval aggregation.
+        Queries Pinecone, Milvus and (when it reports itself available) ERNIE.
+        A failing backend does not abort the call: its exception text is stored
+        under ``<backend>_error`` and its evidence list stays empty.
+        Returns:
+            Dict with the per-backend lists, ``aggregated`` (top 10 items as
+            ``text``/``relevance``/``source`` dicts, highest relevance first)
+            and ``top_score`` (best relevance, or 0.0 without evidence).
+        """
+        bundle: Dict[str, Any] = {
+            "pinecone": [],
+            "milvus": [],
+            "ernie": [],
+            "aggregated": [],
+        }
+        # One zero-argument fetcher per backend, tried in a fixed order.
+        # When ERNIE is unavailable its fetcher hands back the existing empty list.
+        fetchers = (
+            ("pinecone", lambda: self.pinecone.get_evidence_for_ckplug(query)),
+            ("milvus", lambda: self.milvus.get_evidence("nexus_docs", query)),
+            ("ernie", lambda: self.ernie.get_evidence(query) if self.ernie.is_available() else bundle["ernie"]),
+        )
+        for backend, fetch in fetchers:
+            try:
+                bundle[backend] = fetch()
+            except Exception as exc:
+                bundle[f"{backend}_error"] = str(exc)
+        # Normalise every hit to the same three keys, then rank by relevance.
+        merged = [
+            {
+                "text": hit.get("text", ""),
+                "relevance": hit.get("relevance", 0.0),
+                "source": hit.get("type", "unknown"),
+            }
+            for backend in ("pinecone", "milvus", "ernie")
+            for hit in bundle[backend]
+        ]
+        merged.sort(key=lambda entry: entry["relevance"], reverse=True)
+        bundle["aggregated"] = merged[:10]  # Top 10
+        bundle["top_score"] = merged[0]["relevance"] if merged else 0.0
+        return bundle
+    def _generate_local(self, model_tag: str, prompt: str, max_tokens: int, temperature: float) -> str:
+        """
+        Generate via Ollama API.
+        Sends one non-streaming chat request to ``{ollama_host}/api/chat`` with
+        the prompt as a single user message.
+        Args:
+            model_tag: Value sent as the request's ``model`` field.
+            prompt: Text sent as the user message content.
+            max_tokens: Sent as the ``num_predict`` option.
+            temperature: Sent as the ``temperature`` option.
+        Returns:
+            The ``message.content`` string of the reply, or "" when it is absent.
+        Raises:
+            RuntimeError: On a transport/HTTP failure or an undecodable reply;
+                the underlying exception is attached as ``__cause__``.
+        """
+        import urllib.request
+        import urllib.error
+        import json
+        payload = json.dumps({
+            "model": model_tag,
+            "messages": [{"role": "user", "content": prompt}],
+            "stream": False,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens,
+            },
+        }).encode("utf-8")
+        req = urllib.request.Request(
+            f"{self.ollama_host}/api/chat",
+            data=payload,
+            headers={"Content-Type": "application/json"},
+            method="POST",
+        )
+        try:
+            with urllib.request.urlopen(req, timeout=300) as resp:
+                data = json.loads(resp.read().decode("utf-8"))
+        except urllib.error.URLError as e:
+            # Chain the cause so the original traceback is not lost.
+            raise RuntimeError(f"Ollama error: {e}") from e
+        except ValueError as e:
+            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors:
+            # the server answered, but not with decodable JSON.
+            raise RuntimeError(f"Ollama error: {e}") from e
+        # Tolerate a non-object reply or a null/missing "message" field.
+        message = data.get("message") if isinstance(data, dict) else None
+        if not isinstance(message, dict):
+            return ""
+        return message.get("content", "")
+    def _generate_cloud(self, cloud_tag: str, prompt: str, max_tokens: int, temperature: float) -> str:
+        """
+        Generate via cloud API (placeholder).
+        No provider is contacted: the provider name is taken from the part of
+        ``cloud_tag`` before the first ":" ("unknown" when there is no ":"),
+        and a stub string echoing the first 100 prompt characters is returned.
+        ``max_tokens`` and ``temperature`` are accepted but not used yet.
+        """
+        # Real implementation depends on the provider SDK
+        # (DeepSeek, Qwen, Kimi, GLM, GPT-5, Claude each have different APIs).
+        head, separator, _ = cloud_tag.partition(":")
+        if separator:
+            provider = head
+        else:
+            provider = "unknown"
+        preview = prompt[:100]
+        return f"[CLOUD:{provider}] {preview}... (cloud generation placeholder)"
+    def route(
+        self,
+        prompt: str,
+        budget: Optional[QWAVEBudget] = None,
+        custom_model: Optional[str] = None,
+    ) -> RouterResult:
+        """
+        Main routing entry point.
+        Full pipeline: enhance → select → retrieve → generate → track.
+        Args:
+            prompt: Raw user prompt.
+            budget: Per-request budget; ``self.default_budget`` when omitted.
+            custom_model: Registry id to use instead of automatic selection.
+        Returns:
+            RouterResult with the response text, the chosen model and telemetry.
+            ``token_states`` is empty and ``hallucination_risk`` is 0.0 because
+            per-token tracking is not wired in yet.
+        Raises:
+            ValueError: If ``custom_model`` does not resolve to a profile.
+            RuntimeError: Propagated from model selection or local generation.
+        """
+        import time
+        budget = budget or self.default_budget
+        # Step 1: Enhance
+        enhanced = self._enhance(prompt)
+        # Step 2-3: Select model
+        if custom_model:
+            model_id = custom_model
+            profile = get(model_id)
+            if not profile:
+                raise ValueError(f"Unknown model: {custom_model}")
+        else:
+            model_id, profile = self._select_model(enhanced, budget)
+        # Step 4: Initialize TWAVE with model-specific parameters.
+        # ckplug and twave are constructed here but not consumed further in
+        # this method until per-token tracking (step 7) is implemented.
+        model_family = profile.family
+        epsilon = get_preset_epsilon(model_family)
+        ckplug = CKPLUGCoupling(epsilon=epsilon, mu_0=profile.mu_base)
+        twave = TWAVETracker(
+            T_c=profile.T_c,
+            mu_0=profile.mu_base,
+            kappa=profile.kappa,
+        )
+        # Step 5: Retrieve
+        evidence = self._retrieve(enhanced.enhanced)
+        top_evidence_text = "\n".join([e["text"] for e in evidence["aggregated"][:3]])
+        # Build final prompt with evidence (top 3 items, one per line)
+        final_prompt = (
+            f"Retrieved evidence:\n{top_evidence_text}\n---\n"
+            f"User request:\n{enhanced.enhanced}"
+        )
+        # Step 6: Generate
+        # perf_counter is monotonic, so the measured latency cannot be skewed
+        # (or go negative) when the system clock is adjusted mid-request.
+        t0 = time.perf_counter()
+        if profile.tier == Tier.CLOUD_API:
+            response = self._generate_cloud(
+                profile.cloud_tag or model_id,
+                final_prompt,
+                budget.max_tokens,
+                profile.default_temp,
+            )
+        else:
+            response = self._generate_local(
+                profile.ollama_tag or model_id,
+                final_prompt,
+                budget.max_tokens,
+                profile.default_temp,
+            )
+        latency_ms = (time.perf_counter() - t0) * 1000
+        tokens_est = len(response.split())  # Rough estimate: whitespace-separated words
+        # Step 7: TWAVE tracking (mock for now — needs actual logit extraction)
+        # In production, this runs inside the generation loop
+        states = []  # Would be populated by per-token hooks
+        # Step 8: Assemble result
+        return RouterResult(
+            selected_model=model_id,
+            model_profile=profile,
+            tier=profile.tier.value,
+            enhanced_prompt=enhanced.enhanced,
+            response=response,
+            token_states=states,
+            reflection_count=0,
+            grounding_score=evidence.get("top_score", 0.0),
+            hallucination_risk=0.0,  # Would be computed from states
+            latency_ms=latency_ms,
+            tokens_generated=tokens_est,
+            debug={
+                "enhancement": enhanced,
+                "evidence_summary": evidence,
+                "budget": budget,
+                "ckplug_epsilon": epsilon,
+            },
+        )
+    def quick_route(self, prompt: str, budget: Optional[QWAVEBudget] = None) -> str:
+        """Convenience wrapper: run the full ``route`` pipeline and return only the response text."""
+        result = self.route(prompt, budget)
+        return result.response
+    def get_available_models(self, budget: Optional[QWAVEBudget] = None) -> List[Dict[str, Any]]:
+        """
+        Describe every registry model and flag whether it fits the budget.
+        All REGISTRY entries are returned, not just the ones that fit; each
+        summary carries ``fits_budget``: for local models the size must not
+        exceed the VRAM budget, for cloud models cloud use must be allowed.
+        Args:
+            budget: Budget to check against; ``self.default_budget`` when omitted.
+        Returns:
+            One dict per model with id, name, tier, size_gb, params_b,
+            capabilities (as values) and fits_budget.
+        """
+        budget = budget or self.default_budget
+        def within_budget(profile: ModelProfile) -> bool:
+            if profile.tier == Tier.CLOUD_API:
+                return bool(budget.allow_cloud)
+            return not (profile.size_gb > budget.vram_budget_gb)
+        return [
+            {
+                "id": model_id,
+                "name": entry.name,
+                "tier": entry.tier.value,
+                "size_gb": entry.size_gb,
+                "params_b": entry.params_b,
+                "capabilities": [cap.value for cap in entry.capabilities],
+                "fits_budget": within_budget(entry),
+            }
+            for model_id, entry in REGISTRY.items()
+        ]