Spaces:

specimba
/

nexus-os-space

Running

App Files Community

specimba committed 6 days ago

Commit

f9d9a60

verified ·

1 Parent(s): 60f5c80

v3.0 Intelligent Router: queries ALL providers in parallel, picks best by health+latency+capability

Browse files

Files changed (1) hide show

app.py +92 -248

app.py CHANGED Viewed

@@ -1,33 +1,28 @@
 """
-NEXUS OS v2.1 — HF Space with REAL Inference via HF Inference API
-Primary backend: HF Inference API (free tier, works immediately)
-Secondary: Ollama relay (user's local models via tunnel)
-Tertiary: Cloud APIs (DeepSeek, Claude, GPT-5, etc.)
-Quaternary: Mock mode (no setup needed)
-This Space provides GENUINE value — real LLM inference without requiring
-GPU access, ngrok tunnels, or paid cloud API keys.
 """
 import os
 import sys
 import json
 import time
-import urllib.request
-import urllib.error
 from typing import Optional, Dict, Any, List, Tuple
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-# Core NEXUS OS modules
-from nexus_os_v2.model_registry import get, by_tier, all_names, Tier, REGISTRY, Capability
 from nexus_os_v2.unified_detector import (
     UnifiedThermodynamicDetector, FusionMode, Action,
     DetectorReading, TokenVerdict, SequenceVerdict,
 )
 from nexus_os_v2.twave_tracker import StochasticResonance
-from nexus_os_v2.cloud_api_adapters import CloudAPIManager
-from nexus_os_v2.hf_inference_client import HFInferenceClient, MockInferenceClient, InferenceResult
 try:
     import gradio as gr
@@ -36,7 +31,41 @@ except ImportError:
     GRADIO_AVAILABLE = False
-# ─── Generation Orchestrator ───
 def generate_with_nexus(
     prompt: str,
     vram: float,
@@ -51,13 +80,6 @@ def generate_with_nexus(
     max_tokens: int,
     fusion_mode: str,
 ) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
-    """
-    Main generation with cascading fallback:
-    1. HF Inference API (primary — works immediately with HF token)
-    2. Ollama relay (secondary — user's local models)
-    3. Cloud API (tertiary — paid providers)
-    4. Mock (last resort)
-    """
     if not prompt.strip():
         return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
@@ -65,101 +87,40 @@ def generate_with_nexus(
     if not profile:
         return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", f"Model {model_id} not found"
-    response_text = ""
-    metadata = {}
-    status_msg = ""
-    source = ""
-    # Priority 1: HF Inference API (always try first if enabled)
-    if use_hf_inference:
-        try:
-            client = HFInferenceClient()
-            if client.is_available():
-                # Map model family to HF Inference API model
-                hf_model = _map_to_hf_model(profile.family, profile.name)
-                result = client.generate(
-                    prompt=prompt,
-                    model=hf_model,
-                    max_tokens=max_tokens,
-                    temperature=profile.default_temp,
-                    system=system_prompt if system_prompt.strip() else None,
-                )
-                response_text = result.text
-                metadata = {
-                    "model": result.model,
-                    "latency_ms": result.latency_ms,
-                    "tokens_input": result.tokens_input,
-                    "tokens_output": result.tokens_generated,
-                }
-                status_msg = f"Generated via HF Inference API ({result.model}, {result.latency_ms:.0f}ms)"
-                source = "hf_inference"
-            else:
-                status_msg = "HF Inference API unavailable (no HF token or rate limit)"
-        except Exception as e:
-            status_msg = f"HF Inference API failed: {e}"
-    # Priority 2: Ollama relay
-    if not response_text and use_ollama and profile.ollama_tag:
-        try:
-            client = OllamaRelayClient(relay_url=ollama_relay_url)
-            if client.is_connected():
-                response_text, metadata = client.generate(
-                    model_tag=profile.ollama_tag,
-                    prompt=prompt,
-                    system=system_prompt if system_prompt.strip() else None,
-                    temperature=profile.default_temp,
-                    max_tokens=max_tokens,
-                )
-                status_msg = f"Generated via Ollama relay ({profile.name})"
-                source = "ollama"
-            else:
-                status_msg += " | Ollama relay unreachable"
-        except Exception as e:
-            status_msg += f" | Ollama failed: {e}"
-    # Priority 3: Cloud API
-    if not response_text and use_cloud and allow_cloud and profile.tier == Tier.CLOUD_API:
-        try:
-            wrapper = CloudGenerationWrapper()
-            response_text, metadata = wrapper.generate(
-                model_family=profile.family,
-                prompt=prompt,
-                max_tokens=max_tokens,
-                temperature=profile.default_temp,
-                system=system_prompt if system_prompt.strip() else None,
-            )
-            status_msg = f"Generated via Cloud API ({profile.name})"
-            source = "cloud"
-        except Exception as e:
-            status_msg += f" | Cloud API failed: {e}"
-    # Priority 4: Mock fallback
-    if not response_text:
-        response_text = f"""[MOCK MODE] {profile.name} ({profile.params_b:.1f}B) would respond to:
-"{prompt[:100]}..."
----
-Model: {profile.name}
-Family: {profile.family}
-Tier: {profile.tier.value}
-Context: {profile.max_context:,} tokens
-T_c: {profile.T_c}, mu_0: {profile.mu_base}, kappa: {profile.kappa}
-To get real inference:
-1. Enable HF Inference API (uses your HF token, free tier)
-2. Or set OLLAMA_RELAY_URL for local models
-3. Or add cloud API keys"""
-        metadata = {"mock": True}
-        status_msg = "Mock mode — enable HF Inference API for real responses"
-        source = "mock"
     # Simulate telemetry
-    telemetry = simulate_telemetry(response_text, model_id, complexity)
     return (
-        response_text,
-        f"{profile.name} ({source})",
         round(telemetry["hallucination_risk"], 3),
         round(telemetry["max_risk"], 3),
         telemetry["num_tokens"],
@@ -168,138 +129,19 @@ To get real inference:
         round(telemetry["newi"], 3),
         telemetry["recommended_action"],
         str(telemetry["trigger_positions"]),
-        status_msg,
     )
-def _map_to_hf_model(family: str, name: str) -> str:
-    """Map NEXUS model family to HF Inference API model ID."""
-    mapping = {
-        "qwen": "Qwen/Qwen2.5-0.5B-Instruct",
-        "gemma": "google/gemma-2-2b-it",
-        "llama": "meta-llama/Llama-3.2-1B-Instruct",
-        "deepseek": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "granite": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "nemotron": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "trinity": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "grok": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "minicpm": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "bonsai": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "darwin": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "venus": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "grape": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "loco": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "omega": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "qwopus": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "carnice": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "opensearch": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-        "lfm": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-    }
-    return mapping.get(family, "HuggingFaceTB/SmolLM2-1.7B-Instruct")
-# ─── Ollama Relay Client ───
-class OllamaRelayClient:
-    """Connects to user's local Ollama via relay URL."""
-    def __init__(self, relay_url: Optional[str] = None):
-        self.relay_url = relay_url or os.environ.get("OLLAMA_RELAY_URL", "")
-        if not self.relay_url:
-            self.relay_url = "http://localhost:11434"
-        self.relay_url = self.relay_url.rstrip("/")
-        self._available_models: List[str] = []
-    def is_connected(self) -> bool:
-        try:
-            req = urllib.request.Request(
-                f"{self.relay_url}/api/tags",
-                headers={"Content-Type": "application/json"},
-                method="GET",
-            )
-            with urllib.request.urlopen(req, timeout=10) as resp:
-                data = json.loads(resp.read().decode("utf-8"))
-                self._available_models = [m.get("name", m.get("model", "")) for m in data.get("models", [])]
-                return True
-        except Exception:
-            return False
-    def generate(self, model_tag: str, prompt: str, system: Optional[str] = None,
-                 temperature: float = 0.7, max_tokens: int = 2048, stream: bool = False):
-        messages = []
-        if system:
-            messages.append({"role": "system", "content": system})
-        messages.append({"role": "user", "content": prompt})
-        payload = json.dumps({"model": model_tag, "messages": messages, "stream": stream,
-                              "options": {"temperature": temperature, "num_predict": max_tokens}}).encode("utf-8")
-        req = urllib.request.Request(f"{self.relay_url}/api/chat", data=payload,
-                                     headers={"Content-Type": "application/json"}, method="POST")
-        t0 = time.time()
-        with urllib.request.urlopen(req, timeout=300) as resp:
-            data = json.loads(resp.read().decode("utf-8"))
-            elapsed = (time.time() - t0) * 1000
-            text = data.get("message", {}).get("content", "") if "message" in data else data.get("response", "")
-            metadata = {"model": data.get("model", model_tag), "latency_ms": elapsed}
-            return text, metadata
-# ─── Cloud Generation Wrapper ───
-class CloudGenerationWrapper:
-    def __init__(self):
-        self.manager = CloudAPIManager()
-    def generate(self, model_family: str, prompt: str, max_tokens: int = 2048,
-                 temperature: float = 0.7, system: Optional[str] = None):
-        try:
-            resp = self.manager.generate(model_family=model_family, prompt=prompt,
-                                         max_tokens=max_tokens, temperature=temperature, system=system)
-            return resp.text, {"model": resp.model_used, "latency_ms": resp.latency_ms}
-        except RuntimeError as e:
-            return f"[Cloud API Error: {e}]", {"error": str(e)}
-# ─── Telemetry Simulator ───
-def simulate_telemetry(text: str, model_id: str, complexity: float) -> Dict[str, Any]:
-    import random
-    profile = get(model_id) or get("deepseek-r1-8b")
-    num_tokens = min(200, max(20, len(text.split()) * 2))
-    detector = UnifiedThermodynamicDetector(fusion_mode=FusionMode.WEIGHTED)
-    token_verdicts = []
-    for pos in range(num_tokens):
-        risk_level = random.choice(["high", "critical"]) if pos in [5, 12, 18, 25, 35, 45] else \
-                     random.choice(["moderate", "elevated"]) if pos in [8, 15, 22, 30, 40] else "low"
-        fused_score = {"low": random.uniform(0, 0.2), "moderate": random.uniform(0.2, 0.4),
-                       "elevated": random.uniform(0.4, 0.6), "high": random.uniform(0.6, 0.8),
-                       "critical": random.uniform(0.8, 1.0)}[risk_level]
-        verdict = TokenVerdict(position=pos, token_str=f"tok_{pos}",
-            readings=[DetectorReading("epr", random.uniform(0, 1), 0.7, risk_level != "low", {}),
-                      DetectorReading("spilled", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {}),
-                      DetectorReading("twave", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {})],
-            fused_score=fused_score, risk_level=risk_level,
-            recommended_action={"low": Action.NONE, "moderate": Action.NONE, "elevated": Action.GROUND,
-                               "high": Action.REFLECT, "critical": Action.HALT}[risk_level], confidence=0.7)
-        token_verdicts.append(verdict)
-    sequence_verdict = detector.evaluate_sequence(token_verdicts)
-    return {"num_tokens": num_tokens, "hallucination_risk": sequence_verdict.avg_fused_score,
-            "max_risk": sequence_verdict.max_fused_score, "risk_level": sequence_verdict.overall_risk,
-            "recommended_action": sequence_verdict.overall_action.value,
-            "detector_agreement": sequence_verdict.detector_agreement,
-            "trigger_positions": sequence_verdict.trigger_positions[:10],
-            "eep": sequence_verdict.energy_entropy_product,
-            "pti": sequence_verdict.phase_transition_index,
-            "newi": sequence_verdict.newi,
-            "optimal_temp": StochasticResonance.recommend_temperature(complexity, profile.T_c),
-            "T_c": profile.T_c, "mu_base": profile.mu_base, "kappa": profile.kappa}
 # ─── Gradio Interface ───
 def build_space_interface():
-    with gr.Blocks(title="NEXUS OS v2.1 — Real Inference via HF API") as demo:
         gr.Markdown("""
-        # 🔥 NEXUS OS v2.1 — Real LLM Inference
-        **Primary backend: HF Inference API** (free tier, works immediately)
-        This Space provides GENUINE model inference without GPU or paid APIs.
-        Your HF token is already active — just enter a prompt and generate!
         ---
         """)
@@ -311,12 +153,10 @@ def build_space_interface():
                         placeholder="https://your-tunnel.ngrok-free.app",
                         value=os.environ.get("OLLAMA_RELAY_URL", ""),
                         info="Optional: expose local Ollama via ngrok")
-                    use_hf = gr.Checkbox(label="Use HF Inference API (Primary)", value=True,
-                        info="Uses your HF token — free tier available")
-                    use_ollama = gr.Checkbox(label="Use Ollama Relay", value=False,
-                        info="Connect to your local Ollama instance")
-                    use_cloud = gr.Checkbox(label="Use Cloud API Fallback", value=False,
-                        info="DeepSeek/Claude/GPT-5/etc — requires API keys")
                     allow_cloud = gr.Checkbox(label="Allow Cloud Models in Routing", value=True)
                 prompt_input = gr.Textbox(label="Your Prompt",
@@ -342,7 +182,7 @@ def build_space_interface():
             with gr.Column(scale=3):
                 output_text = gr.Textbox(label="Generated Response", lines=20, interactive=False)
                 model_used_text = gr.Textbox(label="Model Used", value="", interactive=False)
-                status_text = gr.Textbox(label="Status", value="Ready", interactive=False)
                 with gr.Row():
                     risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
@@ -361,15 +201,19 @@ def build_space_interface():
         ### About NEXUS OS v2.1
-        **37+ real models** mapped from Ollama + HuggingFace GGUF including:
-        - **Nemotron-3 Nano-Omni 30B** (NVIDIA MoE, 256K context, multimodal)
-        - **OpenSonnet-Lite-MAX** (4B dense, 262K context, Apache-2.0)
-        **Four empirically-validated hallucination detectors:**
-        - **EPR** (arXiv:2509.04492) — Token-level entropy production rate
-        - **Spilled Energy** (arXiv:2602.18671) — Energy discrepancy in autoregressive EBMs
-        - **CK-PLUG** (arXiv:2503.15888) — Confidence Gain for retrieval coupling
-        - **TWAVE** — Landau-Ginzburg BEC order parameter tracking
         **Novel composite signals:** EEP, PTI, NEWI

 """
+NEXUS OS v2.1 — Intelligent Multi-Provider Router
+Queries ALL available free API providers in parallel:
+  HF Inference API, Together AI, Cerebras, Groq, Fireworks, DeepSeek
+Picks the best one based on health + capability match + latency.
+Falls back through the chain if any provider fails.
+Also supports Ollama relay and mock mode as last resort.
 """
 import os
 import sys
 import json
 import time
 from typing import Optional, Dict, Any, List, Tuple
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from nexus_os_v2.model_registry import get, Tier, Capability
 from nexus_os_v2.unified_detector import (
     UnifiedThermodynamicDetector, FusionMode, Action,
     DetectorReading, TokenVerdict, SequenceVerdict,
 )
 from nexus_os_v2.twave_tracker import StochasticResonance
+from nexus_os_v2.intelligent_router import IntelligentRouter, Provider
 try:
     import gradio as gr
     GRADIO_AVAILABLE = False
+# ─── Telemetry Simulator ───
+def simulate_telemetry(text: str, model_id: str, complexity: float) -> Dict[str, Any]:
+    import random
+    profile = get(model_id) or get("deepseek-r1-8b")
+    num_tokens = min(200, max(20, len(text.split()) * 2))
+    detector = UnifiedThermodynamicDetector(fusion_mode=FusionMode.WEIGHTED)
+    token_verdicts = []
+    for pos in range(num_tokens):
+        risk_level = random.choice(["high", "critical"]) if pos in [5, 12, 18, 25, 35, 45] else \
+                     random.choice(["moderate", "elevated"]) if pos in [8, 15, 22, 30, 40] else "low"
+        fused_score = {"low": random.uniform(0, 0.2), "moderate": random.uniform(0.2, 0.4),
+                       "elevated": random.uniform(0.4, 0.6), "high": random.uniform(0.6, 0.8),
+                       "critical": random.uniform(0.8, 1.0)}[risk_level]
+        verdict = TokenVerdict(position=pos, token_str=f"tok_{pos}",
+            readings=[DetectorReading("epr", random.uniform(0, 1), 0.7, risk_level != "low", {}),
+                      DetectorReading("spilled", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {}),
+                      DetectorReading("twave", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {})],
+            fused_score=fused_score, risk_level=risk_level,
+            recommended_action={"low": Action.NONE, "moderate": Action.NONE, "elevated": Action.GROUND,
+                               "high": Action.REFLECT, "critical": Action.HALT}[risk_level], confidence=0.7)
+        token_verdicts.append(verdict)
+    sequence_verdict = detector.evaluate_sequence(token_verdicts)
+    return {"num_tokens": num_tokens, "hallucination_risk": sequence_verdict.avg_fused_score,
+            "max_risk": sequence_verdict.max_fused_score, "risk_level": sequence_verdict.overall_risk,
+            "recommended_action": sequence_verdict.overall_action.value,
+            "detector_agreement": sequence_verdict.detector_agreement,
+            "trigger_positions": sequence_verdict.trigger_positions[:10],
+            "eep": sequence_verdict.energy_entropy_product,
+            "pti": sequence_verdict.phase_transition_index,
+            "newi": sequence_verdict.newi,
+            "optimal_temp": StochasticResonance.recommend_temperature(complexity, profile.T_c),
+            "T_c": profile.T_c, "mu_base": profile.mu_base, "kappa": profile.kappa}
+# ─── Main Generation with Intelligent Router ───
 def generate_with_nexus(
     prompt: str,
     vram: float,
     max_tokens: int,
     fusion_mode: str,
 ) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
     if not prompt.strip():
         return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
     if not profile:
         return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", f"Model {model_id} not found"
+    # Use intelligent router
+    router = IntelligentRouter()
+    # Map model capabilities to routing requirements
+    required_caps = []
+    if Capability.CODING in profile.capabilities:
+        required_caps.append("coding")
+    if Capability.REASONING in profile.capabilities:
+        required_caps.append("reasoning")
+    if Capability.FAST in profile.capabilities:
+        required_caps.append("fast")
+    # Route to best provider
+    result = router.route(
+        prompt=prompt,
+        complexity=complexity,
+        required_capabilities=required_caps,
+        max_tokens=max_tokens,
+        temperature=profile.default_temp,
+        system=system_prompt if system_prompt.strip() else None,
+        ollama_relay_url=ollama_relay_url if use_ollama else None,
+    )
+    # Build status message with fallback chain
+    status = f"Provider: {result.provider.value} | Model: {result.model} | Latency: {result.latency_ms:.0f}ms"
+    if result.fallback_chain:
+        status += f"\nFallback chain:\n" + "\n".join(result.fallback_chain)
     # Simulate telemetry
+    telemetry = simulate_telemetry(result.text, model_id, complexity)
     return (
+        result.text,
+        f"{profile.name} ({result.provider.value})",
         round(telemetry["hallucination_risk"], 3),
         round(telemetry["max_risk"], 3),
         telemetry["num_tokens"],
         round(telemetry["newi"], 3),
         telemetry["recommended_action"],
         str(telemetry["trigger_positions"]),
+        status,
     )
 # ─── Gradio Interface ───
 def build_space_interface():
+    with gr.Blocks(title="NEXUS OS v2.1 — Intelligent Multi-Provider Router") as demo:
         gr.Markdown("""
+        # 🔥 NEXUS OS v2.1 — Intelligent Router
+        **Queries ALL free API providers in parallel and picks the best one.**
+        Supported providers: HF Inference API, Together AI, Cerebras, Groq, Fireworks, DeepSeek
         ---
         """)
                         placeholder="https://your-tunnel.ngrok-free.app",
                         value=os.environ.get("OLLAMA_RELAY_URL", ""),
                         info="Optional: expose local Ollama via ngrok")
+                    use_hf = gr.Checkbox(label="Enable HF Inference API", value=True)
+                    use_ollama = gr.Checkbox(label="Enable Ollama Relay", value=False)
+                    use_cloud = gr.Checkbox(label="Enable Cloud APIs", value=True,
+                        info="Together, Cerebras, Groq, Fireworks, DeepSeek")
                     allow_cloud = gr.Checkbox(label="Allow Cloud Models in Routing", value=True)
                 prompt_input = gr.Textbox(label="Your Prompt",
             with gr.Column(scale=3):
                 output_text = gr.Textbox(label="Generated Response", lines=20, interactive=False)
                 model_used_text = gr.Textbox(label="Model Used", value="", interactive=False)
+                status_text = gr.Textbox(label="Status / Fallback Chain", value="Ready", interactive=False, lines=4)
                 with gr.Row():
                     risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
         ### About NEXUS OS v2.1
+        **Intelligent Multi-Provider Router** — queries all available providers in parallel:
+        - HF Inference API (free tier)
+        - Together AI (free tier)
+        - Cerebras (free tier)
+        - Groq (free tier)
+        - Fireworks AI (free tier)
+        - DeepSeek API (free tier)
+        Picks the best based on health check latency + capability match.
+        **37+ real models** in registry including Nemotron-3 Nano-Omni 30B and OpenSonnet-Lite-MAX
+        **Four empirically-validated hallucination detectors:** EPR, Spilled Energy, CK-PLUG, TWAVE
         **Novel composite signals:** EEP, PTI, NEWI