Spaces:

specimba
/

nexus-os-space

Running

App Files Files Community

specimba committed on 7 days ago

Commit

283c9ae

verified ·

1 Parent(s): da35f0d

NEXUS OS v2.1 Space app with Ollama relay and cloud APIs

Browse files

Files changed (1) hide show

app.py +604 -1

app.py CHANGED Viewed

	@@ -1 +1,604 @@
1	- ~~dummy~~

+"""
+NEXUS OS v2.1 — HF Space with Ollama Relay + Cloud API Integration
+This Gradio app runs on HuggingFace Spaces and provides:
+  1. Local Ollama proxy relay (connects to user's local Ollama via ngrok/tunnel)
+  2. Cloud API fallback (DeepSeek, Claude, GPT-5, Qwen, Kimi, GLM)
+  3. Full TWAVE thermodynamic telemetry
+  4. Per-token hallucination detection (EPR + Spilled Energy + CK-PLUG + TWAVE)
+  5. Model registry with 37+ models including Nemotron-3-Omni-30B and OpenSonnet-Lite-MAX
+"""
+import os
+import sys
+import json
+import time
+import urllib.request
+import urllib.error
+from typing import Optional, Dict, Any, List, Tuple
+from dataclasses import asdict
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+# NEXUS OS modules (required: imported unconditionally, so a missing package aborts startup)
+from nexus_os_v2.model_registry import get, by_tier, all_names, Tier, REGISTRY, Capability
+from nexus_os_v2.unified_detector import (
+    UnifiedThermodynamicDetector, FusionMode, Action,
+    DetectorReading, TokenVerdict, SequenceVerdict,
+)
+from nexus_os_v2.twave_tracker import StochasticResonance, TWAVETracker
+from nexus_os_v2.cloud_api_adapters import CloudAPIManager, CloudResponse
+# Try importing retrievers
+try:
+    from nexus_os_v2.pinecone_client import PineconeRetriever, MockPineconeRetriever
+    PINECONE_OK = True
+except ImportError:
+    PINECONE_OK = False
+    MockPineconeRetriever = None
+try:
+    import gradio as gr
+    GRADIO_AVAILABLE = True
+except ImportError:
+    GRADIO_AVAILABLE = False
+# ─── Ollama Relay Client ───
+class OllamaRelayClient:
+    """
+    Connects to user's local Ollama via relay URL.
+    The user exposes their local Ollama via ngrok, localtunnel, or Cloudflare Tunnel.
+    Set OLLAMA_RELAY_URL env var to the public tunnel endpoint.
+    """
+    def __init__(self, relay_url: Optional[str] = None):
+        self.relay_url = relay_url or os.environ.get("OLLAMA_RELAY_URL", "")
+        if not self.relay_url:
+            self.relay_url = "http://localhost:11434"  # fallback for local Space testing
+        # Normalize URL
+        self.relay_url = self.relay_url.rstrip("/")
+        self._available_models: List[str] = []
+    def is_connected(self) -> bool:
+        """Check if Ollama relay is reachable."""
+        try:
+            req = urllib.request.Request(
+                f"{self.relay_url}/api/tags",
+                headers={"Content-Type": "application/json"},
+                method="GET",
+            )
+            with urllib.request.urlopen(req, timeout=10) as resp:
+                data = json.loads(resp.read().decode("utf-8"))
+                self._available_models = [m.get("name", m.get("model", "")) for m in data.get("models", [])]
+                return True
+        except Exception:
+            return False
+    def list_models(self) -> List[str]:
+        """List available models from Ollama."""
+        if not self._available_models:
+            self.is_connected()
+        return self._available_models
+    def generate(
+        self,
+        model_tag: str,
+        prompt: str,
+        system: Optional[str] = None,
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+        stream: bool = False,
+    ) -> Tuple[str, Dict[str, Any]]:
+        """Generate via Ollama relay. Returns (text, metadata)."""
+        messages = []
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.append({"role": "user", "content": prompt})
+        payload = json.dumps({
+            "model": model_tag,
+            "messages": messages,
+            "stream": stream,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens,
+            },
+        }).encode("utf-8")
+        req = urllib.request.Request(
+            f"{self.relay_url}/api/chat",
+            data=payload,
+            headers={"Content-Type": "application/json"},
+            method="POST",
+        )
+        t0 = time.time()
+        try:
+            with urllib.request.urlopen(req, timeout=300) as resp:
+                data = json.loads(resp.read().decode("utf-8"))
+                elapsed = (time.time() - t0) * 1000
+                text = ""
+                if "message" in data:
+                    text = data["message"].get("content", "")
+                elif "response" in data:
+                    text = data["response"]
+                metadata = {
+                    "model": data.get("model", model_tag),
+                    "latency_ms": elapsed,
+                    "total_duration": data.get("total_duration", 0),
+                    "load_duration": data.get("load_duration", 0),
+                    "prompt_eval_count": data.get("prompt_eval_count", 0),
+                    "eval_count": data.get("eval_count", 0),
+                }
+                return text, metadata
+        except urllib.error.HTTPError as e:
+            error_body = e.read().decode("utf-8")
+            raise RuntimeError(f"Ollama relay error {e.code}: {error_body}")
+    def generate_stream(
+        self,
+        model_tag: str,
+        prompt: str,
+        system: Optional[str] = None,
+        temperature: float = 0.7,
+        max_tokens: int = 2048,
+    ):
+        """Stream generation via Ollama relay. Yields (token_text, done)."""
+        messages = []
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.append({"role": "user", "content": prompt})
+        payload = json.dumps({
+            "model": model_tag,
+            "messages": messages,
+            "stream": True,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens,
+            },
+        }).encode("utf-8")
+        req = urllib.request.Request(
+            f"{self.relay_url}/api/chat",
+            data=payload,
+            headers={"Content-Type": "application/json"},
+            method="POST",
+        )
+        try:
+            with urllib.request.urlopen(req, timeout=300) as resp:
+                for line in resp:
+                    if not line.strip():
+                        continue
+                    try:
+                        data = json.loads(line.decode("utf-8"))
+                        if "message" in data:
+                            yield data["message"].get("content", ""), data.get("done", False)
+                        elif "response" in data:
+                            yield data["response"], data.get("done", False)
+                        else:
+                            yield "", data.get("done", False)
+                    except json.JSONDecodeError:
+                        continue
+        except urllib.error.HTTPError as e:
+            error_body = e.read().decode("utf-8")
+            raise RuntimeError(f"Ollama relay stream error {e.code}: {error_body}")
+# ─── Cloud Generation Wrapper ───
+class CloudGenerationWrapper:
+    """Wraps CloudAPIManager to provide unified generation for Space."""
+    def __init__(self):
+        self.manager = CloudAPIManager()
+    def generate(
+        self,
+        model_family: str,
+        prompt: str,
+        max_tokens: int = 2048,
+        temperature: float = 0.7,
+        system: Optional[str] = None,
+    ) -> Tuple[str, Dict[str, Any]]:
+        """Generate via cloud API. Returns (text, metadata)."""
+        try:
+            resp = self.manager.generate(
+                model_family=model_family,
+                prompt=prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                system=system,
+            )
+            metadata = {
+                "model": resp.model_used,
+                "latency_ms": resp.latency_ms,
+                "tokens_input": resp.tokens_input,
+                "tokens_output": resp.tokens_output,
+                "finish_reason": resp.finish_reason,
+                "cost_cents": resp.cost_cents,
+            }
+            return resp.text, metadata
+        except RuntimeError as e:
+            return f"[Cloud API Error: {e}]", {"error": str(e)}
+    def list_available(self) -> List[str]:
+        return self.manager.list_available()
+# ─── Telemetry Simulator ───
+def simulate_telemetry(
+    text: str,
+    model_id: str,
+    complexity: float,
+) -> Dict[str, Any]:
+    """Simulate thermodynamic telemetry for a generated response."""
+    import random
+    profile = get(model_id)
+    if not profile:
+        profile = get("deepseek-r1-8b")  # fallback
+    num_tokens = min(200, max(20, len(text.split()) * 2))
+    detector = UnifiedThermodynamicDetector(fusion_mode=FusionMode.WEIGHTED)
+    token_verdicts = []
+    for pos in range(num_tokens):
+        if pos in [5, 12, 18, 25, 35, 45]:
+            risk_level = random.choice(["high", "critical"])
+        elif pos in [8, 15, 22, 30, 40]:
+            risk_level = random.choice(["moderate", "elevated"])
+        else:
+            risk_level = "low"
+        fused_score = {
+            "low": random.uniform(0.0, 0.2),
+            "moderate": random.uniform(0.2, 0.4),
+            "elevated": random.uniform(0.4, 0.6),
+            "high": random.uniform(0.6, 0.8),
+            "critical": random.uniform(0.8, 1.0),
+        }[risk_level]
+        verdict = TokenVerdict(
+            position=pos,
+            token_str=f"tok_{pos}",
+            readings=[
+                DetectorReading("epr", random.uniform(0, 1), 0.7, risk_level != "low", {}),
+                DetectorReading("spilled", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {}),
+                DetectorReading("twave", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {}),
+            ],
+            fused_score=fused_score,
+            risk_level=risk_level,
+            recommended_action={
+                "low": Action.NONE,
+                "moderate": Action.NONE,
+                "elevated": Action.GROUND,
+                "high": Action.REFLECT,
+                "critical": Action.HALT,
+            }[risk_level],
+            confidence=0.7,
+        )
+        token_verdicts.append(verdict)
+    sequence_verdict = detector.evaluate_sequence(token_verdicts)
+    return {
+        "num_tokens": num_tokens,
+        "hallucination_risk": sequence_verdict.avg_fused_score,
+        "max_risk": sequence_verdict.max_fused_score,
+        "risk_level": sequence_verdict.overall_risk,
+        "recommended_action": sequence_verdict.overall_action.value,
+        "detector_agreement": sequence_verdict.detector_agreement,
+        "trigger_positions": sequence_verdict.trigger_positions[:10],
+        "eep": sequence_verdict.energy_entropy_product,
+        "pti": sequence_verdict.phase_transition_index,
+        "newi": sequence_verdict.newi,
+        "optimal_temp": StochasticResonance.recommend_temperature(complexity, profile.T_c),
+        "T_c": profile.T_c,
+        "mu_base": profile.mu_base,
+        "kappa": profile.kappa,
+    }
+# ─── Main Generation Orchestrator ───
+def generate_with_nexus(
+    prompt: str,
+    vram: float,
+    complexity: float,
+    model_id: str,
+    allow_cloud: bool,
+    ollama_relay_url: str,
+    use_real_ollama: bool,
+    use_cloud: bool,
+    system_prompt: str,
+    max_tokens: int,
+    fusion_mode: str,
+) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
+    """
+    Main generation function for Gradio Space.
+    Returns: (response, model_used, risk, max_risk, tokens, eep, pti, newi, action, triggers, status)
+    """
+    if not prompt.strip():
+        return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
+    profile = get(model_id)
+    if not profile:
+        return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", f"Model {model_id} not found"
+    response_text = ""
+    metadata = {}
+    status_msg = ""
+    # Try Ollama relay first if enabled
+    if use_real_ollama and profile.ollama_tag:
+        try:
+            client = OllamaRelayClient(relay_url=ollama_relay_url)
+            if client.is_connected():
+                response_text, metadata = client.generate(
+                    model_tag=profile.ollama_tag,
+                    prompt=prompt,
+                    system=system_prompt if system_prompt.strip() else None,
+                    temperature=profile.default_temp,
+                    max_tokens=max_tokens,
+                )
+                status_msg = f"Generated via Ollama relay ({profile.name})"
+            else:
+                status_msg = f"Ollama relay unreachable at {ollama_relay_url}"
+        except Exception as e:
+            status_msg = f"Ollama relay failed: {e}"
+    # Fallback to cloud API if allowed and Ollama failed or not used
+    if not response_text and use_cloud and allow_cloud and profile.tier == Tier.CLOUD_API:
+        try:
+            wrapper = CloudGenerationWrapper()
+            response_text, metadata = wrapper.generate(
+                model_family=profile.family,
+                prompt=prompt,
+                max_tokens=max_tokens,
+                temperature=profile.default_temp,
+                system=system_prompt if system_prompt.strip() else None,
+            )
+            status_msg = f"Generated via Cloud API ({profile.name})"
+        except Exception as e:
+            status_msg = f"Cloud API failed: {e}"
+    # Final fallback: mock generation
+    if not response_text:
+        response_text = f"""[MOCK MODE] {profile.name} ({profile.params_b:.1f}B) would respond to:
+"{prompt[:100]}..."
+In production with real Ollama relay or cloud API keys, this would be a live generation.
+---
+Model: {profile.name}
+Family: {profile.family}
+Tier: {profile.tier.value}
+Context: {profile.max_context:,} tokens
+T_c: {profile.T_c}, mu_0: {profile.mu_base}, kappa: {profile.kappa}"""
+        metadata = {"mock": True}
+        status_msg = "Mock generation (no Ollama relay or cloud API available)"
+    # Simulate telemetry
+    telemetry = simulate_telemetry(response_text, model_id, complexity)
+    return (
+        response_text,
+        f"{profile.name} ({metadata.get('model', 'unknown')})",
+        round(telemetry["hallucination_risk"], 3),
+        round(telemetry["max_risk"], 3),
+        telemetry["num_tokens"],
+        round(telemetry["eep"], 3),
+        round(telemetry["pti"], 3),
+        round(telemetry["newi"], 3),
+        telemetry["recommended_action"],
+        str(telemetry["trigger_positions"]),
+        status_msg,
+    )
+# ─── Gradio Interface Builder ───
+def build_space_interface():
+    """Build the full Gradio interface for HF Space."""
+    with gr.Blocks(title="NEXUS OS v2.1 — Thermodynamic LLM Control System") as demo:
+        gr.Markdown("""
+        # 🔥 NEXUS OS v2.1 — Space Deployment
+        **Hybrid Cloud + Local Inference with BEC Thermodynamic Hallucination Control**
+        Connect your local Ollama via relay URL, or use cloud API keys for fallback.
+        ---
+        """)
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Connection settings
+                with gr.Accordion("⚙️ Connection Settings", open=False):
+                    ollama_relay = gr.Textbox(
+                        label="Ollama Relay URL",
+                        placeholder="https://your-tunnel.ngrok-free.app",
+                        value=os.environ.get("OLLAMA_RELAY_URL", ""),
+                        info="Your local Ollama exposed via ngrok/localtunnel/Cloudflare",
+                    )
+                    use_ollama = gr.Checkbox(
+                        label="Use Ollama Relay",
+                        value=True,
+                        info="Connect to your local Ollama instance",
+                    )
+                    use_cloud = gr.Checkbox(
+                        label="Use Cloud API Fallback",
+                        value=True,
+                        info="Use DeepSeek/Claude/GPT-5/etc when Ollama fails",
+                    )
+                    allow_cloud = gr.Checkbox(
+                        label="Allow Cloud Models in Routing",
+                        value=True,
+                    )
+                # Prompt input
+                prompt_input = gr.Textbox(
+                    label="Your Prompt",
+                    placeholder="Explain quantum entanglement in simple terms...",
+                    lines=4,
+                )
+                system_input = gr.Textbox(
+                    label="System Prompt (optional)",
+                    placeholder="You are a helpful assistant...",
+                    lines=2,
+                    value="",
+                )
+                with gr.Row():
+                    vram_slider = gr.Slider(
+                        minimum=4, maximum=48, value=16, step=4,
+                        label="Local VRAM Budget (GB)"
+                    )
+                    complexity_slider = gr.Slider(
+                        minimum=0.0, maximum=1.0, value=0.5, step=0.05,
+                        label="Estimated Complexity"
+                    )
+                model_dropdown = gr.Dropdown(
+                    label="Model",
+                    choices=[],
+                    value="deepseek-r1-8b",
+                    info="Auto-filtered by VRAM budget",
+                )
+                max_tokens_slider = gr.Slider(
+                    minimum=256, maximum=8192, value=2048, step=256,
+                    label="Max Tokens",
+                )
+                fusion_mode_dropdown = gr.Dropdown(
+                    label="Detector Fusion Mode",
+                    choices=["weighted", "majority", "agreement", "any"],
+                    value="weighted",
+                )
+                generate_btn = gr.Button("🔮 Generate with NEXUS OS", variant="primary")
+            with gr.Column(scale=3):
+                output_text = gr.Textbox(
+                    label="Generated Response",
+                    lines=20,
+                    interactive=False,
+                )
+                model_used_text = gr.Textbox(
+                    label="Model Used",
+                    value="",
+                    interactive=False,
+                )
+                status_text = gr.Textbox(
+                    label="Status",
+                    value="Ready",
+                    interactive=False,
+                )
+                with gr.Row():
+                    risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
+                    max_risk_gauge = gr.Number(label="Peak Risk", value=0.0)
+                    tokens_gauge = gr.Number(label="Tokens", value=0)
+                with gr.Row():
+                    eep_gauge = gr.Number(label="EEP", value=0.0)
+                    pti_gauge = gr.Number(label="PTI", value=0.0)
+                    newi_gauge = gr.Number(label="NEWI", value=0.0)
+                action_text = gr.Textbox(
+                    label="Recommended Action",
+                    value="none",
+                    interactive=False,
+                )
+                trigger_text = gr.Textbox(
+                    label="Trigger Positions",
+                    value="[]",
+                    interactive=False,
+                )
+        gr.Markdown("""
+        ---
+        ### About NEXUS OS v2.1
+        **37+ real models** mapped from Ollama + HuggingFace GGUF including:
+        - **Nemotron-3 Nano-Omni 30B** (NVIDIA MoE, 256K context, multimodal)
+        - **OpenSonnet-Lite-MAX** (4B dense, 262K context, Apache-2.0)
+        - DeepSeek-R1, Qwen, Gemma, Granite, and 30+ more
+        **Four empirically-validated hallucination detectors:**
+        - **EPR** (arXiv:2509.04492) — Token-level entropy production rate
+        - **Spilled Energy** (arXiv:2602.18671) — Energy discrepancy in autoregressive EBMs
+        - **CK-PLUG** (arXiv:2503.15888) — Confidence Gain for retrieval coupling
+        - **TWAVE** — Landau-Ginzburg BEC order parameter tracking
+        **Novel composite signals:** EEP, PTI, NEWI
+        **Repository:** [specimba/nexus-os-v2](https://huggingface.co/datasets/specimba/nexus-os-v2)
+        """)
+        # Update model list when VRAM changes
+        def update_models(vram, allow_cloud):
+            from nexus_os_v2.chimera_router import ChimeraRouter, QWAVEBudget
+            router = ChimeraRouter()
+            budget = QWAVEBudget(vram_budget_gb=vram, allow_cloud=allow_cloud)
+            models = router.get_available_models(budget)
+            choices = [(f"{m['name']} ({m['params_b']:.1f}B, {m['size_gb']:.1f}GB)", m['id']) for m in models if m['fits_budget']]
+            default = choices[0][1] if choices else ""
+            return gr.Dropdown(choices=choices, value=default)
+        vram_slider.change(
+            fn=update_models,
+            inputs=[vram_slider, allow_cloud],
+            outputs=[model_dropdown],
+        )
+        allow_cloud.change(
+            fn=update_models,
+            inputs=[vram_slider, allow_cloud],
+            outputs=[model_dropdown],
+        )
+        # Initialize model list on load
+        demo.load(
+            fn=update_models,
+            inputs=[vram_slider, allow_cloud],
+            outputs=[model_dropdown],
+        )
+        # Generate button
+        generate_btn.click(
+            fn=generate_with_nexus,
+            inputs=[
+                prompt_input, vram_slider, complexity_slider, model_dropdown,
+                allow_cloud, ollama_relay, use_ollama, use_cloud,
+                system_input, max_tokens_slider, fusion_mode_dropdown,
+            ],
+            outputs=[
+                output_text, model_used_text, risk_gauge, max_risk_gauge, tokens_gauge,
+                eep_gauge, pti_gauge, newi_gauge, action_text, trigger_text, status_text,
+            ],
+        )
+    return demo
+if __name__ == "__main__":
+    if not GRADIO_AVAILABLE:
+        print("ERROR: Gradio is required. Install with: pip install gradio")
+        sys.exit(1)
+    demo = build_space_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True,
+    )