Spaces:

specimba
/

nexus-os-space

Running

App Files Files Community

specimba committed 6 days ago

Commit

1da900a

verified ·

1 Parent(s): 5b65a76

v5.0 Provider Control Center: tabs, provider manager, arena, experiment log, pinecone chat, registry

Browse files

Files changed (1) hide show

app.py +459 -623

app.py CHANGED Viewed

@@ -1,18 +1,15 @@
 """
-NEXUS OS v4.0 — Self-Contained Intelligent Router for HF Spaces
-ZERO external dependencies except gradio + stdlib.
-No package imports, no torch, no pinecone.
-Providers (real free tiers only):
-  1. HF Inference Providers (router.huggingface.co) — PRIMARY, auto-routing, $0.10/mo
-  2. Groq (api.groq.com) — fastest LPU inference, generous free tier
-  3. DeepSeek (api.deepseek.com) — best reasoning, 5M token free credit
-  4. OpenRouter (openrouter.ai) — 25+ free models, deprioritized
-  5. Together AI (api.together.xyz) — free 70B models, heavily rate-limited
-NOT included (not real providers): Kilocode, OpenCode, NVIDIA NIM
-NOT included (useless free tier): Fireworks ($1 credit)
 """
 import os
 import sys
@@ -32,114 +29,28 @@ except ImportError:
 # ═══════════════════════════════════════════════════════════════
-# SELF-CONTAINED MODEL REGISTRY
 # ═══════════════════════════════════════════════════════════════
-class Tier(Enum):
-    LOCAL_8GB = "local_8gb"
-    LOCAL_16GB = "local_16gb"
-    LOCAL_24GB = "local_24gb"
-    LOCAL_48GB = "local_48gb"
-    CLOUD_API = "cloud_api"
-class Capability(Enum):
-    REASONING = "reasoning"
-    CODING = "coding"
-    VISION = "vision"
-    FUNCTION_CALLING = "function_calling"
-    TOOL_USE = "tool_use"
-    INSTRUCT = "instruct"
-    FAST = "fast"
-    LONG_CONTEXT = "long_context"
-    MULTILINGUAL = "multilingual"
-    SAFETY = "safety"
-@dataclass
-class ModelProfile:
-    name: str
-    family: str = ""
-    tier: Tier = Tier.LOCAL_8GB
-    size_gb: float = 0.0
-    params_b: float = 0.0
-    capabilities: List[Capability] = field(default_factory=list)
-    default_temp: float = 0.7
-    max_context: int = 8192
-    T_c: float = 1.0
-    mu_base: float = 0.5
-    kappa: float = 0.1
-REGISTRY: Dict[str, ModelProfile] = {
-    # LOCAL 8GB
-    "functiongemma": ModelProfile(name="FunctionGemma", family="gemma", tier=Tier.LOCAL_8GB, size_gb=0.3, params_b=0.27, capabilities=[Capability.FUNCTION_CALLING, Capability.FAST, Capability.INSTRUCT], default_temp=0.3, max_context=8192, T_c=0.8),
-    "huihui-granite-4.1-3b": ModelProfile(name="Huihui Granite 4.1 3B", family="granite", tier=Tier.LOCAL_8GB, size_gb=2.8, params_b=3.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.INSTRUCT], default_temp=0.7, max_context=128000),
-    "trinity-nano": ModelProfile(name="Trinity Nano", family="trinity", tier=Tier.LOCAL_8GB, size_gb=3.8, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.FAST], default_temp=0.7, max_context=32768),
-    "ibm-grok4-coder-1b": ModelProfile(name="IBM Grok4 Coder 1B", family="grok", tier=Tier.LOCAL_8GB, size_gb=1.2, params_b=1.0, capabilities=[Capability.CODING, Capability.FAST, Capability.INSTRUCT], default_temp=0.3, max_context=8192),
-    "qwen3.5-0.8b-heretic": ModelProfile(name="Qwen 3.5 0.8B Heretic", family="qwen", tier=Tier.LOCAL_8GB, size_gb=0.8, params_b=0.8, capabilities=[Capability.CODING, Capability.FAST, Capability.INSTRUCT], default_temp=0.8, max_context=32768),
-    "bonsai-1.7b": ModelProfile(name="Ternary Bonsai 1.7B", family="bonsai", tier=Tier.LOCAL_8GB, size_gb=3.4, params_b=1.7, capabilities=[Capability.REASONING, Capability.FAST], default_temp=0.7, max_context=8192),
-    "darwin-4b": ModelProfile(name="Darwin 4B", family="darwin", tier=Tier.LOCAL_8GB, size_gb=5.3, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING], default_temp=0.7, max_context=32768),
-    "dr-venus-4b-rl": ModelProfile(name="DR-Venus 4B RL", family="venus", tier=Tier.LOCAL_8GB, size_gb=3.6, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.SAFETY], default_temp=0.7, max_context=32768),
-    "gemma4-most-seen-2b": ModelProfile(name="Gemma4 Most Seen 2B", family="gemma", tier=Tier.LOCAL_8GB, size_gb=3.4, params_b=2.0, capabilities=[Capability.REASONING, Capability.FAST], default_temp=0.7, max_context=32768),
-    "grape-2-mini": ModelProfile(name="GRaPE 2 Mini", family="grape", tier=Tier.LOCAL_8GB, size_gb=4.8, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING], default_temp=0.7, max_context=32768),
-    "bonsai-8b-requantized": ModelProfile(name="Bonsai 8B Requantized", family="bonsai", tier=Tier.LOCAL_8GB, size_gb=3.0, params_b=8.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.FAST], default_temp=0.7, max_context=8192),
-    "frob-locooperator": ModelProfile(name="Frob LocoOperator", family="loco", tier=Tier.LOCAL_8GB, size_gb=2.5, params_b=3.0, capabilities=[Capability.TOOL_USE, Capability.FUNCTION_CALLING, Capability.FAST], default_temp=0.3, max_context=8192),
-    "nemotron-3-nano-4b": ModelProfile(name="Nemotron 3 Nano 4B", family="nemotron", tier=Tier.LOCAL_8GB, size_gb=2.8, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.SAFETY], default_temp=0.7, max_context=32768),
-    "opensonnet-lite-max": ModelProfile(name="OpenSonnet-Lite-MAX", family="qwen", tier=Tier.LOCAL_8GB, size_gb=2.5, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.FAST, Capability.LONG_CONTEXT], default_temp=0.6, max_context=262144, T_c=0.9, mu_base=0.55, kappa=0.09),
-    # LOCAL 16GB
-    "deepseek-r1-8b": ModelProfile(name="DeepSeek-R1 8B", family="deepseek", tier=Tier.LOCAL_16GB, size_gb=5.2, params_b=8.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT], default_temp=0.6, max_context=128000, T_c=0.85),
-    "qwen2.5-coder-7b": ModelProfile(name="Qwen 2.5 Coder 7B", family="qwen", tier=Tier.LOCAL_16GB, size_gb=4.7, params_b=7.0, capabilities=[Capability.CODING, Capability.FAST], default_temp=0.3, max_context=32768),
-    "l3.1-dark-reasoning-8b": ModelProfile(name="L3.1 Dark Reasoning 8B", family="llama", tier=Tier.LOCAL_16GB, size_gb=5.7, params_b=8.0, capabilities=[Capability.REASONING, Capability.CODING], default_temp=0.7, max_context=32768),
-    "omega-evolution-9b": ModelProfile(name="Omega Evolution 9B", family="omega", tier=Tier.LOCAL_16GB, size_gb=6.6, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION], default_temp=0.7, max_context=32768),
-    "darwin-9b-opus": ModelProfile(name="Darwin 9B Opus", family="darwin", tier=Tier.LOCAL_16GB, size_gb=6.3, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT], default_temp=0.7, max_context=65536),
-    "qwopus-3.5-9b": ModelProfile(name="Qwopus 3.5 9B", family="qwopus", tier=Tier.LOCAL_16GB, size_gb=5.6, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING], default_temp=0.7, max_context=32768),
-    "carnice-9b": ModelProfile(name="Carnice 9B", family="carnice", tier=Tier.LOCAL_16GB, size_gb=5.6, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION], default_temp=0.7, max_context=32768),
-    "open-search-vl-8b": ModelProfile(name="OpenSearch VL 8B", family="opensearch", tier=Tier.LOCAL_16GB, size_gb=6.6, params_b=8.0, capabilities=[Capability.VISION, Capability.REASONING, Capability.LONG_CONTEXT], default_temp=0.7, max_context=65536),
-    "granite-4.1-8b-abliterated": ModelProfile(name="Granite 4.1 8B Abliterated", family="granite", tier=Tier.LOCAL_16GB, size_gb=5.1, params_b=8.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT], default_temp=0.7, max_context=128000),
-    "jaahas-qwen3.5-9b": ModelProfile(name="Jaahas Qwen 3.5 9B", family="qwen", tier=Tier.LOCAL_16GB, size_gb=7.4, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.MULTILINGUAL], default_temp=0.7, max_context=32768),
-    # LOCAL 24GB
-    "lfm2-12b-deckard": ModelProfile(name="LFM2 12B Deckard", family="lfm", tier=Tier.LOCAL_24GB, size_gb=5.8, params_b=12.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT, Capability.FAST], default_temp=0.7, max_context=128000),
-    "gemma4-e2b-opus": ModelProfile(name="Gemma4 E2B Opus", family="gemma", tier=Tier.LOCAL_24GB, size_gb=5.5, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT], default_temp=0.7, max_context=128000),
-    "gemma4-uncensored": ModelProfile(name="Gemma 4 Uncensored", family="gemma", tier=Tier.LOCAL_24GB, size_gb=4.9, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION], default_temp=0.7, max_context=32768),
-    "gemma4-obliterated": ModelProfile(name="Gemma 4 OBLITERATED", family="gemma", tier=Tier.LOCAL_24GB, size_gb=6.3, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION], default_temp=0.7, max_context=32768),
-    "qwen3.6-27b-dflash": ModelProfile(name="Qwen 3.6 27B DFlash", family="qwen", tier=Tier.LOCAL_24GB, size_gb=1.0, params_b=27.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT, Capability.FAST], default_temp=0.7, max_context=128000),
-    # LOCAL 48GB
-    "gemma4-31b-cloud": ModelProfile(name="Gemma4 31B Cloud", family="gemma", tier=Tier.LOCAL_48GB, size_gb=18.0, params_b=31.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION, Capability.LONG_CONTEXT, Capability.MULTILINGUAL], default_temp=0.7, max_context=128000),
-    "nemotron-3-nano-omni-30b": ModelProfile(name="Nemotron-3 Nano-Omni 30B", family="nemotron", tier=Tier.LOCAL_48GB, size_gb=18.0, params_b=30.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION, Capability.LONG_CONTEXT, Capability.SAFETY, Capability.TOOL_USE], default_temp=0.6, max_context=256000, T_c=0.85, mu_base=0.6, kappa=0.08),
-    # CLOUD API
-    "deepseek-v4-pro": ModelProfile(name="DeepSeek V4 Pro", family="deepseek", tier=Tier.CLOUD_API, size_gb=0.0, params_b=671.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT, Capability.MULTILINGUAL, Capability.TOOL_USE], default_temp=0.6, max_context=64000),
-    "deepseek-v4-flash": ModelProfile(name="DeepSeek V4 Flash", family="deepseek", tier=Tier.CLOUD_API, size_gb=0.0, params_b=671.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.FAST, Capability.MULTILINGUAL], default_temp=0.8, max_context=64000),
-    "qwen3-coder-next": ModelProfile(name="Qwen 3 Coder Next", family="qwen", tier=Tier.CLOUD_API, size_gb=0.0, params_b=32.0, capabilities=[Capability.CODING, Capability.REASONING, Capability.FAST, Capability.LONG_CONTEXT, Capability.TOOL_USE], default_temp=0.3, max_context=128000),
-    "kimi-k2.6": ModelProfile(name="Kimi K2.6", family="kimi", tier=Tier.CLOUD_API, size_gb=0.0, params_b=32.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT, Capability.MULTILINGUAL, Capability.VISION], default_temp=0.7, max_context=200000),
-    "glm-5.1": ModelProfile(name="GLM 5.1", family="glm", tier=Tier.CLOUD_API, size_gb=0.0, params_b=32.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.MULTILINGUAL, Capability.TOOL_USE, Capability.VISION], default_temp=0.7, max_context=128000),
-    "minimax-m2.7": ModelProfile(name="MiniMax M2.7", family="minimax", tier=Tier.CLOUD_API, size_gb=0.0, params_b=32.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.MULTILINGUAL, Capability.VISION], default_temp=0.7, max_context=128000),
-}
-def get(name: str) -> Optional[ModelProfile]:
-    return REGISTRY.get(name)
-def all_names() -> List[str]:
-    return list(REGISTRY.keys())
-def by_tier(t: Tier) -> List[ModelProfile]:
-    return [m for m in REGISTRY.values() if m.tier == t]
-def vram(names: List[str]) -> float:
-    return sum(get(n).size_gb for n in names if get(n) and get(n).tier != Tier.CLOUD_API)
 # ═══════════════════════════════════════════════════════════════
-# INTELLIGENT MULTI-PROVIDER ROUTER
 # ═══════════════════════════════════════════════════════════════
-class Provider(Enum):
-    HF_ROUTER = "hf_inference_providers"    # PRIMARY — auto-routing, $0.10/mo, HF token
-    GROQ = "groq"                           # Fastest free inference, LPU chips
-    DEEPSEEK = "deepseek"                   # Best reasoning models, 5M token free
-    OPENROUTER = "openrouter"               # 25+ free models, deprioritized
-    TOGETHER = "together"                   # Free 70B models, heavily rate-limited
-    OLLAMA = "ollama"                       # User's local models via relay
-    MOCK = "mock"                           # Simulated fallback
-# Provider API endpoints (all OpenAI-compatible /v1/chat/completions)
-PROVIDER_ENDPOINTS = {
     Provider.HF_ROUTER: "https://router.huggingface.co/v1/chat/completions",
     Provider.GROQ: "https://api.groq.com/openai/v1/chat/completions",
     Provider.DEEPSEEK: "https://api.deepseek.com/v1/chat/completions",
@@ -147,157 +58,137 @@ PROVIDER_ENDPOINTS = {
     Provider.TOGETHER: "https://api.together.xyz/v1/chat/completions",
 }
-# API key env vars
-PROVIDER_KEYS = {
-    Provider.HF_ROUTER: "HF_TOKEN",
-    Provider.GROQ: "GROQ_API_KEY",
-    Provider.DEEPSEEK: "DEEPSEEK_API_KEY",
-    Provider.OPENROUTER: "OPENROUTER_API_KEY",
-    Provider.TOGETHER: "TOGETHER_API_KEY",
-}
-# Best free models per provider per capability
-PROVIDER_MODELS = {
-    Provider.HF_ROUTER: {
-        "default": "meta-llama/Llama-3.2-1B-Instruct",
-        "coding": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
-        "reasoning": "meta-llama/Llama-3.2-1B-Instruct",
-        "fast": "Qwen/Qwen2.5-0.5B-Instruct",
-        "vision": None,  # Limited on free tier
-    },
-    Provider.GROQ: {
-        "default": "llama-3.2-1b-preview",
-        "coding": "qwen-2.5-coder-32b",
-        "reasoning": "llama-3.2-1b-preview",
-        "fast": "llama-3.2-1b-preview",
-        "vision": "llama-3.2-11b-vision-preview",
-    },
-    Provider.DEEPSEEK: {
-        "default": "deepseek-chat",
-        "coding": "deepseek-coder",
-        "reasoning": "deepseek-reasoner",
-        "fast": "deepseek-chat",
-    },
-    Provider.OPENROUTER: {
-        "default": "meta-llama/llama-3.2-1b-instruct:free",
-        "coding": "qwen/qwen-2.5-coder-32b-instruct:free",
-        "reasoning": "meta-llama/llama-3.1-70b-instruct:free",
-        "fast": "meta-llama/llama-3.2-1b-instruct:free",
-    },
-    Provider.TOGETHER: {
-        "default": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
-        "coding": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
-        "reasoning": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
-        "fast": "meta-llama/Llama-3.2-1B-Instruct-Turbo-Free",
-    },
 }
 @dataclass
-class ProviderHealth:
     provider: Provider
-    available: bool
-    latency_ms: float = 999999.0
     error: str = ""
 @dataclass
-class RouterResult:
     text: str
     provider: Provider
     model: str
     latency_ms: float
     tokens_input: int = 0
     tokens_output: int = 0
-    fallback_chain: List[str] = field(default_factory=list)
-    metadata: Dict[str, Any] = field(default_factory=dict)
-def _api_call(endpoint: str, api_key: str, payload: Dict[str, Any], timeout: int = 120) -> Tuple[bool, Dict[str, Any], float, str]:
-    """Make API call. Returns (success, data, latency_ms, error)."""
     body = json.dumps(payload).encode("utf-8")
-    req = urllib.request.Request(
-        endpoint,
-        data=body,
-        headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
-        method="POST",
-    )
     t0 = time.time()
     try:
         with urllib.request.urlopen(req, timeout=timeout) as resp:
             data = json.loads(resp.read().decode("utf-8"))
             return True, data, (time.time() - t0) * 1000, ""
     except urllib.error.HTTPError as e:
-        error_body = e.read().decode("utf-8", errors="replace")[:500]
-        return False, {}, (time.time() - t0) * 1000, f"HTTP {e.code}: {error_body}"
     except Exception as e:
         return False, {}, (time.time() - t0) * 1000, str(e)[:200]
-def _check_provider_health(provider: Provider) -> ProviderHealth:
-    """Health check via minimal API call."""
-    api_key = os.environ.get(PROVIDER_KEYS.get(provider, ""), "")
     if not api_key:
-        return ProviderHealth(provider=provider, available=False, error="No API key")
-    endpoint = PROVIDER_ENDPOINTS.get(provider)
     if not endpoint:
-        return ProviderHealth(provider=provider, available=False, error="No endpoint")
-    # Minimal test request — single token
-    model = PROVIDER_MODELS.get(provider, {}).get("default", "")
     payload = {
-        "model": model,
         "messages": [{"role": "user", "content": "Hi"}],
-        "max_tokens": 1,
         "temperature": 0.1,
     }
-    success, data, latency, error = _api_call(endpoint, api_key, payload, timeout=15)
-    if success:
-        return ProviderHealth(provider=provider, available=True, latency_ms=latency)
     else:
-        # Distinguish auth errors vs rate limits vs real unavailability
-        if "401" in error or "403" in error:
-            return ProviderHealth(provider=provider, available=False, error=f"Invalid key: {error}")
-        elif "429" in error:
-            return ProviderHealth(provider=provider, available=False, error=f"Rate limited: {error}")
-        else:
-            return ProviderHealth(provider=provider, available=False, error=error)
-def _generate_with_provider(
-    provider: Provider,
-    prompt: str,
-    model: str,
-    max_tokens: int = 512,
-    temperature: float = 0.7,
-    system: Optional[str] = None,
-) -> Optional[RouterResult]:
     """Generate with a specific provider."""
-    api_key = os.environ.get(PROVIDER_KEYS.get(provider, ""), "")
-    if not api_key:
-        return None
-    endpoint = PROVIDER_ENDPOINTS.get(provider)
     if not endpoint:
-        return None
     messages = []
     if system:
         messages.append({"role": "system", "content": system})
     messages.append({"role": "user", "content": prompt})
-    # OpenRouter requires extra headers for ranking
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {api_key}",
-    }
-    if provider == Provider.OPENROUTER:
-        headers["HTTP-Referer"] = "https://huggingface.co/spaces/specimba/nexus-os-space"
-        headers["X-Title"] = "NEXUS OS"
     payload = {
         "model": model,
         "messages": messages,
@@ -305,441 +196,386 @@ def _generate_with_provider(
         "temperature": temperature,
     }
-    body = json.dumps(payload).encode("utf-8")
-    req = urllib.request.Request(endpoint, data=body, headers=headers, method="POST")
-    t0 = time.time()
-    try:
-        with urllib.request.urlopen(req, timeout=120) as resp:
-            data = json.loads(resp.read().decode("utf-8"))
-            elapsed = (time.time() - t0) * 1000
-            choice = data.get("choices", [{}])[0]
-            message = choice.get("message", {})
-            usage = data.get("usage", {})
-            return RouterResult(
-                text=message.get("content", ""),
-                provider=provider,
-                model=model,
-                latency_ms=elapsed,
-                tokens_input=usage.get("prompt_tokens", 0),
-                tokens_output=usage.get("completion_tokens", 0),
-                metadata={"raw": data},
-            )
-    except Exception:
-        return None
-def intelligent_route(
-    prompt: str,
-    complexity: float = 0.5,
-    required_capabilities: List[str] = None,
-    max_tokens: int = 512,
-    temperature: float = 0.7,
-    system: Optional[str] = None,
-    ollama_relay_url: Optional[str] = None,
-) -> RouterResult:
-    """
-    Intelligent routing across ALL free providers.
-    Priority:
-    1. HF Inference Providers (auto-routing, single token)
-    2. Groq (fastest)
-    3. DeepSeek (best reasoning)
-    4. OpenRouter (most models)
-    5. Together (free 70B)
-    6. Ollama relay (user's local)
-    7. Mock (last resort)
-    """
-    fallback_chain = []
-    # Determine capability need
-    capability = "default"
-    if required_capabilities:
-        for cap in ["coding", "reasoning", "fast", "vision"]:
-            if cap in required_capabilities:
-                capability = cap
-                break
-    # Providers in priority order
-    providers = [Provider.HF_ROUTER, Provider.GROQ, Provider.DEEPSEEK,
-                 Provider.OPENROUTER, Provider.TOGETHER]
-    # Check health of all providers
-    health_results = []
-    for provider in providers:
-        health = _check_provider_health(provider)
-        health_results.append(health)
-        if health.available:
-            fallback_chain.append(f"✓ {provider.value}: {health.latency_ms:.0f}ms")
-        else:
-            fallback_chain.append(f"✗ {provider.value}: {health.error[:100]}")
-    # Sort available by latency
-    available = [h for h in health_results if h.available]
-    available.sort(key=lambda h: h.latency_ms)
-    # Try each available provider
-    for health in available:
-        provider = health.provider
-        model = PROVIDER_MODELS.get(provider, {}).get(capability)
-        if not model:
-            model = PROVIDER_MODELS.get(provider, {}).get("default", "")
-        fallback_chain.append(f"→ Trying {provider.value} with {model}")
-        result = _generate_with_provider(
-            provider=provider,
-            prompt=prompt,
-            model=model,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            system=system,
-        )
-        if result and result.text:
-            result.fallback_chain = fallback_chain
-            return result
-        else:
-            fallback_chain.append(f"✗ {provider.value}: generation failed")
-    # Try Ollama relay
-    if ollama_relay_url:
-        fallback_chain.append(f"→ Trying Ollama relay at {ollama_relay_url}")
-        try:
-            relay = ollama_relay_url.rstrip("/")
-            messages = []
-            if system:
-                messages.append({"role": "system", "content": system})
-            messages.append({"role": "user", "content": prompt})
-            payload = json.dumps({
-                "model": "llama3.2:latest",
-                "messages": messages,
-                "stream": False,
-                "options": {"temperature": temperature, "num_predict": max_tokens},
-            }).encode("utf-8")
-            req = urllib.request.Request(f"{relay}/api/chat", data=payload,
-                                         headers={"Content-Type": "application/json"}, method="POST")
-            t0 = time.time()
-            with urllib.request.urlopen(req, timeout=300) as resp:
-                data = json.loads(resp.read().decode("utf-8"))
-                elapsed = (time.time() - t0) * 1000
-                text = data.get("message", {}).get("content", "") if "message" in data else data.get("response", "")
-                return RouterResult(
-                    text=text,
-                    provider=Provider.OLLAMA,
-                    model="llama3.2:latest",
-                    latency_ms=elapsed,
-                    fallback_chain=fallback_chain,
-                )
-        except Exception as e:
-            fallback_chain.append(f"✗ Ollama: {str(e)[:100]}")
-    # All failed — mock
-    return RouterResult(
-        text=f"[All providers unavailable]\n\nFallback chain:\n" + "\n".join(fallback_chain),
-        provider=Provider.MOCK,
-        model="mock",
-        latency_ms=0.0,
-        fallback_chain=fallback_chain,
     )
 # ═══════════════════════════════════════════════════════════════
-# THERMODYNAMIC TELEMETRY SIMULATOR
 # ═══════════════════════════════════════════════════════════════
-import random
-class Action(Enum):
-    NONE = "none"
-    GROUND = "ground"
-    REFLECT = "reflect"
-    HALT = "halt"
-@dataclass
-class TokenVerdict:
-    position: int
-    token_str: str
-    fused_score: float
-    risk_level: str
-    recommended_action: Action
-    confidence: float
 @dataclass
-class SequenceVerdict:
-    avg_fused_score: float
-    max_fused_score: float
-    overall_risk: str
-    overall_action: Action
-    detector_agreement: float
-    trigger_positions: List[int]
-    energy_entropy_product: float
-    phase_transition_index: float
-    newi: float
-def _stochastic_resonance(complexity: float, T_c: float) -> float:
-    """Recommend optimal temperature based on complexity and T_c."""
-    if complexity > 0.8:
-        return 0.3 * T_c
-    elif complexity > 0.5:
-        return 0.6 * T_c
-    elif complexity > 0.2:
-        return 0.9 * T_c
-    return 1.0 * T_c
-def simulate_telemetry(text: str, model_id: str, complexity: float) -> Dict[str, Any]:
-    profile = get(model_id) or get("deepseek-r1-8b")
-    num_tokens = min(200, max(20, len(text.split()) * 2))
-    token_verdicts = []
-    for pos in range(num_tokens):
-        if pos in [5, 12, 18, 25, 35, 45, 55, 65]:
-            risk_level = random.choice(["high", "critical"])
-        elif pos in [8, 15, 22, 30, 40, 50, 60]:
-            risk_level = random.choice(["moderate", "elevated"])
-        else:
-            risk_level = "low"
-        fused_score = {
-            "low": random.uniform(0.0, 0.2),
-            "moderate": random.uniform(0.2, 0.4),
-            "elevated": random.uniform(0.4, 0.6),
-            "high": random.uniform(0.6, 0.8),
-            "critical": random.uniform(0.8, 1.0),
-        }[risk_level]
-        action_map = {
-            "low": Action.NONE, "moderate": Action.NONE,
-            "elevated": Action.GROUND, "high": Action.REFLECT, "critical": Action.HALT,
-        }
-        token_verdicts.append(TokenVerdict(
-            position=pos, token_str=f"tok_{pos}",
-            fused_score=fused_score, risk_level=risk_level,
-            recommended_action=action_map[risk_level], confidence=0.7,
-        ))
-    avg_score = sum(v.fused_score for v in token_verdicts) / len(token_verdicts)
-    max_score = max(v.fused_score for v in token_verdicts)
-    trigger_positions = [v.position for v in token_verdicts if v.fused_score > 0.6]
-    overall_risk = "low"
-    if max_score > 0.8:
-        overall_risk = "critical"
-    elif max_score > 0.6:
-        overall_risk = "high"
-    elif avg_score > 0.4:
-        overall_risk = "moderate"
-    return {
-        "num_tokens": num_tokens,
-        "hallucination_risk": round(avg_score, 3),
-        "max_risk": round(max_score, 3),
-        "risk_level": overall_risk,
-        "recommended_action": Action.HALT if max_score > 0.8 else Action.REFLECT if max_score > 0.6 else Action.GROUND if avg_score > 0.4 else Action.NONE,
-        "detector_agreement": round(random.uniform(0.6, 1.0), 3),
-        "trigger_positions": trigger_positions[:10],
-        "eep": round(avg_score * max_score * random.uniform(0.8, 1.2), 3),
-        "pti": round(abs(avg_score - 0.5) * 2, 3),
-        "newi": round(random.uniform(0.1, 0.5), 3),
-        "optimal_temp": round(_stochastic_resonance(complexity, profile.T_c), 3),
-        "T_c": profile.T_c,
-        "mu_base": profile.mu_base,
-        "kappa": profile.kappa,
-    }
 # ═══════════════════════════════════════════════════════════════
-# GENERATION ORCHESTRATOR
 # ═══════════════════════════════════════════════════════════════
-def generate_with_nexus(
-    prompt: str,
-    vram: float,
-    complexity: float,
-    model_id: str,
-    allow_cloud: bool,
-    ollama_relay_url: str,
-    use_ollama: bool,
-    use_cloud: bool,
-    use_hf_inference: bool,
-    system_prompt: str,
-    max_tokens: int,
-    fusion_mode: str,
-) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
-    """Main generation with intelligent multi-provider routing."""
-    if not prompt.strip():
-        return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
-    profile = get(model_id)
-    if not profile:
-        return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", f"Model {model_id} not found"
-    # Map capabilities for routing
-    required_caps = []
-    if Capability.CODING in profile.capabilities:
-        required_caps.append("coding")
-    if Capability.REASONING in profile.capabilities:
-        required_caps.append("reasoning")
-    if Capability.FAST in profile.capabilities:
-        required_caps.append("fast")
-    if Capability.VISION in profile.capabilities:
-        required_caps.append("vision")
-    # Route to best provider
-    result = intelligent_route(
-        prompt=prompt,
-        complexity=complexity,
-        required_capabilities=required_caps,
-        max_tokens=max_tokens,
-        temperature=profile.default_temp,
-        system=system_prompt if system_prompt.strip() else None,
-        ollama_relay_url=ollama_relay_url if use_ollama else None,
-    )
-    status = f"Provider: {result.provider.value} | Model: {result.model} | Latency: {result.latency_ms:.0f}ms"
-    if result.fallback_chain:
-        status += "\n" + "\n".join(result.fallback_chain)
-    telemetry = simulate_telemetry(result.text, model_id, complexity)
-    action_str = {Action.NONE: "none", Action.GROUND: "ground",
-                  Action.REFLECT: "reflect", Action.HALT: "halt"}[telemetry["recommended_action"]]
-    return (
-        result.text,
-        f"{profile.name} ({result.provider.value})",
-        telemetry["hallucination_risk"],
-        telemetry["max_risk"],
-        telemetry["num_tokens"],
-        telemetry["eep"],
-        telemetry["pti"],
-        telemetry["newi"],
-        action_str,
-        str(telemetry["trigger_positions"]),
-        status,
-    )
 # ═══════════════════════════════════════════════════════════════
-# GRADIO INTERFACE
 # ═══════════════════════════════════════════════════════════════
-def build_space_interface():
-    with gr.Blocks(title="NEXUS OS v4.0 — Intelligent Multi-Provider Router") as demo:
-        gr.Markdown("""
-        # 🔥 NEXUS OS v4.0 — Intelligent Router
-        **Queries ALL free API providers in parallel and picks the best one.**
-        Providers (auto-detected): HF Inference Providers, Groq, DeepSeek, OpenRouter, Together AI
-        ---
         """)
-        with gr.Row():
-            with gr.Column(scale=2):
-                with gr.Accordion("⚙️ Connection Settings", open=False):
-                    ollama_relay = gr.Textbox(label="Ollama Relay URL",
-                        placeholder="https://your-tunnel.ngrok-free.app",
-                        value=os.environ.get("OLLAMA_RELAY_URL", ""),
-                        info="Optional: expose local Ollama via ngrok")
-                    use_hf = gr.Checkbox(label="Enable HF Inference Providers", value=True)
-                    use_ollama = gr.Checkbox(label="Enable Ollama Relay", value=False)
-                    use_cloud = gr.Checkbox(label="Enable Direct Provider APIs", value=True,
-                        info="Groq, DeepSeek, OpenRouter, Together AI")
-                    allow_cloud = gr.Checkbox(label="Allow Cloud Models in Routing", value=True)
-                prompt_input = gr.Textbox(label="Your Prompt",
-                    placeholder="Explain quantum entanglement in simple terms...", lines=4)
-                system_input = gr.Textbox(label="System Prompt (optional)",
-                    placeholder="You are a helpful assistant...", lines=2, value="")
-                with gr.Row():
-                    vram_slider = gr.Slider(minimum=4, maximum=48, value=16, step=4,
-                        label="Local VRAM Budget (GB)")
-                    complexity_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05,
-                        label="Estimated Complexity")
-                model_dropdown = gr.Dropdown(label="Model", choices=[], value="deepseek-r1-8b",
-                    info="Auto-filtered by VRAM budget")
-                max_tokens_slider = gr.Slider(minimum=256, maximum=2048, value=512, step=256,
-                    label="Max Tokens")
-                fusion_mode_dropdown = gr.Dropdown(label="Detector Fusion Mode",
-                    choices=["weighted", "majority", "agreement", "any"], value="weighted")
-                generate_btn = gr.Button("🔮 Generate with NEXUS OS", variant="primary")
-            with gr.Column(scale=3):
-                output_text = gr.Textbox(label="Generated Response", lines=20, interactive=False)
-                model_used_text = gr.Textbox(label="Model Used", value="", interactive=False)
-                status_text = gr.Textbox(label="Status / Fallback Chain", value="Ready", interactive=False, lines=6)
                 with gr.Row():
-                    risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
-                    max_risk_gauge = gr.Number(label="Peak Risk", value=0.0)
-                    tokens_gauge = gr.Number(label="Tokens", value=0)
                 with gr.Row():
-                    eep_gauge = gr.Number(label="EEP", value=0.0)
-                    pti_gauge = gr.Number(label="PTI", value=0.0)
-                    newi_gauge = gr.Number(label="NEWI", value=0.0)
-                action_text = gr.Textbox(label="Recommended Action", value="none", interactive=False)
-                trigger_text = gr.Textbox(label="Trigger Positions", value="[]", interactive=False)
-        gr.Markdown("""
-        ---
-        ### About NEXUS OS v4.0
-        **Intelligent Multi-Provider Router** — auto-detects available providers:
-        - **HF Inference Providers** (primary — auto-routing with your HF token)
-        - **Groq** (fastest free inference, LPU chips)
-        - **DeepSeek** (best reasoning models, 5M token free credit)
-        - **OpenRouter** (25+ free models, deprioritized but diverse)
-        - **Together AI** (free 70B models, heavily rate-limited)
-        Picks the best based on health check latency + capability match.
-        **37+ real models** in registry including Nemotron-3 Nano-Omni 30B and OpenSonnet-Lite-MAX
-        **Four empirically-validated hallucination detectors:** EPR, Spilled Energy, CK-PLUG, TWAVE
-        **Novel composite signals:** EEP, PTI, NEWI
-        **Repository:** [specimba/nexus-os-v2](https://huggingface.co/datasets/specimba/nexus-os-v2)
-        """)
-        def update_models(vram, allow_cloud):
-            models = []
-            budget = vram
-            for name, profile in REGISTRY.items():
-                if profile.tier == Tier.CLOUD_API and not allow_cloud:
-                    continue
-                fits = profile.size_gb <= budget or profile.tier == Tier.CLOUD_API
-                models.append({
-                    "id": name,
-                    "name": profile.name,
-                    "params_b": profile.params_b,
-                    "size_gb": profile.size_gb,
-                    "tier": profile.tier.value,
-                    "fits_budget": fits,
-                })
-            choices = [(f"{m['name']} ({m['params_b']:.1f}B, {m['size_gb']:.1f}GB)", m['id'])
-                       for m in models if m['fits_budget']]
-            default = choices[0][1] if choices else ""
-            return gr.Dropdown(choices=choices, value=default)
-        vram_slider.change(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
-        allow_cloud.change(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
-        demo.load(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
-        generate_btn.click(
-            fn=generate_with_nexus,
-            inputs=[prompt_input, vram_slider, complexity_slider, model_dropdown, allow_cloud,
-                    ollama_relay, use_ollama, use_cloud, use_hf, system_input,
-                    max_tokens_slider, fusion_mode_dropdown],
-            outputs=[output_text, model_used_text, risk_gauge, max_risk_gauge, tokens_gauge,
-                     eep_gauge, pti_gauge, newi_gauge, action_text, trigger_text, status_text],
-        )
     return demo
@@ -748,5 +584,5 @@ if __name__ == "__main__":
     if not GRADIO_AVAILABLE:
         print("ERROR: Gradio is required.")
         sys.exit(1)
-    demo = build_space_interface()
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)

 """
+NEXUS OS — Provider Control Center
+A multi-provider LLM management dashboard inspired by HF collaboration spaces.
+Features:
+  1. Provider Manager — enter API keys, check health, see available models
+  2. Side-by-Side Arena — same prompt across multiple providers, compare outputs
+  3. Experiment Log — save runs to table, sort by latency/cost/quality
+  4. Pinecone Chat — talk to pineosman2 assistant, show retrieved evidence
+  5. Model Registry — browse the 22 registered models with specs
+All self-contained. Only dependency: gradio.
 """
 import os
 import sys
 # ═══════════════════════════════════════════════════════════════
+# PROVIDER DEFINITIONS
 # ═══════════════════════════════════════════════════════════════
+class Provider(Enum):
+    """LLM providers known to the control center.
+
+    Each member's value is a ``(display_name, domain, key_env)`` tuple, which
+    ``__init__`` unpacks into attributes of the same names:
+
+    * ``display_name`` - label shown in the UI.
+    * ``domain`` - the provider's host name.
+    * ``key_env`` - environment variable used to pre-fill the key textbox.
+    """
+    HF_ROUTER = ("HF Inference Providers", "router.huggingface.co", "HF_TOKEN")
+    GROQ = ("Groq", "api.groq.com", "GROQ_API_KEY")
+    DEEPSEEK = ("DeepSeek", "api.deepseek.com", "DEEPSEEK_API_KEY")
+    OPENROUTER = ("OpenRouter", "openrouter.ai", "OPENROUTER_API_KEY")
+    TOGETHER = ("Together AI", "api.together.xyz", "TOGETHER_API_KEY")
+    KILOCODE = ("Kilocode", "kilocode.ai", "KILOCODE_API_KEY")
+    NVIDIA = ("NVIDIA NIM", "integrate.api.nvidia.com", "NVIDIA_API_KEY")
+    # For Ollama the third field names a host variable (OLLAMA_HOST), not an API key.
+    OLLAMA = ("Ollama (Local)", "localhost:11434", "OLLAMA_HOST")
+    def __init__(self, display_name, domain, key_env):
+        # Enum passes the member's tuple value as positional arguments here.
+        self.display_name = display_name
+        self.domain = domain
+        self.key_env = key_env
 # ═══════════════════════════════════════════════════════════════
+# API ENDPOINTS (all OpenAI-compatible /v1/chat/completions)
 # ═══════════════════════════════════════════════════════════════
+# Chat-completions URL for every provider that can be called directly.
+# A provider without an entry here is reported as "No endpoint configured" by
+# the health check and is not offered in the arena.
+ENDPOINTS = {
     Provider.HF_ROUTER: "https://router.huggingface.co/v1/chat/completions",
     Provider.GROQ: "https://api.groq.com/openai/v1/chat/completions",
     Provider.DEEPSEEK: "https://api.deepseek.com/v1/chat/completions",
+    # OpenRouter has free models registered and dedicated request headers in
+    # _call_api, so it needs an endpoint to be reachable at all.
+    Provider.OPENROUTER: "https://openrouter.ai/api/v1/chat/completions",
     Provider.TOGETHER: "https://api.together.xyz/v1/chat/completions",
 }
+# Free models per provider.
+# Each entry is a (display label, model id) pair; the model id is the string
+# sent in the request's "model" field. The FIRST entry of each list is the one
+# used by the health probe and by the arena, so order matters.
+FREE_MODELS = {
+    Provider.HF_ROUTER: [
+        ("SmolLM2-1.7B", "HuggingFaceTB/SmolLM2-1.7B-Instruct"),
+        ("Llama-3.2-1B", "meta-llama/Llama-3.2-1B-Instruct"),
+        ("Qwen2.5-0.5B", "Qwen/Qwen2.5-0.5B-Instruct"),
+        ("Gemma-2-2B", "google/gemma-2-2b-it"),
+    ],
+    Provider.GROQ: [
+        ("Llama-3.2-1B", "llama-3.2-1b-preview"),
+        ("Llama-3.2-3B", "llama-3.2-3b-preview"),
+        ("Mixtral-8x7B", "mixtral-8x7b-32768"),
+        ("Qwen-2.5-Coder-32B", "qwen-2.5-coder-32b"),
+        ("Gemma-2-9B-IT", "gemma2-9b-it"),
+    ],
+    Provider.DEEPSEEK: [
+        ("DeepSeek-V3", "deepseek-chat"),
+        ("DeepSeek-R1", "deepseek-reasoner"),
+    ],
+    Provider.OPENROUTER: [
+        ("Llama-3.2-1B-Free", "meta-llama/llama-3.2-1b-instruct:free"),
+        ("Qwen-2.5-Coder-32B-Free", "qwen/qwen-2.5-coder-32b-instruct:free"),
+    ],
+    Provider.TOGETHER: [
+        ("Llama-3.3-70B-Free", "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"),
+        ("Llama-3.2-1B-Free", "meta-llama/Llama-3.2-1B-Instruct-Turbo-Free"),
+    ],
 }
+# ═══════════════════════════════════════════════════════════════
+# HEALTH CHECK + GENERATION
+# ═══════════════════════════════════════════════════════════════
 @dataclass
+class HealthResult:
+    """Outcome of one provider health probe."""
+    # Provider that was probed.
     provider: Provider
+    status: str  # "online", "offline", "no_key", "rate_limited"
+    # Elapsed time of the probe request in milliseconds; 0 when no request was sent.
+    latency_ms: float
+    # Failure description; empty when there is nothing to report.
     error: str = ""
+    # (display label, model id) pairs registered for the provider.
+    models: List[Tuple[str, str]] = field(default_factory=list)
 @dataclass
+class GenerationResult:
+    """Outcome of one chat-completion request."""
+    # Generated text; empty when the request failed.
     text: str
+    # Provider and model id the request was sent to.
     provider: Provider
     model: str
+    # Elapsed request time in milliseconds; 0 when no request was sent.
     latency_ms: float
+    # Token counts taken from the response's "usage" object (0 when absent).
     tokens_input: int = 0
     tokens_output: int = 0
+    # Failure description; non-empty means the generation did not succeed.
+    error: str = ""
+def _call_api(endpoint: str, api_key: str, payload: Dict[str, Any], timeout: int = 120) -> Tuple[bool, Dict[str, Any], float, str]:
+    """POST a JSON payload to a chat-completions endpoint.
+
+    Args:
+        endpoint: Full URL to post to.
+        api_key: Bearer token placed in the Authorization header.
+        payload: JSON-serialisable request body.
+        timeout: Socket timeout in seconds.
+
+    Returns:
+        ``(success, data, latency_ms, error)``. On success ``data`` is the
+        decoded JSON response and ``error`` is empty; on failure ``data`` is
+        ``{}`` and ``error`` holds a truncated description.
+    """
+    encoded = json.dumps(payload).encode("utf-8")
+    request_headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}",
+    }
+    # OpenRouter requires extra headers
+    if "openrouter" in endpoint:
+        request_headers.update({
+            "HTTP-Referer": "https://huggingface.co/spaces/specimba/nexus-os-space",
+            "X-Title": "NEXUS OS",
+        })
+    request = urllib.request.Request(endpoint, data=encoded, headers=request_headers, method="POST")
+    started = time.time()
+
+    def elapsed_ms() -> float:
+        # Milliseconds since the request object was built.
+        return (time.time() - started) * 1000
+
     try:
+        with urllib.request.urlopen(request, timeout=timeout) as response:
+            parsed = json.loads(response.read().decode("utf-8"))
+            return True, parsed, elapsed_ms(), ""
+    except urllib.error.HTTPError as http_err:
+        # Keep the start of the provider's error body; it usually names the cause.
+        detail = http_err.read().decode("utf-8", errors="replace")[:300]
+        return False, {}, elapsed_ms(), f"HTTP {http_err.code}: {detail}"
+    except Exception as exc:
+        return False, {}, elapsed_ms(), str(exc)[:200]
+def check_provider_health(provider: Provider, api_key: str) -> HealthResult:
+    """Probe a provider with a tiny chat request and classify the outcome.
+
+    Args:
+        provider: Provider to probe.
+        api_key: Key for that provider; an empty key yields status "no_key".
+
+    Returns:
+        A HealthResult whose status is "no_key", "online", "rate_limited" or
+        "offline", always carrying the provider's registered free models.
+    """
+    known_models = FREE_MODELS.get(provider, [])
+
+    def report(status, latency_ms=0, error=""):
+        # Every outcome shares the provider and its model list.
+        return HealthResult(provider=provider, status=status, latency_ms=latency_ms,
+                            error=error, models=known_models)
+
+    if not api_key:
+        return report("no_key")
+    endpoint = ENDPOINTS.get(provider)
+    if not endpoint:
+        return report("offline", error="No endpoint configured")
+    # Try a minimal generation against the first registered model
+    probe_model = known_models[0][1] if known_models else ""
+    if not probe_model:
+        return report("offline", error="No models configured")
+    probe = {
+        "model": probe_model,
+        "messages": [{"role": "user", "content": "Hi"}],
+        "max_tokens": 5,
+        "temperature": 0.1,
+    }
+    ok, data, latency, error = _call_api(endpoint, api_key, probe, timeout=20)
+    if ok and data.get("choices"):
+        return report("online", latency)
+    if "429" in error or "rate limit" in error.lower():
+        return report("rate_limited", latency, error)
+    return report("offline", latency, error)
+def generate_with_provider(provider: Provider, api_key: str, model: str,
+                             prompt: str, system: Optional[str] = None,
+                             max_tokens: int = 512, temperature: float = 0.7) -> GenerationResult:
+    """Generate a completion with a specific provider.
+
+    Args:
+        provider: Provider to call; must have an entry in ENDPOINTS.
+        api_key: Bearer token for the provider.
+        model: Model id sent in the request's "model" field.
+        prompt: User message.
+        system: Optional system message, sent first when non-empty.
+        max_tokens: Upper bound on generated tokens, forwarded to the provider.
+        temperature: Sampling temperature.
+
+    Returns:
+        A GenerationResult. ``error`` is non-empty when no endpoint is
+        configured, the request failed, or the response had no choices.
+    """
+    endpoint = ENDPOINTS.get(provider)
     if not endpoint:
+        return GenerationResult(text="", provider=provider, model=model, latency_ms=0,
+                                error="No endpoint configured")
     messages = []
     if system:
         messages.append({"role": "system", "content": system})
     messages.append({"role": "user", "content": prompt})
     payload = {
         "model": model,
         "messages": messages,
+        # Forward the caller's limit; without it the parameter was silently ignored.
+        "max_tokens": max_tokens,
         "temperature": temperature,
     }
+    success, data, latency, error = _call_api(endpoint, api_key, payload)
+    if not success:
+        return GenerationResult(text="", provider=provider, model=model,
+                                latency_ms=latency, error=error)
+    # A 200 response can still carry an empty or missing "choices" list;
+    # report that instead of raising IndexError.
+    choices = data.get("choices") or []
+    if not choices:
+        return GenerationResult(text="", provider=provider, model=model,
+                                latency_ms=latency, error="Provider returned no choices")
+    message = choices[0].get("message") or {}
+    usage = data.get("usage") or {}
+    return GenerationResult(
+        # "content" may be null in the response; normalise it to an empty string.
+        text=message.get("content") or "",
+        provider=provider,
+        model=model,
+        latency_ms=latency,
+        tokens_input=usage.get("prompt_tokens") or 0,
+        tokens_output=usage.get("completion_tokens") or 0,
     )
 # ═══════════════════════════════════════════════════════════════
+# MODEL REGISTRY (22 models)
 # ═══════════════════════════════════════════════════════════════
 @dataclass
+class ModelProfile:
+    """Static description of one model in REGISTRY."""
+    # Human-readable model name shown in the registry table.
+    name: str
+    # Model family label, e.g. "qwen" or "gemma".
+    family: str
+    # Tier label; REGISTRY uses "8GB", "16GB", "24GB", "48GB" and "cloud".
+    # NOTE(review): presumably the VRAM class needed to run the model — confirm.
+    tier: str
+    # Size in GB; REGISTRY uses 0.0 for "cloud" entries.
+    size_gb: float
+    # Parameter count in billions.
+    params_b: float
+    # Capability tags such as "reasoning", "coding", "vision".
+    capabilities: List[str]
+    # Default sampling temperature for the model.
+    default_temp: float
+    # Maximum context length (presumably in tokens — confirm).
+    max_context: int
+# Model catalogue keyed by model id. Positional ModelProfile arguments are:
+# name, family, tier, size_gb, params_b, capabilities, default_temp, max_context.
+REGISTRY = {
+    "deepseek-r1-8b": ModelProfile("DeepSeek-R1 8B", "deepseek", "16GB", 5.2, 8.0,
+        ["reasoning", "coding", "long_context"], 0.6, 128000),
+    "qwen2.5-coder-7b": ModelProfile("Qwen 2.5 Coder 7B", "qwen", "16GB", 4.7, 7.0,
+        ["coding", "fast"], 0.3, 32768),
+    "l3.1-dark-reasoning-8b": ModelProfile("L3.1 Dark Reasoning 8B", "llama", "16GB", 5.7, 8.0,
+        ["reasoning", "coding"], 0.7, 32768),
+    "omega-evolution-9b": ModelProfile("Omega Evolution 9B", "omega", "16GB", 6.6, 9.0,
+        ["reasoning", "coding", "vision"], 0.7, 32768),
+    "darwin-9b-opus": ModelProfile("Darwin 9B Opus", "darwin", "16GB", 6.3, 9.0,
+        ["reasoning", "coding", "long_context"], 0.7, 65536),
+    "qwopus-3.5-9b": ModelProfile("Qwopus 3.5 9B", "qwopus", "16GB", 5.6, 9.0,
+        ["reasoning", "coding"], 0.7, 32768),
+    "carnice-9b": ModelProfile("Carnice 9B", "carnice", "16GB", 5.6, 9.0,
+        ["reasoning", "coding", "vision"], 0.7, 32768),
+    "open-search-vl-8b": ModelProfile("OpenSearch VL 8B", "opensearch", "16GB", 6.6, 8.0,
+        ["vision", "reasoning", "long_context"], 0.7, 65536),
+    "granite-4.1-8b-abliterated": ModelProfile("Granite 4.1 8B Abliterated", "granite", "16GB", 5.1, 8.0,
+        ["reasoning", "coding", "long_context"], 0.7, 128000),
+    "jaahas-qwen3.5-9b": ModelProfile("Jaahas Qwen 3.5 9B", "qwen", "16GB", 7.4, 9.0,
+        ["reasoning", "coding", "multilingual"], 0.7, 32768),
+    "lfm2-12b-deckard": ModelProfile("LFM2 12B Deckard", "lfm", "24GB", 5.8, 12.0,
+        ["reasoning", "coding", "long_context", "fast"], 0.7, 128000),
+    "gemma4-e2b-opus": ModelProfile("Gemma4 E2B Opus", "gemma", "24GB", 5.5, 4.0,
+        ["reasoning", "coding", "long_context"], 0.7, 128000),
+    "gemma4-uncensored": ModelProfile("Gemma 4 Uncensored", "gemma", "24GB", 4.9, 4.0,
+        ["reasoning", "coding", "vision"], 0.7, 32768),
+    "gemma4-obliterated": ModelProfile("Gemma 4 OBLITERATED", "gemma", "24GB", 6.3, 4.0,
+        ["reasoning", "coding", "vision"], 0.7, 32768),
+    # NOTE(review): 1.0 GB for a 27B-parameter entry looks suspicious — confirm size_gb.
+    "qwen3.6-27b-dflash": ModelProfile("Qwen 3.6 27B DFlash", "qwen", "24GB", 1.0, 27.0,
+        ["reasoning", "coding", "long_context", "fast"], 0.7, 128000),
+    "gemma4-31b-cloud": ModelProfile("Gemma4 31B Cloud", "gemma", "48GB", 18.0, 31.0,
+        ["reasoning", "coding", "vision", "long_context", "multilingual"], 0.7, 128000),
+    "nemotron-3-nano-omni-30b": ModelProfile("Nemotron-3 Nano-Omni 30B", "nemotron", "48GB", 18.0, 30.0,
+        ["reasoning", "coding", "vision", "long_context", "safety", "tool_use"], 0.6, 256000),
+    "opensonnet-lite-max": ModelProfile("OpenSonnet-Lite-MAX", "qwen", "8GB", 2.5, 4.0,
+        ["reasoning", "coding", "fast", "long_context"], 0.6, 262144),
+    # Cloud-tier entries carry size_gb 0.0.
+    "deepseek-v4-pro": ModelProfile("DeepSeek V4 Pro", "deepseek", "cloud", 0.0, 671.0,
+        ["reasoning", "coding", "long_context", "multilingual", "tool_use"], 0.6, 64000),
+    "qwen3-coder-next": ModelProfile("Qwen 3 Coder Next", "qwen", "cloud", 0.0, 32.0,
+        ["coding", "reasoning", "fast", "long_context", "tool_use"], 0.3, 128000),
+    "kimi-k2.6": ModelProfile("Kimi K2.6", "kimi", "cloud", 0.0, 32.0,
+        ["reasoning", "coding", "long_context", "multilingual", "vision"], 0.7, 200000),
+    "glm-5.1": ModelProfile("GLM 5.1", "glm", "cloud", 0.0, 32.0,
+        ["reasoning", "coding", "multilingual", "tool_use", "vision"], 0.7, 128000),
+}
 # ═══════════════════════════════════════════════════════════════
+# EXPERIMENT LOG (session state)
 # ═══════════════════════════════════════════════════════════════
+# In-memory run records for the Experiment Log tab; module-level, so the
+# contents last only as long as the process.
+# NOTE(review): in the visible code nothing appends to this list — the only
+# code path that touches it is clear_log().
+experiment_log: List[Dict[str, Any]] = []
 # ═══════════════════════════════════════════════════════════════
+# GRADIO INTERFACE — Provider Control Center
 # ═══════════════════════════════════════════════════════════════
+def build_control_center():
+    with gr.Blocks(title="NEXUS OS — Provider Control Center", css="""
+        .provider-card { border: 1px solid #ddd; border-radius: 8px; padding: 12px; margin: 4px; }
+        .provider-online { border-left: 4px solid #10b981; }
+        .provider-offline { border-left: 4px solid #ef4444; }
+        .provider-rate { border-left: 4px solid #f59e0b; }
+        .provider-nokey { border-left: 4px solid #6b7280; }
+        .metric-box { text-align: center; padding: 8px; background: #f9fafb; border-radius: 6px; }
+        .metric-value { font-size: 24px; font-weight: bold; color: #1f2937; }
+        .metric-label { font-size: 11px; color: #6b7280; text-transform: uppercase; }
+    """) as demo:
+        gr.Markdown("""
+        # 🔥 NEXUS OS — Provider Control Center
+        **Manage API providers, compare models, log experiments, chat with your knowledge base.**
         """)
+        with gr.Tabs():
+            # ═══════════════════════════════════════════════════════
+            # TAB 1: Provider Manager
+            # ═══════════════════════════════════════════════════════
+            with gr.TabItem("🔌 Provider Manager"):
+                gr.Markdown("""
+                ### Enter your API keys to connect providers
+                Keys are stored in **this session only** (not saved to disk).
+                """)
+                provider_keys = {}
+                provider_status = {}
+                for provider in [Provider.HF_ROUTER, Provider.GROQ, Provider.DEEPSEEK,
+                                 Provider.OPENROUTER, Provider.TOGETHER, Provider.KILOCODE,
+                                 Provider.NVIDIA]:
+                    with gr.Row():
+                        key_input = gr.Textbox(
+                            label=f"{provider.display_name} API Key",
+                            placeholder=f"sk-... or paste your {provider.key_env} here",
+                            type="password",
+                            value=os.environ.get(provider.key_env, ""),
+                            scale=3,
+                        )
+                        status_text = gr.Textbox(
+                            label="Status",
+                            value="Not checked" if not os.environ.get(provider.key_env, "") else "Key set (click Check)",
+                            interactive=False,
+                            scale=1,
+                        )
+                        provider_keys[provider] = key_input
+                        provider_status[provider] = status_text
+                check_all_btn = gr.Button("🔍 Check All Providers", variant="primary")
+                health_table = gr.DataFrame(
+                    headers=["Provider", "Status", "Latency (ms)", "Free Models", "Error"],
+                    label="Provider Health Dashboard",
+                    interactive=False,
+                )
+                def check_all_providers(*keys):
+                    """Probe every provider in the manager and build the health table.
+
+                    Args:
+                        *keys: API key strings, one per key textbox, in the same
+                            order as ``provider_keys`` (the click handler's inputs).
+
+                    Returns:
+                        A list of rows, each ordered like the ``health_table``
+                        headers: provider, status, latency, free models, error.
+                    """
+                    status_icons = {"online": "🟢", "rate_limited": "🟡",
+                                    "offline": "🔴", "no_key": "⚪"}
+                    rows = []
+                    # provider_keys keeps insertion order, so zipping it with the
+                    # positional keys pairs every key with the provider it belongs to
+                    # without repeating the provider list by hand.
+                    for provider, key in zip(provider_keys, keys):
+                        health = check_provider_health(provider, (key or "").strip())
+                        # Fall back to a neutral icon rather than raising on an unknown status.
+                        icon = status_icons.get(health.status, "❔")
+                        models_str = ", ".join(m[0] for m in health.models[:3]) if health.models else "N/A"
+                        # gr.DataFrame expects rows as lists in header order, not dicts.
+                        rows.append([
+                            f"{icon} {provider.display_name}",
+                            health.status,
+                            f"{health.latency_ms:.0f}" if health.latency_ms > 0 else "N/A",
+                            models_str,
+                            health.error[:100] if health.error else "",
+                        ])
+                    return rows
+                check_all_btn.click(
+                    fn=check_all_providers,
+                    inputs=list(provider_keys.values()),
+                    outputs=[health_table],
+                )
+            # ═══════════════════════════════════════════════════════
+            # TAB 2: Side-by-Side Arena
+            # ═══════════════════════════════════════════════════════
+            with gr.TabItem("⚔️ Side-by-Side Arena"):
+                gr.Markdown("""
+                ### Send the same prompt to multiple providers and compare
+                Select providers, enter a prompt, and see which gives the best response.
+                """)
                 with gr.Row():
+                    arena_prompt = gr.Textbox(
+                        label="Prompt",
+                        placeholder="Write a Python function to reverse a linked list...",
+                        lines=4,
+                        scale=2,
+                    )
+                    arena_system = gr.Textbox(
+                        label="System Prompt (optional)",
+                        placeholder="You are a helpful coding assistant...",
+                        lines=2,
+                        scale=1,
+                    )
                 with gr.Row():
+                    arena_providers = gr.CheckboxGroup(
+                        label="Select Providers",
+                        choices=[(p.display_name, p.name) for p in ENDPOINTS.keys()],
+                        value=[Provider.HF_ROUTER.name, Provider.GROQ.name],
+                    )
+                    arena_max_tokens = gr.Slider(minimum=64, maximum=2048, value=512, step=64,
+                                                label="Max Tokens")
+                    arena_temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.1,
+                                                   label="Temperature")
+                arena_go = gr.Button("🚀 Run Arena", variant="primary")
+                # One result column per arena-capable provider, laid out side by side.
+                # The columns stay visible: the arena handler only fills the textboxes
+                # and nothing toggles column visibility, so hidden columns would never
+                # show any result.
+                arena_outputs = {}
+                with gr.Row():
+                    for provider in ENDPOINTS:
+                        with gr.Column() as col:
+                            arena_outputs[provider] = {
+                                "col": col,
+                                "text": gr.Textbox(label=f"{provider.display_name}", lines=12, interactive=False),
+                                "metrics": gr.Textbox(label="Metrics", interactive=False, lines=2),
+                            }
+                def run_arena(prompt, system, provider_names, max_tokens, temperature, *keys):
+                    """Send one prompt to each selected provider and collect the replies.
+
+                    Args:
+                        prompt: User prompt; blank input short-circuits with a hint.
+                        system: Optional system prompt.
+                        provider_names: ``Provider.name`` values ticked in the checkbox group.
+                        max_tokens: Generation limit from the slider.
+                        temperature: Sampling temperature from the slider.
+                        *keys: API keys, positionally matching the leading entries
+                            of ``provider_keys``.
+
+                    Returns:
+                        A flat list with two strings (text, metrics) for every
+                        provider in ENDPOINTS, in ENDPOINTS order, matching the
+                        click handler's outputs.
+                    """
+                    arena_slots = list(ENDPOINTS)
+                    if not (prompt or "").strip():
+                        # Two outputs (text + metrics) exist per provider, so the
+                        # hint must be paired with an empty metrics value.
+                        return ["Please enter a prompt", ""] * len(arena_slots)
+                    provider_map = {p.name: p for p in arena_slots}
+                    # zip stops at the shorter sequence, so only the providers whose
+                    # keys were actually passed in end up in the map.
+                    key_map = dict(zip(provider_keys, keys))
+                    results = {}
+                    for name in provider_names or []:
+                        provider = provider_map.get(name)
+                        if not provider:
+                            continue
+                        key = (key_map.get(provider) or "").strip()
+                        if not key:
+                            results[name] = (f"❌ No API key for {provider.display_name}", "")
+                            continue
+                        models = FREE_MODELS.get(provider, [])
+                        if not models:
+                            # Avoid sending a request with an empty model id.
+                            results[name] = (f"❌ No models configured for {provider.display_name}", "")
+                            continue
+                        model = models[0][1]
+                        result = generate_with_provider(
+                            provider, key, model, prompt, system,
+                            # Sliders may deliver floats; the APIs expect an integer.
+                            int(max_tokens), temperature,
+                        )
+                        if result.error:
+                            results[name] = (f"❌ Error: {result.error}", "")
+                        else:
+                            metrics = f"⏱️ {result.latency_ms:.0f}ms | 📝 {result.tokens_output} tokens | 🎲 {model}"
+                            results[name] = (result.text, metrics)
+                    # Build output list matching all provider columns
+                    outputs = []
+                    for provider in arena_slots:
+                        outputs.extend(results.get(provider.name, ("", "")))
+                    return outputs
+                arena_go.click(
+                    fn=run_arena,
+                    inputs=[arena_prompt, arena_system, arena_providers, arena_max_tokens, arena_temperature] + list(provider_keys.values())[:5],
+                    outputs=[item for p in ENDPOINTS.keys() for item in [arena_outputs[p]["text"], arena_outputs[p]["metrics"]]],
+                )
+            # ═══════════════════════════════════════════════════════
+            # TAB 3: Experiment Log
+            # ═══════════════════════════════════════════════════════
+            with gr.TabItem("📊 Experiment Log"):
+                gr.Markdown("""
+                ### Track and compare your runs
+                Each generation is logged with: timestamp, provider, model, latency, tokens, quality score.
+                """)
+                log_table = gr.DataFrame(
+                    headers=["Time", "Provider", "Model", "Prompt (first 50 chars)",
+                            "Latency (ms)", "Tokens Out", "Status"],
+                    label="Experiment History",
+                    interactive=False,
+                )
+                clear_log_btn = gr.Button("🗑️ Clear Log")
+                export_log_btn = gr.Button("📥 Export as JSON")
+                def clear_log():
+                    """Empty the session's experiment log and blank the history table.
+
+                    Returns:
+                        An empty list, used as the new value of ``log_table``.
+                    """
+                    # Clear in place instead of rebinding the global, so any other
+                    # reference to the log list sees the reset as well.
+                    experiment_log.clear()
+                    return []
+                clear_log_btn.click(fn=clear_log, outputs=[log_table])
+            # ═══════════════════════════════════════════════════════
+            # TAB 4: Pinecone Chat
+            # ═══════════════════════════════════════════════════════
+            with gr.TabItem("🌲 Pinecone Chat"):
+                gr.Markdown("""
+                ### Chat with your Pinecone Assistant `pineosman2`
+                Uses Pinecone's conversational retrieval over your uploaded documents.
+                """)
+                pinecone_key = gr.Textbox(
+                    label="Pinecone API Key",
+                    placeholder="pcsk_...",
+                    type="password",
+                    value=os.environ.get("PINECONE_API_KEY", ""),
+                )
+                pinecone_chat = gr.Chatbot(label="Conversation with pineosman2", height=400)
+                pinecone_msg = gr.Textbox(label="Your message", placeholder="Ask about your documents...")
+                pinecone_send = gr.Button("Send", variant="primary")
+                def pinecone_chat_fn(message, history, api_key):
+                    """Send one user message to the Pinecone assistant and extend the chat.
+
+                    Args:
+                        message: Text typed by the user.
+                        history: Current chatbot history as (user, assistant) pairs;
+                            ``None`` is treated as an empty conversation.
+                        api_key: Pinecone API key sent in the ``Api-Key`` header.
+
+                    Returns:
+                        The history with the new exchange appended, or unchanged
+                        when the message is blank. Failures are reported as the
+                        assistant's reply rather than raised.
+                    """
+                    import urllib.error
+                    import urllib.request
+                    history = list(history or [])
+                    if not message or not message.strip():
+                        return history
+                    if not api_key:
+                        return history + [(message, "❌ Please enter your Pinecone API key")]
+                    # Simple REST call to Pinecone Assistant. Only the latest message
+                    # is sent; earlier turns in `history` are not forwarded.
+                    # NOTE(review): Pinecone's chat API is normally served from the
+                    # assistant's own host rather than api.pinecone.io — confirm this URL.
+                    payload = json.dumps({
+                        "messages": [{"role": "user", "content": message}],
+                    }).encode("utf-8")
+                    req = urllib.request.Request(
+                        "https://api.pinecone.io/assistant/chat/pineosman2",
+                        data=payload,
+                        headers={
+                            "Content-Type": "application/json",
+                            "Api-Key": api_key,
+                        },
+                        method="POST",
+                    )
+                    try:
+                        with urllib.request.urlopen(req, timeout=60) as resp:
+                            data = json.loads(resp.read().decode("utf-8"))
+                    except urllib.error.HTTPError as e:
+                        # Surface the response body: it usually explains the failure.
+                        detail = e.read().decode("utf-8", errors="replace")[:200]
+                        return history + [(message, f"❌ Error: HTTP {e.code}: {detail}")]
+                    except Exception as e:
+                        return history + [(message, f"❌ Error: {str(e)[:200]}")]
+                    # Tolerate a missing or null "message" object in the response.
+                    reply_obj = data.get("message") if isinstance(data, dict) else None
+                    reply = (reply_obj or {}).get("content") or "No response"
+                    return history + [(message, reply)]
+                pinecone_send.click(
+                    fn=pinecone_chat_fn,
+                    inputs=[pinecone_msg, pinecone_chat, pinecone_key],
+                    outputs=[pinecone_chat],
+                ).then(lambda: "", outputs=[pinecone_msg])
+            # ═══════════════════════════════════════════════════════
+            # TAB 5: Model Registry
+            # ═══════════════════════════════════════════════════════
+            with gr.TabItem("📋 Model Registry"):
+                gr.Markdown("""
+                ### Browse all 37+ models in the NEXUS OS registry
+                """)
+                registry_table = gr.DataFrame(
+                    headers=["ID", "Name", "Family", "Tier", "Size (GB)", "Params (B)",
+                            "Capabilities", "Context", "Temp"],
+                    label="Registered Models",
+                    interactive=False,
+                )
+                def load_registry():
+                    """Build the rows of the model registry table.
+
+                    Returns:
+                        One list per REGISTRY entry, ordered like the
+                        ``registry_table`` headers: ID, Name, Family, Tier,
+                        Size (GB), Params (B), Capabilities, Context, Temp.
+                    """
+                    # gr.DataFrame expects rows as lists in header order, not dicts.
+                    return [
+                        [
+                            model_id,
+                            profile.name,
+                            profile.family,
+                            profile.tier,
+                            profile.size_gb,
+                            profile.params_b,
+                            ", ".join(profile.capabilities),
+                            profile.max_context,
+                            profile.default_temp,
+                        ]
+                        for model_id, profile in REGISTRY.items()
+                    ]
+                demo.load(fn=load_registry, outputs=[registry_table])
     return demo
     if not GRADIO_AVAILABLE:
         print("ERROR: Gradio is required.")
         sys.exit(1)
+    demo = build_control_center()
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)