specimba committed on
Commit
1da900a
·
verified ·
1 Parent(s): 5b65a76

v5.0 Provider Control Center: tabs, provider manager, arena, experiment log, pinecone chat, registry

Browse files
Files changed (1) hide show
  1. app.py +459 -623
app.py CHANGED
@@ -1,18 +1,15 @@
1
  """
2
- NEXUS OS v4.0 Self-Contained Intelligent Router for HF Spaces
3
 
4
- ZERO external dependencies except gradio + stdlib.
5
- No package imports, no torch, no pinecone.
 
 
 
 
 
6
 
7
- Providers (real free tiers only):
8
- 1. HF Inference Providers (router.huggingface.co) — PRIMARY, auto-routing, $0.10/mo
9
- 2. Groq (api.groq.com) — fastest LPU inference, generous free tier
10
- 3. DeepSeek (api.deepseek.com) — best reasoning, 5M token free credit
11
- 4. OpenRouter (openrouter.ai) — 25+ free models, deprioritized
12
- 5. Together AI (api.together.xyz) — free 70B models, heavily rate-limited
13
-
14
- NOT included (not real providers): Kilocode, OpenCode, NVIDIA NIM
15
- NOT included (useless free tier): Fireworks ($1 credit)
16
  """
17
  import os
18
  import sys
@@ -32,114 +29,28 @@ except ImportError:
32
 
33
 
34
  # ═══════════════════════════════════════════════════════════════
35
- # SELF-CONTAINED MODEL REGISTRY
36
  # ═══════════════════════════════════════════════════════════════
37
- class Tier(Enum):
38
- LOCAL_8GB = "local_8gb"
39
- LOCAL_16GB = "local_16gb"
40
- LOCAL_24GB = "local_24gb"
41
- LOCAL_48GB = "local_48gb"
42
- CLOUD_API = "cloud_api"
43
-
44
- class Capability(Enum):
45
- REASONING = "reasoning"
46
- CODING = "coding"
47
- VISION = "vision"
48
- FUNCTION_CALLING = "function_calling"
49
- TOOL_USE = "tool_use"
50
- INSTRUCT = "instruct"
51
- FAST = "fast"
52
- LONG_CONTEXT = "long_context"
53
- MULTILINGUAL = "multilingual"
54
- SAFETY = "safety"
55
-
56
- @dataclass
57
- class ModelProfile:
58
- name: str
59
- family: str = ""
60
- tier: Tier = Tier.LOCAL_8GB
61
- size_gb: float = 0.0
62
- params_b: float = 0.0
63
- capabilities: List[Capability] = field(default_factory=list)
64
- default_temp: float = 0.7
65
- max_context: int = 8192
66
- T_c: float = 1.0
67
- mu_base: float = 0.5
68
- kappa: float = 0.1
69
-
70
- REGISTRY: Dict[str, ModelProfile] = {
71
- # LOCAL 8GB
72
- "functiongemma": ModelProfile(name="FunctionGemma", family="gemma", tier=Tier.LOCAL_8GB, size_gb=0.3, params_b=0.27, capabilities=[Capability.FUNCTION_CALLING, Capability.FAST, Capability.INSTRUCT], default_temp=0.3, max_context=8192, T_c=0.8),
73
- "huihui-granite-4.1-3b": ModelProfile(name="Huihui Granite 4.1 3B", family="granite", tier=Tier.LOCAL_8GB, size_gb=2.8, params_b=3.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.INSTRUCT], default_temp=0.7, max_context=128000),
74
- "trinity-nano": ModelProfile(name="Trinity Nano", family="trinity", tier=Tier.LOCAL_8GB, size_gb=3.8, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.FAST], default_temp=0.7, max_context=32768),
75
- "ibm-grok4-coder-1b": ModelProfile(name="IBM Grok4 Coder 1B", family="grok", tier=Tier.LOCAL_8GB, size_gb=1.2, params_b=1.0, capabilities=[Capability.CODING, Capability.FAST, Capability.INSTRUCT], default_temp=0.3, max_context=8192),
76
- "qwen3.5-0.8b-heretic": ModelProfile(name="Qwen 3.5 0.8B Heretic", family="qwen", tier=Tier.LOCAL_8GB, size_gb=0.8, params_b=0.8, capabilities=[Capability.CODING, Capability.FAST, Capability.INSTRUCT], default_temp=0.8, max_context=32768),
77
- "bonsai-1.7b": ModelProfile(name="Ternary Bonsai 1.7B", family="bonsai", tier=Tier.LOCAL_8GB, size_gb=3.4, params_b=1.7, capabilities=[Capability.REASONING, Capability.FAST], default_temp=0.7, max_context=8192),
78
- "darwin-4b": ModelProfile(name="Darwin 4B", family="darwin", tier=Tier.LOCAL_8GB, size_gb=5.3, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING], default_temp=0.7, max_context=32768),
79
- "dr-venus-4b-rl": ModelProfile(name="DR-Venus 4B RL", family="venus", tier=Tier.LOCAL_8GB, size_gb=3.6, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.SAFETY], default_temp=0.7, max_context=32768),
80
- "gemma4-most-seen-2b": ModelProfile(name="Gemma4 Most Seen 2B", family="gemma", tier=Tier.LOCAL_8GB, size_gb=3.4, params_b=2.0, capabilities=[Capability.REASONING, Capability.FAST], default_temp=0.7, max_context=32768),
81
- "grape-2-mini": ModelProfile(name="GRaPE 2 Mini", family="grape", tier=Tier.LOCAL_8GB, size_gb=4.8, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING], default_temp=0.7, max_context=32768),
82
- "bonsai-8b-requantized": ModelProfile(name="Bonsai 8B Requantized", family="bonsai", tier=Tier.LOCAL_8GB, size_gb=3.0, params_b=8.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.FAST], default_temp=0.7, max_context=8192),
83
- "frob-locooperator": ModelProfile(name="Frob LocoOperator", family="loco", tier=Tier.LOCAL_8GB, size_gb=2.5, params_b=3.0, capabilities=[Capability.TOOL_USE, Capability.FUNCTION_CALLING, Capability.FAST], default_temp=0.3, max_context=8192),
84
- "nemotron-3-nano-4b": ModelProfile(name="Nemotron 3 Nano 4B", family="nemotron", tier=Tier.LOCAL_8GB, size_gb=2.8, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.SAFETY], default_temp=0.7, max_context=32768),
85
- "opensonnet-lite-max": ModelProfile(name="OpenSonnet-Lite-MAX", family="qwen", tier=Tier.LOCAL_8GB, size_gb=2.5, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.FAST, Capability.LONG_CONTEXT], default_temp=0.6, max_context=262144, T_c=0.9, mu_base=0.55, kappa=0.09),
86
- # LOCAL 16GB
87
- "deepseek-r1-8b": ModelProfile(name="DeepSeek-R1 8B", family="deepseek", tier=Tier.LOCAL_16GB, size_gb=5.2, params_b=8.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT], default_temp=0.6, max_context=128000, T_c=0.85),
88
- "qwen2.5-coder-7b": ModelProfile(name="Qwen 2.5 Coder 7B", family="qwen", tier=Tier.LOCAL_16GB, size_gb=4.7, params_b=7.0, capabilities=[Capability.CODING, Capability.FAST], default_temp=0.3, max_context=32768),
89
- "l3.1-dark-reasoning-8b": ModelProfile(name="L3.1 Dark Reasoning 8B", family="llama", tier=Tier.LOCAL_16GB, size_gb=5.7, params_b=8.0, capabilities=[Capability.REASONING, Capability.CODING], default_temp=0.7, max_context=32768),
90
- "omega-evolution-9b": ModelProfile(name="Omega Evolution 9B", family="omega", tier=Tier.LOCAL_16GB, size_gb=6.6, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION], default_temp=0.7, max_context=32768),
91
- "darwin-9b-opus": ModelProfile(name="Darwin 9B Opus", family="darwin", tier=Tier.LOCAL_16GB, size_gb=6.3, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT], default_temp=0.7, max_context=65536),
92
- "qwopus-3.5-9b": ModelProfile(name="Qwopus 3.5 9B", family="qwopus", tier=Tier.LOCAL_16GB, size_gb=5.6, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING], default_temp=0.7, max_context=32768),
93
- "carnice-9b": ModelProfile(name="Carnice 9B", family="carnice", tier=Tier.LOCAL_16GB, size_gb=5.6, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION], default_temp=0.7, max_context=32768),
94
- "open-search-vl-8b": ModelProfile(name="OpenSearch VL 8B", family="opensearch", tier=Tier.LOCAL_16GB, size_gb=6.6, params_b=8.0, capabilities=[Capability.VISION, Capability.REASONING, Capability.LONG_CONTEXT], default_temp=0.7, max_context=65536),
95
- "granite-4.1-8b-abliterated": ModelProfile(name="Granite 4.1 8B Abliterated", family="granite", tier=Tier.LOCAL_16GB, size_gb=5.1, params_b=8.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT], default_temp=0.7, max_context=128000),
96
- "jaahas-qwen3.5-9b": ModelProfile(name="Jaahas Qwen 3.5 9B", family="qwen", tier=Tier.LOCAL_16GB, size_gb=7.4, params_b=9.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.MULTILINGUAL], default_temp=0.7, max_context=32768),
97
- # LOCAL 24GB
98
- "lfm2-12b-deckard": ModelProfile(name="LFM2 12B Deckard", family="lfm", tier=Tier.LOCAL_24GB, size_gb=5.8, params_b=12.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT, Capability.FAST], default_temp=0.7, max_context=128000),
99
- "gemma4-e2b-opus": ModelProfile(name="Gemma4 E2B Opus", family="gemma", tier=Tier.LOCAL_24GB, size_gb=5.5, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT], default_temp=0.7, max_context=128000),
100
- "gemma4-uncensored": ModelProfile(name="Gemma 4 Uncensored", family="gemma", tier=Tier.LOCAL_24GB, size_gb=4.9, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION], default_temp=0.7, max_context=32768),
101
- "gemma4-obliterated": ModelProfile(name="Gemma 4 OBLITERATED", family="gemma", tier=Tier.LOCAL_24GB, size_gb=6.3, params_b=4.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION], default_temp=0.7, max_context=32768),
102
- "qwen3.6-27b-dflash": ModelProfile(name="Qwen 3.6 27B DFlash", family="qwen", tier=Tier.LOCAL_24GB, size_gb=1.0, params_b=27.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT, Capability.FAST], default_temp=0.7, max_context=128000),
103
- # LOCAL 48GB
104
- "gemma4-31b-cloud": ModelProfile(name="Gemma4 31B Cloud", family="gemma", tier=Tier.LOCAL_48GB, size_gb=18.0, params_b=31.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION, Capability.LONG_CONTEXT, Capability.MULTILINGUAL], default_temp=0.7, max_context=128000),
105
- "nemotron-3-nano-omni-30b": ModelProfile(name="Nemotron-3 Nano-Omni 30B", family="nemotron", tier=Tier.LOCAL_48GB, size_gb=18.0, params_b=30.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.VISION, Capability.LONG_CONTEXT, Capability.SAFETY, Capability.TOOL_USE], default_temp=0.6, max_context=256000, T_c=0.85, mu_base=0.6, kappa=0.08),
106
- # CLOUD API
107
- "deepseek-v4-pro": ModelProfile(name="DeepSeek V4 Pro", family="deepseek", tier=Tier.CLOUD_API, size_gb=0.0, params_b=671.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT, Capability.MULTILINGUAL, Capability.TOOL_USE], default_temp=0.6, max_context=64000),
108
- "deepseek-v4-flash": ModelProfile(name="DeepSeek V4 Flash", family="deepseek", tier=Tier.CLOUD_API, size_gb=0.0, params_b=671.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.FAST, Capability.MULTILINGUAL], default_temp=0.8, max_context=64000),
109
- "qwen3-coder-next": ModelProfile(name="Qwen 3 Coder Next", family="qwen", tier=Tier.CLOUD_API, size_gb=0.0, params_b=32.0, capabilities=[Capability.CODING, Capability.REASONING, Capability.FAST, Capability.LONG_CONTEXT, Capability.TOOL_USE], default_temp=0.3, max_context=128000),
110
- "kimi-k2.6": ModelProfile(name="Kimi K2.6", family="kimi", tier=Tier.CLOUD_API, size_gb=0.0, params_b=32.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.LONG_CONTEXT, Capability.MULTILINGUAL, Capability.VISION], default_temp=0.7, max_context=200000),
111
- "glm-5.1": ModelProfile(name="GLM 5.1", family="glm", tier=Tier.CLOUD_API, size_gb=0.0, params_b=32.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.MULTILINGUAL, Capability.TOOL_USE, Capability.VISION], default_temp=0.7, max_context=128000),
112
- "minimax-m2.7": ModelProfile(name="MiniMax M2.7", family="minimax", tier=Tier.CLOUD_API, size_gb=0.0, params_b=32.0, capabilities=[Capability.REASONING, Capability.CODING, Capability.MULTILINGUAL, Capability.VISION], default_temp=0.7, max_context=128000),
113
- }
114
-
115
- def get(name: str) -> Optional[ModelProfile]:
116
- return REGISTRY.get(name)
117
-
118
- def all_names() -> List[str]:
119
- return list(REGISTRY.keys())
120
-
121
- def by_tier(t: Tier) -> List[ModelProfile]:
122
- return [m for m in REGISTRY.values() if m.tier == t]
123
 
124
- def vram(names: List[str]) -> float:
125
- return sum(get(n).size_gb for n in names if get(n) and get(n).tier != Tier.CLOUD_API)
 
 
126
 
127
 
128
  # ═══════════════════════════════════════════════════════════════
129
- # INTELLIGENT MULTI-PROVIDER ROUTER
130
  # ═══════════════════════════════════════════════════════════════
131
-
132
- class Provider(Enum):
133
- HF_ROUTER = "hf_inference_providers" # PRIMARY — auto-routing, $0.10/mo, HF token
134
- GROQ = "groq" # Fastest free inference, LPU chips
135
- DEEPSEEK = "deepseek" # Best reasoning models, 5M token free
136
- OPENROUTER = "openrouter" # 25+ free models, deprioritized
137
- TOGETHER = "together" # Free 70B models, heavily rate-limited
138
- OLLAMA = "ollama" # User's local models via relay
139
- MOCK = "mock" # Simulated fallback
140
-
141
- # Provider API endpoints (all OpenAI-compatible /v1/chat/completions)
142
- PROVIDER_ENDPOINTS = {
143
  Provider.HF_ROUTER: "https://router.huggingface.co/v1/chat/completions",
144
  Provider.GROQ: "https://api.groq.com/openai/v1/chat/completions",
145
  Provider.DEEPSEEK: "https://api.deepseek.com/v1/chat/completions",
@@ -147,157 +58,137 @@ PROVIDER_ENDPOINTS = {
147
  Provider.TOGETHER: "https://api.together.xyz/v1/chat/completions",
148
  }
149
 
150
- # API key env vars
151
- PROVIDER_KEYS = {
152
- Provider.HF_ROUTER: "HF_TOKEN",
153
- Provider.GROQ: "GROQ_API_KEY",
154
- Provider.DEEPSEEK: "DEEPSEEK_API_KEY",
155
- Provider.OPENROUTER: "OPENROUTER_API_KEY",
156
- Provider.TOGETHER: "TOGETHER_API_KEY",
157
- }
158
-
159
- # Best free models per provider per capability
160
- PROVIDER_MODELS = {
161
- Provider.HF_ROUTER: {
162
- "default": "meta-llama/Llama-3.2-1B-Instruct",
163
- "coding": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
164
- "reasoning": "meta-llama/Llama-3.2-1B-Instruct",
165
- "fast": "Qwen/Qwen2.5-0.5B-Instruct",
166
- "vision": None, # Limited on free tier
167
- },
168
- Provider.GROQ: {
169
- "default": "llama-3.2-1b-preview",
170
- "coding": "qwen-2.5-coder-32b",
171
- "reasoning": "llama-3.2-1b-preview",
172
- "fast": "llama-3.2-1b-preview",
173
- "vision": "llama-3.2-11b-vision-preview",
174
- },
175
- Provider.DEEPSEEK: {
176
- "default": "deepseek-chat",
177
- "coding": "deepseek-coder",
178
- "reasoning": "deepseek-reasoner",
179
- "fast": "deepseek-chat",
180
- },
181
- Provider.OPENROUTER: {
182
- "default": "meta-llama/llama-3.2-1b-instruct:free",
183
- "coding": "qwen/qwen-2.5-coder-32b-instruct:free",
184
- "reasoning": "meta-llama/llama-3.1-70b-instruct:free",
185
- "fast": "meta-llama/llama-3.2-1b-instruct:free",
186
- },
187
- Provider.TOGETHER: {
188
- "default": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
189
- "coding": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
190
- "reasoning": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
191
- "fast": "meta-llama/Llama-3.2-1B-Instruct-Turbo-Free",
192
- },
193
  }
194
 
195
 
 
 
 
196
  @dataclass
197
- class ProviderHealth:
198
  provider: Provider
199
- available: bool
200
- latency_ms: float = 999999.0
201
  error: str = ""
 
202
 
203
 
204
  @dataclass
205
- class RouterResult:
206
  text: str
207
  provider: Provider
208
  model: str
209
  latency_ms: float
210
  tokens_input: int = 0
211
  tokens_output: int = 0
212
- fallback_chain: List[str] = field(default_factory=list)
213
- metadata: Dict[str, Any] = field(default_factory=dict)
214
 
215
 
216
- def _api_call(endpoint: str, api_key: str, payload: Dict[str, Any], timeout: int = 120) -> Tuple[bool, Dict[str, Any], float, str]:
217
- """Make API call. Returns (success, data, latency_ms, error)."""
218
  body = json.dumps(payload).encode("utf-8")
219
- req = urllib.request.Request(
220
- endpoint,
221
- data=body,
222
- headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
223
- method="POST",
224
- )
 
 
 
 
225
  t0 = time.time()
226
  try:
227
  with urllib.request.urlopen(req, timeout=timeout) as resp:
228
  data = json.loads(resp.read().decode("utf-8"))
229
  return True, data, (time.time() - t0) * 1000, ""
230
  except urllib.error.HTTPError as e:
231
- error_body = e.read().decode("utf-8", errors="replace")[:500]
232
- return False, {}, (time.time() - t0) * 1000, f"HTTP {e.code}: {error_body}"
233
  except Exception as e:
234
  return False, {}, (time.time() - t0) * 1000, str(e)[:200]
235
 
236
 
237
- def _check_provider_health(provider: Provider) -> ProviderHealth:
238
- """Health check via minimal API call."""
239
- api_key = os.environ.get(PROVIDER_KEYS.get(provider, ""), "")
240
  if not api_key:
241
- return ProviderHealth(provider=provider, available=False, error="No API key")
 
242
 
243
- endpoint = PROVIDER_ENDPOINTS.get(provider)
244
  if not endpoint:
245
- return ProviderHealth(provider=provider, available=False, error="No endpoint")
 
 
 
 
 
 
 
 
 
 
246
 
247
- # Minimal test request — single token
248
- model = PROVIDER_MODELS.get(provider, {}).get("default", "")
249
  payload = {
250
- "model": model,
251
  "messages": [{"role": "user", "content": "Hi"}],
252
- "max_tokens": 1,
253
  "temperature": 0.1,
254
  }
255
 
256
- success, data, latency, error = _api_call(endpoint, api_key, payload, timeout=15)
257
 
258
- if success:
259
- return ProviderHealth(provider=provider, available=True, latency_ms=latency)
 
 
 
 
260
  else:
261
- # Distinguish auth errors vs rate limits vs real unavailability
262
- if "401" in error or "403" in error:
263
- return ProviderHealth(provider=provider, available=False, error=f"Invalid key: {error}")
264
- elif "429" in error:
265
- return ProviderHealth(provider=provider, available=False, error=f"Rate limited: {error}")
266
- else:
267
- return ProviderHealth(provider=provider, available=False, error=error)
268
 
269
 
270
- def _generate_with_provider(
271
- provider: Provider,
272
- prompt: str,
273
- model: str,
274
- max_tokens: int = 512,
275
- temperature: float = 0.7,
276
- system: Optional[str] = None,
277
- ) -> Optional[RouterResult]:
278
  """Generate with a specific provider."""
279
- api_key = os.environ.get(PROVIDER_KEYS.get(provider, ""), "")
280
- if not api_key:
281
- return None
282
-
283
- endpoint = PROVIDER_ENDPOINTS.get(provider)
284
  if not endpoint:
285
- return None
 
286
 
287
  messages = []
288
  if system:
289
  messages.append({"role": "system", "content": system})
290
  messages.append({"role": "user", "content": prompt})
291
 
292
- # OpenRouter requires extra headers for ranking
293
- headers = {
294
- "Content-Type": "application/json",
295
- "Authorization": f"Bearer {api_key}",
296
- }
297
- if provider == Provider.OPENROUTER:
298
- headers["HTTP-Referer"] = "https://huggingface.co/spaces/specimba/nexus-os-space"
299
- headers["X-Title"] = "NEXUS OS"
300
-
301
  payload = {
302
  "model": model,
303
  "messages": messages,
@@ -305,441 +196,386 @@ def _generate_with_provider(
305
  "temperature": temperature,
306
  }
307
 
308
- body = json.dumps(payload).encode("utf-8")
309
- req = urllib.request.Request(endpoint, data=body, headers=headers, method="POST")
310
-
311
- t0 = time.time()
312
- try:
313
- with urllib.request.urlopen(req, timeout=120) as resp:
314
- data = json.loads(resp.read().decode("utf-8"))
315
- elapsed = (time.time() - t0) * 1000
316
-
317
- choice = data.get("choices", [{}])[0]
318
- message = choice.get("message", {})
319
- usage = data.get("usage", {})
320
-
321
- return RouterResult(
322
- text=message.get("content", ""),
323
- provider=provider,
324
- model=model,
325
- latency_ms=elapsed,
326
- tokens_input=usage.get("prompt_tokens", 0),
327
- tokens_output=usage.get("completion_tokens", 0),
328
- metadata={"raw": data},
329
- )
330
- except Exception:
331
- return None
332
-
333
-
334
- def intelligent_route(
335
- prompt: str,
336
- complexity: float = 0.5,
337
- required_capabilities: List[str] = None,
338
- max_tokens: int = 512,
339
- temperature: float = 0.7,
340
- system: Optional[str] = None,
341
- ollama_relay_url: Optional[str] = None,
342
- ) -> RouterResult:
343
- """
344
- Intelligent routing across ALL free providers.
345
 
346
- Priority:
347
- 1. HF Inference Providers (auto-routing, single token)
348
- 2. Groq (fastest)
349
- 3. DeepSeek (best reasoning)
350
- 4. OpenRouter (most models)
351
- 5. Together (free 70B)
352
- 6. Ollama relay (user's local)
353
- 7. Mock (last resort)
354
- """
355
- fallback_chain = []
356
 
357
- # Determine capability need
358
- capability = "default"
359
- if required_capabilities:
360
- for cap in ["coding", "reasoning", "fast", "vision"]:
361
- if cap in required_capabilities:
362
- capability = cap
363
- break
364
 
365
- # Providers in priority order
366
- providers = [Provider.HF_ROUTER, Provider.GROQ, Provider.DEEPSEEK,
367
- Provider.OPENROUTER, Provider.TOGETHER]
368
-
369
- # Check health of all providers
370
- health_results = []
371
- for provider in providers:
372
- health = _check_provider_health(provider)
373
- health_results.append(health)
374
- if health.available:
375
- fallback_chain.append(f"✓ {provider.value}: {health.latency_ms:.0f}ms")
376
- else:
377
- fallback_chain.append(f"✗ {provider.value}: {health.error[:100]}")
378
-
379
- # Sort available by latency
380
- available = [h for h in health_results if h.available]
381
- available.sort(key=lambda h: h.latency_ms)
382
-
383
- # Try each available provider
384
- for health in available:
385
- provider = health.provider
386
- model = PROVIDER_MODELS.get(provider, {}).get(capability)
387
- if not model:
388
- model = PROVIDER_MODELS.get(provider, {}).get("default", "")
389
-
390
- fallback_chain.append(f"→ Trying {provider.value} with {model}")
391
-
392
- result = _generate_with_provider(
393
- provider=provider,
394
- prompt=prompt,
395
- model=model,
396
- max_tokens=max_tokens,
397
- temperature=temperature,
398
- system=system,
399
- )
400
-
401
- if result and result.text:
402
- result.fallback_chain = fallback_chain
403
- return result
404
- else:
405
- fallback_chain.append(f"✗ {provider.value}: generation failed")
406
-
407
- # Try Ollama relay
408
- if ollama_relay_url:
409
- fallback_chain.append(f"→ Trying Ollama relay at {ollama_relay_url}")
410
- try:
411
- relay = ollama_relay_url.rstrip("/")
412
- messages = []
413
- if system:
414
- messages.append({"role": "system", "content": system})
415
- messages.append({"role": "user", "content": prompt})
416
- payload = json.dumps({
417
- "model": "llama3.2:latest",
418
- "messages": messages,
419
- "stream": False,
420
- "options": {"temperature": temperature, "num_predict": max_tokens},
421
- }).encode("utf-8")
422
- req = urllib.request.Request(f"{relay}/api/chat", data=payload,
423
- headers={"Content-Type": "application/json"}, method="POST")
424
- t0 = time.time()
425
- with urllib.request.urlopen(req, timeout=300) as resp:
426
- data = json.loads(resp.read().decode("utf-8"))
427
- elapsed = (time.time() - t0) * 1000
428
- text = data.get("message", {}).get("content", "") if "message" in data else data.get("response", "")
429
- return RouterResult(
430
- text=text,
431
- provider=Provider.OLLAMA,
432
- model="llama3.2:latest",
433
- latency_ms=elapsed,
434
- fallback_chain=fallback_chain,
435
- )
436
- except Exception as e:
437
- fallback_chain.append(f"✗ Ollama: {str(e)[:100]}")
438
-
439
- # All failed — mock
440
- return RouterResult(
441
- text=f"[All providers unavailable]\n\nFallback chain:\n" + "\n".join(fallback_chain),
442
- provider=Provider.MOCK,
443
- model="mock",
444
- latency_ms=0.0,
445
- fallback_chain=fallback_chain,
446
  )
447
 
448
 
449
  # ═══════════════════════════════════════════════════════════════
450
- # THERMODYNAMIC TELEMETRY SIMULATOR
451
  # ═══════════════════════════════════════════════════════════════
452
- import random
453
-
454
- class Action(Enum):
455
- NONE = "none"
456
- GROUND = "ground"
457
- REFLECT = "reflect"
458
- HALT = "halt"
459
-
460
- @dataclass
461
- class TokenVerdict:
462
- position: int
463
- token_str: str
464
- fused_score: float
465
- risk_level: str
466
- recommended_action: Action
467
- confidence: float
468
-
469
  @dataclass
470
- class SequenceVerdict:
471
- avg_fused_score: float
472
- max_fused_score: float
473
- overall_risk: str
474
- overall_action: Action
475
- detector_agreement: float
476
- trigger_positions: List[int]
477
- energy_entropy_product: float
478
- phase_transition_index: float
479
- newi: float
480
-
481
- def _stochastic_resonance(complexity: float, T_c: float) -> float:
482
- """Recommend optimal temperature based on complexity and T_c."""
483
- if complexity > 0.8:
484
- return 0.3 * T_c
485
- elif complexity > 0.5:
486
- return 0.6 * T_c
487
- elif complexity > 0.2:
488
- return 0.9 * T_c
489
- return 1.0 * T_c
490
-
491
- def simulate_telemetry(text: str, model_id: str, complexity: float) -> Dict[str, Any]:
492
- profile = get(model_id) or get("deepseek-r1-8b")
493
- num_tokens = min(200, max(20, len(text.split()) * 2))
494
-
495
- token_verdicts = []
496
- for pos in range(num_tokens):
497
- if pos in [5, 12, 18, 25, 35, 45, 55, 65]:
498
- risk_level = random.choice(["high", "critical"])
499
- elif pos in [8, 15, 22, 30, 40, 50, 60]:
500
- risk_level = random.choice(["moderate", "elevated"])
501
- else:
502
- risk_level = "low"
503
-
504
- fused_score = {
505
- "low": random.uniform(0.0, 0.2),
506
- "moderate": random.uniform(0.2, 0.4),
507
- "elevated": random.uniform(0.4, 0.6),
508
- "high": random.uniform(0.6, 0.8),
509
- "critical": random.uniform(0.8, 1.0),
510
- }[risk_level]
511
-
512
- action_map = {
513
- "low": Action.NONE, "moderate": Action.NONE,
514
- "elevated": Action.GROUND, "high": Action.REFLECT, "critical": Action.HALT,
515
- }
516
-
517
- token_verdicts.append(TokenVerdict(
518
- position=pos, token_str=f"tok_{pos}",
519
- fused_score=fused_score, risk_level=risk_level,
520
- recommended_action=action_map[risk_level], confidence=0.7,
521
- ))
522
-
523
- avg_score = sum(v.fused_score for v in token_verdicts) / len(token_verdicts)
524
- max_score = max(v.fused_score for v in token_verdicts)
525
- trigger_positions = [v.position for v in token_verdicts if v.fused_score > 0.6]
526
-
527
- overall_risk = "low"
528
- if max_score > 0.8:
529
- overall_risk = "critical"
530
- elif max_score > 0.6:
531
- overall_risk = "high"
532
- elif avg_score > 0.4:
533
- overall_risk = "moderate"
534
-
535
- return {
536
- "num_tokens": num_tokens,
537
- "hallucination_risk": round(avg_score, 3),
538
- "max_risk": round(max_score, 3),
539
- "risk_level": overall_risk,
540
- "recommended_action": Action.HALT if max_score > 0.8 else Action.REFLECT if max_score > 0.6 else Action.GROUND if avg_score > 0.4 else Action.NONE,
541
- "detector_agreement": round(random.uniform(0.6, 1.0), 3),
542
- "trigger_positions": trigger_positions[:10],
543
- "eep": round(avg_score * max_score * random.uniform(0.8, 1.2), 3),
544
- "pti": round(abs(avg_score - 0.5) * 2, 3),
545
- "newi": round(random.uniform(0.1, 0.5), 3),
546
- "optimal_temp": round(_stochastic_resonance(complexity, profile.T_c), 3),
547
- "T_c": profile.T_c,
548
- "mu_base": profile.mu_base,
549
- "kappa": profile.kappa,
550
- }
551
 
552
 
553
  # ═══════════════════════════════════════════════════════════════
554
- # GENERATION ORCHESTRATOR
555
  # ═══════════════════════════════════════════════════════════════
556
- def generate_with_nexus(
557
- prompt: str,
558
- vram: float,
559
- complexity: float,
560
- model_id: str,
561
- allow_cloud: bool,
562
- ollama_relay_url: str,
563
- use_ollama: bool,
564
- use_cloud: bool,
565
- use_hf_inference: bool,
566
- system_prompt: str,
567
- max_tokens: int,
568
- fusion_mode: str,
569
- ) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
570
- """Main generation with intelligent multi-provider routing."""
571
- if not prompt.strip():
572
- return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
573
-
574
- profile = get(model_id)
575
- if not profile:
576
- return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", f"Model {model_id} not found"
577
-
578
- # Map capabilities for routing
579
- required_caps = []
580
- if Capability.CODING in profile.capabilities:
581
- required_caps.append("coding")
582
- if Capability.REASONING in profile.capabilities:
583
- required_caps.append("reasoning")
584
- if Capability.FAST in profile.capabilities:
585
- required_caps.append("fast")
586
- if Capability.VISION in profile.capabilities:
587
- required_caps.append("vision")
588
-
589
- # Route to best provider
590
- result = intelligent_route(
591
- prompt=prompt,
592
- complexity=complexity,
593
- required_capabilities=required_caps,
594
- max_tokens=max_tokens,
595
- temperature=profile.default_temp,
596
- system=system_prompt if system_prompt.strip() else None,
597
- ollama_relay_url=ollama_relay_url if use_ollama else None,
598
- )
599
-
600
- status = f"Provider: {result.provider.value} | Model: {result.model} | Latency: {result.latency_ms:.0f}ms"
601
- if result.fallback_chain:
602
- status += "\n" + "\n".join(result.fallback_chain)
603
-
604
- telemetry = simulate_telemetry(result.text, model_id, complexity)
605
- action_str = {Action.NONE: "none", Action.GROUND: "ground",
606
- Action.REFLECT: "reflect", Action.HALT: "halt"}[telemetry["recommended_action"]]
607
-
608
- return (
609
- result.text,
610
- f"{profile.name} ({result.provider.value})",
611
- telemetry["hallucination_risk"],
612
- telemetry["max_risk"],
613
- telemetry["num_tokens"],
614
- telemetry["eep"],
615
- telemetry["pti"],
616
- telemetry["newi"],
617
- action_str,
618
- str(telemetry["trigger_positions"]),
619
- status,
620
- )
621
-
622
 
623
  # ═══════════════════════════════════════════════════════════════
624
- # GRADIO INTERFACE
625
  # ═══════════════════════════════════════════════════════════════
626
- def build_space_interface():
627
- with gr.Blocks(title="NEXUS OS v4.0 Intelligent Multi-Provider Router") as demo:
628
- gr.Markdown("""
629
- # 🔥 NEXUS OS v4.0 Intelligent Router
630
-
631
- **Queries ALL free API providers in parallel and picks the best one.**
 
 
 
 
 
632
 
633
- Providers (auto-detected): HF Inference Providers, Groq, DeepSeek, OpenRouter, Together AI
 
634
 
635
- ---
636
  """)
637
 
638
- with gr.Row():
639
- with gr.Column(scale=2):
640
- with gr.Accordion("⚙️ Connection Settings", open=False):
641
- ollama_relay = gr.Textbox(label="Ollama Relay URL",
642
- placeholder="https://your-tunnel.ngrok-free.app",
643
- value=os.environ.get("OLLAMA_RELAY_URL", ""),
644
- info="Optional: expose local Ollama via ngrok")
645
- use_hf = gr.Checkbox(label="Enable HF Inference Providers", value=True)
646
- use_ollama = gr.Checkbox(label="Enable Ollama Relay", value=False)
647
- use_cloud = gr.Checkbox(label="Enable Direct Provider APIs", value=True,
648
- info="Groq, DeepSeek, OpenRouter, Together AI")
649
- allow_cloud = gr.Checkbox(label="Allow Cloud Models in Routing", value=True)
650
 
651
- prompt_input = gr.Textbox(label="Your Prompt",
652
- placeholder="Explain quantum entanglement in simple terms...", lines=4)
653
- system_input = gr.Textbox(label="System Prompt (optional)",
654
- placeholder="You are a helpful assistant...", lines=2, value="")
655
 
656
- with gr.Row():
657
- vram_slider = gr.Slider(minimum=4, maximum=48, value=16, step=4,
658
- label="Local VRAM Budget (GB)")
659
- complexity_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05,
660
- label="Estimated Complexity")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
 
662
- model_dropdown = gr.Dropdown(label="Model", choices=[], value="deepseek-r1-8b",
663
- info="Auto-filtered by VRAM budget")
664
- max_tokens_slider = gr.Slider(minimum=256, maximum=2048, value=512, step=256,
665
- label="Max Tokens")
666
- fusion_mode_dropdown = gr.Dropdown(label="Detector Fusion Mode",
667
- choices=["weighted", "majority", "agreement", "any"], value="weighted")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
668
 
669
- generate_btn = gr.Button("🔮 Generate with NEXUS OS", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
670
 
671
- with gr.Column(scale=3):
672
- output_text = gr.Textbox(label="Generated Response", lines=20, interactive=False)
673
- model_used_text = gr.Textbox(label="Model Used", value="", interactive=False)
674
- status_text = gr.Textbox(label="Status / Fallback Chain", value="Ready", interactive=False, lines=6)
675
 
676
  with gr.Row():
677
- risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
678
- max_risk_gauge = gr.Number(label="Peak Risk", value=0.0)
679
- tokens_gauge = gr.Number(label="Tokens", value=0)
 
 
 
 
 
 
 
 
 
 
680
  with gr.Row():
681
- eep_gauge = gr.Number(label="EEP", value=0.0)
682
- pti_gauge = gr.Number(label="PTI", value=0.0)
683
- newi_gauge = gr.Number(label="NEWI", value=0.0)
 
 
 
 
 
 
684
 
685
- action_text = gr.Textbox(label="Recommended Action", value="none", interactive=False)
686
- trigger_text = gr.Textbox(label="Trigger Positions", value="[]", interactive=False)
687
-
688
- gr.Markdown("""
689
- ---
690
-
691
- ### About NEXUS OS v4.0
692
-
693
- **Intelligent Multi-Provider Router** — auto-detects available providers:
694
- - **HF Inference Providers** (primary auto-routing with your HF token)
695
- - **Groq** (fastest free inference, LPU chips)
696
- - **DeepSeek** (best reasoning models, 5M token free credit)
697
- - **OpenRouter** (25+ free models, deprioritized but diverse)
698
- - **Together AI** (free 70B models, heavily rate-limited)
699
-
700
- Picks the best based on health check latency + capability match.
701
-
702
- **37+ real models** in registry including Nemotron-3 Nano-Omni 30B and OpenSonnet-Lite-MAX
703
-
704
- **Four empirically-validated hallucination detectors:** EPR, Spilled Energy, CK-PLUG, TWAVE
705
-
706
- **Novel composite signals:** EEP, PTI, NEWI
707
-
708
- **Repository:** [specimba/nexus-os-v2](https://huggingface.co/datasets/specimba/nexus-os-v2)
709
- """)
710
-
711
- def update_models(vram, allow_cloud):
712
- models = []
713
- budget = vram
714
- for name, profile in REGISTRY.items():
715
- if profile.tier == Tier.CLOUD_API and not allow_cloud:
716
- continue
717
- fits = profile.size_gb <= budget or profile.tier == Tier.CLOUD_API
718
- models.append({
719
- "id": name,
720
- "name": profile.name,
721
- "params_b": profile.params_b,
722
- "size_gb": profile.size_gb,
723
- "tier": profile.tier.value,
724
- "fits_budget": fits,
725
- })
726
- choices = [(f"{m['name']} ({m['params_b']:.1f}B, {m['size_gb']:.1f}GB)", m['id'])
727
- for m in models if m['fits_budget']]
728
- default = choices[0][1] if choices else ""
729
- return gr.Dropdown(choices=choices, value=default)
730
-
731
- vram_slider.change(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
732
- allow_cloud.change(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
733
- demo.load(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
734
-
735
- generate_btn.click(
736
- fn=generate_with_nexus,
737
- inputs=[prompt_input, vram_slider, complexity_slider, model_dropdown, allow_cloud,
738
- ollama_relay, use_ollama, use_cloud, use_hf, system_input,
739
- max_tokens_slider, fusion_mode_dropdown],
740
- outputs=[output_text, model_used_text, risk_gauge, max_risk_gauge, tokens_gauge,
741
- eep_gauge, pti_gauge, newi_gauge, action_text, trigger_text, status_text],
742
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
743
 
744
  return demo
745
 
@@ -748,5 +584,5 @@ if __name__ == "__main__":
748
  if not GRADIO_AVAILABLE:
749
  print("ERROR: Gradio is required.")
750
  sys.exit(1)
751
- demo = build_space_interface()
752
  demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
 
1
  """
2
+ NEXUS OS — Provider Control Center
3
 
4
+ A multi-provider LLM management dashboard inspired by HF collaboration spaces.
5
+ Features:
6
+ 1. Provider Manager — enter API keys, check health, see available models
7
+ 2. Side-by-Side Arena — same prompt across multiple providers, compare outputs
8
+ 3. Experiment Log — save runs to table, sort by latency/cost/quality
9
+ 4. Pinecone Chat — talk to pineosman2 assistant, show retrieved evidence
10
+ 5. Model Registry — browse 37+ models with specs
11
 
12
+ All self-contained. Only dependency: gradio.
 
 
 
 
 
 
 
 
13
  """
14
  import os
15
  import sys
 
29
 
30
 
31
  # ═══════════════════════════════════════════════════════════════
32
+ # PROVIDER DEFINITIONS
33
  # ═══════════════════════════════════════════════════════════════
34
class Provider(Enum):
    """Inference backends known to the control center.

    Each member's value is a ``(display_name, domain, key_env)`` tuple.
    ``Enum`` unpacks that tuple into ``__init__``, so the three parts are
    also available as attributes on every member.
    """

    HF_ROUTER = ("HF Inference Providers", "router.huggingface.co", "HF_TOKEN")
    GROQ = ("Groq", "api.groq.com", "GROQ_API_KEY")
    DEEPSEEK = ("DeepSeek", "api.deepseek.com", "DEEPSEEK_API_KEY")
    OPENROUTER = ("OpenRouter", "openrouter.ai", "OPENROUTER_API_KEY")
    TOGETHER = ("Together AI", "api.together.xyz", "TOGETHER_API_KEY")
    KILOCODE = ("Kilocode", "kilocode.ai", "KILOCODE_API_KEY")
    NVIDIA = ("NVIDIA NIM", "integrate.api.nvidia.com", "NVIDIA_API_KEY")
    OLLAMA = ("Ollama (Local)", "localhost:11434", "OLLAMA_HOST")

    def __init__(self, display_name, domain, key_env):
        # Bind all three components of the member value in one assignment.
        self.display_name, self.domain, self.key_env = display_name, domain, key_env
48
 
49
 
50
  # ═══════════════════════════════════════════════════════════════
51
+ # API ENDPOINTS (all OpenAI-compatible /v1/chat/completions)
52
  # ═══════════════════════════════════════════════════════════════
53
+ ENDPOINTS = {
 
 
 
 
 
 
 
 
 
 
 
54
  Provider.HF_ROUTER: "https://router.huggingface.co/v1/chat/completions",
55
  Provider.GROQ: "https://api.groq.com/openai/v1/chat/completions",
56
  Provider.DEEPSEEK: "https://api.deepseek.com/v1/chat/completions",
 
58
  Provider.TOGETHER: "https://api.together.xyz/v1/chat/completions",
59
  }
60
 
61
+ # Free models per provider
62
+ FREE_MODELS = {
63
+ Provider.HF_ROUTER: [
64
+ ("SmolLM2-1.7B", "HuggingFaceTB/SmolLM2-1.7B-Instruct"),
65
+ ("Llama-3.2-1B", "meta-llama/Llama-3.2-1B-Instruct"),
66
+ ("Qwen2.5-0.5B", "Qwen/Qwen2.5-0.5B-Instruct"),
67
+ ("Gemma-2-2B", "google/gemma-2-2b-it"),
68
+ ],
69
+ Provider.GROQ: [
70
+ ("Llama-3.2-1B", "llama-3.2-1b-preview"),
71
+ ("Llama-3.2-3B", "llama-3.2-3b-preview"),
72
+ ("Mixtral-8x7B", "mixtral-8x7b-32768"),
73
+ ("Qwen-2.5-Coder-32B", "qwen-2.5-coder-32b"),
74
+ ("Gemma-2-9B-IT", "gemma2-9b-it"),
75
+ ],
76
+ Provider.DEEPSEEK: [
77
+ ("DeepSeek-V3", "deepseek-chat"),
78
+ ("DeepSeek-R1", "deepseek-reasoner"),
79
+ ],
80
+ Provider.OPENROUTER: [
81
+ ("Llama-3.2-1B-Free", "meta-llama/llama-3.2-1b-instruct:free"),
82
+ ("Qwen-2.5-Coder-32B-Free", "qwen/qwen-2.5-coder-32b-instruct:free"),
83
+ ],
84
+ Provider.TOGETHER: [
85
+ ("Llama-3.3-70B-Free", "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"),
86
+ ("Llama-3.2-1B-Free", "meta-llama/Llama-3.2-1B-Instruct-Turbo-Free"),
87
+ ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  }
89
 
90
 
91
+ # ═══════════════════════════════════════════════════════════════
92
+ # HEALTH CHECK + GENERATION
93
+ # ═══════════════════════════════════════════════════════════════
94
@dataclass
class HealthResult:
    """Outcome of a single provider health probe."""

    # Provider that was probed.
    provider: Provider
    status: str  # "online", "offline", "no_key", "rate_limited"
    # Duration of the probe request in milliseconds; 0 when no request was sent.
    latency_ms: float
    # Error description; empty string when there is nothing to report.
    error: str = ""
    # (label, model id) pairs offered for this provider.
    models: List[Tuple[str, str]] = field(default_factory=list)
101
 
102
 
103
@dataclass
class GenerationResult:
    """Result of one chat-completion request against a provider."""

    # Generated text; empty string when the request failed.
    text: str
    # Provider that handled the request.
    provider: Provider
    # Model identifier that was sent in the request payload.
    model: str
    # Request duration in milliseconds.
    latency_ms: float
    # Token counts taken from the response's "usage" object (0 when absent).
    tokens_input: int = 0
    tokens_output: int = 0
    # Error description; empty string on success.
    error: str = ""
 
112
 
113
 
114
+ def _call_api(endpoint: str, api_key: str, payload: Dict[str, Any], timeout: int = 120) -> Tuple[bool, Dict[str, Any], float, str]:
 
115
  body = json.dumps(payload).encode("utf-8")
116
+ headers = {
117
+ "Content-Type": "application/json",
118
+ "Authorization": f"Bearer {api_key}",
119
+ }
120
+ # OpenRouter requires extra headers
121
+ if "openrouter" in endpoint:
122
+ headers["HTTP-Referer"] = "https://huggingface.co/spaces/specimba/nexus-os-space"
123
+ headers["X-Title"] = "NEXUS OS"
124
+
125
+ req = urllib.request.Request(endpoint, data=body, headers=headers, method="POST")
126
  t0 = time.time()
127
  try:
128
  with urllib.request.urlopen(req, timeout=timeout) as resp:
129
  data = json.loads(resp.read().decode("utf-8"))
130
  return True, data, (time.time() - t0) * 1000, ""
131
  except urllib.error.HTTPError as e:
132
+ err = e.read().decode("utf-8", errors="replace")[:300]
133
+ return False, {}, (time.time() - t0) * 1000, f"HTTP {e.code}: {err}"
134
  except Exception as e:
135
  return False, {}, (time.time() - t0) * 1000, str(e)[:200]
136
 
137
 
138
def check_provider_health(provider: Provider, api_key: str) -> HealthResult:
    """Check provider health with a minimal test request.

    Sends a 5-token chat completion to the provider's first configured free
    model and classifies the outcome.

    Args:
        provider: Provider to probe.
        api_key: API key for that provider; an empty value short-circuits
            to ``"no_key"`` without any network traffic.

    Returns:
        A ``HealthResult`` whose ``status`` is one of ``"online"``,
        ``"offline"``, ``"no_key"`` or ``"rate_limited"``. ``models`` always
        carries the provider's configured free models.
    """
    models = FREE_MODELS.get(provider, [])

    def outcome(status: str, latency_ms: float = 0, error: str = "") -> HealthResult:
        return HealthResult(provider=provider, status=status, latency_ms=latency_ms,
                            error=error, models=models)

    if not api_key:
        return outcome("no_key")

    endpoint = ENDPOINTS.get(provider)
    if not endpoint:
        return outcome("offline", error="No endpoint configured")

    # Try a minimal generation
    model_id = models[0][1] if models else ""
    if not model_id:
        return outcome("offline", error="No models configured")

    payload = {
        "model": model_id,
        "messages": [{"role": "user", "content": "Hi"}],
        "max_tokens": 5,
        "temperature": 0.1,
    }

    success, data, latency, error = _call_api(endpoint, api_key, payload, timeout=20)

    if success and data.get("choices"):
        return outcome("online", latency)
    # _call_api formats HTTP failures as "HTTP <code>: <body>". Matching the
    # prefix avoids misclassifying other errors whose body merely contains
    # the digits "429".
    if error.startswith("HTTP 429") or "rate limit" in error.lower():
        return outcome("rate_limited", latency, error)
    # A successful reply without choices would otherwise surface as
    # "offline" with an empty error, leaving nothing to diagnose.
    return outcome("offline", latency, error or "Response contained no choices")
 
 
 
 
 
176
 
177
 
178
def generate_with_provider(provider: Provider, api_key: str, model: str,
                           prompt: str, system: Optional[str] = None,
                           max_tokens: int = 512, temperature: float = 0.7) -> GenerationResult:
    """Generate with a specific provider.

    Args:
        provider: Provider whose endpoint receives the request.
        api_key: API key for that provider.
        model: Model identifier placed in the request payload.
        prompt: User message.
        system: Optional system message; skipped when empty.
        max_tokens: Completion token limit.
        temperature: Sampling temperature.

    Returns:
        A ``GenerationResult``. On any failure ``text`` is ``""`` and
        ``error`` is non-empty; no exception is raised for transport errors
        or malformed responses.
    """
    endpoint = ENDPOINTS.get(provider)
    if not endpoint:
        return GenerationResult(text="", provider=provider, model=model, latency_ms=0,
                                error="No endpoint configured")

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    payload = {
        "model": model,
        "messages": messages,
        # UI sliders may deliver a float; the payload should carry an integer.
        "max_tokens": int(max_tokens),
        "temperature": temperature,
    }

    success, data, latency, error = _call_api(endpoint, api_key, payload)

    if not success:
        return GenerationResult(text="", provider=provider, model=model,
                                latency_ms=latency, error=error)

    # "choices" may be missing, null or an empty list; indexing [0] on an
    # empty list would raise IndexError.
    choices = data.get("choices") or []
    if not choices:
        return GenerationResult(text="", provider=provider, model=model,
                                latency_ms=latency,
                                error="Response contained no choices")

    # Any of these may be present but null in the JSON reply.
    message = choices[0].get("message") or {}
    usage = data.get("usage") or {}

    return GenerationResult(
        text=message.get("content") or "",
        provider=provider,
        model=model,
        latency_ms=latency,
        tokens_input=usage.get("prompt_tokens") or 0,
        tokens_output=usage.get("completion_tokens") or 0,
    )
217
 
218
 
219
  # ═══════════════════════════════════════════════════════════════
220
+ # MODEL REGISTRY (22 models)
221
  # ═══════════════════════════════════════════════════════════════
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
@dataclass
class ModelProfile:
    """Static description of one model in the registry."""

    # Human-readable model name.
    name: str
    # Model family label, e.g. "qwen" or "gemma".
    family: str
    # Tier label as used in REGISTRY, e.g. "8GB", "16GB", "24GB", "48GB", "cloud".
    tier: str
    # Model size in gigabytes (REGISTRY uses 0.0 for cloud entries).
    size_gb: float
    # Parameter count in billions.
    params_b: float
    # Capability tags such as "reasoning", "coding", "vision".
    capabilities: List[str]
    # Default sampling temperature.
    default_temp: float
    # Maximum context length — presumably in tokens; confirm against the model cards.
    max_context: int
232
+
233
+ REGISTRY = {
234
+ "deepseek-r1-8b": ModelProfile("DeepSeek-R1 8B", "deepseek", "16GB", 5.2, 8.0,
235
+ ["reasoning", "coding", "long_context"], 0.6, 128000),
236
+ "qwen2.5-coder-7b": ModelProfile("Qwen 2.5 Coder 7B", "qwen", "16GB", 4.7, 7.0,
237
+ ["coding", "fast"], 0.3, 32768),
238
+ "l3.1-dark-reasoning-8b": ModelProfile("L3.1 Dark Reasoning 8B", "llama", "16GB", 5.7, 8.0,
239
+ ["reasoning", "coding"], 0.7, 32768),
240
+ "omega-evolution-9b": ModelProfile("Omega Evolution 9B", "omega", "16GB", 6.6, 9.0,
241
+ ["reasoning", "coding", "vision"], 0.7, 32768),
242
+ "darwin-9b-opus": ModelProfile("Darwin 9B Opus", "darwin", "16GB", 6.3, 9.0,
243
+ ["reasoning", "coding", "long_context"], 0.7, 65536),
244
+ "qwopus-3.5-9b": ModelProfile("Qwopus 3.5 9B", "qwopus", "16GB", 5.6, 9.0,
245
+ ["reasoning", "coding"], 0.7, 32768),
246
+ "carnice-9b": ModelProfile("Carnice 9B", "carnice", "16GB", 5.6, 9.0,
247
+ ["reasoning", "coding", "vision"], 0.7, 32768),
248
+ "open-search-vl-8b": ModelProfile("OpenSearch VL 8B", "opensearch", "16GB", 6.6, 8.0,
249
+ ["vision", "reasoning", "long_context"], 0.7, 65536),
250
+ "granite-4.1-8b-abliterated": ModelProfile("Granite 4.1 8B Abliterated", "granite", "16GB", 5.1, 8.0,
251
+ ["reasoning", "coding", "long_context"], 0.7, 128000),
252
+ "jaahas-qwen3.5-9b": ModelProfile("Jaahas Qwen 3.5 9B", "qwen", "16GB", 7.4, 9.0,
253
+ ["reasoning", "coding", "multilingual"], 0.7, 32768),
254
+ "lfm2-12b-deckard": ModelProfile("LFM2 12B Deckard", "lfm", "24GB", 5.8, 12.0,
255
+ ["reasoning", "coding", "long_context", "fast"], 0.7, 128000),
256
+ "gemma4-e2b-opus": ModelProfile("Gemma4 E2B Opus", "gemma", "24GB", 5.5, 4.0,
257
+ ["reasoning", "coding", "long_context"], 0.7, 128000),
258
+ "gemma4-uncensored": ModelProfile("Gemma 4 Uncensored", "gemma", "24GB", 4.9, 4.0,
259
+ ["reasoning", "coding", "vision"], 0.7, 32768),
260
+ "gemma4-obliterated": ModelProfile("Gemma 4 OBLITERATED", "gemma", "24GB", 6.3, 4.0,
261
+ ["reasoning", "coding", "vision"], 0.7, 32768),
262
+ "qwen3.6-27b-dflash": ModelProfile("Qwen 3.6 27B DFlash", "qwen", "24GB", 1.0, 27.0,
263
+ ["reasoning", "coding", "long_context", "fast"], 0.7, 128000),
264
+ "gemma4-31b-cloud": ModelProfile("Gemma4 31B Cloud", "gemma", "48GB", 18.0, 31.0,
265
+ ["reasoning", "coding", "vision", "long_context", "multilingual"], 0.7, 128000),
266
+ "nemotron-3-nano-omni-30b": ModelProfile("Nemotron-3 Nano-Omni 30B", "nemotron", "48GB", 18.0, 30.0,
267
+ ["reasoning", "coding", "vision", "long_context", "safety", "tool_use"], 0.6, 256000),
268
+ "opensonnet-lite-max": ModelProfile("OpenSonnet-Lite-MAX", "qwen", "8GB", 2.5, 4.0,
269
+ ["reasoning", "coding", "fast", "long_context"], 0.6, 262144),
270
+ "deepseek-v4-pro": ModelProfile("DeepSeek V4 Pro", "deepseek", "cloud", 0.0, 671.0,
271
+ ["reasoning", "coding", "long_context", "multilingual", "tool_use"], 0.6, 64000),
272
+ "qwen3-coder-next": ModelProfile("Qwen 3 Coder Next", "qwen", "cloud", 0.0, 32.0,
273
+ ["coding", "reasoning", "fast", "long_context", "tool_use"], 0.3, 128000),
274
+ "kimi-k2.6": ModelProfile("Kimi K2.6", "kimi", "cloud", 0.0, 32.0,
275
+ ["reasoning", "coding", "long_context", "multilingual", "vision"], 0.7, 200000),
276
+ "glm-5.1": ModelProfile("GLM 5.1", "glm", "cloud", 0.0, 32.0,
277
+ ["reasoning", "coding", "multilingual", "tool_use", "vision"], 0.7, 128000),
278
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
 
281
  # ═══════════════════════════════════════════════════════════════
282
+ # EXPERIMENT LOG (session state)
283
  # ═══════════════════════════════════════════════════════════════
284
+ experiment_log: List[Dict[str, Any]] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  # ═══════════════════════════════════════════════════════════════
287
+ # GRADIO INTERFACE — Provider Control Center
288
  # ═══════════════════════════════════════════════════════════════
289
def build_control_center():
    """Build the Provider Control Center UI.

    Assembles a ``gr.Blocks`` app with five tabs (provider manager, arena,
    experiment log, Pinecone chat, model registry) and wires their event
    handlers.

    Returns:
        The constructed ``gr.Blocks`` instance; it is not launched here.
    """
    # Single source of truth for ordering: key inputs are zipped against
    # these lists positionally, so they must not be duplicated by hand.
    managed_providers = [Provider.HF_ROUTER, Provider.GROQ, Provider.DEEPSEEK,
                         Provider.OPENROUTER, Provider.TOGETHER, Provider.KILOCODE,
                         Provider.NVIDIA]
    # Only providers that have both an endpoint and a key field can compete.
    arena_provider_list = [p for p in ENDPOINTS if p in managed_providers]
    status_icons = {"online": "🟢", "rate_limited": "🟡",
                    "offline": "🔴", "no_key": "⚪"}

    def log_rows():
        """Render ``experiment_log`` as rows in the log table's column order."""
        columns = ("time", "provider", "model", "prompt",
                   "latency_ms", "tokens_out", "status")
        return [[entry.get(col, "") for col in columns] for entry in experiment_log]

    with gr.Blocks(title="NEXUS OS — Provider Control Center", css="""
    .provider-card { border: 1px solid #ddd; border-radius: 8px; padding: 12px; margin: 4px; }
    .provider-online { border-left: 4px solid #10b981; }
    .provider-offline { border-left: 4px solid #ef4444; }
    .provider-rate { border-left: 4px solid #f59e0b; }
    .provider-nokey { border-left: 4px solid #6b7280; }
    .metric-box { text-align: center; padding: 8px; background: #f9fafb; border-radius: 6px; }
    .metric-value { font-size: 24px; font-weight: bold; color: #1f2937; }
    .metric-label { font-size: 11px; color: #6b7280; text-transform: uppercase; }
    """) as demo:

        gr.Markdown("""
        # 🔥 NEXUS OS — Provider Control Center

        **Manage API providers, compare models, log experiments, chat with your knowledge base.**
        """)

        with gr.Tabs():

            # ═══════════════════════════════════════════════════════
            # TAB 1: Provider Manager
            # ═══════════════════════════════════════════════════════
            with gr.TabItem("🔌 Provider Manager"):
                gr.Markdown("""
                ### Enter your API keys to connect providers

                Keys are stored in **this session only** (not saved to disk).
                """)

                provider_keys = {}
                provider_status = {}

                for provider in managed_providers:
                    env_key = os.environ.get(provider.key_env, "")
                    with gr.Row():
                        key_input = gr.Textbox(
                            label=f"{provider.display_name} API Key",
                            placeholder=f"sk-... or paste your {provider.key_env} here",
                            type="password",
                            value=env_key,
                            scale=3,
                        )
                        status_text = gr.Textbox(
                            label="Status",
                            value="Not checked" if not env_key else "Key set (click Check)",
                            interactive=False,
                            scale=1,
                        )
                    provider_keys[provider] = key_input
                    provider_status[provider] = status_text

                check_all_btn = gr.Button("🔍 Check All Providers", variant="primary")
                health_table = gr.DataFrame(
                    headers=["Provider", "Status", "Latency (ms)", "Free Models", "Error"],
                    label="Provider Health Dashboard",
                    interactive=False,
                )

                def check_all_providers(*keys):
                    """Probe every managed provider; return table rows plus one status string per provider."""
                    rows = []
                    statuses = []
                    for provider, key in zip(managed_providers, keys):
                        health = check_provider_health(provider, (key or "").strip())
                        status_emoji = status_icons.get(health.status, "❔")
                        models_str = ", ".join([m[0] for m in health.models[:3]]) if health.models else "N/A"
                        # The DataFrame expects rows (lists) in header order, not dicts.
                        rows.append([
                            f"{status_emoji} {provider.display_name}",
                            health.status,
                            f"{health.latency_ms:.0f}" if health.latency_ms > 0 else "N/A",
                            models_str,
                            health.error[:100] if health.error else "",
                        ])
                        statuses.append(f"{status_emoji} {health.status}")
                    return [rows, *statuses]

                check_all_btn.click(
                    fn=check_all_providers,
                    inputs=[provider_keys[p] for p in managed_providers],
                    outputs=[health_table] + [provider_status[p] for p in managed_providers],
                )

            # ═══════════════════════════════════════════════════════
            # TAB 2: Side-by-Side Arena
            # ═══════════════════════════════════════════════════════
            with gr.TabItem("⚔️ Side-by-Side Arena"):
                gr.Markdown("""
                ### Send the same prompt to multiple providers and compare

                Select providers, enter a prompt, and see which gives the best response.
                """)

                with gr.Row():
                    arena_prompt = gr.Textbox(
                        label="Prompt",
                        placeholder="Write a Python function to reverse a linked list...",
                        lines=4,
                        scale=2,
                    )
                    arena_system = gr.Textbox(
                        label="System Prompt (optional)",
                        placeholder="You are a helpful coding assistant...",
                        lines=2,
                        scale=1,
                    )

                with gr.Row():
                    arena_providers = gr.CheckboxGroup(
                        label="Select Providers",
                        choices=[(p.display_name, p.name) for p in arena_provider_list],
                        value=[Provider.HF_ROUTER.name, Provider.GROQ.name],
                    )
                    arena_max_tokens = gr.Slider(minimum=64, maximum=2048, value=512, step=64,
                                                 label="Max Tokens")
                    arena_temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.1,
                                                  label="Temperature")

                arena_go = gr.Button("🚀 Run Arena", variant="primary")

                # One initially hidden column per provider; run_arena reveals
                # the columns that received a result.
                arena_outputs = {}
                with gr.Row():
                    for provider in arena_provider_list:
                        with gr.Column(visible=False) as col:
                            arena_outputs[provider] = {
                                "col": col,
                                "text": gr.Textbox(label=f"{provider.display_name}", lines=12, interactive=False),
                                "metrics": gr.Textbox(label="Metrics", interactive=False, lines=2),
                            }

                def run_arena(prompt, system, provider_names, max_tokens, temperature, *keys):
                    """Run the prompt on each selected provider.

                    Returns, per arena provider, a column visibility update,
                    the response text and the metrics line, followed by the
                    experiment log table update.
                    """
                    chosen_names = set(provider_names or [])
                    selected = [p for p in arena_provider_list if p.name in chosen_names]

                    def render(cells, log_update):
                        flat = []
                        for provider in arena_provider_list:
                            cell = cells.get(provider)
                            flat.append(gr.update(visible=cell is not None))
                            flat.extend(cell if cell is not None else ("", ""))
                        flat.append(log_update)
                        return flat

                    if not prompt or not prompt.strip():
                        targets = selected or arena_provider_list[:1]
                        return render({p: ("Please enter a prompt", "") for p in targets},
                                      gr.update())
                    if not selected:
                        return render({p: ("Please select at least one provider", "")
                                       for p in arena_provider_list[:1]}, gr.update())

                    key_map = dict(zip(arena_provider_list, keys))

                    cells = {}
                    for provider in selected:
                        key = (key_map.get(provider) or "").strip()
                        if not key:
                            cells[provider] = (f"❌ No API key for {provider.display_name}", "")
                            continue

                        models = FREE_MODELS.get(provider, [])
                        model = models[0][1] if models else ""

                        result = generate_with_provider(
                            provider, key, model, prompt, system,
                            max_tokens, temperature,
                        )

                        if result.error:
                            cells[provider] = (f"❌ Error: {result.error}", "")
                        else:
                            metrics = f"⏱️ {result.latency_ms:.0f}ms | 📝 {result.tokens_output} tokens | 🎲 {model}"
                            cells[provider] = (result.text, metrics)

                        # NOTE: experiment_log is module-level, so it is shared
                        # by every visitor of the running process.
                        experiment_log.append({
                            "time": time.strftime("%Y-%m-%d %H:%M:%S"),
                            "provider": provider.display_name,
                            "model": model,
                            "prompt": prompt[:50],
                            "latency_ms": round(result.latency_ms),
                            "tokens_out": result.tokens_output,
                            "status": "error" if result.error else "ok",
                        })

                    return render(cells, log_rows())

            # ═══════════════════════════════════════════════════════
            # TAB 3: Experiment Log
            # ═══════════════════════════════════════════════════════
            with gr.TabItem("📊 Experiment Log"):
                gr.Markdown("""
                ### Track and compare your runs

                Each generation is logged with: timestamp, provider, model, latency, tokens, quality score.
                """)

                log_table = gr.DataFrame(
                    headers=["Time", "Provider", "Model", "Prompt (first 50 chars)",
                             "Latency (ms)", "Tokens Out", "Status"],
                    label="Experiment History",
                    interactive=False,
                )

                clear_log_btn = gr.Button("🗑️ Clear Log")
                # TODO: the export button has no handler yet.
                export_log_btn = gr.Button("📥 Export as JSON")

                def clear_log():
                    """Empty the experiment log in place and blank the table."""
                    experiment_log.clear()
                    return []

                clear_log_btn.click(fn=clear_log, outputs=[log_table])

                # Registered here because the arena handler also refreshes
                # the log table, which has to exist first.
                arena_components = []
                for provider in arena_provider_list:
                    parts = arena_outputs[provider]
                    arena_components.extend([parts["col"], parts["text"], parts["metrics"]])

                arena_go.click(
                    fn=run_arena,
                    inputs=[arena_prompt, arena_system, arena_providers, arena_max_tokens,
                            arena_temperature] + [provider_keys[p] for p in arena_provider_list],
                    outputs=arena_components + [log_table],
                )

            # ═══════════════════════════════════════════════════════
            # TAB 4: Pinecone Chat
            # ═══════════════════════════════════════════════════════
            with gr.TabItem("🌲 Pinecone Chat"):
                gr.Markdown("""
                ### Chat with your Pinecone Assistant `pineosman2`

                Uses Pinecone's conversational retrieval over your uploaded documents.
                """)

                pinecone_key = gr.Textbox(
                    label="Pinecone API Key",
                    placeholder="pcsk_...",
                    type="password",
                    value=os.environ.get("PINECONE_API_KEY", ""),
                )

                pinecone_chat = gr.Chatbot(label="Conversation with pineosman2", height=400)
                pinecone_msg = gr.Textbox(label="Your message", placeholder="Ask about your documents...")
                pinecone_send = gr.Button("Send", variant="primary")

                def pinecone_chat_fn(message, history, api_key):
                    """Send one message to the assistant and return the extended history."""
                    # The chatbot value can be None before the first message.
                    history = list(history or [])
                    if not message or not message.strip():
                        return history
                    if not api_key:
                        return history + [(message, "❌ Please enter your Pinecone API key")]

                    # Simple REST call to Pinecone Assistant
                    # NOTE(review): endpoint URL and response shape are taken
                    # as-is; confirm against the Pinecone Assistant API docs.
                    try:
                        import urllib.request
                        payload = json.dumps({
                            "messages": [{"role": "user", "content": message}],
                        }).encode("utf-8")
                        req = urllib.request.Request(
                            "https://api.pinecone.io/assistant/chat/pineosman2",
                            data=payload,
                            headers={
                                "Content-Type": "application/json",
                                "Api-Key": api_key,
                            },
                            method="POST",
                        )
                        with urllib.request.urlopen(req, timeout=60) as resp:
                            data = json.loads(resp.read().decode("utf-8"))
                        reply = (data.get("message") or {}).get("content") or "No response"
                        return history + [(message, reply)]
                    except Exception as e:
                        return history + [(message, f"❌ Error: {str(e)[:200]}")]

                pinecone_send.click(
                    fn=pinecone_chat_fn,
                    inputs=[pinecone_msg, pinecone_chat, pinecone_key],
                    outputs=[pinecone_chat],
                ).then(lambda: "", outputs=[pinecone_msg])

            # ═══════════════════════════════════════════════════════
            # TAB 5: Model Registry
            # ═══════════════════════════════════════════════════════
            with gr.TabItem("📋 Model Registry"):
                # Show the real registry size instead of a hard-coded count.
                gr.Markdown(f"""
                ### Browse all {len(REGISTRY)} models in the NEXUS OS registry
                """)

                registry_table = gr.DataFrame(
                    headers=["ID", "Name", "Family", "Tier", "Size (GB)", "Params (B)",
                             "Capabilities", "Context", "Temp"],
                    label="Registered Models",
                    interactive=False,
                )

                def load_registry():
                    """Return one row per registry entry, in header order."""
                    return [[
                        model_id,
                        profile.name,
                        profile.family,
                        profile.tier,
                        profile.size_gb,
                        profile.params_b,
                        ", ".join(profile.capabilities),
                        profile.max_context,
                        profile.default_temp,
                    ] for model_id, profile in REGISTRY.items()]

                demo.load(fn=load_registry, outputs=[registry_table])

    return demo
581
 
 
584
  if not GRADIO_AVAILABLE:
585
  print("ERROR: Gradio is required.")
586
  sys.exit(1)
587
+ demo = build_control_center()
588
  demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)