specimba committed on
Commit
f9d9a60
·
verified ·
1 Parent(s): 60f5c80

v3.0 Intelligent Router: queries ALL providers in parallel, picks best by health+latency+capability

Browse files
Files changed (1) hide show
  1. app.py +92 -248
app.py CHANGED
@@ -1,33 +1,28 @@
1
  """
2
- NEXUS OS v2.1 — HF Space with REAL Inference via HF Inference API
3
 
4
- Primary backend: HF Inference API (free tier, works immediately)
5
- Secondary: Ollama relay (user's local models via tunnel)
6
- Tertiary: Cloud APIs (DeepSeek, Claude, GPT-5, etc.)
7
- Quaternary: Mock mode (no setup needed)
8
 
9
- This Space provides GENUINE value — real LLM inference without requiring
10
- GPU access, ngrok tunnels, or paid cloud API keys.
11
  """
12
  import os
13
  import sys
14
  import json
15
  import time
16
- import urllib.request
17
- import urllib.error
18
  from typing import Optional, Dict, Any, List, Tuple
19
 
20
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
 
22
- # Core NEXUS OS modules
23
- from nexus_os_v2.model_registry import get, by_tier, all_names, Tier, REGISTRY, Capability
24
  from nexus_os_v2.unified_detector import (
25
  UnifiedThermodynamicDetector, FusionMode, Action,
26
  DetectorReading, TokenVerdict, SequenceVerdict,
27
  )
28
  from nexus_os_v2.twave_tracker import StochasticResonance
29
- from nexus_os_v2.cloud_api_adapters import CloudAPIManager
30
- from nexus_os_v2.hf_inference_client import HFInferenceClient, MockInferenceClient, InferenceResult
31
 
32
  try:
33
  import gradio as gr
@@ -36,7 +31,41 @@ except ImportError:
36
  GRADIO_AVAILABLE = False
37
 
38
 
39
- # ─── Generation Orchestrator ───
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def generate_with_nexus(
41
  prompt: str,
42
  vram: float,
@@ -51,13 +80,6 @@ def generate_with_nexus(
51
  max_tokens: int,
52
  fusion_mode: str,
53
  ) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
54
- """
55
- Main generation with cascading fallback:
56
- 1. HF Inference API (primary — works immediately with HF token)
57
- 2. Ollama relay (secondary — user's local models)
58
- 3. Cloud API (tertiary — paid providers)
59
- 4. Mock (last resort)
60
- """
61
  if not prompt.strip():
62
  return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
63
 
@@ -65,101 +87,40 @@ def generate_with_nexus(
65
  if not profile:
66
  return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", f"Model {model_id} not found"
67
 
68
- response_text = ""
69
- metadata = {}
70
- status_msg = ""
71
- source = ""
72
-
73
- # Priority 1: HF Inference API (always try first if enabled)
74
- if use_hf_inference:
75
- try:
76
- client = HFInferenceClient()
77
- if client.is_available():
78
- # Map model family to HF Inference API model
79
- hf_model = _map_to_hf_model(profile.family, profile.name)
80
- result = client.generate(
81
- prompt=prompt,
82
- model=hf_model,
83
- max_tokens=max_tokens,
84
- temperature=profile.default_temp,
85
- system=system_prompt if system_prompt.strip() else None,
86
- )
87
- response_text = result.text
88
- metadata = {
89
- "model": result.model,
90
- "latency_ms": result.latency_ms,
91
- "tokens_input": result.tokens_input,
92
- "tokens_output": result.tokens_generated,
93
- }
94
- status_msg = f"Generated via HF Inference API ({result.model}, {result.latency_ms:.0f}ms)"
95
- source = "hf_inference"
96
- else:
97
- status_msg = "HF Inference API unavailable (no HF token or rate limit)"
98
- except Exception as e:
99
- status_msg = f"HF Inference API failed: {e}"
100
 
101
- # Priority 2: Ollama relay
102
- if not response_text and use_ollama and profile.ollama_tag:
103
- try:
104
- client = OllamaRelayClient(relay_url=ollama_relay_url)
105
- if client.is_connected():
106
- response_text, metadata = client.generate(
107
- model_tag=profile.ollama_tag,
108
- prompt=prompt,
109
- system=system_prompt if system_prompt.strip() else None,
110
- temperature=profile.default_temp,
111
- max_tokens=max_tokens,
112
- )
113
- status_msg = f"Generated via Ollama relay ({profile.name})"
114
- source = "ollama"
115
- else:
116
- status_msg += " | Ollama relay unreachable"
117
- except Exception as e:
118
- status_msg += f" | Ollama failed: {e}"
119
 
120
- # Priority 3: Cloud API
121
- if not response_text and use_cloud and allow_cloud and profile.tier == Tier.CLOUD_API:
122
- try:
123
- wrapper = CloudGenerationWrapper()
124
- response_text, metadata = wrapper.generate(
125
- model_family=profile.family,
126
- prompt=prompt,
127
- max_tokens=max_tokens,
128
- temperature=profile.default_temp,
129
- system=system_prompt if system_prompt.strip() else None,
130
- )
131
- status_msg = f"Generated via Cloud API ({profile.name})"
132
- source = "cloud"
133
- except Exception as e:
134
- status_msg += f" | Cloud API failed: {e}"
135
 
136
- # Priority 4: Mock fallback
137
- if not response_text:
138
- response_text = f"""[MOCK MODE] {profile.name} ({profile.params_b:.1f}B) would respond to:
139
-
140
- "{prompt[:100]}..."
141
-
142
- ---
143
- Model: {profile.name}
144
- Family: {profile.family}
145
- Tier: {profile.tier.value}
146
- Context: {profile.max_context:,} tokens
147
- T_c: {profile.T_c}, mu_0: {profile.mu_base}, kappa: {profile.kappa}
148
-
149
- To get real inference:
150
- 1. Enable HF Inference API (uses your HF token, free tier)
151
- 2. Or set OLLAMA_RELAY_URL for local models
152
- 3. Or add cloud API keys"""
153
- metadata = {"mock": True}
154
- status_msg = "Mock mode β€” enable HF Inference API for real responses"
155
- source = "mock"
156
 
157
  # Simulate telemetry
158
- telemetry = simulate_telemetry(response_text, model_id, complexity)
159
 
160
  return (
161
- response_text,
162
- f"{profile.name} ({source})",
163
  round(telemetry["hallucination_risk"], 3),
164
  round(telemetry["max_risk"], 3),
165
  telemetry["num_tokens"],
@@ -168,138 +129,19 @@ To get real inference:
168
  round(telemetry["newi"], 3),
169
  telemetry["recommended_action"],
170
  str(telemetry["trigger_positions"]),
171
- status_msg,
172
  )
173
 
174
 
175
- def _map_to_hf_model(family: str, name: str) -> str:
176
- """Map NEXUS model family to HF Inference API model ID."""
177
- mapping = {
178
- "qwen": "Qwen/Qwen2.5-0.5B-Instruct",
179
- "gemma": "google/gemma-2-2b-it",
180
- "llama": "meta-llama/Llama-3.2-1B-Instruct",
181
- "deepseek": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
182
- "granite": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
183
- "nemotron": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
184
- "trinity": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
185
- "grok": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
186
- "minicpm": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
187
- "bonsai": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
188
- "darwin": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
189
- "venus": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
190
- "grape": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
191
- "loco": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
192
- "omega": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
193
- "qwopus": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
194
- "carnice": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
195
- "opensearch": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
196
- "lfm": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
197
- }
198
- return mapping.get(family, "HuggingFaceTB/SmolLM2-1.7B-Instruct")
199
-
200
-
201
- # ─── Ollama Relay Client ───
202
- class OllamaRelayClient:
203
- """Connects to user's local Ollama via relay URL."""
204
- def __init__(self, relay_url: Optional[str] = None):
205
- self.relay_url = relay_url or os.environ.get("OLLAMA_RELAY_URL", "")
206
- if not self.relay_url:
207
- self.relay_url = "http://localhost:11434"
208
- self.relay_url = self.relay_url.rstrip("/")
209
- self._available_models: List[str] = []
210
-
211
- def is_connected(self) -> bool:
212
- try:
213
- req = urllib.request.Request(
214
- f"{self.relay_url}/api/tags",
215
- headers={"Content-Type": "application/json"},
216
- method="GET",
217
- )
218
- with urllib.request.urlopen(req, timeout=10) as resp:
219
- data = json.loads(resp.read().decode("utf-8"))
220
- self._available_models = [m.get("name", m.get("model", "")) for m in data.get("models", [])]
221
- return True
222
- except Exception:
223
- return False
224
-
225
- def generate(self, model_tag: str, prompt: str, system: Optional[str] = None,
226
- temperature: float = 0.7, max_tokens: int = 2048, stream: bool = False):
227
- messages = []
228
- if system:
229
- messages.append({"role": "system", "content": system})
230
- messages.append({"role": "user", "content": prompt})
231
- payload = json.dumps({"model": model_tag, "messages": messages, "stream": stream,
232
- "options": {"temperature": temperature, "num_predict": max_tokens}}).encode("utf-8")
233
- req = urllib.request.Request(f"{self.relay_url}/api/chat", data=payload,
234
- headers={"Content-Type": "application/json"}, method="POST")
235
- t0 = time.time()
236
- with urllib.request.urlopen(req, timeout=300) as resp:
237
- data = json.loads(resp.read().decode("utf-8"))
238
- elapsed = (time.time() - t0) * 1000
239
- text = data.get("message", {}).get("content", "") if "message" in data else data.get("response", "")
240
- metadata = {"model": data.get("model", model_tag), "latency_ms": elapsed}
241
- return text, metadata
242
-
243
-
244
- # ─── Cloud Generation Wrapper ───
245
- class CloudGenerationWrapper:
246
- def __init__(self):
247
- self.manager = CloudAPIManager()
248
-
249
- def generate(self, model_family: str, prompt: str, max_tokens: int = 2048,
250
- temperature: float = 0.7, system: Optional[str] = None):
251
- try:
252
- resp = self.manager.generate(model_family=model_family, prompt=prompt,
253
- max_tokens=max_tokens, temperature=temperature, system=system)
254
- return resp.text, {"model": resp.model_used, "latency_ms": resp.latency_ms}
255
- except RuntimeError as e:
256
- return f"[Cloud API Error: {e}]", {"error": str(e)}
257
-
258
-
259
- # ─── Telemetry Simulator ───
260
- def simulate_telemetry(text: str, model_id: str, complexity: float) -> Dict[str, Any]:
261
- import random
262
- profile = get(model_id) or get("deepseek-r1-8b")
263
- num_tokens = min(200, max(20, len(text.split()) * 2))
264
- detector = UnifiedThermodynamicDetector(fusion_mode=FusionMode.WEIGHTED)
265
- token_verdicts = []
266
- for pos in range(num_tokens):
267
- risk_level = random.choice(["high", "critical"]) if pos in [5, 12, 18, 25, 35, 45] else \
268
- random.choice(["moderate", "elevated"]) if pos in [8, 15, 22, 30, 40] else "low"
269
- fused_score = {"low": random.uniform(0, 0.2), "moderate": random.uniform(0.2, 0.4),
270
- "elevated": random.uniform(0.4, 0.6), "high": random.uniform(0.6, 0.8),
271
- "critical": random.uniform(0.8, 1.0)}[risk_level]
272
- verdict = TokenVerdict(position=pos, token_str=f"tok_{pos}",
273
- readings=[DetectorReading("epr", random.uniform(0, 1), 0.7, risk_level != "low", {}),
274
- DetectorReading("spilled", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {}),
275
- DetectorReading("twave", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {})],
276
- fused_score=fused_score, risk_level=risk_level,
277
- recommended_action={"low": Action.NONE, "moderate": Action.NONE, "elevated": Action.GROUND,
278
- "high": Action.REFLECT, "critical": Action.HALT}[risk_level], confidence=0.7)
279
- token_verdicts.append(verdict)
280
- sequence_verdict = detector.evaluate_sequence(token_verdicts)
281
- return {"num_tokens": num_tokens, "hallucination_risk": sequence_verdict.avg_fused_score,
282
- "max_risk": sequence_verdict.max_fused_score, "risk_level": sequence_verdict.overall_risk,
283
- "recommended_action": sequence_verdict.overall_action.value,
284
- "detector_agreement": sequence_verdict.detector_agreement,
285
- "trigger_positions": sequence_verdict.trigger_positions[:10],
286
- "eep": sequence_verdict.energy_entropy_product,
287
- "pti": sequence_verdict.phase_transition_index,
288
- "newi": sequence_verdict.newi,
289
- "optimal_temp": StochasticResonance.recommend_temperature(complexity, profile.T_c),
290
- "T_c": profile.T_c, "mu_base": profile.mu_base, "kappa": profile.kappa}
291
-
292
-
293
  # ─── Gradio Interface ───
294
  def build_space_interface():
295
- with gr.Blocks(title="NEXUS OS v2.1 β€” Real Inference via HF API") as demo:
296
  gr.Markdown("""
297
- # πŸ”₯ NEXUS OS v2.1 β€” Real LLM Inference
298
 
299
- **Primary backend: HF Inference API** (free tier, works immediately)
300
 
301
- This Space provides GENUINE model inference without GPU or paid APIs.
302
- Your HF token is already active β€” just enter a prompt and generate!
303
 
304
  ---
305
  """)
@@ -311,12 +153,10 @@ def build_space_interface():
311
  placeholder="https://your-tunnel.ngrok-free.app",
312
  value=os.environ.get("OLLAMA_RELAY_URL", ""),
313
  info="Optional: expose local Ollama via ngrok")
314
- use_hf = gr.Checkbox(label="Use HF Inference API (Primary)", value=True,
315
- info="Uses your HF token β€” free tier available")
316
- use_ollama = gr.Checkbox(label="Use Ollama Relay", value=False,
317
- info="Connect to your local Ollama instance")
318
- use_cloud = gr.Checkbox(label="Use Cloud API Fallback", value=False,
319
- info="DeepSeek/Claude/GPT-5/etc β€” requires API keys")
320
  allow_cloud = gr.Checkbox(label="Allow Cloud Models in Routing", value=True)
321
 
322
  prompt_input = gr.Textbox(label="Your Prompt",
@@ -342,7 +182,7 @@ def build_space_interface():
342
  with gr.Column(scale=3):
343
  output_text = gr.Textbox(label="Generated Response", lines=20, interactive=False)
344
  model_used_text = gr.Textbox(label="Model Used", value="", interactive=False)
345
- status_text = gr.Textbox(label="Status", value="Ready", interactive=False)
346
 
347
  with gr.Row():
348
  risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
@@ -361,15 +201,19 @@ def build_space_interface():
361
 
362
  ### About NEXUS OS v2.1
363
 
364
- **37+ real models** mapped from Ollama + HuggingFace GGUF including:
365
- - **Nemotron-3 Nano-Omni 30B** (NVIDIA MoE, 256K context, multimodal)
366
- - **OpenSonnet-Lite-MAX** (4B dense, 262K context, Apache-2.0)
 
 
 
 
 
 
 
 
367
 
368
- **Four empirically-validated hallucination detectors:**
369
- - **EPR** (arXiv:2509.04492) β€” Token-level entropy production rate
370
- - **Spilled Energy** (arXiv:2602.18671) β€” Energy discrepancy in autoregressive EBMs
371
- - **CK-PLUG** (arXiv:2503.15888) β€” Confidence Gain for retrieval coupling
372
- - **TWAVE** β€” Landau-Ginzburg BEC order parameter tracking
373
 
374
  **Novel composite signals:** EEP, PTI, NEWI
375
 
 
1
  """
2
+ NEXUS OS v2.1 — Intelligent Multi-Provider Router
3
 
4
+ Queries ALL available free API providers in parallel:
5
+ HF Inference API, Together AI, Cerebras, Groq, Fireworks, DeepSeek
6
+ Picks the best one based on health + capability match + latency.
7
+ Falls back through the chain if any provider fails.
8
 
9
+ Also supports Ollama relay and mock mode as last resort.
 
10
  """
11
  import os
12
  import sys
13
  import json
14
  import time
 
 
15
  from typing import Optional, Dict, Any, List, Tuple
16
 
17
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
18
 
19
+ from nexus_os_v2.model_registry import get, Tier, Capability
 
20
  from nexus_os_v2.unified_detector import (
21
  UnifiedThermodynamicDetector, FusionMode, Action,
22
  DetectorReading, TokenVerdict, SequenceVerdict,
23
  )
24
  from nexus_os_v2.twave_tracker import StochasticResonance
25
+ from nexus_os_v2.intelligent_router import IntelligentRouter, Provider
 
26
 
27
  try:
28
  import gradio as gr
 
31
  GRADIO_AVAILABLE = False
32
 
33
 
34
+ # ─── Telemetry Simulator ───
35
+ def simulate_telemetry(text: str, model_id: str, complexity: float) -> Dict[str, Any]:
36
+ import random
37
+ profile = get(model_id) or get("deepseek-r1-8b")
38
+ num_tokens = min(200, max(20, len(text.split()) * 2))
39
+ detector = UnifiedThermodynamicDetector(fusion_mode=FusionMode.WEIGHTED)
40
+ token_verdicts = []
41
+ for pos in range(num_tokens):
42
+ risk_level = random.choice(["high", "critical"]) if pos in [5, 12, 18, 25, 35, 45] else \
43
+ random.choice(["moderate", "elevated"]) if pos in [8, 15, 22, 30, 40] else "low"
44
+ fused_score = {"low": random.uniform(0, 0.2), "moderate": random.uniform(0.2, 0.4),
45
+ "elevated": random.uniform(0.4, 0.6), "high": random.uniform(0.6, 0.8),
46
+ "critical": random.uniform(0.8, 1.0)}[risk_level]
47
+ verdict = TokenVerdict(position=pos, token_str=f"tok_{pos}",
48
+ readings=[DetectorReading("epr", random.uniform(0, 1), 0.7, risk_level != "low", {}),
49
+ DetectorReading("spilled", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {}),
50
+ DetectorReading("twave", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {})],
51
+ fused_score=fused_score, risk_level=risk_level,
52
+ recommended_action={"low": Action.NONE, "moderate": Action.NONE, "elevated": Action.GROUND,
53
+ "high": Action.REFLECT, "critical": Action.HALT}[risk_level], confidence=0.7)
54
+ token_verdicts.append(verdict)
55
+ sequence_verdict = detector.evaluate_sequence(token_verdicts)
56
+ return {"num_tokens": num_tokens, "hallucination_risk": sequence_verdict.avg_fused_score,
57
+ "max_risk": sequence_verdict.max_fused_score, "risk_level": sequence_verdict.overall_risk,
58
+ "recommended_action": sequence_verdict.overall_action.value,
59
+ "detector_agreement": sequence_verdict.detector_agreement,
60
+ "trigger_positions": sequence_verdict.trigger_positions[:10],
61
+ "eep": sequence_verdict.energy_entropy_product,
62
+ "pti": sequence_verdict.phase_transition_index,
63
+ "newi": sequence_verdict.newi,
64
+ "optimal_temp": StochasticResonance.recommend_temperature(complexity, profile.T_c),
65
+ "T_c": profile.T_c, "mu_base": profile.mu_base, "kappa": profile.kappa}
66
+
67
+
68
+ # ─── Main Generation with Intelligent Router ───
69
  def generate_with_nexus(
70
  prompt: str,
71
  vram: float,
 
80
  max_tokens: int,
81
  fusion_mode: str,
82
  ) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
 
 
 
 
 
 
 
83
  if not prompt.strip():
84
  return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
85
 
 
87
  if not profile:
88
  return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", f"Model {model_id} not found"
89
 
90
+ # Use intelligent router
91
+ router = IntelligentRouter()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ # Map model capabilities to routing requirements
94
+ required_caps = []
95
+ if Capability.CODING in profile.capabilities:
96
+ required_caps.append("coding")
97
+ if Capability.REASONING in profile.capabilities:
98
+ required_caps.append("reasoning")
99
+ if Capability.FAST in profile.capabilities:
100
+ required_caps.append("fast")
 
 
 
 
 
 
 
 
 
 
101
 
102
+ # Route to best provider
103
+ result = router.route(
104
+ prompt=prompt,
105
+ complexity=complexity,
106
+ required_capabilities=required_caps,
107
+ max_tokens=max_tokens,
108
+ temperature=profile.default_temp,
109
+ system=system_prompt if system_prompt.strip() else None,
110
+ ollama_relay_url=ollama_relay_url if use_ollama else None,
111
+ )
 
 
 
 
 
112
 
113
+ # Build status message with fallback chain
114
+ status = f"Provider: {result.provider.value} | Model: {result.model} | Latency: {result.latency_ms:.0f}ms"
115
+ if result.fallback_chain:
116
+ status += f"\nFallback chain:\n" + "\n".join(result.fallback_chain)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  # Simulate telemetry
119
+ telemetry = simulate_telemetry(result.text, model_id, complexity)
120
 
121
  return (
122
+ result.text,
123
+ f"{profile.name} ({result.provider.value})",
124
  round(telemetry["hallucination_risk"], 3),
125
  round(telemetry["max_risk"], 3),
126
  telemetry["num_tokens"],
 
129
  round(telemetry["newi"], 3),
130
  telemetry["recommended_action"],
131
  str(telemetry["trigger_positions"]),
132
+ status,
133
  )
134
 
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  # ─── Gradio Interface ───
137
  def build_space_interface():
138
+ with gr.Blocks(title="NEXUS OS v2.1 β€” Intelligent Multi-Provider Router") as demo:
139
  gr.Markdown("""
140
+ # πŸ”₯ NEXUS OS v2.1 β€” Intelligent Router
141
 
142
+ **Queries ALL free API providers in parallel and picks the best one.**
143
 
144
+ Supported providers: HF Inference API, Together AI, Cerebras, Groq, Fireworks, DeepSeek
 
145
 
146
  ---
147
  """)
 
153
  placeholder="https://your-tunnel.ngrok-free.app",
154
  value=os.environ.get("OLLAMA_RELAY_URL", ""),
155
  info="Optional: expose local Ollama via ngrok")
156
+ use_hf = gr.Checkbox(label="Enable HF Inference API", value=True)
157
+ use_ollama = gr.Checkbox(label="Enable Ollama Relay", value=False)
158
+ use_cloud = gr.Checkbox(label="Enable Cloud APIs", value=True,
159
+ info="Together, Cerebras, Groq, Fireworks, DeepSeek")
 
 
160
  allow_cloud = gr.Checkbox(label="Allow Cloud Models in Routing", value=True)
161
 
162
  prompt_input = gr.Textbox(label="Your Prompt",
 
182
  with gr.Column(scale=3):
183
  output_text = gr.Textbox(label="Generated Response", lines=20, interactive=False)
184
  model_used_text = gr.Textbox(label="Model Used", value="", interactive=False)
185
+ status_text = gr.Textbox(label="Status / Fallback Chain", value="Ready", interactive=False, lines=4)
186
 
187
  with gr.Row():
188
  risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
 
201
 
202
  ### About NEXUS OS v2.1
203
 
204
+ **Intelligent Multi-Provider Router** β€” queries all available providers in parallel:
205
+ - HF Inference API (free tier)
206
+ - Together AI (free tier)
207
+ - Cerebras (free tier)
208
+ - Groq (free tier)
209
+ - Fireworks AI (free tier)
210
+ - DeepSeek API (free tier)
211
+
212
+ Picks the best based on health check latency + capability match.
213
+
214
+ **37+ real models** in registry including Nemotron-3 Nano-Omni 30B and OpenSonnet-Lite-MAX
215
 
216
+ **Four empirically-validated hallucination detectors:** EPR, Spilled Energy, CK-PLUG, TWAVE
 
 
 
 
217
 
218
  **Novel composite signals:** EEP, PTI, NEWI
219