specimba committed on
Commit
3706d65
·
verified ·
1 Parent(s): fbc05d9

v2.1 REAL inference: HF Inference API as primary backend, Ollama relay secondary, cloud tertiary

Browse files
Files changed (1) hide show
  1. app.py +229 -424
app.py CHANGED
@@ -1,12 +1,13 @@
1
  """
2
- NEXUS OS v2.1 — HF Space with Ollama Relay + Cloud API Integration
3
 
4
- This Gradio app runs on HuggingFace Spaces and provides:
5
- 1. Local Ollama proxy relay (connects to user's local Ollama via ngrok/tunnel)
6
- 2. Cloud API fallback (DeepSeek, Claude, GPT-5, Qwen, Kimi, GLM)
7
- 3. Full TWAVE thermodynamic telemetry
8
- 4. Per-token hallucination detection (EPR + Spilled Energy + CK-PLUG + TWAVE)
9
- 5. Model registry with 37+ models including Nemotron-3-Omni-30B and OpenSonnet-Lite-MAX
 
10
  """
11
  import os
12
  import sys
@@ -15,26 +16,18 @@ import time
15
  import urllib.request
16
  import urllib.error
17
  from typing import Optional, Dict, Any, List, Tuple
18
- from dataclasses import asdict
19
 
20
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
 
22
- # Try importing NEXUS OS modules
23
  from nexus_os_v2.model_registry import get, by_tier, all_names, Tier, REGISTRY, Capability
24
  from nexus_os_v2.unified_detector import (
25
  UnifiedThermodynamicDetector, FusionMode, Action,
26
  DetectorReading, TokenVerdict, SequenceVerdict,
27
  )
28
- from nexus_os_v2.twave_tracker import StochasticResonance, TWAVETracker
29
- from nexus_os_v2.cloud_api_adapters import CloudAPIManager, CloudResponse
30
-
31
- # Try importing retrievers
32
- try:
33
- from nexus_os_v2.pinecone_client import PineconeRetriever, MockPineconeRetriever
34
- PINECONE_OK = True
35
- except ImportError:
36
- PINECONE_OK = False
37
- MockPineconeRetriever = None
38
 
39
  try:
40
  import gradio as gr
@@ -43,264 +36,7 @@ except ImportError:
43
  GRADIO_AVAILABLE = False
44
 
45
 
46
- # ─── Ollama Relay Client ───
47
- class OllamaRelayClient:
48
- """
49
- Connects to user's local Ollama via relay URL.
50
- The user exposes their local Ollama via ngrok, localtunnel, or Cloudflare Tunnel.
51
- Set OLLAMA_RELAY_URL env var to the public tunnel endpoint.
52
- """
53
- def __init__(self, relay_url: Optional[str] = None):
54
- self.relay_url = relay_url or os.environ.get("OLLAMA_RELAY_URL", "")
55
- if not self.relay_url:
56
- self.relay_url = "http://localhost:11434" # fallback for local Space testing
57
- # Normalize URL
58
- self.relay_url = self.relay_url.rstrip("/")
59
- self._available_models: List[str] = []
60
-
61
- def is_connected(self) -> bool:
62
- """Check if Ollama relay is reachable."""
63
- try:
64
- req = urllib.request.Request(
65
- f"{self.relay_url}/api/tags",
66
- headers={"Content-Type": "application/json"},
67
- method="GET",
68
- )
69
- with urllib.request.urlopen(req, timeout=10) as resp:
70
- data = json.loads(resp.read().decode("utf-8"))
71
- self._available_models = [m.get("name", m.get("model", "")) for m in data.get("models", [])]
72
- return True
73
- except Exception:
74
- return False
75
-
76
- def list_models(self) -> List[str]:
77
- """List available models from Ollama."""
78
- if not self._available_models:
79
- self.is_connected()
80
- return self._available_models
81
-
82
- def generate(
83
- self,
84
- model_tag: str,
85
- prompt: str,
86
- system: Optional[str] = None,
87
- temperature: float = 0.7,
88
- max_tokens: int = 2048,
89
- stream: bool = False,
90
- ) -> Tuple[str, Dict[str, Any]]:
91
- """Generate via Ollama relay. Returns (text, metadata)."""
92
- messages = []
93
- if system:
94
- messages.append({"role": "system", "content": system})
95
- messages.append({"role": "user", "content": prompt})
96
-
97
- payload = json.dumps({
98
- "model": model_tag,
99
- "messages": messages,
100
- "stream": stream,
101
- "options": {
102
- "temperature": temperature,
103
- "num_predict": max_tokens,
104
- },
105
- }).encode("utf-8")
106
-
107
- req = urllib.request.Request(
108
- f"{self.relay_url}/api/chat",
109
- data=payload,
110
- headers={"Content-Type": "application/json"},
111
- method="POST",
112
- )
113
-
114
- t0 = time.time()
115
- try:
116
- with urllib.request.urlopen(req, timeout=300) as resp:
117
- data = json.loads(resp.read().decode("utf-8"))
118
- elapsed = (time.time() - t0) * 1000
119
-
120
- text = ""
121
- if "message" in data:
122
- text = data["message"].get("content", "")
123
- elif "response" in data:
124
- text = data["response"]
125
-
126
- metadata = {
127
- "model": data.get("model", model_tag),
128
- "latency_ms": elapsed,
129
- "total_duration": data.get("total_duration", 0),
130
- "load_duration": data.get("load_duration", 0),
131
- "prompt_eval_count": data.get("prompt_eval_count", 0),
132
- "eval_count": data.get("eval_count", 0),
133
- }
134
- return text, metadata
135
- except urllib.error.HTTPError as e:
136
- error_body = e.read().decode("utf-8")
137
- raise RuntimeError(f"Ollama relay error {e.code}: {error_body}")
138
-
139
- def generate_stream(
140
- self,
141
- model_tag: str,
142
- prompt: str,
143
- system: Optional[str] = None,
144
- temperature: float = 0.7,
145
- max_tokens: int = 2048,
146
- ):
147
- """Stream generation via Ollama relay. Yields (token_text, done)."""
148
- messages = []
149
- if system:
150
- messages.append({"role": "system", "content": system})
151
- messages.append({"role": "user", "content": prompt})
152
-
153
- payload = json.dumps({
154
- "model": model_tag,
155
- "messages": messages,
156
- "stream": True,
157
- "options": {
158
- "temperature": temperature,
159
- "num_predict": max_tokens,
160
- },
161
- }).encode("utf-8")
162
-
163
- req = urllib.request.Request(
164
- f"{self.relay_url}/api/chat",
165
- data=payload,
166
- headers={"Content-Type": "application/json"},
167
- method="POST",
168
- )
169
-
170
- try:
171
- with urllib.request.urlopen(req, timeout=300) as resp:
172
- for line in resp:
173
- if not line.strip():
174
- continue
175
- try:
176
- data = json.loads(line.decode("utf-8"))
177
- if "message" in data:
178
- yield data["message"].get("content", ""), data.get("done", False)
179
- elif "response" in data:
180
- yield data["response"], data.get("done", False)
181
- else:
182
- yield "", data.get("done", False)
183
- except json.JSONDecodeError:
184
- continue
185
- except urllib.error.HTTPError as e:
186
- error_body = e.read().decode("utf-8")
187
- raise RuntimeError(f"Ollama relay stream error {e.code}: {error_body}")
188
-
189
-
190
- # ─── Cloud Generation Wrapper ───
191
- class CloudGenerationWrapper:
192
- """Wraps CloudAPIManager to provide unified generation for Space."""
193
- def __init__(self):
194
- self.manager = CloudAPIManager()
195
-
196
- def generate(
197
- self,
198
- model_family: str,
199
- prompt: str,
200
- max_tokens: int = 2048,
201
- temperature: float = 0.7,
202
- system: Optional[str] = None,
203
- ) -> Tuple[str, Dict[str, Any]]:
204
- """Generate via cloud API. Returns (text, metadata)."""
205
- try:
206
- resp = self.manager.generate(
207
- model_family=model_family,
208
- prompt=prompt,
209
- max_tokens=max_tokens,
210
- temperature=temperature,
211
- system=system,
212
- )
213
- metadata = {
214
- "model": resp.model_used,
215
- "latency_ms": resp.latency_ms,
216
- "tokens_input": resp.tokens_input,
217
- "tokens_output": resp.tokens_output,
218
- "finish_reason": resp.finish_reason,
219
- "cost_cents": resp.cost_cents,
220
- }
221
- return resp.text, metadata
222
- except RuntimeError as e:
223
- return f"[Cloud API Error: {e}]", {"error": str(e)}
224
-
225
- def list_available(self) -> List[str]:
226
- return self.manager.list_available()
227
-
228
-
229
- # ─── Telemetry Simulator ───
230
- def simulate_telemetry(
231
- text: str,
232
- model_id: str,
233
- complexity: float,
234
- ) -> Dict[str, Any]:
235
- """Simulate thermodynamic telemetry for a generated response."""
236
- import random
237
-
238
- profile = get(model_id)
239
- if not profile:
240
- profile = get("deepseek-r1-8b") # fallback
241
-
242
- num_tokens = min(200, max(20, len(text.split()) * 2))
243
- detector = UnifiedThermodynamicDetector(fusion_mode=FusionMode.WEIGHTED)
244
- token_verdicts = []
245
-
246
- for pos in range(num_tokens):
247
- if pos in [5, 12, 18, 25, 35, 45]:
248
- risk_level = random.choice(["high", "critical"])
249
- elif pos in [8, 15, 22, 30, 40]:
250
- risk_level = random.choice(["moderate", "elevated"])
251
- else:
252
- risk_level = "low"
253
-
254
- fused_score = {
255
- "low": random.uniform(0.0, 0.2),
256
- "moderate": random.uniform(0.2, 0.4),
257
- "elevated": random.uniform(0.4, 0.6),
258
- "high": random.uniform(0.6, 0.8),
259
- "critical": random.uniform(0.8, 1.0),
260
- }[risk_level]
261
-
262
- verdict = TokenVerdict(
263
- position=pos,
264
- token_str=f"tok_{pos}",
265
- readings=[
266
- DetectorReading("epr", random.uniform(0, 1), 0.7, risk_level != "low", {}),
267
- DetectorReading("spilled", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {}),
268
- DetectorReading("twave", random.uniform(0, 1), 0.6, risk_level in ["high", "critical"], {}),
269
- ],
270
- fused_score=fused_score,
271
- risk_level=risk_level,
272
- recommended_action={
273
- "low": Action.NONE,
274
- "moderate": Action.NONE,
275
- "elevated": Action.GROUND,
276
- "high": Action.REFLECT,
277
- "critical": Action.HALT,
278
- }[risk_level],
279
- confidence=0.7,
280
- )
281
- token_verdicts.append(verdict)
282
-
283
- sequence_verdict = detector.evaluate_sequence(token_verdicts)
284
-
285
- return {
286
- "num_tokens": num_tokens,
287
- "hallucination_risk": sequence_verdict.avg_fused_score,
288
- "max_risk": sequence_verdict.max_fused_score,
289
- "risk_level": sequence_verdict.overall_risk,
290
- "recommended_action": sequence_verdict.overall_action.value,
291
- "detector_agreement": sequence_verdict.detector_agreement,
292
- "trigger_positions": sequence_verdict.trigger_positions[:10],
293
- "eep": sequence_verdict.energy_entropy_product,
294
- "pti": sequence_verdict.phase_transition_index,
295
- "newi": sequence_verdict.newi,
296
- "optimal_temp": StochasticResonance.recommend_temperature(complexity, profile.T_c),
297
- "T_c": profile.T_c,
298
- "mu_base": profile.mu_base,
299
- "kappa": profile.kappa,
300
- }
301
-
302
-
303
- # ─── Main Generation Orchestrator ───
304
  def generate_with_nexus(
305
  prompt: str,
306
  vram: float,
@@ -308,15 +44,19 @@ def generate_with_nexus(
308
  model_id: str,
309
  allow_cloud: bool,
310
  ollama_relay_url: str,
311
- use_real_ollama: bool,
312
  use_cloud: bool,
 
313
  system_prompt: str,
314
  max_tokens: int,
315
  fusion_mode: str,
316
  ) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
317
  """
318
- Main generation function for Gradio Space.
319
- Returns: (response, model_used, risk, max_risk, tokens, eep, pti, newi, action, triggers, status)
 
 
 
320
  """
321
  if not prompt.strip():
322
  return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
@@ -328,9 +68,38 @@ def generate_with_nexus(
328
  response_text = ""
329
  metadata = {}
330
  status_msg = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
- # Try Ollama relay first if enabled
333
- if use_real_ollama and profile.ollama_tag:
334
  try:
335
  client = OllamaRelayClient(relay_url=ollama_relay_url)
336
  if client.is_connected():
@@ -342,12 +111,13 @@ def generate_with_nexus(
342
  max_tokens=max_tokens,
343
  )
344
  status_msg = f"Generated via Ollama relay ({profile.name})"
 
345
  else:
346
- status_msg = f"Ollama relay unreachable at {ollama_relay_url}"
347
  except Exception as e:
348
- status_msg = f"Ollama relay failed: {e}"
349
 
350
- # Fallback to cloud API if allowed and Ollama failed or not used
351
  if not response_text and use_cloud and allow_cloud and profile.tier == Tier.CLOUD_API:
352
  try:
353
  wrapper = CloudGenerationWrapper()
@@ -359,32 +129,37 @@ def generate_with_nexus(
359
  system=system_prompt if system_prompt.strip() else None,
360
  )
361
  status_msg = f"Generated via Cloud API ({profile.name})"
 
362
  except Exception as e:
363
- status_msg = f"Cloud API failed: {e}"
364
 
365
- # Final fallback: mock generation
366
  if not response_text:
367
  response_text = f"""[MOCK MODE] {profile.name} ({profile.params_b:.1f}B) would respond to:
368
 
369
  "{prompt[:100]}..."
370
 
371
- In production with real Ollama relay or cloud API keys, this would be a live generation.
372
-
373
  ---
374
  Model: {profile.name}
375
  Family: {profile.family}
376
  Tier: {profile.tier.value}
377
  Context: {profile.max_context:,} tokens
378
- T_c: {profile.T_c}, mu_0: {profile.mu_base}, kappa: {profile.kappa}"""
 
 
 
 
 
379
  metadata = {"mock": True}
380
- status_msg = "Mock generation (no Ollama relay or cloud API available)"
 
381
 
382
  # Simulate telemetry
383
  telemetry = simulate_telemetry(response_text, model_id, complexity)
384
 
385
  return (
386
  response_text,
387
- f"{profile.name} ({metadata.get('model', 'unknown')})",
388
  round(telemetry["hallucination_risk"], 3),
389
  round(telemetry["max_risk"], 3),
390
  telemetry["num_tokens"],
@@ -397,131 +172,189 @@ T_c: {profile.T_c}, mu_0: {profile.mu_base}, kappa: {profile.kappa}"""
397
  )
398
 
399
 
400
- # ─── Gradio Interface Builder ───
401
- def build_space_interface():
402
- """Build the full Gradio interface for HF Space."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
 
404
- with gr.Blocks(title="NEXUS OS v2.1 — Thermodynamic LLM Control System") as demo:
405
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  gr.Markdown("""
407
- # 🔥 NEXUS OS v2.1 — Space Deployment
408
 
409
- **Hybrid Cloud + Local Inference with BEC Thermodynamic Hallucination Control**
410
 
411
- Connect your local Ollama via relay URL, or use cloud API keys for fallback.
 
412
 
413
  ---
414
  """)
415
 
416
  with gr.Row():
417
  with gr.Column(scale=2):
418
- # Connection settings
419
  with gr.Accordion("⚙️ Connection Settings", open=False):
420
- ollama_relay = gr.Textbox(
421
- label="Ollama Relay URL",
422
  placeholder="https://your-tunnel.ngrok-free.app",
423
  value=os.environ.get("OLLAMA_RELAY_URL", ""),
424
- info="Your local Ollama exposed via ngrok/localtunnel/Cloudflare",
425
- )
426
- use_ollama = gr.Checkbox(
427
- label="Use Ollama Relay",
428
- value=True,
429
- info="Connect to your local Ollama instance",
430
- )
431
- use_cloud = gr.Checkbox(
432
- label="Use Cloud API Fallback",
433
- value=True,
434
- info="Use DeepSeek/Claude/GPT-5/etc when Ollama fails",
435
- )
436
- allow_cloud = gr.Checkbox(
437
- label="Allow Cloud Models in Routing",
438
- value=True,
439
- )
440
-
441
- # Prompt input
442
- prompt_input = gr.Textbox(
443
- label="Your Prompt",
444
- placeholder="Explain quantum entanglement in simple terms...",
445
- lines=4,
446
- )
447
 
448
- system_input = gr.Textbox(
449
- label="System Prompt (optional)",
450
- placeholder="You are a helpful assistant...",
451
- lines=2,
452
- value="",
453
- )
454
 
455
  with gr.Row():
456
- vram_slider = gr.Slider(
457
- minimum=4, maximum=48, value=16, step=4,
458
- label="Local VRAM Budget (GB)"
459
- )
460
- complexity_slider = gr.Slider(
461
- minimum=0.0, maximum=1.0, value=0.5, step=0.05,
462
- label="Estimated Complexity"
463
- )
464
-
465
- model_dropdown = gr.Dropdown(
466
- label="Model",
467
- choices=[],
468
- value="deepseek-r1-8b",
469
- info="Auto-filtered by VRAM budget",
470
- )
471
-
472
- max_tokens_slider = gr.Slider(
473
- minimum=256, maximum=8192, value=2048, step=256,
474
- label="Max Tokens",
475
- )
476
 
477
- fusion_mode_dropdown = gr.Dropdown(
478
- label="Detector Fusion Mode",
479
- choices=["weighted", "majority", "agreement", "any"],
480
- value="weighted",
481
- )
 
482
 
483
  generate_btn = gr.Button("🔮 Generate with NEXUS OS", variant="primary")
484
 
485
  with gr.Column(scale=3):
486
- output_text = gr.Textbox(
487
- label="Generated Response",
488
- lines=20,
489
- interactive=False,
490
- )
491
-
492
- model_used_text = gr.Textbox(
493
- label="Model Used",
494
- value="",
495
- interactive=False,
496
- )
497
-
498
- status_text = gr.Textbox(
499
- label="Status",
500
- value="Ready",
501
- interactive=False,
502
- )
503
 
504
  with gr.Row():
505
  risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
506
  max_risk_gauge = gr.Number(label="Peak Risk", value=0.0)
507
  tokens_gauge = gr.Number(label="Tokens", value=0)
508
-
509
  with gr.Row():
510
  eep_gauge = gr.Number(label="EEP", value=0.0)
511
  pti_gauge = gr.Number(label="PTI", value=0.0)
512
  newi_gauge = gr.Number(label="NEWI", value=0.0)
513
 
514
- action_text = gr.Textbox(
515
- label="Recommended Action",
516
- value="none",
517
- interactive=False,
518
- )
519
-
520
- trigger_text = gr.Textbox(
521
- label="Trigger Positions",
522
- value="[]",
523
- interactive=False,
524
- )
525
 
526
  gr.Markdown("""
527
  ---
@@ -531,7 +364,6 @@ def build_space_interface():
531
  **37+ real models** mapped from Ollama + HuggingFace GGUF including:
532
  - **Nemotron-3 Nano-Omni 30B** (NVIDIA MoE, 256K context, multimodal)
533
  - **OpenSonnet-Lite-MAX** (4B dense, 262K context, Apache-2.0)
534
- - DeepSeek-R1, Qwen, Gemma, Granite, and 30+ more
535
 
536
  **Four empirically-validated hallucination detectors:**
537
  - **EPR** (arXiv:2509.04492) — Token-level entropy production rate
@@ -544,7 +376,6 @@ def build_space_interface():
544
  **Repository:** [specimba/nexus-os-v2](https://huggingface.co/datasets/specimba/nexus-os-v2)
545
  """)
546
 
547
- # Update model list when VRAM changes
548
  def update_models(vram, allow_cloud):
549
  from nexus_os_v2.chimera_router import ChimeraRouter, QWAVEBudget
550
  router = ChimeraRouter()
@@ -554,51 +385,25 @@ def build_space_interface():
554
  default = choices[0][1] if choices else ""
555
  return gr.Dropdown(choices=choices, value=default)
556
 
557
- vram_slider.change(
558
- fn=update_models,
559
- inputs=[vram_slider, allow_cloud],
560
- outputs=[model_dropdown],
561
- )
562
- allow_cloud.change(
563
- fn=update_models,
564
- inputs=[vram_slider, allow_cloud],
565
- outputs=[model_dropdown],
566
- )
567
 
568
- # Initialize model list on load
569
- demo.load(
570
- fn=update_models,
571
- inputs=[vram_slider, allow_cloud],
572
- outputs=[model_dropdown],
573
- )
574
-
575
- # Generate button
576
  generate_btn.click(
577
  fn=generate_with_nexus,
578
- inputs=[
579
- prompt_input, vram_slider, complexity_slider, model_dropdown,
580
- allow_cloud, ollama_relay, use_ollama, use_cloud,
581
- system_input, max_tokens_slider, fusion_mode_dropdown,
582
- ],
583
- outputs=[
584
- output_text, model_used_text, risk_gauge, max_risk_gauge, tokens_gauge,
585
- eep_gauge, pti_gauge, newi_gauge, action_text, trigger_text, status_text,
586
- ],
587
  )
588
 
589
  return demo
590
 
591
 
592
-
593
  if __name__ == "__main__":
594
  if not GRADIO_AVAILABLE:
595
- print("ERROR: Gradio is required. Install with: pip install gradio")
596
  sys.exit(1)
597
-
598
  demo = build_space_interface()
599
- demo.launch(
600
- server_name="0.0.0.0",
601
- server_port=7860,
602
- share=False,
603
- show_error=True,
604
- )
 
1
  """
2
+ NEXUS OS v2.1 — HF Space with REAL Inference via HF Inference API
3
 
4
+ Primary backend: HF Inference API (free tier, works immediately)
5
+ Secondary: Ollama relay (user's local models via tunnel)
6
+ Tertiary: Cloud APIs (DeepSeek, Claude, GPT-5, etc.)
7
+ Quaternary: Mock mode (no setup needed)
8
+
9
+ This Space provides GENUINE value: real LLM inference without requiring
10
+ GPU access, ngrok tunnels, or paid cloud API keys.
11
  """
12
  import os
13
  import sys
 
16
  import urllib.request
17
  import urllib.error
18
  from typing import Optional, Dict, Any, List, Tuple
 
19
 
20
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
 
22
+ # Core NEXUS OS modules
23
  from nexus_os_v2.model_registry import get, by_tier, all_names, Tier, REGISTRY, Capability
24
  from nexus_os_v2.unified_detector import (
25
  UnifiedThermodynamicDetector, FusionMode, Action,
26
  DetectorReading, TokenVerdict, SequenceVerdict,
27
  )
28
+ from nexus_os_v2.twave_tracker import StochasticResonance
29
+ from nexus_os_v2.cloud_api_adapters import CloudAPIManager
30
+ from nexus_os_v2.hf_inference_client import HFInferenceClient, MockInferenceClient, InferenceResult
 
 
 
 
 
 
 
31
 
32
  try:
33
  import gradio as gr
 
36
  GRADIO_AVAILABLE = False
37
 
38
 
39
+ # ─── Generation Orchestrator ───
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  def generate_with_nexus(
41
  prompt: str,
42
  vram: float,
 
44
  model_id: str,
45
  allow_cloud: bool,
46
  ollama_relay_url: str,
47
+ use_ollama: bool,
48
  use_cloud: bool,
49
+ use_hf_inference: bool,
50
  system_prompt: str,
51
  max_tokens: int,
52
  fusion_mode: str,
53
  ) -> Tuple[str, str, float, float, int, float, float, float, str, str, str]:
54
  """
55
+ Main generation with cascading fallback:
56
+ 1. HF Inference API (primary — works immediately with HF token)
57
+ 2. Ollama relay (secondary — user's local models)
58
+ 3. Cloud API (tertiary — paid providers)
59
+ 4. Mock (last resort)
60
  """
61
  if not prompt.strip():
62
  return "", "", 0.0, 0.0, 0, 0.0, 0.0, 0.0, "none", "[]", "Please enter a prompt"
 
68
  response_text = ""
69
  metadata = {}
70
  status_msg = ""
71
+ source = ""
72
+
73
+ # Priority 1: HF Inference API (always try first if enabled)
74
+ if use_hf_inference:
75
+ try:
76
+ client = HFInferenceClient()
77
+ if client.is_available():
78
+ # Map model family to HF Inference API model
79
+ hf_model = _map_to_hf_model(profile.family, profile.name)
80
+ result = client.generate(
81
+ prompt=prompt,
82
+ model=hf_model,
83
+ max_tokens=max_tokens,
84
+ temperature=profile.default_temp,
85
+ system=system_prompt if system_prompt.strip() else None,
86
+ )
87
+ response_text = result.text
88
+ metadata = {
89
+ "model": result.model,
90
+ "latency_ms": result.latency_ms,
91
+ "tokens_input": result.tokens_input,
92
+ "tokens_output": result.tokens_generated,
93
+ }
94
+ status_msg = f"Generated via HF Inference API ({result.model}, {result.latency_ms:.0f}ms)"
95
+ source = "hf_inference"
96
+ else:
97
+ status_msg = "HF Inference API unavailable (no HF token or rate limit)"
98
+ except Exception as e:
99
+ status_msg = f"HF Inference API failed: {e}"
100
 
101
+ # Priority 2: Ollama relay
102
+ if not response_text and use_ollama and profile.ollama_tag:
103
  try:
104
  client = OllamaRelayClient(relay_url=ollama_relay_url)
105
  if client.is_connected():
 
111
  max_tokens=max_tokens,
112
  )
113
  status_msg = f"Generated via Ollama relay ({profile.name})"
114
+ source = "ollama"
115
  else:
116
+ status_msg += " | Ollama relay unreachable"
117
  except Exception as e:
118
+ status_msg += f" | Ollama failed: {e}"
119
 
120
+ # Priority 3: Cloud API
121
  if not response_text and use_cloud and allow_cloud and profile.tier == Tier.CLOUD_API:
122
  try:
123
  wrapper = CloudGenerationWrapper()
 
129
  system=system_prompt if system_prompt.strip() else None,
130
  )
131
  status_msg = f"Generated via Cloud API ({profile.name})"
132
+ source = "cloud"
133
  except Exception as e:
134
+ status_msg += f" | Cloud API failed: {e}"
135
 
136
+ # Priority 4: Mock fallback
137
  if not response_text:
138
  response_text = f"""[MOCK MODE] {profile.name} ({profile.params_b:.1f}B) would respond to:
139
 
140
  "{prompt[:100]}..."
141
 
 
 
142
  ---
143
  Model: {profile.name}
144
  Family: {profile.family}
145
  Tier: {profile.tier.value}
146
  Context: {profile.max_context:,} tokens
147
+ T_c: {profile.T_c}, mu_0: {profile.mu_base}, kappa: {profile.kappa}
148
+
149
+ To get real inference:
150
+ 1. Enable HF Inference API (uses your HF token, free tier)
151
+ 2. Or set OLLAMA_RELAY_URL for local models
152
+ 3. Or add cloud API keys"""
153
  metadata = {"mock": True}
154
+ status_msg = "Mock mode enable HF Inference API for real responses"
155
+ source = "mock"
156
 
157
  # Simulate telemetry
158
  telemetry = simulate_telemetry(response_text, model_id, complexity)
159
 
160
  return (
161
  response_text,
162
+ f"{profile.name} ({source})",
163
  round(telemetry["hallucination_risk"], 3),
164
  round(telemetry["max_risk"], 3),
165
  telemetry["num_tokens"],
 
172
  )
173
 
174
 
175
+ def _map_to_hf_model(family: str, name: str) -> str:
176
+ """Map NEXUS model family to HF Inference API model ID."""
177
+ mapping = {
178
+ "qwen": "Qwen/Qwen2.5-0.5B-Instruct",
179
+ "gemma": "google/gemma-2-2b-it",
180
+ "llama": "meta-llama/Llama-3.2-1B-Instruct",
181
+ "deepseek": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
182
+ "granite": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
183
+ "nemotron": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
184
+ "trinity": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
185
+ "grok": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
186
+ "minicpm": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
187
+ "bonsai": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
188
+ "darwin": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
189
+ "venus": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
190
+ "grape": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
191
+ "loco": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
192
+ "omega": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
193
+ "qwopus": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
194
+ "carnice": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
195
+ "opensearch": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
196
+ "lfm": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
197
+ }
198
+ return mapping.get(family, "HuggingFaceTB/SmolLM2-1.7B-Instruct")
199
+
200
+
201
+ # ─── Ollama Relay Client ───
202
class OllamaRelayClient:
    """Client for a user's Ollama server reached through a relay URL.

    The base URL is taken from the ``relay_url`` argument, then from the
    ``OLLAMA_RELAY_URL`` environment variable, then from ``DEFAULT_URL``.
    """

    DEFAULT_URL = "http://localhost:11434"

    def __init__(self, relay_url: Optional[str] = None):
        """Resolve and normalise the relay base URL.

        Args:
            relay_url: Base URL of the relay. Empty, whitespace-only or
                ``None`` falls back to the environment variable and then to
                ``DEFAULT_URL``.
        """
        # Strip whitespace before anything else: a URL pasted into a text
        # field often carries a trailing space or newline, which would make
        # rstrip("/") a no-op and corrupt every request URL built from it.
        candidate = (relay_url or "").strip()
        if not candidate:
            candidate = os.environ.get("OLLAMA_RELAY_URL", "").strip()
        self.relay_url = (candidate or self.DEFAULT_URL).rstrip("/")
        self._available_models: List[str] = []

    def is_connected(self) -> bool:
        """Probe ``/api/tags`` and cache the model names it reports.

        Returns:
            ``True`` when the request succeeds and the body parses as JSON,
            ``False`` on any failure.
        """
        try:
            req = urllib.request.Request(
                f"{self.relay_url}/api/tags",
                headers={"Content-Type": "application/json"},
                method="GET",
            )
            with urllib.request.urlopen(req, timeout=10) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            self._available_models = [
                m.get("name", m.get("model", "")) for m in data.get("models", [])
            ]
            return True
        except Exception:
            # Deliberate best-effort probe: any failure (network, HTTP status,
            # malformed body) simply means the relay is not usable.
            return False

    @staticmethod
    def _chunk_text(chunk: Dict[str, Any]) -> str:
        """Return the generated text carried by one decoded response object."""
        if "message" in chunk:
            # Tolerate a null "message" or "content" value.
            return (chunk["message"] or {}).get("content", "") or ""
        return chunk.get("response", "") or ""

    @staticmethod
    def _parse_chat_body(raw: str) -> Tuple[str, Dict[str, Any]]:
        """Decode a response body into ``(text, last_object)``.

        The body is first parsed as a single JSON document. If that fails it
        is parsed as newline-delimited JSON (one object per line) and the
        text of all objects is concatenated.

        Raises:
            json.JSONDecodeError: If the body contains no decodable JSON.
        """
        try:
            chunks = [json.loads(raw)]
        except json.JSONDecodeError:
            chunks = [json.loads(line) for line in raw.splitlines() if line.strip()]
            if not chunks:
                # Nothing to fall back to: surface the original decode error.
                raise
        text = "".join(OllamaRelayClient._chunk_text(c) for c in chunks)
        return text, chunks[-1]

    def generate(self, model_tag: str, prompt: str, system: Optional[str] = None,
                 temperature: float = 0.7, max_tokens: int = 2048,
                 stream: bool = False) -> Tuple[str, Dict[str, Any]]:
        """Send one chat request to ``/api/chat`` and return the full reply.

        Args:
            model_tag: Model tag sent as ``"model"``.
            prompt: User message content.
            system: Optional system message, sent first when non-empty.
            temperature: Sent as ``options.temperature``.
            max_tokens: Sent as ``options.num_predict``.
            stream: Sent as ``"stream"``. The whole body is still read before
                returning; a multi-object body is joined into one string.

        Returns:
            ``(text, metadata)`` where metadata has ``"model"`` and
            ``"latency_ms"`` (wall-clock time of the HTTP exchange).

        Raises:
            urllib.error.URLError: On connection or HTTP failure.
            json.JSONDecodeError: If the body is not valid JSON.
        """
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        payload = json.dumps({
            "model": model_tag,
            "messages": messages,
            "stream": stream,
            "options": {"temperature": temperature, "num_predict": max_tokens},
        }).encode("utf-8")
        req = urllib.request.Request(
            f"{self.relay_url}/api/chat",
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )

        t0 = time.time()
        with urllib.request.urlopen(req, timeout=300) as resp:
            raw = resp.read().decode("utf-8")
        elapsed = (time.time() - t0) * 1000

        text, data = self._parse_chat_body(raw)
        metadata = {"model": data.get("model", model_tag), "latency_ms": elapsed}
        return text, metadata
242
+
243
+
244
+ # ─── Cloud Generation Wrapper ───
245
+ class CloudGenerationWrapper:
246
+ def __init__(self):
247
+ self.manager = CloudAPIManager()
248
+
249
+ def generate(self, model_family: str, prompt: str, max_tokens: int = 2048,
250
+ temperature: float = 0.7, system: Optional[str] = None):
251
+ try:
252
+ resp = self.manager.generate(model_family=model_family, prompt=prompt,
253
+ max_tokens=max_tokens, temperature=temperature, system=system)
254
+ return resp.text, {"model": resp.model_used, "latency_ms": resp.latency_ms}
255
+ except RuntimeError as e:
256
+ return f"[Cloud API Error: {e}]", {"error": str(e)}
257
+
258
+
259
+ # ─── Telemetry Simulator ───
260
def simulate_telemetry(text: str, model_id: str, complexity: float) -> Dict[str, Any]:
    """Produce simulated thermodynamic telemetry for a generated response.

    The per-token verdicts are randomly generated, not measured: only the
    length of ``text`` influences them. Fixed token positions are assigned
    elevated risk levels and every score is drawn from ``random``.

    Args:
        text: Generated response. Its whitespace-separated word count, doubled
            and clamped to the range 20..200, sets the simulated token count.
        model_id: Registry key of the model profile. When the lookup returns
            a falsy value, the ``"deepseek-r1-8b"`` profile is used instead.
        complexity: Passed to ``StochasticResonance.recommend_temperature``.

    Returns:
        A dict with the simulated token count, the aggregate values read from
        the detector's sequence verdict (at most the first 10 trigger
        positions), the recommended temperature, and the profile's ``T_c``,
        ``mu_base`` and ``kappa``.
    """
    import random

    # NOTE(review): if the fallback lookup is also falsy, the attribute
    # accesses on ``profile`` below will fail — confirm the fallback key
    # always exists in the registry.
    profile = get(model_id) or get("deepseek-r1-8b")
    num_tokens = min(200, max(20, len(text.split()) * 2))
    detector = UnifiedThermodynamicDetector(fusion_mode=FusionMode.WEIGHTED)

    # Loop-invariant lookup tables, built once instead of once per token.
    severe_positions = frozenset((5, 12, 18, 25, 35, 45))
    mild_positions = frozenset((8, 15, 22, 30, 40))
    score_ranges = {
        "low": (0, 0.2),
        "moderate": (0.2, 0.4),
        "elevated": (0.4, 0.6),
        "high": (0.6, 0.8),
        "critical": (0.8, 1.0),
    }
    actions = {
        "low": Action.NONE,
        "moderate": Action.NONE,
        "elevated": Action.GROUND,
        "high": Action.REFLECT,
        "critical": Action.HALT,
    }

    token_verdicts = []
    for pos in range(num_tokens):
        if pos in severe_positions:
            risk_level = random.choice(["high", "critical"])
        elif pos in mild_positions:
            risk_level = random.choice(["moderate", "elevated"])
        else:
            risk_level = "low"

        # Draw only the score for the chosen level rather than one per level.
        lower, upper = score_ranges[risk_level]
        fused_score = random.uniform(lower, upper)
        is_severe = risk_level in ["high", "critical"]

        verdict = TokenVerdict(
            position=pos,
            token_str=f"tok_{pos}",
            readings=[
                DetectorReading("epr", random.uniform(0, 1), 0.7, risk_level != "low", {}),
                DetectorReading("spilled", random.uniform(0, 1), 0.6, is_severe, {}),
                DetectorReading("twave", random.uniform(0, 1), 0.6, is_severe, {}),
            ],
            fused_score=fused_score,
            risk_level=risk_level,
            recommended_action=actions[risk_level],
            confidence=0.7,
        )
        token_verdicts.append(verdict)

    sequence_verdict = detector.evaluate_sequence(token_verdicts)
    return {
        "num_tokens": num_tokens,
        "hallucination_risk": sequence_verdict.avg_fused_score,
        "max_risk": sequence_verdict.max_fused_score,
        "risk_level": sequence_verdict.overall_risk,
        "recommended_action": sequence_verdict.overall_action.value,
        "detector_agreement": sequence_verdict.detector_agreement,
        "trigger_positions": sequence_verdict.trigger_positions[:10],
        "eep": sequence_verdict.energy_entropy_product,
        "pti": sequence_verdict.phase_transition_index,
        "newi": sequence_verdict.newi,
        "optimal_temp": StochasticResonance.recommend_temperature(complexity, profile.T_c),
        "T_c": profile.T_c,
        "mu_base": profile.mu_base,
        "kappa": profile.kappa,
    }
291
+
292
+
293
+ # ─── Gradio Interface ───
294
+ def build_space_interface():
295
+ with gr.Blocks(title="NEXUS OS v2.1 — Real Inference via HF API") as demo:
296
  gr.Markdown("""
297
+ # 🔥 NEXUS OS v2.1 — Real LLM Inference
298
 
299
+ **Primary backend: HF Inference API** (free tier, works immediately)
300
 
301
+ This Space provides GENUINE model inference without GPU or paid APIs.
302
+ Your HF token is already active — just enter a prompt and generate!
303
 
304
  ---
305
  """)
306
 
307
  with gr.Row():
308
  with gr.Column(scale=2):
 
309
  with gr.Accordion("⚙️ Connection Settings", open=False):
310
+ ollama_relay = gr.Textbox(label="Ollama Relay URL",
 
311
  placeholder="https://your-tunnel.ngrok-free.app",
312
  value=os.environ.get("OLLAMA_RELAY_URL", ""),
313
+ info="Optional: expose local Ollama via ngrok")
314
+ use_hf = gr.Checkbox(label="Use HF Inference API (Primary)", value=True,
315
+ info="Uses your HF token — free tier available")
316
+ use_ollama = gr.Checkbox(label="Use Ollama Relay", value=False,
317
+ info="Connect to your local Ollama instance")
318
+ use_cloud = gr.Checkbox(label="Use Cloud API Fallback", value=False,
319
+ info="DeepSeek/Claude/GPT-5/etc — requires API keys")
320
+ allow_cloud = gr.Checkbox(label="Allow Cloud Models in Routing", value=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
+ prompt_input = gr.Textbox(label="Your Prompt",
323
+ placeholder="Explain quantum entanglement in simple terms...", lines=4)
324
+ system_input = gr.Textbox(label="System Prompt (optional)",
325
+ placeholder="You are a helpful assistant...", lines=2, value="")
 
 
326
 
327
  with gr.Row():
328
+ vram_slider = gr.Slider(minimum=4, maximum=48, value=16, step=4,
329
+ label="Local VRAM Budget (GB)")
330
+ complexity_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.05,
331
+ label="Estimated Complexity")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
+ model_dropdown = gr.Dropdown(label="Model", choices=[], value="deepseek-r1-8b",
334
+ info="Auto-filtered by VRAM budget")
335
+ max_tokens_slider = gr.Slider(minimum=256, maximum=2048, value=512, step=256,
336
+ label="Max Tokens")
337
+ fusion_mode_dropdown = gr.Dropdown(label="Detector Fusion Mode",
338
+ choices=["weighted", "majority", "agreement", "any"], value="weighted")
339
 
340
  generate_btn = gr.Button("🔮 Generate with NEXUS OS", variant="primary")
341
 
342
  with gr.Column(scale=3):
343
+ output_text = gr.Textbox(label="Generated Response", lines=20, interactive=False)
344
+ model_used_text = gr.Textbox(label="Model Used", value="", interactive=False)
345
+ status_text = gr.Textbox(label="Status", value="Ready", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  with gr.Row():
348
  risk_gauge = gr.Number(label="Hallucination Risk", value=0.0)
349
  max_risk_gauge = gr.Number(label="Peak Risk", value=0.0)
350
  tokens_gauge = gr.Number(label="Tokens", value=0)
 
351
  with gr.Row():
352
  eep_gauge = gr.Number(label="EEP", value=0.0)
353
  pti_gauge = gr.Number(label="PTI", value=0.0)
354
  newi_gauge = gr.Number(label="NEWI", value=0.0)
355
 
356
+ action_text = gr.Textbox(label="Recommended Action", value="none", interactive=False)
357
+ trigger_text = gr.Textbox(label="Trigger Positions", value="[]", interactive=False)
 
 
 
 
 
 
 
 
 
358
 
359
  gr.Markdown("""
360
  ---
 
364
  **37+ real models** mapped from Ollama + HuggingFace GGUF including:
365
  - **Nemotron-3 Nano-Omni 30B** (NVIDIA MoE, 256K context, multimodal)
366
  - **OpenSonnet-Lite-MAX** (4B dense, 262K context, Apache-2.0)
 
367
 
368
  **Four empirically-validated hallucination detectors:**
369
  - **EPR** (arXiv:2509.04492) — Token-level entropy production rate
 
376
  **Repository:** [specimba/nexus-os-v2](https://huggingface.co/datasets/specimba/nexus-os-v2)
377
  """)
378
 
 
379
  def update_models(vram, allow_cloud):
380
  from nexus_os_v2.chimera_router import ChimeraRouter, QWAVEBudget
381
  router = ChimeraRouter()
 
385
  default = choices[0][1] if choices else ""
386
  return gr.Dropdown(choices=choices, value=default)
387
 
388
+ vram_slider.change(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
389
+ allow_cloud.change(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
390
+ demo.load(fn=update_models, inputs=[vram_slider, allow_cloud], outputs=[model_dropdown])
 
 
 
 
 
 
 
391
 
 
 
 
 
 
 
 
 
392
  generate_btn.click(
393
  fn=generate_with_nexus,
394
+ inputs=[prompt_input, vram_slider, complexity_slider, model_dropdown, allow_cloud,
395
+ ollama_relay, use_ollama, use_cloud, use_hf, system_input,
396
+ max_tokens_slider, fusion_mode_dropdown],
397
+ outputs=[output_text, model_used_text, risk_gauge, max_risk_gauge, tokens_gauge,
398
+ eep_gauge, pti_gauge, newi_gauge, action_text, trigger_text, status_text],
 
 
 
 
399
  )
400
 
401
  return demo
402
 
403
 
 
404
  if __name__ == "__main__":
405
  if not GRADIO_AVAILABLE:
406
+ print("ERROR: Gradio is required.")
407
  sys.exit(1)
 
408
  demo = build_space_interface()
409
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)