specimba commited on
Commit
baea714
·
verified ·
1 Parent(s): 3412619

Copy nexus_os_v2/chimera_router.py from dataset for module imports

Browse files
Files changed (1) hide show
  1. nexus_os_v2/chimera_router.py +386 -0
nexus_os_v2/chimera_router.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ChimeraRouter v2 — Hybrid Cloud+Local Inference Orchestrator
3
+ with QWAVE Budget Allocation and TWAVE Thermodynamic Control
4
+
5
+ Pipeline:
6
+ 1. Sulphur Prompt Enhancer → classify intent + complexity
7
+ 2. QWAVE Budget Allocator → local vs cloud decision
8
+ 3. Model Selection → pick best model from registry
9
+ 4. TWAVE Tracker → initialize thermodynamic state
10
+ 5. Retrieval → Pinecone + Milvus + ERNIE (CK-PLUG coupling)
11
+ 6. Generation Loop → Ollama (local) or Cloud API
12
+ 7. Reflection → TWAVE triggers → grounding boost or model fallback
13
+ 8. Output → response + per_token_debug telemetry
14
+ """
15
+ from typing import List, Dict, Optional, Any, Tuple
16
+ from dataclasses import dataclass, field
17
+ from enum import Enum
18
+
19
+ from .model_registry import REGISTRY, SULPHUR, get, by_tier, by_cap, Tier, Capability, ModelProfile
20
+ from .sulphur_enhancer import SulphurEnhancer, MockSulphurEnhancer, EnhancedPrompt
21
+ from .twave_tracker import TWAVETracker, TokenState, StochasticResonance
22
+ from .ckplug_retriever import CKPLUGCoupling, get_preset_epsilon
23
+ from .pinecone_client import PineconeRetriever, MockPineconeRetriever
24
+ from .milvus_client import MilvusRetriever, MockMilvusRetriever
25
+ from .ernie_adapter import ERNIEAdapter, MockERNIEAdapter
26
+
27
+
28
+ class RoutingDecision(Enum):
29
+ LOCAL_OLLAMA = "local_ollama"
30
+ CLOUD_API = "cloud_api"
31
+ FALLBACK = "fallback" # All tiers exhausted
32
+ REFLECTION = "reflection" # TWAVE triggered re-grounding
33
+
34
+
35
+ @dataclass
36
+ class QWAVEBudget:
37
+ """Quality-Wave budget allocation per request."""
38
+ max_tokens: int = 4096
39
+ target_latency_ms: float = 2000.0
40
+ vram_budget_gb: float = 8.0 # User's local GPU VRAM
41
+ cloud_budget_cents: float = 5.0 # Per-request cloud budget (cents)
42
+ allow_cloud: bool = True
43
+ allow_uncensored: bool = True
44
+ require_vision: bool = False
45
+ require_safety: bool = False
46
+ require_tools: bool = False
47
+
48
+
49
+ @dataclass
50
+ class RouterResult:
51
+ """Complete routing result with telemetry."""
52
+ selected_model: str
53
+ model_profile: ModelProfile
54
+ tier: str
55
+ enhanced_prompt: str
56
+ response: str
57
+ token_states: List[TokenState] = field(default_factory=list)
58
+ reflection_count: int = 0
59
+ grounding_score: float = 0.0
60
+ hallucination_risk: float = 0.0
61
+ latency_ms: float = 0.0
62
+ tokens_generated: int = 0
63
+ debug: Dict[str, Any] = field(default_factory=dict)
64
+
65
+
66
+ class ChimeraRouter:
67
+ """
68
+ Production router for NEXUS OS v2.
69
+ Integrates all subsystems into a unified inference pipeline.
70
+ """
71
+
72
+ def __init__(
73
+ self,
74
+ sulphur: Optional[Any] = None, # SulphurEnhancer or mock
75
+ pinecone: Optional[Any] = None, # PineconeRetriever or mock
76
+ milvus: Optional[Any] = None, # MilvusRetriever or mock
77
+ ernie: Optional[Any] = None, # ERNIEAdapter or mock
78
+ twave: Optional[TWAVETracker] = None,
79
+ ollama_host: str = "http://localhost:11434",
80
+ default_budget: Optional[QWAVEBudget] = None,
81
+ ):
82
+ # Subsystems (use mocks if none provided)
83
+ self.sulphur = sulphur or MockSulphurEnhancer()
84
+ self.pinecone = pinecone or MockPineconeRetriever()
85
+ self.milvus = milvus or MockMilvusRetriever()
86
+ self.ernie = ernie or MockERNIEAdapter()
87
+ self.twave = twave or TWAVETracker()
88
+ self.ollama_host = ollama_host
89
+ self.default_budget = default_budget or QWAVEBudget()
90
+
91
+ def _enhance(self, prompt: str) -> EnhancedPrompt:
92
+ """Step 1: Sulphur prompt enhancement."""
93
+ return self.sulphur.enhance(prompt)
94
+
95
+ def _select_model(self, enhanced: EnhancedPrompt, budget: QWAVEBudget) -> Tuple[str, ModelProfile]:
96
+ """
97
+ Step 2-3: QWAVE budget allocation + model selection.
98
+ Returns (model_id, profile).
99
+ """
100
+ # Determine required capabilities from tags
101
+ required_caps = []
102
+ for tag in enhanced.intent_tags:
103
+ cap_map = {
104
+ "coding": Capability.CODING,
105
+ "reasoning": Capability.REASONING,
106
+ "vision": Capability.VISION,
107
+ "creative": Capability.INSTRUCT, # Creative uses instruct-capable models
108
+ "factual": Capability.REASONING,
109
+ "safety": Capability.SAFETY,
110
+ "fast": Capability.FAST,
111
+ "long_context": Capability.LONG_CONTEXT,
112
+ }
113
+ if tag.lower() in cap_map:
114
+ required_caps.append(cap_map[tag.lower()])
115
+
116
+ # Filter by safety/uncensored requirements
117
+ if budget.require_safety:
118
+ # Exclude abliterated/unchained models
119
+ exclude = [Capability.ABLITERATED, Capability.UNCHAINED]
120
+ elif budget.allow_uncensored:
121
+ exclude = []
122
+ else:
123
+ exclude = [Capability.ABLITERATED]
124
+
125
+ # Determine tier from complexity + VRAM budget
126
+ if enhanced.complexity_score > 0.8 and budget.allow_cloud:
127
+ # High complexity → cloud frontier or largest local
128
+ preferred_tiers = [Tier.CLOUD_API, Tier.LOCAL_48GB, Tier.LOCAL_24GB]
129
+ elif enhanced.complexity_score > 0.6:
130
+ preferred_tiers = [Tier.LOCAL_24GB, Tier.LOCAL_16GB, Tier.CLOUD_API]
131
+ elif enhanced.complexity_score > 0.4:
132
+ preferred_tiers = [Tier.LOCAL_16GB, Tier.LOCAL_8GB]
133
+ else:
134
+ preferred_tiers = [Tier.LOCAL_8GB]
135
+
136
+ # Build candidate list
137
+ candidates = []
138
+ for tier in preferred_tiers:
139
+ tier_models = by_tier(tier)
140
+ for m in tier_models:
141
+ # Check capability match
142
+ if required_caps and not all(c in m.capabilities for c in required_caps):
143
+ continue
144
+ # Check exclusions
145
+ if any(c in m.capabilities for c in exclude):
146
+ continue
147
+ # Check VRAM (local only)
148
+ if tier != Tier.CLOUD_API and m.size_gb > budget.vram_budget_gb:
149
+ continue
150
+ candidates.append(m)
151
+
152
+ if not candidates:
153
+ # Fallback: any model that fits
154
+ all_models = list(REGISTRY.values())
155
+ candidates = [m for m in all_models if m.tier != Tier.CLOUD_API and m.size_gb <= budget.vram_budget_gb]
156
+ if not candidates and budget.allow_cloud:
157
+ candidates = by_tier(Tier.CLOUD_API)
158
+
159
+ if not candidates:
160
+ raise RuntimeError("No models available for this request. Check VRAM budget or enable cloud.")
161
+
162
+ # Score candidates: prefer higher params for complex, lower for fast
163
+ def score_model(m: ModelProfile) -> float:
164
+ s = 0.0
165
+ # Capability match bonus
166
+ for cap in required_caps:
167
+ if cap in m.capabilities:
168
+ s += 10.0
169
+ # Complexity alignment
170
+ if enhanced.complexity_score > 0.7:
171
+ s += m.params_b * 2.0 # Bigger models for complex
172
+ else:
173
+ s += (10.0 - m.params_b) * 0.5 # Smaller for simple
174
+ # Speed bonus
175
+ if Capability.FAST in m.capabilities and "fast" in enhanced.intent_tags:
176
+ s += 5.0
177
+ # VRAM efficiency (prefer smaller if equal)
178
+ s -= m.size_gb * 0.1
179
+ return s
180
+
181
+ candidates.sort(key=score_model, reverse=True)
182
+ best = candidates[0]
183
+
184
+ # Find registry key
185
+ for k, v in REGISTRY.items():
186
+ if v == best:
187
+ return k, best
188
+
189
+ raise RuntimeError("Model selected but not found in registry.")
190
+
191
+ def _retrieve(self, query: str) -> Dict[str, Any]:
192
+ """Step 5: Multi-source retrieval aggregation."""
193
+ results = {
194
+ "pinecone": [],
195
+ "milvus": [],
196
+ "ernie": [],
197
+ "aggregated": [],
198
+ }
199
+
200
+ try:
201
+ results["pinecone"] = self.pinecone.get_evidence_for_ckplug(query)
202
+ except Exception as e:
203
+ results["pinecone_error"] = str(e)
204
+
205
+ try:
206
+ results["milvus"] = self.milvus.get_evidence("nexus_docs", query)
207
+ except Exception as e:
208
+ results["milvus_error"] = str(e)
209
+
210
+ try:
211
+ if self.ernie.is_available():
212
+ results["ernie"] = self.ernie.get_evidence(query)
213
+ except Exception as e:
214
+ results["ernie_error"] = str(e)
215
+
216
+ # Aggregate all evidence by relevance score
217
+ all_evidence = []
218
+ for src in [results["pinecone"], results["milvus"], results["ernie"]]:
219
+ for item in src:
220
+ all_evidence.append({
221
+ "text": item.get("text", ""),
222
+ "relevance": item.get("relevance", 0.0),
223
+ "source": item.get("type", "unknown"),
224
+ })
225
+
226
+ all_evidence.sort(key=lambda x: x["relevance"], reverse=True)
227
+ results["aggregated"] = all_evidence[:10] # Top 10
228
+ results["top_score"] = all_evidence[0]["relevance"] if all_evidence else 0.0
229
+
230
+ return results
231
+
232
+ def _generate_local(self, model_tag: str, prompt: str, max_tokens: int, temperature: float) -> str:
233
+ """Generate via Ollama API."""
234
+ import urllib.request
235
+ import urllib.error
236
+ import json
237
+
238
+ payload = json.dumps({
239
+ "model": model_tag,
240
+ "messages": [{"role": "user", "content": prompt}],
241
+ "stream": False,
242
+ "options": {
243
+ "temperature": temperature,
244
+ "num_predict": max_tokens,
245
+ },
246
+ }).encode("utf-8")
247
+
248
+ req = urllib.request.Request(
249
+ f"{self.ollama_host}/api/chat",
250
+ data=payload,
251
+ headers={"Content-Type": "application/json"},
252
+ method="POST",
253
+ )
254
+
255
+ try:
256
+ with urllib.request.urlopen(req, timeout=300) as resp:
257
+ data = json.loads(resp.read().decode("utf-8"))
258
+ return data.get("message", {}).get("content", "")
259
+ except urllib.error.URLError as e:
260
+ raise RuntimeError(f"Ollama error: {e}")
261
+
262
+ def _generate_cloud(self, cloud_tag: str, prompt: str, max_tokens: int, temperature: float) -> str:
263
+ """Generate via cloud API."""
264
+ # Placeholder — actual implementation depends on provider SDK
265
+ # DeepSeek, Qwen, Kimi, GLM, GPT-5, Claude each have different APIs
266
+ provider = cloud_tag.split(":")[0] if ":" in cloud_tag else "unknown"
267
+ return f"[CLOUD:{provider}] {prompt[:100]}... (cloud generation placeholder)"
268
+
269
+ def route(
270
+ self,
271
+ prompt: str,
272
+ budget: Optional[QWAVEBudget] = None,
273
+ custom_model: Optional[str] = None,
274
+ ) -> RouterResult:
275
+ """
276
+ Main routing entry point.
277
+ Full pipeline: enhance → select → retrieve → generate → track.
278
+ """
279
+ budget = budget or self.default_budget
280
+
281
+ # Step 1: Enhance
282
+ enhanced = self._enhance(prompt)
283
+
284
+ # Step 2-3: Select model
285
+ if custom_model:
286
+ model_id = custom_model
287
+ profile = get(model_id)
288
+ if not profile:
289
+ raise ValueError(f"Unknown model: {custom_model}")
290
+ else:
291
+ model_id, profile = self._select_model(enhanced, budget)
292
+
293
+ # Step 4: Initialize TWAVE with model-specific parameters
294
+ model_family = profile.family
295
+ epsilon = get_preset_epsilon(model_family)
296
+ ckplug = CKPLUGCoupling(epsilon=epsilon, mu_0=profile.mu_base)
297
+ twave = TWAVETracker(
298
+ T_c=profile.T_c,
299
+ mu_0=profile.mu_base,
300
+ kappa=profile.kappa,
301
+ )
302
+
303
+ # Step 5: Retrieve
304
+ evidence = self._retrieve(enhanced.enhanced)
305
+ top_evidence_text = "\n".join([e["text"] for e in evidence["aggregated"][:3]])
306
+
307
+ # Build final prompt with evidence
308
+ final_prompt = f"""Retrieved evidence:
309
+ {top_evidence_text}
310
+
311
+ ---
312
+
313
+ User request:
314
+ {enhanced.enhanced}"""
315
+
316
+ # Step 6: Generate
317
+ import time
318
+ t0 = time.time()
319
+
320
+ if profile.tier == Tier.CLOUD_API:
321
+ response = self._generate_cloud(
322
+ profile.cloud_tag or model_id,
323
+ final_prompt,
324
+ budget.max_tokens,
325
+ profile.default_temp,
326
+ )
327
+ else:
328
+ response = self._generate_local(
329
+ profile.ollama_tag or model_id,
330
+ final_prompt,
331
+ budget.max_tokens,
332
+ profile.default_temp,
333
+ )
334
+
335
+ latency_ms = (time.time() - t0) * 1000
336
+ tokens_est = len(response.split()) # Rough estimate
337
+
338
+ # Step 7: TWAVE tracking (mock for now — needs actual logit extraction)
339
+ # In production, this runs inside the generation loop
340
+ states = [] # Would be populated by per-token hooks
341
+
342
+ # Step 8: Assemble result
343
+ return RouterResult(
344
+ selected_model=model_id,
345
+ model_profile=profile,
346
+ tier=profile.tier.value,
347
+ enhanced_prompt=enhanced.enhanced,
348
+ response=response,
349
+ token_states=states,
350
+ reflection_count=0,
351
+ grounding_score=evidence.get("top_score", 0.0),
352
+ hallucination_risk=0.0, # Would be computed from states
353
+ latency_ms=latency_ms,
354
+ tokens_generated=tokens_est,
355
+ debug={
356
+ "enhancement": enhanced,
357
+ "evidence_summary": evidence,
358
+ "budget": budget,
359
+ "ckplug_epsilon": epsilon,
360
+ },
361
+ )
362
+
363
+ def quick_route(self, prompt: str, budget: Optional[QWAVEBudget] = None) -> str:
364
+ """One-liner: just get the response text."""
365
+ return self.route(prompt, budget).response
366
+
367
+ def get_available_models(self, budget: Optional[QWAVEBudget] = None) -> List[Dict[str, Any]]:
368
+ """List models available under current budget."""
369
+ budget = budget or self.default_budget
370
+ available = []
371
+ for name, profile in REGISTRY.items():
372
+ fits = True
373
+ if profile.tier != Tier.CLOUD_API and profile.size_gb > budget.vram_budget_gb:
374
+ fits = False
375
+ if profile.tier == Tier.CLOUD_API and not budget.allow_cloud:
376
+ fits = False
377
+ available.append({
378
+ "id": name,
379
+ "name": profile.name,
380
+ "tier": profile.tier.value,
381
+ "size_gb": profile.size_gb,
382
+ "params_b": profile.params_b,
383
+ "capabilities": [c.value for c in profile.capabilities],
384
+ "fits_budget": fits,
385
+ })
386
+ return available