specimba commited on
Commit
baeecf7
·
verified ·
1 Parent(s): 2a24138

Rename chimera_router_v2.1.py to chimera_router_v2_1.py (fix Python import syntax)

Browse files
Files changed (1) hide show
  1. nexus_os_v2/chimera_router_v2_1.py +644 -0
nexus_os_v2/chimera_router_v2_1.py ADDED
@@ -0,0 +1,644 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ChimeraRouter v2.1 — Production Telemetry-Integrated Inference Orchestrator
3
+
4
+ Integrates:
5
+ - Sulphur prompt enhancement
6
+ - QWAVE budget allocation
7
+ - Multi-source retrieval (Pinecone + Milvus + ERNIE)
8
+ - TWAVE token-level thermodynamic tracking via Ollama telemetry
9
+ - CK-PLUG confidence gain coupling
10
+ - EPR + Spilled Energy unified detector
11
+ - Cloud API adapters (DeepSeek, Qwen, Kimi, GLM, OpenAI, Claude)
12
+ - Model fallback controller (reflection → grounding → switch → cloud)
13
+ - Stochastic resonance optimal temperature
14
+
15
+ Architecture:
16
+ 1. Sulphur enhancement
17
+ 2. QWAVE budget → model selection
18
+ 3. Multi-source retrieval
19
+ 4. Stochastic resonance T_eff optimization
20
+ 5. TWAVE tracker + unified detector initialization
21
+ 6. Generation (Ollama or Cloud API)
22
+ 7. Post-hoc telemetry + per_token_debug
23
+ 8. Fallback controller if risk too high
24
+ 9. Result assembly
25
+ """
26
+ import os
27
+ import time
28
+ import json
29
+ from typing import List, Dict, Optional, Any, Tuple
30
+ from dataclasses import dataclass, field
31
+ from enum import Enum
32
+
33
+ from .model_registry import REGISTRY, SULPHUR, get, by_tier, by_cap, Tier, Capability, ModelProfile
34
+ from .sulphur_enhancer import SulphurEnhancer, MockSulphurEnhancer, EnhancedPrompt
35
+ from .twave_tracker import TWAVETracker, TokenState, StochasticResonance, GenerationTrajectory
36
+ from .ckplug_retriever import CKPLUGCoupling, get_preset_epsilon
37
+ from .pinecone_client import PineconeRetriever, MockPineconeRetriever
38
+ from .milvus_client import MilvusRetriever, MockMilvusRetriever
39
+ from .ernie_adapter import ERNIEAdapter, MockERNIEAdapter
40
+ from .ollama_telemetry import OllamaStreamingClient, OllamaTelemetryExtractor, estimate_entropy_from_response
41
+ from .per_token_debug import PerTokenDebug, GenerationTelemetry
42
+ from .unified_detector import UnifiedThermodynamicDetector, FusionMode, Action
43
+ from .epr_detector import EPRDetector, SequenceEPR
44
+ from .spilled_energy import SpilledEnergyDetector, CombinedThermodynamicDetector
45
+ from .cloud_api_adapters import CloudAPIManager, CloudResponse
46
+
47
+
48
+ class RoutingDecision(Enum):
49
+ LOCAL_OLLAMA = "local_ollama"
50
+ CLOUD_API = "cloud_api"
51
+ FALLBACK = "fallback"
52
+ REFLECTION = "reflection"
53
+
54
+
55
+ @dataclass
56
+ class QWAVEBudget:
57
+ max_tokens: int = 4096
58
+ target_latency_ms: float = 2000.0
59
+ vram_budget_gb: float = 8.0
60
+ cloud_budget_cents: float = 5.0
61
+ allow_cloud: bool = True
62
+ allow_uncensored: bool = True
63
+ require_vision: bool = False
64
+ require_safety: bool = False
65
+ require_tools: bool = False
66
+
67
+
68
+ @dataclass
69
+ class RouterResult:
70
+ selected_model: str
71
+ model_profile: ModelProfile
72
+ tier: str
73
+ enhanced_prompt: str
74
+ response: str
75
+ token_states: List[TokenState] = field(default_factory=list)
76
+ per_token_debug: List[PerTokenDebug] = field(default_factory=list)
77
+ generation_telemetry: Optional[GenerationTelemetry] = None
78
+ reflection_count: int = 0
79
+ grounding_score: float = 0.0
80
+ hallucination_risk: float = 0.0
81
+ latency_ms: float = 0.0
82
+ tokens_generated: int = 0
83
+ detector_verdict: Optional[Any] = None
84
+ fallback_history: List[str] = field(default_factory=list)
85
+ debug: Dict[str, Any] = field(default_factory=dict)
86
+
87
+
88
+ class ModelFallbackController:
89
+ """
90
+ Handles generation failures and high hallucination risk through escalation:
91
+ Level 1: Increase retrieval grounding (re-ground prompt)
92
+ Level 2: Backtrack and regenerate (reflection)
93
+ Level 3: Switch to more capable model (larger params or cloud)
94
+ Level 4: Fallback to cloud API (guaranteed generation)
95
+ """
96
+
97
+ MAX_RETRIES = 3
98
+
99
+ def __init__(self, router: "ChimeraRouterV2"):
100
+ self.router = router
101
+ self.retry_count = 0
102
+
103
+ def escalate(
104
+ self,
105
+ prompt: str,
106
+ enhanced: EnhancedPrompt,
107
+ evidence: Dict[str, Any],
108
+ budget: QWAVEBudget,
109
+ previous_model_id: str,
110
+ previous_risk: float,
111
+ ) -> Tuple[str, str, ModelProfile, List[str]]:
112
+ """
113
+ Escalate to next level. Returns (new_model_id, new_prompt, profile, history).
114
+ """
115
+ history = []
116
+
117
+ # Level 1: Increase grounding if retrieval available
118
+ if evidence.get("aggregated") and self.retry_count == 0:
119
+ top_evidence = "\n".join([
120
+ f"[HIGH-PRIORITY EVIDENCE] {e.get('text', '')[:400]}"
121
+ for e in evidence.get("aggregated", [])[:5]
122
+ ])
123
+ new_prompt = f"""CRITICAL: Use ONLY the following verified evidence to answer.
124
+ Do not rely on parametric knowledge if it conflicts with the evidence.
125
+
126
+ {top_evidence}
127
+
128
+ ---
129
+
130
+ {enhanced.enhanced}"""
131
+ history.append("re-grounded with more evidence")
132
+ self.retry_count += 1
133
+ return previous_model_id, new_prompt, get(previous_model_id), history
134
+
135
+ # Level 2: Switch to next larger model in same tier
136
+ current_profile = get(previous_model_id)
137
+ if current_profile and current_profile.tier != Tier.CLOUD_API:
138
+ current_tier_models = by_tier(current_profile.tier)
139
+ larger_models = [m for m in current_tier_models
140
+ if m.params_b > current_profile.params_b
141
+ and m.size_gb <= budget.vram_budget_gb]
142
+ if larger_models and self.retry_count < 2:
143
+ larger = max(larger_models, key=lambda m: m.params_b)
144
+ for k, v in REGISTRY.items():
145
+ if v == larger:
146
+ history.append(f"switched to larger model {k} ({larger.params_b:.1f}B)")
147
+ self.retry_count += 1
148
+ return k, enhanced.enhanced, larger, history
149
+
150
+ # Level 3: Upgrade to next tier
151
+ tier_upgrade = {
152
+ Tier.LOCAL_8GB: Tier.LOCAL_16GB,
153
+ Tier.LOCAL_16GB: Tier.LOCAL_24GB,
154
+ Tier.LOCAL_24GB: Tier.LOCAL_48GB,
155
+ Tier.LOCAL_48GB: Tier.CLOUD_API,
156
+ }
157
+
158
+ if current_profile and current_profile.tier in tier_upgrade:
159
+ next_tier = tier_upgrade[current_profile.tier]
160
+ next_tier_models = by_tier(next_tier)
161
+ available = [m for m in next_tier_models
162
+ if m.tier == Tier.CLOUD_API or m.size_gb <= budget.vram_budget_gb]
163
+
164
+ if available and self.retry_count < self.MAX_RETRIES:
165
+ best = max(available, key=lambda m: m.params_b)
166
+ for k, v in REGISTRY.items():
167
+ if v == best:
168
+ history.append(f"upgraded to {next_tier.value} with {k}")
169
+ self.retry_count += 1
170
+ return k, enhanced.enhanced, best, history
171
+
172
+ # Level 4: Cloud fallback
173
+ if budget.allow_cloud:
174
+ cloud_models = by_tier(Tier.CLOUD_API)
175
+ if cloud_models:
176
+ best_cloud = max(cloud_models, key=lambda m: m.params_b)
177
+ for k, v in REGISTRY.items():
178
+ if v == best_cloud:
179
+ history.append(f"cloud fallback to {k}")
180
+ self.retry_count += 1
181
+ return k, enhanced.enhanced, best_cloud, history
182
+
183
+ # Exhausted all options
184
+ history.append("all fallback options exhausted")
185
+ return previous_model_id, enhanced.enhanced, current_profile, history
186
+
187
+
188
+ class ChimeraRouterV2:
189
+ """
190
+ Production router with telemetry integration and fallback controller.
191
+ """
192
+
193
+ def __init__(
194
+ self,
195
+ sulphur: Optional[Any] = None,
196
+ pinecone: Optional[Any] = None,
197
+ milvus: Optional[Any] = None,
198
+ ernie: Optional[Any] = None,
199
+ twave: Optional[TWAVETracker] = None,
200
+ ollama_host: str = "http://localhost:11434",
201
+ default_budget: Optional[QWAVEBudget] = None,
202
+ use_telemetry: bool = True,
203
+ detector_fusion: FusionMode = FusionMode.WEIGHTED,
204
+ ):
205
+ self.sulphur = sulphur or MockSulphurEnhancer()
206
+ self.pinecone = pinecone or MockPineconeRetriever()
207
+ self.milvus = milvus or MockMilvusRetriever()
208
+ self.ernie = ernie or MockERNIEAdapter()
209
+ self.twave = twave
210
+ self.ollama_host = ollama_host
211
+ self.default_budget = default_budget or QWAVEBudget()
212
+ self.use_telemetry = use_telemetry
213
+ self.detector_fusion = detector_fusion
214
+
215
+ # Subsystems
216
+ self.cloud_manager = CloudAPIManager()
217
+ self._ollama_client: Optional[OllamaStreamingClient] = None
218
+ self._fallback: Optional[ModelFallbackController] = None
219
+
220
+ @property
221
+ def ollama_client(self) -> OllamaStreamingClient:
222
+ if self._ollama_client is None:
223
+ telemetry = None
224
+ if self.use_telemetry:
225
+ telemetry = OllamaTelemetryExtractor(
226
+ ollama_host=self.ollama_host,
227
+ embedding_model="functiongemma:latest",
228
+ telemetry_interval=5,
229
+ )
230
+ self._ollama_client = OllamaStreamingClient(
231
+ ollama_host=self.ollama_host,
232
+ telemetry_extractor=telemetry,
233
+ )
234
+ return self._ollama_client
235
+
236
+ def _enhance(self, prompt: str) -> EnhancedPrompt:
237
+ return self.sulphur.enhance(prompt)
238
+
239
+ def _select_model(self, enhanced: EnhancedPrompt, budget: QWAVEBudget) -> Tuple[str, ModelProfile]:
240
+ required_caps = []
241
+ for tag in enhanced.intent_tags:
242
+ cap_map = {
243
+ "coding": Capability.CODING,
244
+ "reasoning": Capability.REASONING,
245
+ "vision": Capability.VISION,
246
+ "creative": Capability.INSTRUCT,
247
+ "factual": Capability.REASONING,
248
+ "safety": Capability.SAFETY,
249
+ "fast": Capability.FAST,
250
+ "long_context": Capability.LONG_CONTEXT,
251
+ }
252
+ if tag.lower() in cap_map:
253
+ required_caps.append(cap_map[tag.lower()])
254
+
255
+ if budget.require_safety:
256
+ exclude = [Capability.ABLITERATED, Capability.UNCHAINED]
257
+ elif budget.allow_uncensored:
258
+ exclude = []
259
+ else:
260
+ exclude = [Capability.ABLITERATED]
261
+
262
+ optimal_T_ratio = StochasticResonance.recommend_temperature(
263
+ enhanced.complexity_score,
264
+ T_c=1.0,
265
+ )
266
+
267
+ if enhanced.complexity_score > 0.8 and budget.allow_cloud:
268
+ preferred_tiers = [Tier.CLOUD_API, Tier.LOCAL_48GB, Tier.LOCAL_24GB]
269
+ elif enhanced.complexity_score > 0.6:
270
+ preferred_tiers = [Tier.LOCAL_24GB, Tier.LOCAL_16GB, Tier.CLOUD_API]
271
+ elif enhanced.complexity_score > 0.4:
272
+ preferred_tiers = [Tier.LOCAL_16GB, Tier.LOCAL_8GB]
273
+ else:
274
+ preferred_tiers = [Tier.LOCAL_8GB]
275
+
276
+ candidates = []
277
+ for tier in preferred_tiers:
278
+ tier_models = by_tier(tier)
279
+ for m in tier_models:
280
+ if required_caps and not all(c in m.capabilities for c in required_caps):
281
+ continue
282
+ if any(c in m.capabilities for c in exclude):
283
+ continue
284
+ if tier != Tier.CLOUD_API and m.size_gb > budget.vram_budget_gb:
285
+ continue
286
+ candidates.append(m)
287
+
288
+ if not candidates:
289
+ all_models = list(REGISTRY.values())
290
+ candidates = [m for m in all_models if m.tier != Tier.CLOUD_API and m.size_gb <= budget.vram_budget_gb]
291
+ if not candidates and budget.allow_cloud:
292
+ candidates = by_tier(Tier.CLOUD_API)
293
+
294
+ if not candidates:
295
+ raise RuntimeError("No models available for this request.")
296
+
297
+ def score_model(m: ModelProfile) -> float:
298
+ s = 0.0
299
+ for cap in required_caps:
300
+ if cap in m.capabilities:
301
+ s += 10.0
302
+ if enhanced.complexity_score > 0.7:
303
+ s += m.params_b * 2.0
304
+ else:
305
+ s += (10.0 - m.params_b) * 0.5
306
+ if Capability.FAST in m.capabilities and "fast" in enhanced.intent_tags:
307
+ s += 5.0
308
+ s -= m.size_gb * 0.1
309
+ temp_diff = abs(m.default_temp - optimal_T_ratio)
310
+ s -= temp_diff * 2.0
311
+ return s
312
+
313
+ candidates.sort(key=score_model, reverse=True)
314
+ best = candidates[0]
315
+
316
+ for k, v in REGISTRY.items():
317
+ if v == best:
318
+ return k, best
319
+
320
+ raise RuntimeError("Model selected but not found in registry.")
321
+
322
+ def _retrieve(self, query: str) -> Dict[str, Any]:
323
+ results = {"pinecone": [], "milvus": [], "ernie": [], "aggregated": []}
324
+
325
+ try:
326
+ results["pinecone"] = self.pinecone.get_evidence_for_ckplug(query)
327
+ except Exception as e:
328
+ results["pinecone_error"] = str(e)
329
+
330
+ try:
331
+ results["milvus"] = self.milvus.get_evidence("nexus_docs", query)
332
+ except Exception as e:
333
+ results["milvus_error"] = str(e)
334
+
335
+ try:
336
+ if self.ernie.is_available():
337
+ results["ernie"] = self.ernie.get_evidence(query)
338
+ except Exception as e:
339
+ results["ernie_error"] = str(e)
340
+
341
+ all_evidence = []
342
+ for src in [results["pinecone"], results["milvus"], results["ernie"]]:
343
+ for item in src:
344
+ all_evidence.append({
345
+ "text": item.get("text", ""),
346
+ "relevance": item.get("relevance", 0.0),
347
+ "source": item.get("type", item.get("collection", "unknown")),
348
+ })
349
+
350
+ all_evidence.sort(key=lambda x: x["relevance"], reverse=True)
351
+ results["aggregated"] = all_evidence[:10]
352
+ results["top_score"] = all_evidence[0]["relevance"] if all_evidence else 0.0
353
+
354
+ return results
355
+
356
+ def _generate_with_telemetry(
357
+ self,
358
+ model_tag: str,
359
+ prompt: str,
360
+ profile: ModelProfile,
361
+ budget: QWAVEBudget,
362
+ ) -> Tuple[str, List[PerTokenDebug], GenerationTelemetry, Any]:
363
+ twave = TWAVETracker(
364
+ T_c=profile.T_c,
365
+ mu_0=profile.mu_base,
366
+ kappa=profile.kappa,
367
+ )
368
+
369
+ temperature = profile.default_temp
370
+
371
+ client = self.ollama_client
372
+
373
+ response, tokens, trajectory = client.generate(
374
+ model_tag=model_tag,
375
+ prompt=prompt,
376
+ system="You are a helpful assistant. Use retrieved evidence when answering.",
377
+ temperature=temperature,
378
+ max_tokens=budget.max_tokens,
379
+ )
380
+
381
+ debugs = client.telemetry.to_per_token_debug(
382
+ trajectory=trajectory,
383
+ twave=twave,
384
+ model_id=profile.name,
385
+ tier=profile.tier.value,
386
+ )
387
+
388
+ gen_telemetry = GenerationTelemetry(
389
+ request_id=f"req_{int(time.time())}",
390
+ prompt=prompt,
391
+ tokens=debugs,
392
+ total_tokens=len(debugs),
393
+ selected_model=profile.name,
394
+ model_family=profile.family,
395
+ model_params_b=profile.params_b,
396
+ model_quantization=profile.quantization,
397
+ )
398
+ gen_telemetry.compute_aggregates()
399
+
400
+ # Run unified detector for post-hoc analysis
401
+ detector = UnifiedThermodynamicDetector(
402
+ fusion_mode=self.detector_fusion,
403
+ enable_epr=True,
404
+ enable_spilled=True,
405
+ enable_ckplug=False, # Post-hoc, no RAG context per token
406
+ enable_twave=True,
407
+ )
408
+
409
+ token_verdicts = []
410
+ for i, debug in enumerate(debugs):
411
+ verdict = detector.evaluate_token(
412
+ position=i,
413
+ token_str=debug.token_str,
414
+ topk_probs=None, # Not available post-hoc
415
+ token_id=debug.token_id or 0,
416
+ full_logits=None,
417
+ sampled_token_id=debug.token_id or 0,
418
+ probs_distribution=None,
419
+ log_prob_policy=0.0,
420
+ log_prob_ref=0.0,
421
+ visual_attention=debug.attention_mass_to_image or 1.0,
422
+ prev_psi=debugs[i-1].twave_psi if i > 0 else 0.0,
423
+ )
424
+ token_verdicts.append(verdict)
425
+
426
+ sequence_verdict = detector.evaluate_sequence(token_verdicts)
427
+
428
+ return response, debugs, gen_telemetry, sequence_verdict
429
+
430
+ def _generate_non_streaming(
431
+ self,
432
+ model_tag: str,
433
+ prompt: str,
434
+ profile: ModelProfile,
435
+ budget: QWAVEBudget,
436
+ ) -> str:
437
+ client = OllamaStreamingClient(ollama_host=self.ollama_host)
438
+ return client.generate_non_streaming(
439
+ model_tag=model_tag,
440
+ prompt=prompt,
441
+ system="You are a helpful assistant. Use retrieved evidence when answering.",
442
+ temperature=profile.default_temp,
443
+ max_tokens=budget.max_tokens,
444
+ )
445
+
446
+ def _generate_cloud(
447
+ self,
448
+ model_id: str,
449
+ prompt: str,
450
+ budget: QWAVEBudget,
451
+ profile: ModelProfile,
452
+ ) -> str:
453
+ """Generate via cloud API using the appropriate adapter."""
454
+ if not self.cloud_manager.is_available(profile.family):
455
+ return f"[CLOUD: {profile.name}] {prompt[:200]}... (no API key configured for {profile.family})"
456
+
457
+ try:
458
+ response = self.cloud_manager.generate(
459
+ model_family=profile.family,
460
+ prompt=prompt,
461
+ max_tokens=budget.max_tokens,
462
+ temperature=profile.default_temp,
463
+ system="You are a helpful assistant. Use retrieved evidence when answering.",
464
+ )
465
+ return response.text
466
+ except RuntimeError as e:
467
+ return f"[CLOUD ERROR: {profile.name}] {str(e)}"
468
+
469
+ def route(
470
+ self,
471
+ prompt: str,
472
+ budget: Optional[QWAVEBudget] = None,
473
+ custom_model: Optional[str] = None,
474
+ use_telemetry: Optional[bool] = None,
475
+ max_retries: int = 3,
476
+ ) -> RouterResult:
477
+ """
478
+ Main routing entry point with full telemetry and fallback.
479
+
480
+ Pipeline:
481
+ 1. Enhance prompt
482
+ 2. Select model
483
+ 3. Retrieve evidence
484
+ 4. Generate (with telemetry if available)
485
+ 5. Evaluate risk
486
+ 6. Fallback if needed
487
+ 7. Assemble result
488
+ """
489
+ budget = budget or self.default_budget
490
+ use_telemetry = use_telemetry if use_telemetry is not None else self.use_telemetry
491
+
492
+ # Step 1: Enhance
493
+ enhanced = self._enhance(prompt)
494
+
495
+ # Step 2-3: Select model
496
+ if custom_model:
497
+ model_id = custom_model
498
+ profile = get(model_id)
499
+ if not profile:
500
+ raise ValueError(f"Unknown model: {custom_model}")
501
+ else:
502
+ model_id, profile = self._select_model(enhanced, budget)
503
+
504
+ # Step 4: Retrieve
505
+ evidence = self._retrieve(enhanced.enhanced)
506
+ top_evidence = "\n".join([
507
+ f"[{e.get('source', 'unknown')}] {e.get('text', '')[:300]}"
508
+ for e in evidence.get("aggregated", [])[:3]
509
+ ])
510
+
511
+ final_prompt = f"""Retrieved evidence:
512
+ {top_evidence}
513
+
514
+ ---
515
+
516
+ {enhanced.enhanced}"""
517
+
518
+ # Step 5: Generate with fallback loop
519
+ response = ""
520
+ debugs = []
521
+ gen_telemetry = None
522
+ sequence_verdict = None
523
+ tokens_est = 0
524
+ fallback_history = []
525
+
526
+ self._fallback = ModelFallbackController(self)
527
+
528
+ for attempt in range(max_retries + 1):
529
+ t0 = time.time()
530
+
531
+ try:
532
+ if profile.tier == Tier.CLOUD_API:
533
+ response = self._generate_cloud(model_id, final_prompt, budget, profile)
534
+ tokens_est = len(response.split())
535
+ elif use_telemetry and attempt == 0:
536
+ # Try telemetry generation on first attempt
537
+ response, debugs, gen_telemetry, sequence_verdict = self._generate_with_telemetry(
538
+ profile.ollama_tag or model_id,
539
+ final_prompt,
540
+ profile,
541
+ budget,
542
+ )
543
+ tokens_est = len(debugs)
544
+ else:
545
+ # Fallback: non-streaming
546
+ response = self._generate_non_streaming(
547
+ profile.ollama_tag or model_id,
548
+ final_prompt,
549
+ profile,
550
+ budget,
551
+ )
552
+ tokens_est = len(response.split())
553
+
554
+ latency_ms = (time.time() - t0) * 1000
555
+
556
+ # Evaluate risk
557
+ risk = 0.0
558
+ if sequence_verdict:
559
+ risk = sequence_verdict.avg_fused_score
560
+ elif gen_telemetry:
561
+ risk = gen_telemetry.hallucination_risk_score
562
+ else:
563
+ risk = estimate_entropy_from_response(response)
564
+
565
+ # Check if we need to escalate
566
+ if risk > 0.6 and attempt < max_retries:
567
+ new_model, new_prompt, new_profile, history = self._fallback.escalate(
568
+ final_prompt, enhanced, evidence, budget, model_id, risk
569
+ )
570
+ if new_model != model_id or new_prompt != final_prompt:
571
+ model_id = new_model
572
+ profile = new_profile
573
+ final_prompt = new_prompt
574
+ fallback_history.extend(history)
575
+ continue
576
+
577
+ # Success or exhausted retries
578
+ break
579
+
580
+ except RuntimeError as e:
581
+ latency_ms = (time.time() - t0) * 1000
582
+ if attempt < max_retries:
583
+ # Try fallback
584
+ new_model, new_prompt, new_profile, history = self._fallback.escalate(
585
+ final_prompt, enhanced, evidence, budget, model_id, 0.0
586
+ )
587
+ model_id = new_model
588
+ profile = new_profile
589
+ final_prompt = new_prompt
590
+ fallback_history.extend([f"error: {str(e)}"] + history)
591
+ else:
592
+ response = f"[ERROR] Generation failed after {max_retries} attempts: {str(e)}"
593
+ break
594
+
595
+ # Step 7: Assemble result
596
+ return RouterResult(
597
+ selected_model=model_id,
598
+ model_profile=profile,
599
+ tier=profile.tier.value,
600
+ enhanced_prompt=enhanced.enhanced,
601
+ response=response,
602
+ per_token_debug=debugs,
603
+ generation_telemetry=gen_telemetry,
604
+ reflection_count=gen_telemetry.reflection_count if gen_telemetry else 0,
605
+ grounding_score=evidence.get("top_score", 0.0),
606
+ hallucination_risk=sequence_verdict.avg_fused_score if sequence_verdict else estimate_entropy_from_response(response),
607
+ latency_ms=latency_ms,
608
+ tokens_generated=tokens_est,
609
+ detector_verdict=sequence_verdict,
610
+ fallback_history=fallback_history,
611
+ debug={
612
+ "enhancement": enhanced,
613
+ "evidence_summary": evidence,
614
+ "budget": budget,
615
+ "ckplug_epsilon": get_preset_epsilon(profile.family),
616
+ "optimal_temp_ratio": StochasticResonance.recommend_temperature(enhanced.complexity_score),
617
+ "fallback_attempts": len(fallback_history),
618
+ },
619
+ )
620
+
621
+ def quick_route(self, prompt: str, budget: Optional[QWAVEBudget] = None) -> str:
622
+ return self.route(prompt, budget=budget, use_telemetry=False).response
623
+
624
+ def get_available_models(self, budget: Optional[QWAVEBudget] = None) -> List[Dict[str, Any]]:
625
+ budget = budget or self.default_budget
626
+ available = []
627
+ for name, profile in REGISTRY.items():
628
+ fits = True
629
+ if profile.tier != Tier.CLOUD_API and profile.size_gb > budget.vram_budget_gb:
630
+ fits = False
631
+ if profile.tier == Tier.CLOUD_API and not budget.allow_cloud:
632
+ fits = False
633
+ available.append({
634
+ "id": name,
635
+ "name": profile.name,
636
+ "tier": profile.tier.value,
637
+ "size_gb": profile.size_gb,
638
+ "params_b": profile.params_b,
639
+ "capabilities": [c.value for c in profile.capabilities],
640
+ "fits_budget": fits,
641
+ "T_c": profile.T_c,
642
+ "mu_base": profile.mu_base,
643
+ })
644
+ return available