consolidate: HQ ComputeRouter multi-backend weighted routing
Browse files- hq/compute_router.py +72 -0
hq/compute_router.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent Q3 [HQ] — ComputeRouter
|
| 3 |
+
Weighted random selection (strategy name: round_robin) across Local Ollama, HuggingFace, OpenRouter and RunPod; failed requests fall back to Local Ollama
|
| 4 |
+
Strategies: round_robin | local_first | hf_first | runpod_first | load_based
|
| 5 |
+
"""
|
| 6 |
+
import os, httpx, asyncio, random
|
| 7 |
+
from typing import Literal
|
| 8 |
+
|
| 9 |
+
# Backend-selection strategy, read once at import time. The value is trimmed
# and lower-cased so that e.g. " Local_First\n" still matches the strategy
# names ComputeRouter compares against; a blank value means the default.
STRATEGY = os.getenv("COMPUTE_STRATEGY", "").strip().lower() or "round_robin"

# Chat-completion endpoint and relative selection weight for each backend.
# The "hf" and "runpod" URLs come from the environment and are the empty
# string when the variable is unset or blank. Surrounding whitespace is
# stripped because a trailing newline would make the URL invalid.
BACKENDS = {
    "local": {
        "url": "http://localhost:11434/v1/chat/completions",
        "weight": 0.40,
    },
    "hf": {
        "url": os.getenv("HF_ENDPOINT", "").strip(),
        "weight": 0.25,
    },
    "openrouter": {
        "url": "https://openrouter.ai/api/v1/chat/completions",
        "weight": 0.25,
    },
    "runpod": {
        "url": os.getenv("RUNPOD_ENDPOINT", "").strip(),
        "weight": 0.10,
    },
}

# Logical target name -> model identifier sent in the request payload.
# A blank override falls back to the built-in default instead of producing
# a request with an empty model name.
MODEL_MAP = {
    "reasoner": os.getenv("REASONER_MODEL", "").strip()
    or "gemma4:e4b-instruct-q4_K_M",
    "coder": os.getenv("CODER_MODEL", "").strip()
    or "qwen3.5:4b-instruct-q4_K_M",
}
|
| 22 |
+
|
| 23 |
+
class ComputeRouter:
    """Send chat-completion requests to one of the backends in ``BACKENDS``.

    The backend is chosen per request according to ``self.strategy``:

    * ``local_first`` / ``hf_first`` / ``runpod_first`` -- always that backend
      (or ``local`` when its URL is empty);
    * ``load_based`` -- the backend with the fewest in-flight requests;
    * anything else (including ``round_robin``) -- weighted *random* choice
      using each backend's ``weight``.

    If the chosen backend fails, the request is retried once against the
    ``local`` backend.
    """

    # Strategies that pin every request to a single backend.
    _PINNED = {"local_first": "local", "hf_first": "hf", "runpod_first": "runpod"}

    # Environment variable holding the bearer token for each remote backend.
    _TOKEN_ENV = {
        "openrouter": "OPENROUTER_API_KEY",
        "hf": "HF_TOKEN",
        "runpod": "RUNPOD_API_KEY",
    }

    def __init__(self):
        self.strategy = STRATEGY
        # Number of requests currently in flight per backend.
        self._queue_depths = dict.fromkeys(BACKENDS, 0)

    def _pick_backend(self) -> str:
        """Return the name of the backend the next request should go to.

        Backends whose URL is empty are never selected, because a request to
        them can only fail and end up on ``local`` anyway.
        """
        pinned = self._PINNED.get(self.strategy)
        if pinned is not None:
            return pinned if BACKENDS[pinned]["url"] else "local"

        usable = [name for name, cfg in BACKENDS.items() if cfg["url"]] or ["local"]

        if self.strategy == "load_based":
            # min() keeps the first of several equally loaded backends, so
            # ties resolve in BACKENDS declaration order.
            return min(usable, key=self._queue_depths.__getitem__)

        # "round_robin" and any unrecognised strategy: weighted random pick.
        weights = [BACKENDS[name]["weight"] for name in usable]
        return random.choices(usable, weights=weights, k=1)[0]

    def _resolve_model(self, backend: str, target: str) -> str:
        """Return the model identifier to send to *backend* for *target*.

        Unknown targets use the ``reasoner`` model. OpenRouter gets its own
        identifiers instead of the ones in ``MODEL_MAP``.
        """
        if backend == "openrouter":
            if target == "reasoner":
                return "google/gemma-3-4b-it"
            return "qwen/qwen-2.5-coder-7b-instruct"
        return MODEL_MAP.get(target, MODEL_MAP["reasoner"])

    def _headers(self, backend: str) -> dict:
        """Return the HTTP headers for *backend*, with a bearer token if it uses one."""
        headers = {"Content-Type": "application/json"}
        token_var = self._TOKEN_ENV.get(backend)
        if token_var is not None:
            headers["Authorization"] = f"Bearer {os.getenv(token_var, '')}"
        return headers

    async def _post(self, backend: str, messages: list, target: str) -> dict:
        """POST one chat-completion request to *backend* and return the JSON body.

        The backend's in-flight counter is raised for the duration of the
        call. Raises whatever httpx raises for transport errors or non-2xx
        responses, and whatever ``Response.json`` raises for a non-JSON body.
        """
        payload = {"model": self._resolve_model(backend, target), "messages": messages}
        self._queue_depths[backend] += 1
        try:
            async with httpx.AsyncClient(timeout=60) as client:
                response = await client.post(
                    BACKENDS[backend]["url"],
                    json=payload,
                    headers=self._headers(backend),
                )
                response.raise_for_status()
                return response.json()
        finally:
            self._queue_depths[backend] -= 1

    async def route(self, messages: list, target: str = "reasoner") -> dict:
        """Send *messages* to a backend and return the decoded JSON response.

        Args:
            messages: Chat messages placed verbatim in the request payload.
            target: Key into ``MODEL_MAP``; unknown keys use the reasoner model.

        Returns:
            The JSON body of the successful response.

        Raises:
            Exception: whatever the fallback request to ``local`` raises when
                both the chosen backend and the fallback fail.
        """
        backend = self._pick_backend()
        try:
            return await self._post(backend, messages, target)
        except Exception:
            # Deliberate best-effort boundary: any failure of the chosen
            # backend is recorded, then the request is retried on local.
            import logging

            logging.getLogger(__name__).warning(
                "backend %r failed; falling back to local", backend, exc_info=True
            )
            return await self._post("local", messages, target)

    def health(self) -> dict:
        """Return each backend's weight and current in-flight request count."""
        return {
            name: {"weight": cfg["weight"], "queue": self._queue_depths[name]}
            for name, cfg in BACKENDS.items()
        }
|