madDegen committed on
Commit
216bfa1
·
verified ·
1 Parent(s): 6ff51b1

consolidate: HQ ComputeRouter multi-backend weighted routing

Browse files
Files changed (1) hide show
  1. hq/compute_router.py +72 -0
hq/compute_router.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent Q3 [HQ] — ComputeRouter
3
+ Weighted random selection (the "round_robin" strategy) across Local Ollama → HuggingFace → OpenRouter → RunPod
4
+ Strategies: round_robin | local_first | hf_first | runpod_first | load_based
5
+ """
6
+ import os, httpx, asyncio, random
7
+ from typing import Literal
8
+
9
# Name of the backend-selection strategy, read once at import time.
# ComputeRouter._pick_backend recognises "local_first", "hf_first",
# "runpod_first" and "load_based"; any other value (including the default
# "round_robin") falls through to weighted random selection.
STRATEGY = os.getenv("COMPUTE_STRATEGY", "round_robin")

# Chat-completion endpoints keyed by backend name. "weight" is the relative
# probability used by the weighted random selection. The "hf" and "runpod"
# URLs come from the environment and are empty strings when unset.
BACKENDS = {
    "local": {
        "url": "http://localhost:11434/v1/chat/completions",
        "weight": 0.40,
    },
    "hf": {
        "url": os.getenv("HF_ENDPOINT", ""),
        "weight": 0.25,
    },
    "openrouter": {
        "url": "https://openrouter.ai/api/v1/chat/completions",
        "weight": 0.25,
    },
    "runpod": {
        "url": os.getenv("RUNPOD_ENDPOINT", ""),
        "weight": 0.10,
    },
}

# Model identifier sent in the request payload for each routing target;
# each entry can be overridden through its environment variable.
MODEL_MAP = {
    "reasoner": os.getenv("REASONER_MODEL", "gemma4:e4b-instruct-q4_K_M"),
    "coder": os.getenv("CODER_MODEL", "qwen3.5:4b-instruct-q4_K_M"),
}
22
+
23
class ComputeRouter:
    """Send chat-completion requests to one of the backends in ``BACKENDS``.

    The backend is chosen per request according to ``self.strategy``:

    * ``local_first`` / ``hf_first`` / ``runpod_first`` pin one backend.
    * ``load_based`` picks the backend with the fewest in-flight requests.
    * anything else (including ``round_robin``) draws a backend at random,
      weighted by each backend's ``"weight"``.

    Backends whose URL is empty (endpoint environment variable unset) are
    never selected; requests that would have gone to them go to ``local``.
    If a non-local backend fails, the request is retried once on ``local``.
    """

    def __init__(self):
        self.strategy = STRATEGY
        # In-flight request count per backend, used by "load_based" and health().
        self._queue_depths = {name: 0 for name in BACKENDS}

    def _configured_backends(self) -> list:
        """Return the backend names whose URL is non-empty, in BACKENDS order."""
        return [name for name, cfg in BACKENDS.items() if cfg["url"]]

    def _pick_backend(self) -> str:
        """Return the name of the backend the next request should use."""
        configured = self._configured_backends()
        if not configured:
            return "local"

        pinned = {
            "local_first": "local",
            "hf_first": "hf",
            "runpod_first": "runpod",
        }.get(self.strategy)
        if pinned is not None:
            # An unconfigured pinned backend could only fail and fall back to
            # local, so go to local directly.
            return pinned if pinned in configured else "local"

        if self.strategy == "load_based":
            # min() keeps the first of equally loaded backends (BACKENDS order).
            return min(configured, key=lambda name: self._queue_depths.get(name, 0))

        # "round_robin" (and unrecognised strategies): weighted random draw.
        weights = [BACKENDS[name]["weight"] for name in configured]
        return random.choices(configured, weights=weights, k=1)[0]

    @staticmethod
    async def _post(url: str, payload: dict, headers=None) -> dict:
        """POST ``payload`` as JSON and return the decoded JSON response.

        Raises whatever httpx raises for transport failures, plus the
        ``raise_for_status`` error for 4xx/5xx responses.
        """
        async with httpx.AsyncClient(timeout=60) as client:
            response = await client.post(url, json=payload, headers=headers)
            response.raise_for_status()
            return response.json()

    async def route(self, messages: list, target: str = "reasoner") -> dict:
        """Send ``messages`` to the selected backend and return its JSON reply.

        Args:
            messages: Chat messages placed under ``"messages"`` in the payload.
            target: Key into ``MODEL_MAP``; unknown targets use the
                ``"reasoner"`` model.

        Returns:
            The decoded JSON body of the backend's (or the local fallback's)
            response.

        Raises:
            Exception: Whatever the HTTP call raised when the ``local``
                backend itself fails, either as the primary choice or as
                the fallback.
        """
        backend = self._pick_backend()
        url = BACKENDS[backend]["url"]
        # Resolved once so the fallback uses the same tolerant lookup as the
        # primary path instead of raising KeyError on an unknown target.
        local_model = MODEL_MAP.get(target, MODEL_MAP["reasoner"])
        model = local_model
        headers = {"Content-Type": "application/json"}

        if backend == "openrouter":
            headers["Authorization"] = f"Bearer {os.getenv('OPENROUTER_API_KEY', '')}"
            # OpenRouter uses its own model identifiers.
            model = "google/gemma-3-4b-it" if target == "reasoner" else "qwen/qwen-2.5-coder-7b-instruct"
        elif backend == "hf":
            headers["Authorization"] = f"Bearer {os.getenv('HF_TOKEN', '')}"
        elif backend == "runpod":
            headers["Authorization"] = f"Bearer {os.getenv('RUNPOD_API_KEY', '')}"

        self._queue_depths[backend] += 1
        try:
            try:
                return await self._post(url, {"model": model, "messages": messages}, headers)
            except Exception:
                if backend == "local":
                    # Local already failed; repeating the same request is
                    # not a fallback, so surface the error.
                    raise
                # Best-effort fallback to the local backend.
                return await self._post(
                    BACKENDS["local"]["url"],
                    {"model": local_model, "messages": messages},
                )
        finally:
            self._queue_depths[backend] -= 1

    def health(self) -> dict:
        """Return each backend's weight and current in-flight request count."""
        return {
            name: {"weight": cfg["weight"], "queue": self._queue_depths.get(name, 0)}
            for name, cfg in BACKENDS.items()
        }