File size: 3,044 Bytes
216bfa1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
Agent Q3 [HQ] — ComputeRouter
Weighted random selection across Local Ollama, HuggingFace, OpenRouter and RunPod (this is what the default "round_robin" strategy does)
Strategies: round_robin | local_first | hf_first | runpod_first | load_based
"""
import os, httpx, asyncio, random
from typing import Literal

# Backend-selection strategy, read from the environment once at import time.
STRATEGY = os.environ.get("COMPUTE_STRATEGY", "round_robin")

# Registry of chat-completion backends. Each entry carries the endpoint URL
# and the relative weight used for weighted selection. HF_ENDPOINT and
# RUNPOD_ENDPOINT default to an empty string when the variable is unset.
BACKENDS = {
    "local": dict(
        url="http://localhost:11434/v1/chat/completions",
        weight=0.40,
    ),
    "hf": dict(
        url=os.environ.get("HF_ENDPOINT", ""),
        weight=0.25,
    ),
    "openrouter": dict(
        url="https://openrouter.ai/api/v1/chat/completions",
        weight=0.25,
    ),
    "runpod": dict(
        url=os.environ.get("RUNPOD_ENDPOINT", ""),
        weight=0.10,
    ),
}

# Logical target name -> model identifier, each overridable via environment.
MODEL_MAP = dict(
    reasoner=os.environ.get("REASONER_MODEL", "gemma4:e4b-instruct-q4_K_M"),
    coder=os.environ.get("CODER_MODEL", "qwen3.5:4b-instruct-q4_K_M"),
)

class ComputeRouter:
    """Route chat-completion requests to one of the backends in ``BACKENDS``.

    A backend is chosen per request according to ``self.strategy``
    (initialised from the module-level ``STRATEGY``):

    * ``local_first`` / ``hf_first`` / ``runpod_first`` -- always that backend.
    * ``load_based`` -- the configured backend with the fewest requests
      currently in flight through this router instance.
    * anything else (including ``round_robin``) -- weighted random choice
      among configured backends, using each backend's ``weight``.

    A backend counts as "configured" when its URL is non-empty. If the chosen
    backend fails for any reason, the request is attempted once more against
    the local backend.
    """

    def __init__(self):
        self.strategy = STRATEGY
        # Requests currently in flight per backend, as seen by this instance.
        self._queue_depths = {k: 0 for k in BACKENDS}

    @staticmethod
    def _configured_backends() -> list:
        """Return backend names with a non-empty URL, in ``BACKENDS`` order.

        Falls back to every backend name if none has a URL, so callers always
        get a non-empty list to choose from.
        """
        configured = [name for name, cfg in BACKENDS.items() if cfg["url"]]
        return configured or list(BACKENDS)

    def _pick_backend(self) -> str:
        """Return the name of the backend the next request should go to."""
        if self.strategy == "local_first":
            return "local"
        if self.strategy == "hf_first":
            return "hf"
        if self.strategy == "runpod_first":
            return "runpod"

        # Backends without a URL can only fail, so the automatic strategies
        # never select them. (An idle unconfigured backend would otherwise
        # always win ``load_based`` with a depth of 0.)
        candidates = self._configured_backends()
        if self.strategy == "load_based":
            return min(candidates, key=lambda name: self._queue_depths.get(name, 0))

        # Default ("round_robin"): weighted random choice.
        weights = [BACKENDS[name]["weight"] for name in candidates]
        return random.choices(candidates, weights=weights, k=1)[0]

    async def route(self, messages: list, target: str = "reasoner") -> dict:
        """Send ``messages`` to the selected backend and return its JSON reply.

        Args:
            messages: Chat messages, forwarded unchanged as the ``messages``
                field of the request payload.
            target: Key into ``MODEL_MAP`` selecting the model. Unknown keys
                fall back to the ``"reasoner"`` model.

        Returns:
            The decoded JSON body of the backend response. On fallback this is
            the local backend's body, returned without a status check.

        Raises:
            Exception: Whatever the fallback request to the local backend
                raises, if it fails as well.
        """
        backend = self._pick_backend()
        url = BACKENDS[backend]["url"]
        # Resolved once so the fallback request uses the same default as the
        # primary one instead of raising KeyError on an unknown target.
        local_model = MODEL_MAP.get(target, MODEL_MAP["reasoner"])
        model = local_model
        headers = {"Content-Type": "application/json"}

        if backend == "openrouter":
            headers["Authorization"] = f"Bearer {os.getenv('OPENROUTER_API_KEY', '')}"
            # OpenRouter uses its own model identifiers.
            model = "google/gemma-3-4b-it" if target == "reasoner" else "qwen/qwen-2.5-coder-7b-instruct"
        elif backend == "hf":
            headers["Authorization"] = f"Bearer {os.getenv('HF_TOKEN', '')}"
        elif backend == "runpod":
            headers["Authorization"] = f"Bearer {os.getenv('RUNPOD_API_KEY', '')}"

        payload = {"model": model, "messages": messages}
        self._queue_depths[backend] += 1
        try:
            async with httpx.AsyncClient(timeout=60) as client:
                r = await client.post(url, json=payload, headers=headers)
                r.raise_for_status()
                return r.json()
        except Exception as exc:
            # Deliberate best-effort: any failure falls back to local, but the
            # cause is recorded instead of being silently dropped.
            import logging

            logging.getLogger(__name__).warning(
                "backend %r failed (%s: %s); falling back to local",
                backend, type(exc).__name__, exc,
            )
            async with httpx.AsyncClient(timeout=60) as client:
                r = await client.post(
                    BACKENDS["local"]["url"],
                    json={"model": local_model, "messages": messages},
                )
                return r.json()
        finally:
            # Runs on success, fallback and error alike, so the in-flight
            # counter cannot leak.
            self._queue_depths[backend] -= 1

    def health(self) -> dict:
        """Return each backend's weight and current in-flight request count."""
        return {k: {"weight": v["weight"], "queue": self._queue_depths[k]} for k, v in BACKENDS.items()}