JiaqiXue
/

R2-Router-RouterArena

@@ -79,14 +79,36 @@ for q, r in zip(queries, results):
     print(f"{q[:40]:40s} -> {r['model']} (budget={r['token_limit']})")
 ```
-### CPU-Only (No GPU)
-If you don't have a GPU, provide pre-computed embeddings directly:
 ```python
-import numpy as np
 from router import R2Router
 router = R2Router.from_pretrained(path)
 # Your own 1024-dim embedding (e.g., from an API or pre-computed)
@@ -175,13 +197,14 @@ checkpoints/
   token_knn_*.joblib    # Pre-fitted KNN token predictors (6 total)
 ```
-### Three Ways to Use
 | Method | GPU? | Description |
 |--------|------|-------------|
-| `router.route_text(query)` | Yes | End-to-end: auto-embeds with vLLM, then routes |
-| `router.route(embedding)` | No | Route from pre-computed 1024-dim embedding |
-| `R2Router.from_training_data(path)` | No | Train your own KNN with custom hyperparameters |
 ## Training Details

     print(f"{q[:40]:40s} -> {r['model']} (budget={r['token_limit']})")
 ```
+### With vLLM Server (Recommended for Production)
+Start the embedding server once, then route from any process without reloading the model:
+```bash
+# Terminal 1: Start vLLM embedding server (runs once, stays alive)
+uv pip install vllm
+vllm serve Qwen/Qwen3-0.6B --task embed --port 8000
+```
 ```python
+# Terminal 2: Route queries (connects to the running server)
+from huggingface_hub import snapshot_download
+import sys
+path = snapshot_download("JiaqiXue/r2-router")
+sys.path.insert(0, path)
 from router import R2Router
+router = R2Router.from_pretrained(path, embed_url="http://localhost:8000")
+result = router.route_text("What is the capital of France?")
+print(f"Model: {result['model_full_name']}, Budget: {result['token_limit']}")
+```
+### CPU-Only (No GPU)
+If you don't have a GPU, provide pre-computed embeddings directly:
+```python
 router = R2Router.from_pretrained(path)
 # Your own 1024-dim embedding (e.g., from an API or pre-computed)
   token_knn_*.joblib    # Pre-fitted KNN token predictors (6 total)
 ```
+### Ways to Use
 | Method | GPU? | Description |
 |--------|------|-------------|
+| `route_text()` + vLLM server | Yes (server) | Start `vllm serve` once, route from anywhere via HTTP |
+| `route_text()` + local vLLM | Yes (local) | Auto-loads Qwen3-0.6B on the first call and caches it for later calls |
+| `route(embedding)` | No | Route from pre-computed 1024-dim embedding |
+| `from_training_data(path)` | No | Train your own KNN with custom hyperparameters |
 ## Training Details

router.py CHANGED Viewed

@@ -7,16 +7,17 @@ pair by predicting per-query quality and cost using KNN.
 Usage:
     from router import R2Router
     router = R2Router.from_pretrained(path)
-    # Option 1: Route from text (auto-embeds with vLLM)
     result = router.route_text("What is the capital of France?")
-    # Option 2: Route from pre-computed embedding
     result = router.route(embedding)  # np.ndarray (1024,)
-    # Option 3: Train from scratch
-    router = R2Router.from_training_data(path, k=80)
 """
 import os
@@ -44,6 +45,7 @@ class R2Router:
         model_names: Dict[str, str],
         budgets: Dict[str, int],
         lambda_val: float = 0.999,
     ):
         self.quality_knns = quality_knns  # {model: {budget: KNN}}
         self.token_knns = token_knns      # {model: KNN}
@@ -51,16 +53,24 @@ class R2Router:
         self.model_names = model_names    # {short_name: full_name}
         self.budgets = budgets            # {budget_name: token_limit}
         self.lambda_val = lambda_val
         self._embedder = None
     @classmethod
-    def from_pretrained(cls, path: str, lambda_val: float = 0.999) -> "R2Router":
         """
         Load pre-trained KNN checkpoints.
         Args:
             path: Local directory or HuggingFace repo ID (e.g., "JiaqiXue/r2-router")
             lambda_val: Cost-accuracy tradeoff (higher = more cost-sensitive)
         """
         if not os.path.isdir(path):
             path = cls._download_from_hf(path)
@@ -99,6 +109,7 @@ class R2Router:
             model_names=model_names,
             budgets=config["budgets"],
             lambda_val=lambda_val,
         )
     @classmethod
@@ -188,7 +199,10 @@ class R2Router:
     def embed(self, queries: Union[str, List[str]]) -> np.ndarray:
         """
-        Embed queries using Qwen3-0.6B via vLLM (loaded on first call).
         Args:
             queries: Single query string or list of queries
@@ -196,13 +210,43 @@ class R2Router:
         Returns:
             numpy array of shape (N, 1024)
         """
         if self._embedder is None:
             try:
                 from vllm import LLM
             except ImportError:
                 raise ImportError(
-                    "vLLM is required for text embedding. "
-                    "Install with: uv pip install vllm"
                 )
             self._embedder = LLM(
                 model="Qwen/Qwen3-0.6B",
@@ -211,9 +255,6 @@ class R2Router:
                 dtype="half",
             )
-        if isinstance(queries, str):
-            queries = [queries]
         outputs = self._embedder.embed(queries)
         return np.array([o.outputs.embedding for o in outputs])

 Usage:
     from router import R2Router
+    # Option A: Local vLLM (loads Qwen3-0.6B on first call)
     router = R2Router.from_pretrained(path)
+    result = router.route_text("What is the capital of France?")
+    # Option B: Remote vLLM server (no local GPU needed for embedding)
+    #   Start server: vllm serve Qwen/Qwen3-0.6B --task embed
+    router = R2Router.from_pretrained(path, embed_url="http://localhost:8000")
     result = router.route_text("What is the capital of France?")
+    # Option C: Pre-computed embedding
     result = router.route(embedding)  # np.ndarray (1024,)
 """
 import os
         model_names: Dict[str, str],
         budgets: Dict[str, int],
         lambda_val: float = 0.999,
+        embed_url: Optional[str] = None,
     ):
         self.quality_knns = quality_knns  # {model: {budget: KNN}}
         self.token_knns = token_knns      # {model: KNN}
         self.model_names = model_names    # {short_name: full_name}
         self.budgets = budgets            # {budget_name: token_limit}
         self.lambda_val = lambda_val
+        self.embed_url = embed_url        # vLLM server URL, e.g. "http://localhost:8000"
         self._embedder = None
     @classmethod
+    def from_pretrained(
+        cls,
+        path: str,
+        lambda_val: float = 0.999,
+        embed_url: Optional[str] = None,
+    ) -> "R2Router":
         """
         Load pre-trained KNN checkpoints.
         Args:
             path: Local directory or HuggingFace repo ID (e.g., "JiaqiXue/r2-router")
             lambda_val: Cost-accuracy tradeoff (higher = more cost-sensitive)
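+            embed_url: Base URL of a running vLLM server used for embedding
+                       (e.g., "http://localhost:8000"); "/v1/embeddings" is appended
+                       automatically, so do not include it. If None, Qwen3-0.6B is
+                       loaded locally via vLLM the first time text is embedded
+                       (e.g., on the first route_text() call).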
         """
         if not os.path.isdir(path):
             path = cls._download_from_hf(path)
             model_names=model_names,
             budgets=config["budgets"],
             lambda_val=lambda_val,
+            embed_url=embed_url,
         )
     @classmethod
     def embed(self, queries: Union[str, List[str]]) -> np.ndarray:
         """
+        Embed queries using Qwen3-0.6B.
+        If embed_url is set, uses a remote vLLM server (OpenAI-compatible API).
+        Otherwise, loads Qwen3-0.6B locally via vLLM (on first call).
         Args:
             queries: Single query string or list of queries
         Returns:
             numpy array of shape (N, 1024)
         """
+        if isinstance(queries, str):
+            queries = [queries]
+        if self.embed_url:
+            return self._embed_remote(queries)
+        return self._embed_local(queries)
+    def _embed_remote(self, queries: List[str]) -> np.ndarray:
+        """Embed via a running vLLM server (OpenAI-compatible embeddings API)."""
+        import urllib.request
+        url = self.embed_url.rstrip("/") + "/v1/embeddings"
+        payload = json.dumps({
+            "model": "Qwen/Qwen3-0.6B",
+            "input": queries,
+        }).encode()
+        req = urllib.request.Request(
+            url, data=payload,
+            headers={"Content-Type": "application/json"},
+        )
+        with urllib.request.urlopen(req) as resp:
+            result = json.loads(resp.read())
+        embeddings = [item["embedding"] for item in sorted(result["data"], key=lambda x: x["index"])]
+        return np.array(embeddings)
+    def _embed_local(self, queries: List[str]) -> np.ndarray:
+        """Embed by loading Qwen3-0.6B locally via vLLM."""
         if self._embedder is None:
             try:
                 from vllm import LLM
             except ImportError:
                 raise ImportError(
+                    "vLLM is required for local embedding. "
+                    "Install with: uv pip install vllm\n"
+                    "Or start a vLLM server and pass embed_url to from_pretrained()."
                 )
             self._embedder = LLM(
                 model="Qwen/Qwen3-0.6B",
                 dtype="half",
             )
         outputs = self._embedder.embed(queries)
         return np.array([o.outputs.embedding for o in outputs])