| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
| <title>vLLM deployment advisor</title> |
| <link rel="preconnect" href="https://huggingface.co" /> |
| <style> |
| :root { |
| --bg: #0f1419; |
| --surface: #1a2332; |
| --surface2: #243044; |
| --border: #334155; |
| --text: #e2e8f0; |
| --muted: #94a3b8; |
| --accent: #38bdf8; |
| --accent2: #a78bfa; |
| --good: #34d399; |
| --warn: #fbbf24; |
| } |
| * { box-sizing: border-box; } |
| body { |
| margin: 0; |
| font-family: "Segoe UI", system-ui, sans-serif; |
| background: var(--bg); |
| color: var(--text); |
| line-height: 1.5; |
| min-height: 100vh; |
| } |
| .wrap { |
| max-width: 1100px; |
| margin: 0 auto; |
| padding: 1.5rem 1.25rem 3rem; |
| } |
| h1 { |
| font-size: 1.35rem; |
| font-weight: 600; |
| margin: 0 0 0.25rem; |
| letter-spacing: -0.02em; |
| } |
| .sub { |
| color: var(--muted); |
| font-size: 0.9rem; |
| margin-bottom: 1.5rem; |
| } |
| label { |
| display: block; |
| font-size: 0.8rem; |
| color: var(--muted); |
| margin-bottom: 0.35rem; |
| } |
| input[type="text"], input[type="number"], select { |
| width: 100%; |
| padding: 0.6rem 0.75rem; |
| border: 1px solid var(--border); |
| border-radius: 8px; |
| background: var(--surface); |
| color: var(--text); |
| font-size: 0.95rem; |
| } |
| input:focus, select:focus { |
| outline: 2px solid var(--accent); |
| outline-offset: 1px; |
| } |
| .row { |
| display: grid; |
| gap: 1rem; |
| margin-bottom: 1rem; |
| } |
| @media (min-width: 640px) { |
| .row.cols-2 { grid-template-columns: 1fr 1fr; } |
| .row.cols-3 { grid-template-columns: repeat(3, 1fr); } |
| } |
| button.primary { |
| padding: 0.65rem 1.25rem; |
| background: linear-gradient(135deg, #0ea5e9, #6366f1); |
| color: #fff; |
| border: none; |
| border-radius: 8px; |
| font-weight: 600; |
| cursor: pointer; |
| font-size: 0.95rem; |
| } |
| button.primary:hover { filter: brightness(1.08); } |
| button.primary:disabled { opacity: 0.5; cursor: not-allowed; } |
| .card { |
| background: var(--surface); |
| border: 1px solid var(--border); |
| border-radius: 12px; |
| padding: 1.1rem 1.25rem; |
| margin-top: 1rem; |
| } |
| .card h2 { |
| font-size: 1rem; |
| margin: 0 0 0.75rem; |
| color: var(--accent); |
| } |
| .err { color: #f87171; font-size: 0.9rem; margin-top: 0.5rem; } |
| .ok { color: var(--good); font-size: 0.9rem; } |
| table { |
| width: 100%; |
| border-collapse: collapse; |
| font-size: 0.85rem; |
| } |
| th, td { |
| text-align: left; |
| padding: 0.45rem 0.5rem; |
| border-bottom: 1px solid var(--border); |
| } |
| th { color: var(--muted); font-weight: 500; } |
| .gpu-grid { |
| display: grid; |
| gap: 0.65rem; |
| grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); |
| } |
| .gpu-card { |
| background: var(--surface2); |
| border: 1px solid var(--border); |
| border-radius: 10px; |
| padding: 0.75rem 0.9rem; |
| cursor: pointer; |
| transition: border-color 0.15s, box-shadow 0.15s; |
| } |
| .gpu-card:hover, .gpu-card.selected { |
| border-color: var(--accent); |
| box-shadow: 0 0 0 1px var(--accent); |
| } |
| .gpu-card .name { font-weight: 600; font-size: 0.9rem; } |
| .gpu-card .vram { color: var(--muted); font-size: 0.8rem; } |
| .gpu-detail { |
| margin-top: 1rem; |
| padding: 1rem; |
| background: var(--bg); |
| border-radius: 8px; |
| border: 1px solid var(--border); |
| font-size: 0.88rem; |
| } |
| .gpu-detail dl { |
| display: grid; |
| grid-template-columns: auto 1fr; |
| gap: 0.35rem 1rem; |
| margin: 0; |
| } |
| .gpu-detail dt { color: var(--muted); } |
| .gpu-detail dd { margin: 0; } |
| pre.cmd { |
| background: #0c1220; |
| border: 1px solid var(--border); |
| border-radius: 8px; |
| padding: 1rem; |
| overflow-x: auto; |
| font-size: 0.78rem; |
| line-height: 1.45; |
| white-space: pre-wrap; |
| word-break: break-all; |
| } |
| .badge { |
| display: inline-block; |
| padding: 0.15rem 0.45rem; |
| border-radius: 4px; |
| font-size: 0.75rem; |
| background: var(--surface2); |
| color: var(--muted); |
| } |
| .hint { font-size: 0.8rem; color: var(--muted); margin-top: 0.75rem; } |
| .spinner { display: inline-block; width: 1rem; height: 1rem; border: 2px solid var(--border); border-top-color: var(--accent); border-radius: 50%; animation: spin 0.7s linear infinite; vertical-align: middle; margin-right: 0.35rem; } |
| @keyframes spin { to { transform: rotate(360deg); } } |
| .model-row { |
| display: grid; |
| grid-template-columns: 1fr auto; |
| gap: 0.5rem; |
| align-items: end; |
| margin-bottom: 0.65rem; |
| } |
| .model-row .model-id-input { margin: 0; } |
| button.btn-ghost { |
| padding: 0.55rem 0.85rem; |
| background: var(--surface2); |
| color: var(--text); |
| border: 1px solid var(--border); |
| border-radius: 8px; |
| cursor: pointer; |
| font-size: 0.85rem; |
| } |
| button.btn-ghost:hover { border-color: var(--accent); } |
| button.btn-ghost.danger:hover { border-color: #f87171; color: #f87171; } |
| .gpu-card.preferred { border-color: var(--accent2); box-shadow: 0 0 0 1px var(--accent2); } |
| .gpu-card.commands-target { outline: 1px dashed var(--good); outline-offset: 2px; } |
| details.model-block { margin-bottom: 1rem; border: 1px solid var(--border); border-radius: 8px; padding: 0.5rem 0.75rem; background: var(--bg); } |
| details.model-block summary { cursor: pointer; font-weight: 600; color: var(--accent); } |
| </style> |
| </head> |
| <body> |
| <div class="wrap"> |
| <h1>vLLM deployment advisor</h1> |
| <p class="sub">Pulls weight sizes from Hugging Face, estimates KV memory, and suggests tensor parallelism and <code style="color:var(--accent2)">vllm serve</code> commands. Add several models to estimate total GPUs on your preferred GPU type (separate vLLM instances). Estimates are heuristic — validate on your hardware.</p> |
|
|
| <div class="card" style="margin-top:0"> |
| <label>Hugging Face models (one per serving endpoint)</label> |
| <p class="hint" style="margin-top:0">Each model is a separate <code>vllm serve</code> process. Planning assumes tensor-parallel groups do not share GPUs with another model unless you colocate manually.</p> |
| <div id="modelListContainer"></div> |
| <button type="button" class="btn-ghost" id="btnAddModel" style="margin-bottom:1rem">+ Add model</button> |
| <div class="row cols-2"> |
| <div> |
| <label for="hfToken">HF token (optional, for gated/private)</label> |
| <input type="text" id="hfToken" placeholder="hf_..." autocomplete="off" /> |
| </div> |
| <div> |
| <label for="preferredGpu">Preferred GPU (for TP & totals)</label> |
| <select id="preferredGpu"></select> |
| </div> |
| </div> |
| <div class="row cols-3"> |
| <div> |
| <label for="weightDtype">Weight memory (dtype)</label> |
| <select id="weightDtype"> |
| <option value="bf16" selected>BF16 / FP16 (2 bytes/param)</option> |
| <option value="fp8">FP8 weights (~1 byte/param, if supported)</option> |
| </select> |
| </div> |
| <div> |
| <label for="kvDtype">KV cache dtype</label> |
| <select id="kvDtype"> |
| <option value="auto">auto</option> |
| <option value="fp8" selected>fp8 (half KV vs fp16)</option> |
| <option value="fp16">fp16</option> |
| </select> |
| </div> |
| <div> |
| <label for="maxModelLen">Max model length (tokens)</label> |
| <input type="number" id="maxModelLen" value="8192" min="256" step="256" /> |
| </div> |
| </div> |
| <div class="row cols-3"> |
| <div> |
| <label for="gpuUtil">Target GPU memory utilization</label> |
| <input type="number" id="gpuUtil" value="0.90" min="0.5" max="0.98" step="0.01" /> |
| </div> |
| <div> |
| <label for="batchHint">Concurrent sequences per model (KV hint)</label> |
| <input type="number" id="batchHint" value="8" min="1" max="512" step="1" /> |
| </div> |
| <div style="display:flex;align-items:flex-end"> |
| <button type="button" class="primary" id="btnFetch" style="width:100%">Fetch all & compute</button> |
| </div> |
| </div> |
| <div id="fetchError" class="err" hidden></div> |
| </div> |
|
|
| <div id="results" hidden> |
| <div class="card"> |
| <h2>Multi-model deployment (preferred GPU)</h2> |
| <div id="multiDeployment"></div> |
| </div> |
|
|
| <div class="card"> |
| <h2>Models & shards (from Hub)</h2> |
| <div id="modelSummary"></div> |
| </div> |
|
|
| <div class="card"> |
| <h2>Memory breakdown</h2> |
| <div id="memBreakdown"></div> |
| </div> |
|
|
| <div class="card"> |
| <h2>GPU catalog</h2> |
| <p class="sub" style="margin:0 0 0.75rem">Click a GPU for full specs. Your <strong>preferred</strong> choice is highlighted for multi-model totals above.</p> |
| <div id="gpuGrid" class="gpu-grid"></div> |
| <div id="gpuDetailPanel" hidden></div> |
| </div> |
|
|
| <div class="card"> |
| <h2>vLLM commands</h2> |
| <p id="commandGpuHint" class="hint" style="margin-top:0"></p> |
| <pre class="cmd" id="vllmCmd"></pre> |
| <p class="hint">Use a different <code>--port</code> per model when running on the same host. Adjust <code>--tensor-parallel-size</code> if your cluster differs. See <a href="https://docs.vllm.ai" style="color:var(--accent)" target="_blank" rel="noopener">vLLM docs</a>.</p> |
| </div> |
| </div> |
| </div> |
|
|
| <script> |
// Base URL for Hugging Face Hub REST API calls (model metadata and file trees).
const HF_API = "https://huggingface.co/api";
| |
/**
 * Build the URL path for a repo id. Ids look like `org/name`, so the `/` must
 * survive: encoding the whole id would yield `%2F` and break `/api/models/...`
 * (HTTP 400). Each segment is therefore encoded on its own; empty segments
 * (doubled or stray slashes) are dropped.
 */
function hfRepoPath(repoId) {
  const encoded = [];
  for (const segment of repoId.trim().split("/")) {
    if (segment) encoded.push(encodeURIComponent(segment));
  }
  return encoded.join("/");
}
| |
/**
 * Reduce a pasted browser URL to a bare repo id, e.g.
 * https://huggingface.co/Qwen/Qwen3-30B-A3B → Qwen/Qwen3-30B-A3B.
 * Non-URL input is returned trimmed; URLs that are not huggingface.co / hf.co
 * model pages (other hosts, /datasets/, /spaces/, too few path segments, or
 * unparseable strings) come back unchanged.
 */
function normalizeHfModelInput(raw) {
  const value = String(raw).trim();
  const looksLikeUrl = /^https?:\/\//i.test(value);
  if (!value || !looksLikeUrl) return value;
  try {
    const url = new URL(value);
    const host = url.hostname.replace(/^www\./i, "").toLowerCase();
    const isHfHost = host === "huggingface.co" || host === "hf.co";
    if (isHfHost) {
      const segs = url.pathname.split("/").filter(Boolean);
      const isModelPath = segs[0] !== "datasets" && segs[0] !== "spaces";
      if (isModelPath && segs.length >= 2) {
        return [decodeURIComponent(segs[0]), decodeURIComponent(segs[1])].join("/");
      }
    }
  } catch {
    /* malformed URL — fall through and return the trimmed input */
  }
  return value;
}
| |
/**
 * Reference GPU table driving all sizing math and UI lists.
 * Fields: vramGb (GB), memBandwidthGbps (GB/s, reference value),
 * tdpW (watts), fp16Tflops (dense FP16, reference value), pcie (form factor),
 * notes (free text shown in the detail panel).
 * Figures are approximate datasheet numbers — see the on-page disclaimer.
 */
const GPU_CATALOG = [
  { id: "h100-sxm", name: "NVIDIA H100 SXM", vramGb: 80, memBandwidthGbps: 3350, tdpW: 700, fp16Tflops: 989, pcie: "PCIe 5.0 x16", notes: "Datacenter flagship; best for large TP." },
  { id: "h100-pcie", name: "NVIDIA H100 PCIe", vramGb: 80, memBandwidthGbps: 2000, tdpW: 350, fp16Tflops: 756, pcie: "PCIe 5.0 x16", notes: "Slightly lower BW than SXM." },
  { id: "h200", name: "NVIDIA H200", vramGb: 141, memBandwidthGbps: 4800, tdpW: 700, fp16Tflops: 989, pcie: "PCIe 5.0 x16", notes: "More HBM than H100." },
  { id: "b200", name: "NVIDIA B200", vramGb: 192, memBandwidthGbps: 8000, tdpW: 1000, fp16Tflops: 2250, pcie: "NVLink / rack", notes: "Blackwell; approximate specs." },
  { id: "a100-80", name: "NVIDIA A100 80GB", vramGb: 80, memBandwidthGbps: 2039, tdpW: 400, fp16Tflops: 312, pcie: "PCIe 4.0", notes: "Common in clouds." },
  { id: "a100-40", name: "NVIDIA A100 40GB", vramGb: 40, memBandwidthGbps: 1555, tdpW: 400, fp16Tflops: 312, pcie: "PCIe 4.0", notes: "" },
  { id: "l40s", name: "NVIDIA L40S", vramGb: 48, memBandwidthGbps: 864, tdpW: 350, fp16Tflops: 362, pcie: "PCIe 4.0 x16", notes: "Inference-oriented Ada." },
  { id: "l40", name: "NVIDIA L40", vramGb: 48, memBandwidthGbps: 864, tdpW: 300, fp16Tflops: 181, pcie: "PCIe 4.0 x16", notes: "Legacy Ada datacenter; predecessor to L40S." },
  { id: "a30", name: "NVIDIA A30", vramGb: 24, memBandwidthGbps: 933, tdpW: 165, fp16Tflops: 165, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere; compact inference." },
  { id: "a10", name: "NVIDIA A10", vramGb: 24, memBandwidthGbps: 600, tdpW: 150, fp16Tflops: 125, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere single-slot cloud GPU." },
  { id: "a10g", name: "NVIDIA A10G", vramGb: 24, memBandwidthGbps: 600, tdpW: 300, fp16Tflops: 125, pcie: "PCIe 4.0 x16", notes: "A10-class (e.g. AWS G5); ref. specs." },
  { id: "l4", name: "NVIDIA L4", vramGb: 24, memBandwidthGbps: 300, tdpW: 72, fp16Tflops: 120, pcie: "PCIe 4.0 x16", notes: "Legacy Ada low-power inference." },
  { id: "t4", name: "NVIDIA T4", vramGb: 16, memBandwidthGbps: 320, tdpW: 70, fp16Tflops: 65, pcie: "PCIe 3.0 x16", notes: "Legacy Turing inference." },
  { id: "v100-32", name: "NVIDIA V100 32GB", vramGb: 32, memBandwidthGbps: 1134, tdpW: 300, fp16Tflops: 125, pcie: "PCIe 3.0 / SXM2", notes: "Legacy Volta; still common in older clusters." },
  { id: "v100-16", name: "NVIDIA V100 16GB", vramGb: 16, memBandwidthGbps: 900, tdpW: 250, fp16Tflops: 125, pcie: "PCIe 3.0 / SXM2", notes: "Legacy Volta 16 GB SKU." },
  { id: "p100-16", name: "NVIDIA P100 16GB", vramGb: 16, memBandwidthGbps: 732, tdpW: 250, fp16Tflops: 19, pcie: "PCIe 3.0", notes: "Legacy Pascal; very dated for LLMs." },
  { id: "a6000", name: "NVIDIA RTX A6000", vramGb: 48, memBandwidthGbps: 768, tdpW: 300, fp16Tflops: 155, pcie: "PCIe 4.0 x16", notes: "Workstation." },
  { id: "3090", name: "NVIDIA GeForce RTX 3090", vramGb: 24, memBandwidthGbps: 936, tdpW: 350, fp16Tflops: 160, pcie: "PCIe 4.0 x16", notes: "Legacy Ampere consumer; 24 GB." },
  { id: "4090", name: "NVIDIA GeForce RTX 4090", vramGb: 24, memBandwidthGbps: 1008, tdpW: 450, fp16Tflops: 330, pcie: "PCIe 4.0 x16", notes: "High BW consumer card." },
  { id: "4080", name: "NVIDIA GeForce RTX 4080", vramGb: 16, memBandwidthGbps: 717, tdpW: 320, fp16Tflops: 195, pcie: "PCIe 4.0 x16", notes: "" },
  { id: "5090", name: "NVIDIA GeForce RTX 5090", vramGb: 32, memBandwidthGbps: 1792, tdpW: 575, fp16Tflops: 420, pcie: "PCIe 5.0 x16", notes: "Approximate consumer flagship." },
  { id: "mi300x", name: "AMD MI300X", vramGb: 192, memBandwidthGbps: 5300, tdpW: 750, fp16Tflops: 1300, pcie: "OAM", notes: "Approximate; check ROCm/vLLM support." },
];
| |
/** Authorization header built from the optional HF-token field; `{}` when the field is blank. */
function authHeaders() {
  const token = document.getElementById("hfToken").value.trim();
  if (!token) return {};
  return { Authorization: `Bearer ${token}` };
}
| |
/** fetch() wrapper that attaches the HF auth header and rejects on any non-2xx status. */
async function hfFetch(url) {
  const resp = await fetch(url, { headers: { ...authHeaders() } });
  if (resp.ok) return resp;
  throw new Error(`${resp.status} ${resp.statusText} — ${url}`);
}
| |
/** GET `url` with auth and parse the response body as JSON. */
async function hfJson(url) {
  return (await hfFetch(url)).json();
}
| |
/** GET `url` with auth and return the raw response body as text. */
async function hfText(url) {
  return (await hfFetch(url)).text();
}
| |
/**
 * Collect model weight files (.safetensors, plus pytorch-style .bin shards)
 * from a Hub tree listing, skipping optimizer/training artifacts and
 * alternative-framework checkpoints. Returns shards sorted largest-first
 * together with the total and the largest single shard size (bytes).
 */
function analyzeTreeFiles(tree) {
  const skipTokens = ["training_args", "optimizer", "scheduler", "tf_model", "flax_model", "rust_model"];
  const isWeightFile = (entry) => {
    const isBlob = entry.type === "blob" || entry.type === "file";
    if (!isBlob || typeof entry.size !== "number") return false;
    const path = entry.path.toLowerCase();
    if (skipTokens.some((token) => path.includes(token))) return false;
    if (path.endsWith(".safetensors")) return true;
    if (!path.endsWith(".bin")) return false;
    return (
      path.endsWith("pytorch_model.bin") ||
      /model-\d+-of-\d+\.bin$/.test(path) ||
      path.includes("pytorch_model-")
    );
  };
  const shards = [];
  let totalBytes = 0;
  for (const entry of tree) {
    if (!isWeightFile(entry)) continue;
    totalBytes += entry.size;
    shards.push({ path: entry.path, sizeBytes: entry.size, sizeGb: entry.size / 1e9 });
  }
  shards.sort((a, b) => b.sizeBytes - a.sizeBytes);
  return {
    files: shards,
    totalBytes,
    maxShardBytes: shards.length ? shards[0].sizeBytes : 0,
  };
}
| |
/** JSON.parse that returns null instead of throwing on malformed input. */
function parseConfigJson(text) {
  let parsed = null;
  try {
    parsed = JSON.parse(text);
  } catch {
    parsed = null;
  }
  return parsed;
}
| |
/**
 * Rough parameter count for a Llama-like transformer config.
 * Honors an explicit `num_parameters` when present; otherwise sums embedding,
 * attention (GQA-aware: K/V sized by num_key_value_heads), gated MLP, norm and
 * untied LM-head terms. Returns null when required fields are missing.
 */
function estimateParamsFromConfig(cfg) {
  if (!cfg) return null;
  if (typeof cfg.num_parameters === "number") return cfg.num_parameters;
  const hidden = cfg.hidden_size;
  const layers = cfg.num_hidden_layers;
  const vocab = cfg.vocab_size;
  const inter = cfg.intermediate_size;
  const heads = cfg.num_attention_heads;
  const kvHeads = cfg.num_key_value_heads ?? heads;
  if (!hidden || !layers || !vocab || !inter || !heads) return null;
  const headDim = hidden / heads;
  const embedParams = vocab * hidden;
  // Q and O are hidden×hidden; K and V are (kvHeads·headDim)×hidden each.
  const attnPerLayer = 2 * hidden * hidden + 2 * kvHeads * headDim * hidden;
  // Gate, up and down projections of the gated MLP.
  const mlpPerLayer = 3 * hidden * inter;
  // Norm weights, matching the original heuristic's 4·hidden·layers budget.
  const normParams = 2 * hidden * layers * 2;
  const lmHeadParams = hidden * vocab;
  return embedParams + layers * (attnPerLayer + mlpPerLayer) + normParams + lmHeadParams;
}
| |
| /** |
| * KV bytes per token per layer: K and V each num_kv_heads * head_dim. |
| * Per token: 2 (K+V) * num_kv_heads * head_dim * bytes |
| */ |
| function kvBytesPerToken(cfg, kvBytesPerEl) { |
| if (!cfg) return 0; |
| const h = cfg.hidden_size; |
| const L = cfg.num_hidden_layers; |
| const nHead = cfg.num_attention_heads; |
| const nKV = cfg.num_key_value_heads ?? nHead; |
| if (!h || !L || !nHead) return 0; |
| const headDim = h / nHead; |
| return L * 2 * nKV * headDim * kvBytesPerEl; |
| } |
| |
/** Bytes per weight parameter: 1 for fp8, 2 for every other (16-bit) choice. */
function bytesPerParamWeight(dtype) {
  if (dtype === "fp8") return 1;
  return 2;
}
| |
/** VRAM actually available to vLLM after applying the target utilization fraction. */
function usableVramGb(vramGb, util) {
  return util * vramGb;
}
| |
| /** |
| * With tensor parallelism, weights and standard attention KV are split across TP ranks: |
| * per-GPU ≈ (weightGb + kvTotalGb) / tp. Need tp ≥ ceil((weight + KV) / usable). |
| * Largest on-disk shard is shown separately (load-time peak can differ by loader). |
| */ |
| function minTpForWeightsAndKv(totalWeightGb, kvTotalGb, usablePerGpuGb) { |
| if (usablePerGpuGb <= 0) return Infinity; |
| const combined = totalWeightGb + kvTotalGb; |
| return Math.max(1, Math.ceil(combined / usablePerGpuGb)); |
| } |
| |
/**
 * Minimum TP so the largest checkpoint shard fits on one GPU.
 * Unknown/zero shard size → 1; no usable VRAM → Infinity.
 */
function minTpForLargestShard(maxShardGb, usablePerGpuGb) {
  if (!maxShardGb || maxShardGb <= 0) return 1;
  if (usablePerGpuGb <= 0) return Infinity;
  const ranks = Math.ceil(maxShardGb / usablePerGpuGb);
  return ranks < 1 ? 1 : ranks;
}
| |
/**
 * Show the full spec sheet for one GPU in the detail panel under the grid.
 * Fields are interpolated directly into innerHTML — acceptable only because
 * `gpu` always comes from the local GPU_CATALOG constant, never user input.
 */
function renderGpuDetail(gpu) {
  const el = document.getElementById("gpuDetailPanel");
  el.hidden = false; // panel ships with the `hidden` attribute in the markup
  el.innerHTML = `
    <div class="gpu-detail">
      <strong style="color:var(--accent)">${gpu.name}</strong>
      <dl style="margin-top:0.75rem">
        <dt>VRAM</dt><dd>${gpu.vramGb} GB</dd>
        <dt>Memory bandwidth (ref.)</dt><dd>~${gpu.memBandwidthGbps} GB/s</dd>
        <dt>FP16 TFLOPS (ref.)</dt><dd>~${gpu.fp16Tflops}</dd>
        <dt>TDP (ref.)</dt><dd>${gpu.tdpW} W</dd>
        <dt>PCIe</dt><dd>${gpu.pcie}</dd>
        <dt>Notes</dt><dd>${gpu.notes || "—"}</dd>
      </dl>
      <p class="hint" style="margin-bottom:0">Published specs vary by SKU and firmware; use vendor datasheets for procurement.</p>
    </div>
  `;
}
| |
// Id of the GPU card last clicked in the catalog; overrides the Preferred
// dropdown for command generation (null = no override).
let selectedGpuId = null;
// Result of the last successful fetch run, kept so GPU clicks can re-render.
/** @type {{ models: object[] } | null} */
let lastFetchCtx = null;
// Monotonic counter minting unique data-row-id values for model rows.
let rowIdSeq = 0;
| |
/**
 * Fill the Preferred GPU <select> from GPU_CATALOG and default it to H100 SXM.
 * Idempotent: bails out if the select is missing or already populated.
 */
function populatePreferredGpuSelect() {
  const select = document.getElementById("preferredGpu");
  if (!select || select.options.length) return;
  for (const gpu of GPU_CATALOG) {
    const option = document.createElement("option");
    option.value = gpu.id;
    option.textContent = `${gpu.name} (${gpu.vramGb} GB)`;
    select.appendChild(option);
  }
  select.value = "h100-sxm";
}
| |
/** Collect normalized, non-empty model ids from every model-row text input. */
function getModelIdsFromInputs() {
  const ids = [];
  for (const input of document.querySelectorAll(".model-id-input")) {
    const id = normalizeHfModelInput(input.value.trim());
    if (id) ids.push(id);
  }
  return ids;
}
| |
/** Rewrite each model input in place with its normalized form (pasted URL → org/name). */
function syncInputValuesFromNormalized() {
  document.querySelectorAll(".model-id-input").forEach((input) => {
    const trimmed = input.value.trim();
    const normalized = normalizeHfModelInput(trimmed);
    if (normalized && normalized !== trimmed) input.value = normalized;
  });
}
| |
/**
 * Append one model-input row (text input + Remove button) to the list.
 * @param {string} initial prefill value — assigned via `.value`, not
 *   interpolated into the HTML template, so arbitrary text is safe.
 */
function addModelRow(initial = "") {
  const container = document.getElementById("modelListContainer");
  const id = `mr-${++rowIdSeq}`; // unique per row, stored in data-row-id
  const wrap = document.createElement("div");
  wrap.className = "model-row";
  wrap.dataset.rowId = id;
  wrap.innerHTML = `
    <div>
      <label class="model-row-label" style="font-size:0.8rem;color:var(--muted)">Model id or URL</label>
      <input type="text" class="model-id-input" placeholder="org/model or https://huggingface.co/…" autocomplete="off" />
    </div>
    <button type="button" class="btn-ghost danger btn-remove-model" title="Remove">Remove</button>`;
  wrap.querySelector(".model-id-input").value = initial;
  container.appendChild(wrap);
  // Keep at least one row so the form can never become empty.
  wrap.querySelector(".btn-remove-model").addEventListener("click", () => {
    if (document.querySelectorAll(".model-row").length <= 1) return;
    wrap.remove();
  });
}
| |
/**
 * Fetch everything needed to size one model from the Hugging Face Hub:
 * repo metadata, the recursive file tree (weight shard sizes), config.json
 * (architecture-based estimates), and — as a fallback — the shard index's
 * metadata.total_size. Rejects if metadata or tree requests fail
 * (hfJson throws on non-2xx); config and index fetches are best-effort.
 */
async function fetchOneModel(modelId) {
  const meta = await hfJson(`${HF_API}/models/${hfRepoPath(modelId)}`);
  // Pin all follow-up requests to the revision the metadata reported.
  const ref = meta.sha || "main";
  const treeUrl = `${HF_API}/models/${hfRepoPath(modelId)}/tree/${encodeURIComponent(ref)}?recursive=true`;
  const tree = await hfJson(treeUrl);
  const analysis = analyzeTreeFiles(Array.isArray(tree) ? tree : []);

  // config.json is optional (may be absent or inaccessible for some repos).
  let config = null;
  try {
    const cfgUrl = `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/config.json`;
    const cfgText = await hfText(cfgUrl);
    config = parseConfigJson(cfgText);
  } catch {
    config = null;
  }

  // Shard index files carry metadata.total_size — used only when the tree
  // listing produced no usable weight-file sizes.
  let indexMeta = null;
  try {
    const idxCandidates = [
      `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/model.safetensors.index.json`,
      `https://huggingface.co/${hfRepoPath(modelId)}/resolve/${encodeURIComponent(ref)}/pytorch_model.bin.index.json`,
    ];
    for (const u of idxCandidates) {
      try {
        const j = await hfJson(u);
        if (j.metadata && j.metadata.total_size != null) {
          indexMeta = j.metadata;
          break;
        }
      } catch { /* try next */ }
    }
  } catch { /* optional */ }
  const totalBytesFromIndex = indexMeta && indexMeta.total_size ? Number(indexMeta.total_size) : null;

  // Prefer real file sizes from the tree; fall back to the index total.
  const totalBytes = analysis.totalBytes > 0 ? analysis.totalBytes : totalBytesFromIndex;
  const totalGbDisk = totalBytes != null ? totalBytes / 1e9 : null;
  // When individual shard sizes are unknown, treat the whole checkpoint as one shard.
  const maxShardGb = analysis.maxShardBytes > 0 ? analysis.maxShardBytes / 1e9 : (totalGbDisk || 0);
  const estParams = estimateParamsFromConfig(config);

  return { modelId, meta, analysis, config, totalGbDisk, maxShardGb, estParams };
}
| |
/**
 * Combine one fetched model ctx with the current form inputs into memory
 * estimates (GB unless noted).
 *
 * weightGb prefers the on-disk checkpoint size scaled by (bytes/param ÷ 2).
 * NOTE(review): that scaling assumes the files on disk are 16-bit (2 bytes per
 * param); a repo already quantized on disk would be mis-scaled — confirm per
 * repo. Falls back to the config-derived parameter estimate when file sizes
 * are unknown. kvTotalGb is a planning upper bound: KV/token × max length ×
 * concurrent-sequence hint.
 */
function metricsForCtx(ctx) {
  const weightDtype = document.getElementById("weightDtype").value;
  const bPerParam = bytesPerParamWeight(weightDtype);
  const weightGbFromParams = ctx.estParams != null ? (ctx.estParams * bPerParam) / 1e9 : null;
  const weightGb = ctx.totalGbDisk != null ? ctx.totalGbDisk * (bPerParam / 2) : weightGbFromParams;

  const kvSel = document.getElementById("kvDtype").value;
  const kvBytesPerEl = kvSel === "fp8" ? 1 : 2; // fp8 KV halves per-element cost
  // Clamp to the same bounds the form inputs declare; fall back to defaults.
  const maxLen = Math.max(256, parseInt(document.getElementById("maxModelLen").value, 10) || 8192);
  const batchHint = Math.max(1, parseInt(document.getElementById("batchHint").value, 10) || 1);

  const kvPerToken = kvBytesPerToken(ctx.config, kvBytesPerEl);
  const kvTotalGb = (kvPerToken * maxLen * batchHint) / 1e9;
  return { weightGb, kvTotalGb, kvPerToken, weightDtype, kvSel, maxLen, batchHint };
}
| |
/**
 * Minimum tensor-parallel size for one model on a given GPU: the larger of the
 * memory-driven TP (weights + KV must fit per rank) and the shard-driven TP.
 * Returns null when the model's weight size is unknown.
 */
function tpForModelOnGpu(ctx, weightGb, kvTotalGb, gpu, util) {
  if (weightGb == null) return null;
  const usable = usableVramGb(gpu.vramGb, util);
  const byMemory = minTpForWeightsAndKv(weightGb, kvTotalGb, usable);
  const byShard = minTpForLargestShard(ctx.maxShardGb, usable);
  return byMemory > byShard ? byMemory : byShard;
}
| |
/**
 * GPU used for generated vLLM commands: a clicked catalog card (selectedGpuId)
 * overrides the Preferred dropdown; falls back to the first catalog entry.
 */
function gpuForCommands() {
  const byId = (id) => GPU_CATALOG.find((g) => g.id === id);
  const clicked = selectedGpuId ? byId(selectedGpuId) : null;
  if (clicked) return clicked;
  return byId(document.getElementById("preferredGpu").value) || GPU_CATALOG[0];
}
| |
/**
 * Render one `vllm serve` command per model for the command-target GPU
 * (clicked catalog card, else the Preferred dropdown) plus a summary hint.
 *
 * Flag-mapping fixes:
 *  - `--kv-cache-dtype` accepts only auto / fp8 / fp8_e5m2 / fp8_e4m3. The
 *    UI's "fp16" choice now maps to "auto" (with 16-bit weights, auto keeps
 *    16-bit KV); the previous literal "fp16" made vLLM reject the command.
 *  - `--dtype` accepts only auto/half/float16/bfloat16/float/float32. FP8
 *    weights are requested with `--quantization fp8`; the previous
 *    `--dtype float8_e4m3fn` is not a valid choice.
 */
function renderVllmCommands(models) {
  const hintEl = document.getElementById("commandGpuHint");
  const cmdEl = document.getElementById("vllmCmd");
  if (!models || !models.length || !cmdEl) return;

  // Clamp utilization to the same range the numeric input enforces.
  const util = Math.min(0.98, Math.max(0.5, parseFloat(document.getElementById("gpuUtil").value) || 0.9));
  const cmdGpu = gpuForCommands();
  const usableCmd = usableVramGb(cmdGpu.vramGb, util);

  const kvFlag = document.getElementById("kvDtype").value === "fp8" ? "fp8_e5m2" : "auto";
  const fp8Weights = document.getElementById("weightDtype").value === "fp8";
  const dtypeFlag = fp8Weights ? "auto" : "bfloat16";
  const maxLen = Math.max(256, parseInt(document.getElementById("maxModelLen").value, 10) || 8192);

  // Per-model TP on the command GPU; unknown sizes fall back to TP=1.
  const blocks = [];
  let totalCmd = 0;
  models.forEach((ctx) => {
    const m = metricsForCtx(ctx);
    const tp = tpForModelOnGpu(ctx, m.weightGb, m.kvTotalGb, cmdGpu, util);
    const tpUse = typeof tp === "number" && !Number.isNaN(tp) ? tp : 1;
    totalCmd += tpUse;
    blocks.push({ ctx, tpUse });
  });

  const lines = [
    `# Total GPUs (separate vLLM servers, ${cmdGpu.name}): ${totalCmd}`,
    `# ~${usableCmd.toFixed(1)} GB usable per GPU @ ${(util * 100).toFixed(0)}% of ${cmdGpu.vramGb} GB VRAM`,
    `# Assign disjoint CUDA_VISIBLE_DEVICES per server on the same host.`,
    "",
  ];
  blocks.forEach((b, i) => {
    const port = 8000 + i; // unique port per co-hosted server
    lines.push(
      `# --- ${b.ctx.modelId} ---`,
      `vllm serve "${b.ctx.modelId}" \\`,
      ` --dtype ${dtypeFlag} \\`
    );
    if (fp8Weights) lines.push(` --quantization fp8 \\`);
    lines.push(
      ` --tensor-parallel-size ${b.tpUse} \\`,
      ` --max-model-len ${maxLen} \\`,
      ` --gpu-memory-utilization ${util} \\`,
      ` --kv-cache-dtype ${kvFlag} \\`,
      ` --port ${port}`,
      ""
    );
  });
  cmdEl.textContent = lines.join("\n").trimEnd();

  if (hintEl) {
    const src = selectedGpuId ? "GPU catalog (clicked card)" : "Preferred GPU dropdown";
    hintEl.textContent = `Tensor parallelism in the commands below uses ${cmdGpu.name} (~${usableCmd.toFixed(1)} GB usable per GPU). Source: ${src}. Click a GPU card to override the dropdown; change the dropdown to clear the override.`;
  }
}
| |
/**
 * Rebuild the clickable GPU catalog grid from precomputed per-GPU numbers.
 * @param {object} state — { usablePerGpuByGpu, shardsFit, tp, util, preferredGpuId }.
 * Clicking a card marks it selected (command-generation override), renders its
 * spec panel, and re-renders the multi-model view when fetch results exist.
 */
function buildGpuGrid(state) {
  const grid = document.getElementById("gpuGrid");
  grid.innerHTML = "";
  const { usablePerGpuByGpu, shardsFit, tp, util, preferredGpuId } = state;
  const cmdGpuId = gpuForCommands().id; // dashed outline marks the commands target

  GPU_CATALOG.forEach((gpu) => {
    const usable = usablePerGpuByGpu[gpu.id];
    const fit = shardsFit[gpu.id];
    const isPref = preferredGpuId && gpu.id === preferredGpuId;
    const isCmdTarget = gpu.id === cmdGpuId;
    const card = document.createElement("div");
    card.className =
      "gpu-card" +
      (selectedGpuId === gpu.id ? " selected" : "") +
      (isPref ? " preferred" : "") +
      (isCmdTarget ? " commands-target" : "");
    card.innerHTML = `
      <div class="name">${isPref ? '<span style="float:right;font-size:0.65rem;color:var(--accent2);text-transform:uppercase">preferred</span>' : ""}${gpu.name}</div>
      <div class="vram">${gpu.vramGb} GB VRAM · ~${usable.toFixed(1)} GB usable @ ${(util * 100).toFixed(0)}%</div>
      <div style="margin-top:0.4rem;font-size:0.78rem;color:var(--muted)">
        Shards fit (largest shard across models) / GPU: <strong style="color:var(--text)">${fit}</strong>
        ${tp[gpu.id] != null ? ` · max TP any model: <strong style="color:var(--good)">${tp[gpu.id]}</strong>` : ""}
      </div>
    `;
    card.addEventListener("click", () => {
      selectedGpuId = gpu.id;
      // Move the visual selection without rebuilding the whole grid.
      document.querySelectorAll(".gpu-card").forEach((c) => c.classList.remove("selected"));
      card.classList.add("selected");
      renderGpuDetail(gpu);
      if (lastFetchCtx && lastFetchCtx.models) computeAndRenderMulti(lastFetchCtx.models);
    });
    grid.appendChild(card);
  });
}
| |
| /** |
| * @param {object[]} models — array of Hub fetch ctx |
| */ |
| function computeAndRenderMulti(models) { |
| const util = Math.min(0.98, Math.max(0.5, parseFloat(document.getElementById("gpuUtil").value) || 0.9)); |
| const preferredGpuId = document.getElementById("preferredGpu").value; |
| const prefGpu = GPU_CATALOG.find((g) => g.id === preferredGpuId) || GPU_CATALOG[0]; |
| |
| const perModel = models.map((ctx) => { |
| const m = metricsForCtx(ctx); |
| const tpPref = tpForModelOnGpu(ctx, m.weightGb, m.kvTotalGb, prefGpu, util); |
| const perGpuPref = |
| m.weightGb != null && tpPref != null ? (m.weightGb + m.kvTotalGb) / tpPref : null; |
| return { ctx, ...m, tpOnPreferred: tpPref, perGpuOnPreferred: perGpuPref }; |
| }); |
| |
| const maxShardAll = Math.max(0, ...models.map((c) => c.maxShardGb || 0)); |
| |
| let summaryHtml = ""; |
| let memHtml = ""; |
| perModel.forEach((row, idx) => { |
| const { ctx, weightGb, kvTotalGb, kvPerToken, weightDtype, kvSel, maxLen, batchHint } = row; |
| const shardRows = ctx.analysis.files.slice(0, 12).map((f) => |
| `<tr><td>${escapeHtml(f.path)}</td><td>${f.sizeGb.toFixed(2)}</td></tr>` |
| ).join(""); |
| const moreShards = |
| ctx.analysis.files.length > 12 |
| ? `<tr><td colspan="2">… ${ctx.analysis.files.length - 12} more</td></tr>` |
| : ""; |
| |
| summaryHtml += ` |
| <details class="model-block" ${idx === 0 ? "open" : ""}> |
| <summary>${escapeHtml(ctx.modelId)}</summary> |
| <p style="margin:0.5rem 0;font-size:0.85rem;color:var(--muted)">${escapeHtml(ctx.meta.pipeline_tag || ctx.meta.library_name || "model")}</p> |
| <table> |
| <tr><th>Metric</th><th>Value</th></tr> |
| <tr><td>Weight files total</td><td>${ctx.totalGbDisk != null ? ctx.totalGbDisk.toFixed(2) + " GB" : "unknown"}</td></tr> |
| <tr><td>Largest shard</td><td>${ctx.maxShardGb > 0 ? ctx.maxShardGb.toFixed(2) + " GB" : "—"}</td></tr> |
| <tr><td>Est. weight (${weightDtype})</td><td>${weightGb != null ? weightGb.toFixed(2) + " GB" : "—"}</td></tr> |
| </table> |
| ${ctx.analysis.files.length ? `<table style="margin-top:0.5rem"><tr><th>File</th><th>GB</th></tr>${shardRows}${moreShards}</table>` : ""} |
| </details>`; |
| |
| memHtml += ` |
| <h3 style="font-size:0.9rem;margin:0.75rem 0 0.4rem;color:var(--accent)">${escapeHtml(ctx.modelId)}</h3> |
| <table> |
| <tr><th>Component</th><th>Estimate</th></tr> |
| <tr><td>Weights</td><td>${weightGb != null ? weightGb.toFixed(2) + " GB" : "—"}</td></tr> |
| <tr><td>KV (${kvSel}, ${maxLen} × ${batchHint} seqs)</td><td>${kvTotalGb.toFixed(3)} GB</td></tr> |
| <tr><td>KV / token</td><td>${(kvPerToken / 1024).toFixed(2)} KiB</td></tr> |
| </table>`; |
| }); |
| |
| document.getElementById("modelSummary").innerHTML = summaryHtml || "<p class='hint'>No models.</p>"; |
| document.getElementById("memBreakdown").innerHTML = |
| memHtml + `<p class="hint">KV is a planning upper bound; vLLM paging changes real usage.</p>`; |
| |
| const usablePerGpuByGpu = {}; |
| const shardsFit = {}; |
| const minTp = {}; |
| for (const gpu of GPU_CATALOG) { |
| const usable = usableVramGb(gpu.vramGb, util); |
| usablePerGpuByGpu[gpu.id] = usable; |
| shardsFit[gpu.id] = |
| maxShardAll > 0 ? Math.floor(usable / maxShardAll) : 0; |
| let maxTp = 0; |
| for (const row of perModel) { |
| if (row.weightGb == null) continue; |
| const t = tpForModelOnGpu(row.ctx, row.weightGb, row.kvTotalGb, gpu, util); |
| if (t != null && t > maxTp) maxTp = t; |
| } |
| minTp[gpu.id] = maxTp || null; |
| } |
| |
| buildGpuGrid({ util, usablePerGpuByGpu, shardsFit, tp: minTp, preferredGpuId }); |
| |
| const totalGpusSeparate = perModel.reduce( |
| (s, r) => s + (typeof r.tpOnPreferred === "number" && !Number.isNaN(r.tpOnPreferred) ? r.tpOnPreferred : 0), |
| 0 |
| ); |
| const sumMemOneGpu = perModel.reduce((s, r) => s + (r.weightGb || 0) + r.kvTotalGb, 0); |
| const usablePref = usableVramGb(prefGpu.vramGb, util); |
| const eachTpOne = perModel.every((r) => r.tpOnPreferred === 1); |
| const fitsAllOnSingleGpu = sumMemOneGpu <= usablePref && eachTpOne; |
| |
| let multiHtml = ` |
| <p class="hint" style="margin-bottom:0.75rem">This table uses the <strong>Preferred GPU</strong> dropdown only. The <strong>vLLM commands</strong> section uses that same GPU until you click a GPU in the catalog — then commands switch to the clicked GPU (dashed outline). Changing the dropdown clears the click override.</p> |
| <p><strong>Preferred GPU:</strong> ${escapeHtml(prefGpu.name)} — ~${usablePref.toFixed(1)} GB usable @ ${(util * 100).toFixed(0)}%</p> |
| <table> |
| <tr><th>Model</th><th>Weights+KV (est.)</th><th>Min TP on preferred</th><th>GPUs (dedicated group)</th></tr> |
| ${perModel |
| .map((r) => { |
| const sum = r.weightGb != null ? r.weightGb + r.kvTotalGb : r.kvTotalGb; |
| const tp = r.tpOnPreferred ?? "—"; |
| const gpus = r.tpOnPreferred ?? "—"; |
| return `<tr> |
| <td>${escapeHtml(r.ctx.modelId)}</td> |
| <td>${sum.toFixed(2)} GB</td> |
| <td>${tp}</td> |
| <td>${gpus}</td> |
| </tr>`; |
| }) |
| .join("")} |
| <tr style="font-weight:600;border-top:2px solid var(--border)"> |
| <td>Total (separate instances)</td> |
| <td>—</td> |
| <td>—</td> |
| <td>${totalGpusSeparate || "—"} GPUs</td> |
| </tr> |
| </table> |
| <p class="hint" style="margin-top:0.75rem"> |
| <strong>Separate instances:</strong> each model uses its own tensor-parallel group; total accelerator count ≈ <strong>${totalGpusSeparate}</strong> × ${escapeHtml(prefGpu.name)} (no GPU sharing between models). |
| </p> |
| <p class="hint"> |
| <strong>Single GPU, multiple models:</strong> needs sum(weights+KV) ≤ usable VRAM on one GPU <em>and</em> each model’s min TP = 1 on that GPU. |
| Here sum ≈ <strong>${sumMemOneGpu.toFixed(2)} GB</strong> vs <strong>${usablePref.toFixed(2)} GB</strong> usable — |
| ${fitsAllOnSingleGpu ? '<span style="color:var(--good)">may fit in theory (still not recommended for large models — VRAM fragmentation & two processes).</span>' : '<span style="color:#f87171">does not fit on one GPU of this type at current settings.</span>'} |
| </p> |
| <p class="hint"><strong>Max configuration on preferred GPU:</strong> at these dtype / max-model-len / batch settings, the table above is the minimum TP per model; you cannot lower TP without reducing context, batch, quantization, or choosing a larger GPU.</p> |
| `; |
| document.getElementById("multiDeployment").innerHTML = multiHtml; |
| |
| renderVllmCommands(models); |
| } |
| |
function tryRecomputeFromCache() {
  // Recompute only when (a) we have cached fetch results, (b) the results
  // panel is visible, and (c) the model ids currently in the inputs still
  // match the cached fetch exactly (same ids, same order). Otherwise a
  // fresh fetch is required and we do nothing.
  const ctx = lastFetchCtx;
  if (!ctx || !ctx.models) return;
  if (document.getElementById("results").hidden) return;

  const currentIds = getModelIdsFromInputs();
  const cachedIds = ctx.models.map((m) => m.modelId);
  const unchanged =
    currentIds.length === cachedIds.length &&
    currentIds.every((id, i) => id === cachedIds[i]);

  if (unchanged) computeAndRenderMulti(ctx.models);
}
| |
// "Add model" button: appends one more empty model-id input row.
document
  .getElementById("btnAddModel")
  .addEventListener("click", function () {
    addModelRow();
  });
| |
// "Fetch all & compute" click handler: resolves each model id via
// fetchOneModel, then renders the full results panel. Per-model failures
// are collected and shown in #fetchError; models that did resolve are
// still rendered. The button is disabled and shows a spinner while the
// (sequential) fetches run.
document.getElementById("btnFetch").addEventListener("click", async () => {
  syncInputValuesFromNormalized();
  const ids = getModelIdsFromInputs();
  const errEl = document.getElementById("fetchError");
  const results = document.getElementById("results");
  // Hide any stale error/results before starting a new fetch cycle.
  errEl.hidden = true;
  results.hidden = true;

  if (ids.length === 0) {
    errEl.textContent = "Add at least one Hugging Face model id or URL.";
    errEl.hidden = false;
    return;
  }

  // Swap the button label for a spinner; restored in the finally block.
  const btn = document.getElementById("btnFetch");
  btn.disabled = true;
  btn.innerHTML = '<span class="spinner"></span>Loading…';

  try {
    const models = [];
    const errors = [];
    // Fetch one model at a time so error messages stay in input order.
    for (let i = 0; i < ids.length; i++) {
      try {
        models.push(await fetchOneModel(ids[i]));
      } catch (e) {
        errors.push(`${ids[i]}: ${e.message || e}`);
      }
    }
    // All models failed: show the errors, drop the cache, render nothing.
    if (errors.length && models.length === 0) {
      errEl.textContent = errors.join("\n");
      errEl.hidden = false;
      lastFetchCtx = null;
      return;
    }
    // Partial failure: warn, but continue rendering the successful models.
    if (errors.length) {
      errEl.textContent = "Some models failed:\n" + errors.join("\n");
      errEl.hidden = false;
    }

    // Cache the fetched models so setting changes can recompute without
    // re-fetching, and clear any clicked-GPU override from a prior run.
    lastFetchCtx = { models };
    document.getElementById("gpuDetailPanel").hidden = true;
    selectedGpuId = null;
    computeAndRenderMulti(models);

    results.hidden = false;
  } catch (e) {
    // Unexpected failure outside the per-model loop (e.g. render error).
    errEl.textContent = e.message || String(e);
    errEl.hidden = false;
    lastFetchCtx = null;
  } finally {
    btn.disabled = false;
    btn.textContent = "Fetch all & compute";
  }
});
| |
// Escape a value for safe interpolation into HTML.
//
// Replaces the DOM-roundtrip implementation (div.textContent → innerHTML),
// which (a) never escaped quote characters — unsafe if the result is ever
// placed inside an attribute value — and (b) required a live DOM. This
// pure-string version escapes & < > " ' and needs no document. Rendered
// output in text contexts is unchanged (&quot;/&#39; display as "/').
//
// @param {*} s  Value to escape; coerced with String(). null/undefined → "".
// @returns {string} HTML-escaped text.
function escapeHtml(s) {
  const repl = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // NOTE(review): original returned "" for null (textContent setter) —
  // preserved here; undefined is also normalized to "" for symmetry.
  return String(s ?? "").replace(/[&<>"']/g, (ch) => repl[ch]);
}
| |
// Return a wrapper around fn that delays each invocation by ms milliseconds,
// cancelling any previously pending call — only the last call in a burst
// actually runs.
function debounce(fn, ms) {
  let pendingTimer;
  return function (...args) {
    clearTimeout(pendingTimer);
    pendingTimer = setTimeout(() => {
      fn(...args);
    }, ms);
  };
}
| |
// Debounced recompute (350 ms) for rapid-fire "input" events on number fields.
const debouncedRecompute = debounce(tryRecomputeFromCache, 350);
| |
// Recompute from the cached fetch whenever a global setting changes.
// Number inputs additionally recompute (debounced) while the user types.
for (const settingId of ["maxModelLen", "batchHint", "gpuUtil", "weightDtype", "kvDtype"]) {
  const el = document.getElementById(settingId);
  if (!el) continue;
  el.addEventListener("change", tryRecomputeFromCache);
  if (el.type === "number") {
    el.addEventListener("input", debouncedRecompute);
  }
}
| |
// Changing the Preferred GPU dropdown clears any clicked-GPU override
// from the catalog, then recomputes against the cached fetch.
document.getElementById("preferredGpu").addEventListener("change", function () {
  selectedGpuId = null;
  tryRecomputeFromCache();
});
| |
// Initial page setup: populate the Preferred GPU dropdown and start with
// a single empty model-id row.
populatePreferredGpuSelect();
addModelRow();
| </script> |
| </body> |
| </html> |
|
|