// Speculative-Decode Compatibility Checker (v0.8.5 anti-bullshit pack #11)
//
// Pain: speculative decoding (vLLM, SGLang, llama.cpp, transformers
// `assistant_model`) requires the draft and target model to share an
// EXACT vocabulary. If token IDs disagree, every draft token is
// rejected by the target's verifier — the user pays the draft compute
// AND the full target compute, getting WORSE throughput than baseline.
// Worse, the system still produces correct output (just slower), so
// the bug is invisible in unit tests.
//
// Common silent failures:
// - Llama-3.1 draft + Llama-3.2 target (vocab differs by added tokens)
// - Mistral draft + Llama target (different tokenizer family entirely)
// - Quantized variant with different special tokens
// - Chat-template additions (`<|im_start|>` etc) on one side only
//
// vLLM #4570 / #16757 / #20409 / #12488 all surface variants of this.
//
// Tool: paste two HF model ids → fetch `tokenizer.json` from HF Hub for
// both → compare vocab type, size, token-to-id sample, special tokens,
// added tokens → verdict + speedup estimate when compatible.
//
// Pure logic + async fetch. No human strings; main.js does i18n.
// =============================================================================
// HF Hub fetching
// =============================================================================
//
// HF Hub serves text-content files (tokenizer.json, tokenizer_config.json,
// config.json) with CORS. The v0.7.4 autocomplete already proved this
// path is reachable from the browser. We fetch with a short timeout so
// the UI doesn't hang on gated/private/missing models.
const HF_BASE = "https://huggingface.co";
// 15s timeout — Llama-3.x tokenizer.json is ~17 MB via LFS-CDN and the
// fetch can take 3-8s on first hit (cold cache). 8s was too tight.
const FETCH_TIMEOUT_MS = 15000;
async function fetchHfJson(modelId, fileName) {
if (typeof modelId !== "string" || !modelId.trim()) {
return { ok: false, error: "missing_model_id" };
}
// Use `/resolve/main/` (NOT `/raw/main/`) so we get the actual content
// for LFS-tracked artifacts. Llama-3.x tokenizer.json is ~17 MB and
// stored via Git-LFS — `/raw/main/` returns the LFS POINTER text
// ("version https://git-lfs.github.com/spec/v1\noid sha256:..."),
// which JSON.parse rejects, leaving the linter with empty vocabs and
// a silent false-fail. `/resolve/main/` redirects through HF's CDN
// for LFS files and serves small files (config.json) unchanged. CORS
// is granted for both via Access-Control-Allow-Origin headers.
const url = `${HF_BASE}/${encodeURI(modelId.trim())}/resolve/main/${fileName}`;
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
try {
const res = await fetch(url, { signal: controller.signal });
clearTimeout(timer);
if (res.status === 401 || res.status === 403) {
return { ok: false, error: "gated_or_private", status: res.status };
}
if (res.status === 404) {
return { ok: false, error: "not_found", status: 404 };
}
if (!res.ok) {
return { ok: false, error: "fetch_failed", status: res.status };
}
const text = await res.text();
try {
return { ok: true, data: JSON.parse(text), bytes: text.length };
} catch (e) {
return { ok: false, error: "parse_failed", message: String(e).slice(0, 200) };
}
} catch (e) {
clearTimeout(timer);
if (e.name === "AbortError") {
return { ok: false, error: "timeout" };
}
return { ok: false, error: "network", message: String(e).slice(0, 200) };
}
}
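// Illustrative guard (a sketch, not wired into fetchHfJson above): a
// Git-LFS pointer file is a few lines of text with a fixed first line,
// so a prefix check before JSON.parse separates "got the LFS pointer"
// from "got genuinely malformed JSON".
function looksLikeLfsPointer(text) {
return typeof text === "string"
&& text.startsWith("version https://git-lfs.github.com/spec/v1");
}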
export async function fetchTokenizer(modelId) {
// tokenizer.json is the canonical fast-tokenizer artifact. If it's
// absent (some older models ship only sentencepiece), fall back to
// tokenizer_config.json which carries the special-tokens metadata
// even without the BPE merges.
const main = await fetchHfJson(modelId, "tokenizer.json");
if (main.ok) return { ...main, source: "tokenizer.json" };
const fallback = await fetchHfJson(modelId, "tokenizer_config.json");
if (fallback.ok) return { ...fallback, source: "tokenizer_config.json" };
return main; // surface the original error code
}
export async function fetchConfig(modelId) {
return await fetchHfJson(modelId, "config.json");
}
// =============================================================================
// Open-mirror fallback for gated models
// =============================================================================
//
// HF officially DISCOURAGES browser-side tokens (their own transformers.js
// docs: "we only support accessing private/gated models from server-side
// environments"). For client-only tools, the practical workaround for
// gated families (Llama, Mistral, Gemma) is to fall back to public mirrors
// that re-host the same tokenizer:
// - unsloth/{name} ← unsloth's open redistributions
// - unsloth/Meta-{name} ← Meta-prefixed Llama mirrors
// - unsloth/{name}-bnb-4bit ← quantized variants (tokenizer preserved)
//
// Tokenizer (BPE merges + vocab) is text — quantization touches weights,
// not the tokenizer artifact, so the mirror's tokenizer.json is usually
// byte-identical to the gated original. Caveat: some unsloth releases
// patch chat-template tokens (issue #880); we surface that in the UI
// with a "verify chat-template if exact match required" note.
const MIRROR_PATTERN_BUILDERS = [
(id) => {
const last = id.split("/").slice(-1)[0];
return `unsloth/${last}`;
},
(id) => {
const last = id.split("/").slice(-1)[0];
return last.startsWith("Meta-") ? `unsloth/${last}` : `unsloth/Meta-${last}`;
},
(id) => {
const last = id.split("/").slice(-1)[0];
return `unsloth/${last}-bnb-4bit`;
},
(id) => {
const last = id.split("/").slice(-1)[0];
return last.startsWith("Meta-") ? `unsloth/${last}-bnb-4bit` : `unsloth/Meta-${last}-bnb-4bit`;
},
];
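// Example expansion (derived from the builders above) for the gated id
// "meta-llama/Llama-3.1-8B-Instruct":
//   unsloth/Llama-3.1-8B-Instruct
//   unsloth/Meta-Llama-3.1-8B-Instruct
//   unsloth/Llama-3.1-8B-Instruct-bnb-4bit
//   unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit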
export async function fetchTokenizerWithMirrorFallback(modelId) {
const original = await fetchTokenizer(modelId);
if (original.ok) return { ...original, viaMirror: null };
// Only attempt mirror fallback when the failure is gated/private.
// 404 / network / parse errors aren't fixable by trying a mirror.
if (original.error !== "gated_or_private") {
return { ...original, viaMirror: null };
}
const tried = new Set([modelId]);
for (const build of MIRROR_PATTERN_BUILDERS) {
let candidate;
try { candidate = build(modelId); }
catch { continue; }
if (!candidate || tried.has(candidate)) continue;
tried.add(candidate);
const r = await fetchTokenizer(candidate);
if (r.ok) return { ...r, viaMirror: candidate, mirrorOf: modelId };
}
return { ...original, viaMirror: null, triedMirrors: [...tried].slice(1) };
}
export async function fetchConfigWithMirrorFallback(modelId, mirrorId) {
// Prefer the mirror's config when one was used (param counts come from
// there), but also try the ORIGINAL config — some unsloth mirrors omit
// it. Falls back gracefully.
if (mirrorId) {
const m = await fetchConfig(mirrorId);
if (m.ok) return { ...m, viaMirror: mirrorId };
}
const o = await fetchConfig(modelId);
return { ...o, viaMirror: null };
}
// =============================================================================
// Vocab extraction + comparison
// =============================================================================
// Return a Map<string,id> for whatever shape the tokenizer.json carries.
// HF fast tokenizers store vocab under `model.vocab`, which is either
// {token: id} (BPE) or [[token, score], ...] (Unigram). Special tokens
// live under top-level `added_tokens` (with id) and the model itself
// keeps an `unk_token`/`bos_token`/`eos_token` etc shape.
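//
// Concrete shapes (abridged, values illustrative):
//   BPE:     { "model": { "type": "BPE", "vocab": { "hello": 123, ... } } }
//   Unigram: { "model": { "type": "Unigram", "vocab": [["<unk>", -2.7], ...] } }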
function extractVocab(tokenizer) {
if (!tokenizer || typeof tokenizer !== "object") return null;
const model = tokenizer.model;
if (!model) return null;
let vocab = null;
if (model.vocab && typeof model.vocab === "object" && !Array.isArray(model.vocab)) {
// BPE / WordPiece form
vocab = model.vocab;
} else if (Array.isArray(model.vocab)) {
// Unigram form: [[token, log_prob], ...]
vocab = {};
for (let i = 0; i < model.vocab.length; i++) {
const entry = model.vocab[i];
if (Array.isArray(entry)) vocab[entry[0]] = i;
}
}
return vocab;
}
function extractAddedTokens(tokenizer) {
if (!tokenizer || typeof tokenizer !== "object") return [];
const arr = tokenizer.added_tokens;
if (!Array.isArray(arr)) return [];
return arr.map(t => ({
id: typeof t.id === "number" ? t.id : null,
content: typeof t.content === "string" ? t.content : "",
special: !!t.special,
})).filter(t => t.content);
}
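// For reference, a typical added_tokens entry (Llama-3 family):
//   { "id": 128000, "content": "<|begin_of_text|>", "special": true }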
function extractSpecialTokens(tokenizer) {
// tokenizer.json places special-token strings on the post-processor /
// template — but the canonical names are in tokenizer_config.json.
// Return what's available; the UI can show "—" for missing.
if (!tokenizer || typeof tokenizer !== "object") return {};
return {
bos_token: tokenizer.bos_token ?? null,
eos_token: tokenizer.eos_token ?? null,
pad_token: tokenizer.pad_token ?? null,
unk_token: tokenizer.unk_token ?? null,
};
}
function tokenizerType(tokenizer) {
return tokenizer?.model?.type || null;
}
// Full-vocab compare (fine in JS for vocabs up to ~150K): build both
// maps, then walk every key on the SMALLER side, counting exact id
// matches and recording the first mismatch. Cost is
// O(min(|target|, |draft|)) whether or not the vocabs differ.
export function compareVocabs(targetTok, draftTok) {
const tType = tokenizerType(targetTok);
const dType = tokenizerType(draftTok);
const tVocab = extractVocab(targetTok);
const dVocab = extractVocab(draftTok);
if (!tVocab || !dVocab) {
return {
type_match: tType !== null && tType === dType,
target_type: tType,
draft_type: dType,
vocab_size_match: false,
target_vocab_size: tVocab ? Object.keys(tVocab).length : 0,
draft_vocab_size: dVocab ? Object.keys(dVocab).length : 0,
sampled_total: 0,
sampled_match_count: 0,
first_mismatch: null,
special_tokens_diff: [],
added_tokens_diff: [],
};
}
const tKeys = Object.keys(tVocab);
const dKeys = Object.keys(dVocab);
const tSize = tKeys.length;
const dSize = dKeys.length;
const sizeMatch = tSize === dSize;
// Sample comparison: walk every key on the SMALLER side. For each
// key, check the id matches exactly. First mismatch is recorded.
const sampleKeys = tSize <= dSize ? tKeys : dKeys;
const a = tSize <= dSize ? tVocab : dVocab;
const b = tSize <= dSize ? dVocab : tVocab;
const sideA = tSize <= dSize ? "target" : "draft";
const sideB = sideA === "target" ? "draft" : "target";
let matchCount = 0;
let firstMismatch = null;
for (const key of sampleKeys) {
const aId = a[key];
const bId = b[key];
if (aId === bId) {
matchCount++;
} else if (firstMismatch === null) {
firstMismatch = { token: key, [`${sideA}_id`]: aId, [`${sideB}_id`]: bId };
}
}
// Special-token diff
const tSpec = extractSpecialTokens(targetTok);
const dSpec = extractSpecialTokens(draftTok);
const specDiff = [];
for (const name of ["bos_token", "eos_token", "pad_token", "unk_token"]) {
if ((tSpec[name] ?? null) !== (dSpec[name] ?? null)) {
specDiff.push({ name, target: tSpec[name], draft: dSpec[name] });
}
}
// Added-tokens diff (chat-template tokens etc.)
const tAdded = extractAddedTokens(targetTok);
const dAdded = extractAddedTokens(draftTok);
const tAddedSet = new Set(tAdded.map(x => `${x.id}:${x.content}`));
const dAddedSet = new Set(dAdded.map(x => `${x.id}:${x.content}`));
const addedDiff = [];
for (const k of tAddedSet) if (!dAddedSet.has(k)) addedDiff.push({ side: "target_only", token: k });
for (const k of dAddedSet) if (!tAddedSet.has(k)) addedDiff.push({ side: "draft_only", token: k });
return {
type_match: tType === dType,
target_type: tType,
draft_type: dType,
vocab_size_match: sizeMatch,
target_vocab_size: tSize,
draft_vocab_size: dSize,
sampled_total: sampleKeys.length,
sampled_match_count: matchCount,
first_mismatch: firstMismatch,
special_tokens_diff: specDiff,
added_tokens_diff: addedDiff,
};
}
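// Tiny worked example (hypothetical fixtures, for documentation only):
//   const t = { model: { type: "BPE", vocab: { a: 0, b: 1 } } };
//   const d = { model: { type: "BPE", vocab: { a: 0, b: 2 } } };
//   compareVocabs(t, d) → type_match: true, vocab_size_match: true,
//   sampled_total: 2, sampled_match_count: 1,
//   first_mismatch: { token: "b", target_id: 1, draft_id: 2 }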
// =============================================================================
// Param-count parsing — best-effort from model id strings
// =============================================================================
//
// HF model ids commonly carry a size hint: "Llama-3.1-8B", "Qwen2.5-72B",
// "Mistral-7B-v0.3". Parse the largest "{N}{B|M}" token; fall back to
// fetched config.json hidden_size × num_hidden_layers heuristic.
const PARAM_HINT_RE = /(\d+(?:\.\d+)?)\s*([bm])\b/gi;
export function parseParamHint(modelId) {
if (typeof modelId !== "string") return null;
// Pick the LAST match: for "Llama-3.1-8B" we want the 8B size hint,
// not the version "3.1" (which lacks a b/m suffix and never matches
// anyway). matchAll requires the /g flag, hence /gi on the constant.
const matches = [...modelId.matchAll(PARAM_HINT_RE)];
if (matches.length === 0) return null;
const last = matches[matches.length - 1];
const value = parseFloat(last[1]);
const unit = last[2].toLowerCase();
if (isNaN(value)) return null;
const params = unit === "b" ? value * 1e9 : value * 1e6;
return params;
}
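// Sanity examples (ids illustrative):
//   parseParamHint("meta-llama/Llama-3.1-8B")     → 8e9
//   parseParamHint("Qwen/Qwen2.5-0.5B-Instruct")  → 5e8
//   parseParamHint("org/model-without-size-hint") → null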
// Approximate param count from config.json. Highly heuristic.
function paramsFromConfig(config) {
if (!config) return null;
const h = config.hidden_size ?? config.n_embd ?? config.d_model;
const l = config.num_hidden_layers ?? config.n_layer ?? config.num_layers;
const v = config.vocab_size;
if (typeof h !== "number" || typeof l !== "number" || typeof v !== "number") return null;
// Rough transformer param count: 12 × h² × l + h × v (embedding) + h × v (output, if not tied).
// Not exact but order-of-magnitude usable for ratio computation.
return 12 * h * h * l + 2 * h * v;
}
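// Order-of-magnitude check against Llama-3.1-8B's config (hidden_size
// 4096, 32 layers, vocab 128256): 12·4096²·32 + 2·4096·128256 ≈
// 6.44e9 + 1.05e9 ≈ 7.5e9, close enough to the nominal 8B for ratio
// purposes.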
// =============================================================================
// Speedup estimation
// =============================================================================
//
// Expected speedup under i.i.d. token acceptance (Leviathan et al.
// 2023, "Fast Inference from Transformers via Speculative Decoding"):
//   S = (1 - α^(K+1)) / ((1 - α) × (K × T_d/T_t + 1))
// where α = draft acceptance rate, K = lookahead, T_d/T_t = ratio of
// draft to target step time. For practical configs (K=4-7, α=0.6-0.8)
// this tool uses the cruder heuristic
//   S ≈ 1 + α × (1 - param_ratio)
// capped at a ceiling of ~3-4x. Anything beyond that is wishful.
//
// Without α measured in-domain, return a band: low (α=0.5), expected
// (α=0.7), high (α=0.85). Surfaces the uncertainty honestly.
function speedupBand(targetParams, draftParams) {
if (!targetParams || !draftParams) return null;
const ratio = draftParams / targetParams;
if (ratio >= 1) {
// Draft must be smaller; this is misuse.
return { ratio, code: "draft_not_smaller" };
}
const compute = (alpha) => {
const s = 1 + alpha * (1 - ratio);
// Cap at empirical 3.5x ceiling — beyond that, the assumptions break.
return Math.min(s, 3.5);
};
return {
ratio,
low: Math.round(compute(0.50) * 100) / 100,
expected: Math.round(compute(0.70) * 100) / 100,
high: Math.round(compute(0.85) * 100) / 100,
};
}
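// Worked example: 70B target + 8B draft → ratio ≈ 0.114, giving
// low ≈ 1.44x, expected ≈ 1.62x, high ≈ 1.75x. A 1B draft against the
// same 70B target (ratio ≈ 0.014) lands near the heuristic's upper
// range: expected ≈ 1.69x.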
// =============================================================================
// Public entry point — orchestrates fetch + compare + speedup
// =============================================================================
const COMPATIBLE_THRESHOLD = 0.999; // 99.9% of sampled tokens map identically
const PARTIAL_THRESHOLD = 0.95; // >=95% but <99.9%
export async function checkCompatibility(targetId, draftId) {
// Guard against non-string inputs too: `.trim()` below would throw on
// anything truthy that is not a string.
if (typeof targetId !== "string" || !targetId.trim()
|| typeof draftId !== "string" || !draftId.trim()) {
return { code: "missing_input", params: { targetId, draftId }, errors: [] };
}
if (targetId.trim() === draftId.trim()) {
return { code: "identical_models", params: { targetId, draftId }, errors: [] };
}
const [tTok, dTok] = await Promise.all([
fetchTokenizerWithMirrorFallback(targetId),
fetchTokenizerWithMirrorFallback(draftId),
]);
const errors = [];
if (!tTok.ok) errors.push({ side: "target", error: tTok.error, status: tTok.status, triedMirrors: tTok.triedMirrors });
if (!dTok.ok) errors.push({ side: "draft", error: dTok.error, status: dTok.status, triedMirrors: dTok.triedMirrors });
if (!tTok.ok || !dTok.ok) {
return { code: "fetch_failed", params: { targetId, draftId }, errors };
}
// Fetch configs — prefer mirror when one was used.
const [tCfg, dCfg] = await Promise.all([
fetchConfigWithMirrorFallback(targetId, tTok.viaMirror),
fetchConfigWithMirrorFallback(draftId, dTok.viaMirror),
]);
const cmp = compareVocabs(tTok.data, dTok.data);
// Param ratio + speedup estimate
const tParams = paramsFromConfig(tCfg.ok ? tCfg.data : null) || parseParamHint(targetId);
const dParams = paramsFromConfig(dCfg.ok ? dCfg.data : null) || parseParamHint(draftId);
const speedup = speedupBand(tParams, dParams);
const sampledMatchRatio = cmp.sampled_total === 0
? 0
: cmp.sampled_match_count / cmp.sampled_total;
let code;
if (!cmp.type_match) {
code = "type_mismatch";
} else if (!cmp.vocab_size_match) {
code = "vocab_size_mismatch";
} else if (sampledMatchRatio >= COMPATIBLE_THRESHOLD) {
code = cmp.special_tokens_diff.length || cmp.added_tokens_diff.length
? "compatible_with_caveats"
: "compatible";
} else if (sampledMatchRatio >= PARTIAL_THRESHOLD) {
code = "partial_compatible";
} else {
code = "incompatible";
}
return {
code,
params: {
targetId, draftId,
...cmp,
sampled_match_ratio: Math.round(sampledMatchRatio * 10000) / 10000,
target_params: tParams,
draft_params: dParams,
param_ratio: speedup?.ratio ?? null,
speedup_low: speedup?.low ?? null,
speedup_expected: speedup?.expected ?? null,
speedup_high: speedup?.high ?? null,
target_source: tTok.source,
draft_source: dTok.source,
target_via_mirror: tTok.viaMirror || null,
draft_via_mirror: dTok.viaMirror || null,
},
errors,
};
}
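// Illustrative call from main.js (hypothetical ids; main.js maps the
// returned code + params onto localized strings):
//
//   const verdict = await checkCompatibility(
//     "meta-llama/Llama-3.1-70B-Instruct",  // target
//     "meta-llama/Llama-3.1-8B-Instruct",   // draft
//   );
//   // verdict.code is one of: compatible, compatible_with_caveats,
//   // partial_compatible, incompatible, type_mismatch,
//   // vocab_size_mismatch, fetch_failed, identical_models,
//   // missing_input. verdict.params carries vocab sizes, mismatch
//   // details, and the speedup band.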