Spaces:

karlexmarin
/

taf-agent

Running

File size: 10,784 Bytes

3d389cc

// Prompt-Cache Diff Predictor (v0.8.4 anti-bullshit pack #10)
//
// Pain: small prompt edits silently invalidate provider prompt caches,
// turning a 50% discount into a 0% discount and 10x'ing the bill.
// Users debug this blind because:
//   - Anthropic's `cache_control` cache breaks at the first token diff
//     in the marked prefix (TTL 5 min default, 1 hour beta).
//   - OpenAI auto-caches prefixes ≥1024 tokens but invalidates on any
//     prefix change; the 50% read discount only applies on hit.
//   - Gemini's context cache requires explicit creation, ≥32K tokens,
//     and any prefix edit forces a new cache.
//
// Tool: paste old + new prompt → compute longest common prefix in
// tokens → predict per-provider cache hit ratio + $ delta vs no-cache.
//
// Pure logic — no human strings; main.js does i18n. Returns
// {code, params, providers: [{provider_id, ...}]}.

// =============================================================================
// Token estimation — heuristic, browser-only
// =============================================================================
//
// Real tokenizers vary by ±15% between Llama / GPT / Claude / Qwen and
// running them in-browser would mean shipping a 5-10 MB WASM blob. For a
// cache-diff predictor the absolute count doesn't matter — what matters
// is the RATIO of common-prefix to divergent-suffix tokens, which is
// robust to estimator choice. The three profiles below cover 95% of
// real prompts; users with extreme cases can paste pre-tokenized counts.
// Characters-per-token ratios used by the heuristic estimator, plus the
// i18n key the UI uses to label each profile.
const TOKEN_PROFILES = {
  english: {
    chars_per_token: 4.0,
    label_key: "cache.profile.english",
  },
  code: {
    chars_per_token: 3.5,
    label_key: "cache.profile.code",
  },
  // CJK / Cyrillic
  mixed: {
    chars_per_token: 2.0,
    label_key: "cache.profile.mixed",
  },
};

/**
 * Estimate the token count of `text` from its length.
 *
 * @param {string} text - Prompt text; non-strings and "" count as 0 tokens.
 * @param {string} [profile="english"] - Key into TOKEN_PROFILES; unknown
 *   keys fall back to 4.0 characters per token.
 * @returns {number} `text.length / chars_per_token`, rounded up.
 */
export function estimateTokens(text, profile = "english") {
  const hasText = typeof text === "string" && text.length > 0;
  if (!hasText) {
    return 0;
  }

  const profileEntry = TOKEN_PROFILES[profile];
  const charsPerToken = profileEntry?.chars_per_token ?? 4.0;
  return Math.ceil(text.length / charsPerToken);
}

// =============================================================================
// Provider rules — pricing + cache mechanics
// =============================================================================
//
// Prices are USD per million tokens, snapshot 2026-01 (knowledge cutoff).
// `cache_read_multiplier` is the fraction of input price billed on a
// cache hit (Anthropic 0.10 = 10%; OpenAI 0.50 = 50%; Gemini 0.25 = 25%).
// `cache_write_multiplier` accounts for Anthropic's 25% write surcharge
// the first time a prefix is seen.
//
// `min_cache_tokens` is the floor below which the provider cannot cache
// (OpenAI auto-cache requires ≥1024; Gemini context cache ≥32K).
// Anthropic has no min token floor but requires explicit cache_control
// marker — we treat that as min=0 with a `requires_explicit` flag for UI.
// Provider table keyed by provider id. Every entry has the same eight
// fields in the same order: name, min_cache_tokens, requires_explicit,
// cache_ttl_seconds, input_per_mt, output_per_mt, cache_write_multiplier,
// cache_read_multiplier. Cache mechanics are shared per provider family;
// only the display name and the per-million-token prices vary per model.
export const PROVIDERS = (() => {
  // NOTE(review): min_cache_tokens is 0 for the Anthropic family here;
  // confirm against the provider's documented minimum cacheable length.
  const anthropicRules = {
    min_cache_tokens: 0,
    requires_explicit: true,
    cache_ttl_seconds: 300, // 5 min default
    cache_write_multiplier: 1.25,
    cache_read_multiplier: 0.10, // 10% of input
  };
  const openaiRules = {
    min_cache_tokens: 1024,
    requires_explicit: false,
    cache_ttl_seconds: 600, // ~5-10 min observed
    cache_write_multiplier: 1.00,
    cache_read_multiplier: 0.50, // 50% of input
  };
  const geminiRules = {
    min_cache_tokens: 32768,
    requires_explicit: true,
    cache_ttl_seconds: 3600, // 1 hour default for context cache
    cache_write_multiplier: 1.00,
    cache_read_multiplier: 0.25, // 25% of input
  };

  // Builds a fresh entry object so no two providers share a reference.
  const entry = (name, rules, inputPerMt, outputPerMt) => ({
    name,
    min_cache_tokens: rules.min_cache_tokens,
    requires_explicit: rules.requires_explicit,
    cache_ttl_seconds: rules.cache_ttl_seconds,
    input_per_mt: inputPerMt,
    output_per_mt: outputPerMt,
    cache_write_multiplier: rules.cache_write_multiplier,
    cache_read_multiplier: rules.cache_read_multiplier,
  });

  return {
    anthropic_opus: entry("Claude Opus 4.7", anthropicRules, 15.00, 75.00),
    anthropic_sonnet: entry("Claude Sonnet 4.6", anthropicRules, 3.00, 15.00),
    anthropic_haiku: entry("Claude Haiku 4.5", anthropicRules, 1.00, 5.00),
    openai_gpt5: entry("OpenAI GPT-5", openaiRules, 5.00, 15.00),
    openai_gpt5_mini: entry("OpenAI GPT-5 mini", openaiRules, 0.30, 1.20),
    gemini_25_pro: entry("Gemini 2.5 Pro", geminiRules, 1.25, 10.00),
  };
})();

// =============================================================================
// Longest common prefix — character-level
// =============================================================================

/**
 * Length, in UTF-16 code units, of the longest prefix shared by `a` and `b`.
 *
 * The prefix never ends between the two halves of a surrogate pair: when the
 * matching run stops right after a high surrogate whose low surrogate follows
 * in either string, the high surrogate is excluded. Without this, two
 * different astral characters (e.g. two emoji) that share a high surrogate
 * would be reported as sharing "half a character", and slicing at the
 * returned offset would produce a lone surrogate.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Shared prefix length; 0 if either argument is not a string.
 */
export function longestCommonPrefix(a, b) {
  if (typeof a !== "string" || typeof b !== "string") return 0;

  const limit = Math.min(a.length, b.length);
  let i = 0;
  while (i < limit && a.charCodeAt(i) === b.charCodeAt(i)) i++;

  if (i > 0) {
    const last = a.charCodeAt(i - 1);
    const endsOnHighSurrogate = last >= 0xd800 && last <= 0xdbff;
    if (endsOnHighSurrogate) {
      // charCodeAt() yields NaN past the end, which fails both range checks.
      const nextA = a.charCodeAt(i);
      const nextB = b.charCodeAt(i);
      const pairContinues =
        (nextA >= 0xdc00 && nextA <= 0xdfff) ||
        (nextB >= 0xdc00 && nextB <= 0xdfff);
      if (pairContinues) i--;
    }
  }
  return i;
}

// First differing line — useful for the UI "your edit landed here" hint.
/**
 * Locate the start of the line that contains offset `prefixLen`.
 *
 * @param {string} a - First text; newlines are counted in this one.
 * @param {string} b - Second text; only consulted while rewinding.
 * @param {number} prefixLen - Offset at which the two texts diverge.
 * @returns {{offset: number, line: number}} Offset of the line start and
 *   its 1-indexed line number.
 */
function firstDifferingLine(a, b, prefixLen) {
  // Rewind until the preceding character is a newline in either text.
  let lineStart = prefixLen;
  for (; lineStart > 0; lineStart--) {
    const prev = lineStart - 1;
    if (a[prev] === "\n" || b[prev] === "\n") break;
  }

  // Every newline before the line start adds one to the line number.
  let lineNumber = 1;
  for (let k = 0; k < lineStart; k++) {
    if (a[k] === "\n") lineNumber += 1;
  }

  return { offset: lineStart, line: lineNumber };
}

// =============================================================================
// Per-provider cache analysis
// =============================================================================

/**
 * Estimate one provider's request cost with and without a prompt-cache hit
 * on the shared prefix.
 *
 * Both result shapes carry the same keys, so callers never need to guard
 * against a missing `savings_pct` or `cache_write_surcharge_usd`.
 *
 * @param {string} providerId - Own key of PROVIDERS.
 * @param {number} totalTokensNew - Input tokens in the new prompt.
 * @param {number} commonTokens - Tokens in the prefix shared with the old prompt.
 * @param {number} divergeTokens - Tokens in the new prompt after the shared prefix.
 * @param {number} outputTokens - Expected completion tokens.
 * @returns {object|null} Cost breakdown, or null for an unknown provider id.
 */
function analyseProvider(
  providerId,
  totalTokensNew,
  commonTokens,
  divergeTokens,
  outputTokens,
) {
  // Own-property lookup only: ids such as "toString" or "constructor" would
  // otherwise resolve to inherited Object.prototype members, pass the
  // truthiness check and yield NaN costs.
  const isKnown = Object.prototype.hasOwnProperty.call(PROVIDERS, providerId);
  const p = isKnown ? PROVIDERS[providerId] : null;
  if (!p) return null;

  // Table prices are USD per million tokens.
  const inputPrice = p.input_per_mt / 1_000_000;
  const outputPrice = p.output_per_mt / 1_000_000;
  const outputCost = outputTokens * outputPrice;
  const baseCost = totalTokensNew * inputPrice + outputCost;

  // The only hard failure is a shared prefix below the provider's minimum
  // cacheable size. Providers that need an explicit cache marker still get
  // the best-case figures; `requires_explicit` is passed through so the UI
  // can flag them.
  if (commonTokens < p.min_cache_tokens) {
    return {
      provider_id: providerId,
      provider_name: p.name,
      base_cost_usd: baseCost,
      cached_cost_usd: baseCost,
      cache_write_surcharge_usd: 0,
      savings_usd: 0,
      savings_pct: 0,
      hit_ratio: 0,
      tokens_cached: 0,
      tokens_billed_input: totalTokensNew,
      reason: "below_min",
      min_cache_tokens: p.min_cache_tokens,
      requires_explicit: p.requires_explicit,
      cache_ttl_seconds: p.cache_ttl_seconds,
    };
  }

  // Cost on a cache HIT for the prefix:
  //   cache-read: commonTokens × inputPrice × cache_read_multiplier
  //   fresh:      divergeTokens × inputPrice
  //   output:     outputTokens × outputPrice
  const cachedInputCost =
    commonTokens * inputPrice * p.cache_read_multiplier +
    divergeTokens * inputPrice;
  const cachedCost = cachedInputCost + outputCost;

  // Extra cost of writing the prefix to the cache (multiplier above 1.0).
  // Reported separately and not folded into cached_cost_usd, so users can
  // see the amortization picture.
  const cacheWriteSurcharge =
    commonTokens * inputPrice * (p.cache_write_multiplier - 1.0);

  const savings = baseCost - cachedCost;
  const hitRatio = totalTokensNew === 0 ? 0 : commonTokens / totalTokensNew;

  return {
    provider_id: providerId,
    provider_name: p.name,
    base_cost_usd: baseCost,
    cached_cost_usd: cachedCost,
    cache_write_surcharge_usd: cacheWriteSurcharge,
    savings_usd: savings,
    savings_pct: baseCost === 0 ? 0 : savings / baseCost,
    hit_ratio: hitRatio,
    tokens_cached: commonTokens,
    // Tokens billed at the full (uncached) input price.
    tokens_billed_input: divergeTokens,
    reason: null,
    min_cache_tokens: p.min_cache_tokens,
    requires_explicit: p.requires_explicit,
    cache_ttl_seconds: p.cache_ttl_seconds,
  };
}

// =============================================================================
// Public entry point
// =============================================================================

/**
 * Compare two prompt versions and predict, per provider, how much of the new
 * prompt can be served from a cache built on the old one.
 *
 * Prompts are compared verbatim (no trimming or normalisation).
 *
 * @param {string} oldPrompt - Previously sent prompt.
 * @param {string} newPrompt - Edited prompt about to be sent.
 * @param {object} [options] - Optional settings; `null` is treated as `{}`.
 * @param {string} [options.profile="english"] - Token-estimation profile.
 * @param {number} [options.outputTokensEstimate=500] - Expected completion
 *   tokens. Values that are not finite non-negative numbers fall back to 500
 *   so costs never become NaN.
 * @param {string[]|null} [options.providers=null] - Provider ids to analyse;
 *   null/undefined means every key of PROVIDERS. Unknown ids are dropped.
 * @returns {{code: string, params: object, providers?: object[]}}
 *   `code` is one of "empty_input", "identical", "fully_divergent",
 *   "divergent_below_min" or "divergent_can_cache". `providers` is absent
 *   for "empty_input".
 */
export function diffPromptCache(oldPrompt, newPrompt, options = {}) {
  // A default parameter only covers `undefined`; an explicit null would
  // otherwise throw on destructuring.
  const {
    profile = "english",
    outputTokensEstimate = 500,
    providers = null,
  } = options ?? {};

  if (typeof oldPrompt !== "string" || typeof newPrompt !== "string") {
    return { code: "empty_input", params: {} };
  }
  if (!oldPrompt && !newPrompt) {
    return { code: "empty_input", params: {} };
  }

  // Number() keeps numeric strings working as they did under implicit
  // coercion; anything that would produce NaN or negative costs is replaced
  // by the default.
  const requestedOutput = Number(outputTokensEstimate);
  const outputTokens =
    Number.isFinite(requestedOutput) && requestedOutput >= 0
      ? requestedOutput
      : 500;

  const lcpChars = longestCommonPrefix(oldPrompt, newPrompt);
  const isIdentical = oldPrompt === newPrompt;
  const divergeChars = newPrompt.length - lcpChars;

  // Prefix and suffix are estimated separately, so tokens_total is the sum
  // of two rounded-up figures.
  const tokensCommon = estimateTokens(oldPrompt.slice(0, lcpChars), profile);
  const tokensDiverge = estimateTokens(newPrompt.slice(lcpChars), profile);
  const tokensTotal = tokensCommon + tokensDiverge;

  const providerIds = providers ?? Object.keys(PROVIDERS);
  const providerResults = providerIds
    .map((id) =>
      analyseProvider(id, tokensTotal, tokensCommon, tokensDiverge, outputTokens),
    )
    .filter((r) => r !== null);

  const diffPoint = isIdentical
    ? { offset: oldPrompt.length, line: oldPrompt.split("\n").length }
    : firstDifferingLine(oldPrompt, newPrompt, lcpChars);

  // every() is vacuously true on an empty list; require at least one
  // analysed provider before reporting that all of them are below minimum.
  const everyProviderBelowMin =
    providerResults.length > 0 &&
    providerResults.every((r) => r.reason === "below_min");

  let code;
  if (isIdentical) {
    code = "identical";
  } else if (lcpChars === 0) {
    code = "fully_divergent";
  } else if (everyProviderBelowMin) {
    code = "divergent_below_min";
  } else {
    code = "divergent_can_cache";
  }

  return {
    code,
    params: {
      profile,
      lcp_chars: lcpChars,
      diverge_chars: divergeChars,
      tokens_common: tokensCommon,
      tokens_diverge: tokensDiverge,
      tokens_total: tokensTotal,
      hit_ratio: tokensTotal === 0 ? 0 : tokensCommon / tokensTotal,
      diff_point: diffPoint,
      output_tokens: outputTokens,
    },
    providers: providerResults,
  };
}

// Helper used by the UI: short summary string per provider, suitable for
// rendering in a table row (i18n-substituted in main.js).
/**
 * Reduce a per-provider analysis result to the fields a table row needs.
 *
 * @param {object|null|undefined} result - One per-provider entry from the
 *   `providers` array; any falsy value yields null.
 * @returns {object|null} Row data: `name`, `hit_pct` (hit ratio as a whole
 *   percentage), `base`, `cached`, `savings`, `savings_pct` (0 when the
 *   input does not carry it), `requires_explicit` and `reason`.
 */
export function summariseProvider(result) {
  if (!result) {
    return null;
  }

  const {
    provider_name: name,
    hit_ratio: hitRatio,
    base_cost_usd: base,
    cached_cost_usd: cached,
    savings_usd: savings,
    savings_pct: savingsPct,
    requires_explicit,
    reason,
  } = result;

  return {
    name,
    hit_pct: Math.round(hitRatio * 100),
    base,
    cached,
    savings,
    savings_pct: savingsPct ?? 0,
    requires_explicit,
    reason,
  };
}