// taf-agent / js / prompt_cache_diff.js
// v0.8.4 Prompt-Cache Diff Predictor — anti-bullshit pack #10 (commit 3d389cc)
// Prompt-Cache Diff Predictor (v0.8.4 anti-bullshit pack #10)
//
// Pain: small prompt edits silently invalidate provider prompt caches,
// turning a 50% discount into a 0% discount and 10x'ing the bill.
// Users debug this blind because:
// - Anthropic's `cache_control` cache breaks at the first token diff
// in the marked prefix (TTL 5 min default, 1 hour beta).
// - OpenAI auto-caches prefixes ≥1024 tokens but invalidates on any
// prefix change; the 50% read discount only applies on hit.
// - Gemini's context cache requires explicit creation, ≥32K tokens,
// and any prefix edit forces a new cache.
//
// Tool: paste old + new prompt → compute longest common prefix in
// tokens → predict per-provider cache hit ratio + $ delta vs no-cache.
//
// Pure logic — no human strings; main.js does i18n. Returns
// {code, params, providers: [{provider_id, ...}]}.
// =============================================================================
// Token estimation — heuristic, browser-only
// =============================================================================
//
// Real tokenizers vary by ±15% between Llama / GPT / Claude / Qwen and
// running them in-browser would mean shipping a 5-10 MB WASM blob. For a
// cache-diff predictor the absolute count doesn't matter — what matters
// is the RATIO of common-prefix to divergent-suffix tokens, which is
// robust to estimator choice. The three profiles below cover 95% of
// real prompts; users with extreme cases can paste pre-tokenized counts.
// Heuristic chars-per-token profiles. Frozen: this is a shared module-level
// constant and nothing in this module mutates it.
const TOKEN_PROFILES = Object.freeze({
  english: Object.freeze({ chars_per_token: 4.0, label_key: "cache.profile.english" }),
  code: Object.freeze({ chars_per_token: 3.5, label_key: "cache.profile.code" }),
  mixed: Object.freeze({ chars_per_token: 2.0, label_key: "cache.profile.mixed" }), // CJK / Cyrillic
});

/**
 * Estimate a token count from character length using a chars-per-token
 * profile. Absolute accuracy is deliberately not a goal (see module header);
 * only the prefix/suffix RATIO matters downstream.
 *
 * @param {string} text - prompt text; non-strings and "" count as 0 tokens
 * @param {string} [profile="english"] - key into TOKEN_PROFILES; unknown
 *   profiles fall back to 4.0 chars/token
 * @returns {number} estimated token count, rounded up
 */
export function estimateTokens(text, profile = "english") {
  if (typeof text !== "string" || !text) return 0;
  const cpt = TOKEN_PROFILES[profile]?.chars_per_token ?? 4.0;
  return Math.ceil(text.length / cpt);
}
// =============================================================================
// Provider rules — pricing + cache mechanics
// =============================================================================
//
// Prices are USD per million tokens, snapshot 2026-01 (knowledge cutoff).
// `cache_read_multiplier` is the fraction of input price billed on a
// cache hit (Anthropic 0.10 = 10%; OpenAI 0.50 = 50%; Gemini 0.25 = 25%).
// `cache_write_multiplier` accounts for Anthropic's 25% write surcharge
// the first time a prefix is seen.
//
// `min_cache_tokens` is the floor below which the provider cannot cache
// (OpenAI auto-cache requires ≥1024; Gemini context cache ≥32K).
// Anthropic has no min token floor but requires explicit cache_control
// marker — we treat that as min=0 with a `requires_explicit` flag for UI.
// Deep-frozen: a shared pricing snapshot that callers must treat as
// read-only (this module only ever reads it via Object.keys / key lookup).
export const PROVIDERS = Object.freeze({
  anthropic_opus: Object.freeze({
    name: "Claude Opus 4.7",
    min_cache_tokens: 0,
    requires_explicit: true,
    cache_ttl_seconds: 300, // 5 min default
    input_per_mt: 15.00,
    output_per_mt: 75.00,
    cache_write_multiplier: 1.25,
    cache_read_multiplier: 0.10, // 10% of input
  }),
  anthropic_sonnet: Object.freeze({
    name: "Claude Sonnet 4.6",
    min_cache_tokens: 0,
    requires_explicit: true,
    cache_ttl_seconds: 300,
    input_per_mt: 3.00,
    output_per_mt: 15.00,
    cache_write_multiplier: 1.25,
    cache_read_multiplier: 0.10,
  }),
  anthropic_haiku: Object.freeze({
    name: "Claude Haiku 4.5",
    min_cache_tokens: 0,
    requires_explicit: true,
    cache_ttl_seconds: 300,
    input_per_mt: 1.00,
    output_per_mt: 5.00,
    cache_write_multiplier: 1.25,
    cache_read_multiplier: 0.10,
  }),
  openai_gpt5: Object.freeze({
    name: "OpenAI GPT-5",
    min_cache_tokens: 1024,
    requires_explicit: false,
    cache_ttl_seconds: 600, // ~5-10 min observed
    input_per_mt: 5.00,
    output_per_mt: 15.00,
    cache_write_multiplier: 1.00,
    cache_read_multiplier: 0.50, // 50% of input
  }),
  openai_gpt5_mini: Object.freeze({
    name: "OpenAI GPT-5 mini",
    min_cache_tokens: 1024,
    requires_explicit: false,
    cache_ttl_seconds: 600,
    input_per_mt: 0.30,
    output_per_mt: 1.20,
    cache_write_multiplier: 1.00,
    cache_read_multiplier: 0.50,
  }),
  gemini_25_pro: Object.freeze({
    name: "Gemini 2.5 Pro",
    min_cache_tokens: 32768,
    requires_explicit: true,
    cache_ttl_seconds: 3600, // 1 hour default for context cache
    input_per_mt: 1.25,
    output_per_mt: 10.00,
    cache_write_multiplier: 1.00,
    cache_read_multiplier: 0.25, // 25% of input
  }),
});
// =============================================================================
// Longest common prefix — character-level
// =============================================================================
/**
 * Length (in UTF-16 code units) of the longest common prefix of two strings.
 * Non-string arguments yield 0.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
export function longestCommonPrefix(a, b) {
  if (typeof a !== "string" || typeof b !== "string") return 0;
  const limit = Math.min(a.length, b.length);
  for (let idx = 0; idx < limit; idx++) {
    if (a.charCodeAt(idx) !== b.charCodeAt(idx)) return idx;
  }
  return limit;
}
// First differing line — useful for the UI "your edit landed here" hint.
// First differing line — useful for the UI "your edit landed here" hint.
// `prefixLen` is the common-prefix length, so both strings agree on every
// character before it; we only need to inspect `a`'s prefix.
function firstDifferingLine(a, b, prefixLen) {
  const commonPrefix = a.slice(0, prefixLen);
  // Offset of the start of the line containing the first diff: one past the
  // last newline inside the common prefix (0 when there is none).
  const lastNewline = commonPrefix.lastIndexOf("\n");
  const offset = lastNewline + 1;
  // 1-indexed line number = newlines before `offset` + 1, which equals the
  // number of segments the prefix splits into.
  const line = commonPrefix.split("\n").length;
  return { offset, line };
}
// =============================================================================
// Per-provider cache analysis
// =============================================================================
/**
 * Predict the cache economics of one provider for a given token breakdown.
 *
 * @param {string} providerId - key into PROVIDERS; unknown ids return null
 * @param {number} totalTokensNew - estimated tokens in the new prompt
 * @param {number} commonTokens - tokens in the shared (cacheable) prefix
 * @param {number} divergeTokens - tokens after the first difference
 * @param {number} outputTokens - expected completion size
 * @returns {object|null} result row with base/cached cost, savings, hit
 *   ratio and provider metadata — same shape on both the hit and the
 *   below-min paths — or null for an unknown provider id
 */
function analyseProvider(
  providerId,
  totalTokensNew,
  commonTokens,
  divergeTokens,
  outputTokens,
) {
  const p = PROVIDERS[providerId];
  if (!p) return null;
  // Prices are quoted per million tokens; convert to per-token.
  const inputPrice = p.input_per_mt / 1_000_000;
  const outputPrice = p.output_per_mt / 1_000_000;
  const baseCost =
    totalTokensNew * inputPrice + outputTokens * outputPrice;
  // The provider cannot cache when the common prefix is below its minimum
  // cacheable size. Providers with `requires_explicit` (Anthropic, Gemini)
  // additionally need an explicit cache marker; that flag is passed through
  // on every result so the UI can warn — best-case savings are still shown.
  if (commonTokens < p.min_cache_tokens) {
    return {
      provider_id: providerId,
      provider_name: p.name,
      base_cost_usd: baseCost,
      cached_cost_usd: baseCost,
      cache_write_surcharge_usd: 0, // nothing cached → no write surcharge
      savings_usd: 0,
      savings_pct: 0, // present on both paths for a consistent shape
      hit_ratio: 0,
      tokens_cached: 0,
      tokens_billed_input: totalTokensNew,
      reason: "below_min",
      min_cache_tokens: p.min_cache_tokens,
      requires_explicit: p.requires_explicit,
      cache_ttl_seconds: p.cache_ttl_seconds,
    };
  }
  // Cost on a cache HIT:
  //   cache-read: commonTokens × inputPrice × cache_read_multiplier
  //   fresh:      divergeTokens × inputPrice
  //   output:     outputTokens × outputPrice
  const cachedInputCost =
    commonTokens * inputPrice * p.cache_read_multiplier +
    divergeTokens * inputPrice;
  const cachedCost = cachedInputCost + outputTokens * outputPrice;
  // One-time cache write surcharge (Anthropic bills 125% of the input price
  // the first time a prefix is cached). Surfaced separately — NOT folded
  // into cached_cost_usd — so users see the amortization picture.
  const cacheWriteSurcharge =
    commonTokens * inputPrice * (p.cache_write_multiplier - 1.0);
  const savings = baseCost - cachedCost;
  const hitRatio = totalTokensNew === 0 ? 0 : commonTokens / totalTokensNew;
  return {
    provider_id: providerId,
    provider_name: p.name,
    base_cost_usd: baseCost,
    cached_cost_usd: cachedCost,
    cache_write_surcharge_usd: cacheWriteSurcharge,
    savings_usd: savings,
    savings_pct: baseCost === 0 ? 0 : savings / baseCost,
    hit_ratio: hitRatio,
    tokens_cached: commonTokens,
    tokens_billed_input: divergeTokens,
    reason: null,
    min_cache_tokens: p.min_cache_tokens,
    requires_explicit: p.requires_explicit,
    cache_ttl_seconds: p.cache_ttl_seconds,
  };
}
// =============================================================================
// Public entry point
// =============================================================================
/**
 * Diff two prompts and predict per-provider cache behaviour.
 *
 * Pure logic — returns machine codes + params for i18n in main.js.
 *
 * @param {string} oldPrompt - previously sent prompt
 * @param {string} newPrompt - edited prompt about to be sent
 * @param {object} [options]
 * @param {string} [options.profile="english"] - token-estimation profile
 * @param {number} [options.outputTokensEstimate=500] - expected completion size
 * @param {string[]|null} [options.providers=null] - provider ids to analyse;
 *   null means all of PROVIDERS. Unknown ids are silently dropped.
 * @returns {{code: string, params: object, providers?: object[]}}
 */
export function diffPromptCache(
  oldPrompt,
  newPrompt,
  {
    profile = "english",
    outputTokensEstimate = 500,
    providers = null,
  } = {},
) {
  if (typeof oldPrompt !== "string" || typeof newPrompt !== "string") {
    return { code: "empty_input", params: {} };
  }
  if (!oldPrompt && !newPrompt) {
    return { code: "empty_input", params: {} };
  }
  const lcpChars = longestCommonPrefix(oldPrompt, newPrompt);
  const isIdentical = oldPrompt === newPrompt;
  const divergeChars = newPrompt.length - lcpChars;
  // Estimate tokens on the prefix/suffix slices separately: the RATIO is
  // what cache economics depend on, and it is robust to estimator error.
  const tokensCommon = estimateTokens(oldPrompt.slice(0, lcpChars), profile);
  const tokensDiverge = estimateTokens(newPrompt.slice(lcpChars), profile);
  const tokensTotal = tokensCommon + tokensDiverge;
  const providerIds = providers ?? Object.keys(PROVIDERS);
  const providerResults = providerIds
    .map(id => analyseProvider(id, tokensTotal, tokensCommon, tokensDiverge, outputTokensEstimate))
    .filter(r => r !== null);
  const diffPoint = isIdentical
    ? { offset: oldPrompt.length, line: oldPrompt.split("\n").length }
    : firstDifferingLine(oldPrompt, newPrompt, lcpChars);
  let code;
  if (isIdentical) {
    code = "identical";
  } else if (lcpChars === 0) {
    code = "fully_divergent";
  } else if (
    // BUGFIX: `[].every(...)` is vacuously true, so an empty (or
    // all-unknown) provider list used to mislabel every partial diff
    // as "divergent_below_min". Require at least one result.
    providerResults.length > 0 &&
    providerResults.every(r => r.reason === "below_min")
  ) {
    code = "divergent_below_min";
  } else {
    code = "divergent_can_cache";
  }
  return {
    code,
    params: {
      profile,
      lcp_chars: lcpChars,
      diverge_chars: divergeChars,
      tokens_common: tokensCommon,
      tokens_diverge: tokensDiverge,
      tokens_total: tokensTotal,
      hit_ratio: tokensTotal === 0 ? 0 : tokensCommon / tokensTotal,
      diff_point: diffPoint,
      output_tokens: outputTokensEstimate,
    },
    providers: providerResults,
  };
}
// Helper used by the UI: short summary string per provider, suitable for
// rendering in a table row (i18n-substituted in main.js).
// Helper used by the UI: condense one analyseProvider() row into the fields
// a table row needs (i18n substitution happens in main.js). Null-safe:
// a falsy result maps to null.
export function summariseProvider(result) {
  if (!result) return null;
  const {
    provider_name: name,
    hit_ratio: hitRatio,
    base_cost_usd: base,
    cached_cost_usd: cached,
    savings_usd: savings,
    savings_pct: savingsPct,
    requires_explicit,
    reason,
  } = result;
  return {
    name,
    hit_pct: Math.round(hitRatio * 100),
    base,
    cached,
    savings,
    savings_pct: savingsPct ?? 0, // below-min rows may omit savings_pct
    requires_explicit,
    reason,
  };
}