// taf-agent / js / prompt_cache_diff.js
// v0.8.4 Prompt-Cache Diff Predictor — anti-bullshit pack #10 (commit 3d389cc)
// Prompt-Cache Diff Predictor (v0.8.4 anti-bullshit pack #10)
//
// Pain: small prompt edits silently invalidate provider prompt caches,
// turning a 50% discount into a 0% discount and 10x'ing the bill.
// Users debug this blind because:
// - Anthropic's `cache_control` cache breaks at the first token diff
// in the marked prefix (TTL 5 min default, 1 hour beta).
// - OpenAI auto-caches prefixes ≥1024 tokens but invalidates on any
// prefix change; the 50% read discount only applies on hit.
// - Gemini's context cache requires explicit creation, ≥32K tokens,
// and any prefix edit forces a new cache.
//
// Tool: paste old + new prompt → compute longest common prefix in
// tokens → predict per-provider cache hit ratio + $ delta vs no-cache.
//
// Pure logic — no human strings; main.js does i18n. Returns
// {code, params, providers: [{provider_id, ...}]}.
// =============================================================================
// Token estimation — heuristic, browser-only
// =============================================================================
//
// Real tokenizers vary by ±15% between Llama / GPT / Claude / Qwen and
// running them in-browser would mean shipping a 5-10 MB WASM blob. For a
// cache-diff predictor the absolute count doesn't matter — what matters
// is the RATIO of common-prefix to divergent-suffix tokens, which is
// robust to estimator choice. The three profiles below cover 95% of
// real prompts; users with extreme cases can paste pre-tokenized counts.
// Heuristic chars-per-token profiles. Frozen: this is a shared module-level
// constant and nothing in this module mutates it.
const TOKEN_PROFILES = Object.freeze({
  english: Object.freeze({ chars_per_token: 4.0, label_key: "cache.profile.english" }),
  code: Object.freeze({ chars_per_token: 3.5, label_key: "cache.profile.code" }),
  mixed: Object.freeze({ chars_per_token: 2.0, label_key: "cache.profile.mixed" }), // CJK / Cyrillic
});

/**
 * Estimate a token count from character length using a chars-per-token
 * profile. Absolute accuracy is deliberately not a goal (see module header);
 * only the prefix/suffix RATIO matters downstream.
 *
 * @param {string} text - prompt text; non-strings and "" count as 0 tokens
 * @param {string} [profile="english"] - key into TOKEN_PROFILES; unknown
 *   profiles fall back to 4.0 chars/token
 * @returns {number} estimated token count, rounded up
 */
export function estimateTokens(text, profile = "english") {
  if (typeof text !== "string" || !text) return 0;
  const cpt = TOKEN_PROFILES[profile]?.chars_per_token ?? 4.0;
  return Math.ceil(text.length / cpt);
}
// =============================================================================
// Provider rules — pricing + cache mechanics
// =============================================================================
//
// Prices are USD per million tokens, snapshot 2026-01 (knowledge cutoff).
// `cache_read_multiplier` is the fraction of input price billed on a
// cache hit (Anthropic 0.10 = 10%; OpenAI 0.50 = 50%; Gemini 0.25 = 25%).
// `cache_write_multiplier` accounts for Anthropic's 25% write surcharge
// the first time a prefix is seen.
//
// `min_cache_tokens` is the floor below which the provider cannot cache
// (OpenAI auto-cache requires ≥1024; Gemini context cache ≥32K).
// Anthropic has no min token floor but requires explicit cache_control
// marker — we treat that as min=0 with a `requires_explicit` flag for UI.
// Deep-frozen: a shared pricing snapshot that callers must treat as
// read-only (this module only ever reads it via Object.keys / key lookup).
export const PROVIDERS = Object.freeze({
  anthropic_opus: Object.freeze({
    name: "Claude Opus 4.7",
    min_cache_tokens: 0,
    requires_explicit: true,
    cache_ttl_seconds: 300, // 5 min default
    input_per_mt: 15.00,
    output_per_mt: 75.00,
    cache_write_multiplier: 1.25,
    cache_read_multiplier: 0.10, // 10% of input
  }),
  anthropic_sonnet: Object.freeze({
    name: "Claude Sonnet 4.6",
    min_cache_tokens: 0,
    requires_explicit: true,
    cache_ttl_seconds: 300,
    input_per_mt: 3.00,
    output_per_mt: 15.00,
    cache_write_multiplier: 1.25,
    cache_read_multiplier: 0.10,
  }),
  anthropic_haiku: Object.freeze({
    name: "Claude Haiku 4.5",
    min_cache_tokens: 0,
    requires_explicit: true,
    cache_ttl_seconds: 300,
    input_per_mt: 1.00,
    output_per_mt: 5.00,
    cache_write_multiplier: 1.25,
    cache_read_multiplier: 0.10,
  }),
  openai_gpt5: Object.freeze({
    name: "OpenAI GPT-5",
    min_cache_tokens: 1024,
    requires_explicit: false,
    cache_ttl_seconds: 600, // ~5-10 min observed
    input_per_mt: 5.00,
    output_per_mt: 15.00,
    cache_write_multiplier: 1.00,
    cache_read_multiplier: 0.50, // 50% of input
  }),
  openai_gpt5_mini: Object.freeze({
    name: "OpenAI GPT-5 mini",
    min_cache_tokens: 1024,
    requires_explicit: false,
    cache_ttl_seconds: 600,
    input_per_mt: 0.30,
    output_per_mt: 1.20,
    cache_write_multiplier: 1.00,
    cache_read_multiplier: 0.50,
  }),
  gemini_25_pro: Object.freeze({
    name: "Gemini 2.5 Pro",
    min_cache_tokens: 32768,
    requires_explicit: true,
    cache_ttl_seconds: 3600, // 1 hour default for context cache
    input_per_mt: 1.25,
    output_per_mt: 10.00,
    cache_write_multiplier: 1.00,
    cache_read_multiplier: 0.25, // 25% of input
  }),
});
// =============================================================================
// Longest common prefix — character-level
// =============================================================================
/**
 * Length (in UTF-16 code units) of the longest common prefix of two strings.
 * Non-string arguments yield 0.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
export function longestCommonPrefix(a, b) {
  if (typeof a !== "string" || typeof b !== "string") return 0;
  const limit = Math.min(a.length, b.length);
  for (let idx = 0; idx < limit; idx++) {
    if (a.charCodeAt(idx) !== b.charCodeAt(idx)) return idx;
  }
  return limit;
}
// First differing line — useful for the UI "your edit landed here" hint.
// First differing line — useful for the UI "your edit landed here" hint.
// `prefixLen` is the common-prefix length, so both strings agree on every
// character before it; we only need to inspect `a`'s prefix.
function firstDifferingLine(a, b, prefixLen) {
  const commonPrefix = a.slice(0, prefixLen);
  // Offset of the start of the line containing the first diff: one past the
  // last newline inside the common prefix (0 when there is none).
  const lastNewline = commonPrefix.lastIndexOf("\n");
  const offset = lastNewline + 1;
  // 1-indexed line number = newlines before `offset` + 1, which equals the
  // number of segments the prefix splits into.
  const line = commonPrefix.split("\n").length;
  return { offset, line };
}
// =============================================================================
// Per-provider cache analysis
// =============================================================================
/**
 * Predict the cache economics of one provider for a given token breakdown.
 *
 * @param {string} providerId - key into PROVIDERS; unknown ids return null
 * @param {number} totalTokensNew - estimated tokens in the new prompt
 * @param {number} commonTokens - tokens in the shared (cacheable) prefix
 * @param {number} divergeTokens - tokens after the first difference
 * @param {number} outputTokens - expected completion size
 * @returns {object|null} result row with base/cached cost, savings, hit
 *   ratio and provider metadata — same shape on both the hit and the
 *   below-min paths — or null for an unknown provider id
 */
function analyseProvider(
  providerId,
  totalTokensNew,
  commonTokens,
  divergeTokens,
  outputTokens,
) {
  const p = PROVIDERS[providerId];
  if (!p) return null;
  // Prices are quoted per million tokens; convert to per-token.
  const inputPrice = p.input_per_mt / 1_000_000;
  const outputPrice = p.output_per_mt / 1_000_000;
  const baseCost =
    totalTokensNew * inputPrice + outputTokens * outputPrice;
  // The provider cannot cache when the common prefix is below its minimum
  // cacheable size. Providers with `requires_explicit` (Anthropic, Gemini)
  // additionally need an explicit cache marker; that flag is passed through
  // on every result so the UI can warn — best-case savings are still shown.
  if (commonTokens < p.min_cache_tokens) {
    return {
      provider_id: providerId,
      provider_name: p.name,
      base_cost_usd: baseCost,
      cached_cost_usd: baseCost,
      cache_write_surcharge_usd: 0, // nothing cached → no write surcharge
      savings_usd: 0,
      savings_pct: 0, // present on both paths for a consistent shape
      hit_ratio: 0,
      tokens_cached: 0,
      tokens_billed_input: totalTokensNew,
      reason: "below_min",
      min_cache_tokens: p.min_cache_tokens,
      requires_explicit: p.requires_explicit,
      cache_ttl_seconds: p.cache_ttl_seconds,
    };
  }
  // Cost on a cache HIT:
  //   cache-read: commonTokens × inputPrice × cache_read_multiplier
  //   fresh:      divergeTokens × inputPrice
  //   output:     outputTokens × outputPrice
  const cachedInputCost =
    commonTokens * inputPrice * p.cache_read_multiplier +
    divergeTokens * inputPrice;
  const cachedCost = cachedInputCost + outputTokens * outputPrice;
  // One-time cache write surcharge (Anthropic bills 125% of the input price
  // the first time a prefix is cached). Surfaced separately — NOT folded
  // into cached_cost_usd — so users see the amortization picture.
  const cacheWriteSurcharge =
    commonTokens * inputPrice * (p.cache_write_multiplier - 1.0);
  const savings = baseCost - cachedCost;
  const hitRatio = totalTokensNew === 0 ? 0 : commonTokens / totalTokensNew;
  return {
    provider_id: providerId,
    provider_name: p.name,
    base_cost_usd: baseCost,
    cached_cost_usd: cachedCost,
    cache_write_surcharge_usd: cacheWriteSurcharge,
    savings_usd: savings,
    savings_pct: baseCost === 0 ? 0 : savings / baseCost,
    hit_ratio: hitRatio,
    tokens_cached: commonTokens,
    tokens_billed_input: divergeTokens,
    reason: null,
    min_cache_tokens: p.min_cache_tokens,
    requires_explicit: p.requires_explicit,
    cache_ttl_seconds: p.cache_ttl_seconds,
  };
}
// =============================================================================
// Public entry point
// =============================================================================
/**
 * Diff two prompts and predict per-provider cache behaviour.
 *
 * Pure logic — returns machine codes + params for i18n in main.js.
 *
 * @param {string} oldPrompt - previously sent prompt
 * @param {string} newPrompt - edited prompt about to be sent
 * @param {object} [options]
 * @param {string} [options.profile="english"] - token-estimation profile
 * @param {number} [options.outputTokensEstimate=500] - expected completion size
 * @param {string[]|null} [options.providers=null] - provider ids to analyse;
 *   null means all of PROVIDERS. Unknown ids are silently dropped.
 * @returns {{code: string, params: object, providers?: object[]}}
 */
export function diffPromptCache(
  oldPrompt,
  newPrompt,
  {
    profile = "english",
    outputTokensEstimate = 500,
    providers = null,
  } = {},
) {
  if (typeof oldPrompt !== "string" || typeof newPrompt !== "string") {
    return { code: "empty_input", params: {} };
  }
  if (!oldPrompt && !newPrompt) {
    return { code: "empty_input", params: {} };
  }
  const lcpChars = longestCommonPrefix(oldPrompt, newPrompt);
  const isIdentical = oldPrompt === newPrompt;
  const divergeChars = newPrompt.length - lcpChars;
  // Estimate tokens on the prefix/suffix slices separately: the RATIO is
  // what cache economics depend on, and it is robust to estimator error.
  const tokensCommon = estimateTokens(oldPrompt.slice(0, lcpChars), profile);
  const tokensDiverge = estimateTokens(newPrompt.slice(lcpChars), profile);
  const tokensTotal = tokensCommon + tokensDiverge;
  const providerIds = providers ?? Object.keys(PROVIDERS);
  const providerResults = providerIds
    .map(id => analyseProvider(id, tokensTotal, tokensCommon, tokensDiverge, outputTokensEstimate))
    .filter(r => r !== null);
  const diffPoint = isIdentical
    ? { offset: oldPrompt.length, line: oldPrompt.split("\n").length }
    : firstDifferingLine(oldPrompt, newPrompt, lcpChars);
  let code;
  if (isIdentical) {
    code = "identical";
  } else if (lcpChars === 0) {
    code = "fully_divergent";
  } else if (
    // BUGFIX: `[].every(...)` is vacuously true, so an empty (or
    // all-unknown) provider list used to mislabel every partial diff
    // as "divergent_below_min". Require at least one result.
    providerResults.length > 0 &&
    providerResults.every(r => r.reason === "below_min")
  ) {
    code = "divergent_below_min";
  } else {
    code = "divergent_can_cache";
  }
  return {
    code,
    params: {
      profile,
      lcp_chars: lcpChars,
      diverge_chars: divergeChars,
      tokens_common: tokensCommon,
      tokens_diverge: tokensDiverge,
      tokens_total: tokensTotal,
      hit_ratio: tokensTotal === 0 ? 0 : tokensCommon / tokensTotal,
      diff_point: diffPoint,
      output_tokens: outputTokensEstimate,
    },
    providers: providerResults,
  };
}
// Helper used by the UI: short summary string per provider, suitable for
// rendering in a table row (i18n-substituted in main.js).
// Helper used by the UI: condense one analyseProvider() row into the fields
// a table row needs (i18n substitution happens in main.js). Null-safe:
// a falsy result maps to null.
export function summariseProvider(result) {
  if (!result) return null;
  const {
    provider_name: name,
    hit_ratio: hitRatio,
    base_cost_usd: base,
    cached_cost_usd: cached,
    savings_usd: savings,
    savings_pct: savingsPct,
    requires_explicit,
    reason,
  } = result;
  return {
    name,
    hit_pct: Math.round(hitRatio * 100),
    base,
    cached,
    savings,
    savings_pct: savingsPct ?? 0, // below-min rows may omit savings_pct
    requires_explicit,
    reason,
  };
}