import {
  readStoredEffectiveExcludeGeneratedPatternsText,
  readStoredEffectiveExcludePromptPatternsText,
} from './attributionExcludePromptPatternsStorage';
import {
  collectExcludeRegexMatchIntervals,
  isOffsetSpanFullyExcluded,
} from './attributionDisplayModel';
import type { NodeAggregatedEntry } from './genAttributeDagIntervalResolve';
import type { TokenGenStep } from './tokenGenAttributionRunner';
import { getAttentionRawScore } from '../utils/semanticUtils';
import { DAG_EDGE_MIN_DISPLAY_OPACITY } from './genAttributeDagEdgeDisplay';

/**
 * Prompt token span, matching DAG node ids: taken from the API `token_attribution`
 * geometry (de-duplicated by offset, independent of exclusion / normalization).
 */
export type PromptTokenSpan = {
  offset: [number, number];
  raw: string;
  /** Tokenizer vocabulary id (returned by /api/tokenize); DAG geometry does not depend on it. */
  token_id?: number;
};

/**
 * After exclusion, each step keeps the top N entries by descending `score` as the
 * candidate pool, so the long tail of a long context does not dilute the shares.
 *
 * Empirical value; in the end roughly half of the attributions get selected.
 */
const DAG_EDGE_TOP_N = 10;

/** DAG edge Top-P: default upper bound on the cumulative share inside the candidate pool ({@link phase2RankAndSparsify}). */
export const DAG_EDGE_TOP_P_COVERAGE_DEFAULT = 0.7;
const DAG_EDGE_TOP_P_COVERAGE_MIN = 0.05;
const DAG_EDGE_TOP_P_COVERAGE_MAX = 1;

/**
 * Clamps a requested Top-P coverage into
 * [{@link DAG_EDGE_TOP_P_COVERAGE_MIN}, {@link DAG_EDGE_TOP_P_COVERAGE_MAX}].
 *
 * @param n Requested cumulative share.
 * @returns The clamped value, or {@link DAG_EDGE_TOP_P_COVERAGE_DEFAULT} when `n` is NaN or infinite.
 */
export function clampDagEdgeTopPCoverage(n: number): number {
  if (!Number.isFinite(n)) return DAG_EDGE_TOP_P_COVERAGE_DEFAULT;
  return Math.min(DAG_EDGE_TOP_P_COVERAGE_MAX, Math.max(DAG_EDGE_TOP_P_COVERAGE_MIN, n));
}

/** Sort key for a score: non-finite values (NaN, ±Infinity) rank below every finite score. */
function scoreSortKey(score: number): number {
  return Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY;
}

/** Positive part of a score; non-finite scores carry no mass. */
function positiveScoreMass(score: number): number {
  return Number.isFinite(score) ? Math.max(0, score) : 0;
}

/**
 * Sorts by descending `score` and returns the first `min(n, length)` items.
 *
 * The input array is sorted **in place** (so the pool order agrees with the
 * `poolMassFrac` order and callers do not have to sort by share again).
 *
 * Non-finite scores are ordered last. A plain `b.score - a.score` comparator
 * yields NaN for them, which makes the ordering inconsistent and can put a
 * zero-mass entry at the head of the pool; the downstream selector would then
 * return nothing for the whole step.
 *
 * NOTE(review): the generic parameter list was missing from the reviewed text;
 * `T extends { score: number }` is reconstructed from usage — confirm.
 */
function selectTopNByScore<T extends { score: number }>(effective: T[], n: number): T[] {
  effective.sort((a, b) => {
    const keyA = scoreSortKey(a.score);
    const keyB = scoreSortKey(b.score);
    // Equal keys (including two non-finite scores) compare as ties; avoids (-Inf) - (-Inf) = NaN.
    return keyA === keyB ? 0 : keyB - keyA;
  });
  return effective.slice(0, Math.min(n, effective.length));
}

/**
 * One row of the Top-N candidate pool: the max-normalized `score`, the `rawScore`,
 * and `poolMassFrac`, the L1 share over the pool's positive mass
 * (used only inside this preprocessing module).
 */
type DagPoolNormRow<T> = T & { score: number; rawScore: number; poolMassFrac: number };

/**
 * Normalizes the candidate pool: divides each `score` by the pool maximum, records
 * `rawScore`, and computes each entry's L1 share of the pool's summed positive score.
 * All other fields (e.g. `nodeId`) are preserved via spread.
 *
 * When the pool maximum is not positive, `score` is left unscaled and every
 * `poolMassFrac` is 0.
 *
 * NOTE(review): assumes `getAttentionRawScore` accepts any `{ score: number }`-shaped
 * entry — confirm against its declaration in `../utils/semanticUtils`.
 */
function normalizeTopNPoolForDagSparse<T extends { score: number }>(
  tokens: T[],
): Array<DagPoolNormRow<T>> {
  const max = Math.max(0, ...tokens.map((t) => t.score).filter(Number.isFinite));
  const massSum = tokens.reduce((acc, t) => acc + positiveScoreMass(t.score), 0);
  return tokens.map((t) => {
    const rawScore = getAttentionRawScore(t);
    const poolMassFrac = massSum > 0 ? positiveScoreMass(t.score) / massSum : 0;
    const scoreNorm = max <= 0 ? t.score : t.score / max;
    return { ...t, score: scoreNorm, rawScore, poolMassFrac };
  });
}

/**
 * Given a pool already in descending `score` order (normalization keeps that order),
 * takes a prefix in iteration order until either:
 * - an entry's pool L1 share drops below {@link DAG_EDGE_MIN_DISPLAY_OPACITY} × the first
 *   entry's share (`relativeFloor`; the coefficient equals the minimum display opacity), or
 * - the cumulative share reaches the given threshold (Top-P inside the candidate pool,
 *   not over all tokens of the step).
 *
 * Pool share is monotone in `score`, so no re-sorting is needed.
 *
 * About `relativeFloor` (original author's rationale): after
 * {@link normalizeTopNPoolForDagSparse} the first entry has normalized score 1 and, for
 * positive-score entries, `poolMassFrac_i / topFrac` equals the normalized score. Hence
 * `frac < β × topFrac` ⇔ `normalizedScore < β`; multiplied by the mutual-information rate
 * (≤ 1) such an edge cannot reach the view layer's minimum `stroke-opacity`, so it is
 * dropped early, in line with what {@link DAG_EDGE_MIN_DISPLAY_OPACITY} means in the view.
 *
 * @param normalized Pool rows in descending share order.
 * @param cumulativeShareThreshold Cumulative share at which selection stops (the entry
 *   that crosses the threshold is included).
 * @returns The selected prefix; empty when the pool is empty or its first share is not positive.
 */
function selectTokenAttributionByCumulativeShare<T extends { poolMassFrac: number }>(
  normalized: Array<T>,
  cumulativeShareThreshold: number,
): Array<T> {
  if (normalized.length === 0) return [];
  const topFrac = normalized[0]?.poolMassFrac ?? 0;
  // `!(x > 0)` also rejects NaN.
  if (!(topFrac > 0)) return [];
  const relativeFloor = DAG_EDGE_MIN_DISPLAY_OPACITY * topFrac;
  let cum = 0;
  const picked: Array<T> = [];
  for (const t of normalized) {
    const frac = t.poolMassFrac;
    if (!(frac > 0)) {
      break;
    }
    if (frac < relativeFloor) {
      break;
    }
    picked.push(t);
    cum += frac;
    if (cum >= cumulativeShareThreshold) {
      break;
    }
  }
  return picked;
}

/**
 * Step 0: de-duplicates the API's raw `token_attribution` by offset to obtain the prompt
 * spans for the DAG's `setPromptTokenSpans` (used with the full `context` text for layout).
 * Independent of {@link excludeNodeAggregatedEntries} / {@link phase2RankAndSparsify}
 * (no exclusion, no normalization). The first entry seen for an offset wins.
 *
 * @returns Spans in first-seen order; empty when the step has no `token_attribution`.
 */
export function extractPromptTokenSpans(step: TokenGenStep): PromptTokenSpan[] {
  const ta = step.response.token_attribution;
  if (!ta?.length) return [];
  const byKey = new Map<string, PromptTokenSpan>();
  for (const t of ta) {
    const k = `${t.offset[0]}_${t.offset[1]}`;
    if (!byKey.has(k)) {
      byKey.set(k, { offset: t.offset, raw: t.raw });
    }
  }
  return [...byKey.values()];
}

/**
 * Collects the exclusion intervals (indices into the full string) on `intervalCtx`, using
 * the same prompt / generated regions and stored pattern texts as
 * {@link excludeNodeAggregatedEntries}: prompt patterns are matched within
 * `[0, promptRegionEnd)` bounds and generated patterns within
 * `[promptRegionEnd, intervalCtx.length)` bounds, as passed to
 * `collectExcludeRegexMatchIntervals`.
 */
export function collectGenAttrDagExcludeIntervals(
  intervalCtx: string,
  promptRegionEnd: number,
): [number, number][] {
  const pe = promptRegionEnd;
  return [
    ...collectExcludeRegexMatchIntervals(intervalCtx, readStoredEffectiveExcludePromptPatternsText(), {
      start: 0,
      end: pe,
    }),
    ...collectExcludeRegexMatchIntervals(intervalCtx, readStoredEffectiveExcludeGeneratedPatternsText(), {
      start: pe,
      end: intervalCtx.length,
    }),
  ];
}

/**
 * Runs after aligned aggregation and before Top-N: matches the two exclude pattern sets in
 * the **prompt region** and the **already-generated suffix region** respectively, and checks
 * per **node interval** `[ts, te)` whether it lies entirely inside an excluded interval; if
 * so, that entry's `score` is set to 0. Compared with piece-level exclusion, a merged piece
 * that was split across several nodes can hit / miss per node.
 *
 * @param step Generation step; supplies `promptRegionEnd` and the default `context`.
 * @param entries Node-aggregated entries; not mutated (copies are returned).
 * @param excludeIntervalContext Full text used for matching (same source as the DAG node
 *   offsets). In streaming scenarios pass the **accumulated text written so far** (e.g.
 *   `steps[last].context + steps[last].token`) so that regexes closing only across several
 *   tokens stay index-consistent; defaults to `step.context`.
 * @returns A new array with excluded entries zeroed; empty when `entries` is empty.
 */
export function excludeNodeAggregatedEntries(
  step: TokenGenStep,
  entries: NodeAggregatedEntry[],
  excludeIntervalContext?: string,
): NodeAggregatedEntry[] {
  if (!entries.length) return [];
  const pe = step.promptRegionEnd;
  const intervalCtx = excludeIntervalContext ?? step.context;
  const excludeIntervals = collectGenAttrDagExcludeIntervals(intervalCtx, pe);
  return entries.map((t) => {
    const [ts, te] = t.offset;
    const excluded = isOffsetSpanFullyExcluded(ts, te, excludeIntervals);
    return {
      ...t,
      score: excluded ? 0 : t.score,
    };
  });
}

/**
 * Top-N candidate pool → in-pool normalization → β cut-off and cumulative Top-P.
 *
 * Side effect: `entries` is sorted in place by descending `score`.
 *
 * NOTE(review): the generic parameter list and the element type of the return array were
 * missing from the reviewed text; the shape below is what the pipeline actually produces —
 * confirm against the callers' expectations.
 *
 * @param entries Entries to rank (typically the output of {@link excludeNodeAggregatedEntries}).
 * @param options.cumulativeShare Top-P threshold, clamped by {@link clampDagEdgeTopPCoverage};
 *   when omitted, {@link DAG_EDGE_TOP_P_COVERAGE_DEFAULT} is used.
 * @returns The selected, normalized rows in descending score order.
 */
export function phase2RankAndSparsify<T extends { score: number }>(
  entries: T[],
  options?: { cumulativeShare?: number },
): Array<T & { score: number; rawScore: number; poolMassFrac: number }> {
  if (!entries.length) return [];
  const topNPool = selectTopNByScore(entries, DAG_EDGE_TOP_N);
  const normalized = normalizeTopNPoolForDagSparse(topNPool);
  const requestedShare = options?.cumulativeShare;
  const threshold =
    requestedShare !== undefined
      ? clampDagEdgeTopPCoverage(requestedShare)
      : DAG_EDGE_TOP_P_COVERAGE_DEFAULT;
  return selectTokenAttributionByCumulativeShare(normalized, threshold);
}