File size: 7,458 Bytes
494c9e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import type { AnalyzeResponse, FrontendToken } from '../api/GLTR_API';
import {
    type DigitMergePipelineOptions,
    digitMergeIndexGroupsByText,
    dropEmptyZeroWidthTokens,
    flattenMergePartsForDigitGroup,
    mergeSequentialOverlap,
    mergeSourcePartsForOverlapPair,
    sliceTextByCodePointOffsets,
} from './mergeTokenSpans';

/**
 * Result of the digit-merge pass (see `mergeBpeDigitTokens`).
 */
export type DigitMergeResult = {
    /** Tokens after digit merging; entry i is produced from `mergeGroups[i]`. */
    digitMergedTokens: FrontendToken[];
    /** Input token indices that make up output token i (a list of length 1 means the token was not merged). */
    mergeGroups: number[][];
};

/**
 * Options accepted by `cloneFrontendToken`.
 */
export type CloneTokenOptions = {
    /**
     * When explicitly `false`, the clone omits `bpe_merged` and `bpe_merge_parts`.
     * Any other value (including leaving it unset) copies those fields when present.
     */
    keepMergedFlag?: boolean;
};

/**
 * Copies a `real_topk` pair into a fresh two-element array.
 *
 * @param tuple - Candidate pair; may be `null` or `undefined`.
 * @returns A new `[number, number]` holding the same two values, or `undefined`
 * when the input is not an array of exactly two numbers.
 */
export const cloneRealTopk = (tuple: [number, number] | null | undefined): [number, number] | undefined => {
    if (!Array.isArray(tuple) || tuple.length !== 2) {
        return undefined;
    }
    const allNumeric = tuple.every((entry) => typeof entry === 'number');
    return allNumeric ? [tuple[0], tuple[1]] : undefined;
};

/**
 * Copies a `pred_topk` list into a new array of fresh `[text, probability]` pairs.
 *
 * @param list - Candidate list; may be `null` or `undefined`.
 * @returns An empty array when `list` is not an array. Otherwise one pair per entry,
 * where a non-string text becomes `''` and a non-number or non-finite probability becomes `0`.
 */
export const clonePredTopk = (list: [string, number][] | null | undefined): [string, number][] => {
    const sanitizeEntry = (entry: [string, number]): [string, number] => {
        const label = entry?.[0];
        const score = entry?.[1];
        return [
            typeof label === 'string' ? label : '',
            typeof score === 'number' && Number.isFinite(score) ? score : 0,
        ];
    };
    return Array.isArray(list) ? list.map((entry) => sanitizeEntry(entry)) : [];
};

/**
 * Produces a copy of a `FrontendToken`.
 *
 * `offset` is copied into a new pair, `real_topk` / `pred_topk` go through
 * `cloneRealTopk` / `clonePredTopk`, and `raw` is carried over as-is.
 *
 * @param token - Token to copy.
 * @param options - Set `keepMergedFlag` to `false` to leave `bpe_merged` and
 * `bpe_merge_parts` out of the copy; by default they are copied when present
 * (`bpe_merge_parts` as a shallow array copy).
 * @returns The new token object.
 */
export const cloneFrontendToken = (token: FrontendToken, options: CloneTokenOptions = {}): FrontendToken => {
    const copy: FrontendToken = {
        offset: [token.offset[0], token.offset[1]],
        raw: token.raw,
        real_topk: cloneRealTopk(token.real_topk),
        pred_topk: clonePredTopk(token.pred_topk)
    };
    // Caller asked for a copy without merge metadata.
    if (options.keepMergedFlag === false) {
        return copy;
    }
    if (typeof token.bpe_merged === 'string') {
        copy.bpe_merged = token.bpe_merged;
    }
    if (Array.isArray(token.bpe_merge_parts)) {
        copy.bpe_merge_parts = [...token.bpe_merge_parts];
    }
    return copy;
};

/**
 * Reads the probability stored in a token's `real_topk` pair.
 *
 * @param token - Token to inspect.
 * @returns The second element of `real_topk` when it is a two-element array whose
 * second element is a number; `0` otherwise.
 */
export const getTokenProbability = (token: FrontendToken): number => {
    const pair = token.real_topk;
    if (!Array.isArray(pair) || pair.length !== 2) {
        return 0;
    }
    const probability = pair[1];
    return typeof probability === 'number' ? probability : 0;
};

/**
 * BPE overlap merge: folds tokens whose offsets overlap into a single token.
 *
 * Background (translated from the original note): overlaps mostly come from the
 * tokenizer not aligning with character boundaries (e.g. CJK). The surface
 * raw/offset values may look crossed or "duplicated" even though the underlying
 * tokenization positions are distinct.
 *
 * For each merged pair the surviving token gets:
 * - `offset` / `raw` from the merged span (`raw` is a slice of `originalText`),
 * - `bpe_merge_parts` from `mergeSourcePartsForOverlapPair`,
 * - `real_topk` set to `[0, p_current * p_next]`, i.e. probabilities are
 *   **multiplied** under an independence approximation (the original note contrasts
 *   this with semantic token_attention, which **sums** raw gradients and normalizes
 *   globally afterwards; see semanticUtils),
 * - an empty `pred_topk` and `bpe_merged = 'overlap'`.
 *
 * Tokens are first passed through `dropEmptyZeroWidthTokens`; per the original note,
 * remaining zero-width tokens are handled by {@link mergeSequentialOverlap} based on
 * whether the next token covers that offset.
 *
 * @param tokens - Tokens to merge.
 * @param originalText - Text the token offsets refer to.
 * @returns The token list returned by `mergeSequentialOverlap`.
 */
export const mergeBpeOverlapTokens = (tokens: FrontendToken[], originalText: string): FrontendToken[] => {
    const candidates = dropEmptyZeroWidthTokens(tokens);
    return mergeSequentialOverlap(candidates, {
        getOffset: (token) => token.offset,
        cloneForStep: (token) => cloneFrontendToken(token),
        sliceMergedRaw: (from, to) => sliceTextByCodePointOffsets(originalText, from, to),
        mergeOverlappingPair: (merged, incoming, span, spanText) => {
            // Both inputs are read before `merged` is overwritten below.
            const sourceParts = mergeSourcePartsForOverlapPair(originalText, merged, incoming);
            const jointProbability = getTokenProbability(merged) * getTokenProbability(incoming);
            merged.offset[0] = span[0];
            merged.offset[1] = span[1];
            merged.raw = spanText;
            merged.bpe_merge_parts = sourceParts;
            merged.real_topk = [0, jointProbability];
            merged.pred_topk = [];
            merged.bpe_merged = 'overlap';
            return merged;
        },
    });
};

/**
 * BPE digit merge: collapses each index group reported by
 * `digitMergeIndexGroupsByText` into one token.
 *
 * Per the original note, groups follow "zero or one ASCII space + consecutive ASCII
 * digits" runs over the code points of the original text, independent of how the
 * tokenizer split them (so after overlap merging, offsets must agree with the text).
 *
 * A group of one index passes the input token through unchanged (same object).
 * A larger group yields a new token spanning from the first token's start to the last
 * token's end, with `raw` sliced from `originalText`, `real_topk` set to
 * `[0, product of member probabilities]` (independence approximation, same as the
 * overlap merge), an empty `pred_topk`, and `bpe_merged = 'digit'`.
 *
 * @param tokens - Tokens to merge.
 * @param originalText - Text the token offsets refer to.
 * @returns The merged tokens together with the index groups they were built from.
 */
export const mergeBpeDigitTokens = (tokens: FrontendToken[], originalText: string): DigitMergeResult => {
    const mergeGroups = digitMergeIndexGroupsByText(originalText, tokens);
    const digitMergedTokens: FrontendToken[] = [];
    for (const indices of mergeGroups) {
        const head = tokens[indices[0]!]!;
        if (indices.length === 1) {
            // Untouched token: pass the original object through.
            digitMergedTokens.push(head);
            continue;
        }
        const tail = tokens[indices[indices.length - 1]!]!;
        const spanStart = head.offset[0];
        const spanEnd = tail.offset[1];
        const spanText = sliceTextByCodePointOffsets(originalText, spanStart, spanEnd);
        let jointProbability = 1;
        for (const idx of indices) {
            jointProbability = jointProbability * getTokenProbability(tokens[idx]!);
        }
        digitMergedTokens.push({
            offset: [spanStart, spanEnd] as [number, number],
            raw: spanText,
            real_topk: [0, jointProbability] as [number, number],
            pred_topk: [],
            bpe_merged: 'digit' as const,
            bpe_merge_parts: flattenMergePartsForDigitGroup(indices, tokens),
        });
    }
    return { digitMergedTokens, mergeGroups };
};

/**
 * Runs the digit merge and folds a set of parallel score arrays the same way, so the
 * scores stay aligned with the merged tokens.
 *
 * @param tokens - Tokens to merge.
 * @param scoreArrays - Arrays indexed like `tokens`; missing entries count as `0`.
 * @param originalText - Text the token offsets refer to.
 * @returns The merged tokens and, for every input score array, one array holding the
 * per-group sum of its scores.
 */
export const digitMergeWithScores = (
    tokens: FrontendToken[],
    scoreArrays: (number | undefined)[][],
    originalText: string
): { digitMergedTokens: FrontendToken[]; mergedScoreArrays: (number | undefined)[][] } => {
    const { digitMergedTokens, mergeGroups } = mergeBpeDigitTokens(tokens, originalText);
    const sumGroup = (scores: (number | undefined)[], indices: number[]): number => {
        let total = 0;
        for (const idx of indices) {
            total += scores[idx] ?? 0;
        }
        return total;
    };
    const mergedScoreArrays = scoreArrays.map((scores) =>
        mergeGroups.map((indices) => sumGroup(scores, indices))
    );
    return { digitMergedTokens, mergedScoreArrays };
};

/**
 * Prepares tokens for rendering: always applies the BPE overlap merge, then the BPE
 * digit merge unless it is switched off.
 *
 * @param tokens - Tokens to merge.
 * @param originalText - Text the token offsets refer to.
 * @param options - Pass `digitMerge: false` to skip the digit merge step.
 * @returns The merged token list.
 */
export const mergeTokensForRendering = (
    tokens: FrontendToken[],
    originalText: string,
    options: DigitMergePipelineOptions = {}
): FrontendToken[] => {
    const overlapMerged = mergeBpeOverlapTokens(tokens, originalText);
    return options.digitMerge === false
        ? overlapMerged
        : mergeBpeDigitTokens(overlapMerged, originalText).digitMergedTokens;
};

/**
 * Extracts the `real_topk` pair of every token.
 *
 * The result is index-aligned with `tokens`: entry i always belongs to token i.
 * A token without a `real_topk` array contributes the placeholder `[0, 0]` instead
 * of throwing, which matches `getTokenProbability` treating a missing pair as
 * probability `0`.
 *
 * @param tokens - Tokens to read; may be `null` or `undefined`.
 * @returns An empty array when `tokens` is not an array; otherwise one fresh
 * two-element array per token.
 */
export const extractRealTopkFromTokens = (tokens: FrontendToken[] | null | undefined): [number, number][] => {
    if (!Array.isArray(tokens)) {
        return [];
    }
    return tokens.map((token): [number, number] => {
        const tuple = token?.real_topk;
        // `real_topk` can be absent (cloneRealTopk yields undefined for invalid pairs),
        // so guard before indexing.
        if (!Array.isArray(tuple)) {
            return [0, 0];
        }
        return [tuple[0], tuple[1]];
    });
};

/**
 * Builds a snapshot of the raw response data (used when saving a demo).
 *
 * The request is reduced to its `text` field. The result keeps all of its fields,
 * except that `bpe_strings` is replaced by token copies made with
 * `keepMergedFlag: false`, i.e. without `bpe_merged` / `bpe_merge_parts`.
 *
 * @param response - Response to snapshot.
 * @returns A new response object.
 */
export const createRawSnapshot = (response: AnalyzeResponse): AnalyzeResponse => {
    const { request, result } = response;
    const requestClone: AnalyzeResponse['request'] = { text: request.text };
    const strippedTokens = result.bpe_strings.map((token) =>
        cloneFrontendToken(token as FrontendToken, { keepMergedFlag: false })
    );
    // `model` is written before the spread so that it is the first key of the clone.
    const resultClone: AnalyzeResponse['result'] = {
        model: result.model,
        ...result,
        bpe_strings: strippedTokens
    };
    return { request: requestClone, result: resultClone };
};