File size: 4,743 Bytes
494c9e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/**
 * Token 文本显示工具:特殊字符可视化、HTML 转义
 * 与 Tooltip、TopK 图表等共享
 */

/**
 * Escapes text so it can be inserted into HTML element content.
 *
 * The output matches what a browser yields when the text is assigned to an
 * element's `textContent` and read back through `innerHTML`: `&`, `<`, `>` and
 * U+00A0 (no-break space) are replaced by entities, everything else is kept.
 * Working on the string directly removes the dependency on a global
 * `document`, so the function also runs in workers, during server-side
 * rendering and in Node-based tests, and no DOM node is allocated per call.
 *
 * Quotes are left untouched, so the result is intended for element content,
 * not for attribute values.
 *
 * @param text - Raw text to escape.
 * @returns The text with `&`, `<`, `>` and U+00A0 replaced by HTML entities.
 */
function escapeHtmlImpl(text: string): string {
    return text.replace(/[&<>\u00A0]/g, (ch) => {
        switch (ch) {
            case '&':
                return '&amp;';
            case '<':
                return '&lt;';
            case '>':
                return '&gt;';
            default:
                // Only U+00A0 is left in the character class.
                return '&nbsp;';
        }
    });
}

/**
 * Reports whether the given string contains at least one character carrying
 * the Unicode `White_Space` property (ASCII space, tab, line breaks, U+00A0,
 * U+3000, ...).
 *
 * @param char - The string to inspect; callers pass a single character.
 * @returns `true` when a `White_Space` character is found, otherwise `false`.
 */
function isWhitespaceChar(char: string): boolean {
    const firstWhitespaceIndex = char.search(/\p{White_Space}/u);
    return firstWhitespaceIndex !== -1;
}

/**
 * Decides whether a character can be shown as-is, i.e. it produces a visible
 * glyph and belongs to one of the scripts/symbol blocks this module accepts.
 *
 * Whitespace is never printable. Invisible format characters (Unicode general
 * category `Cf`, e.g. U+00AD soft hyphen, U+200B zero-width space, U+200D
 * zero-width joiner, U+202E right-to-left override) are not printable either:
 * they sit inside the accepted ranges but draw nothing, so accepting them
 * would let them pass unnoticed.
 *
 * Only the first code point of `char` is classified by range.
 *
 * @param char - The character to classify (a single code point).
 * @returns `true` when the character may be emitted unchanged.
 */
function isPrintableChar(char: string): boolean {
    if (isWhitespaceChar(char)) return false;
    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) return false; // empty string
    // ASCII range; the space (32) has already been rejected by the whitespace check.
    if (codePoint >= 32 && codePoint <= 126) return true;
    // Zero-width / bidi / other format controls are invisible even though
    // their code points fall inside the blocks listed below.
    if (/^\p{Cf}/u.test(char)) return false;
    return (
        (codePoint >= 0x00A0 && codePoint <= 0x00FF) || // Latin-1 Supplement
        (codePoint >= 0x0100 && codePoint <= 0x017F) || // Latin Extended-A
        (codePoint >= 0x0180 && codePoint <= 0x024F) || // Latin Extended-B
        (codePoint >= 0x2000 && codePoint <= 0x206F) || // General Punctuation
        (codePoint >= 0x2070 && codePoint <= 0x209F) || // Superscripts and Subscripts
        (codePoint >= 0x20A0 && codePoint <= 0x20CF) || // Currency Symbols
        (codePoint >= 0x2100 && codePoint <= 0x214F) || // Letterlike Symbols
        (codePoint >= 0x2190 && codePoint <= 0x21FF) || // Arrows
        (codePoint >= 0x2200 && codePoint <= 0x22FF) || // Mathematical Operators
        (codePoint >= 0x2300 && codePoint <= 0x23FF) || // Miscellaneous Technical
        (codePoint >= 0x2400 && codePoint <= 0x243F) || // Control Pictures
        (codePoint >= 0x2E00 && codePoint <= 0x2E7F) || // Supplemental Punctuation
        (codePoint >= 0x3000 && codePoint <= 0x303F) || // CJK Symbols and Punctuation
        (codePoint >= 0x3040 && codePoint <= 0x309F) || // Hiragana
        (codePoint >= 0x30A0 && codePoint <= 0x30FF) || // Katakana
        (codePoint >= 0x4E00 && codePoint <= 0x9FFF) || // CJK Unified Ideographs
        (codePoint >= 0xAC00 && codePoint <= 0xD7AF) || // Hangul Syllables
        (codePoint >= 0xF900 && codePoint <= 0xFAFF) || // CJK Compatibility Ideographs
        (codePoint >= 0xFF00 && codePoint <= 0xFFEF)    // Halfwidth and Fullwidth Forms
    );
}

/** Optional behavior switches for {@link visualizeSpecialChars}. */
export type VisualizeSpecialCharsOptions = {
    /**
     * When true (e.g. for DAG node labels): an ASCII space is turned into `·` only when the
     * character **following** it is not `[A-Za-z0-9]`; a space followed by an ASCII letter or
     * digit is kept as a space (so word boundaries stay visible in SVG).
     * When omitted or false: every ASCII space becomes `·` (consistent with tooltips,
     * candidate tokens, etc.).
     */
    spaceDotExceptBeforeAsciiLetterOrNumber?: boolean;
};

/**
 * Rewrites a token so that whitespace and non-printable characters become visible.
 *
 * Mapping, applied in a single left-to-right pass over the input's code points:
 * - `\r\n` → `[CRLF]`, `\n` → `[LF]`, `\r` → `[CR]`, `\t` → `[TAB]`
 * - U+3000 (ideographic space) → `[FS]`
 * - ASCII space → `·`, except that with
 *   `spaceDotExceptBeforeAsciiLetterOrNumber: true` a space directly followed by
 *   `[A-Za-z0-9]` is kept as a space
 * - any other character rejected by `isPrintableChar` → `[U+XXXX]` (upper-case hex,
 *   at least four digits)
 * - everything else is copied unchanged
 *
 * The input is walked by code point rather than by UTF-16 code unit, so a
 * character outside the BMP yields one placeholder (`[U+1F600]`) instead of two
 * surrogate placeholders. Placeholders are emitted directly rather than being
 * detected afterwards by bracket matching, so a literal `[` in the input is
 * ordinary text and does not switch off visualization for what follows it.
 *
 * @param text - Raw token text.
 * @param options - Optional behavior switches; see {@link VisualizeSpecialCharsOptions}.
 * @returns The visualized text (not HTML-escaped).
 */
function visualizeSpecialCharsImpl(text: string, options?: VisualizeSpecialCharsOptions): string {
    const keepSpaceBeforeAsciiAlnum = options?.spaceDotExceptBeforeAsciiLetterOrNumber === true;
    const asciiAlnum = /^[A-Za-z0-9]$/;
    // Array.from iterates by code point, keeping surrogate pairs together.
    const chars = Array.from(text);
    const pieces: string[] = [];

    for (let i = 0; i < chars.length; i++) {
        const char = chars[i];
        if (char === undefined) continue;
        const next = chars[i + 1];

        switch (char) {
            case '\r':
                if (next === '\n') {
                    pieces.push('[CRLF]');
                    i++; // the LF is part of this line break
                } else {
                    pieces.push('[CR]');
                }
                break;
            case '\n':
                pieces.push('[LF]');
                break;
            case '\t':
                pieces.push('[TAB]');
                break;
            case '\u3000':
                pieces.push('[FS]');
                break;
            case ' ': {
                // A kept space is pushed as-is; it must not reach the
                // non-printable branch, which would turn it into [U+0020].
                const keep = keepSpaceBeforeAsciiAlnum && next !== undefined && asciiAlnum.test(next);
                pieces.push(keep ? ' ' : '·');
                break;
            }
            default: {
                if (isPrintableChar(char)) {
                    pieces.push(char);
                    break;
                }
                const codePoint = char.codePointAt(0);
                if (codePoint === undefined) {
                    pieces.push(char);
                } else {
                    const hexCode = codePoint.toString(16).toUpperCase().padStart(4, '0');
                    pieces.push(`[U+${hexCode}]`);
                }
            }
        }
    }
    return pieces.join('');
}

/**
 * Prepares candidate-token text for display, consistent with the main token:
 * special characters are visualized first, then the result is HTML-escaped.
 *
 * @param text - Raw candidate text.
 * @returns The visualized, HTML-escaped text.
 */
export function processCandidateText(text: string): string {
    const visualized = visualizeSpecialCharsImpl(text);
    return escapeHtmlImpl(visualized);
}

/**
 * Shared by the current token and the merged sub-fragments shown inside a tooltip.
 * Uses the same pipeline as {@link processCandidateText}, so the rendering stays
 * consistent with the "current token" row of the main panel.
 *
 * @param text - Raw token text.
 * @returns Display-ready HTML for the token.
 */
export function tooltipTokenDisplayHtml(text: string): string {
    const displayHtml = processCandidateText(text);
    return displayHtml;
}

/**
 * HTML-escapes the given text.
 *
 * @param text - Raw text.
 * @returns The escaped text, as produced by `escapeHtmlImpl`.
 */
export function escapeHtml(text: string): string {
    const escaped = escapeHtmlImpl(text);
    return escaped;
}

/**
 * Makes special characters in the text visible.
 *
 * @param text - Raw text.
 * @param options - Optional behavior switches, forwarded unchanged.
 * @returns The visualized text, as produced by `visualizeSpecialCharsImpl`.
 */
export function visualizeSpecialChars(text: string, options?: VisualizeSpecialCharsOptions): string {
    const visualized = visualizeSpecialCharsImpl(text, options);
    return visualized;
}