// InfoLens / client/src/ts/utils/splitTextToChunks.test.ts
// dqy08 — initial beta release (494c9e4)
/**
* splitTextToChunks 单元测试
* 运行: cd client/src && npx tsx ts/utils/splitTextToChunks.test.ts
*/
import assert from "assert";
import { splitTextToChunks } from "./semanticUtils";
const enc = new TextEncoder();
const b = (s: string) => enc.encode(s).byteLength;
let passed = 0;
let failed = 0;
function test(desc: string, fn: () => void) {
try {
fn();
console.log(` ✓ ${desc}`);
passed++;
} catch (e: any) {
console.error(` ✗ ${desc}`);
console.error(` ${e.message}`);
failed++;
}
}
/** 验证 chunks 文本、startOffset 均正确,且拼接还原原文 */
function expectChunks(text: string, limit: number, expectedTexts: string[]) {
const chunks = splitTextToChunks(text, limit);
assert.strictEqual(chunks.length, expectedTexts.length,
`chunk 数量: expected ${expectedTexts.length}, got ${chunks.length}${JSON.stringify(chunks.map(c => c.text))}`);
for (let i = 0; i < chunks.length; i++) {
assert.strictEqual(chunks[i].text, expectedTexts[i],
`chunk[${i}].text: expected ${JSON.stringify(expectedTexts[i])}, got ${JSON.stringify(chunks[i].text)}`);
// startOffset 必须是正确的字符索引
assert.strictEqual(
text.slice(chunks[i].startOffset, chunks[i].startOffset + chunks[i].text.length),
chunks[i].text,
`chunk[${i}].startOffset=${chunks[i].startOffset} 指向错误位置`
);
}
assert.strictEqual(chunks.map(c => c.text).join(""), text, "拼接后与原文不一致");
}
// ── 1. Guard 校验 ──────────────────────────────────────────────────────────────
console.log("1. Guard 校验");
test("bytesPerChunk=0 抛错", () => {
assert.throws(() => splitTextToChunks("hello", 0), /必须大于 0/);
});
test("bytesPerChunk=-1 抛错", () => {
assert.throws(() => splitTextToChunks("hello", -1), /必须大于 0/);
});
test("文本含 \\r 抛错", () => {
assert.throws(() => splitTextToChunks("hello\r\nworld", 512), /\\r/);
});
// ── 2. 空文本(outer while 不进入)──────────────────────────────────────────────
console.log("2. 空文本");
test("空字符串返回空数组", () => {
expectChunks("", 10, []);
});
// ── 3. 正常路径:多行累积不超限(内层 while 自然退出)─────────────────────────
console.log("3. 正常路径(行累积)");
test("单行无换行,整体放入一个 chunk", () => {
expectChunks("hello world", 100, ["hello world"]);
});
test("多行全部累积入一个 chunk", () => {
// "a\n"=2B + "b\n"=2B + "c\n"=2B = 6B ≤ 10
expectChunks("a\nb\nc\n", 10, ["a\nb\nc\n"]);
});
test("末行无换行(nextLineEnd 返回 text.length)", () => {
// "aaa\n"=4B ≤ 5,"bb"=2B:4+2=6>5 → break,chunk1="aaa\n";chunk2="bb"
expectChunks("aaa\nbb", 5, ["aaa\n", "bb"]);
});
// ── 4. chunkBytes > 0 && wouldExceed → break(行级拆分)─────────────────────
console.log("4. 行级拆分(chunkBytes>0 超限)");
test("两行各自放入独立 chunk", () => {
// "aaa\n"=4B,"bbb\n"=4B,limit=5:先放"aaa\n"(4B),再加"bbb\n"→ 8>5 → break
expectChunks("aaa\nbbb\n", 5, ["aaa\n", "bbb\n"]);
});
test("三行(无段落边界):贪婪行模式填满 chunk", () => {
// 整段 "aa\nbb\ncc\n"=9B > 6 → 行模式贪婪消费
// "aa\n"(3B) + "bb\n"(3B) = 6B ≤ 6 → 合并;"cc\n"(3B) → 3+3>6 → break
// chunk1="aa\nbb\n";下一轮: "cc\n"=3B ≤ 6 → 整段入 chunk
expectChunks("aa\nbb\ncc\n", 6, ["aa\nbb\n", "cc\n"]);
});
// ── 5. 连续换行(nextLineEnd 的 while 分支)──────────────────────────────────
console.log("5. 连续换行");
test("连续换行作为一行整体", () => {
// "a\n\n\nb" limit=100 → 一个 chunk
expectChunks("a\n\n\nb", 100, ["a\n\n\nb"]);
});
test("连续换行导致跨 chunk 分割", () => {
// "ab\n\n"=4B,"cd\n"=3B,limit=6:4+3=7>6 → "ab\n\n" / "cd\n"
expectChunks("ab\n\ncd\n", 6, ["ab\n\n", "cd\n"]);
});
// ── 6. 单行超长:findSplitPoint 第一优先级命中 ─────────────────────────────
console.log("6. 单行超长:句子级分隔符");
test("句号在 maxEnd 范围内,按句号切分", () => {
// "A。BB" → b("A。")=1+3=4B,limit=4
// maxEnd = charIndexForByteLimit("A。BB", 0, 4) = 2(A=1B,。=3B,累计4≤4→i=2; B→5>4→stop)
// window="A。", 。在 idx=1, bestEnd=2 → chunk="A。"
expectChunks("A。BB", 4, ["A。", "BB"]);
});
test("感叹号切分", () => {
// "Hi!World" limit=7:maxEnd=7, window="Hi!Worl", "!"在idx=2,bestEnd=3 → "Hi!";剩余"World"=5B≤7
expectChunks("Hi!World", 7, ["Hi!", "World"]);
});
test("同组多个句子符取最靠右", () => {
// "A.B.CCC" limit=4:maxEnd=4, window="A.B.", "."在1和3,rightmost bestEnd=4 → "A.B."
expectChunks("A.B.CCC", 4, ["A.B.", "CCC"]);
});
// ── 7. 单行超长:第一优先级无命中,第二优先级命中 ──────────────────────────
console.log("7. 单行超长:子句级分隔符");
test("逗号切分(ASCII)", () => {
// "AAAA,BBBB" limit=6:maxEnd=6, window="AAAA,B", 无句子符,","在idx=4,bestEnd=5 → "AAAA,"
expectChunks("AAAA,BBBB", 6, ["AAAA,", "BBBB"]);
});
test("中文逗号切分", () => {
// "你好,世界啊" → b("你好,")=3+3+3=9B, limit=9
// maxEnd = charIndexForByteLimit(..., 0, 9) = 3("你好,"恰好9B)
// window="你好,",,在idx=2,bestEnd=3 → chunk="你好,"
expectChunks("你好,世界啊", 9, ["你好,", "世界啊"]);
});
// ── 8. 单行超长:两级均无命中,回退 maxEnd ───────────────────────────────────
console.log("8. 单行超长:回退 maxEnd");
test("纯字母无分隔符,按字节边界强切", () => {
// "ABCDEFGH" limit=4,无分隔符 → maxEnd=4 → "ABCD",再"EFGH"
expectChunks("ABCDEFGH", 4, ["ABCD", "EFGH"]);
});
test("中文无分隔符,按字节边界强切(每字 3B)", () => {
// "你好世界" limit=6 → maxEnd=2("你好"=6B)→ 无分隔符 → chunk="你好"
expectChunks("你好世界", 6, ["你好", "世界"]);
});
// ── 9. startOffset 在多字节字符下的正确性 ──────────────────────────────────
console.log("9. startOffset 正确性(多字节字符)");
test("中文分行后 startOffset 指向正确字符索引", () => {
// "你好\n世界\n":b("你好\n")=7B,limit=7
// chunk1: "你好\n"(JS idx 0-2,startOffset=0)
// chunk2: "世界\n"(JS idx 3-5,startOffset=3)
const text = "你好\n世界\n";
const chunks = splitTextToChunks(text, 7);
assert.strictEqual(chunks[0].startOffset, 0);
assert.strictEqual(chunks[1].startOffset, 3);
assert.strictEqual(text.slice(3), "世界\n");
});
// ── 10. Emoji 代理对(4B,JS charLen=2)不被切断 ──────────────────────────
console.log("10. Emoji 代理对不被切断");
test("😀 不被切断(charIndexForByteLimit cp>0xFFFF 分支)", () => {
// "😀AB😀" = 4+1+1+4=10B,limit=5
// maxEnd: 😀(4B)→i=2;A(1B)→5B≤5→i=3;B(1B)→6>5→stop,maxEnd=3
// window="😀A",无分隔符 → chunk="😀A"(JS chars 0-2)
// 下一轮:"B😀"=5B≤5,不超限 → chunk="B😀"
expectChunks("😀AB😀", 5, ["😀A", "B😀"]);
});
test("纯 emoji 序列按 4B 边界切分", () => {
// "😀😀😀" = 12B,limit=4 → 每个 emoji 独立 chunk
expectChunks("😀😀😀", 4, ["😀", "😀", "😀"]);
});
// ── 11. 综合:超长行与正常行混合 ─────────────────────────────────────────────
console.log("11. 综合场景");
test("超长行被多次切分后,正常行继续正常累积", () => {
// "ABCDEFGH\n" limit=4:
// pos=0: 行="ABCDEFGH\n"(9B)>4,maxEnd=4,无分隔符→"ABCD",pos=4
// pos=4: 行仍是"ABCDEFGH\n"的剩余?不,nextLineEnd(text,4)="EFGH\n"→lineEnd=9
// lineText="EFGH\n"(5B)>4,maxEnd=charIndexForByteLimit("ABCDEFGH\n",4,4)=8,window="EFGH"→无→chunk="EFGH"(maxEnd=8),pos=8
// pos=8: lineText="\n"(1B)≤4,chunkBytes=1,chunkEnd=9;退出内层
// chunk="\n"
// "XY\n"(3B)≤4 → 一个 chunk
const text = "ABCDEFGH\nXY\n";
const chunks = splitTextToChunks(text, 4);
assert.strictEqual(chunks.map(c => c.text).join(""), text);
assert.ok(chunks.length >= 3); // "ABCD" + "EFGH" + "\nXY\n"
for (const c of chunks) {
assert.strictEqual(text.slice(c.startOffset, c.startOffset + c.text.length), c.text,
`startOffset 错误: ${JSON.stringify(c)}`);
}
});
test("中英混合文本,所有 chunk 字节数不超过 limit(含分隔符回退场景除外)", () => {
const text = "Hello, world! 你好世界。This is a test. 测试一下,看看效果!\n";
const limit = 20;
const chunks = splitTextToChunks(text, limit);
assert.strictEqual(chunks.map(c => c.text).join(""), text);
for (const c of chunks) {
assert.strictEqual(text.slice(c.startOffset, c.startOffset + c.text.length), c.text);
}
});
// ── 12. 段落级切分(nextParagraphEnd)─────────────────────────────────────────
console.log("12. 段落级切分");
test("两个短段落合并入一个 chunk", () => {
// "P1\n\n"=4B + "P2\n\n"=4B = 8B ≤ 20 → 合并
expectChunks("P1\n\nP2\n\n", 20, ["P1\n\nP2\n\n"]);
});
test("两个段落超过 limit,各自独立成 chunk", () => {
// "AAAA\n\n"=6B ≤ 6;"BBBB\n\n"=6B,6+6>6 → break
expectChunks("AAAA\n\nBBBB\n\n", 6, ["AAAA\n\n", "BBBB\n\n"]);
});
test("大段落降级到行模式,贪婪填满后行末段落边界合并", () => {
// paragraph "LINE1\nLINE2\nLINE3\n\n"=20B > 12B → 行模式贪婪消费
// "LINE1\n"(6B) + "LINE2\n"(6B) = 12B ≤ 12 → 合并;"LINE3\n\n"(8B) → 12+8>12 → break
// chunk1="LINE1\nLINE2\n";剩余 "LINE3\n\n"(8B) + "FOO"(3B) = 11B ≤ 12 → 合并
expectChunks("LINE1\nLINE2\nLINE3\n\nFOO", 12, ["LINE1\nLINE2\n", "LINE3\n\nFOO"]);
});
test("nextLineEnd while 分支:单行段落末尾 \\n\\n 不被切断", () => {
// "AAAA,BBBB\n\n"=11B > limit=10 → 段落超限,调 nextLineEnd
// nextLineEnd 的 while 分支消费两个 \n → lineBytes=11 > 10 → findSplitPoint
// window="AAAA,BBBB\n",逗号命中 → chunk="AAAA,",\n\n 留在下一 chunk
// 若无 while 分支:lineBytes=10 ≤ 10 → 直接加入,\n\n 被切断("\nCCC" 开头带孤立 \n)
expectChunks("AAAA,BBBB\n\nCCC", 10, ["AAAA,", "BBBB\n\nCCC"]);
});
// ── 结果汇总 ──────────────────────────────────────────────────────────────────
console.log(`\n结果: ${passed} 通过 / ${failed} 失败`);
if (failed > 0) process.exit(1);