| |
| |
| |
| |
| import assert from "assert"; |
| import { splitTextToChunks } from "./semanticUtils"; |
|
|
/** Shared UTF-8 encoder used by the byte-length helper below. */
const enc = new TextEncoder();

/**
 * Returns the number of bytes `s` occupies once encoded as UTF-8.
 *
 * @param s - The string to measure.
 * @returns The UTF-8 byte length of `s`.
 */
const b = (s: string): number => {
  const utf8 = enc.encode(s);
  return utf8.byteLength;
};
|
|
/** Number of test cases whose body completed without throwing. */
let passed = 0;
/** Number of test cases whose body threw. */
let failed = 0;

/**
 * Runs a single synchronous test case and records its outcome.
 *
 * On success a "✓" line is logged and `passed` is incremented; if `fn`
 * throws, a "✗" line plus the failure reason is written to stderr and
 * `failed` is incremented. The exception never propagates to the caller.
 *
 * @param desc - Human-readable description printed next to the result mark.
 * @param fn - The test body; any thrown value marks the case as failed.
 */
function test(desc: string, fn: () => void): void {
  try {
    fn();
    console.log(` ✓ ${desc}`);
    passed++;
  } catch (e: unknown) {
    // A test body may throw anything (a string, null, ...). Reading `.message`
    // blindly would print "undefined" or, for null/undefined, throw inside
    // this handler and abort the whole run before the failure is counted.
    const reason = e instanceof Error ? e.message : String(e);
    console.error(` ✗ ${desc}`);
    console.error(` ${reason}`);
    failed++;
  }
}
|
|
| |
/**
 * Splits `text` via `splitTextToChunks(text, limit)` and asserts the result.
 *
 * Checks, in order, stopping at the first failure:
 *  1. the number of chunks equals `expectedTexts.length`;
 *  2. for each chunk, its `text` equals the expected text at the same index;
 *  3. for each chunk, slicing `text` at `startOffset` for the chunk's length
 *     reproduces the chunk's `text`;
 *  4. concatenating all chunk texts reproduces `text` exactly.
 *
 * @param text - Input handed to the splitter.
 * @param limit - Second argument handed to the splitter.
 * @param expectedTexts - Expected chunk texts, in order.
 * @throws An assertion error describing the first mismatch.
 */
function expectChunks(text: string, limit: number, expectedTexts: string[]) {
  const actual = splitTextToChunks(text, limit);
  const actualTexts = actual.map(c => c.text);

  assert.strictEqual(
    actual.length,
    expectedTexts.length,
    `chunk 数量: expected ${expectedTexts.length}, got ${actual.length} — ${JSON.stringify(actualTexts)}`
  );

  let idx = 0;
  for (const piece of actual) {
    const want = expectedTexts[idx];
    assert.strictEqual(
      piece.text,
      want,
      `chunk[${idx}].text: expected ${JSON.stringify(want)}, got ${JSON.stringify(piece.text)}`
    );

    // startOffset is compared as an index into `text` (UTF-16 code units).
    const begin = piece.startOffset;
    const sliced = text.slice(begin, begin + piece.text.length);
    assert.strictEqual(sliced, piece.text, `chunk[${idx}].startOffset=${begin} 指向错误位置`);

    idx++;
  }

  assert.strictEqual(actualTexts.join(""), text, "拼接后与原文不一致");
}
|
|
| |
// Section 1: argument guards. Invalid input must make the splitter throw.
console.log("1. Guard 校验");

// A zero or negative second argument must be rejected with an error
// matching /必须大于 0/.
for (const { desc, limit } of [
  { desc: "bytesPerChunk=0 抛错", limit: 0 },
  { desc: "bytesPerChunk=-1 抛错", limit: -1 },
]) {
  test(desc, () => {
    assert.throws(() => splitTextToChunks("hello", limit), /必须大于 0/);
  });
}

// Text containing a carriage return must be rejected; the error has to match
// /\\r/, i.e. contain a literal backslash followed by "r".
test("文本含 \\r 抛错", () => {
  const splitTextWithCarriageReturn = () => splitTextToChunks("hello\r\nworld", 512);
  assert.throws(splitTextWithCarriageReturn, /\\r/);
});
|
|
| |
// Section 2: empty input.
console.log("2. 空文本");

// An empty string is expected to produce no chunks at all.
test("空字符串返回空数组", function () {
  const noChunks: string[] = [];
  expectChunks("", 10, noChunks);
});
|
|
| |
// Section 3: the normal path, where whole lines are accumulated into chunks.
console.log("3. 正常路径(行累积)");

for (const { desc, text, limit, expected } of [
  {
    // 11 bytes, no newline, well under the limit: a single chunk.
    desc: "单行无换行,整体放入一个 chunk",
    text: "hello world",
    limit: 100,
    expected: ["hello world"],
  },
  {
    // Three 2-byte lines (6 bytes total) fit within 10 bytes: a single chunk.
    desc: "多行全部累积入一个 chunk",
    text: "a\nb\nc\n",
    limit: 10,
    expected: ["a\nb\nc\n"],
  },
  {
    // "aaa\n" is 4 bytes; adding "bb" would make 6 > 5, so the final line
    // (which has no trailing newline) is expected in its own chunk.
    desc: "末行无换行(nextLineEnd 返回 text.length)",
    text: "aaa\nbb",
    limit: 5,
    expected: ["aaa\n", "bb"],
  },
]) {
  test(desc, () => expectChunks(text, limit, expected));
}
|
|
| |
// Section 4: splitting at line boundaries once the next line no longer fits.
console.log("4. 行级拆分(chunkBytes>0 超限)");

for (const { desc, text, limit, expected } of [
  {
    // Each line is 4 bytes; two lines (8 bytes) exceed the 5-byte limit.
    desc: "两行各自放入独立 chunk",
    text: "aaa\nbbb\n",
    limit: 5,
    expected: ["aaa\n", "bbb\n"],
  },
  {
    // Each line is 3 bytes; two lines fill the 6-byte limit exactly and the
    // third line is expected in the next chunk.
    desc: "三行(无段落边界):贪婪行模式填满 chunk",
    text: "aa\nbb\ncc\n",
    limit: 6,
    expected: ["aa\nbb\n", "cc\n"],
  },
]) {
  test(desc, () => expectChunks(text, limit, expected));
}
|
|
| |
// Section 5: runs of consecutive newlines.
console.log("5. 连续换行");

for (const { desc, text, limit, expected } of [
  {
    // 5 bytes in total, far below the limit: everything stays in one chunk.
    desc: "连续换行作为一行整体",
    text: "a\n\n\nb",
    limit: 100,
    expected: ["a\n\n\nb"],
  },
  {
    // 7 bytes in total against a 6-byte limit; the expected cut falls after
    // the run of newlines ("ab\n\n" is 4 bytes, "cd\n" is 3 bytes).
    desc: "连续换行导致跨 chunk 分割",
    text: "ab\n\ncd\n",
    limit: 6,
    expected: ["ab\n\n", "cd\n"],
  },
]) {
  test(desc, () => expectChunks(text, limit, expected));
}
|
|
| |
// Section 6: a single line longer than the limit, cut at sentence punctuation.
console.log("6. 单行超长:句子级分隔符");

for (const { desc, text, limit, expected } of [
  {
    // "A" (1 byte) + "。" (3 bytes) fills the 4-byte limit exactly; the whole
    // line is 6 bytes.
    desc: "句号在 maxEnd 范围内,按句号切分",
    text: "A。BB",
    limit: 4,
    expected: ["A。", "BB"],
  },
  {
    // 8 bytes against a 7-byte limit; the expected cut is right after "!"
    // rather than at the 7-byte boundary.
    desc: "感叹号切分",
    text: "Hi!World",
    limit: 7,
    expected: ["Hi!", "World"],
  },
  {
    // The first 4 bytes "A.B." contain two periods; the expected cut is
    // after the right-most one.
    desc: "同组多个句子符取最靠右",
    text: "A.B.CCC",
    limit: 4,
    expected: ["A.B.", "CCC"],
  },
]) {
  test(desc, () => expectChunks(text, limit, expected));
}
|
|
| |
// Section 7: a single line longer than the limit, cut at clause punctuation.
console.log("7. 单行超长:子句级分隔符");

for (const { desc, text, limit, expected } of [
  {
    // 9 bytes against a 6-byte limit; "AAAA," is 5 bytes.
    desc: "逗号切分(ASCII)",
    text: "AAAA,BBBB",
    limit: 6,
    expected: ["AAAA,", "BBBB"],
  },
  {
    // Every character here is 3 bytes in UTF-8, so "你好," is exactly the
    // 9-byte limit and the full line is 18 bytes.
    desc: "中文逗号切分",
    text: "你好,世界啊",
    limit: 9,
    expected: ["你好,", "世界啊"],
  },
]) {
  test(desc, () => expectChunks(text, limit, expected));
}
|
|
| |
// Section 8: a single over-long line with no punctuation to cut at.
console.log("8. 单行超长:回退 maxEnd");

for (const { desc, text, limit, expected } of [
  {
    // 8 ASCII bytes against a 4-byte limit: two 4-byte chunks.
    desc: "纯字母无分隔符,按字节边界强切",
    text: "ABCDEFGH",
    limit: 4,
    expected: ["ABCD", "EFGH"],
  },
  {
    // Four 3-byte characters against a 6-byte limit: two characters per chunk.
    desc: "中文无分隔符,按字节边界强切(每字 3B)",
    text: "你好世界",
    limit: 6,
    expected: ["你好", "世界"],
  },
]) {
  test(desc, () => expectChunks(text, limit, expected));
}
|
|
| |
// Section 9: startOffset must be a character index, not a byte index.
console.log("9. startOffset 正确性(多字节字符)");

test("中文分行后 startOffset 指向正确字符索引", () => {
  // "你好\n" is 3 UTF-16 code units but 7 UTF-8 bytes (3 + 3 + 1), which is
  // exactly the limit passed below. The second chunk is therefore expected to
  // start at character index 3, not at byte index 7.
  const source = "你好\n世界\n";
  const pieces = splitTextToChunks(source, 7);

  const expectedOffsets = [0, 3];
  expectedOffsets.forEach((offset, i) => {
    assert.strictEqual(pieces[i].startOffset, offset);
  });

  // Sanity check on the fixture itself: index 3 is where the second line starts.
  assert.strictEqual(source.slice(3), "世界\n");
});
|
|
| |
// Section 10: characters outside the BMP (surrogate pairs in UTF-16, 4 bytes
// in UTF-8) must never be cut in half.
console.log("10. Emoji 代理对不被切断");

for (const { desc, text, limit, expected } of [
  {
    // 4 + 1 + 1 + 4 = 10 bytes against a 5-byte limit: each expected chunk
    // is one emoji plus one ASCII letter (5 bytes).
    desc: "😀 不被切断(charIndexForByteLimit cp>0xFFFF 分支)",
    text: "😀AB😀",
    limit: 5,
    expected: ["😀A", "B😀"],
  },
  {
    // Three 4-byte emoji against a 4-byte limit: one emoji per chunk.
    desc: "纯 emoji 序列按 4B 边界切分",
    text: "😀😀😀",
    limit: 4,
    expected: ["😀", "😀", "😀"],
  },
]) {
  test(desc, () => expectChunks(text, limit, expected));
}
|
|
| |
// Section 11: combined scenarios, checked through invariants rather than
// exact chunk texts.
console.log("11. 综合场景");

test("超长行被多次切分后,正常行继续正常累积", () => {
  // An 8-character line (9 bytes with its newline) followed by a short line,
  // split with a 4-byte limit.
  const source = "ABCDEFGH\nXY\n";
  const pieces = splitTextToChunks(source, 4);

  // Invariant 1: the chunks concatenate back to the input.
  const rebuilt = pieces.map(c => c.text).join("");
  assert.strictEqual(rebuilt, source);

  // Invariant 2: at least three chunks are produced.
  assert.ok(pieces.length >= 3);

  // Invariant 3: every startOffset points at the chunk's own text.
  for (let i = 0; i < pieces.length; i++) {
    const piece = pieces[i];
    const end = piece.startOffset + piece.text.length;
    assert.strictEqual(
      source.slice(piece.startOffset, end),
      piece.text,
      `startOffset 错误: ${JSON.stringify(piece)}`
    );
  }
});

test("中英混合文本,所有 chunk 字节数不超过 limit(含分隔符回退场景除外)", () => {
  // NOTE(review): despite the description, this body makes no byte-length
  // assertion; it only checks reconstruction and startOffset consistency.
  const source = "Hello, world! 你好世界。This is a test. 测试一下,看看效果!\n";
  const limit = 20;
  const pieces = splitTextToChunks(source, limit);

  const rebuilt = pieces.map(c => c.text).join("");
  assert.strictEqual(rebuilt, source);

  for (let i = 0; i < pieces.length; i++) {
    const piece = pieces[i];
    const end = piece.startOffset + piece.text.length;
    assert.strictEqual(source.slice(piece.startOffset, end), piece.text);
  }
});
|
|
| |
// Section 12: splitting at paragraph boundaries (blank lines, "\n\n").
console.log("12. 段落级切分");

for (const { desc, text, limit, expected } of [
  {
    // Two 4-byte paragraphs (8 bytes total) fit within 20 bytes: one chunk.
    desc: "两个短段落合并入一个 chunk",
    text: "P1\n\nP2\n\n",
    limit: 20,
    expected: ["P1\n\nP2\n\n"],
  },
  {
    // Each paragraph is exactly 6 bytes, the limit; together they are 12.
    desc: "两个段落超过 limit,各自独立成 chunk",
    text: "AAAA\n\nBBBB\n\n",
    limit: 6,
    expected: ["AAAA\n\n", "BBBB\n\n"],
  },
  {
    // The input is 22 bytes against a 12-byte limit. The first two lines are
    // 12 bytes together; "LINE3\n\nFOO" is 10 bytes and is expected to stay
    // in one chunk.
    desc: "大段落降级到行模式,贪婪填满后行末段落边界合并",
    text: "LINE1\nLINE2\nLINE3\n\nFOO",
    limit: 12,
    expected: ["LINE1\nLINE2\n", "LINE3\n\nFOO"],
  },
  {
    // The input is 14 bytes against a 10-byte limit. The expected cut is
    // after the comma ("AAAA," is 5 bytes); the remaining 9 bytes, including
    // the "\n\n", are expected to stay together.
    desc: "nextLineEnd while 分支:单行段落末尾 \\n\\n 不被切断",
    text: "AAAA,BBBB\n\nCCC",
    limit: 10,
    expected: ["AAAA,", "BBBB\n\nCCC"],
  },
]) {
  test(desc, () => expectChunks(text, limit, expected));
}
|
|
| |
// Print the final tally and report failure through the process exit status.
console.log(`\n结果: ${passed} 通过 / ${failed} 失败`);
// Set exitCode instead of calling process.exit(1): process.exit() terminates
// immediately and may drop output that has not been flushed yet, whereas
// exitCode lets the process end naturally with the same non-zero status.
if (failed > 0) process.exitCode = 1;
|
|