// InfoLens/client/src/ts/utils/semanticUtils.chunk.test.ts
// dqy08 — initial beta release (494c9e4)
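/**
 * Local manual test for splitTextToChunks.
 * Run:      cd client/src && npx tsx ts/utils/semanticUtils.chunk.test.ts [filePath] [bytesPerChunk]
 * Example:  npx tsx ts/utils/semanticUtils.chunk.test.ts chunk_test1.txt 512
 * Defaults: filePath = chunk_test1.txt next to this script; bytesPerChunk = 800.
 */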
import * as fs from "fs";
import * as path from "path";
import { splitTextToChunks } from "./semanticUtils";
// Input file: first CLI argument, falling back to chunk_test1.txt next to this script.
const defaultFile = path.resolve(__dirname, "./chunk_test1.txt");
const testFile = path.resolve(process.argv[2] || defaultFile);

// Chunk size in bytes: second CLI argument, defaulting to 800.
const BYTES_PER_CHUNK = parseInt(process.argv[3] || "800", 10);
// parseInt yields NaN for non-numeric input; reject that (and non-positive
// sizes) up front instead of handing a meaningless budget to splitTextToChunks.
if (!Number.isFinite(BYTES_PER_CHUNK) || BYTES_PER_CHUNK <= 0) {
  throw new Error(`无效的 bytesPerChunk: "${process.argv[3]}"（必须为正整数）`);
}

const text = fs.readFileSync(testFile, "utf-8");
const chunks = splitTextToChunks(text, BYTES_PER_CHUNK);

// Round-trip check: concatenating every chunk's text must reproduce the input exactly.
const merged = chunks.map((c) => c.text).join("");
const roundTripOk = merged === text;

console.log(`文件: ${testFile}`);
console.log(`bytesPerChunk: ${BYTES_PER_CHUNK}`);
console.log(`chunk 数量: ${chunks.length}`);
console.log(`合并与原文一致: ${roundTripOk}`);

if (!roundTripOk) {
  // Keep printing the chunks below for debugging, but make the run fail so a
  // broken split is visible to scripts/CI and not only in the log output.
  process.exitCode = 1;
}

// Dump each chunk preceded by its index and its UTF-8 encoded size.
chunks.forEach((c, i) => {
  const bytes = Buffer.byteLength(c.text, "utf8");
  console.log(`---[${i}] ${bytes} bytes----------------------------------------------`);
  console.log(c.text);
});