import * as d3 from "d3"; import { TokenWithOffset } from "../api/generatedSchemas"; /** * Created by hen on 5/15/17. */ let the_unique_id_counter = 0; export class Util { static simpleUId({prefix = ''}): string { the_unique_id_counter += 1; return prefix + the_unique_id_counter; } } export type D3Sel = d3.Selection export function argsort(array, sortFct):number[] { return array .map((d, i) => [d, i]) .sort((a,b) => sortFct(a[0], b[0])) .map(d => d[1]); } export function range(end){ return [...Array(end).keys()] } /** 判断是否为有限数字(排除 NaN、Infinity、非 number 类型) */ export function isFiniteNumber(x: unknown): x is number { return typeof x === 'number' && Number.isFinite(x); } export function obj_to_arr(obj:object){ const sortedKeys = Object.keys(obj).sort(); const res=[]; sortedKeys.forEach(k => {res.push(k); res.push(obj[k])}) return res; } export function arr_to_obj(arr:any){ const res={}; const max_l = Math.floor(arr.length/2); for (let i = 0; i 1) { list[index] = splitted; } } (list[index].constructor === Array) ? traverseList(list[index], splitter, 0) : null; (list.constructor === Array) ? traverseList(list, splitter, index+1) : null; } } export function flatten(arr) { return arr.reduce(function(acc, val) { return acc.concat(val.constructor === Array ? flatten(val) : val); },[]); } // Kudos: https://stackoverflow.com/questions/9401312/how-to-replace-curly-quotation-marks-in-a-string-using-javascript#answer-9401374 // Note: Removed em dash (\u2014) replacement to preserve Chinese em dash "——" export const cleanSpecials = input => input // .replace(/[‘’]/g, "'") // 注释掉替换卷单引号的逻辑 // .replace(/[“”]/g, '"') // 注释掉替换卷双引号的逻辑 // .replace(/[–]/g, "-"); // 注释掉替换en dash的逻辑,em dash (—) 已在上一个版本中移除替换 /** * Calculate surprisal (information content) from probability * @param probability - The probability value (0 < p <= 1) * @returns Surprisal in bits (using base-2 logarithm) */ export function calculateSurprisal(probability: number): number { return -Math.log2(Math.max(probability, Number.EPSILON)); } /** * 计算token的字符数(中文按字,英文按字母) * 使用Array.from正确处理Unicode字符(包括emoji) * @param tokenText token文本 * @returns 字符数 */ export function countTokenCharacters(tokenText: string): number { // 使用Array.from正确处理Unicode字符(包括中文、emoji等) return Array.from(tokenText).length; } // 复用 TextEncoder 实例,避免每次调用都创建新实例 const textEncoder = new TextEncoder(); /** * 获取字符串的UTF-8编码字节长度 * @param value 要计算字节长度的字符串 * @returns UTF-8编码的字节数 */ export const getByteLength = (value: string): number => { return textEncoder.encode(value).length; }; /** * 计算单位字节的surprisal值 * @param surprisal token的总surprisal值 * @param tokenText token文本 * @returns 单位字节的surprisal值(bits/Byte) */ function calculateSurprisalPerByte(surprisal: number, tokenText: string): number { // 按UTF-8编码字节数计算 const byteCount = getByteLength(tokenText); return byteCount > 0 ? surprisal / byteCount : 0; } /** * 计算信息密度(统一接口,方便将来扩展) * @param token token对象,包含real_topk和raw字段 * @returns 信息密度值(bits/Byte) */ export function calculateSurprisalDensity(token: TokenWithOffset): number { const [rank, prob] = token.real_topk; const surprisal = calculateSurprisal(prob); const tokenText = token.raw; return calculateSurprisalPerByte(surprisal, tokenText); } /** * 为文本创建字符索引到字节索引的映射表 * @param text 原始文本 * @returns 数组,charToByteIndex[charIndex] = byteIndex */ export function buildCharToByteIndexMap(text: string): number[] { const map: number[] = []; let byteOffset = 0; for (let charIndex = 0; charIndex < text.length; charIndex++) { map[charIndex] = byteOffset; // 获取当前字符的UTF-8字节长度 const char = text[charIndex]; byteOffset += getByteLength(char); } // 添加末尾位置(文本总字节长度) map[text.length] = byteOffset; return map; }