File size: 5,110 Bytes
494c9e4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | import * as d3 from "d3";
import { TokenWithOffset } from "../api/generatedSchemas";
/**
* Created by hen on 5/15/17.
*/
let the_unique_id_counter = 0;
export class Util {
static simpleUId({prefix = ''}): string {
the_unique_id_counter += 1;
return prefix + the_unique_id_counter;
}
}
export type D3Sel = d3.Selection<any, any, any, any>
export function argsort(array, sortFct):number[] {
return array
.map((d, i) => [d, i])
.sort((a,b) => sortFct(a[0], b[0]))
.map(d => d[1]);
}
export function range(end){
return [...Array(end).keys()]
}
/** 判断是否为有限数字(排除 NaN、Infinity、非 number 类型) */
export function isFiniteNumber(x: unknown): x is number {
return typeof x === 'number' && Number.isFinite(x);
}
export function obj_to_arr(obj:object){
const sortedKeys = Object.keys(obj).sort();
const res=[];
sortedKeys.forEach(k => {res.push(k); res.push(obj[k])})
return res;
}
export function arr_to_obj(arr:any){
const res={};
const max_l = Math.floor(arr.length/2);
for (let i = 0; i<max_l; i++){
res[arr[2*i]] = arr[2*i+1];
}
return res;
}
export function splitString(string, splitters) {
var list = [string];
for(var i=0, len=splitters.length; i<len; i++) {
traverseList(list, splitters[i], 0);
}
return flatten(list);
}
export function traverseList(list, splitter, index) {
if(list[index]) {
if((list.constructor !== String) && (list[index].constructor === String)) {
const splitted = list[index].split(splitter);
if (splitted.length > 1) {
list[index] = splitted;
}
}
(list[index].constructor === Array) ? traverseList(list[index], splitter, 0) : null;
(list.constructor === Array) ? traverseList(list, splitter, index+1) : null;
}
}
export function flatten(arr) {
return arr.reduce(function(acc, val) {
return acc.concat(val.constructor === Array ? flatten(val) : val);
},[]);
}
// Kudos: https://stackoverflow.com/questions/9401312/how-to-replace-curly-quotation-marks-in-a-string-using-javascript#answer-9401374
// Note: Removed em dash (\u2014) replacement to preserve Chinese em dash "——"
export const cleanSpecials = input => input
// .replace(/[‘’]/g, "'") // 注释掉替换卷单引号的逻辑
// .replace(/[“”]/g, '"') // 注释掉替换卷双引号的逻辑
// .replace(/[–]/g, "-"); // 注释掉替换en dash的逻辑,em dash (—) 已在上一个版本中移除替换
/**
* Calculate surprisal (information content) from probability
* @param probability - The probability value (0 < p <= 1)
* @returns Surprisal in bits (using base-2 logarithm)
*/
export function calculateSurprisal(probability: number): number {
return -Math.log2(Math.max(probability, Number.EPSILON));
}
/**
* 计算token的字符数(中文按字,英文按字母)
* 使用Array.from正确处理Unicode字符(包括emoji)
* @param tokenText token文本
* @returns 字符数
*/
export function countTokenCharacters(tokenText: string): number {
// 使用Array.from正确处理Unicode字符(包括中文、emoji等)
return Array.from(tokenText).length;
}
// 复用 TextEncoder 实例,避免每次调用都创建新实例
const textEncoder = new TextEncoder();
/**
* 获取字符串的UTF-8编码字节长度
* @param value 要计算字节长度的字符串
* @returns UTF-8编码的字节数
*/
export const getByteLength = (value: string): number => {
return textEncoder.encode(value).length;
};
/**
* 计算单位字节的surprisal值
* @param surprisal token的总surprisal值
* @param tokenText token文本
* @returns 单位字节的surprisal值(bits/Byte)
*/
function calculateSurprisalPerByte(surprisal: number, tokenText: string): number {
// 按UTF-8编码字节数计算
const byteCount = getByteLength(tokenText);
return byteCount > 0 ? surprisal / byteCount : 0;
}
/**
* 计算信息密度(统一接口,方便将来扩展)
* @param token token对象,包含real_topk和raw字段
* @returns 信息密度值(bits/Byte)
*/
export function calculateSurprisalDensity(token: TokenWithOffset): number {
const [rank, prob] = token.real_topk;
const surprisal = calculateSurprisal(prob);
const tokenText = token.raw;
return calculateSurprisalPerByte(surprisal, tokenText);
}
/**
* 为文本创建字符索引到字节索引的映射表
* @param text 原始文本
* @returns 数组,charToByteIndex[charIndex] = byteIndex
*/
export function buildCharToByteIndexMap(text: string): number[] {
const map: number[] = [];
let byteOffset = 0;
for (let charIndex = 0; charIndex < text.length; charIndex++) {
map[charIndex] = byteOffset;
// 获取当前字符的UTF-8字节长度
const char = text[charIndex];
byteOffset += getByteLength(char);
}
// 添加末尾位置(文本总字节长度)
map[text.length] = byteOffset;
return map;
}
|