| |
| |
| |
| |
| |
| |
|
|
| |
/**
 * Options bag carrying a single optional boolean flag, `digitMerge`.
 *
 * NOTE(review): presumably toggles the digit-merge stage of a pipeline; no
 * consumer of this type is visible in this file, so confirm against callers.
 */
export type DigitMergePipelineOptions = { digitMerge?: boolean };
|
|
| |
/**
 * Counts the Unicode code points in `text`.
 *
 * String iteration walks code points rather than UTF-16 code units, so a
 * character stored as a surrogate pair counts once. A falsy `text` is
 * treated as the empty string.
 *
 * @param text - String to measure.
 * @returns Number of code points in `text`.
 */
export function codePointLength(text: string): number {
  let count = 0;
  for (const _codePoint of text || '') {
    count += 1;
  }
  return count;
}
|
|
| |
| |
| |
/**
 * Returns the substring of `text` between two code-point offsets.
 *
 * Offsets index code points (not UTF-16 code units); `start` is inclusive
 * and `end` is exclusive. Both are clamped into `[0, length]`, and `end` is
 * never allowed to fall below the clamped `start`, so out-of-range or
 * inverted ranges yield an empty string instead of throwing.
 *
 * @param text - Source string; a falsy value is treated as empty.
 * @param start - Inclusive code-point offset where the slice begins.
 * @param end - Exclusive code-point offset where the slice ends.
 * @returns The selected code points joined back into a string.
 */
export function sliceTextByCodePointOffsets(text: string, start: number, end: number): string {
  const codePoints = [...(text || '')];
  const total = codePoints.length;
  if (total === 0) return '';

  const from = Math.max(0, Math.min(start, total));
  const to = Math.max(from, Math.min(end, total));
  // An empty (or NaN) range produces '' rather than reaching slice().
  return from < to ? codePoints.slice(from, to).join('') : '';
}
|
|
| |
| |
| |
/**
 * Removes tokens that are both zero-width and textless.
 *
 * A token is dropped only when its offset start equals its end AND its
 * `raw` is empty (a missing `raw` counts as empty). Zero-width tokens that
 * still carry text, and non-empty ranges with empty text, are kept.
 *
 * @param tokens - Tokens to filter; the input array is not modified.
 * @returns A new array containing the surviving tokens in original order.
 */
export function dropEmptyZeroWidthTokens<T extends { offset: [number, number]; raw?: string }>(tokens: T[]): T[] {
  return tokens.filter((token) => {
    const [from, to] = token.offset;
    const hasWidth = from !== to;
    const hasText = (token.raw ?? '') !== '';
    return hasWidth || hasText;
  });
}
|
|
/**
 * Tells whether `c` is exactly one ASCII digit (`'0'`..`'9'`).
 *
 * Strings of any other length are rejected, and so are non-ASCII digits
 * such as Arabic-Indic or full-width numerals.
 *
 * @param c - Candidate single-character string.
 * @returns `true` only for the ten ASCII digit characters.
 */
function isAsciiDigitCodePoint(c: string): boolean {
  if (c.length !== 1) return false;
  const code = c.charCodeAt(0);
  return code >= 0x30 && code <= 0x39; // '0' is 0x30, '9' is 0x39
}
|
|
| |
| |
| |
| |
| |
/**
 * Finds every maximal run of ASCII digits in `text`, in code-point offsets.
 *
 * Each result is a `[start, end)` pair where `end` is the offset just past
 * the last digit. When the code point immediately before a run is a single
 * ASCII space, that space is included and `start` points at it.
 * For example `"ab 12 3"` yields `[[2, 5], [5, 7]]`.
 *
 * @param text - Text to scan; a falsy value is treated as empty.
 * @returns Digit spans in ascending order.
 */
export function asciiDigitSpanRangesByCodePoint(text: string): [number, number][] {
  const codePoints = Array.from(text || '');
  const total = codePoints.length;
  const ranges: [number, number][] = [];

  let runStart = -1; // -1 while the scan is outside a digit run
  // Iterate one position past the end so a run touching the end is flushed.
  for (let pos = 0; pos <= total; pos++) {
    const onDigit = pos < total && isAsciiDigitCodePoint(codePoints[pos]!);
    if (onDigit) {
      if (runStart < 0) runStart = pos;
      continue;
    }
    if (runStart < 0) continue;

    const hasLeadingSpace = runStart > 0 && codePoints[runStart - 1] === ' ';
    ranges.push([hasLeadingSpace ? runStart - 1 : runStart, pos]);
    runStart = -1;
  }
  return ranges;
}
|
|
| |
| |
| |
| |
/**
 * Finds the run of consecutive tokens that tiles the span `[ms, me)` exactly.
 *
 * Tokens whose end is at or before `ms` are skipped. The first remaining
 * token must start exactly at `ms`, every following token must start where
 * the previous one ended, no token may extend past `me`, and the run must
 * end exactly at `me`. Any violation means the span does not line up with
 * token boundaries and `null` is returned.
 *
 * @param tokens - Tokens carrying `[start, end]` offsets, scanned in array order.
 * @param ms - Span start offset.
 * @param me - Span end offset.
 * @returns Indices of the covering tokens in order, or `null` when no exact tiling exists.
 */
function tokenIndicesCoveringSpan<T extends { offset: [number, number] }>(tokens: T[], ms: number, me: number): number[] | null {
  // First token that is not entirely at or before the span start.
  const first = tokens.findIndex((tok) => !(tok.offset[1] <= ms));
  if (first === -1) return null;

  const covered: number[] = [];
  let expectedStart = ms;
  for (let idx = first; idx < tokens.length; idx++) {
    const [tokStart, tokEnd] = tokens[idx]!.offset;
    // Each token must begin exactly where the previous one stopped.
    if (tokStart !== expectedStart) return null;
    if (tokStart < ms || tokEnd > me) return null;
    covered.push(idx);
    if (tokEnd === me) return covered;
    expectedStart = tokEnd;
  }
  // Ran out of tokens before reaching the span end.
  return null;
}
|
|
| |
| |
| |
| |
/**
 * Partitions token indices into groups, putting tokens that jointly tile one
 * ASCII digit span of `originalText` into the same group.
 *
 * Digit spans come from `asciiDigitSpanRangesByCodePoint`. A span produces a
 * multi-token group only when `tokenIndicesCoveringSpan` finds an exact
 * tiling of at least two tokens; spans that do not align with token
 * boundaries, or that are covered by a single token, are ignored. Every
 * other token ends up in a singleton group.
 *
 * @param originalText - Text the token offsets refer to.
 * @param tokens - Tokens carrying `[start, end]` offsets.
 * @returns Groups of token indices, in ascending index order, covering every token exactly once.
 * @throws Error if a token would be assigned to two different digit spans.
 */
export function digitMergeIndexGroupsByText<T extends { offset: [number, number] }>(
  originalText: string,
  tokens: T[]
): number[][] {
  const tokenCount = tokens.length;
  if (tokenCount === 0) return [];

  // spanIdByToken[i] is the id of the digit span token i belongs to, or null.
  const spanIdByToken: (number | null)[] = Array.from({ length: tokenCount }, () => null);
  let spanCounter = 0;

  for (const [spanStart, spanEnd] of asciiDigitSpanRangesByCodePoint(originalText)) {
    const covering = tokenIndicesCoveringSpan(tokens, spanStart, spanEnd);
    // Nothing to merge unless at least two tokens tile the span exactly.
    if (covering === null || covering.length < 2) continue;

    const spanId = spanCounter++;
    for (const tokenIndex of covering) {
      const previousSpanId = spanIdByToken[tokenIndex];
      if (previousSpanId !== null) {
        // Message reads: token index N falls into two digit spans.
        const tok = tokens[tokenIndex]!;
        throw new Error(
          `digitMerge: token 下标 ${tokenIndex} 重复落入两段数字区间(offset=[${tok.offset[0]},${tok.offset[1]}),先前段 id=${previousSpanId})`
        );
      }
      spanIdByToken[tokenIndex] = spanId;
    }
  }

  // Walk the tags once: a token joins the open group only when it shares a
  // non-null span id with it; otherwise it opens a new group.
  const groups: number[][] = [];
  let openGroup: number[] | null = null;
  let openSpanId: number | null = null;
  for (let idx = 0; idx < tokenCount; idx++) {
    const spanId = spanIdByToken[idx] ?? null;
    if (openGroup !== null && spanId !== null && spanId === openSpanId) {
      openGroup.push(idx);
      continue;
    }
    openGroup = [idx];
    openSpanId = spanId;
    groups.push(openGroup);
  }
  return groups;
}
|
|
/**
 * Callbacks that let `mergeSequentialOverlap` operate on any token shape.
 */
export type SequentialOverlapOptions<T> = {
  /** Returns the token's `[start, end]` offsets. Expected to be a side-effect-free read. */
  getOffset: (t: T) => [number, number];
  /** Produces the working copy that is used in place of each input token. */
  cloneForStep: (t: T) => T;
  /** Builds the token that replaces `current` once it has absorbed `next`. */
  mergeOverlappingPair: (current: T, next: T, mergedOffset: [number, number], mergedRaw: string) => T;
  /** Returns the raw text for the merged range; called with the merged start and end. */
  sliceMergedRaw: (start: number, end: number) => string;
};

/**
 * Folds a token sequence left to right, merging each token into its
 * predecessor whenever the two overlap.
 *
 * Two neighbours are considered overlapping when:
 * - the next token starts before the current token ends, or
 * - the current token is zero-width and the next token starts at that same
 *   position (whether the next token is itself zero-width or not).
 *
 * A merged token keeps the current token's start and takes the larger of
 * the two ends; its raw text comes from `sliceMergedRaw` and the token
 * itself from `mergeOverlappingPair`. The merged token then becomes the
 * current token, so chains of overlaps collapse into one output token.
 *
 * Each token's offset is read once per step (two `getOffset` calls per
 * input token after the first).
 *
 * @param tokens - Tokens in sequence order; every element is passed through `cloneForStep` before use.
 * @param options - Accessors and builders for the token type.
 * @returns The merged sequence, or `[]` when `tokens` is empty or not an array.
 */
export function mergeSequentialOverlap<T>(tokens: T[], options: SequentialOverlapOptions<T>): T[] {
  if (!Array.isArray(tokens) || tokens.length === 0) {
    return [];
  }
  const { getOffset, cloneForStep, mergeOverlappingPair, sliceMergedRaw } = options;

  const merged: T[] = [];
  let current = cloneForStep(tokens[0]!);
  for (let k = 1; k < tokens.length; k++) {
    const next = cloneForStep(tokens[k]!);
    const [nextStart, nextEnd] = getOffset(next);
    const [curStart, curEnd] = getOffset(current);

    // A zero-width current token has no interior to overlap with, so it is
    // merged with a following token that begins at the very same position.
    const sharesZeroWidthPosition =
      curStart === curEnd && nextStart === curStart && nextEnd >= curStart;
    const overlapping = nextStart < curEnd || sharesZeroWidthPosition;

    if (!overlapping) {
      merged.push(current);
      current = next;
      continue;
    }

    const mergedEnd = Math.max(curEnd, nextEnd);
    const mergedOffset: [number, number] = [curStart, mergedEnd];
    const mergedRaw = sliceMergedRaw(curStart, mergedEnd);
    current = mergeOverlappingPair(current, next, mergedOffset, mergedRaw);
  }
  merged.push(current);
  return merged;
}
|
|
| |
| |
| |
/**
 * Builds the combined list of source parts for two tokens being merged.
 *
 * For each token, its existing `bpe_merge_parts` is used when present;
 * otherwise the token contributes a single part, the slice of `text`
 * addressed by its code-point offsets. The result lists the current
 * token's parts first, followed by the next token's parts.
 *
 * @param text - Text the token offsets refer to.
 * @param current - Token that is absorbing `next`.
 * @param next - Token being absorbed.
 * @returns A new array with the parts of `current` followed by the parts of `next`.
 */
export function mergeSourcePartsForOverlapPair<T extends { offset: [number, number]; bpe_merge_parts?: string[] }>(
  text: string,
  current: T,
  next: T
): string[] {
  const partsOf = (tok: T): string[] => {
    if (tok.bpe_merge_parts != null) return tok.bpe_merge_parts;
    const [from, to] = tok.offset;
    return [sliceTextByCodePointOffsets(text, from, to)];
  };

  const leading = partsOf(current);
  const trailing = partsOf(next);
  return [...leading, ...trailing];
}
|
|
| |
| |
| |
/**
 * Collects the source parts of every token in a group into one flat list.
 *
 * Tokens are visited in the order given by `group`. A token contributes its
 * `bpe_merge_parts` when it has them, and otherwise its `raw` text as a
 * single part.
 *
 * @param group - Indices into `tokens`, in the order parts should appear.
 * @param tokens - Token list the indices refer to.
 * @returns A new flat array of parts.
 */
export function flattenMergePartsForDigitGroup<T extends { raw: string; bpe_merge_parts?: string[] }>(
  group: number[],
  tokens: T[]
): string[] {
  const parts: string[] = [];
  for (const tokenIndex of group) {
    const token = tokens[tokenIndex]!;
    const pieces = token.bpe_merge_parts ?? [token.raw];
    for (const piece of pieces) {
      parts.push(piece);
    }
  }
  return parts;
}
|
|