| import type { FrontendAnalyzeResult, FrontendToken } from '../api/GLTR_API'; |
| import { calculateSurprisal, calculateSurprisalDensity, countTokenCharacters, getByteLength } from './Util'; |
| import { extractRealTopkFromTokens } from './tokenUtils'; |
|
|
/**
 * Aggregate statistics for a single analyzed text, in the shape returned by
 * `calculateTextStats`.
 */
export type TextStats = {
  /**
   * Byte length (as reported by `getByteLength`) of the input text cut off at
   * the end offset of the last original token.
   */
  byteCount: number;
  /** Character count (as reported by `countTokenCharacters`) of that same text. */
  charCount: number;
  /** Number of original tokens. */
  tokenCount: number;
  /** One surprisal value per original token, in token order. */
  tokenSurprisals: number[];
  /**
   * One entry per byte of every merged token's raw text; all bytes of a token
   * carry that token's surprisal density.
   */
  byteSurprisals: number[];
  /** Mean of the finite entries of `tokenSurprisals`; `null` when there are none. */
  tokenAverage: number | null;
  /** 90th percentile of the finite entries of `tokenSurprisals`; `null` when there are none. */
  tokenP90: number | null;
  /** Mean of the finite entries of `byteSurprisals`; `null` when there are none. */
  byteAverage: number | null;
  /** Sum of the finite token surprisals; `null` when no token had a finite surprisal. */
  totalSurprisal: number | null;
};
|
|
| |
| |
| |
/**
 * Statistics describing a "diff" text relative to a "base" text, in the shape
 * returned by `calculateDiffStats`.
 */
export type DiffStats = {
  // --- Values copied unchanged from the diff text's own statistics ---
  /** Byte count of the diff text. */
  byteCount: number;
  /** Character count of the diff text. */
  charCount: number;
  /** Token count of the diff text. */
  tokenCount: number;
  /** Per-token surprisals of the diff text. */
  tokenSurprisals: number[];
  /** Average token surprisal of the diff text, or `null` when unavailable. */
  tokenAverage: number | null;

  // --- Differences (diff minus base) ---
  /** Diff total surprisal minus base total surprisal; `null` if either total is `null`. */
  deltaTotalSurprisal: number | null;
  /**
   * Element-wise difference of the per-byte surprisals (diff minus base),
   * limited to the length of the shorter of the two arrays.
   */
  deltaByteSurprisals: number[];
};
|
|
| |
| |
| |
/**
 * Arithmetic mean of the finite numbers in `values`.
 *
 * Non-finite entries (`NaN`, `±Infinity`) are ignored.
 *
 * @param values - Numbers to average; may be `null` or `undefined`.
 * @returns The mean of the finite entries, or `null` when `values` is missing,
 *          empty, or contains no finite entry.
 */
export const computeAverage = (values: number[] | null | undefined): number | null => {
  if (values == null) {
    return null;
  }

  // Accumulate in a single pass, in array order.
  let total = 0;
  let finiteCount = 0;
  for (const candidate of values) {
    if (Number.isFinite(candidate)) {
      total += candidate;
      finiteCount += 1;
    }
  }

  return finiteCount > 0 ? total / finiteCount : null;
};
|
|
| |
| |
| |
| |
/**
 * Computes one surprisal value per merged token.
 *
 * For each token, the element at index 1 of its entry in the array returned by
 * `extractRealTopkFromTokens` is passed to `calculateSurprisal`.
 *
 * @param bpeBpeMergedTokens - Merged tokens to evaluate.
 * @returns Surprisals in token order; an empty array for empty input.
 */
export function calculateMergedTokenSurprisals(bpeBpeMergedTokens: FrontendToken[]): number[] {
  const tokenTotal = bpeBpeMergedTokens.length;
  if (tokenTotal === 0) {
    return [];
  }

  const realTopkEntries = extractRealTopkFromTokens(bpeBpeMergedTokens);
  const surprisals: number[] = [];
  for (let position = 0; position < tokenTotal; position++) {
    surprisals.push(calculateSurprisal(realTopkEntries[position][1]));
  }
  return surprisals;
}
|
|
| |
/**
 * 90th percentile of the finite numbers in `values`, using linear
 * interpolation between the two closest ranks.
 *
 * Non-finite entries are ignored and the input array is not modified.
 *
 * @param values - Numbers to summarize; may be `null` or `undefined`.
 * @returns The interpolated 90th percentile, or `null` when `values` is
 *          missing, empty, or contains no finite entry.
 */
export const computeP90 = (values: number[] | null | undefined): number | null => {
  if (values == null) {
    return null;
  }

  // Collect the finite entries into a private array so sorting cannot touch the input.
  const finite: number[] = [];
  for (const candidate of values) {
    if (Number.isFinite(candidate)) {
      finite.push(candidate);
    }
  }
  if (finite.length === 0) {
    return null;
  }
  finite.sort((x, y) => x - y);

  // Fractional rank of the 90th percentile within the sorted sample.
  const rank = (finite.length - 1) * 0.9;
  const below = Math.floor(rank);
  const above = Math.ceil(rank);
  if (below === above) {
    // The rank is a whole index, so no interpolation is needed.
    return finite[below];
  }

  const fraction = rank - below;
  return finite[below] * (1 - fraction) + finite[above] * fraction;
};
|
|
| |
| |
| |
/**
 * Builds the `TextStats` summary for one analysis result.
 *
 * Only the part of `originalText` up to the end offset of the last original
 * token is measured for the byte and character counts.
 *
 * @param result - Analysis result providing `originalTokens` and `bpeBpeMergedTokens`.
 * @param originalText - The text that was analyzed.
 * @returns Counts, per-token and per-byte surprisals, and their aggregates.
 *          `totalSurprisal` is `null` when no token has a finite surprisal.
 */
export const calculateTextStats = (
  result: FrontendAnalyzeResult,
  originalText: string
): TextStats => {
  const { originalTokens, bpeBpeMergedTokens } = result;
  const realTopkOriginal = extractRealTopkFromTokens(originalTokens);

  // Measure only the text the tokens actually cover: everything up to the end
  // offset of the last token (nothing when there are no tokens).
  const coveredLength =
    originalTokens.length > 0 ? originalTokens[originalTokens.length - 1].offset[1] : 0;
  const coveredText = originalText.slice(0, coveredLength);

  const byteCount = getByteLength(coveredText);
  const charCount = countTokenCharacters(coveredText);
  const tokenCount = originalTokens.length;

  // Per-token surprisal, plus the running total over the finite values only.
  const tokenSurprisals: number[] = [];
  let totalSurprisal = 0;
  let hasValidTotal = false;
  for (let index = 0; index < originalTokens.length; index++) {
    const surprisal = calculateSurprisal(realTopkOriginal[index][1]);
    tokenSurprisals.push(surprisal);
    if (Number.isFinite(surprisal)) {
      totalSurprisal += surprisal;
      hasValidTotal = true;
    }
  }

  // Per-byte surprisal: every byte of a merged token's raw text carries that
  // token's surprisal density.
  const byteSurprisals: number[] = [];
  for (const token of bpeBpeMergedTokens) {
    const byteCountForToken = getByteLength(token.raw);
    const byteSurprisal = calculateSurprisalDensity(token);
    for (let i = 0; i < byteCountForToken; i++) {
      byteSurprisals.push(byteSurprisal);
    }
  }

  return {
    byteCount,
    charCount,
    tokenCount,
    tokenSurprisals,
    byteSurprisals,
    tokenAverage: computeAverage(tokenSurprisals),
    tokenP90: computeP90(tokenSurprisals),
    byteAverage: computeAverage(byteSurprisals),
    totalSurprisal: hasValidTotal ? totalSurprisal : null
  };
};
|
|
| |
| |
| |
| |
| |
| |
/**
 * Derives the statistics of a "diff" text relative to a "base" text.
 *
 * Counts and token surprisals are taken unchanged from `diffStats`; the delta
 * fields are computed as diff minus base.
 *
 * @param diffStats - Statistics of the diff text.
 * @param baseStats - Statistics of the base text.
 * @returns The diff text's own counts plus `deltaTotalSurprisal` (`null` if
 *          either total is `null`) and `deltaByteSurprisals` (element-wise,
 *          limited to the shorter of the two per-byte arrays).
 */
export const calculateDiffStats = (
  diffStats: TextStats,
  baseStats: TextStats
): DiffStats => {
  const { totalSurprisal: diffTotal, byteSurprisals: diffBytes } = diffStats;
  const { totalSurprisal: baseTotal, byteSurprisals: baseBytes } = baseStats;

  // The total delta is only defined when both sides have a total.
  let deltaTotalSurprisal: number | null = null;
  if (diffTotal !== null && baseTotal !== null) {
    deltaTotalSurprisal = diffTotal - baseTotal;
  }

  // Compare byte positions pairwise over the range both arrays cover.
  const sharedLength = Math.min(diffBytes.length, baseBytes.length);
  const deltaByteSurprisals: number[] = Array.from(
    { length: sharedLength },
    (_unused, position) => diffBytes[position] - baseBytes[position]
  );

  return {
    byteCount: diffStats.byteCount,
    charCount: diffStats.charCount,
    tokenCount: diffStats.tokenCount,
    tokenSurprisals: diffStats.tokenSurprisals,
    tokenAverage: diffStats.tokenAverage,
    deltaTotalSurprisal,
    deltaByteSurprisals
  };
};
|
|
|
|