| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import { quantileSorted } from 'd3-array'; |
| import { fitLogNormalTruncatedMLE, logNormalExpectedCountInInterval, normCdf, LN_EPS } from './lognormalFit'; |
| import { computeFitQuality } from './fitQuality'; |
|
|
| |
/** Cumulative confidence at which the bin scan stops early and reports the current run of hit bins. */
const SCAN_SATISFACTION_CONFIDENCE = 0.99999;

/** Scan results whose confidence is below this value are rejected by `findSignalThreshold`. */
const MIN_ACCEPTABLE_CONFIDENCE = 0.9;

/** A bin is a "hit" only when its excess (see `computeExcess`) is strictly greater than this. */
const EXCESS_MIN = 0.1;
/** Minimum number of observations a bin must contain; smaller bins are widened. */
const MIN_OBSERVED = 1;
/** Minimum distance between a bin's left edge and the midpoint chosen as its right edge. */
const MIN_BIN_WIDTH = 0.01;
/** Minimum number of usable samples (overall, and in each refinement's noise set). */
const MIN_SAMPLE_SIZE = 20;
/** Fraction of the sorted samples used as the noise set in iteration 0 (1 = all samples). */
const P0 = 1;
/** Highest refinement iteration index; the loop runs iterations 0..MAX_REFINE_ITER inclusive. */
const MAX_REFINE_ITER = 10;
/** Refinement stops once two consecutive thresholds differ by less than this. */
const THRESHOLD_CONVERGE_EPS = 0.01;

/** Sample-rank fraction that selects the bin where the scan starts. */
const START_PERCENTILE_DEFAULT = 0.5;

/** Expected counts at or below this value are treated as zero by `computeExcess`. */
const EXP_IN_BIN_EPS = 1e-10;
|
|
| |
/** Outcome of one bin scan: where the run of hit bins starts and how confident the scan is. */
interface SignalThresholdScanResult {
  /** Left edge of the first bin in the reported run of consecutive hit bins. */
  threshold: number;
  /** One minus the product of `1 - normCdf(excess)` over the bins of that run. */
  confidence: number;
}
|
|
| |
/**
 * Result returned by the threshold finders in this module.
 *
 * NOTE(review): the lowercase type name is part of the exported API and is kept as-is.
 */
export interface signalFitResult {
  /** Detected signal threshold (or the P90 fallback value). */
  threshold: number;

  /** Confidence of the detection; the P90 fallback path reports 0. */
  confidence: number;

  /** `mu` of the last fit produced by `fitLogNormalTruncatedMLE` (0 on the fallback path). */
  mu: number;

  /** `sigma` of the last fit produced by `fitLogNormalTruncatedMLE` (1 on the fallback path). */
  sigma: number;

  /** Every bin with its observed and expected count (empty on the fallback path). */
  bins: SignalThresholdBin[];
}
|
|
/** One scan bin together with its observed and model-expected sample counts. */
export interface SignalThresholdBin {
  /** Left edge of the bin; samples `>= tauLeft` are counted. */
  tauLeft: number;
  /** Right edge of the bin; samples `>= tauRight` belong to a later bin. */
  tauRight: number;
  /** Number of samples observed inside the bin. */
  obsInBin: number;
  /** Count returned by `logNormalExpectedCountInInterval` for the bin under the fit. */
  expInBin: number;
}
|
|
| |
/** Bin geometry and observed count, computed once and reused for every fit iteration. */
interface BinStructure {
  /** Left edge of the bin; samples `>= tauLeft` are counted. */
  tauLeft: number;
  /** Right edge of the bin; samples `>= tauRight` belong to a later bin. */
  tauRight: number;
  /** Number of samples observed inside the bin. */
  obsInBin: number;
}
|
|
/** Padding that puts the first left edge just below the minimum and the last right edge just above the maximum. */
const TAU_RIGHT_EPSILON = 1e-6;

/** Sample fractions at which `logPercentileDiagnostics` refits the distribution. */
const PERCENTILE_DIAGNOSTICS = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1] as const;

/** Characters kept from the start of the token text in the fallback warning. */
const FALLBACK_TEXT_HEAD_LEN = 24;
/** Characters kept from the end of the token text in the fallback warning. */
const FALLBACK_TEXT_TAIL_LEN = 24;
|
|
/**
 * Joins the `raw` text of all tokens, in order, into a single string.
 *
 * @param tokens Tokens whose optional `raw` text is concatenated.
 * @returns The concatenated text; tokens without `raw` contribute nothing.
 */
function concatTokenRawText(tokens: Array<{ raw?: string }>): string {
  return tokens.reduce((text, token) => text + (token.raw ?? ''), '');
}
|
|
| |
/**
 * Shortens `text` to a head excerpt and a tail excerpt separated by an ellipsis line.
 *
 * Lengths are measured in UTF-16 code units, truncated to integers and clamped
 * to zero. Text no longer than `headLen + tailLen + 1` is returned unchanged.
 *
 * @param text    Text to excerpt.
 * @param headLen Number of leading code units to keep.
 * @param tailLen Number of trailing code units to keep.
 * @returns The unchanged text, or `head + "\n……\n" + tail`.
 */
function formatFallbackTextExcerpt(text: string, headLen: number, tailLen: number): string {
  const head = Math.max(0, Math.trunc(headLen));
  const tail = Math.max(0, Math.trunc(tailLen));
  if (text.length <= head + tail + 1) {
    return text;
  }

  const isHighSurrogate = (code: number): boolean => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code: number): boolean => code >= 0xdc00 && code <= 0xdfff;

  // Do not end the head in the middle of a surrogate pair: drop the lone high half.
  let headEnd = head;
  if (
    headEnd > 0 &&
    isHighSurrogate(text.charCodeAt(headEnd - 1)) &&
    isLowSurrogate(text.charCodeAt(headEnd))
  ) {
    headEnd -= 1;
  }

  // An explicit start index is used instead of `slice(-tail)`: `slice(-0)` is
  // `slice(0)` and would return the whole text when the tail length is zero.
  let tailStart = text.length - tail;
  if (
    tail > 0 &&
    isLowSurrogate(text.charCodeAt(tailStart)) &&
    isHighSurrogate(text.charCodeAt(tailStart - 1))
  ) {
    // Do not start the tail in the middle of a surrogate pair: skip the lone low half.
    tailStart += 1;
  }

  return `${text.slice(0, headEnd)}\n……\n${text.slice(tailStart)}`;
}
|
|
| |
/**
 * Standardised excess of the observed count over the expected count:
 * `(obs - exp) / sqrt(exp)`.
 *
 * @param obsInBin Observed count in the bin.
 * @param expInBin Expected count in the bin.
 * @returns The excess. When `expInBin` is at or below `EXP_IN_BIN_EPS` the
 *          result is `Infinity` if anything was observed, otherwise 0.
 */
function computeExcess(obsInBin: number, expInBin: number): number {
  // Keep the `<=` form: a NaN expectation must fall through to the formula.
  if (expInBin <= EXP_IN_BIN_EPS) {
    if (obsInBin > 0) {
      return Number.POSITIVE_INFINITY;
    }
    return 0;
  }
  const surplus = obsInBin - expInBin;
  return surplus / Math.sqrt(expInBin);
}
|
|
| |
/**
 * Logs how the fitted `mu` / `sigma` change as a growing lower share of the
 * samples is fitted, one line per entry of `PERCENTILE_DIAGNOSTICS`.
 *
 * Nothing is logged when fewer than two scores are given or when every fit
 * returns a falsy result.
 *
 * @param scores Scores to diagnose; the array is not modified.
 */
function logPercentileDiagnostics(scores: number[]): void {
  const ascending = Array.from(scores).sort((a, b) => a - b);
  const total = ascending.length;
  if (total < 2) return;

  const fitted: Array<{ p: number; n: number; mu: number; sigma: number }> = [];
  for (const p of PERCENTILE_DIAGNOSTICS) {
    // Number of lowest samples fitted at this percentile, clamped to [1, total].
    const count = Math.max(1, Math.min(total, Math.round(total * p)));
    const lowest = ascending.slice(0, count);
    const largestKept = ascending[count - 1]!;
    // Truncation point: halfway to the next sample, or the largest sample itself.
    const tau = count < total ? (largestKept + ascending[count]!) / 2 : largestKept;
    const fit = fitLogNormalTruncatedMLE(lowest, tau);
    if (fit) {
      fitted.push({ p, n: count, mu: fit.mu, sigma: fit.sigma });
    }
  }
  if (fitted.length === 0) return;

  console.log('[signalThreshold] 渐近一致性诊断 (percentile → μ, σ)');
  for (const row of fitted) {
    console.log(` p=${row.p} n=${row.n}: μ=${row.mu.toFixed(4)}, σ=${row.sigma.toFixed(4)}`);
  }
}
| |
/**
 * Logs one table row per bin: edges, observed and expected count, excess,
 * per-bin confidence, hit marker and the running confidence of the current
 * run of consecutive hits.
 *
 * The running confidence is `1 - Π(1 - normCdf(excess))` over the current run
 * and is reset by any bin that is not a hit.
 *
 * @param bins      Bins with observed and expected counts.
 * @param excessMin A bin is a hit when its excess is strictly greater than this.
 */
function printBinScanLogs(bins: SignalThresholdBin[], excessMin: number): void {
  console.log('[signalThreshold] 完整扫描明细 τ_left | τ_right | obsInBin | expInBin | excess | binConf | hit | confidence');
  let cumulativeFalsePositiveProbability = 1;
  for (const bin of bins) {
    const excess = computeExcess(bin.obsInBin, bin.expInBin);
    const binConfidence = normCdf(excess);
    // Columns shared by hit and non-hit rows.
    const row = `[signalThreshold] ${bin.tauLeft.toFixed(4)} | ${bin.tauRight.toFixed(4)} | ${String(bin.obsInBin).padStart(7)} | ${bin.expInBin.toFixed(1).padStart(8)} | ${excess.toFixed(2).padStart(6)} | ${binConfidence.toFixed(4)}`;
    if (excess > excessMin) {
      cumulativeFalsePositiveProbability *= 1 - binConfidence;
      const confidence = 1 - cumulativeFalsePositiveProbability;
      console.log(`${row} | ✓ | ${confidence.toFixed(4)}`);
    } else {
      // A non-hit ends the run, so the accumulated product starts over.
      cumulativeFalsePositiveProbability = 1;
      console.log(`${row} | | -`);
    }
  }
}
|
|
| |
/**
 * Partitions the samples into consecutive bins `[tauLeft, tauRight)`.
 *
 * Bin edges are taken from the geometric means of neighbouring samples. Each
 * bin is at least `MIN_BIN_WIDTH` wide where such a midpoint exists, and is
 * widened until it holds at least `MIN_OBSERVED` samples. The first bin starts
 * `TAU_RIGHT_EPSILON` below the smallest sample and the last bin ends
 * `TAU_RIGHT_EPSILON` above the largest one.
 *
 * Index lookups use binary search, so the whole pass is O(n log n).
 *
 * @param sorted Samples in ascending order. Assumed to be positive and finite
 *               (the visible caller filters them that way) so that the
 *               midpoints are ascending as well.
 * @returns The bins in ascending order; empty when `sorted` is empty.
 */
function formBinStructures(sorted: number[]): BinStructure[] {
  const n = sorted.length;
  if (n === 0) return [];

  const lastValue = sorted[n - 1]!;
  const mids: number[] = [];
  for (let i = 0; i < n - 1; i++) mids.push(Math.sqrt(sorted[i]! * sorted[i + 1]!));

  // Index of the first element >= bound in an ascending array, or -1 if none.
  const firstIndexAtLeast = (ascending: number[], bound: number): number => {
    let lo = 0;
    let hi = ascending.length;
    while (lo < hi) {
      const mid = (lo + hi) >>> 1;
      if (ascending[mid]! >= bound) {
        hi = mid;
      } else {
        lo = mid + 1;
      }
    }
    return lo < ascending.length ? lo : -1;
  };

  // Samples in [left edge, right edge); a right index of -1 means "up to the end".
  const countBetween = (leftIdx: number, rightIdx: number): number =>
    leftIdx < 0 ? 0 : rightIdx < 0 ? n - leftIdx : rightIdx - leftIdx;

  const structures: BinStructure[] = [];
  let tauLeft = sorted[0]! - TAU_RIGHT_EPSILON;

  while (tauLeft < lastValue) {
    // Right edge: first midpoint at least MIN_BIN_WIDTH away, else past the last sample.
    let midIdx = firstIndexAtLeast(mids, tauLeft + MIN_BIN_WIDTH);
    let tauRight = midIdx >= 0 ? mids[midIdx]! : lastValue + TAU_RIGHT_EPSILON;

    const leftIdx = firstIndexAtLeast(sorted, tauLeft);
    let obsInBin = countBetween(leftIdx, midIdx >= 0 ? firstIndexAtLeast(sorted, tauRight) : -1);

    // Widen the bin midpoint by midpoint until it holds enough samples.
    while (obsInBin < MIN_OBSERVED && midIdx >= 0 && midIdx < mids.length - 1) {
      midIdx++;
      tauRight = mids[midIdx]!;
      obsInBin = countBetween(leftIdx, firstIndexAtLeast(sorted, tauRight));
    }
    if (obsInBin < MIN_OBSERVED) {
      // Still too few: extend to the end of the data, or give up on this bin.
      tauRight = lastValue + TAU_RIGHT_EPSILON;
      obsInBin = countBetween(leftIdx, -1);
      if (obsInBin < MIN_OBSERVED) break;
    }

    structures.push({ tauLeft, tauRight, obsInBin });
    tauLeft = tauRight;
    if (tauRight >= lastValue + TAU_RIGHT_EPSILON) break;
  }
  return structures;
}
|
|
| |
/**
 * Scans the bins from the one containing the `startPercentile` sample rank
 * upwards, looking for a run of consecutive bins whose excess over the fitted
 * expectation is greater than `excessMin`.
 *
 * For a run, confidence is `1 - Π(1 - normCdf(excess))`. The scan returns as
 * soon as that reaches `confidenceThreshold`. A bin that is not a hit discards
 * the current run. If the bins end during a run, that run is returned with
 * whatever confidence it has reached.
 *
 * @param structures          Bins with observed counts, in ascending order.
 * @param n                   Sample count passed to `logNormalExpectedCountInInterval`.
 * @param mu                  Fit parameter passed to `logNormalExpectedCountInInterval`.
 * @param sigma               Fit parameter passed to `logNormalExpectedCountInInterval`.
 * @param excessMin           A bin is a hit when its excess is strictly greater than this.
 * @param confidenceThreshold Confidence at which the scan stops early.
 * @param verbose             When true, logs one table row per scanned bin.
 * @param startPercentile     Sample-rank fraction that selects the first scanned bin.
 * @returns The left edge of the run and its confidence, or `null` when the
 *          scan ends outside a run.
 */
function evaluateBins(
  structures: BinStructure[],
  n: number,
  mu: number,
  sigma: number,
  excessMin: number,
  confidenceThreshold: number,
  verbose: boolean,
  startPercentile: number
): SignalThresholdScanResult | null {
  // Find the bin holding the sample of rank `targetRank`; stay at bin 0 if none does.
  const targetRank = Math.min(Math.floor((n - 1) * startPercentile), n - 1);
  let firstBinToScan = 0;
  let samplesBefore = 0;
  for (let i = 0; i < structures.length; i += 1) {
    const samplesThrough = samplesBefore + structures[i]!.obsInBin;
    if (targetRank < samplesThrough) {
      firstBinToScan = i;
      break;
    }
    samplesBefore = samplesThrough;
  }

  if (verbose) {
    console.log('[signalThreshold] 扫描明细 τ_left | τ_right | obsInBin | expInBin | excess | binConf | hit | confidence');
  }

  let runStart: number | null = null;
  let cumulativeFalsePositiveProbability = 1;

  for (let i = firstBinToScan; i < structures.length; i += 1) {
    const { tauLeft, tauRight, obsInBin } = structures[i]!;
    const expInBin = logNormalExpectedCountInInterval(tauLeft, tauRight, n, mu, sigma);
    const excess = computeExcess(obsInBin, expInBin);
    const binConfidence = normCdf(excess);
    // Columns shared by hit and non-hit rows; only built when it will be printed.
    const rowPrefix = verbose
      ? `[signalThreshold] ${tauLeft.toFixed(4)} | ${tauRight.toFixed(4)} | ${String(obsInBin).padStart(7)} | ${expInBin.toFixed(1).padStart(8)} | ${excess.toFixed(2).padStart(6)} | ${binConfidence.toFixed(4)}`
      : '';

    if (!(excess > excessMin)) {
      // Not a hit (including a NaN excess): the current run is discarded.
      cumulativeFalsePositiveProbability = 1;
      runStart = null;
      if (verbose) {
        console.log(`${rowPrefix} | | -`);
      }
      continue;
    }

    runStart = runStart ?? tauLeft;
    cumulativeFalsePositiveProbability *= 1 - binConfidence;
    const confidence = 1 - cumulativeFalsePositiveProbability;
    if (verbose) {
      console.log(`${rowPrefix} | ✓ | ${confidence.toFixed(4)}`);
    }
    if (confidence >= confidenceThreshold) {
      return { threshold: runStart, confidence };
    }
  }

  // The bins ran out: report an unfinished run if there is one.
  return runStart === null
    ? null
    : { threshold: runStart, confidence: 1 - cumulativeFalsePositiveProbability };
}
|
|
| |
| |
| |
| |
| |
| |
/**
 * Detects a signal threshold in a set of scores by repeatedly fitting the
 * noise with `fitLogNormalTruncatedMLE` and scanning the bins for a run of
 * excess counts.
 *
 * Iteration 0 fits the lowest `P0` share of the samples. Each later iteration
 * refits the samples at or below the previous threshold, until the threshold
 * moves by less than `THRESHOLD_CONVERGE_EPS` or iteration `MAX_REFINE_ITER`
 * has run. Any failing iteration makes the whole call return `null`: too few
 * noise samples, a failed fit, no run found, or a confidence below
 * `MIN_ACCEPTABLE_CONFIDENCE`.
 *
 * @param rawScoresNormed Scores; entries that are not finite numbers greater
 *                        than `LN_EPS` are ignored.
 * @param verbose         When true, progress and diagnostics are logged to the console.
 * @returns Threshold, confidence, last fit parameters and per-bin counts, or
 *          `null` when fewer than `MIN_SAMPLE_SIZE` usable scores remain or an
 *          iteration fails.
 */
export function findSignalThreshold(
  rawScoresNormed: number[],
  verbose = false
): signalFitResult | null {
  const values = rawScoresNormed.filter(
    (s) => typeof s === 'number' && isFinite(s) && s > LN_EPS
  );
  const sorted = [...values].sort((a, b) => a - b);
  const n = sorted.length;

  if (n < MIN_SAMPLE_SIZE) {
    if (verbose) console.log('[signalThreshold] 样本不足 n<', MIN_SAMPLE_SIZE, ',跳过');
    return null;
  }

  // Number of lowest samples used as the noise set in iteration 0.
  const splitIdx = Math.max(1, Math.min(n, Math.round(n * P0)));
  if (verbose) console.log('[signalThreshold] n=', n, 'splitIdx=', splitIdx);

  const binStructures = formBinStructures(sorted);
  let accepted: SignalThresholdScanResult | null = null;
  let latestFit = { mu: 0, sigma: 0 };

  for (let iter = 0; iter <= MAX_REFINE_ITER; iter += 1) {
    if (iter > 0 && accepted === null) return null;

    let noiseSamples: number[];
    let tauBoundary: number;
    if (iter === 0) {
      noiseSamples = sorted.slice(0, splitIdx);
      // Truncation point: halfway to the first excluded sample, or the largest sample.
      tauBoundary = splitIdx < n
        ? (sorted[splitIdx - 1]! + sorted[splitIdx]!) / 2
        : sorted[splitIdx - 1]!;
      if (verbose) {
        const nInit = noiseSamples.length;
        const minN = noiseSamples[0]!;
        const maxN = noiseSamples[nInit - 1]!;
        const midN = noiseSamples[Math.floor(nInit / 2)]!;
        console.log('[signalThreshold] 迭代 0 噪声样本 n=', nInit, 'min=', minN.toFixed(4), 'max=', maxN.toFixed(4), 'median=', midN.toFixed(4));
      }
    } else {
      // Refinement: everything at or below the previous threshold is noise.
      const previousThreshold = accepted?.threshold ?? 0;
      noiseSamples = sorted.filter((x) => x <= previousThreshold);
      tauBoundary = previousThreshold;
      if (noiseSamples.length < MIN_SAMPLE_SIZE) {
        if (verbose) console.log('[signalThreshold] 迭代', iter, '失败:噪声样本数<', MIN_SAMPLE_SIZE);
        return null;
      }
    }

    const fit = fitLogNormalTruncatedMLE(noiseSamples, tauBoundary);
    if (fit === null) {
      if (verbose) console.log('[signalThreshold] 迭代', iter, '失败:拟合失败');
      return null;
    }
    latestFit = { mu: fit.mu, sigma: fit.sigma };

    const q = computeFitQuality(noiseSamples, tauBoundary, fit.mu, fit.sigma);
    if (verbose) {
      console.log('[signalThreshold] 迭代', iter, '拟合 μ=', fit.mu.toFixed(4), 'σ=', fit.sigma.toFixed(4), '| maxDiff=', q.maxDiff.toFixed(4), 'RMSE=', q.rmse.toFixed(4));
      if (iter === 0) {
        console.log('[signalThreshold] 迭代', iter, '从', (START_PERCENTILE_DEFAULT * 100).toFixed(0), '% 分位 bin 开始扫描 (excess>', EXCESS_MIN, ', confidence>=', SCAN_SATISFACTION_CONFIDENCE, ')');
      }
    }

    const scanResult = evaluateBins(
      binStructures,
      n,
      fit.mu,
      fit.sigma,
      EXCESS_MIN,
      SCAN_SATISFACTION_CONFIDENCE,
      verbose,
      START_PERCENTILE_DEFAULT
    );
    if (scanResult === null) {
      if (verbose) console.log('[signalThreshold] 迭代', iter, '失败:未检测到阈值');
      return null;
    }

    // This warning is emitted regardless of `verbose`.
    if (scanResult.confidence < MIN_ACCEPTABLE_CONFIDENCE) {
      console.warn(
        '[signalThreshold] 迭代',
        iter,
        '失败:confidence <',
        MIN_ACCEPTABLE_CONFIDENCE,
        '。当前',
        scanResult.confidence.toFixed(4)
      );
      return null;
    }

    const savedThreshold = accepted?.threshold;
    accepted = scanResult;

    if (iter > 0 && savedThreshold !== undefined) {
      const delta = Math.abs(scanResult.threshold - savedThreshold);
      if (verbose) {
        console.log('[signalThreshold] 迭代', iter, '新阈值=', scanResult.threshold.toFixed(4), 'confidence=', scanResult.confidence.toFixed(2), 'delta=', delta.toFixed(6));
      }
      if (delta < THRESHOLD_CONVERGE_EPS) {
        if (verbose) console.log('[signalThreshold] 迭代', iter, '收敛,最终阈值=', scanResult.threshold.toFixed(4));
        break;
      }
      if (iter === MAX_REFINE_ITER && verbose) {
        console.log('[signalThreshold] 达到最大迭代次数,最终阈值=', scanResult.threshold.toFixed(4));
      }
    } else if (verbose) {
      console.log('[signalThreshold] 迭代 0 检测到阈值', scanResult.threshold.toFixed(4), 'confidence=', scanResult.confidence.toFixed(2));
    }
  }

  // Expected counts for every bin under the last fit, for callers and diagnostics.
  const bins: SignalThresholdBin[] = binStructures.map((s) => ({
    tauLeft: s.tauLeft,
    tauRight: s.tauRight,
    obsInBin: s.obsInBin,
    expInBin: logNormalExpectedCountInInterval(s.tauLeft, s.tauRight, n, latestFit.mu, latestFit.sigma),
  }));
  if (verbose && bins.length > 0) {
    printBinScanLogs(bins, EXCESS_MIN);
    logPercentileDiagnostics(values);
  }

  if (accepted === null) return null;
  return {
    threshold: accepted.threshold,
    confidence: accepted.confidence,
    mu: latestFit.mu,
    sigma: latestFit.sigma,
    bins,
  };
}
|
|
| |
/**
 * Reads the verbose-logging switch from the global `window` object.
 *
 * @returns `true` when `window` exists and `window.signalThresholdVerbose` is
 *          truthy; `false` otherwise, including when there is no `window`.
 */
function getVerboseFromWindow(): boolean {
  if (typeof window === 'undefined') {
    return false;
  }
  const host = window as Window & { signalThresholdVerbose?: boolean };
  return Boolean(host.signalThresholdVerbose);
}
|
|
| |
| |
| |
| |
/**
 * Runs `findSignalThreshold` on the token scores and falls back to the 90th
 * percentile when detection fails.
 *
 * On the fallback path a warning is always emitted. It includes a short
 * excerpt of the concatenated token text when there is any. The returned
 * result then has `confidence` 0, `mu` 0, `sigma` 1 and no bins.
 *
 * @param tokens  Tokens carrying a `score` and optionally their `raw` text.
 * @param verbose When true, a summary line is logged; defaults to the
 *                `window.signalThresholdVerbose` switch.
 * @returns The detected or fallback threshold, or `null` when no token has a
 *          finite numeric score.
 */
export function findSignalThresholdWithLog(
  tokens: Array<{ score: number; raw?: string }>,
  verbose = getVerboseFromWindow()
): signalFitResult | null {
  const rawScoresNormed = tokens.reduce<number[]>((kept, { score }) => {
    if (typeof score === 'number' && Number.isFinite(score)) kept.push(score);
    return kept;
  }, []);
  const total = rawScoresNormed.length;
  if (total === 0) {
    console.warn('[signalThreshold] 无有效分数,跳过阈值');
    return null;
  }

  const detected = findSignalThreshold(rawScoresNormed, verbose);
  if (detected !== null) {
    if (verbose) {
      const t = detected.threshold;
      const below = rawScoresNormed.filter((s) => s < t).length;
      const quantile = below / total;
      console.log(
        '[signalThreshold]',
        `threshold=${t.toFixed(4)} confidence=${detected.confidence.toFixed(2)} (quantile=${quantile.toFixed(4)}, ${below}/${total} below) μ=${detected.mu.toFixed(4)} σ=${detected.sigma.toFixed(4)}`
      );
    }
    return detected;
  }

  // Detection failed: use the 90th percentile of all finite scores instead.
  const ascending = [...rawScoresNormed].sort((a, b) => a - b);
  const p90 = quantileSorted(ascending, 0.9);
  const below = rawScoresNormed.filter((s) => s < p90).length;
  const quantile = below / total;

  const rawText = concatTokenRawText(tokens);
  let textHint = '';
  if (rawText.length > 0) {
    textHint = ` | ${formatFallbackTextExcerpt(rawText, FALLBACK_TEXT_HEAD_LEN, FALLBACK_TEXT_TAIL_LEN)}`;
  }
  console.warn(
    `[signalThreshold] 自动阈值检测失败,已使用 P90 分位作为启发式阈值(confidence=0)${textHint}`
  );
  if (verbose) {
    console.log(
      '[signalThreshold]',
      `threshold=${p90.toFixed(4)} (P90 fallback) confidence=0.00 (quantile=${quantile.toFixed(4)}, ${below}/${total} below) 无截尾对数正态拟合`
    );
  }

  return {
    threshold: p90,
    confidence: 0,
    mu: 0,
    sigma: 1,
    bins: [],
  };
}
|
|