| import { |
| SPAN_REGEX, |
| COMPOSITE_REGEX, |
| STANDALONE_PATTERN, |
| CLEANUP_REGEX, |
| INVALID_CITATION_REGEX, |
| } from '../citations'; |
|
|
| describe('Citation Regex Patterns', () => { |
| beforeEach(() => { |
| |
| SPAN_REGEX.lastIndex = 0; |
| COMPOSITE_REGEX.lastIndex = 0; |
| STANDALONE_PATTERN.lastIndex = 0; |
| CLEANUP_REGEX.lastIndex = 0; |
| INVALID_CITATION_REGEX.lastIndex = 0; |
| }); |
|
|
| describe('STANDALONE_PATTERN', () => { |
| describe('literal text format (\\ue202)', () => { |
| it('should match literal text search citation', () => { |
| const text = 'Some fact \\ue202turn0search0 here'; |
| STANDALONE_PATTERN.lastIndex = 0; |
| const match = STANDALONE_PATTERN.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[1]).toBe('0'); |
| expect(match?.[2]).toBe('search'); |
| expect(match?.[3]).toBe('0'); |
| }); |
|
|
| it('should match literal text file citation', () => { |
| const text = 'Document says \\ue202turn0file0 (doc.pdf)'; |
| STANDALONE_PATTERN.lastIndex = 0; |
| const match = STANDALONE_PATTERN.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[1]).toBe('0'); |
| expect(match?.[2]).toBe('file'); |
| expect(match?.[3]).toBe('0'); |
| }); |
|
|
| it('should match literal text news citation', () => { |
| const text = 'Breaking news \\ue202turn0news1'; |
| STANDALONE_PATTERN.lastIndex = 0; |
| const match = STANDALONE_PATTERN.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[1]).toBe('0'); |
| expect(match?.[2]).toBe('news'); |
| expect(match?.[3]).toBe('1'); |
| }); |
|
|
| it('should match multiple literal text citations', () => { |
| const text = 'Fact one \\ue202turn0search0 and fact two \\ue202turn0file1'; |
| const matches: RegExpExecArray[] = []; |
| let match: RegExpExecArray | null; |
| STANDALONE_PATTERN.lastIndex = 0; |
| while ((match = STANDALONE_PATTERN.exec(text)) !== null) { |
| matches.push(match); |
| } |
| expect(matches).toHaveLength(2); |
| expect(matches[0][2]).toBe('search'); |
| expect(matches[1][2]).toBe('file'); |
| }); |
|
|
| it('should match all supported types in literal text format', () => { |
| const types = ['search', 'image', 'news', 'video', 'ref', 'file']; |
| for (const type of types) { |
| const text = `Test \\ue202turn0${type}0`; |
| STANDALONE_PATTERN.lastIndex = 0; |
| const match = STANDALONE_PATTERN.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[2]).toBe(type); |
| } |
| }); |
| }); |
|
|
| describe('actual Unicode character format (U+E202)', () => { |
| it('should match actual Unicode search citation', () => { |
| const text = 'Some fact \ue202turn0search0 here'; |
| STANDALONE_PATTERN.lastIndex = 0; |
| const match = STANDALONE_PATTERN.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[1]).toBe('0'); |
| expect(match?.[2]).toBe('search'); |
| expect(match?.[3]).toBe('0'); |
| }); |
|
|
| it('should match actual Unicode file citation', () => { |
| const text = 'Document says \ue202turn0file0 (doc.pdf)'; |
| STANDALONE_PATTERN.lastIndex = 0; |
| const match = STANDALONE_PATTERN.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[1]).toBe('0'); |
| expect(match?.[2]).toBe('file'); |
| expect(match?.[3]).toBe('0'); |
| }); |
|
|
| it('should match all supported types in actual Unicode format', () => { |
| const types = ['search', 'image', 'news', 'video', 'ref', 'file']; |
| for (const type of types) { |
| const text = `Test \ue202turn0${type}0`; |
| STANDALONE_PATTERN.lastIndex = 0; |
| const match = STANDALONE_PATTERN.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[2]).toBe(type); |
| } |
| }); |
| }); |
|
|
| describe('mixed format handling', () => { |
| it('should match both formats in the same text', () => { |
| const text = 'Literal \\ue202turn0search0 and Unicode \ue202turn0file1'; |
| const matches: RegExpExecArray[] = []; |
| let match: RegExpExecArray | null; |
| STANDALONE_PATTERN.lastIndex = 0; |
| while ((match = STANDALONE_PATTERN.exec(text)) !== null) { |
| matches.push(match); |
| } |
| expect(matches).toHaveLength(2); |
| expect(matches[0][2]).toBe('search'); |
| expect(matches[1][2]).toBe('file'); |
| }); |
| }); |
| }); |
|
|
| describe('SPAN_REGEX', () => { |
| it('should match literal text span markers', () => { |
| const text = 'Before \\ue203highlighted text\\ue204 after'; |
| SPAN_REGEX.lastIndex = 0; |
| const match = SPAN_REGEX.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[0]).toContain('highlighted text'); |
| }); |
|
|
| it('should match actual Unicode span markers', () => { |
| const text = 'Before \ue203highlighted text\ue204 after'; |
| SPAN_REGEX.lastIndex = 0; |
| const match = SPAN_REGEX.exec(text); |
| expect(match).not.toBeNull(); |
| expect(match?.[0]).toContain('highlighted text'); |
| }); |
| }); |
|
|
| describe('COMPOSITE_REGEX', () => { |
| it('should match literal text composite markers', () => { |
| const text = 'Statement \\ue200\\ue202turn0search0\\ue202turn0news0\\ue201'; |
| COMPOSITE_REGEX.lastIndex = 0; |
| const match = COMPOSITE_REGEX.exec(text); |
| expect(match).not.toBeNull(); |
| }); |
|
|
| it('should match actual Unicode composite markers', () => { |
| const text = 'Statement \ue200\ue202turn0search0\ue202turn0news0\ue201'; |
| COMPOSITE_REGEX.lastIndex = 0; |
| const match = COMPOSITE_REGEX.exec(text); |
| expect(match).not.toBeNull(); |
| }); |
| }); |
|
|
| describe('CLEANUP_REGEX', () => { |
| it('should clean up literal text markers', () => { |
| const text = '\\ue200\\ue201\\ue202\\ue203\\ue204\\ue206'; |
| const cleaned = text.replace(CLEANUP_REGEX, ''); |
| expect(cleaned).toBe(''); |
| }); |
|
|
| it('should clean up actual Unicode markers', () => { |
| const text = '\ue200\ue201\ue202\ue203\ue204\ue206'; |
| const cleaned = text.replace(CLEANUP_REGEX, ''); |
| expect(cleaned).toBe(''); |
| }); |
|
|
| it('should preserve normal text while cleaning markers', () => { |
| const text = 'Hello \\ue202turn0search0 world'; |
| const cleaned = text.replace(CLEANUP_REGEX, ''); |
| expect(cleaned).toBe('Hello turn0search0 world'); |
| }); |
| }); |
|
|
| describe('INVALID_CITATION_REGEX', () => { |
| it('should match invalid literal text citations with leading whitespace', () => { |
| const text = 'Text \\ue202turn0search5'; |
| INVALID_CITATION_REGEX.lastIndex = 0; |
| const match = INVALID_CITATION_REGEX.exec(text); |
| expect(match).not.toBeNull(); |
| }); |
|
|
| it('should match invalid actual Unicode citations with leading whitespace', () => { |
| const text = 'Text \ue202turn0search5'; |
| INVALID_CITATION_REGEX.lastIndex = 0; |
| const match = INVALID_CITATION_REGEX.exec(text); |
| expect(match).not.toBeNull(); |
| }); |
| }); |
|
|
| describe('Integration: Full Citation Processing Flow', () => { |
| |
| |
| |
| const processFullCitationFlow = (text: string) => { |
| |
| const spans: Array<{ content: string; position: number }> = []; |
| let spanMatch; |
| const spanRegex = new RegExp(SPAN_REGEX.source, 'g'); |
| while ((spanMatch = spanRegex.exec(text)) !== null) { |
| const content = spanMatch[0].replace(/\\ue203|\\ue204|\ue203|\ue204/g, ''); |
| spans.push({ content, position: spanMatch.index }); |
| } |
|
|
| |
| const composites: Array<{ citations: string[]; position: number }> = []; |
| let compMatch; |
| const compRegex = new RegExp(COMPOSITE_REGEX.source, 'g'); |
| while ((compMatch = compRegex.exec(text)) !== null) { |
| const block = compMatch[0]; |
| const citations: string[] = []; |
| let citMatch; |
| const citRegex = new RegExp(STANDALONE_PATTERN.source, 'g'); |
| while ((citMatch = citRegex.exec(block)) !== null) { |
| citations.push(`turn${citMatch[1]}${citMatch[2]}${citMatch[3]}`); |
| } |
| composites.push({ citations, position: compMatch.index }); |
| } |
|
|
| |
| const standalones: Array<{ citation: string; position: number }> = []; |
| let standMatch; |
| const standRegex = new RegExp(STANDALONE_PATTERN.source, 'g'); |
| while ((standMatch = standRegex.exec(text)) !== null) { |
| |
| const isInComposite = composites.some( |
| (c) => standMatch && standMatch.index >= c.position && standMatch.index < c.position + 50, |
| ); |
| if (!isInComposite) { |
| standalones.push({ |
| citation: `turn${standMatch[1]}${standMatch[2]}${standMatch[3]}`, |
| position: standMatch.index, |
| }); |
| } |
| } |
|
|
| |
| const cleanedText = text.replace(INVALID_CITATION_REGEX, '').replace(CLEANUP_REGEX, ''); |
|
|
| return { spans, composites, standalones, cleanedText }; |
| }; |
|
|
    describe('literal text format integration', () => {
      it('should process complex LLM response with multiple citation types', () => {
        // Fixture mixes a highlighted span, standalone citations, and one
        // composite block, all in the escaped "\ue2xx" text encoding.
        const llmResponse = `Here's what I found about the topic.

\\ue203This is an important quote from the source.\\ue204 \\ue202turn0search0

The data shows several key findings \\ue202turn0search1 including:
- First finding \\ue202turn0news0
- Second finding \\ue200\\ue202turn0search2\\ue202turn0file0\\ue201

For more details, see the attached document \\ue202turn0file1.`;

        const result = processFullCitationFlow(llmResponse);

        // One highlighted span, with the \ue203/\ue204 markers stripped out.
        expect(result.spans).toHaveLength(1);
        expect(result.spans[0].content).toBe('This is an important quote from the source.');

        // One composite block grouping two citations.
        expect(result.composites).toHaveLength(1);
        expect(result.composites[0].citations).toEqual(['turn0search2', 'turn0file0']);

        // turn0search0, turn0search1, turn0news0 and turn0file1 are standalone.
        expect(result.standalones.length).toBeGreaterThanOrEqual(3);

        // Cleanup must leave no marker sequences in the final text.
        expect(result.cleanedText).not.toContain('\\ue202');
        expect(result.cleanedText).not.toContain('\\ue200');
      });

      it('should handle file citations from document search', () => {
        // Simulated file-search answer: bulleted standalone file citations
        // plus a trailing composite that groups three file citations.
        const fileSearchResponse = `Based on the document medical-anthem-blue-cross.pdf:

- **Annual deductible:** $3,300 per person \\ue202turn0file0
- **Out-of-pocket maximum:** $4,000 per person \\ue202turn0file0
- **Network:** Prudent Buyer PPO \\ue202turn0file1

Multiple sources confirm these details. \\ue200\\ue202turn0file0\\ue202turn0file1\\ue202turn0file2\\ue201`;

        const result = processFullCitationFlow(fileSearchResponse);

        // The composite contributes its three grouped file citations.
        expect(result.composites).toHaveLength(1);
        expect(result.composites[0].citations).toHaveLength(3);

        // The bulleted lines carry the standalone file citations.
        const fileCitations = result.standalones.filter((s) => s.citation.includes('file'));
        expect(fileCitations.length).toBeGreaterThanOrEqual(2);
      });
    });
|
|
    describe('actual Unicode format integration', () => {
      it('should process response with actual Unicode characters', () => {
        // Same flow as the literal-format test, but the fixture uses real
        // U+E2xx private-use characters rather than escaped "\ue2xx" text.
        const llmResponse = `Research findings indicate the following:

\ue203Key insight from the study.\ue204 \ue202turn0search0

Additional context \ue202turn0news0 supports this conclusion \ue200\ue202turn0search1\ue202turn0ref0\ue201.`;

        const result = processFullCitationFlow(llmResponse);

        // One span, one composite, at least one standalone citation,
        // and no marker characters left after cleanup.
        expect(result.spans).toHaveLength(1);
        expect(result.composites).toHaveLength(1);
        expect(result.standalones.length).toBeGreaterThanOrEqual(1);
        expect(result.cleanedText).not.toContain('\ue202');
      });
    });
|
|
    describe('mixed format integration', () => {
      it('should handle mixed literal and Unicode formats in same response', () => {
        // Both encodings appear in one response, including a composite whose
        // delimiters are literal text but whose second inner marker is an
        // actual Unicode character.
        const mixedResponse = `First citation uses literal \\ue202turn0search0 format.
Second citation uses Unicode \ue202turn0search1 format.
Composite with mixed: \\ue200\\ue202turn0file0\ue202turn0file1\\ue201`;

        const result = processFullCitationFlow(mixedResponse);

        // Two standalones outside the composite; the composite yields both
        // of its inner citations regardless of encoding.
        expect(result.standalones.length).toBeGreaterThanOrEqual(2);
        expect(result.composites).toHaveLength(1);
        expect(result.composites[0].citations).toHaveLength(2);
      });
    });
| }); |
|
|
| describe('Performance: Regex Benchmarks', () => { |
| |
| |
| |
| const generateCitationHeavyText = (citationCount: number, format: 'literal' | 'unicode') => { |
| const marker = format === 'literal' ? '\\ue202' : '\ue202'; |
| const spanStart = format === 'literal' ? '\\ue203' : '\ue203'; |
| const spanEnd = format === 'literal' ? '\\ue204' : '\ue204'; |
| const compStart = format === 'literal' ? '\\ue200' : '\ue200'; |
| const compEnd = format === 'literal' ? '\\ue201' : '\ue201'; |
|
|
| const types = ['search', 'news', 'file', 'ref', 'image', 'video']; |
| let text = ''; |
|
|
| for (let i = 0; i < citationCount; i++) { |
| const type = types[i % types.length]; |
| const turn = Math.floor(i / 10); |
| const index = i % 10; |
|
|
| if (i % 5 === 0) { |
| |
| text += `${spanStart}Important fact number ${i}.${spanEnd} ${marker}turn${turn}${type}${index} `; |
| } else if (i % 7 === 0) { |
| |
| text += `Multiple sources ${compStart}${marker}turn${turn}${type}${index}${marker}turn${turn}${types[(i + 1) % types.length]}${(index + 1) % 10}${compEnd} confirm this. `; |
| } else { |
| text += `This is fact ${i} ${marker}turn${turn}${type}${index} from the research. `; |
| } |
| } |
|
|
| return text; |
| }; |
|
|
| it('should process 100 literal citations in reasonable time (<100ms)', () => { |
| const text = generateCitationHeavyText(100, 'literal'); |
|
|
| const start = performance.now(); |
|
|
| |
| const results = { spans: 0, composites: 0, standalones: 0, cleaned: '' }; |
|
|
| SPAN_REGEX.lastIndex = 0; |
| while (SPAN_REGEX.exec(text) !== null) { |
| results.spans++; |
| } |
|
|
| COMPOSITE_REGEX.lastIndex = 0; |
| while (COMPOSITE_REGEX.exec(text) !== null) { |
| results.composites++; |
| } |
|
|
| STANDALONE_PATTERN.lastIndex = 0; |
| while (STANDALONE_PATTERN.exec(text) !== null) { |
| results.standalones++; |
| } |
|
|
| results.cleaned = text.replace(CLEANUP_REGEX, ''); |
|
|
| const duration = performance.now() - start; |
|
|
| expect(duration).toBeLessThan(100); |
| expect(results.standalones).toBeGreaterThan(80); |
| expect(results.spans).toBeGreaterThan(10); |
| expect(results.composites).toBeGreaterThan(5); |
| }); |
|
|
| it('should process 100 Unicode citations in reasonable time (<100ms)', () => { |
| const text = generateCitationHeavyText(100, 'unicode'); |
|
|
| const start = performance.now(); |
|
|
| const results = { spans: 0, composites: 0, standalones: 0, cleaned: '' }; |
|
|
| SPAN_REGEX.lastIndex = 0; |
| while (SPAN_REGEX.exec(text) !== null) { |
| results.spans++; |
| } |
|
|
| COMPOSITE_REGEX.lastIndex = 0; |
| while (COMPOSITE_REGEX.exec(text) !== null) { |
| results.composites++; |
| } |
|
|
| STANDALONE_PATTERN.lastIndex = 0; |
| while (STANDALONE_PATTERN.exec(text) !== null) { |
| results.standalones++; |
| } |
|
|
| results.cleaned = text.replace(CLEANUP_REGEX, ''); |
|
|
| const duration = performance.now() - start; |
|
|
| expect(duration).toBeLessThan(100); |
| expect(results.standalones).toBeGreaterThan(80); |
| }); |
|
|
| it('should process 500 citations without timeout (<500ms)', () => { |
| const text = generateCitationHeavyText(500, 'literal'); |
|
|
| const start = performance.now(); |
|
|
| let count = 0; |
|
|
| STANDALONE_PATTERN.lastIndex = 0; |
| while (STANDALONE_PATTERN.exec(text) !== null) { |
| count++; |
| } |
|
|
| const cleaned = text.replace(CLEANUP_REGEX, ''); |
|
|
| const duration = performance.now() - start; |
|
|
| expect(duration).toBeLessThan(500); |
| expect(count).toBeGreaterThan(400); |
| expect(cleaned.length).toBeLessThan(text.length); |
| }); |
|
|
| it('should handle mixed formats efficiently (<100ms for 100 citations)', () => { |
| |
| const literalText = generateCitationHeavyText(50, 'literal'); |
| const unicodeText = generateCitationHeavyText(50, 'unicode'); |
| const mixedText = literalText + '\n\n' + unicodeText; |
|
|
| const start = performance.now(); |
|
|
| let count = 0; |
|
|
| STANDALONE_PATTERN.lastIndex = 0; |
| while (STANDALONE_PATTERN.exec(mixedText) !== null) { |
| count++; |
| } |
|
|
| const duration = performance.now() - start; |
|
|
| expect(duration).toBeLessThan(100); |
| expect(count).toBeGreaterThan(80); |
| }); |
|
|
| it('should handle repeated execution during streaming simulation (<1000ms cumulative)', () => { |
| |
| |
| |
| |
| const fullText = generateCitationHeavyText(50, 'literal'); |
| const tokens: string[] = []; |
|
|
| |
| const chunkSize = Math.ceil(fullText.length / 100); |
| for (let i = 0; i < fullText.length; i += chunkSize) { |
| tokens.push(fullText.slice(0, i + chunkSize)); |
| } |
|
|
| const start = performance.now(); |
| let totalMatches = 0; |
| let spanCount = 0; |
| let compositeCount = 0; |
|
|
| |
| for (const partialText of tokens) { |
| |
| SPAN_REGEX.lastIndex = 0; |
| while (SPAN_REGEX.exec(partialText) !== null) { |
| spanCount++; |
| } |
|
|
| COMPOSITE_REGEX.lastIndex = 0; |
| while (COMPOSITE_REGEX.exec(partialText) !== null) { |
| compositeCount++; |
| } |
|
|
| STANDALONE_PATTERN.lastIndex = 0; |
| while (STANDALONE_PATTERN.exec(partialText) !== null) { |
| totalMatches++; |
| } |
|
|
| |
| void partialText.replace(CLEANUP_REGEX, ''); |
| } |
|
|
| const duration = performance.now() - start; |
|
|
| |
| |
| expect(duration).toBeLessThan(1000); |
| expect(totalMatches).toBeGreaterThan(1000); |
| expect(spanCount).toBeGreaterThan(0); |
| expect(compositeCount).toBeGreaterThan(0); |
| }); |
|
|
| it('should handle rapid repeated execution (300 renders with 20 citations)', () => { |
| |
| |
| |
| const fullText = generateCitationHeavyText(20, 'literal'); |
| const renderCount = 300; |
|
|
| const start = performance.now(); |
| let totalOps = 0; |
|
|
| |
| for (let i = 0; i < renderCount; i++) { |
| const progress = Math.min(1, (i + 1) / renderCount); |
| const partialText = fullText.slice(0, Math.floor(fullText.length * progress)); |
|
|
| SPAN_REGEX.lastIndex = 0; |
| while (SPAN_REGEX.exec(partialText) !== null) { |
| totalOps++; |
| } |
|
|
| COMPOSITE_REGEX.lastIndex = 0; |
| while (COMPOSITE_REGEX.exec(partialText) !== null) { |
| totalOps++; |
| } |
|
|
| STANDALONE_PATTERN.lastIndex = 0; |
| while (STANDALONE_PATTERN.exec(partialText) !== null) { |
| totalOps++; |
| } |
|
|
| void partialText.replace(CLEANUP_REGEX, ''); |
| } |
|
|
| const duration = performance.now() - start; |
| const avgPerRender = duration / renderCount; |
|
|
| |
| |
| expect(duration).toBeLessThan(500); |
| expect(avgPerRender).toBeLessThan(2); |
| expect(totalOps).toBeGreaterThan(0); |
| }); |
| }); |
| }); |
|
|