/**
 * Parse the streaming markdown produced by the Granite reconciler into the
 * four-section briefing IA the design system expects.
 *
 * The reconciler's prompt (app/reconcile.py:EXTRA_SYSTEM_PROMPT) enforces
 * **bold-stop** section heads, one per line:
 *   **Status.**
 *   **Empirical evidence.**
 *   **Modeled scenarios.**
 *   **Policy context.**
 * The model occasionally drops them inline; the backend's
 * _split_inline_headers normaliser fixes that before yielding final text,
 * but mid-stream we still tolerate inline forms ourselves.
 *
 * Within a section, every sentence with a `[doc_id]` citation is a Claim.
 * The claim's tier is inferred from the cited doc_id family via
 * tierForDocId(). Multiple cites on one sentence: the tier of the first
 * cited doc wins (visual margin glyph), all cites still rendered.
 */
import type { BriefingBlock, Citation, ClaimPart } from '$lib/types/claim';
import { tierForDocId, type Tier } from '$lib/types/tier';

/**
 * The four canonical briefing sections, in display order. `n` is the
 * two-digit ordinal, `aliases` are the lower-cased titles that resolve to
 * the section, and `tier` is set only on the sections that carry one.
 */
const CANONICAL_SECTIONS: Array<{ key: string; label: string; n: string; tier?: Tier; aliases: string[] }> = [
  {
    key: 'status',
    label: 'Status',
    n: '01',
    aliases: ['status']
  },
  {
    key: 'empirical',
    label: 'Empirical evidence',
    n: '02',
    tier: 'empirical',
    aliases: ['empirical evidence', 'empirical']
  },
  {
    key: 'modeled',
    label: 'Modeled scenarios',
    n: '03',
    tier: 'modeled',
    aliases: ['modeled scenarios', 'modeled']
  },
  {
    key: 'policy',
    label: 'Policy context',
    n: '04',
    aliases: ['policy context', 'policy']
  }
];

/**
 * Resolve a raw heading title to its canonical section.
 *
 * The title is lower-cased, a trailing run of `.`/`:` (plus any whitespace
 * after it) is removed, and the result is trimmed before being compared
 * against each section's alias list.
 *
 * @param rawTitle Heading text as captured from the markdown.
 * @returns The first section with a matching alias, or `undefined`.
 */
function findSection(rawTitle: string) {
  const normalised = rawTitle
    .toLowerCase()
    .replace(/[.:]+\s*$/, '')
    .trim();
  for (const section of CANONICAL_SECTIONS) {
    if (section.aliases.includes(normalised)) {
      return section;
    }
  }
  return undefined;
}

// Match either `**Heading.**` (the canonical reconciler output) or the
// markdown-headed `## 01 Heading` form we use in the static demo data.
/**
 * Section-head matcher, anchored at the start of the input or right after a
 * newline. Capture groups:
 *   1. the anchor itself (`''` or `'\n'`)
 *   2. the title of a `**Heading.**` head
 *   3. the `0n` ordinal of a `## 0n Heading` head
 *   4. the free-text title following that ordinal
 *
 * NOTE(review): because of the `(^|\n)` anchor, a head that appears in the
 * middle of a line is not matched, although the file header says inline
 * forms are tolerated mid-stream — confirm which behaviour is intended.
 */
const SECTION_HEAD_RE = /(^|\n)\s*(?:\*\*([A-Z][A-Za-z\s/]+?)\.\s*\*\*|#{1,3}\s*(0[1-4])\s*[:\-—.]?\s*([^\n]+))/g;

export interface ParseResult {
  blocks: BriefingBlock[];
  /** Supplied citations plus any registered while parsing, keyed by doc ID. */
  citations: Record<string, Citation>;
  /** Doc IDs cited in the body but not in the provided citation registry. */
  unresolvedDocIds: string[];
}

/**
 * Build a Citation record from a doc_id and any backend-supplied metadata.
 * The reconciler has the registry; we keep this conservative so unknown
 * doc IDs still render with sensible defaults.
 *
 * @param n Display ordinal assigned to the citation.
 * @param docId Cited document ID; also used as the record's `id`.
 * @param meta Optional metadata. Only `source`, `title`, `url`, `vintage`
 *   and `retrieved` are read; anything else is ignored.
 * @returns A fully-populated Citation. `source` defaults to the upper-cased
 *   first `_`/`-`-separated token of `docId`, `title` to `docId`, and the
 *   remaining string fields to `''`.
 */
export function citationFromMeta(
  n: number,
  docId: string,
  meta?: Partial<Citation>
): Citation {
  return {
    id: docId,
    n,
    tier: tierForDocId(docId),
    // split() always yields at least one element; the fallback only keeps
    // strict index checking happy.
    source: meta?.source ?? (docId.split(/[_-]/)[0] ?? docId).toUpperCase(),
    title: meta?.title ?? docId,
    docId,
    url: meta?.url ?? '',
    vintage: meta?.vintage ?? '',
    retrieved: meta?.retrieved ?? ''
  };
}

/** Matches `[doc_id]` or `[doc_a, doc_b, ...]`; group 1 is the ID list. */
const CITE_RE = /\[([a-z][a-z0-9_]*(?:\s*,\s*[a-z][a-z0-9_]*)*)\]/gi;

/**
 * Split text into sentences at whitespace that follows `.`, `!` or `?` and
 * precedes an upper-case letter or `(`. Whitespace-only pieces are dropped.
 */
function splitSentences(text: string): string[] {
  const parts = text.split(/(?<=[.!?])\s+(?=[A-Z(])/g);
  return parts.filter((s) => s.trim().length > 0);
}

/**
 * Break one sentence into claim parts around its citation markers.
 *
 * Each marker yields a part holding the text since the previous marker,
 * tagged with the tier and ID of the marker's first doc ID. Every doc ID in
 * the marker that is not yet in `cites` is registered through
 * `registerCite`. Text after the last marker is appended as an untagged
 * part unless it is only whitespace.
 *
 * @param sentence Sentence text, possibly containing `[doc_id]` markers.
 * @param cites Citation registry; mutated in place with new entries.
 * @param registerCite Factory invoked once per newly-seen doc ID.
 * @returns The parts in source order; a single plain part if no markers.
 */
function parseSentenceParts(
  sentence: string,
  cites: Record<string, Citation>,
  registerCite: (docId: string) => Citation
): ClaimPart[] {
  const matches = [...sentence.matchAll(CITE_RE)];
  if (matches.length === 0) {
    return [{ text: sentence }];
  }

  const parts: ClaimPart[] = [];
  let cursor = 0;
  for (const m of matches) {
    const at = m.index ?? 0;
    const before = sentence.slice(cursor, at);
    const docIds = m[1].split(/\s*,\s*/).filter(Boolean);
    cursor = at + m[0].length;
    const tier = tierForDocId(docIds[0]);
    parts.push({ text: before, tier, cite: docIds[0] });
    for (const id of docIds) {
      // Own-property check: a cited ID such as `constructor` must not be
      // mistaken for an existing entry via Object.prototype.
      if (!Object.prototype.hasOwnProperty.call(cites, id)) {
        cites[id] = registerCite(id);
      }
    }
  }
  if (cursor < sentence.length) {
    const tail = sentence.slice(cursor);
    if (tail.trim()) parts.push({ text: tail });
  }
  return parts;
}

/**
 * Convert one whitespace-flattened paragraph into claim parts: sentences are
 * parsed in order and joined by single-space parts, then trailing
 * whitespace-only parts are trimmed off.
 */
function buildProseParts(
  flat: string,
  cites: Record<string, Citation>,
  registerCite: (docId: string) => Citation
): ClaimPart[] {
  const parts: ClaimPart[] = [];
  for (const sentence of splitSentences(flat)) {
    parts.push(...parseSentenceParts(sentence, cites, registerCite));
    parts.push({ text: ' ' });
  }
  while (parts.length > 0) {
    const last = parts[parts.length - 1];
    // Never drop a part that carries a tier or a citation, even if its
    // leading text is empty.
    if (last.text.trim() !== '' || last.tier || last.cite !== undefined) break;
    parts.pop();
  }
  return parts;
}

/**
 * Parse a fully-or-partially-streamed briefing markdown string into blocks.
 * Safe to call repeatedly during streaming — re-parses from scratch.
 *
 * @param markdown Briefing text received so far.
 * @param knownCitations Citation registry keyed by doc ID; not mutated.
 * @returns The head/prose blocks, the registry extended with every doc ID
 *   cited in the text, and the cited doc IDs that were missing from
 *   `knownCitations`.
 */
export function parseBriefing(
  markdown: string,
  knownCitations: Record<string, Citation> = {}
): ParseResult {
  const cites: Record<string, Citation> = { ...knownCitations };
  // New citations continue numbering after the highest known ordinal.
  let nextN = Object.values(cites).reduce((max, c) => Math.max(max, c.n), 0) + 1;
  const unresolvedDocIds = new Set<string>();
  const registerCite = (docId: string): Citation => {
    if (!Object.prototype.hasOwnProperty.call(knownCitations, docId)) {
      unresolvedDocIds.add(docId);
    }
    return citationFromMeta(nextN++, docId);
  };

  const blocks: BriefingBlock[] = [];

  type Idx = {
    num: string;
    label: string;
    tier?: Tier;
    titleExtra?: string;
    /** Offset where the head starts (just after the anchoring newline). */
    start: number;
    /** Offset of the first character after the head. */
    bodyStart: number;
  };
  const indices: Idx[] = [];

  // matchAll iterates over a clone of the regex, so the shared global
  // SECTION_HEAD_RE carries no lastIndex state between calls.
  for (const m of markdown.matchAll(SECTION_HEAD_RE)) {
    const at = m.index ?? 0;
    const start = at + m[1].length;
    const bodyStart = at + m[0].length;
    if (m[2] !== undefined) {
      // **Heading.** form — bold text that is not a canonical section is
      // ordinary emphasis and stays part of the surrounding body.
      const sec = findSection(m[2]);
      if (!sec) continue;
      indices.push({ num: sec.n, label: sec.label, tier: sec.tier, start, bodyStart });
    } else if (m[3] !== undefined) {
      // ## 0n Heading form (used by the static demo)
      const num = m[3];
      const title = (m[4] ?? '').trim();
      const sec = CANONICAL_SECTIONS.find((s) => s.n === num) ?? findSection(title);
      indices.push({
        num,
        label: sec?.label ?? title,
        tier: sec?.tier,
        // Keep a custom title only when it differs from the canonical label.
        titleExtra: sec && title.toLowerCase() !== sec.label.toLowerCase() ? title : undefined,
        start,
        bodyStart
      });
    }
  }

  // Pre-section preamble. Don't render — the reconciler doesn't emit one and
  // we don't want a stray HTML escape of the bold-marker prefix to flash.
  for (let i = 0; i < indices.length; i++) {
    const sec = indices[i];
    const next = indices[i + 1];
    const body = markdown.slice(sec.bodyStart, next ? next.start : markdown.length).trim();
    if (!body) continue;
    blocks.push({ kind: 'head', n: sec.num, label: sec.label, tier: sec.tier, title: sec.titleExtra });
    for (const para of body.split(/\n\s*\n/)) {
      const flat = para.replace(/\s+/g, ' ').trim();
      if (!flat) continue;
      const parts = buildProseParts(flat, cites, registerCite);
      if (parts.length) blocks.push({ kind: 'prose', parts });
    }
  }

  // Fallback: if the model hasn't emitted any recognised section head yet
  // (or won't — e.g. live_now intent), render the whole markdown as one
  // implicit "Status" block so the reader sees something during streaming.
  // Keyed on recognised heads rather than emitted blocks: a head whose body
  // has not arrived yet must not be echoed back as literal `**Status.**`.
  if (indices.length === 0 && markdown.trim()) {
    blocks.push({ kind: 'head', n: '01', label: 'Status' });
    const flat = markdown.replace(/\s+/g, ' ').trim();
    const parts = buildProseParts(flat, cites, registerCite);
    if (parts.length) blocks.push({ kind: 'prose', parts });
  }

  return {
    blocks,
    citations: cites,
    unresolvedDocIds: [...unresolvedDocIds]
  };
}

/**
 * HTML escape — kept around because the v0.4.1 parser used it for the
 * status-preamble fallback path. The v0.4.2 parser drops the preamble
 * entirely (the reconciler doesn't emit one), so this is currently
 * dead-code documentation. If the preamble path comes back, wire it
 * here.
 *
 * Escapes `&`, `<`, `>` and `"`. `&` is replaced first so the entities
 * produced by the later replacements are not escaped a second time.
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
function escapeHtml(s: string): string {
  return s
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}