Spaces:
Configuration error
Configuration error
File size: 8,209 Bytes
e8a6c67 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 | /**
* Parse the streaming markdown produced by the Granite reconciler into the
* four-section briefing IA the design system expects.
*
* The reconciler's prompt (app/reconcile.py:EXTRA_SYSTEM_PROMPT) enforces
* **bold-stop** section heads, one per line:
* **Status.**
* **Empirical evidence.**
* **Modeled scenarios.**
* **Policy context.**
* The model occasionally drops them inline; the backend's
* _split_inline_headers normaliser fixes that before yielding final text,
* but mid-stream we still tolerate inline forms ourselves.
*
* Within a section, every sentence with a `[doc_id]` citation is a Claim.
* The claim's tier is inferred from the cited doc_id family via
* tierForDocId(). Multiple cites on one sentence: the tier of the first
* cited doc wins (visual margin glyph), all cites still rendered.
*/
import type { BriefingBlock, Citation, ClaimPart } from '$lib/types/claim';
import { tierForDocId, type Tier } from '$lib/types/tier';
/** Shape of one row in the canonical section table. */
type CanonicalSection = {
  key: string;
  label: string;
  n: string;
  tier?: Tier;
  aliases: string[];
};

/**
 * The four briefing sections in display order. `n` is the two-digit section
 * number, `aliases` are the lower-cased heading spellings accepted by
 * findSection(), and `tier` is set only for the sections that carry one.
 */
const CANONICAL_SECTIONS: CanonicalSection[] = [
  {
    key: 'status',
    label: 'Status',
    n: '01',
    aliases: ['status']
  },
  {
    key: 'empirical',
    label: 'Empirical evidence',
    n: '02',
    tier: 'empirical',
    aliases: ['empirical evidence', 'empirical']
  },
  {
    key: 'modeled',
    label: 'Modeled scenarios',
    n: '03',
    tier: 'modeled',
    aliases: ['modeled scenarios', 'modeled']
  },
  {
    key: 'policy',
    label: 'Policy context',
    n: '04',
    aliases: ['policy context', 'policy']
  }
];
/**
 * Look up the canonical section for a raw heading title.
 *
 * The title is lower-cased, any trailing run of `.`/`:` (plus trailing
 * whitespace) is removed, and the result is trimmed before being compared
 * against each section's alias list. Returns `undefined` when nothing matches.
 */
function findSection(rawTitle: string) {
  const normalized = rawTitle
    .toLowerCase()
    .replace(/[.:]+\s*$/, '')
    .trim();
  for (const section of CANONICAL_SECTIONS) {
    if (section.aliases.includes(normalized)) {
      return section;
    }
  }
  return undefined;
}
// Match either `**Heading.**` (the canonical reconciler output) or the
// markdown-headed `## 01 Heading` form we use in the static demo data.
//
// Capture groups:
//   1: the anchor — empty at start of input, otherwise the preceding `\n`
//   2: bold form only — the heading title without the trailing `.`
//   3: hash form only — the two-digit section number (`01`–`04`)
//   4: hash form only — the rest of the heading line
//
// There is no `m` flag, so `^` matches only the very start of the input;
// every other heading must be preceded by a newline. The `g` flag makes the
// regex stateful (`lastIndex`), so callers looping with `exec` must reset
// `lastIndex` before each scan.
const SECTION_HEAD_RE = /(^|\n)\s*(?:\*\*([A-Z][A-Za-z\s/]+?)\.\s*\*\*|#{1,3}\s*(0[1-4])\s*[:\-—.]?\s*([^\n]+))/g;
/** Result of parsing a (possibly partial) briefing markdown string. */
export interface ParseResult {
/** Section heads and prose blocks, in document order. */
blocks: BriefingBlock[];
/** Citation registry keyed by doc ID: the caller-supplied entries plus any registered while parsing. */
citations: Record<string, Citation>;
/** Doc IDs cited in the body but not in the provided citation registry. */
unresolvedDocIds: string[];
}
/**
 * Assemble a Citation for `docId`, filling in defaults for any metadata the
 * backend did not supply so that unknown doc IDs still render sensibly.
 *
 * @param n      Display number assigned to this citation.
 * @param docId  Document identifier; used for both `id` and `docId`.
 * @param meta   Optional overrides. Missing (nullish) fields fall back to:
 *               `source` = upper-cased text before the first `_` or `-` in
 *               `docId`, `title` = `docId`, and `url` / `vintage` /
 *               `retrieved` = empty string.
 * @returns The citation record, with `tier` taken from tierForDocId(docId).
 */
export function citationFromMeta(
  n: number,
  docId: string,
  meta?: Partial<Pick<Citation, 'source' | 'title' | 'url' | 'vintage' | 'retrieved'>>
): Citation {
  // Family prefix of the doc ID, used when no explicit source is given.
  const derivedSource = docId.split(/[_-]/)[0].toUpperCase();

  const citation: Citation = {
    id: docId,
    n,
    tier: tierForDocId(docId),
    source: meta?.source ?? derivedSource,
    title: meta?.title ?? docId,
    docId,
    url: meta?.url ?? '',
    vintage: meta?.vintage ?? '',
    retrieved: meta?.retrieved ?? ''
  };
  return citation;
}
// Matches a bracketed citation such as `[doc_a]` or `[doc_a, doc_b]`.
// Each ID must start with a letter and continue with letters, digits or
// underscores (no hyphens); IDs are separated by commas with optional
// surrounding whitespace. Capture group 1 is the text between the brackets.
// Flags: `g` to find every citation, `i` so upper-case IDs match as well.
const CITE_RE = /\[([a-z][a-z0-9_]*(?:\s*,\s*[a-z][a-z0-9_]*)*)\]/gi;
/**
 * Break a flattened paragraph into sentences.
 *
 * A boundary is a run of whitespace that follows `.`, `!` or `?` and is
 * followed by an upper-case ASCII letter or `(`. The whitespace itself is
 * consumed; pieces that are empty or whitespace-only are discarded.
 */
function splitSentences(text: string): string[] {
  const boundary = /(?<=[.!?])\s+(?=[A-Z(])/g;
  const sentences: string[] = [];
  for (const piece of text.split(boundary)) {
    if (piece.trim().length > 0) {
      sentences.push(piece);
    }
  }
  return sentences;
}
/**
 * Split one sentence into claim parts around its `[doc_id]` citations.
 *
 * For every citation match, the text between the previous match (or the
 * start of the sentence) and this match becomes one part, tagged with the
 * tier and ID of the FIRST doc ID inside the brackets. Every doc ID in the
 * brackets is registered in `cites` if it is not already present. Any
 * non-blank text after the last citation becomes a final untagged part.
 *
 * @param sentence      The sentence text, citations included.
 * @param cites         Citation registry; mutated in place with new entries.
 * @param registerCite  Factory invoked once for each doc ID missing from `cites`.
 * @returns The parts in order; a sentence without citations yields a single
 *          untagged part holding the whole sentence.
 */
function parseSentenceParts(
  sentence: string,
  cites: Record<string, Citation>,
  registerCite: (docId: string) => Citation
): ClaimPart[] {
  const matches = [...sentence.matchAll(CITE_RE)];
  if (matches.length === 0) {
    return [{ text: sentence }];
  }

  const parts: ClaimPart[] = [];
  let cursor = 0;
  for (const m of matches) {
    const matchStart = m.index ?? 0;
    const docIds = m[1].split(/\s*,\s*/).filter(Boolean);
    // The first doc ID in a multi-cite bracket decides the part's tier and cite.
    const primaryId = docIds[0];
    parts.push({
      text: sentence.slice(cursor, matchStart),
      tier: tierForDocId(primaryId),
      cite: primaryId
    });
    cursor = matchStart + m[0].length;
    for (const id of docIds) {
      if (!cites[id]) cites[id] = registerCite(id);
    }
  }

  // Keep trailing text (typically the closing punctuation) unless it is blank.
  const tail = sentence.slice(cursor);
  if (tail.trim()) parts.push({ text: tail });
  return parts;
}
/**
 * Convert one whitespace-flattened paragraph into claim parts: each sentence's
 * parts followed by a single-space separator, with trailing blank untagged
 * separators removed.
 */
function buildProseParts(
  flat: string,
  cites: Record<string, Citation>,
  registerCite: (docId: string) => Citation
): ClaimPart[] {
  const parts: ClaimPart[] = [];
  for (const sentence of splitSentences(flat)) {
    parts.push(...parseSentenceParts(sentence, cites, registerCite));
    parts.push({ text: ' ' });
  }
  // Strip trailing separators, but never a tagged part (it carries a citation).
  while (parts.length > 0) {
    const last = parts[parts.length - 1];
    if (last.text.trim() !== '' || last.tier) break;
    parts.pop();
  }
  return parts;
}

/**
 * Parse a fully-or-partially-streamed briefing markdown string into blocks.
 * Safe to call repeatedly during streaming — re-parses from scratch.
 *
 * @param markdown        The briefing text received so far.
 * @param knownCitations  Citations already known to the caller. The object is
 *                        copied, not mutated; numbering of newly seen doc IDs
 *                        continues after the highest `n` it contains.
 * @returns The head/prose blocks, the merged citation registry, and the doc
 *          IDs that were cited but absent from `knownCitations`.
 */
export function parseBriefing(
  markdown: string,
  knownCitations: Record<string, Citation> = {}
): ParseResult {
  const cites: Record<string, Citation> = { ...knownCitations };
  let nextN = Object.values(cites).reduce((max, c) => Math.max(max, c.n), 0) + 1;
  const unresolvedDocIds = new Set<string>();
  const registerCite = (docId: string): Citation => {
    if (!knownCitations[docId]) unresolvedDocIds.add(docId);
    return citationFromMeta(nextN++, docId);
  };

  const blocks: BriefingBlock[] = [];
  type Idx = { num: string; label: string; tier?: Tier; titleExtra?: string; start: number; bodyStart: number };
  const indices: Idx[] = [];

  // Pass 1: locate every recognised section head. The regex is global and
  // shared, so its cursor must be reset before scanning.
  let m: RegExpExecArray | null;
  SECTION_HEAD_RE.lastIndex = 0;
  while ((m = SECTION_HEAD_RE.exec(markdown))) {
    if (m[2] !== undefined) {
      // **Heading.** form — ignore bold text that is not a canonical section.
      const sec = findSection(m[2]);
      if (!sec) continue;
      indices.push({
        num: sec.n,
        label: sec.label,
        tier: sec.tier,
        start: m.index + m[1].length,
        bodyStart: m.index + m[0].length
      });
    } else if (m[3] !== undefined) {
      // ## 0n Heading form (used by the static demo)
      const num = m[3];
      const title = (m[4] ?? '').trim();
      const sec = CANONICAL_SECTIONS.find((s) => s.n === num) ?? findSection(title);
      indices.push({
        num,
        label: sec?.label ?? title,
        tier: sec?.tier,
        // Keep the written title only when it differs from the canonical label.
        titleExtra: sec && title.toLowerCase() !== sec.label.toLowerCase() ? title : undefined,
        start: m.index + m[1].length,
        bodyStart: m.index + m[0].length
      });
    }
  }

  // Pass 2: emit a head plus prose blocks for each section that has a body.
  // Pre-section preamble is not rendered — the reconciler doesn't emit one and
  // we don't want a stray HTML escape of the bold-marker prefix to flash.
  for (let i = 0; i < indices.length; i++) {
    const sec = indices[i];
    const next = indices[i + 1];
    const body = markdown.slice(sec.bodyStart, next ? next.start : markdown.length).trim();
    if (!body) continue;
    blocks.push({
      kind: 'head',
      n: sec.num,
      label: sec.label,
      tier: sec.tier,
      title: sec.titleExtra
    });
    for (const para of body.split(/\n\s*\n/)) {
      const flat = para.replace(/\s+/g, ' ').trim();
      if (!flat) continue;
      const parts = buildProseParts(flat, cites, registerCite);
      if (parts.length) blocks.push({ kind: 'prose', parts });
    }
  }

  // Fallback: if the model hasn't emitted any recognised section head yet
  // (or won't — e.g. live_now intent), render the whole markdown as one
  // implicit "Status" block so the reader sees something during streaming.
  // This is keyed on "no head recognised" rather than "no block emitted":
  // a head whose body has not arrived yet must not be rendered as raw
  // `**Heading.**` prose.
  if (indices.length === 0 && markdown.trim()) {
    blocks.push({ kind: 'head', n: '01', label: 'Status' });
    const flat = markdown.replace(/\s+/g, ' ').trim();
    const parts = buildProseParts(flat, cites, registerCite);
    if (parts.length) blocks.push({ kind: 'prose', parts });
  }

  return { blocks, citations: cites, unresolvedDocIds: [...unresolvedDocIds] };
}
/**
 * HTML escape — kept around because the v0.4.1 parser used it for the
 * status-preamble fallback path. The v0.4.2 parser drops the preamble
 * entirely (the reconciler doesn't emit one), so this is currently
 * dead-code documentation. If the preamble path comes back, wire it
 * here.
 *
 * Replaces `&`, `<`, `>` and `"` with their HTML entities. `&` is handled
 * first so the ampersands introduced by the later replacements are not
 * escaped a second time. Single quotes are left as-is.
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
function escapeHtml(s: string): string {
  return s
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
|