"""Prompt templates + ICL pool. Templates are loaded from the tutorial repo so the app stays in sync with the written material. ICLPool keeps a session-scoped, filterable bank of validated or corrected examples. """ from __future__ import annotations from copy import deepcopy import json import random from dataclasses import dataclass, field, asdict from typing import Optional from paths import TUTORIAL_PROMPTS_DIR, read_text from schemas import AnnotationSchema, to_json_schema DEFAULT_SYSTEM_PROMPT = read_text(TUTORIAL_PROMPTS_DIR / "00_system_role.txt") DEFAULT_ZERO_SHOT = read_text(TUTORIAL_PROMPTS_DIR / "01_zero_shot_pos_lemma_morph.txt") DEFAULT_FEW_SHOT = read_text(TUTORIAL_PROMPTS_DIR / "02_few_shot_pos_lemma_morph.txt") VALIDATION_RETRY = read_text(TUTORIAL_PROMPTS_DIR / "03_validation_retry.txt") @dataclass class ICLExample: language: str schema_hash: str tokens: list[str] gold_annotation: dict # {"tokens": [{"surface": ..., "lemma": ..., "pos": ...}, ...]} source: str = "sandbox" # "sandbox" | "uploaded" | "corrected" note: str = "" @dataclass class ICLPool: """Session-scoped pool of in-context examples. Filter by (language, schema_hash) so a POS-correction never leaks into NER. """ entries: list[ICLExample] = field(default_factory=list) version: int = 0 def _key(self, ex: ICLExample) -> tuple[str, str, tuple[str, ...]]: return ( ex.language or "", ex.schema_hash or "", tuple(ex.tokens or []), ) def _same_content(self, a: ICLExample, b: ICLExample) -> bool: return a.gold_annotation == b.gold_annotation def add(self, ex: ICLExample) -> str: ex = deepcopy(ex) key = self._key(ex) for i, existing in enumerate(self.entries): if self._key(existing) == key: if self._same_content(existing, ex): return "unchanged" self.entries[i] = ex self.version += 1 return "updated" self.entries.append(ex) self.version += 1 return "inserted" def filter(self, language: str = "", schema_hash: str = "") -> list[ICLExample]: out = self.entries if language: out = [e for e in out if e.language == language] if schema_hash: out = [e for e in out if e.schema_hash == schema_hash] return out def sample( self, n: int, language: str = "", schema_hash: str = "", strategy: str = "random", seed: int = 0, ) -> list[ICLExample]: pool = self.filter(language=language, schema_hash=schema_hash) if not pool or n <= 0: return [] if strategy == "most_recent_corrections": corr = [e for e in pool if e.source == "corrected"] corr = list(reversed(corr)) others = [e for e in pool if e.source != "corrected"] return (corr + others)[:n] if strategy == "by_language": # already filtered by language; deterministic order return pool[:n] rng = random.Random(seed) return rng.sample(pool, min(n, len(pool))) def to_jsonl(self) -> str: return "\n".join(json.dumps(asdict(e), ensure_ascii=False) for e in self.entries) @classmethod def from_jsonl(cls, text: str) -> "ICLPool": pool = cls() for line in text.splitlines(): line = line.strip() if not line: continue d = json.loads(line) pool.entries.append(ICLExample(**d)) return pool # --------------------------------------------------------------------------- # Prompt rendering # --------------------------------------------------------------------------- def render_few_shot_block(examples: list[ICLExample]) -> str: """Format ICL examples as compact JSON blocks separated by ---.""" blocks = [] for i, ex in enumerate(examples, 1): block = { "tokens": ex.tokens, "gold": ex.gold_annotation, } blocks.append(f"### Example {i}\n```json\n{json.dumps(block, ensure_ascii=False, indent=2)}\n```") return "\n\n".join(blocks) def render_inventory(schema: AnnotationSchema) -> tuple[str, str]: """Build the (upos_inventory, feature_inventory) text blobs for the template.""" upos_lines = [] feature_lines = [] for f in schema.fields: if f.type == "enum": upos_lines.append(f"- `{f.name}` ∈ {{{', '.join(f.values)}}}") elif f.type == "object": for sub in f.subfields: vals = sub.values or "(free string)" feature_lines.append(f"- `{f.name}.{sub.name}` ∈ {vals}") else: upos_lines.append(f"- `{f.name}`: free string{' (nullable)' if f.nullable else ''}") return "\n".join(upos_lines), "\n".join(feature_lines) or "(no morphological subfields)" def render_prompt( template: str, *, schema: AnnotationSchema, tokens: list[str], text: str = "", language: str = "", script: str = "", domain: str = "", sentence_id: str = "s1", few_shot_examples: Optional[list[ICLExample]] = None, ) -> str: """Fill the user-prompt template. Unknown braces are left untouched.""" upos_inv, feat_inv = render_inventory(schema) fs_block = render_few_shot_block(few_shot_examples or []) schema_str = json.dumps(to_json_schema(schema), ensure_ascii=False, indent=2) mapping = { "{language}": language or schema.language or "(unspecified)", "{script}": script or "(unspecified)", "{domain}": domain or "(unspecified)", "{sentence_id}": sentence_id, "{text}": text or " ".join(tokens), "{tokens}": json.dumps(tokens, ensure_ascii=False), "{upos_inventory}": upos_inv, "{feature_inventory}": feat_inv, "{few_shot_examples}": fs_block or "(none)", "{schema}": schema_str, "{tagset}": upos_inv, } out = template for k, v in mapping.items(): out = out.replace(k, v) return out