| """Prompt templates + ICL pool. |
| |
| Templates are loaded from the tutorial repo so the app stays in sync with the |
| written material. ICLPool keeps a session-scoped, filterable bank of validated |
| or corrected examples. |
| """ |
| from __future__ import annotations |
| from copy import deepcopy |
|
|
| import json |
| import random |
| from dataclasses import dataclass, field, asdict |
| from typing import Optional |
|
|
| from paths import TUTORIAL_PROMPTS_DIR, read_text |
| from schemas import AnnotationSchema, to_json_schema |
|
|
def _load_prompt_template(filename: str) -> str:
    """Return the text of *filename* from the tutorial prompts directory."""
    return read_text(TUTORIAL_PROMPTS_DIR / filename)


# Templates are read once, at import time, from the tutorial repo so the app
# and the written material share a single source of truth.
DEFAULT_SYSTEM_PROMPT = _load_prompt_template("00_system_role.txt")
DEFAULT_ZERO_SHOT = _load_prompt_template("01_zero_shot_pos_lemma_morph.txt")
DEFAULT_FEW_SHOT = _load_prompt_template("02_few_shot_pos_lemma_morph.txt")
VALIDATION_RETRY = _load_prompt_template("03_validation_retry.txt")
|
|
|
|
@dataclass
class ICLExample:
    """One in-context-learning example: a token sequence plus its annotation.

    ``language``, ``schema_hash`` and ``tokens`` are what the pool uses to
    decide whether two examples describe the same item.
    """

    # Required fields (positional order is part of the constructor interface).
    language: str
    schema_hash: str
    tokens: list[str]
    gold_annotation: dict

    # Optional provenance; the pool's sampling treats ``"corrected"`` specially.
    source: str = field(default="sandbox")
    note: str = field(default="")
|
|
|
|
|
|
|
|
@dataclass
class ICLPool:
    """Session-scoped pool of in-context examples.

    Filter by (language, schema_hash) so a POS-correction never leaks into NER.

    Entries added through :meth:`add` are unique per
    ``(language, schema_hash, tokens)``; ``version`` is incremented on every
    insertion or replacement performed by :meth:`add`.
    """

    entries: list[ICLExample] = field(default_factory=list)
    version: int = 0

    def _key(self, ex: ICLExample) -> tuple[str, str, tuple[str, ...]]:
        """Return the identity key of *ex*; missing values are normalised to empty."""
        return (
            ex.language or "",
            ex.schema_hash or "",
            tuple(ex.tokens or []),
        )

    def _same_content(self, a: ICLExample, b: ICLExample) -> bool:
        """Return True when both examples carry an equal ``gold_annotation``.

        Only the annotation is compared; ``source`` and ``note`` are ignored.
        """
        return a.gold_annotation == b.gold_annotation

    def add(self, ex: ICLExample) -> str:
        """Insert *ex*, or replace the stored example that has the same key.

        The example is deep-copied first so later mutation of the caller's
        object cannot change the pool.

        Returns:
            ``"unchanged"`` if an entry with the same key and an equal
            annotation already exists, ``"updated"`` if an entry with the same
            key was replaced in place, ``"inserted"`` if *ex* was appended.
        """
        ex = deepcopy(ex)
        key = self._key(ex)

        for i, existing in enumerate(self.entries):
            if self._key(existing) != key:
                continue
            if self._same_content(existing, ex):
                return "unchanged"
            self.entries[i] = ex
            self.version += 1
            return "updated"

        self.entries.append(ex)
        self.version += 1
        return "inserted"

    def filter(self, language: str = "", schema_hash: str = "") -> list[ICLExample]:
        """Return the entries matching the given criteria, in pool order.

        An empty ``language`` or ``schema_hash`` means "do not filter on this
        attribute". The result is always a new list, so callers may reorder or
        truncate it without touching ``self.entries``.
        """
        # Copy up front: with no criteria the original code handed out
        # ``self.entries`` itself, letting callers corrupt the pool.
        out = list(self.entries)
        if language:
            out = [e for e in out if e.language == language]
        if schema_hash:
            out = [e for e in out if e.schema_hash == schema_hash]
        return out

    def sample(
        self,
        n: int,
        language: str = "",
        schema_hash: str = "",
        strategy: str = "random",
        seed: int = 0,
    ) -> list[ICLExample]:
        """Pick up to *n* examples from the filtered pool.

        Strategies:
            ``"most_recent_corrections"``: entries whose ``source`` is
                ``"corrected"`` come first, last-stored first, followed by the
                remaining entries in pool order.
            ``"by_language"``: the first *n* filtered entries in pool order.
            anything else (including the default ``"random"``): a sample
                without replacement drawn from ``random.Random(seed)``.

        Returns an empty list when nothing matches or ``n <= 0``.
        """
        pool = self.filter(language=language, schema_hash=schema_hash)
        if not pool or n <= 0:
            return []
        if strategy == "most_recent_corrections":
            corrections = [e for e in pool if e.source == "corrected"]
            others = [e for e in pool if e.source != "corrected"]
            return (corrections[::-1] + others)[:n]
        if strategy == "by_language":
            return pool[:n]
        rng = random.Random(seed)
        return rng.sample(pool, min(n, len(pool)))

    def to_jsonl(self) -> str:
        """Serialise every entry as one JSON object per line (no trailing newline)."""
        return "\n".join(json.dumps(asdict(e), ensure_ascii=False) for e in self.entries)

    @classmethod
    def from_jsonl(cls, text: str) -> "ICLPool":
        """Build a pool from JSON-lines *text*, skipping blank lines.

        Entries are appended directly rather than through :meth:`add`, so the
        input is not de-duplicated and ``version`` stays at 0.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            TypeError: if an object's keys do not match ``ICLExample``'s fields.
        """
        pool = cls()
        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue
            pool.entries.append(ICLExample(**json.loads(line)))
        return pool
|
|
|
|
| |
| |
| |
|
|
def render_few_shot_block(examples: list[ICLExample]) -> str:
    """Render ICL examples as numbered, fenced JSON blocks.

    Each example becomes a ``### Example <i>`` heading (numbered from 1)
    followed by a fenced ``json`` block holding its ``tokens`` and ``gold``
    annotation. Blocks are joined by one blank line; an empty input yields
    an empty string.
    """

    def _render_one(number: int, example: ICLExample) -> str:
        payload = json.dumps(
            {"tokens": example.tokens, "gold": example.gold_annotation},
            ensure_ascii=False,
            indent=2,
        )
        return f"### Example {number}\n```json\n{payload}\n```"

    return "\n\n".join(
        _render_one(number, example)
        for number, example in enumerate(examples, start=1)
    )
|
|
|
|
def render_inventory(schema: AnnotationSchema) -> tuple[str, str]:
    """Build the (upos_inventory, feature_inventory) text blobs for the template.

    Walks ``schema.fields`` and emits one markdown bullet per entry:

    * ``enum`` fields go to the first blob as ``- `name` ∈ {v1, v2}``.
    * ``object`` fields contribute one bullet per sub-field to the second
      blob, either ``- `name.sub` ∈ {v1, v2}`` or ``∈ (free string)`` when
      the sub-field lists no values.
    * every other field goes to the first blob as a free string, with a
      ``(nullable)`` suffix when ``nullable`` is truthy.

    Returns:
        ``(upos_inventory, feature_inventory)``. The second element is
        ``"(no morphological subfields)"`` when no sub-field was found.
    """

    def _value_set(values) -> str:
        # One shared rendering for enum values and sub-field values, so the
        # prompt never shows a Python list repr such as "['Nom', 'Acc']".
        return "{" + ", ".join(str(v) for v in values) + "}"

    upos_lines: list[str] = []
    feature_lines: list[str] = []
    for spec in schema.fields:
        if spec.type == "enum":
            # Tolerate a missing value list instead of failing inside join().
            upos_lines.append(f"- `{spec.name}` ∈ {_value_set(spec.values or [])}")
        elif spec.type == "object":
            for sub in spec.subfields or []:
                allowed = _value_set(sub.values) if sub.values else "(free string)"
                feature_lines.append(f"- `{spec.name}.{sub.name}` ∈ {allowed}")
        else:
            nullable_note = " (nullable)" if spec.nullable else ""
            upos_lines.append(f"- `{spec.name}`: free string{nullable_note}")

    upos_inventory = "\n".join(upos_lines)
    feature_inventory = "\n".join(feature_lines) or "(no morphological subfields)"
    return upos_inventory, feature_inventory
|
|
|
|
def render_prompt(
    template: str,
    *,
    schema: AnnotationSchema,
    tokens: list[str],
    text: str = "",
    language: str = "",
    script: str = "",
    domain: str = "",
    sentence_id: str = "s1",
    few_shot_examples: Optional[list[ICLExample]] = None,
) -> str:
    """Fill the user-prompt template. Unknown braces are left untouched.

    Recognised placeholders are ``{language}``, ``{script}``, ``{domain}``,
    ``{sentence_id}``, ``{text}``, ``{tokens}``, ``{upos_inventory}``,
    ``{feature_inventory}``, ``{few_shot_examples}``, ``{schema}`` and
    ``{tagset}`` (an alias of ``{upos_inventory}``).

    Fallbacks: ``language`` falls back to ``schema.language``; empty
    ``language``/``script``/``domain`` render as ``(unspecified)``; an empty
    ``text`` is rebuilt by joining ``tokens`` with spaces; no few-shot
    examples render as ``(none)``.

    All placeholders are substituted in one pass over *template*, so
    placeholder-looking text inside a substituted value (for example a
    sentence that literally contains ``{tokens}``) is emitted verbatim rather
    than being expanded again.
    """
    import re  # local import: keeps this block self-contained

    upos_inv, feat_inv = render_inventory(schema)
    fs_block = render_few_shot_block(few_shot_examples or [])
    schema_str = json.dumps(to_json_schema(schema), ensure_ascii=False, indent=2)
    mapping = {
        "{language}": language or schema.language or "(unspecified)",
        "{script}": script or "(unspecified)",
        "{domain}": domain or "(unspecified)",
        "{sentence_id}": sentence_id,
        "{text}": text or " ".join(tokens),
        "{tokens}": json.dumps(tokens, ensure_ascii=False),
        "{upos_inventory}": upos_inv,
        "{feature_inventory}": feat_inv,
        "{few_shot_examples}": fs_block or "(none)",
        "{schema}": schema_str,
        "{tagset}": upos_inv,
    }
    # Chained str.replace() calls would re-scan values inserted by earlier
    # replacements; a single regex pass only ever looks at the template text.
    placeholder_re = re.compile("|".join(re.escape(key) for key in mapping))
    return placeholder_re.sub(lambda match: mapping[match.group(0)], template)
|
|