lrec2026-llm-annotator / tutorial.py
dhuser's picture
Initial LREC LLM-as-Annotator app
a918698
raw
history blame
4.49 kB
"""In-app guided exercises that prefill the workbench with sandbox data.
Each exercise returns a dict consumed by the Welcome tab's "Try this" handlers,
which then push the values into the tabs' state.
"""
from __future__ import annotations
from dataclasses import dataclass
from io_utils import read_sandbox_tsv, sandbox_sentence
from paths import corpus_file, LANGUAGES
from prompts import DEFAULT_SYSTEM_PROMPT, DEFAULT_ZERO_SHOT, DEFAULT_FEW_SHOT, ICLExample
from schemas import from_preset
@dataclass
class Exercise:
    """Static description of one guided exercise.

    Instances are plain configuration records; ``prefill`` turns one of them
    into the dict used to seed the workbench.
    """

    # Human-readable heading and longer description of the exercise.
    title: str
    summary: str

    # Code handed to ``corpus_file`` and used as the ``LANGUAGES`` lookup key.
    language_code: str

    # Key handed to ``from_preset`` to obtain the annotation schema.
    preset_key: str

    # Tokenizer identifier; forwarded unchanged in the prefill dict.
    tokenizer: str

    # Size argument passed to ``sandbox_sentence`` for the target sentence
    # and for each few-shot example.
    n_tokens: int

    # Whether few-shot examples are built, and the upper bound on how many.
    use_few_shot: bool
    n_icl: int

    # Model identifiers and user prompt template; forwarded unchanged.
    models: list[str]
    user_template: str

    # Start argument passed to ``sandbox_sentence`` for the target sentence.
    sandbox_start: int = 0
# Ordered catalogue of guided exercises; positions in this list are the
# indices accepted by ``prefill``.
EXERCISES: list[Exercise] = [
    # 1) Single model, zero-shot.
    Exercise(
        title="Exercise 1 — Greek POS, zero-shot, single model",
        summary=(
            "Annotate an Ancient Greek sentence from the historical corpus with a single "
            "model in zero-shot mode. You will see the raw output, no MoE, no ICL. "
            "This is the smallest possible loop."
        ),
        language_code="GRC",
        preset_key="grc_tagset",
        tokenizer="as_is",
        n_tokens=10,
        use_few_shot=False,
        n_icl=0,
        models=[
            "openai/gpt-oss-20b:free",
        ],
        user_template=DEFAULT_ZERO_SHOT,
    ),
    # 2) Single model, few-shot with five examples.
    Exercise(
        title="Exercise 2 — Armenian POS + lemma, few-shot (5 examples)",
        summary=(
            "Annotate Old Armenian with the bespoke compound tagset. 5 validated examples "
            "are sampled from the training corpus and inserted into the prompt's "
            "{few_shot_examples} block. Compare the few-shot result with what zero-shot "
            "would give."
        ),
        language_code="HYE",
        preset_key="hye_tagset",
        tokenizer="as_is",
        n_tokens=10,
        use_few_shot=True,
        n_icl=5,
        models=[
            "mistralai/mistral-small-24b-instruct-2501",
        ],
        user_template=DEFAULT_FEW_SHOT,
        sandbox_start=200,
    ),
    # 3) Three models, few-shot with three examples.
    Exercise(
        title="Exercise 3 — Syriac MoE: vote, correct, re-inject",
        summary=(
            "Annotate Syriac with three models in parallel. The Run tab highlights "
            "disagreements. Correct the contested tokens in Review, click "
            "'Add to ICL pool', then re-run on a new sentence — the corrections appear "
            "in the rendered prompt's few-shot block, closing the bootstrap loop."
        ),
        language_code="SYC",
        preset_key="syc_tagset",
        tokenizer="as_is",
        n_tokens=12,
        use_few_shot=True,
        n_icl=3,
        models=[
            "meta-llama/llama-3.3-70b-instruct:free",
            "qwen/qwen3-next-80b-a3b-instruct:free",
            "deepseek/deepseek-v4-flash:free",
        ],
        user_template=DEFAULT_FEW_SHOT,
        sandbox_start=50,
    ),
]
def list_exercise_titles() -> list[str]:
    """Return the title of every entry in ``EXERCISES``, in list order."""
    titles: list[str] = []
    for exercise in EXERCISES:
        titles.append(exercise.title)
    return titles
def prefill(idx: int) -> dict:
    """Return a dict the app uses to seed every tab for exercise `idx`.

    Args:
        idx: Position of the exercise in ``EXERCISES``.

    Returns:
        A dict holding the exercise title and summary, language code and
        display name, preset key, tokenizer, the target sentence (``text``,
        ``tokens``, ``gold``), the few-shot settings and built
        ``icl_examples``, the system prompt, the user template and the model
        list. ``models`` is a fresh list, so callers may mutate it without
        affecting ``EXERCISES``.

    Raises:
        IndexError: If ``idx`` does not address an entry of ``EXERCISES``.
    """
    ex = EXERCISES[idx]
    rows = read_sandbox_tsv(corpus_file(ex.language_code, "train"), max_rows=2000)
    surfaces, gold = sandbox_sentence(rows, ex.sandbox_start, ex.n_tokens)
    text = " ".join(surfaces)
    schema = from_preset(ex.preset_key)

    icl_examples: list[ICLExample] = []
    if ex.use_few_shot:
        # Loop-invariant values, computed once instead of per example.
        # NOTE(review): assumes schema.hash() is deterministic and free of
        # side effects — confirm against schemas.py.
        schema_hash = schema.hash()
        stride = ex.n_tokens + 2

        # Build up to n_icl example sentences from slices of the same rows,
        # starting at offsets 0, stride, 2 * stride, ...
        for k in range(ex.n_icl):
            example_tokens, example_gold = sandbox_sentence(
                rows, k * stride, ex.n_tokens
            )
            if not example_tokens:
                # An empty slice ends example collection.
                break
            icl_examples.append(
                ICLExample(
                    language=ex.language_code,
                    schema_hash=schema_hash,
                    tokens=example_tokens,
                    gold_annotation={"tokens": example_gold},
                    source="sandbox",
                )
            )

    return {
        "exercise_title": ex.title,
        "exercise_summary": ex.summary,
        "language_code": ex.language_code,
        # Fall back to the raw code when no display name is registered.
        "language_name": LANGUAGES.get(ex.language_code, ex.language_code),
        "preset_key": ex.preset_key,
        "tokenizer": ex.tokenizer,
        "text": text,
        "tokens": surfaces,
        "gold": gold,
        "use_few_shot": ex.use_few_shot,
        "n_icl": ex.n_icl,
        "icl_examples": icl_examples,
        "system_prompt": DEFAULT_SYSTEM_PROMPT,
        "user_template": ex.user_template,
        # Copy so that UI state edits cannot mutate the module-level
        # exercise definition shared by later calls.
        "models": list(ex.models),
    }