Spaces:

ENC-PSL
/

lrec2026-llm-annotator

Running

lrec2026-llm-annotator

File size: 4,494 Bytes

a918698

"""In-app guided exercises that prefill the workbench with sandbox data.

Each exercise is turned by `prefill()` into a dict consumed by the Welcome tab's
"Try this" handlers, which then push the values into the tabs' state.
"""
from __future__ import annotations

from dataclasses import dataclass

from io_utils import read_sandbox_tsv, sandbox_sentence
from paths import corpus_file, LANGUAGES
from prompts import DEFAULT_SYSTEM_PROMPT, DEFAULT_ZERO_SHOT, DEFAULT_FEW_SHOT, ICLExample
from schemas import from_preset


@dataclass
class Exercise:
    """Static description of one guided exercise.

    Instances are listed in ``EXERCISES`` and read by ``prefill()``, which
    turns one of them into the dict used to seed the tabs.
    """

    # Human-readable title; returned by list_exercise_titles() and exposed
    # by prefill() as "exercise_title".
    title: str
    # Explanatory text exposed by prefill() as "exercise_summary".
    summary: str
    # Language code passed to corpus_file(..., "train"), looked up in
    # LANGUAGES, and stamped on every ICL example.
    language_code: str
    # Key handed to from_preset() to obtain the annotation schema.
    preset_key: str
    # Tokenizer name, passed through unchanged by prefill().
    tokenizer: str
    # Third argument given to sandbox_sentence() for the target sentence and
    # for each ICL example (presumably a token count — confirm in io_utils).
    n_tokens: int
    # When true, prefill() builds ICL examples from the sandbox corpus.
    use_few_shot: bool
    # Upper bound on the number of ICL examples prefill() builds.
    n_icl: int
    # Model identifier strings, passed through by prefill() as "models".
    models: list[str]
    # Prompt template exposed by prefill() as "user_template".
    user_template: str
    # Second argument given to sandbox_sentence() when extracting the target
    # sentence (presumably a row offset into the corpus — confirm in io_utils).
    sandbox_start: int = 0


# Guided exercises, in the order they are offered to the user.
EXERCISES = [
    # 1. Single model, zero-shot.
    Exercise(
        title="Exercise 1 — Greek POS, zero-shot, single model",
        summary=(
            "Annotate an Ancient Greek sentence from the historical corpus "
            "with a single model in zero-shot mode. "
            "You will see the raw output, no MoE, no ICL. "
            "This is the smallest possible loop."
        ),
        language_code="GRC",
        preset_key="grc_tagset",
        tokenizer="as_is",
        n_tokens=10,
        use_few_shot=False,
        n_icl=0,
        models=[
            "openai/gpt-oss-20b:free",
        ],
        user_template=DEFAULT_ZERO_SHOT,
    ),
    # 2. Single model, few-shot with five sandbox examples.
    Exercise(
        title="Exercise 2 — Armenian POS + lemma, few-shot (5 examples)",
        summary=(
            "Annotate Old Armenian with the bespoke compound tagset. "
            "5 validated examples are sampled from the training corpus and "
            "inserted into the prompt's {few_shot_examples} block. "
            "Compare the few-shot result with what zero-shot would give."
        ),
        language_code="HYE",
        preset_key="hye_tagset",
        tokenizer="as_is",
        n_tokens=10,
        use_few_shot=True,
        n_icl=5,
        models=[
            "mistralai/mistral-small-24b-instruct-2501",
        ],
        user_template=DEFAULT_FEW_SHOT,
        sandbox_start=200,
    ),
    # 3. Three models in parallel, few-shot with three sandbox examples.
    Exercise(
        title="Exercise 3 — Syriac MoE: vote, correct, re-inject",
        summary=(
            "Annotate Syriac with three models in parallel. "
            "The Run tab highlights disagreements. "
            "Correct the contested tokens in Review, click 'Add to ICL pool', "
            "then re-run on a new sentence — the corrections appear in the "
            "rendered prompt's few-shot block, closing the bootstrap loop."
        ),
        language_code="SYC",
        preset_key="syc_tagset",
        tokenizer="as_is",
        n_tokens=12,
        use_few_shot=True,
        n_icl=3,
        models=[
            "meta-llama/llama-3.3-70b-instruct:free",
            "qwen/qwen3-next-80b-a3b-instruct:free",
            "deepseek/deepseek-v4-flash:free",
        ],
        user_template=DEFAULT_FEW_SHOT,
        sandbox_start=50,
    ),
]


def list_exercise_titles() -> list[str]:
    """Return the title of every exercise, in ``EXERCISES`` order."""
    titles: list[str] = []
    for exercise in EXERCISES:
        titles.append(exercise.title)
    return titles


def prefill(idx: int) -> dict:
    """Return a dict the app uses to seed every tab for exercise `idx`.

    Args:
        idx: Position of the exercise in ``EXERCISES``. Negative values index
            from the end, as with any list.

    Returns:
        A dict holding the exercise metadata (title, summary, language code
        and name, preset key, tokenizer), the target sentence (``text``,
        ``tokens``, ``gold``), the few-shot settings and examples, the default
        system prompt, the exercise's user template and a copy of its model
        list.

    Raises:
        IndexError: If ``idx`` is outside ``EXERCISES``.
    """
    ex = EXERCISES[idx]
    rows = read_sandbox_tsv(corpus_file(ex.language_code, "train"), max_rows=2000)
    surfaces, gold = sandbox_sentence(rows, ex.sandbox_start, ex.n_tokens)
    text = " ".join(surfaces)

    schema = from_preset(ex.preset_key)
    icl_examples: list[ICLExample] = []
    if ex.use_few_shot and ex.n_icl > 0:
        # The schema does not change between examples, so hash it once.
        schema_hash = schema.hash()
        # Example windows start n_tokens + 2 apart, beginning at 0.
        # NOTE(review): the windows are not checked against ex.sandbox_start,
        # so an exercise with a small sandbox_start could reuse the target
        # sentence as an example — confirm against sandbox_sentence().
        stride = ex.n_tokens + 2
        for k in range(ex.n_icl):
            s2, g2 = sandbox_sentence(rows, k * stride, ex.n_tokens)
            if not s2:
                # Nothing left to sample; keep the examples built so far.
                break
            icl_examples.append(
                ICLExample(
                    language=ex.language_code,
                    schema_hash=schema_hash,
                    tokens=s2,
                    gold_annotation={"tokens": g2},
                    source="sandbox",
                )
            )

    return {
        "exercise_title": ex.title,
        "exercise_summary": ex.summary,
        "language_code": ex.language_code,
        # Fall back to the raw code when the language has no display name.
        "language_name": LANGUAGES.get(ex.language_code, ex.language_code),
        "preset_key": ex.preset_key,
        "tokenizer": ex.tokenizer,
        "text": text,
        "tokens": surfaces,
        "gold": gold,
        "use_few_shot": ex.use_few_shot,
        "n_icl": ex.n_icl,
        "icl_examples": icl_examples,
        "system_prompt": DEFAULT_SYSTEM_PROMPT,
        "user_template": ex.user_template,
        # Copy so that a caller mutating the returned list cannot alter the
        # module-level EXERCISES entry shared by every later call.
        "models": list(ex.models),
    }