File size: 4,494 Bytes
a918698
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""In-app guided exercises that prefill the workbench with sandbox data.

For each exercise, `prefill` returns a dict consumed by the Welcome tab's
"Try this" handlers, which then push the values into the tabs' state.
"""
from __future__ import annotations

from dataclasses import dataclass

from io_utils import read_sandbox_tsv, sandbox_sentence
from paths import corpus_file, LANGUAGES
from prompts import DEFAULT_SYSTEM_PROMPT, DEFAULT_ZERO_SHOT, DEFAULT_FEW_SHOT, ICLExample
from schemas import from_preset


@dataclass
class Exercise:
    """Static description of one guided exercise.

    Instances are listed in ``EXERCISES`` and turned into a tab-seeding dict by
    ``prefill``. Field order is part of the constructor's positional signature.
    """

    # Human-readable title; returned by list_exercise_titles() and as
    # "exercise_title" by prefill().
    title: str
    # Longer description; returned as "exercise_summary" by prefill().
    summary: str
    # Language key passed to corpus_file() and looked up in LANGUAGES.
    language_code: str
    # Key handed to from_preset() to obtain the annotation schema.
    preset_key: str
    # Tokenizer name; passed through unchanged in prefill()'s result.
    tokenizer: str
    # Size argument given to sandbox_sentence() for the target sentence and for
    # each few-shot example (presumably a token count — confirm in io_utils).
    n_tokens: int
    # When True, prefill() builds ICL examples from the sandbox corpus.
    use_few_shot: bool
    # Upper bound on the number of ICL examples prefill() builds.
    n_icl: int
    # Model identifiers; passed through in prefill()'s result.
    models: list[str]
    # User prompt template; passed through in prefill()'s result.
    user_template: str
    # Start argument given to sandbox_sentence() for the target sentence
    # (presumably an offset into the loaded rows — confirm in io_utils).
    sandbox_start: int = 0


# Catalogue of the guided exercises, in the order their titles are listed by
# list_exercise_titles(); prefill() addresses entries by list index.
EXERCISES = [
    # 1: single model, zero-shot, no in-context examples.
    Exercise(
        title="Exercise 1 — Greek POS, zero-shot, single model",
        summary=(
            "Annotate an Ancient Greek sentence from the historical corpus with a single "
            "model in zero-shot mode. You will see the raw output, no MoE, no ICL. "
            "This is the smallest possible loop."
        ),
        language_code="GRC",
        preset_key="grc_tagset",
        tokenizer="as_is",
        n_tokens=10,
        use_few_shot=False,
        n_icl=0,
        models=[
            "openai/gpt-oss-20b:free",
        ],
        user_template=DEFAULT_ZERO_SHOT,
    ),
    # 2: single model, few-shot with five sandbox examples.
    Exercise(
        title="Exercise 2 — Armenian POS + lemma, few-shot (5 examples)",
        summary=(
            "Annotate Old Armenian with the bespoke compound tagset. 5 validated examples "
            "are sampled from the training corpus and inserted into the prompt's "
            "{few_shot_examples} block. Compare the few-shot result with what zero-shot "
            "would give."
        ),
        language_code="HYE",
        preset_key="hye_tagset",
        tokenizer="as_is",
        n_tokens=10,
        use_few_shot=True,
        n_icl=5,
        models=[
            "mistralai/mistral-small-24b-instruct-2501",
        ],
        user_template=DEFAULT_FEW_SHOT,
        sandbox_start=200,
    ),
    # 3: three models side by side, few-shot with three sandbox examples.
    Exercise(
        title="Exercise 3 — Syriac MoE: vote, correct, re-inject",
        summary=(
            "Annotate Syriac with three models in parallel. The Run tab highlights "
            "disagreements. Correct the contested tokens in Review, click "
            "'Add to ICL pool', then re-run on a new sentence — the corrections appear "
            "in the rendered prompt's few-shot block, closing the bootstrap loop."
        ),
        language_code="SYC",
        preset_key="syc_tagset",
        tokenizer="as_is",
        n_tokens=12,
        use_few_shot=True,
        n_icl=3,
        models=[
            "meta-llama/llama-3.3-70b-instruct:free",
            "qwen/qwen3-next-80b-a3b-instruct:free",
            "deepseek/deepseek-v4-flash:free",
        ],
        user_template=DEFAULT_FEW_SHOT,
        sandbox_start=50,
    ),
]


def list_exercise_titles() -> list[str]:
    """Return the title of every entry in ``EXERCISES``, in list order."""
    titles: list[str] = []
    for exercise in EXERCISES:
        titles.append(exercise.title)
    return titles


def prefill(idx: int) -> dict:
    """Return a dict the app uses to seed every tab for exercise `idx`.

    Args:
        idx: Position of the exercise in ``EXERCISES``. Standard list indexing
            applies, so a negative value counts from the end.

    Returns:
        A dict with the keys ``exercise_title``, ``exercise_summary``,
        ``language_code``, ``language_name``, ``preset_key``, ``tokenizer``,
        ``text``, ``tokens``, ``gold``, ``use_few_shot``, ``n_icl``,
        ``icl_examples``, ``system_prompt``, ``user_template`` and ``models``.
        ``models`` is a fresh list, so callers may mutate it freely.
        ``n_icl`` is the configured count; ``icl_examples`` may hold fewer
        entries if the sandbox runs out of data.

    Raises:
        IndexError: If ``idx`` is out of range for ``EXERCISES``.
    """
    ex = EXERCISES[idx]
    rows = read_sandbox_tsv(corpus_file(ex.language_code, "train"), max_rows=2000)
    surfaces, gold = sandbox_sentence(rows, ex.sandbox_start, ex.n_tokens)

    schema = from_preset(ex.preset_key)
    icl_examples: list[ICLExample] = []
    if ex.use_few_shot and ex.n_icl > 0:
        # The schema does not change inside the loop, so hash it once.
        schema_hash = schema.hash()
        # Example windows start at 0 and advance by n_tokens + 2 per example.
        # NOTE(review): the windows are not checked against the target window
        # at sandbox_start; presumably a small sandbox_start could make an
        # example overlap the target sentence — confirm against
        # sandbox_sentence's semantics.
        stride = ex.n_tokens + 2
        for k in range(ex.n_icl):
            example_tokens, example_gold = sandbox_sentence(rows, k * stride, ex.n_tokens)
            if not example_tokens:
                # Nothing came back for this window; stop with what we have.
                break
            icl_examples.append(
                ICLExample(
                    language=ex.language_code,
                    schema_hash=schema_hash,
                    tokens=example_tokens,
                    gold_annotation={"tokens": example_gold},
                    source="sandbox",
                )
            )

    return {
        "exercise_title": ex.title,
        "exercise_summary": ex.summary,
        "language_code": ex.language_code,
        "language_name": LANGUAGES.get(ex.language_code, ex.language_code),
        "preset_key": ex.preset_key,
        "tokenizer": ex.tokenizer,
        "text": " ".join(surfaces),
        "tokens": surfaces,
        "gold": gold,
        "use_few_shot": ex.use_few_shot,
        "n_icl": ex.n_icl,
        "icl_examples": icl_examples,
        "system_prompt": DEFAULT_SYSTEM_PROMPT,
        "user_template": ex.user_template,
        # Copy so that handlers editing the model list in tab state cannot
        # alter the module-level exercise definition.
        "models": list(ex.models),
    }