| """AnnotationSchema: presets, JSON Schema builder, output validator. |
| |
| A single internal representation handles POS, NER, lemmatization, and any |
| user-defined per-token task. See README for the contract. |
| """ |
| from __future__ import annotations |
|
|
| import hashlib |
| import json |
| from dataclasses import dataclass, field, asdict |
| from typing import Any, Optional |
|
|
| from jsonschema import Draft202012Validator |
|
|
| from paths import TAGSETS_DIR, TUTORIAL_SCHEMAS_DIR, LANGUAGES, read_text |
|
|
# Aggregator names; the presets in this module set ``Field.aggregator`` to
# values drawn from this list.
AGGREGATORS = [
    "vote",
    "lcs",
    "min",
    "max",
    "priority",
    "vote_per_subfield",
]
# Field kinds; ``Field.type`` holds one of these in every preset below.
FIELD_TYPES = [
    "string",
    "enum",
    "object",
]
|
|
|
|
@dataclass
class Subfield:
    """One named entry inside an ``object``-typed field.

    The JSON Schema builder in this module turns each subfield into a string
    property: an enum when ``values`` is non-empty, a free-form string
    otherwise, with ``null`` additionally accepted when ``nullable`` is true.
    """

    # Property name of the subfield inside its parent object.
    name: str
    # Allowed values; ``None`` (or an empty list) means "any string".
    values: Optional[list[str]] = None
    # Whether ``null`` is an acceptable value for this subfield.
    nullable: bool = True
|
|
|
|
@dataclass
class Field:
    """One per-token annotation field of a schema.

    The JSON Schema builder in this module treats ``type == "enum"`` and
    ``type == "object"`` specially and every other type as a plain string.
    """

    # Property name of the field on each token.
    name: str
    # Field kind, e.g. "string", "enum" or "object".
    type: str
    # Allowed values; only consulted for "enum" fields.
    values: list[str] = field(default_factory=list)
    # Whether ``null`` is accepted (applied to string and enum fields).
    nullable: bool = True
    # Aggregator name; stored and serialized here, not interpreted in this module.
    aggregator: str = "vote"
    # Nested entries; only consulted for "object" fields.
    subfields: list[Subfield] = field(default_factory=list)
|
|
|
|
@dataclass
class AnnotationSchema:
    """A per-token annotation task: a name plus an ordered list of fields."""

    # Human-readable task name; also used in the JSON Schema title.
    task_name: str
    # Ordered per-token fields.
    fields: list[Field]
    # Free-text language label; empty when not set.
    language: str = ""
    # Free-text description of the task.
    description: str = ""

    def to_dict(self) -> dict:
        """Return the schema as a plain dict.

        Keys are ``task_name``, ``language``, ``description`` and ``fields``
        (in that order); each field is serialized with ``_field_to_dict``.
        """
        serialized_fields = [_field_to_dict(f) for f in self.fields]
        return dict(
            task_name=self.task_name,
            language=self.language,
            description=self.description,
            fields=serialized_fields,
        )

    def hash(self) -> str:
        """Return a short content fingerprint of the schema.

        The fingerprint is the first 10 hex characters of the SHA-1 digest of
        the UTF-8 encoded, key-sorted JSON dump of :meth:`to_dict`.
        """
        canonical = json.dumps(self.to_dict(), sort_keys=True, ensure_ascii=False)
        digest = hashlib.sha1(canonical.encode("utf-8")).hexdigest()
        return digest[:10]
|
|
|
|
| def _field_to_dict(f: Field) -> dict: |
| d = {"name": f.name, "type": f.type, "nullable": f.nullable, "aggregator": f.aggregator} |
| if f.type == "enum": |
| d["values"] = f.values |
| if f.type == "object": |
| d["subfields"] = [asdict(s) for s in f.subfields] |
| return d |
|
|
|
|
def _field_from_dict(d: dict) -> Field:
    """Build a ``Field`` from its dict form.

    ``name`` and ``type`` are required (``KeyError`` if absent). ``values``
    and ``subfields`` default to empty, ``nullable`` to ``True`` and
    ``aggregator`` to ``"vote"``. Each subfield dict is passed to ``Subfield``
    as keyword arguments, so unknown keys raise ``TypeError``.
    """
    # ``dict.get(key, default)`` only falls back when the key is absent; an
    # explicit JSON ``null`` would otherwise come through as ``None`` and
    # break later code that iterates these lists. ``or []`` covers both cases.
    subs = [Subfield(**s) for s in d.get("subfields") or []]
    return Field(
        name=d["name"],
        type=d["type"],
        # Copy so the new Field does not share a list with the input dict.
        values=list(d.get("values") or []),
        nullable=d.get("nullable", True),
        aggregator=d.get("aggregator", "vote"),
        subfields=subs,
    )
|
|
|
|
def schema_from_dict(d: dict) -> AnnotationSchema:
    """Build an ``AnnotationSchema`` from its dict form.

    Missing keys fall back to defaults: ``task_name`` to ``"custom"``,
    ``language`` and ``description`` to ``""``, ``fields`` to no fields.
    Each entry of ``fields`` is converted with ``_field_from_dict``.
    """
    parsed_fields = [_field_from_dict(item) for item in d.get("fields", [])]
    return AnnotationSchema(
        task_name=d.get("task_name", "custom"),
        fields=parsed_fields,
        language=d.get("language", ""),
        description=d.get("description", ""),
    )
|
|
|
|
| |
| |
| |
|
|
def to_json_schema(s: AnnotationSchema) -> dict:
    """Build the draft 2020-12 JSON Schema for one annotated sentence.

    The document is an object with required ``sentence_id``, ``language`` and
    ``tokens``. Each token is an object with a required string ``surface``
    plus one required property per schema field; extra properties are
    rejected at both levels.
    """
    schema_fields = list(s.fields)

    # Per-token properties: the fixed "surface" first, then every field in order.
    token_props: dict[str, Any] = {"surface": {"type": "string"}}
    token_props.update({f.name: _field_to_json(f) for f in schema_fields})
    required = ["surface"] + [f.name for f in schema_fields]

    token_schema = {
        "type": "object",
        "required": required,
        "additionalProperties": False,
        "properties": token_props,
    }
    sentence_props = {
        "sentence_id": {"type": "string"},
        "language": {"type": "string"},
        "tokens": {"type": "array", "items": token_schema},
    }
    return {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "title": f"{s.task_name} annotation",
        "type": "object",
        "required": ["sentence_id", "language", "tokens"],
        "additionalProperties": False,
        "properties": sentence_props,
    }
|
|
|
|
| def _field_to_json(f: Field) -> dict: |
| if f.type == "enum": |
| out: dict[str, Any] = {"type": ["string", "null"] if f.nullable else "string", "enum": list(f.values) + ([None] if f.nullable else [])} |
| return out |
| if f.type == "object": |
| props = {} |
| for sub in f.subfields: |
| t = ["string", "null"] if sub.nullable else "string" |
| if sub.values: |
| props[sub.name] = {"type": t, "enum": list(sub.values) + ([None] if sub.nullable else [])} |
| else: |
| props[sub.name] = {"type": t} |
| return {"type": "object", "additionalProperties": False, "properties": props} |
| |
| return {"type": ["string", "null"] if f.nullable else "string"} |
|
|
|
|
def validate(s: AnnotationSchema, payload: dict) -> tuple[bool, list[str]]:
    """Validate ``payload`` against the JSON Schema derived from ``s``.

    Returns ``(ok, errors)``. ``errors`` holds one ``"<path>: <message>"``
    string per validation error, where the path is the slash-joined location
    of the offending value inside ``payload``. ``ok`` is true exactly when
    ``errors`` is empty.
    """
    validator = Draft202012Validator(to_json_schema(s))
    errors = []
    for err in validator.iter_errors(payload):
        location = "/".join(str(part) for part in err.path)
        errors.append(f"{location}: {err.message}")
    return (not errors, errors)
|
|
|
|
| |
| |
| |
|
|
def _parse_tagset(lang_code: str) -> list[str]:
    """Read the tag list for ``lang_code``.

    The file is ``TAGSETS_DIR / "<lang_code>_tagset.txt"``, loaded through
    ``read_text``. Blank lines are skipped; for every other line only the
    first whitespace-delimited token is kept. Tags come back in file order.
    """
    tagset_path = TAGSETS_DIR / f"{lang_code}_tagset.txt"
    stripped_lines = (raw.strip() for raw in read_text(tagset_path).splitlines())
    # Anything after the first token on a line is ignored.
    return [entry.split(None, 1)[0] for entry in stripped_lines if entry]
|
|
|
|
def _ud_preset() -> AnnotationSchema:
    """Build the UD preset from the tutorial's ``pos_lemma_morph_schema.json``.

    The UPOS values are taken from the tutorial schema's ``upos`` enum. Each
    property listed under ``features`` becomes a nullable free-form subfield;
    only the property names are used, not their specs.
    """
    schema_path = TUTORIAL_SCHEMAS_DIR / "pos_lemma_morph_schema.json"
    tutorial = json.loads(read_text(schema_path))
    per_token = tutorial["properties"]["tokens"]["items"]["properties"]

    upos_values = per_token["upos"]["enum"]
    feature_subs = [
        Subfield(name=feature_name, values=None, nullable=True)
        for feature_name in per_token["features"]["properties"]
    ]

    schema_fields = [
        Field(name="lemma", type="string", nullable=True, aggregator="lcs"),
        Field(name="upos", type="enum", values=upos_values, nullable=False, aggregator="vote"),
        Field(name="features", type="object", subfields=feature_subs, aggregator="vote_per_subfield"),
        Field(name="confidence", type="enum", values=["low", "medium", "high"], nullable=False, aggregator="min"),
        Field(name="comment", type="string", nullable=True, aggregator="priority"),
    ]
    return AnnotationSchema(
        task_name="UD POS + lemma + morphology",
        description="Universal Dependencies UPOS + morphological features.",
        fields=schema_fields,
    )
|
|
|
|
def _tagset_preset(lang_code: str) -> AnnotationSchema:
    """Build the POS + lemma preset backed by the tagset file for ``lang_code``.

    The ``pos`` enum is whatever ``_parse_tagset`` returns. The language label
    is looked up in ``LANGUAGES`` and falls back to the code itself.
    """
    tags = _parse_tagset(lang_code)
    language_name = LANGUAGES.get(lang_code, lang_code)
    schema_fields = [
        Field(name="lemma", type="string", nullable=True, aggregator="lcs"),
        Field(name="pos", type="enum", values=tags, nullable=False, aggregator="vote"),
        Field(name="confidence", type="enum", values=["low", "medium", "high"], nullable=False, aggregator="min"),
        Field(name="comment", type="string", nullable=True, aggregator="priority"),
    ]
    return AnnotationSchema(
        task_name=f"{lang_code} POS + lemma (bespoke tagset)",
        language=language_name,
        description=f"Bespoke compound POS tags for {language_name}.",
        fields=schema_fields,
    )
|
|
|
|
def _ner_preset() -> AnnotationSchema:
    """Build the NER preset: one BIO tag per token plus confidence and comment."""
    bio_tags = [
        "O",
        "B-PER", "I-PER",
        "B-LOC", "I-LOC",
        "B-ORG", "I-ORG",
        "B-MISC", "I-MISC",
    ]
    ner = Field(name="ner", type="enum", values=bio_tags, nullable=False, aggregator="vote")
    confidence = Field(
        name="confidence",
        type="enum",
        values=["low", "medium", "high"],
        nullable=False,
        aggregator="min",
    )
    comment = Field(name="comment", type="string", nullable=True, aggregator="priority")
    return AnnotationSchema(
        task_name="Named Entity Recognition (BIO)",
        description="Per-token BIO tags for PER / LOC / ORG / MISC.",
        fields=[ner, confidence, comment],
    )
|
|
|
|
def _lemma_only_preset() -> AnnotationSchema:
    """Build the lemmatization preset: a lemma and a confidence per token."""
    lemma = Field(name="lemma", type="string", nullable=True, aggregator="lcs")
    confidence = Field(
        name="confidence",
        type="enum",
        values=["low", "medium", "high"],
        nullable=False,
        aggregator="min",
    )
    return AnnotationSchema(
        task_name="Lemmatization only",
        description="One lemma per token.",
        fields=[lemma, confidence],
    )
|
|
|
|
def _custom_preset() -> AnnotationSchema:
    """Build the starter schema for a user-defined task.

    It holds a two-value placeholder ``label`` enum and a ``confidence`` enum.
    """
    label = Field(
        name="label",
        type="enum",
        values=["LABEL_A", "LABEL_B"],
        nullable=False,
        aggregator="vote",
    )
    confidence = Field(
        name="confidence",
        type="enum",
        values=["low", "medium", "high"],
        nullable=False,
        aggregator="min",
    )
    return AnnotationSchema(
        task_name="Custom task",
        description="Define your own fields in the Schema tab.",
        fields=[label, confidence],
    )
|
|
|
|
# Registry of built-in presets: key -> (display label, zero-argument factory
# returning an AnnotationSchema). Insertion order is the order in which
# list_presets() reports them.
PRESETS = {
    # Universal Dependencies
    "ud_upos_morph": ("UD UPOS + morphology (UD-standard)", _ud_preset),
    # Per-language tagsets, each read from that language's tagset file
    "grc_tagset": ("Ancient Greek — bespoke tagset (GRC)", lambda: _tagset_preset("GRC")),
    "hye_tagset": ("Old Armenian — bespoke tagset (HYE)", lambda: _tagset_preset("HYE")),
    "kat_tagset": ("Old Georgian — bespoke tagset (KAT)", lambda: _tagset_preset("KAT")),
    "syc_tagset": ("Syriac — bespoke tagset (SYC)", lambda: _tagset_preset("SYC")),
    # Language-independent tasks
    "ner_basic": ("Named Entity Recognition (BIO)", _ner_preset),
    "lemma_only": ("Lemmatization only", _lemma_only_preset),
    "custom": ("Custom — define your own", _custom_preset),
}
|
|
|
|
def list_presets() -> list[tuple[str, str]]:
    """Return ``[(preset_key, display_label), ...]`` in ``PRESETS`` order."""
    pairs = []
    for preset_key, entry in PRESETS.items():
        label, _factory = entry
        pairs.append((preset_key, label))
    return pairs
|
|
|
|
def from_preset(key: str) -> AnnotationSchema:
    """Build a fresh schema for the preset registered under ``key``.

    Raises:
        KeyError: if ``key`` is not in ``PRESETS``; the message lists the
            known keys.
    """
    if key in PRESETS:
        factory = PRESETS[key][1]
        return factory()
    raise KeyError(f"Unknown preset: {key}. Known: {list(PRESETS)}")
|
|