{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "german-ocr-3/schemas/generic_document.json",
"title": "GermanOCR3 Generic Document",
"description": "Allgemeines deutsches Dokument-Extraktionsschema. Felder dürfen null sein, wenn nicht eindeutig erkennbar.",
"type": "object",
"additionalProperties": false,
"required": ["document_type", "language", "raw_text", "confidence"],
"properties": {
"document_type": {
"description": "z.B. invoice, receipt, letter, form, contract, id_card, other, unknown",
"type": ["string", "null"]
},
"language": {
"description": "BCP-47 Sprachcode des Dokuments (typisch 'de').",
"type": "string",
"default": "de"
},
"sender": {
"type": ["object", "null"],
"additionalProperties": false,
"properties": {
"name": {"type": ["string", "null"]},
"address": {"type": ["string", "null"]},
"email": {"type": ["string", "null"]},
"phone": {"type": ["string", "null"]},
"tax_id": {"type": ["string", "null"]},
"vat_id": {"type": ["string", "null"]}
}
},
"recipient": {
"type": ["object", "null"],
"additionalProperties": false,
"properties": {
"name": {"type": ["string", "null"]},
"address": {"type": ["string", "null"]},
"customer_id": {"type": ["string", "null"]}
}
},
"date": {
"description": "Hauptdatum des Dokuments im Format YYYY-MM-DD, falls eindeutig erkennbar.",
"type": ["string", "null"]
},
"reference_numbers": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"required": ["label", "value"],
"properties": {
"label": {"type": "string"},
"value": {"type": "string"}
}
},
"default": []
},
"amounts": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"required": ["label", "value"],
"properties": {
"label": {"type": "string"},
"value": {"type": ["number", "string"]},
"currency": {"type": ["string", "null"]}
}
},
"default": []
},
"tables": {
"type": "array",
"default": [],
"items": {
"type": "object",
"additionalProperties": false,
"required": ["headers", "rows"],
"properties": {
"title": {"type": ["string", "null"]},
"headers": {"type": "array", "items": {"type": "string"}},
"rows": {
"type": "array",
"items": {
"type": "array",
"items": {"type": ["string", "number", "null"]}
}
}
}
}
},
"raw_text": {
"description": "Roher OCR-Text, möglichst layouterhaltend, deutsche Originalschreibweise behalten.",
"type": ["string", "null"]
},
"confidence": {
"description": "Subjektive Selbsteinschätzung 0..1.",
"type": ["number", "null"],
"minimum": 0,
"maximum": 1
},
"notes": {
"type": "array",
"items": {"type": "string"},
"default": []
}
}
}