german-ocr-3 / schemas /generic_document.json
Keyven's picture
Upload schemas/generic_document.json with huggingface_hub
a81a250 verified
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "german-ocr-3/schemas/generic_document.json",
"title": "GermanOCR3 Generic Document",
"description": "Allgemeines deutsches Dokument-Extraktionsschema. Felder dürfen null sein, wenn nicht eindeutig erkennbar.",
"type": "object",
"additionalProperties": false,
"required": ["document_type", "language", "raw_text", "confidence"],
"properties": {
"document_type": {
"description": "z.B. invoice, receipt, letter, form, contract, id_card, other, unknown",
"type": ["string", "null"]
},
"language": {
"description": "BCP-47 Sprachcode des Dokuments (typisch 'de').",
"type": "string",
"default": "de"
},
"sender": {
"type": ["object", "null"],
"additionalProperties": false,
"properties": {
"name": {"type": ["string", "null"]},
"address": {"type": ["string", "null"]},
"email": {"type": ["string", "null"]},
"phone": {"type": ["string", "null"]},
"tax_id": {"type": ["string", "null"]},
"vat_id": {"type": ["string", "null"]}
}
},
"recipient": {
"type": ["object", "null"],
"additionalProperties": false,
"properties": {
"name": {"type": ["string", "null"]},
"address": {"type": ["string", "null"]},
"customer_id": {"type": ["string", "null"]}
}
},
"date": {
"description": "Hauptdatum des Dokuments im Format YYYY-MM-DD, falls eindeutig erkennbar.",
"type": ["string", "null"],
"format": "date"
},
"reference_numbers": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"required": ["label", "value"],
"properties": {
"label": {"type": "string"},
"value": {"type": "string"}
}
},
"default": []
},
"amounts": {
"type": "array",
"items": {
"type": "object",
"additionalProperties": false,
"required": ["label", "value"],
"properties": {
"label": {"type": "string"},
"value": {"type": ["number", "string"]},
"currency": {"type": ["string", "null"]}
}
},
"default": []
},
"tables": {
"type": "array",
"default": [],
"items": {
"type": "object",
"additionalProperties": false,
"required": ["headers", "rows"],
"properties": {
"title": {"type": ["string", "null"]},
"headers": {"type": "array", "items": {"type": "string"}},
"rows": {
"type": "array",
"items": {
"type": "array",
"items": {"type": ["string", "number", "null"]}
}
}
}
}
},
"raw_text": {
"description": "Roher OCR-Text, möglichst layouterhaltend, deutsche Originalschreibweise behalten.",
"type": ["string", "null"]
},
"confidence": {
"description": "Subjektive Selbsteinschätzung 0..1.",
"type": ["number", "null"],
"minimum": 0,
"maximum": 1
},
"notes": {
"type": "array",
"items": {"type": "string"},
"default": []
}
}
}