Spaces:
Build error
Build error
Claude committed on
Sprint 1 — Setup initial : arborescence, schémas Pydantic, profils JSON, prompts, tests
Browse files
- Structure canonique complète (backend/app/, profiles/, prompts/, infra/)
- Schémas Pydantic v2 : CorpusProfile, PageMaster, AnnotationLayer
- 4 profils JSON (medieval-illuminated, medieval-textual, early-modern-print, modern-handwritten)
- 9 templates de prompts avec variables {{profile_label}}, {{language_hints}}, {{script_type}}
- 54 tests pytest : 100% passed (test_schemas.py + test_profiles.py)
- backend/pyproject.toml avec dépendances stack technique
- .gitignore avec exclusion du dossier data/
https://claude.ai/code/session_018woyEHc8HG2th7V4ewJ4Kg
- .gitignore +40 -0
- backend/app/__init__.py +0 -0
- backend/app/api/v1/.gitkeep +0 -0
- backend/app/models/.gitkeep +0 -0
- backend/app/schemas/__init__.py +0 -0
- backend/app/schemas/annotation.py +32 -0
- backend/app/schemas/corpus_profile.py +54 -0
- backend/app/schemas/page_master.py +108 -0
- backend/app/services/__init__.py +0 -0
- backend/app/services/ai/.gitkeep +0 -0
- backend/app/services/export/.gitkeep +0 -0
- backend/app/services/image/.gitkeep +0 -0
- backend/app/services/ingest/.gitkeep +0 -0
- backend/app/services/search/.gitkeep +0 -0
- backend/pyproject.toml +34 -0
- backend/tests/__init__.py +0 -0
- backend/tests/test_profiles.py +121 -0
- backend/tests/test_schemas.py +302 -0
- infra/.gitkeep +0 -0
- profiles/early-modern-print.json +26 -0
- profiles/medieval-illuminated.json +34 -0
- profiles/medieval-textual.json +31 -0
- profiles/modern-handwritten.json +25 -0
- prompts/early-modern-print/primary_v1.txt +41 -0
- prompts/medieval-illuminated/commentary_v1.txt +34 -0
- prompts/medieval-illuminated/iconography_v1.txt +34 -0
- prompts/medieval-illuminated/primary_v1.txt +41 -0
- prompts/medieval-illuminated/translation_v1.txt +22 -0
- prompts/medieval-textual/commentary_v1.txt +34 -0
- prompts/medieval-textual/primary_v1.txt +41 -0
- prompts/medieval-textual/translation_v1.txt +22 -0
- prompts/modern-handwritten/primary_v1.txt +41 -0
.gitignore
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data — jamais versionné (CLAUDE.md section 3)
|
| 2 |
+
data/
|
| 3 |
+
|
| 4 |
+
# Python
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.py[cod]
|
| 7 |
+
*.pyo
|
| 8 |
+
*.pyd
|
| 9 |
+
.Python
|
| 10 |
+
*.egg-info/
|
| 11 |
+
dist/
|
| 12 |
+
build/
|
| 13 |
+
.eggs/
|
| 14 |
+
*.egg
|
| 15 |
+
.venv/
|
| 16 |
+
venv/
|
| 17 |
+
env/
|
| 18 |
+
.env
|
| 19 |
+
|
| 20 |
+
# pytest / coverage
|
| 21 |
+
.pytest_cache/
|
| 22 |
+
.coverage
|
| 23 |
+
htmlcov/
|
| 24 |
+
.tox/
|
| 25 |
+
|
| 26 |
+
# IDE
|
| 27 |
+
.vscode/
|
| 28 |
+
.idea/
|
| 29 |
+
*.swp
|
| 30 |
+
*.swo
|
| 31 |
+
|
| 32 |
+
# OS
|
| 33 |
+
.DS_Store
|
| 34 |
+
Thumbs.db
|
| 35 |
+
|
| 36 |
+
# Secrets — jamais versionné (CLAUDE.md R06)
|
| 37 |
+
.env.local
|
| 38 |
+
.env.*.local
|
| 39 |
+
*.key
|
| 40 |
+
secrets/
|
backend/app/__init__.py
ADDED
|
File without changes
|
backend/app/api/v1/.gitkeep
ADDED
|
File without changes
|
backend/app/models/.gitkeep
ADDED
|
File without changes
|
backend/app/schemas/__init__.py
ADDED
|
File without changes
|
backend/app/schemas/annotation.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Schémas Pydantic pour les couches d'annotation de page.
|
| 3 |
+
"""
|
| 4 |
+
# 1. stdlib
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from enum import Enum
|
| 7 |
+
|
| 8 |
+
# 2. third-party
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
|
| 11 |
+
# 3. local
|
| 12 |
+
from app.schemas.corpus_profile import LayerType
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LayerStatus(str, Enum):
    """Processing state of an annotation layer.

    Members mix in ``str``, so each one compares equal to its lowercase value.
    """

    PENDING = "pending"
    RUNNING = "running"
    DONE = "done"
    FAILED = "failed"
    NEEDS_REVIEW = "needs_review"
    VALIDATED = "validated"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class AnnotationLayer(BaseModel):
    """A single annotation layer attached to a page.

    ``id``, ``page_id``, ``layer_type`` and ``created_at`` are required;
    every other field has a default.
    """

    # Identity of the layer and of the page it belongs to.
    id: str
    page_id: str
    layer_type: LayerType

    # Lifecycle: a new layer starts as PENDING at version 1.
    status: LayerStatus = LayerStatus.PENDING
    version: int = 1

    # Provenance; both stay None unless the caller supplies them.
    source_model: str | None = None
    prompt_version: str | None = None

    created_at: datetime
|
backend/app/schemas/corpus_profile.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Schémas Pydantic pour le profil de corpus — entité centrale du pipeline.
|
| 3 |
+
"""
|
| 4 |
+
# 1. stdlib
|
| 5 |
+
from enum import Enum
|
| 6 |
+
|
| 7 |
+
# 2. third-party
|
| 8 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class LayerType(str, Enum):
    """Closed set of layer kinds a corpus profile can activate.

    Members mix in ``str``, so each one compares equal to its value.
    """

    IMAGE = "image"

    # Transcription layers.
    OCR_DIPLOMATIC = "ocr_diplomatic"
    OCR_NORMALIZED = "ocr_normalized"

    # Translation layers.
    TRANSLATION_FR = "translation_fr"
    TRANSLATION_EN = "translation_en"

    # Summary and commentary layers.
    SUMMARY = "summary"
    SCHOLARLY_COMMENTARY = "scholarly_commentary"
    PUBLIC_COMMENTARY = "public_commentary"

    ICONOGRAPHY_DETECTION = "iconography_detection"
    MATERIAL_NOTES = "material_notes"
    UNCERTAINTY = "uncertainty"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class ScriptType(str, Enum):
    """Script family declared by a corpus profile.

    Members mix in ``str``, so each one compares equal to its value.
    """

    CAROLINE = "caroline"
    GOTHIC = "gothic"
    PRINT = "print"
    CURSIVE = "cursive"
    OTHER = "other"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class ExportConfig(BaseModel):
    """Per-format export switches for a corpus profile.

    METS and ALTO default to on; TEI defaults to off.
    """

    mets: bool = True
    alto: bool = True
    tei: bool = False
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class UncertaintyConfig(BaseModel):
    """Confidence thresholds of a corpus profile, each constrained to [0.0, 1.0].

    NOTE(review): nothing here enforces ``min_acceptable <= flag_below``;
    the profile test suite asserts that ordering for the shipped profiles,
    so confirm whether it should become a model-level rule.
    """

    flag_below: float = Field(default=0.4, ge=0.0, le=1.0)
    min_acceptable: float = Field(default=0.25, ge=0.0, le=1.0)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class CorpusProfile(BaseModel):
    """Corpus profile: languages, script, active layers, prompts and export options.

    The model is declared frozen, so assigning to an attribute of an
    instance is rejected. All fields are required.
    """

    model_config = ConfigDict(frozen=True)

    # Identification.
    profile_id: str
    label: str

    # Description of the source material.
    language_hints: list[str]
    script_type: ScriptType

    # Pipeline configuration. ``prompt_templates`` maps a template key to a
    # path string.
    active_layers: list[LayerType]
    prompt_templates: dict[str, str]
    uncertainty_config: UncertaintyConfig
    export_config: ExportConfig
|
backend/app/schemas/page_master.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Schémas Pydantic pour le JSON maître de page — source canonique de toutes les sorties.
|
| 3 |
+
"""
|
| 4 |
+
# 1. stdlib
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from enum import Enum
|
| 7 |
+
from typing import Any, Literal
|
| 8 |
+
|
| 9 |
+
# 2. third-party
|
| 10 |
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class RegionType(str, Enum):
    """Kind of layout region found on a page.

    Members mix in ``str``, so each one compares equal to its value.
    """

    TEXT_BLOCK = "text_block"
    MINIATURE = "miniature"
    DECORATED_INITIAL = "decorated_initial"
    MARGIN = "margin"
    RUBRIC = "rubric"
    OTHER = "other"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class Region(BaseModel):
    """A layout region with a bounding box and a confidence in [0.0, 1.0].

    ``bbox`` must hold exactly four integers. The validator below treats
    the third and fourth entries as width and height.
    """

    id: str
    type: RegionType
    bbox: list[int] = Field(..., min_length=4, max_length=4)
    confidence: float = Field(..., ge=0.0, le=1.0)

    # Optional extras; ``polygon`` is not checked beyond its type.
    polygon: list[list[int]] | None = None
    parent_region_id: str | None = None

    @field_validator("bbox")
    @classmethod
    def bbox_must_be_positive(cls, v: list[int]) -> list[int]:
        """Reject negative entries and a non-positive width or height.

        Raises:
            ValueError: if any entry is below zero, or if the third or
                fourth entry is not strictly positive.
        """
        for coordinate in v:
            if coordinate < 0:
                raise ValueError("bbox values must be >= 0")
        width, height = v[2], v[3]
        if width <= 0 or height <= 0:
            raise ValueError("bbox width and height must be > 0")
        return v
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class OCRResult(BaseModel):
    """OCR output for a page; every field has a default.

    ``language`` defaults to ``"la"`` and ``confidence`` is constrained to
    [0.0, 1.0].
    """

    diplomatic_text: str = ""

    # Structural detail, kept as untyped dicts.
    blocks: list[dict] = []
    lines: list[dict] = []

    language: str = "la"
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    uncertain_segments: list[str] = []
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class Translation(BaseModel):
    """Translations of a page, keyed by language; both default to an empty string."""

    fr: str = ""
    en: str = ""
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class CommentaryClaim(BaseModel):
    """A single claim made in a commentary.

    ``certainty`` accepts only the four literal levels listed below and
    defaults to ``"medium"``.
    """

    claim: str
    evidence_region_ids: list[str] = []
    certainty: Literal["high", "medium", "low", "speculative"] = "medium"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class Commentary(BaseModel):
    """Public and scholarly commentary texts plus their individual claims.

    Both texts default to an empty string and ``claims`` to an empty list.
    """

    public: str = ""
    scholarly: str = ""
    claims: list[CommentaryClaim] = []
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class ProcessingInfo(BaseModel):
    """Record of the model run that produced a page's content.

    Every field is required except ``cost_estimate_usd``.
    """

    # Model and prompt used for the run.
    model_id: str
    model_display_name: str
    prompt_version: str

    # Path string of the stored raw response, and the processing time.
    raw_response_path: str
    processed_at: datetime

    cost_estimate_usd: float | None = None
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class EditorialStatus(str, Enum):
    """Editorial state of a page.

    Members mix in ``str``, so each one compares equal to its value.
    """

    MACHINE_DRAFT = "machine_draft"
    NEEDS_REVIEW = "needs_review"
    REVIEWED = "reviewed"
    VALIDATED = "validated"
    PUBLISHED = "published"
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class EditorialInfo(BaseModel):
    """Editorial bookkeeping for a page; every field has a default.

    NOTE(review): ``validated`` and ``status`` are independent fields and
    nothing here keeps them in agreement.
    """

    status: EditorialStatus = EditorialStatus.MACHINE_DRAFT

    # Validation flag and, optionally, who validated.
    validated: bool = False
    validated_by: str | None = None

    version: int = 1
    notes: list[str] = []
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class PageMaster(BaseModel):
    """Master record for one page.

    The identification fields, ``image`` and ``layout`` are required.
    Content sections default to ``None`` and ``editorial`` defaults to a
    fresh ``EditorialInfo``.
    """

    schema_version: str = "1.0"

    # Identification. NOTE(review): ``corpus_profile`` is a plain string,
    # presumably a profile id; confirm against callers.
    page_id: str
    corpus_profile: str
    manuscript_id: str
    folio_label: str
    sequence: int

    # Required sections, kept as untyped dicts.
    image: dict
    layout: dict

    # Optional content sections.
    ocr: OCRResult | None = None
    translation: Translation | None = None
    summary: dict | None = None
    commentary: Commentary | None = None
    extensions: dict[str, Any] = {}

    # Provenance and editorial state.
    processing: ProcessingInfo | None = None
    editorial: EditorialInfo = Field(default_factory=EditorialInfo)
|
backend/app/services/__init__.py
ADDED
|
File without changes
|
backend/app/services/ai/.gitkeep
ADDED
|
File without changes
|
backend/app/services/export/.gitkeep
ADDED
|
File without changes
|
backend/app/services/image/.gitkeep
ADDED
|
File without changes
|
backend/app/services/ingest/.gitkeep
ADDED
|
File without changes
|
backend/app/services/search/.gitkeep
ADDED
|
File without changes
|
backend/pyproject.toml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=61", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "scriptorium-ai-backend"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Backend Scriptorium AI — plateforme générique d'éditions savantes augmentées"
|
| 9 |
+
requires-python = ">=3.11"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"fastapi>=0.111",
|
| 12 |
+
"uvicorn[standard]>=0.29",
|
| 13 |
+
"pydantic>=2.7",
|
| 14 |
+
"sqlalchemy>=2.0",
|
| 15 |
+
"aiosqlite>=0.20",
|
| 16 |
+
"google-generativeai>=0.3",
|
| 17 |
+
"lxml>=5.2",
|
| 18 |
+
"Pillow>=10.3",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
[project.optional-dependencies]
|
| 22 |
+
dev = [
|
| 23 |
+
"pytest>=8.2",
|
| 24 |
+
"pytest-cov>=5.0",
|
| 25 |
+
"pytest-asyncio>=0.23",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
[tool.pytest.ini_options]
|
| 29 |
+
asyncio_mode = "auto"
|
| 30 |
+
testpaths = ["tests"]
|
| 31 |
+
|
| 32 |
+
[tool.setuptools.packages.find]
|
| 33 |
+
where = ["."]
|
| 34 |
+
include = ["app*"]
|
backend/tests/__init__.py
ADDED
|
File without changes
|
backend/tests/test_profiles.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests de chargement et validation des profils JSON — un test par profil.
|
| 3 |
+
"""
|
| 4 |
+
# 1. stdlib
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
# 2. third-party
|
| 9 |
+
import pytest
|
| 10 |
+
from pydantic import ValidationError
|
| 11 |
+
|
| 12 |
+
# 3. local
|
| 13 |
+
from app.schemas.corpus_profile import CorpusProfile, LayerType, ScriptType
|
| 14 |
+
|
| 15 |
+
# "profiles" directory, three directory levels above this test module.
PROFILES_DIR = Path(__file__).parent.parent.parent / "profiles"

# File names of the profiles exercised by the parametrized tests.
PROFILE_FILES = [
    f"{stem}.json"
    for stem in (
        "medieval-illuminated",
        "medieval-textual",
        "early-modern-print",
        "modern-handwritten",
    )
]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def load_profile(filename: str) -> CorpusProfile:
    """Read ``filename`` from ``PROFILES_DIR`` and validate it as a ``CorpusProfile``.

    The file is opened as UTF-8 and parsed as JSON before validation.
    """
    with (PROFILES_DIR / filename).open(encoding="utf-8") as handle:
        raw = json.load(handle)
    return CorpusProfile.model_validate(raw)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Tests de chargement
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
def test_medieval_illuminated_loads():
    """The medieval-illuminated profile loads with its id and the Caroline script."""
    loaded = load_profile("medieval-illuminated.json")
    assert loaded.profile_id == "medieval-illuminated"
    assert loaded.script_type == ScriptType.CAROLINE
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_medieval_textual_loads():
    """The medieval-textual profile loads with its id and the Gothic script."""
    loaded = load_profile("medieval-textual.json")
    assert loaded.profile_id == "medieval-textual"
    assert loaded.script_type == ScriptType.GOTHIC
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_early_modern_print_loads():
    """The early-modern-print profile loads with its id and the print script."""
    loaded = load_profile("early-modern-print.json")
    assert loaded.profile_id == "early-modern-print"
    assert loaded.script_type == ScriptType.PRINT
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_modern_handwritten_loads():
    """The modern-handwritten profile loads with its id and the cursive script."""
    loaded = load_profile("modern-handwritten.json")
    assert loaded.profile_id == "modern-handwritten"
    assert loaded.script_type == ScriptType.CURSIVE
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# ---------------------------------------------------------------------------
|
| 60 |
+
# Tests de cohérence
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
|
| 63 |
+
@pytest.mark.parametrize("filename", PROFILE_FILES)
def test_profile_has_required_fields(filename: str):
    """Every profile has a non-empty id, label, hints, layers and a primary prompt."""
    loaded = load_profile(filename)
    assert loaded.profile_id
    assert loaded.label
    # A non-empty list is truthy, i.e. it holds at least one entry.
    assert loaded.language_hints
    assert loaded.active_layers
    assert "primary" in loaded.prompt_templates
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@pytest.mark.parametrize("filename", PROFILE_FILES)
def test_profile_active_layers_are_valid_layer_types(filename: str):
    """Each active layer's value is one of the ``LayerType`` values."""
    loaded = load_profile(filename)
    known_values = {member.value for member in LayerType}
    assert all(layer.value in known_values for layer in loaded.active_layers)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
@pytest.mark.parametrize("filename", PROFILE_FILES)
def test_profile_uncertainty_config_bounds(filename: str):
    """Both thresholds lie in [0, 1] and ``min_acceptable`` does not exceed ``flag_below``."""
    thresholds = load_profile(filename).uncertainty_config
    assert 0.0 <= thresholds.flag_below <= 1.0
    assert 0.0 <= thresholds.min_acceptable <= 1.0
    assert thresholds.min_acceptable <= thresholds.flag_below
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
@pytest.mark.parametrize("filename", PROFILE_FILES)
def test_profile_is_frozen(filename: str):
    """Assigning to an attribute of a loaded profile raises."""
    loaded = load_profile(filename)
    with pytest.raises((TypeError, ValidationError)):
        setattr(loaded, "label", "Hacked")
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@pytest.mark.parametrize("filename", PROFILE_FILES)
def test_profile_prompt_templates_point_to_txt_files(filename: str):
    """Every template path starts with ``prompts/`` and ends with ``.txt``.

    Only the path strings are inspected; the files themselves are not opened.
    """
    templates = load_profile(filename).prompt_templates
    for name, location in templates.items():
        assert location.endswith(".txt"), f"Template '{name}' doit pointer vers un .txt"
        assert location.startswith("prompts/"), f"Template '{name}' doit être dans prompts/"
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def test_medieval_illuminated_has_iconography():
    """The illuminated profile activates iconography detection."""
    layers = load_profile("medieval-illuminated.json").active_layers
    assert LayerType.ICONOGRAPHY_DETECTION in layers
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def test_medieval_illuminated_has_iconography_prompt():
    """The illuminated profile declares an ``iconography`` prompt template."""
    templates = load_profile("medieval-illuminated.json").prompt_templates
    assert "iconography" in templates
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_early_modern_print_no_iconography():
    """The early-modern-print profile does not activate iconography detection."""
    layers = load_profile("early-modern-print.json").active_layers
    assert LayerType.ICONOGRAPHY_DETECTION not in layers
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def test_modern_handwritten_no_iconography():
    """The modern-handwritten profile does not activate iconography detection."""
    layers = load_profile("modern-handwritten.json").active_layers
    assert LayerType.ICONOGRAPHY_DETECTION not in layers
|
backend/tests/test_schemas.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests des schémas Pydantic — corpus_profile, page_master, annotation.
|
| 3 |
+
"""
|
| 4 |
+
# 1. stdlib
|
| 5 |
+
from datetime import datetime, timezone
|
| 6 |
+
|
| 7 |
+
# 2. third-party
|
| 8 |
+
import pytest
|
| 9 |
+
from pydantic import ValidationError
|
| 10 |
+
|
| 11 |
+
# 3. local
|
| 12 |
+
from app.schemas.corpus_profile import (
|
| 13 |
+
CorpusProfile,
|
| 14 |
+
ExportConfig,
|
| 15 |
+
LayerType,
|
| 16 |
+
ScriptType,
|
| 17 |
+
UncertaintyConfig,
|
| 18 |
+
)
|
| 19 |
+
from app.schemas.page_master import (
|
| 20 |
+
Commentary,
|
| 21 |
+
CommentaryClaim,
|
| 22 |
+
EditorialInfo,
|
| 23 |
+
EditorialStatus,
|
| 24 |
+
OCRResult,
|
| 25 |
+
PageMaster,
|
| 26 |
+
ProcessingInfo,
|
| 27 |
+
Region,
|
| 28 |
+
RegionType,
|
| 29 |
+
Translation,
|
| 30 |
+
)
|
| 31 |
+
from app.schemas.annotation import AnnotationLayer, LayerStatus
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
# Fixtures
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
|
| 38 |
+
@pytest.fixture
def minimal_corpus_profile() -> dict:
    """Return a raw dict carrying every field ``CorpusProfile`` requires."""
    payload = {
        "profile_id": "test-profile",
        "label": "Test Profile",
        "language_hints": ["la"],
        "script_type": "caroline",
        "active_layers": ["ocr_diplomatic", "translation_fr"],
        "prompt_templates": {"primary": "prompts/test/primary_v1.txt"},
    }
    payload["uncertainty_config"] = {"flag_below": 0.4, "min_acceptable": 0.25}
    payload["export_config"] = {"mets": True, "alto": True, "tei": False}
    return payload
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@pytest.fixture
def minimal_page_master() -> dict:
    """Return a raw dict carrying only the fields ``PageMaster`` requires."""
    image_section = {
        "master": "data/corpora/test/masters/0001r.tif",
        "derivative_web": "data/corpora/test/derivatives/0001r.jpg",
        "iiif_base": "",
        "width": 2000,
        "height": 3000,
    }
    return {
        "page_id": "test-corpus-0001r",
        "corpus_profile": "test-profile",
        "manuscript_id": "ms-test-001",
        "folio_label": "0001r",
        "sequence": 1,
        "image": image_section,
        "layout": {"regions": []},
    }
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@pytest.fixture
def valid_region() -> dict:
    """Return a raw dict for a text-block region that passes validation."""
    return dict(
        id="r1",
        type="text_block",
        bbox=[10, 20, 300, 400],
        confidence=0.95,
    )
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ---------------------------------------------------------------------------
|
| 82 |
+
# Tests — CorpusProfile
|
| 83 |
+
# ---------------------------------------------------------------------------
|
| 84 |
+
|
| 85 |
+
def test_corpus_profile_valid(minimal_corpus_profile):
    """A minimal payload validates and its strings are converted to enum members."""
    built = CorpusProfile.model_validate(minimal_corpus_profile)
    assert built.profile_id == "test-profile"
    assert built.script_type == ScriptType.CAROLINE
    assert LayerType.OCR_DIPLOMATIC in built.active_layers
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def test_corpus_profile_is_frozen(minimal_corpus_profile):
    """Assigning to an attribute of a validated profile raises."""
    built = CorpusProfile.model_validate(minimal_corpus_profile)
    with pytest.raises((TypeError, ValidationError)):
        setattr(built, "label", "Modified")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def test_corpus_profile_all_script_types(minimal_corpus_profile):
    """Every ``ScriptType`` value is accepted and round-trips to its member."""
    for member in ScriptType:
        payload = minimal_corpus_profile | {"script_type": member.value}
        built = CorpusProfile.model_validate(payload)
        assert built.script_type == member
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def test_corpus_profile_all_layer_types(minimal_corpus_profile):
    """A profile can activate every ``LayerType`` at once."""
    every_value = [member.value for member in LayerType]
    payload = minimal_corpus_profile | {"active_layers": every_value}
    built = CorpusProfile.model_validate(payload)
    assert len(built.active_layers) == len(LayerType)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def test_uncertainty_config_defaults():
    """Without arguments the thresholds are 0.4 and 0.25."""
    defaults = UncertaintyConfig()
    assert defaults.flag_below == 0.4
    assert defaults.min_acceptable == 0.25
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def test_uncertainty_config_bounds():
    """A threshold above 1 or below 0 is rejected."""
    for bad_kwargs in ({"flag_below": 1.5}, {"min_acceptable": -0.1}):
        with pytest.raises(ValidationError):
            UncertaintyConfig(**bad_kwargs)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def test_export_config_defaults():
    """By default METS and ALTO are enabled and TEI is disabled."""
    defaults = ExportConfig()
    assert defaults.mets is True
    assert defaults.alto is True
    assert defaults.tei is False
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def test_corpus_profile_missing_required_field():
    """A payload that only carries ``profile_id`` fails validation."""
    incomplete = {"profile_id": "x"}
    with pytest.raises(ValidationError):
        CorpusProfile.model_validate(incomplete)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# ---------------------------------------------------------------------------
|
| 138 |
+
# Tests — Region / bbox
|
| 139 |
+
# ---------------------------------------------------------------------------
|
| 140 |
+
|
| 141 |
+
def test_region_valid_bbox(valid_region):
    """A well-formed region keeps its bbox and confidence unchanged."""
    built = Region.model_validate(valid_region)
    assert built.bbox == [10, 20, 300, 400]
    assert built.confidence == 0.95
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def test_region_bbox_negative_x():
    """A negative first bbox entry is rejected."""
    payload = {
        "id": "r1",
        "type": "text_block",
        "bbox": [-1, 20, 300, 400],
        "confidence": 0.5,
    }
    with pytest.raises(ValidationError):
        Region.model_validate(payload)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def test_region_bbox_zero_width():
    """A bbox whose third entry (width) is zero is rejected."""
    payload = {
        "id": "r1",
        "type": "text_block",
        "bbox": [0, 0, 0, 400],
        "confidence": 0.5,
    }
    with pytest.raises(ValidationError):
        Region.model_validate(payload)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def test_region_bbox_zero_height():
    """A bbox whose fourth entry (height) is zero is rejected."""
    payload = {
        "id": "r1",
        "type": "text_block",
        "bbox": [0, 0, 300, 0],
        "confidence": 0.5,
    }
    with pytest.raises(ValidationError):
        Region.model_validate(payload)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def test_region_bbox_wrong_length():
    """A bbox with three entries instead of four is rejected."""
    payload = {
        "id": "r1",
        "type": "text_block",
        "bbox": [0, 0, 300],
        "confidence": 0.5,
    }
    with pytest.raises(ValidationError):
        Region.model_validate(payload)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def test_region_all_types():
    """Every ``RegionType`` value is accepted and round-trips to its member."""
    for member in RegionType:
        payload = {
            "id": "r1",
            "type": member.value,
            "bbox": [0, 0, 100, 100],
            "confidence": 0.8,
        }
        built = Region.model_validate(payload)
        assert built.type == member
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def test_region_optional_polygon():
    """A region accepts an optional four-point polygon."""
    corners = [[0, 0], [200, 0], [200, 200], [0, 200]]
    built = Region.model_validate({
        "id": "r1",
        "type": "miniature",
        "bbox": [0, 0, 200, 200],
        "confidence": 0.9,
        "polygon": corners,
    })
    assert built.polygon is not None
    assert len(built.polygon) == 4
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
# ---------------------------------------------------------------------------
|
| 199 |
+
# Tests — PageMaster
|
| 200 |
+
# ---------------------------------------------------------------------------
|
| 201 |
+
|
| 202 |
+
def test_page_master_valid(minimal_page_master):
    """A minimal payload validates with default schema version and editorial status."""
    built = PageMaster.model_validate(minimal_page_master)
    assert built.schema_version == "1.0"
    assert built.page_id == "test-corpus-0001r"
    assert built.editorial.status == EditorialStatus.MACHINE_DRAFT
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def test_page_master_schema_version_default(minimal_page_master):
    """``schema_version`` falls back to "1.0" when the payload omits it."""
    built = PageMaster.model_validate(minimal_page_master)
    assert built.schema_version == "1.0"
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def test_page_master_with_ocr(minimal_page_master):
    """An ``ocr`` section in the payload is parsed into the ``ocr`` field."""
    ocr_section = {
        "diplomatic_text": "In nomine Domini",
        "language": "la",
        "confidence": 0.87,
    }
    built = PageMaster.model_validate(minimal_page_master | {"ocr": ocr_section})
    assert built.ocr is not None
    assert built.ocr.diplomatic_text == "In nomine Domini"
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def test_page_master_with_translation(minimal_page_master):
    """A ``translation`` section in the payload is parsed into the ``translation`` field."""
    translation_section = {
        "fr": "Au nom du Seigneur",
        "en": "In the name of the Lord",
    }
    built = PageMaster.model_validate(
        minimal_page_master | {"translation": translation_section}
    )
    assert built.translation is not None
    assert built.translation.fr == "Au nom du Seigneur"
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def test_page_master_with_commentary(minimal_page_master):
    """A ``commentary`` section is parsed together with its claims."""
    commentary_section = {
        "public": "Description publique.",
        "scholarly": "Analyse savante.",
        "claims": [
            {"claim": "Ce folio date du XIe siècle.", "certainty": "high"},
        ],
    }
    built = PageMaster.model_validate(
        minimal_page_master | {"commentary": commentary_section}
    )
    assert built.commentary is not None
    parsed_claims = built.commentary.claims
    assert len(parsed_claims) == 1
    assert parsed_claims[0].certainty == "high"
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def test_page_master_editorial_info_defaults(minimal_page_master):
    """The default editorial block is unvalidated, at version 1, with no validator."""
    editorial = PageMaster.model_validate(minimal_page_master).editorial
    assert editorial.validated is False
    assert editorial.version == 1
    assert editorial.validated_by is None
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def test_commentary_claim_certainty_values():
    """Each of the four certainty levels is accepted as given."""
    for level in ("high", "medium", "low", "speculative"):
        built = CommentaryClaim(claim="Test.", certainty=level)
        assert built.certainty == level
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def test_commentary_claim_invalid_certainty():
    """A certainty outside the four allowed levels is rejected."""
    bad_kwargs = {"claim": "Test.", "certainty": "unknown"}
    with pytest.raises(ValidationError):
        CommentaryClaim(**bad_kwargs)
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# ---------------------------------------------------------------------------
|
| 268 |
+
# Tests — AnnotationLayer
|
| 269 |
+
# ---------------------------------------------------------------------------
|
| 270 |
+
|
| 271 |
+
def test_annotation_layer_valid():
    """A layer built from required fields only is PENDING at version 1."""
    stamp = datetime(2026, 3, 16, 12, 0, 0, tzinfo=timezone.utc)
    built = AnnotationLayer(
        id="layer-001",
        page_id="test-corpus-0001r",
        layer_type=LayerType.OCR_DIPLOMATIC,
        created_at=stamp,
    )
    assert built.status == LayerStatus.PENDING
    assert built.version == 1
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def test_annotation_layer_all_statuses():
    """Every ``LayerStatus`` member is accepted and stored as given."""
    stamp = datetime(2026, 3, 16, tzinfo=timezone.utc)
    for member in LayerStatus:
        built = AnnotationLayer(
            id="layer-001",
            page_id="test-corpus-0001r",
            layer_type=LayerType.TRANSLATION_FR,
            status=member,
            created_at=stamp,
        )
        assert built.status == member
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
def test_annotation_layer_all_layer_types():
    """Every LayerType member can be passed as layer_type and is stored as given.

    The layer id is derived from the member's value so each constructed layer
    carries a distinct identifier.
    """
    # The timestamp is immutable, so one instance can be shared by all iterations.
    created = datetime(2026, 3, 16, tzinfo=timezone.utc)
    for layer_type in LayerType:
        built = AnnotationLayer(
            id=f"layer-{layer_type.value}",
            page_id="test-corpus-0001r",
            layer_type=layer_type,
            created_at=created,
        )
        assert built.layer_type == layer_type
|
infra/.gitkeep
ADDED
|
File without changes
|
profiles/early-modern-print.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"profile_id": "early-modern-print",
|
| 3 |
+
"label": "Imprimé de la période moderne",
|
| 4 |
+
"language_hints": ["la", "fr", "it", "de"],
|
| 5 |
+
"script_type": "print",
|
| 6 |
+
"active_layers": [
|
| 7 |
+
"ocr_diplomatic",
|
| 8 |
+
"ocr_normalized",
|
| 9 |
+
"translation_fr",
|
| 10 |
+
"summary",
|
| 11 |
+
"public_commentary",
|
| 12 |
+
"uncertainty"
|
| 13 |
+
],
|
| 14 |
+
"prompt_templates": {
|
| 15 |
+
"primary": "prompts/early-modern-print/primary_v1.txt"
|
| 16 |
+
},
|
| 17 |
+
"uncertainty_config": {
|
| 18 |
+
"flag_below": 0.3,
|
| 19 |
+
"min_acceptable": 0.2
|
| 20 |
+
},
|
| 21 |
+
"export_config": {
|
| 22 |
+
"mets": true,
|
| 23 |
+
"alto": true,
|
| 24 |
+
"tei": false
|
| 25 |
+
}
|
| 26 |
+
}
|
profiles/medieval-illuminated.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"profile_id": "medieval-illuminated",
|
| 3 |
+
"label": "Manuscrit médiéval enluminé",
|
| 4 |
+
"language_hints": ["la", "fr-moyen"],
|
| 5 |
+
"script_type": "caroline",
|
| 6 |
+
"active_layers": [
|
| 7 |
+
"image",
|
| 8 |
+
"ocr_diplomatic",
|
| 9 |
+
"ocr_normalized",
|
| 10 |
+
"translation_fr",
|
| 11 |
+
"translation_en",
|
| 12 |
+
"summary",
|
| 13 |
+
"scholarly_commentary",
|
| 14 |
+
"public_commentary",
|
| 15 |
+
"iconography_detection",
|
| 16 |
+
"material_notes",
|
| 17 |
+
"uncertainty"
|
| 18 |
+
],
|
| 19 |
+
"prompt_templates": {
|
| 20 |
+
"primary": "prompts/medieval-illuminated/primary_v1.txt",
|
| 21 |
+
"translation": "prompts/medieval-illuminated/translation_v1.txt",
|
| 22 |
+
"commentary": "prompts/medieval-illuminated/commentary_v1.txt",
|
| 23 |
+
"iconography": "prompts/medieval-illuminated/iconography_v1.txt"
|
| 24 |
+
},
|
| 25 |
+
"uncertainty_config": {
|
| 26 |
+
"flag_below": 0.4,
|
| 27 |
+
"min_acceptable": 0.25
|
| 28 |
+
},
|
| 29 |
+
"export_config": {
|
| 30 |
+
"mets": true,
|
| 31 |
+
"alto": true,
|
| 32 |
+
"tei": false
|
| 33 |
+
}
|
| 34 |
+
}
|
profiles/medieval-textual.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"profile_id": "medieval-textual",
|
| 3 |
+
"label": "Manuscrit médiéval textuel",
|
| 4 |
+
"language_hints": ["la", "fr-moyen", "oc"],
|
| 5 |
+
"script_type": "gothic",
|
| 6 |
+
"active_layers": [
|
| 7 |
+
"ocr_diplomatic",
|
| 8 |
+
"ocr_normalized",
|
| 9 |
+
"translation_fr",
|
| 10 |
+
"translation_en",
|
| 11 |
+
"summary",
|
| 12 |
+
"scholarly_commentary",
|
| 13 |
+
"public_commentary",
|
| 14 |
+
"material_notes",
|
| 15 |
+
"uncertainty"
|
| 16 |
+
],
|
| 17 |
+
"prompt_templates": {
|
| 18 |
+
"primary": "prompts/medieval-textual/primary_v1.txt",
|
| 19 |
+
"translation": "prompts/medieval-textual/translation_v1.txt",
|
| 20 |
+
"commentary": "prompts/medieval-textual/commentary_v1.txt"
|
| 21 |
+
},
|
| 22 |
+
"uncertainty_config": {
|
| 23 |
+
"flag_below": 0.45,
|
| 24 |
+
"min_acceptable": 0.3
|
| 25 |
+
},
|
| 26 |
+
"export_config": {
|
| 27 |
+
"mets": true,
|
| 28 |
+
"alto": true,
|
| 29 |
+
"tei": false
|
| 30 |
+
}
|
| 31 |
+
}
|
profiles/modern-handwritten.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"profile_id": "modern-handwritten",
|
| 3 |
+
"label": "Document manuscrit moderne",
|
| 4 |
+
"language_hints": ["fr", "en", "de"],
|
| 5 |
+
"script_type": "cursive",
|
| 6 |
+
"active_layers": [
|
| 7 |
+
"ocr_diplomatic",
|
| 8 |
+
"ocr_normalized",
|
| 9 |
+
"summary",
|
| 10 |
+
"public_commentary",
|
| 11 |
+
"uncertainty"
|
| 12 |
+
],
|
| 13 |
+
"prompt_templates": {
|
| 14 |
+
"primary": "prompts/modern-handwritten/primary_v1.txt"
|
| 15 |
+
},
|
| 16 |
+
"uncertainty_config": {
|
| 17 |
+
"flag_below": 0.5,
|
| 18 |
+
"min_acceptable": 0.35
|
| 19 |
+
},
|
| 20 |
+
"export_config": {
|
| 21 |
+
"mets": true,
|
| 22 |
+
"alto": true,
|
| 23 |
+
"tei": false
|
| 24 |
+
}
|
| 25 |
+
}
|
prompts/early-modern-print/primary_v1.txt
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en histoire du livre imprimé et en typographie des imprimés anciens.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) attendue(s) : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
Analyse l'image de page fournie et retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 8 |
+
|
| 9 |
+
Tâches :
|
| 10 |
+
1. Détecte et délimite toutes les régions de la page (blocs de texte, titres, notes marginales, ornements typographiques, colophons).
|
| 11 |
+
2. Transcris le texte visible (respecte l'orthographe originale, y compris les graphies archaïques).
|
| 12 |
+
3. Évalue ta confiance pour chaque région et pour la transcription globale.
|
| 13 |
+
4. Signale les passages illisibles ou incertains dans uncertain_segments.
|
| 14 |
+
|
| 15 |
+
Format de sortie JSON attendu :
|
| 16 |
+
{
|
| 17 |
+
"layout": {
|
| 18 |
+
"regions": [
|
| 19 |
+
{
|
| 20 |
+
"id": "r1",
|
| 21 |
+
"type": "text_block|margin|rubric|decorated_initial|other",
|
| 22 |
+
"bbox": [x, y, width, height],
|
| 23 |
+
"confidence": 0.0
|
| 24 |
+
}
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
"ocr": {
|
| 28 |
+
"diplomatic_text": "",
|
| 29 |
+
"blocks": [],
|
| 30 |
+
"lines": [],
|
| 31 |
+
"language": "la",
|
| 32 |
+
"confidence": 0.0,
|
| 33 |
+
"uncertain_segments": []
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
Règles absolues :
|
| 38 |
+
- bbox au format [x, y, largeur, hauteur] en pixels entiers, JAMAIS [x1, y1, x2, y2].
|
| 39 |
+
- x, y >= 0 ; largeur > 0 ; hauteur > 0.
|
| 40 |
+
- confidence entre 0.0 et 1.0.
|
| 41 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|
prompts/medieval-illuminated/commentary_v1.txt
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un chercheur spécialisé en études médiévales, codicologie et histoire de l'art du livre.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
À partir du texte transcrit et des régions identifiées, produis deux niveaux de commentaire.
|
| 8 |
+
|
| 9 |
+
Retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 10 |
+
|
| 11 |
+
Format de sortie JSON attendu :
|
| 12 |
+
{
|
| 13 |
+
"commentary": {
|
| 14 |
+
"public": "",
|
| 15 |
+
"scholarly": "",
|
| 16 |
+
"claims": [
|
| 17 |
+
{
|
| 18 |
+
"claim": "",
|
| 19 |
+
"evidence_region_ids": [],
|
| 20 |
+
"certainty": "high|medium|low|speculative"
|
| 21 |
+
}
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
"summary": {
|
| 25 |
+
"short": "",
|
| 26 |
+
"detailed": ""
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
Règles :
|
| 31 |
+
- public : accessible à un large public, sans jargon technique (2-4 phrases).
|
| 32 |
+
- scholarly : rigoureux, avec références aux sources primaires et secondaires pertinentes.
|
| 33 |
+
- claims : liste les affirmations interprétatives avec leur niveau de certitude.
|
| 34 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|
prompts/medieval-illuminated/iconography_v1.txt
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en iconographie médiévale et en histoire de l'art du manuscrit enluminé.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
Analyse les régions visuelles (miniatures, initiales décorées) de l'image fournie.
|
| 8 |
+
Identifie les sujets iconographiques, les personnages, les scènes, les symboles et les programmes décoratifs.
|
| 9 |
+
|
| 10 |
+
Retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 11 |
+
|
| 12 |
+
Format de sortie JSON attendu :
|
| 13 |
+
{
|
| 14 |
+
"iconography": {
|
| 15 |
+
"scenes": [
|
| 16 |
+
{
|
| 17 |
+
"region_id": "",
|
| 18 |
+
"subject": "",
|
| 19 |
+
"iconographic_program": "",
|
| 20 |
+
"identified_figures": [],
|
| 21 |
+
"symbolic_elements": [],
|
| 22 |
+
"confidence": 0.0,
|
| 23 |
+
"notes": ""
|
| 24 |
+
}
|
| 25 |
+
],
|
| 26 |
+
"decorative_program": "",
|
| 27 |
+
"style_notes": ""
|
| 28 |
+
}
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
Règles :
|
| 32 |
+
- confidence entre 0.0 et 1.0 ; utilise 0.3 ou moins pour les identifications spéculatives.
|
| 33 |
+
- identified_figures : noms des personnages si identifiables (saints, figures bibliques, etc.).
|
| 34 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|
prompts/medieval-illuminated/primary_v1.txt
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en paléographie et codicologie, spécialisé dans les manuscrits médiévaux enluminés.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) attendue(s) : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
Analyse l'image de folio fournie et retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 8 |
+
|
| 9 |
+
Tâches :
|
| 10 |
+
1. Détecte et délimite toutes les régions de la page (blocs de texte, miniatures, initiales décorées, marges, rubriques).
|
| 11 |
+
2. Transcris diplomatiquement chaque bloc de texte visible (respecte l'orthographe originale, les abréviations, la ponctuation).
|
| 12 |
+
3. Évalue ta confiance pour chaque région et pour la transcription globale.
|
| 13 |
+
4. Signale les passages illisibles ou incertains dans uncertain_segments.
|
| 14 |
+
|
| 15 |
+
Format de sortie JSON attendu :
|
| 16 |
+
{
|
| 17 |
+
"layout": {
|
| 18 |
+
"regions": [
|
| 19 |
+
{
|
| 20 |
+
"id": "r1",
|
| 21 |
+
"type": "text_block|miniature|decorated_initial|margin|rubric|other",
|
| 22 |
+
"bbox": [x, y, width, height],
|
| 23 |
+
"confidence": 0.0
|
| 24 |
+
}
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
"ocr": {
|
| 28 |
+
"diplomatic_text": "",
|
| 29 |
+
"blocks": [],
|
| 30 |
+
"lines": [],
|
| 31 |
+
"language": "la",
|
| 32 |
+
"confidence": 0.0,
|
| 33 |
+
"uncertain_segments": []
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
Règles absolues :
|
| 38 |
+
- bbox au format [x, y, largeur, hauteur] en pixels entiers, JAMAIS [x1, y1, x2, y2].
|
| 39 |
+
- x, y >= 0 ; largeur > 0 ; hauteur > 0.
|
| 40 |
+
- confidence entre 0.0 et 1.0.
|
| 41 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|
prompts/medieval-illuminated/translation_v1.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en traduction de textes médiévaux latins et en langues romanes médiévales.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) source : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
À partir du texte diplomatique fourni, produis une traduction en français moderne et en anglais.
|
| 8 |
+
|
| 9 |
+
Retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 10 |
+
|
| 11 |
+
Format de sortie JSON attendu :
|
| 12 |
+
{
|
| 13 |
+
"translation": {
|
| 14 |
+
"fr": "",
|
| 15 |
+
"en": ""
|
| 16 |
+
}
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
Règles :
|
| 20 |
+
- Traduis fidèlement en préservant le sens théologique et littéraire du texte.
|
| 21 |
+
- Signale entre crochets les passages conjecturaux : [traduction incertaine].
|
| 22 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|
prompts/medieval-textual/commentary_v1.txt
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un chercheur spécialisé en philologie médiévale et en histoire des textes.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
À partir du texte transcrit, produis deux niveaux de commentaire philologique et historique.
|
| 8 |
+
|
| 9 |
+
Retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 10 |
+
|
| 11 |
+
Format de sortie JSON attendu :
|
| 12 |
+
{
|
| 13 |
+
"commentary": {
|
| 14 |
+
"public": "",
|
| 15 |
+
"scholarly": "",
|
| 16 |
+
"claims": [
|
| 17 |
+
{
|
| 18 |
+
"claim": "",
|
| 19 |
+
"evidence_region_ids": [],
|
| 20 |
+
"certainty": "high|medium|low|speculative"
|
| 21 |
+
}
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
"summary": {
|
| 25 |
+
"short": "",
|
| 26 |
+
"detailed": ""
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
Règles :
|
| 31 |
+
- public : accessible à un large public, sans jargon technique (2-4 phrases).
|
| 32 |
+
- scholarly : rigoureux, avec références aux traditions manuscrites et sources critiques.
|
| 33 |
+
- claims : liste les affirmations interprétatives avec leur niveau de certitude.
|
| 34 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|
prompts/medieval-textual/primary_v1.txt
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en paléographie médiévale spécialisé dans les manuscrits textuels.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) attendue(s) : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
Analyse l'image de folio fournie et retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 8 |
+
|
| 9 |
+
Tâches :
|
| 10 |
+
1. Détecte et délimite toutes les régions textuelles de la page (blocs de texte, marges, rubriques, initiales).
|
| 11 |
+
2. Transcris diplomatiquement chaque bloc de texte visible (respecte l'orthographe originale, les abréviations, la ponctuation).
|
| 12 |
+
3. Évalue ta confiance pour chaque région et pour la transcription globale.
|
| 13 |
+
4. Signale les passages illisibles ou incertains dans uncertain_segments.
|
| 14 |
+
|
| 15 |
+
Format de sortie JSON attendu :
|
| 16 |
+
{
|
| 17 |
+
"layout": {
|
| 18 |
+
"regions": [
|
| 19 |
+
{
|
| 20 |
+
"id": "r1",
|
| 21 |
+
"type": "text_block|margin|rubric|decorated_initial|other",
|
| 22 |
+
"bbox": [x, y, width, height],
|
| 23 |
+
"confidence": 0.0
|
| 24 |
+
}
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
"ocr": {
|
| 28 |
+
"diplomatic_text": "",
|
| 29 |
+
"blocks": [],
|
| 30 |
+
"lines": [],
|
| 31 |
+
"language": "la",
|
| 32 |
+
"confidence": 0.0,
|
| 33 |
+
"uncertain_segments": []
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
Règles absolues :
|
| 38 |
+
- bbox au format [x, y, largeur, hauteur] en pixels entiers, JAMAIS [x1, y1, x2, y2].
|
| 39 |
+
- x, y >= 0 ; largeur > 0 ; hauteur > 0.
|
| 40 |
+
- confidence entre 0.0 et 1.0.
|
| 41 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|
prompts/medieval-textual/translation_v1.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en traduction de textes médiévaux latins et en langues médiévales.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) source : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
À partir du texte diplomatique fourni, produis une traduction en français moderne et en anglais.
|
| 8 |
+
|
| 9 |
+
Retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 10 |
+
|
| 11 |
+
Format de sortie JSON attendu :
|
| 12 |
+
{
|
| 13 |
+
"translation": {
|
| 14 |
+
"fr": "",
|
| 15 |
+
"en": ""
|
| 16 |
+
}
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
Règles :
|
| 20 |
+
- Traduis fidèlement en préservant le sens du texte original.
|
| 21 |
+
- Signale entre crochets les passages conjecturaux : [traduction incertaine].
|
| 22 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|
prompts/modern-handwritten/primary_v1.txt
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en déchiffrement de documents manuscrits modernes et en archivistique.
|
| 2 |
+
|
| 3 |
+
Profil du corpus : {{profile_label}}
|
| 4 |
+
Langue(s) attendue(s) : {{language_hints}}
|
| 5 |
+
Type d'écriture : {{script_type}}
|
| 6 |
+
|
| 7 |
+
Analyse l'image de document fournie et retourne UNIQUEMENT un objet JSON strict, sans texte avant ni après.
|
| 8 |
+
|
| 9 |
+
Tâches :
|
| 10 |
+
1. Détecte et délimite toutes les zones d'écriture du document (blocs de texte, en-têtes, signatures, annotations marginales, cachets).
|
| 11 |
+
2. Transcris le texte manuscrit visible (respecte les abréviations et les ratures).
|
| 12 |
+
3. Évalue ta confiance pour chaque région et pour la transcription globale.
|
| 13 |
+
4. Signale les passages illisibles ou incertains dans uncertain_segments.
|
| 14 |
+
|
| 15 |
+
Format de sortie JSON attendu :
|
| 16 |
+
{
|
| 17 |
+
"layout": {
|
| 18 |
+
"regions": [
|
| 19 |
+
{
|
| 20 |
+
"id": "r1",
|
| 21 |
+
"type": "text_block|margin|rubric|other",
|
| 22 |
+
"bbox": [x, y, width, height],
|
| 23 |
+
"confidence": 0.0
|
| 24 |
+
}
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
"ocr": {
|
| 28 |
+
"diplomatic_text": "",
|
| 29 |
+
"blocks": [],
|
| 30 |
+
"lines": [],
|
| 31 |
+
"language": "fr",
|
| 32 |
+
"confidence": 0.0,
|
| 33 |
+
"uncertain_segments": []
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
Règles absolues :
|
| 38 |
+
- bbox au format [x, y, largeur, hauteur] en pixels entiers, JAMAIS [x1, y1, x2, y2].
|
| 39 |
+
- x, y >= 0 ; largeur > 0 ; hauteur > 0.
|
| 40 |
+
- confidence entre 0.0 et 1.0.
|
| 41 |
+
- Retourne uniquement le JSON, sans markdown, sans commentaire.
|