Spaces:
Sleeping
Sleeping
Add /api/eval/parsing endpoint for Docling evaluation
Browse files
- Fetches file from Dropbox
- Runs Docling parsing
- Returns element breakdown and metrics
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- src/api/routes.py +90 -0
src/api/routes.py
CHANGED
|
@@ -447,6 +447,96 @@ async def dropbox_folder(request: dict):
|
|
| 447 |
return {"error": str(e)}
|
| 448 |
|
| 449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
@router.post("/dropbox/file")
|
| 451 |
async def dropbox_file(request: dict):
|
| 452 |
"""
|
|
|
|
| 447 |
return {"error": str(e)}
|
| 448 |
|
| 449 |
|
| 450 |
+
@router.post("/eval/parsing")
async def eval_parsing(request: dict):
    """
    Evaluate Docling parsing on a file from Dropbox.

    Request:
    - path: Dropbox file path
    - access_token: Dropbox access token

    The file is downloaded through the Dropbox content API, written to a
    temporary file (keeping the original suffix), passed to
    ``load_document_with_docling``, and the temporary file is removed
    afterwards whether or not parsing succeeded.

    Returns parsing metrics and element breakdown as a dict with the keys
    ``status``, ``filename``, ``format``, ``total_elements``,
    ``total_chars``, ``total_words``, ``page_count``, ``element_types``
    (count per element type), ``sample_elements`` (first 10 elements, text
    truncated to 200 characters) and ``error``.

    Failures are never raised to the caller: missing parameters, a non-200
    Dropbox response, or any exception all yield ``{"error": <message>}``.
    """
    import json
    import os
    import tempfile
    from collections import Counter
    from pathlib import Path

    file_path = request.get("path")
    access_token = request.get("access_token")

    if not access_token or not file_path:
        return {"error": "Missing path or access_token"}

    try:
        # Serialize the argument with json.dumps instead of interpolating the
        # path into a JSON template: quotes or backslashes in the path would
        # otherwise produce invalid JSON or let the caller inject extra keys.
        # The default ensure_ascii=True escapes non-ASCII characters as
        # \uXXXX; 0x7F is escaped by hand because json.dumps leaves it raw and
        # the header value has to stay HTTP-header safe.
        api_arg = json.dumps({"path": file_path}).replace("\x7f", "\\u007f")

        # Download file from Dropbox
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                "https://content.dropboxapi.com/2/files/download",
                headers={
                    "Authorization": f"Bearer {access_token}",
                    "Dropbox-API-Arg": api_arg,
                },
            )

        if response.status_code != 200:
            return {"error": f"Dropbox download failed: {response.text}"}

        # Keep the original extension on the temp file.
        suffix = Path(file_path).suffix

        tmp_path = None
        try:
            # Save to temp file. Creation happens inside the try block so the
            # file is removed even when writing the payload fails.
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp_path = tmp.name
                tmp.write(response.content)

            # Run Docling parsing
            from src.ingestion.docling_loader import load_document_with_docling

            doc = load_document_with_docling(tmp_path)

            # Count element types
            type_counts = Counter(el.element_type for el in doc.elements)

            # Sample elements: first 10, with long text truncated.
            samples = [
                {
                    "type": el.element_type,
                    "text": (
                        el.text[:200] + "..." if len(el.text) > 200 else el.text
                    ),
                    "level": el.level,
                }
                for el in doc.elements[:10]
            ]

            result = {
                "status": doc.status,
                "filename": doc.filename,
                "format": doc.format,
                "total_elements": len(doc.elements),
                "total_chars": doc.chars,
                "total_words": doc.words,
                "page_count": doc.page_count,
                "element_types": dict(type_counts),
                "sample_elements": samples,
                "error": doc.error,
            }
        finally:
            # Clean up temp file (only if it was actually created).
            if tmp_path is not None:
                os.unlink(tmp_path)

        return result

    except Exception as e:
        # Route-level boundary: report the failure in the response body, in
        # the same {"error": ...} shape used for the other failure paths.
        return {"error": str(e)}
|
| 531 |
+
|
| 532 |
+
|
| 533 |
+
@router.get("/eval/formats")
async def eval_formats():
    """Return the supported document formats for Docling parsing.

    Delegates to ``get_supported_formats`` from ``src.ingestion.api`` and
    passes its result through unchanged.
    """
    # Imported lazily, inside the handler, as in the other eval route.
    from src.ingestion.api import get_supported_formats as _list_formats

    supported = _list_formats()
    return supported
|
| 538 |
+
|
| 539 |
+
|
| 540 |
@router.post("/dropbox/file")
|
| 541 |
async def dropbox_file(request: dict):
|
| 542 |
"""
|