vn6295337 Claude Opus 4.5 committed on
Commit
2413602
·
1 Parent(s): c6a48e0

Add /api/eval/parsing endpoint for Docling evaluation

Browse files

- Fetches file from Dropbox
- Runs Docling parsing
- Returns element breakdown and metrics

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. src/api/routes.py +90 -0
src/api/routes.py CHANGED
@@ -447,6 +447,96 @@ async def dropbox_folder(request: dict):
447
  return {"error": str(e)}
448
 
449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  @router.post("/dropbox/file")
451
  async def dropbox_file(request: dict):
452
  """
 
447
  return {"error": str(e)}
448
 
449
 
450
@router.post("/eval/parsing")
async def eval_parsing(request: dict):
    """
    Evaluate Docling parsing on a file from Dropbox.

    Request:
    - path: Dropbox file path
    - access_token: Dropbox access token

    Returns parsing metrics and element breakdown: status, filename,
    format, element/char/word/page counts, a per-type element count, and
    up to ten sample elements (text truncated to 200 characters).

    Failures are never raised to the caller. Missing fields, a non-200
    Dropbox response, and any exception during download or parsing are all
    reported as ``{"error": <message>}``.
    """
    import json
    import os
    import tempfile
    from collections import Counter
    from pathlib import Path

    file_path = request.get("path")
    access_token = request.get("access_token")

    if not access_token or not file_path:
        return {"error": "Missing path or access_token"}

    try:
        # Serialize the argument with json.dumps instead of string
        # interpolation so quotes/backslashes in the path cannot break (or
        # inject into) the JSON. ensure_ascii (the default) emits \uXXXX for
        # non-ASCII characters, and DEL is escaped by hand, which keeps the
        # value safe to send as an HTTP header.
        api_arg = json.dumps({"path": file_path}).replace("\x7f", "\\u007f")

        # Download file from Dropbox
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                "https://content.dropboxapi.com/2/files/download",
                headers={
                    "Authorization": f"Bearer {access_token}",
                    "Dropbox-API-Arg": api_arg,
                },
            )

        if response.status_code != 200:
            return {"error": f"Dropbox download failed: {response.text}"}

        # Save to temp file, keeping the original extension on the temp name.
        tmp = tempfile.NamedTemporaryFile(
            delete=False, suffix=Path(file_path).suffix
        )
        tmp_path = tmp.name
        try:
            # Writing happens inside the try so the file is removed even if
            # the write itself fails (e.g. disk full).
            with tmp:
                tmp.write(response.content)

            # Run Docling parsing
            from src.ingestion.docling_loader import load_document_with_docling

            # NOTE(review): this call is synchronous inside a coroutine; if
            # parsing is slow it will hold the event loop for its duration.
            # Consider offloading to a worker thread - confirm with callers.
            doc = load_document_with_docling(tmp_path)

            # Count element types
            type_counts = Counter(el.element_type for el in doc.elements)

            # Sample elements: first ten, long text cut to 200 chars + "..."
            samples = [
                {
                    "type": el.element_type,
                    "text": (
                        el.text[:200] + "..."
                        if len(el.text) > 200
                        else el.text
                    ),
                    "level": el.level,
                }
                for el in doc.elements[:10]
            ]

            result = {
                "status": doc.status,
                "filename": doc.filename,
                "format": doc.format,
                "total_elements": len(doc.elements),
                "total_chars": doc.chars,
                "total_words": doc.words,
                "page_count": doc.page_count,
                "element_types": dict(type_counts),
                "sample_elements": samples,
                "error": doc.error,
            }
        finally:
            # Clean up temp file
            os.unlink(tmp_path)

        return result

    except Exception as e:
        return {"error": str(e)}
531
+
532
+
533
@router.get("/eval/formats")
async def eval_formats():
    """
    Report which document formats Docling parsing supports.

    The answer comes straight from ``get_supported_formats`` in the
    ingestion API, which is imported at call time rather than at module
    load.
    """
    from src.ingestion.api import get_supported_formats

    supported = get_supported_formats()
    return supported
538
+
539
+
540
  @router.post("/dropbox/file")
541
  async def dropbox_file(request: dict):
542
  """