| """ |
| Unified Document Extraction API - Docling + DocStrange |
| Deploy this as a SINGLE app on Hugging Face Spaces |
| Provides both Docling AND DocStrange extraction in one service |
| """ |
| import os |
| import sys |
| import tempfile |
| from pathlib import Path |
|
|
| from fastapi import FastAPI, File, UploadFile, HTTPException, Query |
| from fastapi.responses import JSONResponse |
| from fastapi.middleware.cors import CORSMiddleware |
| import uvicorn |
|
|
| |
| |
| |
|
|
| |
# Optional Docling engine. The flag records whether the package imported;
# the converter slot starts empty and is filled by get_docling_converter().
docling_converter = None
try:
    from docling.document_converter import DocumentConverter
except ImportError:
    HAS_DOCLING = False
else:
    HAS_DOCLING = True
|
|
| |
# Optional DocStrange engine. The ``docstrange`` directory next to this file
# is pushed to the front of sys.path before the import is attempted.
docstrange_extractor = None
try:
    sys.path.insert(
        0,
        os.path.join(os.path.dirname(__file__), 'docstrange'),
    )
    from docstrange import DocumentExtractor
except ImportError:
    HAS_DOCTSTRANGE = False
else:
    HAS_DOCTSTRANGE = True
|
|
# Application object that every route below is registered on; the keyword
# arguments are the service metadata handed to FastAPI.
app = FastAPI(
    version="2.0.0",
    title="Unified Document Extraction API",
    description="Extract documents using Docling OR DocStrange AI engines",
)
|
|
| |
# CORS is fully open: any origin, method and header is accepted, and
# credentialed requests are enabled.
# NOTE(review): wildcard origins together with allow_credentials=True is the
# most permissive configuration possible — confirm it is intended.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
    allow_credentials=True,
)
|
|
|
|
| |
| |
| |
|
|
def get_docling_converter():
    """Return the shared Docling converter, creating it on first use.

    Returns:
        The module-level ``DocumentConverter`` instance, or ``None`` when
        Docling could not be imported and no converter exists.
    """
    global docling_converter
    # Already built: hand back the cached instance.
    if docling_converter is not None:
        return docling_converter
    # Nothing cached and the engine is missing: there is nothing to build.
    if not HAS_DOCLING:
        return None
    docling_converter = DocumentConverter()
    return docling_converter
|
|
|
|
def get_docstrange_extractor():
    """Return the shared DocStrange extractor, creating it on first use.

    On first use the function asks torch whether CUDA is available and passes
    the answer to ``DocumentExtractor`` as ``gpu``.

    Returns:
        The module-level ``DocumentExtractor`` instance, or ``None`` when
        DocStrange could not be imported.
    """
    global docstrange_extractor
    if docstrange_extractor is None and HAS_DOCTSTRANGE:
        try:
            import torch
            gpu = torch.cuda.is_available()
        except Exception:
            # torch is optional here: a missing or broken install must not
            # prevent extraction, so gpu=False is used instead. Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            gpu = False
        docstrange_extractor = DocumentExtractor(gpu=gpu)
    return docstrange_extractor
|
|
|
|
| |
| |
| |
|
|
@app.get("/")
def root():
    """Basic health check: service identity plus per-engine import flags."""
    engine_flags = {"docling": HAS_DOCLING, "docstrange": HAS_DOCTSTRANGE}
    return dict(
        status="ok",
        service="Unified Document Extraction API",
        version="2.0.0",
        engines=engine_flags,
    )
|
|
|
|
@app.get("/health")
def health():
    """Detailed health check: GPU visibility, VRAM size and engine flags.

    Returns:
        A dict with ``status``, ``gpu`` (whether torch reports CUDA as
        available), ``vram`` (total memory of CUDA device 0 formatted in GB,
        or ``"N/A"``) and the per-engine import flags.
    """
    try:
        import torch
        gpu = torch.cuda.is_available()
        if gpu:
            # The device-properties attribute is ``total_memory`` (bytes).
            # The former ``total_mem`` raised AttributeError, which the
            # handler below swallowed, so GPU hosts were reported as CPU-only.
            total_bytes = torch.cuda.get_device_properties(0).total_memory
            vram = f"{total_bytes/1024**3:.1f}GB"
        else:
            vram = "N/A"
    except Exception:
        # torch missing or the CUDA query failed: report no GPU rather than
        # failing the health endpoint.
        gpu = False
        vram = "N/A"

    return {
        "status": "ok",
        "gpu": gpu,
        "vram": vram,
        "engines": {
            "docling": HAS_DOCLING,
            "docstrange": HAS_DOCTSTRANGE
        }
    }
|
|
|
|
@app.get("/engines")
def list_engines():
    """Describe each extraction engine and whether it was importable."""
    # One row per engine: (id, display name, import flag, description).
    catalog = (
        (
            "docling",
            "Docling AI",
            HAS_DOCLING,
            "Advanced document parsing with structure preservation",
        ),
        (
            "docstrange",
            "DocStrange",
            HAS_DOCTSTRANGE,
            "GPU-accelerated intelligent document processing",
        ),
    )
    return {
        "engines": [
            {
                "id": engine_id,
                "name": label,
                "available": installed,
                "description": blurb,
            }
            for engine_id, label, installed, blurb in catalog
        ]
    }
|
|
|
|
| |
| |
| |
|
|
@app.post("/convert")
async def convert_document(
    file: UploadFile = File(...),
    engine: str = Query("docling", description="Extraction engine: docling or docstrange"),
    output_format: str = Query("markdown", description="Output format: markdown, json, tables")
):
    """
    Convert an uploaded document using the specified engine.

    The upload is written to a temporary file, handed to the selected
    engine helper, and the temporary file is removed afterwards whether or
    not extraction succeeded.

    Args:
        file: Document file (PDF, DOCX, XLSX, Images, etc.)
        engine: docling or docstrange
        output_format: markdown, json, tables

    Returns: JSON with extracted data; ``file_name`` is the uploaded name.

    Raises:
        HTTPException: 400 for a missing filename, unknown engine or
            unsupported extension; 503 when the engine is not installed;
            500 when reading, extracting or serializing fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    if engine not in ('docling', 'docstrange'):
        raise HTTPException(status_code=400, detail=f"Unknown engine: {engine}. Use 'docling' or 'docstrange'")

    if engine == 'docling' and not HAS_DOCLING:
        raise HTTPException(status_code=503, detail="Docling engine not available")
    if engine == 'docstrange' and not HAS_DOCTSTRANGE:
        raise HTTPException(status_code=503, detail="DocStrange engine not available")

    supported_extensions = ('.pdf', '.docx', '.xlsx', '.pptx', '.png', '.jpg', '.jpeg',
                            '.bmp', '.tiff', '.webp', '.gif', '.txt', '.html', '.md', '.csv')
    ext = Path(file.filename).suffix.lower()
    if ext not in supported_extensions:
        raise HTTPException(status_code=400, detail=f"Unsupported format: {ext}")

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            # Record the path before writing so a failed write still gets
            # cleaned up (the file is created with delete=False).
            tmp_path = tmp.name
            tmp.write(await file.read())

        if engine == 'docling':
            result = _extract_with_docling(tmp_path, output_format)
        else:
            result = _extract_with_docstrange(tmp_path, output_format)

        # The helpers only know the temp path; report the uploaded name,
        # as /convert/markdown and /convert/tables do.
        result["file_name"] = file.filename

        # Built inside the try so a non-serializable payload is reported
        # as an extraction failure as well.
        response = JSONResponse(content=result)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup; never mask the real outcome.
                pass

    return response
|
|
|
|
@app.post("/convert/markdown")
async def convert_to_markdown(
    file: UploadFile = File(...),
    engine: str = Query("docling", description="docling or docstrange")
):
    """Extract document to markdown only (lightweight endpoint).

    Request validation happens before the upload is written to disk, so a
    bad request is answered with its own status code instead of being
    re-wrapped as a 500 by the extraction error handler.

    Args:
        file: Uploaded document.
        engine: docling or docstrange.

    Returns:
        Dict with ``success``, ``markdown``, ``engine`` and ``file_name``.

    Raises:
        HTTPException: 400 for a missing filename or unknown engine; 503
            when the requested engine is not installed; 500 when
            extraction fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    if engine not in ('docling', 'docstrange'):
        raise HTTPException(status_code=400, detail=f"Unknown engine: {engine}. Use 'docling' or 'docstrange'")

    engine_ready = HAS_DOCLING if engine == 'docling' else HAS_DOCTSTRANGE
    if not engine_ready:
        raise HTTPException(status_code=503, detail=f"{engine} engine not available")

    tmp_path = None
    try:
        suffix = Path(file.filename).suffix.lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # Record the path before writing so a failed write is cleaned up.
            tmp_path = tmp.name
            tmp.write(await file.read())

        if engine == 'docling':
            converter = get_docling_converter()
            result = converter.convert(tmp_path)
            markdown = result.document.export_to_markdown()
        else:
            extractor = get_docstrange_extractor()
            result = extractor.extract_document(tmp_path, output_format='markdown')
            markdown = result.get('data', '')
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup; never mask the real outcome.
                pass

    return {
        "success": True,
        "markdown": markdown,
        "engine": engine,
        "file_name": file.filename
    }
|
|
|
|
@app.post("/convert/tables")
async def convert_tables(
    file: UploadFile = File(...),
    engine: str = Query("docling", description="docling or docstrange")
):
    """Extract tables only from document.

    Only the Docling engine has a table-extraction path in this endpoint.
    Requests that cannot be served are rejected explicitly instead of
    returning ``success: True`` with an empty table list.

    Args:
        file: Uploaded document.
        engine: docling or docstrange.

    Returns:
        Dict with ``success``, ``tables`` (one entry per exported table with
        ``table_index``, ``headers``, ``rows``, ``row_count``),
        ``tables_count``, ``engine`` and ``file_name``.

    Raises:
        HTTPException: 400 for a missing filename or unknown engine; 501
            for the docstrange engine; 503 when Docling is not installed;
            500 when extraction fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    if engine not in ('docling', 'docstrange'):
        raise HTTPException(status_code=400, detail=f"Unknown engine: {engine}. Use 'docling' or 'docstrange'")

    if engine == 'docstrange':
        # No docstrange branch exists here; say so rather than reporting
        # zero tables for a document that was never processed.
        raise HTTPException(
            status_code=501,
            detail="Table extraction is not implemented for the docstrange engine"
        )

    if not HAS_DOCLING:
        raise HTTPException(status_code=503, detail=f"{engine} engine not available")

    tmp_path = None
    tables_data = []
    try:
        suffix = Path(file.filename).suffix.lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            # Record the path before writing so a failed write is cleaned up.
            tmp_path = tmp.name
            tmp.write(await file.read())

        converter = get_docling_converter()
        result = converter.convert(tmp_path)
        for table_idx, table in enumerate(result.document.tables):
            try:
                df = table.export_to_dataframe()
                tables_data.append({
                    "table_index": table_idx,
                    "headers": list(df.columns),
                    "rows": df.to_dict('records'),
                    "row_count": len(df)
                })
            except Exception:
                # Best effort: a table that cannot be exported is skipped so
                # the remaining tables are still returned.
                continue
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup; never mask the real outcome.
                pass

    return {
        "success": True,
        "tables": tables_data,
        "tables_count": len(tables_data),
        "engine": engine,
        "file_name": file.filename
    }
|
|
|
|
| |
| |
| |
|
|
def _extract_with_docling(file_path, output_format):
    """Extract a document with Docling and build the API response dict.

    Args:
        file_path: Path of the file to convert.
        output_format: Requested format; ``'json'`` and ``'tables'``
            additionally attach per-table data under ``document.tables``.

    Returns:
        Dict with ``success``, ``file_name`` (basename of ``file_path``),
        ``engine``, ``format``, a ``document`` section (markdown, page
        count, table count, optional tables) and ``metadata``.
    """
    converter = get_docling_converter()
    result = converter.convert(file_path)
    doc = result.document

    response = {
        "success": True,
        "file_name": os.path.basename(file_path),
        "engine": "docling",
        "format": output_format,
        "document": {
            "markdown": doc.export_to_markdown(),
            "num_pages": len(doc.pages) if hasattr(doc, 'pages') else 0,
            "tables_count": len(doc.tables)
        },
        "metadata": {
            "engine": "docling",
            "model": "docling-default"
        }
    }

    if output_format in ('json', 'tables'):
        tables_data = []
        for table_idx, table in enumerate(doc.tables):
            try:
                df = table.export_to_dataframe()
                tables_data.append({
                    "table_index": table_idx,
                    # Same shape as the /convert/tables entries.
                    "headers": list(df.columns),
                    "rows": df.to_dict('records'),
                    "row_count": len(df)
                })
            except Exception:
                # Best effort: skip a table that cannot be exported. Unlike
                # a bare except, KeyboardInterrupt/SystemExit still propagate.
                continue
        response['document']['tables'] = tables_data

    return response
|
|
|
|
def _extract_with_docstrange(file_path, output_format):
    """Run DocStrange on ``file_path`` and wrap its result for the API.

    Values missing from the extractor result fall back to defaults: the
    requested ``output_format``, an empty ``data`` dict, ``file_size`` 0 and
    ``gpu_mode`` False.
    """
    extractor = get_docstrange_extractor()
    raw = extractor.extract_document(file_path, output_format=output_format)

    payload = {
        "success": True,
        "file_name": os.path.basename(file_path),
        "engine": "docstrange",
    }
    payload["format"] = raw.get('format', output_format)
    payload["data"] = raw.get('data', {})

    extractor_meta = raw.get('metadata', {})
    payload["metadata"] = {
        "engine": "docstrange",
        "file_size": extractor_meta.get('file_size', 0),
        "gpu_mode": extractor_meta.get('gpu_mode', False),
    }
    return payload
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    # Startup banner, assembled once and emitted with a single print call.
    rule = "=" * 60
    docling_state = '✅ Available' if HAS_DOCLING else '❌ Not installed'
    docstrange_state = '✅ Available' if HAS_DOCTSTRANGE else '❌ Not installed'
    banner = [
        "\n" + rule,
        "Unified Document Extraction API",
        rule,
        f"Docling: {docling_state}",
        f"DocStrange: {docstrange_state}",
        rule,
        "URL: http://localhost:7860",
        "Docs: http://localhost:7860/docs",
        rule + "\n",
    ]
    print("\n".join(banner))

    # NOTE(review): the "app:app" target presumably requires this file to be
    # importable as module ``app`` — confirm against the deployed filename.
    uvicorn.run("app:app", port=7860, host="0.0.0.0")
|
|