Upload 63 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- docstrange/WEB_INTERFACE.md +168 -0
- docstrange/__init__.py +34 -0
- docstrange/__pycache__/__init__.cpython-310.pyc +0 -0
- docstrange/__pycache__/config.cpython-310.pyc +0 -0
- docstrange/__pycache__/exceptions.cpython-310.pyc +0 -0
- docstrange/__pycache__/extractor.cpython-310.pyc +0 -0
- docstrange/__pycache__/result.cpython-310.pyc +0 -0
- docstrange/__pycache__/web_app.cpython-310.pyc +0 -0
- docstrange/cli.py +643 -0
- docstrange/config.py +15 -0
- docstrange/exceptions.py +25 -0
- docstrange/extractor.py +431 -0
- docstrange/pipeline/__init__.py +1 -0
- docstrange/pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
- docstrange/pipeline/__pycache__/ocr_service.cpython-310.pyc +0 -0
- docstrange/pipeline/layout_detector.py +329 -0
- docstrange/pipeline/model_downloader.py +331 -0
- docstrange/pipeline/nanonets_processor.py +129 -0
- docstrange/pipeline/neural_document_processor.py +644 -0
- docstrange/pipeline/ocr_service.py +222 -0
- docstrange/processors/__init__.py +27 -0
- docstrange/processors/__pycache__/__init__.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/base.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/cloud_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/docx_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/excel_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/gpu_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/html_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/image_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/pdf_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/pptx_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/txt_processor.cpython-310.pyc +0 -0
- docstrange/processors/__pycache__/url_processor.cpython-310.pyc +0 -0
- docstrange/processors/base.py +87 -0
- docstrange/processors/cloud_processor.py +399 -0
- docstrange/processors/docx_processor.py +202 -0
- docstrange/processors/excel_processor.py +208 -0
- docstrange/processors/gpu_processor.py +501 -0
- docstrange/processors/html_processor.py +65 -0
- docstrange/processors/image_processor.py +110 -0
- docstrange/processors/pdf_processor.py +141 -0
- docstrange/processors/pptx_processor.py +160 -0
- docstrange/processors/txt_processor.py +105 -0
- docstrange/processors/url_processor.py +361 -0
- docstrange/result.py +1143 -0
- docstrange/services/__init__.py +21 -0
- docstrange/services/__pycache__/__init__.cpython-310.pyc +0 -0
- docstrange/services/__pycache__/api_key_pool.cpython-310.pyc +0 -0
- docstrange/services/__pycache__/ollama_service.cpython-310.pyc +0 -0
- docstrange/services/api_key_pool.py +241 -0
docstrange/WEB_INTERFACE.md
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DocStrange Web Interface
|
| 2 |
+
|
| 3 |
+
A beautiful, modern web interface for the DocStrange document extraction library, inspired by the data-extraction-apis project design.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Modern UI**: Clean, responsive design with drag-and-drop file upload
|
| 8 |
+
- **Multiple Formats**: Support for PDF, Word, Excel, PowerPoint, images, and more
|
| 9 |
+
- **Output Options**: Convert to Markdown, HTML, JSON, CSV, or Flat JSON
|
| 10 |
+
- **Real-time Processing**: Live extraction with progress indicators
|
| 11 |
+
- **Download Results**: Save extracted content in various formats
|
| 12 |
+
- **Mobile Friendly**: Responsive design that works on all devices
|
| 13 |
+
|
| 14 |
+
## Quick Start
|
| 15 |
+
|
| 16 |
+
### 1. Install Dependencies
|
| 17 |
+
|
| 18 |
+
```bash
|
| 19 |
+
pip install docstrange[web]
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
### 2. Start the Web Interface
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
docstrange web
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
### 3. Open Your Browser
|
| 29 |
+
|
| 30 |
+
Navigate to: http://localhost:8000
|
| 31 |
+
|
| 32 |
+
## Usage
|
| 33 |
+
|
| 34 |
+
### File Upload
|
| 35 |
+
|
| 36 |
+
1. **Drag & Drop**: Simply drag your file onto the upload area
|
| 37 |
+
2. **Click to Browse**: Click the upload area to select a file from your computer
|
| 38 |
+
3. **Supported Formats**: PDF, Word (.docx, .doc), Excel (.xlsx, .xls), PowerPoint (.pptx, .ppt), HTML, CSV, Text, Images (PNG, JPG, TIFF, BMP)
|
| 39 |
+
|
| 40 |
+
### Output Format Selection
|
| 41 |
+
|
| 42 |
+
Choose from multiple output formats:
|
| 43 |
+
|
| 44 |
+
- **Markdown**: Clean, structured markdown text
|
| 45 |
+
- **HTML**: Formatted HTML with styling
|
| 46 |
+
- **JSON**: Structured JSON data
|
| 47 |
+
- **CSV**: Table data in CSV format
|
| 48 |
+
- **Flat JSON**: Simplified JSON structure
|
| 49 |
+
|
| 50 |
+
### Results View
|
| 51 |
+
|
| 52 |
+
After processing, you can:
|
| 53 |
+
|
| 54 |
+
- **Preview**: View formatted content in the preview tab
|
| 55 |
+
- **Raw Output**: See the raw extracted text
|
| 56 |
+
- **Download**: Save results as text or JSON files
|
| 57 |
+
|
| 58 |
+
## API Endpoints
|
| 59 |
+
|
| 60 |
+
The web interface also provides REST API endpoints:
|
| 61 |
+
|
| 62 |
+
### Health Check
|
| 63 |
+
```
|
| 64 |
+
GET /api/health
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
### Get Supported Formats
|
| 68 |
+
```
|
| 69 |
+
GET /api/supported-formats
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
### Extract Document
|
| 73 |
+
```
|
| 74 |
+
POST /api/extract
|
| 75 |
+
Content-Type: multipart/form-data
|
| 76 |
+
|
| 77 |
+
Parameters:
|
| 78 |
+
- file: The document file to extract
|
| 79 |
+
- output_format: markdown, html, json, csv, flat-json
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
## Configuration
|
| 83 |
+
|
| 84 |
+
### Environment Variables
|
| 85 |
+
|
| 86 |
+
- `FLASK_ENV`: Set to `development` for debug mode
|
| 87 |
+
- `MAX_CONTENT_LENGTH`: Maximum file size (default: 100MB)
|
| 88 |
+
|
| 89 |
+
### Customization
|
| 90 |
+
|
| 91 |
+
The web interface uses a modular design system:
|
| 92 |
+
|
| 93 |
+
- **CSS Variables**: Easy theming via CSS custom properties
|
| 94 |
+
- **Responsive Design**: Mobile-first approach
|
| 95 |
+
- **Component-based**: Reusable UI components
|
| 96 |
+
|
| 97 |
+
## Development
|
| 98 |
+
|
| 99 |
+
### Running in Development Mode
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
# Install development dependencies
|
| 103 |
+
pip install -e .
|
| 104 |
+
|
| 105 |
+
# Start with debug mode
|
| 106 |
+
python -m docstrange.web_app
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
### File Structure
|
| 110 |
+
|
| 111 |
+
```
|
| 112 |
+
docstrange/
|
| 113 |
+
├── web_app.py # Flask application
|
| 114 |
+
├── templates/
|
| 115 |
+
│ └── index.html # Main HTML template
|
| 116 |
+
└── static/
|
| 117 |
+
├── styles.css # Design system CSS
|
| 118 |
+
└── script.js # Frontend JavaScript
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
### Testing
|
| 122 |
+
|
| 123 |
+
```bash
|
| 124 |
+
# Run the test script
|
| 125 |
+
python test_web_interface.py
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
## Troubleshooting
|
| 129 |
+
|
| 130 |
+
### Common Issues
|
| 131 |
+
|
| 132 |
+
1. **Port Already in Use**
|
| 133 |
+
```bash
|
| 134 |
+
# Use a different port
|
| 135 |
+
docstrange web --port 8080
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
2. **File Upload Fails**
|
| 139 |
+
- Check file size (max 100MB)
|
| 140 |
+
- Verify file format is supported
|
| 141 |
+
- Ensure proper file permissions
|
| 142 |
+
|
| 143 |
+
3. **Extraction Errors**
|
| 144 |
+
- Check console logs for detailed error messages
|
| 145 |
+
- Verify document is not corrupted
|
| 146 |
+
- Try different output formats
|
| 147 |
+
|
| 148 |
+
### Logs
|
| 149 |
+
|
| 150 |
+
The web interface logs to the console. Check for:
|
| 151 |
+
- File upload events
|
| 152 |
+
- Processing status
|
| 153 |
+
- Error messages
|
| 154 |
+
- API request details
|
| 155 |
+
|
| 156 |
+
## Contributing
|
| 157 |
+
|
| 158 |
+
To contribute to the web interface:
|
| 159 |
+
|
| 160 |
+
1. Fork the repository
|
| 161 |
+
2. Create a feature branch
|
| 162 |
+
3. Make your changes
|
| 163 |
+
4. Test thoroughly
|
| 164 |
+
5. Submit a pull request
|
| 165 |
+
|
| 166 |
+
## License
|
| 167 |
+
|
| 168 |
+
This web interface is part of the DocStrange project and is licensed under the MIT License.
|
docstrange/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Document Data Extractor - Extract structured data from any document into LLM-ready formats.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .extractor import DocumentExtractor
|
| 6 |
+
from .result import ConversionResult
|
| 7 |
+
from .processors import GPUConversionResult, CloudConversionResult
|
| 8 |
+
from .exceptions import ConversionError, UnsupportedFormatError
|
| 9 |
+
from .config import InternalConfig
|
| 10 |
+
from .services.api_key_pool import (
|
| 11 |
+
ApiKeyPool,
|
| 12 |
+
get_pool,
|
| 13 |
+
add_api_key,
|
| 14 |
+
remove_api_key,
|
| 15 |
+
list_api_keys,
|
| 16 |
+
get_available_key,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
__version__ = "1.1.5"
|
| 20 |
+
__all__ = [
|
| 21 |
+
"DocumentExtractor",
|
| 22 |
+
"ConversionResult",
|
| 23 |
+
"GPUConversionResult",
|
| 24 |
+
"CloudConversionResult",
|
| 25 |
+
"ConversionError",
|
| 26 |
+
"UnsupportedFormatError",
|
| 27 |
+
"InternalConfig",
|
| 28 |
+
"ApiKeyPool",
|
| 29 |
+
"get_pool",
|
| 30 |
+
"add_api_key",
|
| 31 |
+
"remove_api_key",
|
| 32 |
+
"list_api_keys",
|
| 33 |
+
"get_available_key",
|
| 34 |
+
]
|
docstrange/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (842 Bytes). View file
|
|
|
docstrange/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (426 Bytes). View file
|
|
|
docstrange/__pycache__/exceptions.cpython-310.pyc
ADDED
|
Binary file (964 Bytes). View file
|
|
|
docstrange/__pycache__/extractor.cpython-310.pyc
ADDED
|
Binary file (11.7 kB). View file
|
|
|
docstrange/__pycache__/result.cpython-310.pyc
ADDED
|
Binary file (28 kB). View file
|
|
|
docstrange/__pycache__/web_app.cpython-310.pyc
ADDED
|
Binary file (21.2 kB). View file
|
|
|
docstrange/cli.py
ADDED
|
@@ -0,0 +1,643 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Command-line interface for docstrange."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import sys
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import List
|
| 9 |
+
|
| 10 |
+
from .extractor import DocumentExtractor
|
| 11 |
+
from .exceptions import ConversionError, UnsupportedFormatError, FileNotFoundError
|
| 12 |
+
from . import __version__
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def print_version():
|
| 16 |
+
"""Print version information."""
|
| 17 |
+
print(f"docstrange v{__version__}")
|
| 18 |
+
print("Convert any document, text, or URL into LLM-ready data format")
|
| 19 |
+
print("with advanced intelligent document processing capabilities.")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def print_supported_formats(extractor: DocumentExtractor):
|
| 23 |
+
"""Print supported formats in a nice format."""
|
| 24 |
+
print("Supported input formats:")
|
| 25 |
+
print()
|
| 26 |
+
|
| 27 |
+
formats = extractor.get_supported_formats()
|
| 28 |
+
|
| 29 |
+
# Group formats by category
|
| 30 |
+
categories = {
|
| 31 |
+
"Documents": [f for f in formats if f in ['.pdf', '.docx', '.doc', '.txt', '.text']],
|
| 32 |
+
"Data Files": [f for f in formats if f in ['.xlsx', '.xls', '.csv']],
|
| 33 |
+
"Presentations": [f for f in formats if f in ['.ppt', '.pptx']],
|
| 34 |
+
"Web": [f for f in formats if f == 'URLs'],
|
| 35 |
+
"Images": [f for f in formats if f in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif']],
|
| 36 |
+
"Web Files": [f for f in formats if f in ['.html', '.htm']]
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
for category, format_list in categories.items():
|
| 40 |
+
if format_list:
|
| 41 |
+
print(f" {category}:")
|
| 42 |
+
for fmt in format_list:
|
| 43 |
+
print(f" - {fmt}")
|
| 44 |
+
print()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def process_single_input(extractor: DocumentExtractor, input_item: str, output_format: str, verbose: bool = False) -> dict:
|
| 48 |
+
"""Process a single input item and return result with metadata."""
|
| 49 |
+
if verbose:
|
| 50 |
+
print(f"Processing: {input_item}", file=sys.stderr)
|
| 51 |
+
|
| 52 |
+
try:
|
| 53 |
+
# Check if it's a URL
|
| 54 |
+
if input_item.startswith(('http://', 'https://')):
|
| 55 |
+
if extractor.cloud_mode:
|
| 56 |
+
raise ConversionError("URL processing is not supported in cloud mode. Use local mode for URLs.")
|
| 57 |
+
result = extractor.extract_url(input_item)
|
| 58 |
+
input_type = "URL"
|
| 59 |
+
# Check if it's a file
|
| 60 |
+
elif os.path.exists(input_item):
|
| 61 |
+
result = extractor.extract(input_item)
|
| 62 |
+
input_type = "File"
|
| 63 |
+
# Treat as text
|
| 64 |
+
else:
|
| 65 |
+
if extractor.cloud_mode:
|
| 66 |
+
raise ConversionError("Text processing is not supported in cloud mode. Use local mode for text.")
|
| 67 |
+
result = extractor.extract_text(input_item)
|
| 68 |
+
input_type = "Text"
|
| 69 |
+
|
| 70 |
+
return {
|
| 71 |
+
"success": True,
|
| 72 |
+
"result": result,
|
| 73 |
+
"input_type": input_type,
|
| 74 |
+
"input_item": input_item
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
except FileNotFoundError:
|
| 78 |
+
return {
|
| 79 |
+
"success": False,
|
| 80 |
+
"error": "File not found",
|
| 81 |
+
"input_item": input_item
|
| 82 |
+
}
|
| 83 |
+
except UnsupportedFormatError:
|
| 84 |
+
return {
|
| 85 |
+
"success": False,
|
| 86 |
+
"error": "Unsupported format",
|
| 87 |
+
"input_item": input_item
|
| 88 |
+
}
|
| 89 |
+
except ConversionError as e:
|
| 90 |
+
return {
|
| 91 |
+
"success": False,
|
| 92 |
+
"error": f"Conversion error: {e}",
|
| 93 |
+
"input_item": input_item
|
| 94 |
+
}
|
| 95 |
+
except Exception as e:
|
| 96 |
+
return {
|
| 97 |
+
"success": False,
|
| 98 |
+
"error": f"Unexpected error: {e}",
|
| 99 |
+
"input_item": input_item
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def handle_login(force_reauth: bool = False) -> int:
|
| 104 |
+
"""Handle login command."""
|
| 105 |
+
try:
|
| 106 |
+
from .services.auth_service import get_authenticated_token
|
| 107 |
+
|
| 108 |
+
print("\n🔐 DocStrange Authentication")
|
| 109 |
+
print("=" * 50)
|
| 110 |
+
|
| 111 |
+
token = get_authenticated_token(force_reauth=force_reauth)
|
| 112 |
+
if token:
|
| 113 |
+
print("✅ Authentication successful!")
|
| 114 |
+
|
| 115 |
+
# Get cached credentials to show user info
|
| 116 |
+
try:
|
| 117 |
+
from .services.auth_service import AuthService
|
| 118 |
+
auth_service = AuthService()
|
| 119 |
+
cached_creds = auth_service.get_cached_credentials()
|
| 120 |
+
|
| 121 |
+
if cached_creds and cached_creds.get('auth0_direct'):
|
| 122 |
+
print(f"👤 Logged in as: {cached_creds.get('user_email', 'Unknown')}")
|
| 123 |
+
print(f"👤 Name: {cached_creds.get('user_name', 'Unknown')}")
|
| 124 |
+
print(f"🔐 Via: Auth0 Google Login")
|
| 125 |
+
print(f"🔑 Access Token: {token[:12]}...{token[-4:]}")
|
| 126 |
+
print("💾 Credentials cached securely")
|
| 127 |
+
else:
|
| 128 |
+
print(f"🔑 Access Token: {token[:12]}...{token[-4:]}")
|
| 129 |
+
print("💾 Credentials cached securely")
|
| 130 |
+
except Exception:
|
| 131 |
+
print(f"🔑 Access Token: {token[:12]}...{token[-4:]}")
|
| 132 |
+
print("💾 Credentials cached securely")
|
| 133 |
+
|
| 134 |
+
print("\n💡 You can now use DocStrange cloud features without specifying --api-key")
|
| 135 |
+
print("🌐 Your CLI is authenticated with the same Google account used on docstrange.nanonets.com")
|
| 136 |
+
return 0
|
| 137 |
+
else:
|
| 138 |
+
print("❌ Authentication failed.")
|
| 139 |
+
return 1
|
| 140 |
+
except ImportError:
|
| 141 |
+
print("❌ Authentication service not available.", file=sys.stderr)
|
| 142 |
+
return 1
|
| 143 |
+
except Exception as e:
|
| 144 |
+
print(f"❌ Authentication error: {e}", file=sys.stderr)
|
| 145 |
+
return 1
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def handle_logout() -> int:
|
| 149 |
+
"""Handle logout command."""
|
| 150 |
+
try:
|
| 151 |
+
from .services.auth_service import clear_auth
|
| 152 |
+
|
| 153 |
+
clear_auth()
|
| 154 |
+
print("✅ Logged out successfully.")
|
| 155 |
+
print("💾 Cached authentication credentials cleared.")
|
| 156 |
+
return 0
|
| 157 |
+
except ImportError:
|
| 158 |
+
print("❌ Authentication service not available.", file=sys.stderr)
|
| 159 |
+
return 1
|
| 160 |
+
except Exception as e:
|
| 161 |
+
print(f"❌ Error clearing credentials: {e}", file=sys.stderr)
|
| 162 |
+
return 1
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def handle_api_keys_command(argv: list) -> int:
|
| 166 |
+
"""Handle API key management commands.
|
| 167 |
+
|
| 168 |
+
Usage:
|
| 169 |
+
docstrange api-keys list
|
| 170 |
+
docstrange api-keys add <key>
|
| 171 |
+
docstrange api-keys remove <key>
|
| 172 |
+
docstrange api-keys stats
|
| 173 |
+
"""
|
| 174 |
+
from .services.api_key_pool import ApiKeyPool
|
| 175 |
+
|
| 176 |
+
pool = ApiKeyPool.get_instance()
|
| 177 |
+
|
| 178 |
+
if not argv or argv[0] == "list":
|
| 179 |
+
keys = pool.get_all_keys()
|
| 180 |
+
stats = pool.get_pool_stats()
|
| 181 |
+
print(f"\n🔑 API Key Pool")
|
| 182 |
+
print("=" * 40)
|
| 183 |
+
print(f"Total keys: {stats['total_keys']}")
|
| 184 |
+
print(f"Available: {stats['available']}")
|
| 185 |
+
print(f"Rate limited: {stats['rate_limited']}")
|
| 186 |
+
print(f"Total requests: {stats['total_requests']}")
|
| 187 |
+
print()
|
| 188 |
+
if keys:
|
| 189 |
+
print("Keys:")
|
| 190 |
+
for i, masked in enumerate(keys, 1):
|
| 191 |
+
print(f" {i}. {masked}")
|
| 192 |
+
else:
|
| 193 |
+
print("No API keys configured.")
|
| 194 |
+
print("\n💡 Add keys with: docstrange api-keys add <key>")
|
| 195 |
+
print("💡 Or set NANONETS_API_KEYS env var (comma-separated)")
|
| 196 |
+
return 0
|
| 197 |
+
|
| 198 |
+
elif argv[0] == "add":
|
| 199 |
+
if len(argv) < 2:
|
| 200 |
+
print("❌ Usage: docstrange api-keys add <key>", file=sys.stderr)
|
| 201 |
+
return 1
|
| 202 |
+
key = argv[1]
|
| 203 |
+
if pool.add_key(key, source="cli"):
|
| 204 |
+
pool.save_config()
|
| 205 |
+
print(f"✅ API key added: {key[:8]}...{key[-4:]}")
|
| 206 |
+
return 0
|
| 207 |
+
else:
|
| 208 |
+
print("⚠️ API key already exists in pool")
|
| 209 |
+
return 0
|
| 210 |
+
|
| 211 |
+
elif argv[0] == "remove":
|
| 212 |
+
if len(argv) < 2:
|
| 213 |
+
print("❌ Usage: docstrange api-keys remove <key>", file=sys.stderr)
|
| 214 |
+
return 1
|
| 215 |
+
key = argv[1]
|
| 216 |
+
if pool.remove_key(key):
|
| 217 |
+
pool.save_config()
|
| 218 |
+
print(f"✅ API key removed: {key[:8]}...{key[-4:]}")
|
| 219 |
+
return 0
|
| 220 |
+
else:
|
| 221 |
+
print("❌ API key not found in pool", file=sys.stderr)
|
| 222 |
+
return 1
|
| 223 |
+
|
| 224 |
+
elif argv[0] == "stats":
|
| 225 |
+
stats = pool.get_pool_stats()
|
| 226 |
+
print(f"\n📊 API Key Pool Statistics")
|
| 227 |
+
print("=" * 40)
|
| 228 |
+
print(f"Total keys: {stats['total_keys']}")
|
| 229 |
+
print(f"Available: {stats['available']}")
|
| 230 |
+
print(f"Rate limited: {stats['rate_limited']}")
|
| 231 |
+
print(f"Total requests: {stats['total_requests']}")
|
| 232 |
+
return 0
|
| 233 |
+
|
| 234 |
+
else:
|
| 235 |
+
print(f"❌ Unknown api-keys command: {argv[0]}", file=sys.stderr)
|
| 236 |
+
print("Usage: docstrange api-keys [list|add|remove|stats]", file=sys.stderr)
|
| 237 |
+
return 1
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def main():
|
| 241 |
+
"""Main CLI function."""
|
| 242 |
+
parser = argparse.ArgumentParser(
|
| 243 |
+
description="Convert documents to LLM-ready formats with intelligent document processing",
|
| 244 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 245 |
+
epilog="""
|
| 246 |
+
Examples:
|
| 247 |
+
# Authentication (browser-based login)
|
| 248 |
+
docstrange login # One-click browser login
|
| 249 |
+
docstrange login --reauth # Force re-authentication
|
| 250 |
+
|
| 251 |
+
# API Key Management
|
| 252 |
+
docstrange api-keys list # List all configured API keys
|
| 253 |
+
docstrange api-keys add <key> # Add an API key to the rotation pool
|
| 254 |
+
docstrange api-keys remove <key> # Remove an API key
|
| 255 |
+
docstrange api-keys stats # Show pool usage statistics
|
| 256 |
+
|
| 257 |
+
# Start web interface
|
| 258 |
+
docstrange web # Start web interface at http://localhost:8000
|
| 259 |
+
|
| 260 |
+
# Convert a PDF to markdown (default cloud mode)
|
| 261 |
+
docstrange document.pdf
|
| 262 |
+
|
| 263 |
+
# Convert with free API key with increased limits
|
| 264 |
+
docstrange document.pdf --api-key YOUR_API_KEY
|
| 265 |
+
|
| 266 |
+
# Convert with multiple API keys for automatic rotation
|
| 267 |
+
docstrange document.pdf --api-keys KEY1 KEY2 KEY3
|
| 268 |
+
|
| 269 |
+
# Force local GPU processing
|
| 270 |
+
docstrange document.pdf --gpu-mode
|
| 271 |
+
|
| 272 |
+
# Convert to different output formats
|
| 273 |
+
docstrange document.pdf --output html
|
| 274 |
+
docstrange document.pdf --output json
|
| 275 |
+
docstrange document.pdf --output csv # Extract tables as CSV
|
| 276 |
+
|
| 277 |
+
# Use specific model for cloud processing
|
| 278 |
+
docstrange document.pdf --model gemini
|
| 279 |
+
docstrange document.pdf --model openapi --output json
|
| 280 |
+
docstrange document.pdf --model nanonets --output csv
|
| 281 |
+
|
| 282 |
+
# Convert a URL (works in all modes)
|
| 283 |
+
docstrange https://example.com --output html
|
| 284 |
+
|
| 285 |
+
# Convert plain text (works in all modes)
|
| 286 |
+
docstrange "Hello world" --output json
|
| 287 |
+
|
| 288 |
+
# Convert multiple files
|
| 289 |
+
docstrange file1.pdf file2.docx file3.xlsx --output markdown
|
| 290 |
+
|
| 291 |
+
# Extract specific fields using cloud processing
|
| 292 |
+
docstrange invoice.pdf --output json --extract-fields invoice_number total_amount vendor_name
|
| 293 |
+
|
| 294 |
+
# Extract using JSON schema with cloud processing
|
| 295 |
+
docstrange document.pdf --output json --json-schema schema.json
|
| 296 |
+
|
| 297 |
+
# Save output to file
|
| 298 |
+
docstrange document.pdf --output-file output.md
|
| 299 |
+
|
| 300 |
+
# Use environment variable for API key
|
| 301 |
+
export NANONETS_API_KEY=your_api_key
|
| 302 |
+
docstrange document.pdf
|
| 303 |
+
|
| 304 |
+
# List supported formats
|
| 305 |
+
docstrange --list-formats
|
| 306 |
+
|
| 307 |
+
# Show version
|
| 308 |
+
docstrange --version
|
| 309 |
+
"""
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
parser.add_argument(
|
| 313 |
+
"input",
|
| 314 |
+
nargs="*",
|
| 315 |
+
help="Input file(s), URL(s), or text to extract"
|
| 316 |
+
)
|
| 317 |
+
|
| 318 |
+
parser.add_argument(
|
| 319 |
+
"--output", "-o",
|
| 320 |
+
choices=["markdown", "html", "json", "text", "csv"],
|
| 321 |
+
default="markdown",
|
| 322 |
+
help="Output format (default: markdown)"
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
# Processing mode arguments
|
| 326 |
+
parser.add_argument(
|
| 327 |
+
"--gpu-mode",
|
| 328 |
+
action="store_true",
|
| 329 |
+
help="Force local GPU processing (disables cloud mode, requires GPU)"
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
parser.add_argument(
|
| 333 |
+
"--api-key",
|
| 334 |
+
help="API key for increased cloud access (get it free from https://app.nanonets.com/#/keys)"
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
+
parser.add_argument(
|
| 338 |
+
"--api-keys",
|
| 339 |
+
nargs="+",
|
| 340 |
+
help="Multiple API keys for automatic rotation when one hits rate limit"
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
+
parser.add_argument(
|
| 344 |
+
"--model",
|
| 345 |
+
choices=["gemini", "openapi", "nanonets"],
|
| 346 |
+
help="Model to use for cloud processing (gemini, openapi, nanonets)"
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
parser.add_argument(
|
| 350 |
+
"--ollama-url",
|
| 351 |
+
default="http://localhost:11434",
|
| 352 |
+
help="Ollama server URL for local field extraction (default: http://localhost:11434)"
|
| 353 |
+
)
|
| 354 |
+
|
| 355 |
+
parser.add_argument(
|
| 356 |
+
"--ollama-model",
|
| 357 |
+
default="llama3.2",
|
| 358 |
+
help="Ollama model for local field extraction (default: llama3.2)"
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
parser.add_argument(
|
| 362 |
+
"--extract-fields",
|
| 363 |
+
nargs="+",
|
| 364 |
+
help="Extract specific fields using cloud processing (e.g., --extract-fields invoice_number total_amount)"
|
| 365 |
+
)
|
| 366 |
+
|
| 367 |
+
parser.add_argument(
|
| 368 |
+
"--json-schema",
|
| 369 |
+
help="JSON schema file for structured extraction using cloud processing"
|
| 370 |
+
)
|
| 371 |
+
|
| 372 |
+
parser.add_argument(
|
| 373 |
+
"--preserve-layout",
|
| 374 |
+
action="store_true",
|
| 375 |
+
default=True,
|
| 376 |
+
help="Preserve document layout (default: True)"
|
| 377 |
+
)
|
| 378 |
+
|
| 379 |
+
parser.add_argument(
|
| 380 |
+
"--include-images",
|
| 381 |
+
action="store_true",
|
| 382 |
+
help="Include images in output"
|
| 383 |
+
)
|
| 384 |
+
|
| 385 |
+
parser.add_argument(
|
| 386 |
+
"--ocr-enabled",
|
| 387 |
+
action="store_true",
|
| 388 |
+
help="Enable intelligent document processing for images and PDFs"
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
parser.add_argument(
|
| 392 |
+
"--output-file", "-f",
|
| 393 |
+
help="Output file path (if not specified, prints to stdout)"
|
| 394 |
+
)
|
| 395 |
+
|
| 396 |
+
parser.add_argument(
|
| 397 |
+
"--list-formats",
|
| 398 |
+
action="store_true",
|
| 399 |
+
help="List supported input formats and exit"
|
| 400 |
+
)
|
| 401 |
+
|
| 402 |
+
parser.add_argument(
|
| 403 |
+
"--version",
|
| 404 |
+
action="store_true",
|
| 405 |
+
help="Show version information and exit"
|
| 406 |
+
)
|
| 407 |
+
|
| 408 |
+
parser.add_argument(
|
| 409 |
+
"--verbose", "-v",
|
| 410 |
+
action="store_true",
|
| 411 |
+
help="Enable verbose output"
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
parser.add_argument(
|
| 415 |
+
"--login",
|
| 416 |
+
action="store_true",
|
| 417 |
+
help="Perform browser-based authentication login"
|
| 418 |
+
)
|
| 419 |
+
|
| 420 |
+
parser.add_argument(
|
| 421 |
+
"--reauth",
|
| 422 |
+
action="store_true",
|
| 423 |
+
help="Force re-authentication (use with --login)"
|
| 424 |
+
)
|
| 425 |
+
|
| 426 |
+
parser.add_argument(
|
| 427 |
+
"--logout",
|
| 428 |
+
action="store_true",
|
| 429 |
+
help="Clear cached authentication credentials"
|
| 430 |
+
)
|
| 431 |
+
|
| 432 |
+
args = parser.parse_args()
|
| 433 |
+
|
| 434 |
+
# Handle version flag
|
| 435 |
+
if args.version:
|
| 436 |
+
print_version()
|
| 437 |
+
return 0
|
| 438 |
+
|
| 439 |
+
# Handle list formats flag
|
| 440 |
+
if args.list_formats:
|
| 441 |
+
# Create a extractor to get supported formats
|
| 442 |
+
extractor = DocumentExtractor(
|
| 443 |
+
api_key=args.api_key,
|
| 444 |
+
model=args.model,
|
| 445 |
+
gpu=args.gpu_mode
|
| 446 |
+
)
|
| 447 |
+
print_supported_formats(extractor)
|
| 448 |
+
return 0
|
| 449 |
+
|
| 450 |
+
# Handle authentication commands
|
| 451 |
+
# Check if first argument is "login" command
|
| 452 |
+
if args.input and args.input[0] == "login":
|
| 453 |
+
force_reauth = "--reauth" in sys.argv
|
| 454 |
+
return handle_login(force_reauth)
|
| 455 |
+
|
| 456 |
+
# Handle API key management commands
|
| 457 |
+
if args.input and args.input[0] == "api-keys":
|
| 458 |
+
return handle_api_keys_command(sys.argv[1:])
|
| 459 |
+
|
| 460 |
+
# Handle web command
|
| 461 |
+
if args.input and args.input[0] == "web":
|
| 462 |
+
try:
|
| 463 |
+
from .web_app import run_web_app
|
| 464 |
+
print("Starting DocStrange web interface...")
|
| 465 |
+
print("Open your browser and go to: http://localhost:8000")
|
| 466 |
+
print("Press Ctrl+C to stop the server")
|
| 467 |
+
run_web_app(host='0.0.0.0', port=8000, debug=False)
|
| 468 |
+
return 0
|
| 469 |
+
except ImportError:
|
| 470 |
+
print("❌ Web interface not available. Install Flask: pip install Flask", file=sys.stderr)
|
| 471 |
+
return 1
|
| 472 |
+
|
| 473 |
+
# Handle login flags
|
| 474 |
+
if args.login or args.logout:
|
| 475 |
+
if args.logout:
|
| 476 |
+
return handle_logout()
|
| 477 |
+
else:
|
| 478 |
+
return handle_login(args.reauth)
|
| 479 |
+
|
| 480 |
+
# Check if input is provided
|
| 481 |
+
if not args.input:
|
| 482 |
+
parser.error("No input specified. Please provide file(s), URL(s), or text to extract.")
|
| 483 |
+
|
| 484 |
+
# Cloud mode is default. Without login/API key it's limited calls.
|
| 485 |
+
# Use 'docstrange login' (recommended) or --api-key for 10k docs/month for free.
|
| 486 |
+
|
| 487 |
+
# Initialize extractor
|
| 488 |
+
extractor = DocumentExtractor(
|
| 489 |
+
api_key=args.api_key,
|
| 490 |
+
api_keys=args.api_keys,
|
| 491 |
+
model=args.model,
|
| 492 |
+
gpu=args.gpu_mode
|
| 493 |
+
)
|
| 494 |
+
|
| 495 |
+
if args.verbose:
|
| 496 |
+
mode = "local" if args.gpu_mode else "cloud"
|
| 497 |
+
print(f"Initialized extractor in {mode} mode:")
|
| 498 |
+
print(f" - Output format: {args.output}")
|
| 499 |
+
if mode == "cloud":
|
| 500 |
+
pool_stats = extractor.get_api_key_pool_stats()
|
| 501 |
+
print(f" - API Key Pool: {pool_stats['available']}/{pool_stats['total_keys']} keys available")
|
| 502 |
+
if args.model:
|
| 503 |
+
print(f" - Model: {args.model}")
|
| 504 |
+
else:
|
| 505 |
+
print(f" - Local processing: GPU")
|
| 506 |
+
print()
|
| 507 |
+
|
| 508 |
+
# Process inputs
|
| 509 |
+
results = []
|
| 510 |
+
errors = []
|
| 511 |
+
|
| 512 |
+
for i, input_item in enumerate(args.input, 1):
|
| 513 |
+
if args.verbose and len(args.input) > 1:
|
| 514 |
+
print(f"[{i}/{len(args.input)}] Processing: {input_item}", file=sys.stderr)
|
| 515 |
+
|
| 516 |
+
result = process_single_input(extractor, input_item, args.output, args.verbose)
|
| 517 |
+
|
| 518 |
+
if result["success"]:
|
| 519 |
+
results.append(result["result"])
|
| 520 |
+
if not args.verbose:
|
| 521 |
+
print(f"Processing ... : {input_item}", file=sys.stderr)
|
| 522 |
+
else:
|
| 523 |
+
errors.append(result)
|
| 524 |
+
print(f"❌ Failed: {input_item} - {result['error']}", file=sys.stderr)
|
| 525 |
+
|
| 526 |
+
# Check if we have any successful results
|
| 527 |
+
if not results:
|
| 528 |
+
print("❌ No files were successfully processed.", file=sys.stderr)
|
| 529 |
+
if errors:
|
| 530 |
+
print("Errors encountered:", file=sys.stderr)
|
| 531 |
+
for error in errors:
|
| 532 |
+
print(f" - {error['input_item']}: {error['error']}", file=sys.stderr)
|
| 533 |
+
return 1
|
| 534 |
+
|
| 535 |
+
# Generate output
|
| 536 |
+
if len(results) == 1:
|
| 537 |
+
# Single result
|
| 538 |
+
result = results[0]
|
| 539 |
+
if args.output == "markdown":
|
| 540 |
+
output_content = result.extract_markdown()
|
| 541 |
+
elif args.output == "html":
|
| 542 |
+
output_content = result.extract_html()
|
| 543 |
+
elif args.output == "json":
|
| 544 |
+
# Handle field extraction if specified
|
| 545 |
+
json_schema = None
|
| 546 |
+
if args.json_schema:
|
| 547 |
+
try:
|
| 548 |
+
with open(args.json_schema, 'r') as f:
|
| 549 |
+
json_schema = json.load(f)
|
| 550 |
+
except Exception as e:
|
| 551 |
+
print(f"Error loading JSON schema: {e}", file=sys.stderr)
|
| 552 |
+
sys.exit(1)
|
| 553 |
+
|
| 554 |
+
try:
|
| 555 |
+
result_json = result.extract_data(
|
| 556 |
+
specified_fields=args.extract_fields,
|
| 557 |
+
json_schema=json_schema,
|
| 558 |
+
)
|
| 559 |
+
output_content = json.dumps(result_json, indent=2)
|
| 560 |
+
except Exception as e:
|
| 561 |
+
print(f"Error during JSON extraction: {e}", file=sys.stderr)
|
| 562 |
+
sys.exit(1)
|
| 563 |
+
elif args.output == "csv":
|
| 564 |
+
try:
|
| 565 |
+
output_content = result.extract_csv(include_all_tables=True)
|
| 566 |
+
except ValueError as e:
|
| 567 |
+
print(f"Error: {e}", file=sys.stderr)
|
| 568 |
+
sys.exit(1)
|
| 569 |
+
else: # text
|
| 570 |
+
output_content = result.extract_text()
|
| 571 |
+
else:
|
| 572 |
+
# Multiple results - combine them
|
| 573 |
+
if args.output == "markdown":
|
| 574 |
+
output_content = "\n\n---\n\n".join(r.extract_markdown() for r in results)
|
| 575 |
+
elif args.output == "html":
|
| 576 |
+
output_content = "\n\n<hr>\n\n".join(r.extract_html() for r in results)
|
| 577 |
+
elif args.output == "json":
|
| 578 |
+
# Handle field extraction for multiple results
|
| 579 |
+
json_schema = None
|
| 580 |
+
if args.json_schema:
|
| 581 |
+
try:
|
| 582 |
+
with open(args.json_schema, 'r') as f:
|
| 583 |
+
json_schema = json.load(f)
|
| 584 |
+
except Exception as e:
|
| 585 |
+
print(f"Error loading JSON schema: {e}", file=sys.stderr)
|
| 586 |
+
sys.exit(1)
|
| 587 |
+
|
| 588 |
+
try:
|
| 589 |
+
extracted_results = []
|
| 590 |
+
for r in results:
|
| 591 |
+
result_json = r.extract_data(
|
| 592 |
+
specified_fields=args.extract_fields,
|
| 593 |
+
json_schema=json_schema,
|
| 594 |
+
)
|
| 595 |
+
extracted_results.append(result_json)
|
| 596 |
+
|
| 597 |
+
combined_json = {
|
| 598 |
+
"results": extracted_results,
|
| 599 |
+
"count": len(results),
|
| 600 |
+
"errors": [{"input": e["input_item"], "error": e["error"]} for e in errors] if errors else []
|
| 601 |
+
}
|
| 602 |
+
output_content = json.dumps(combined_json, indent=2)
|
| 603 |
+
except Exception as e:
|
| 604 |
+
print(f"Error during JSON extraction: {e}", file=sys.stderr)
|
| 605 |
+
sys.exit(1)
|
| 606 |
+
elif args.output == "csv":
|
| 607 |
+
csv_outputs = []
|
| 608 |
+
for i, r in enumerate(results):
|
| 609 |
+
try:
|
| 610 |
+
csv_content = r.extract_csv(include_all_tables=True)
|
| 611 |
+
if csv_content.strip():
|
| 612 |
+
csv_outputs.append(f"=== File {i + 1} ===\n{csv_content}")
|
| 613 |
+
except ValueError:
|
| 614 |
+
# Skip files without tables
|
| 615 |
+
continue
|
| 616 |
+
if not csv_outputs:
|
| 617 |
+
print("Error: No tables found in any of the input files", file=sys.stderr)
|
| 618 |
+
sys.exit(1)
|
| 619 |
+
output_content = "\n\n".join(csv_outputs)
|
| 620 |
+
else: # text
|
| 621 |
+
output_content = "\n\n---\n\n".join(r.extract_text() for r in results)
|
| 622 |
+
|
| 623 |
+
# Write output
|
| 624 |
+
if args.output_file:
|
| 625 |
+
try:
|
| 626 |
+
with open(args.output_file, 'w', encoding='utf-8') as f:
|
| 627 |
+
f.write(output_content)
|
| 628 |
+
print(f"✅ Output written to: {args.output_file}", file=sys.stderr)
|
| 629 |
+
except Exception as e:
|
| 630 |
+
print(f"❌ Failed to write output file: {e}", file=sys.stderr)
|
| 631 |
+
return 1
|
| 632 |
+
else:
|
| 633 |
+
print(output_content)
|
| 634 |
+
|
| 635 |
+
# Summary
|
| 636 |
+
if args.verbose or len(args.input) > 1:
|
| 637 |
+
print(f"\nSummary: {len(results)} successful, {len(errors)} failed", file=sys.stderr)
|
| 638 |
+
|
| 639 |
+
return 0 if not errors else 1
|
| 640 |
+
|
| 641 |
+
|
| 642 |
+
# Script entry point: run the CLI and propagate its integer exit status
# (0 on success, 1 on any failure) to the shell.
if __name__ == "__main__":
    sys.exit(main())
|
docstrange/config.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# docstrange/config.py
|
| 2 |
+
|
| 3 |
+
class InternalConfig:
    """Internal feature flags and defaults (not exposed to end users)."""

    # Markdown conversion backend toggle.
    use_markdownify = True

    # OCR provider selection ('neural' selects the docling models).
    ocr_provider = 'neural'

    # --- PDF processing configuration ---
    pdf_to_image_enabled = True   # Convert PDF pages to images for OCR
    pdf_image_dpi = 300           # DPI used for the PDF-to-image conversion
    pdf_image_scale = 2.0         # Scale factor for better OCR accuracy

    # Add other internal config options here as needed,
    # e.g. default_ocr_lang = 'en'
    # e.g. enable_layout_aware_ocr = True
|
docstrange/exceptions.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Custom exceptions for the LLM Data Converter library."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class ConversionError(Exception):
    """Raised when document conversion fails."""
    pass


class UnsupportedFormatError(Exception):
    """Raised when the input format is not supported."""
    pass


class DocumentNotFoundError(FileNotFoundError):
    """Raised when the input file is not found.

    Subclasses the builtin ``FileNotFoundError`` (and therefore ``OSError``)
    so that callers catching the builtin exception also catch this
    library-specific one.
    """
    pass


class NetworkError(Exception):
    """Raised when network operations fail (e.g., URL fetching)."""
    pass


# Backwards compatibility alias (deprecated: use DocumentNotFoundError instead)
FileNotFoundError = DocumentNotFoundError
|
docstrange/extractor.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Main extractor class for handling document conversion."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
from .processors import (
|
| 8 |
+
PDFProcessor,
|
| 9 |
+
DOCXProcessor,
|
| 10 |
+
TXTProcessor,
|
| 11 |
+
ExcelProcessor,
|
| 12 |
+
URLProcessor,
|
| 13 |
+
HTMLProcessor,
|
| 14 |
+
PPTXProcessor,
|
| 15 |
+
ImageProcessor,
|
| 16 |
+
CloudProcessor,
|
| 17 |
+
GPUProcessor,
|
| 18 |
+
)
|
| 19 |
+
from .result import ConversionResult
|
| 20 |
+
from .exceptions import ConversionError, UnsupportedFormatError, FileNotFoundError
|
| 21 |
+
from .utils.gpu_utils import should_use_gpu_processor
|
| 22 |
+
from .services.api_key_pool import ApiKeyPool
|
| 23 |
+
|
| 24 |
+
# Configure logging
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class DocumentExtractor:
    """Main class for converting documents to LLM-ready formats.

    Cloud processing is the default mode; pass ``gpu=True`` to force local
    GPU processing instead.
    """

    # File formats the GPU (Nanonets OCR) processor can handle directly.
    # Hoisted here because it was previously duplicated in _get_processor
    # and get_supported_formats.
    _GPU_SUPPORTED_FORMATS = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif', '.pdf')

    def __init__(
        self,
        preserve_layout: bool = True,
        include_images: bool = True,
        ocr_enabled: bool = True,
        api_key: Optional[str] = None,
        api_keys: Optional[List[str]] = None,
        model: Optional[str] = None,
        gpu: bool = False
    ):
        """Initialize the file extractor.

        Args:
            preserve_layout: Whether to preserve document layout
            include_images: Whether to include images in output
            ocr_enabled: Whether to enable OCR for image and PDF processing
            api_key: Single API key for cloud processing. Prefer 'docstrange login' for 10k docs/month
            api_keys: List of API keys for automatic rotation when one hits rate limit
            model: Model to use for cloud processing (gemini, openapi) - only for cloud mode
            gpu: Force local GPU processing (disables cloud mode, requires GPU)

        Raises:
            RuntimeError: If ``gpu=True`` but no compatible GPU is available.

        Note:
            - Cloud mode is default unless gpu is specified
            - Multiple api_keys enable automatic rotation on rate limit
            - Without login/API key, limited calls per day
            - For 10k docs/month, run 'docstrange login' (recommended) or use API keys
        """
        self.preserve_layout = preserve_layout
        self.include_images = include_images
        self.api_key = api_key
        self.api_keys_list = api_keys or []
        self.model = model
        self.gpu = gpu

        # Cloud mode is the default unless GPU preference is explicitly set.
        self.cloud_mode = not self.gpu

        # Fail fast if GPU processing was requested but no GPU is present.
        if self.gpu and not should_use_gpu_processor():
            raise RuntimeError(
                "GPU preference specified but no GPU is available. "
                "Please ensure CUDA is installed and a compatible GPU is present."
            )

        # Treat an explicit None the same as the default (OCR enabled).
        self.ocr_enabled = True if ocr_enabled is None else ocr_enabled

        # Shared pool of API keys (enables rotation when a key is rate-limited).
        self.api_key_pool = ApiKeyPool.get_instance()

        # Add any keys supplied directly by the caller.
        if api_key:
            self.api_key_pool.add_key(api_key, source="constructor")
        for key in self.api_keys_list:
            self.api_key_pool.add_key(key, source="constructor_list")

        # In cloud mode without an explicit key, fall back to environment
        # variables and then to cached login credentials.
        if self.cloud_mode and not self.api_key:
            env_keys = os.environ.get('NANONETS_API_KEYS', '')
            if env_keys:
                for key in env_keys.split(','):
                    key = key.strip()
                    if key:
                        self.api_key_pool.add_key(key, source="env")

            # Also check single env var for backward compat.
            single_key = os.environ.get('NANONETS_API_KEY')
            if single_key:
                self.api_key_pool.add_key(single_key, source="env_single")

            # If still no API keys, try cached credentials from a prior login.
            if not self.api_key_pool.has_available_keys():
                try:
                    from .services.auth_service import get_authenticated_token
                    cached_token = get_authenticated_token(force_reauth=False)
                    if cached_token:
                        self.api_key_pool.add_key(cached_token, source="cached_credentials")
                        logger.info("Added cached authentication credentials to API key pool")
                except ImportError:
                    logger.debug("Authentication service not available")
                except Exception as e:
                    logger.warning(f"Could not retrieve cached credentials: {e}")

        # Pre-create local GPU processor for fallback (if a GPU is available).
        self.local_gpu_processor = None
        if should_use_gpu_processor():
            try:
                self.local_gpu_processor = GPUProcessor(
                    preserve_layout=preserve_layout,
                    include_images=include_images,
                    ocr_enabled=ocr_enabled
                )
                logger.info("Local GPU processor available for fallback")
            except Exception as e:
                logger.warning(f"Could not initialize local GPU processor: {e}")

        # Build the processor chain for the selected mode.
        self.processors = []

        if self.cloud_mode:
            # Cloud mode: one cloud processor with key pool and local fallback.
            cloud_processor = CloudProcessor(
                api_key=self.api_key,  # Can be None, pool will be used
                model_type=self.model,
                preserve_layout=preserve_layout,
                include_images=include_images,
                api_key_pool=self.api_key_pool,
                local_fallback_processor=self.local_gpu_processor
            )
            self.processors.append(cloud_processor)

            pool_stats = self.api_key_pool.get_pool_stats()
            if pool_stats["available"] > 0:
                logger.info(f"Cloud processing enabled with {pool_stats['available']} API key(s) in pool")
            else:
                logger.info("Cloud processing enabled without API keys - will use local fallback when needed")
        else:
            # Local mode: register the per-format processors.
            logger.info("Local processing mode enabled")
            self._setup_local_processors()

    def authenticate(self, force_reauth: bool = False) -> bool:
        """Perform browser-based authentication and update the API key.

        Args:
            force_reauth: Force re-authentication even if cached credentials exist

        Returns:
            True if authentication successful, False otherwise
        """
        try:
            from .services.auth_service import get_authenticated_token

            token = get_authenticated_token(force_reauth=force_reauth)
            if not token:
                return False

            self.api_key = token

            # Add to the pool and propagate to any processors that hold a key.
            self.api_key_pool.add_key(token, source="authenticated")
            for processor in self.processors:
                if hasattr(processor, 'api_key'):
                    processor.api_key = token
                    logger.info("Updated processor with new authentication token")

            return True

        except ImportError:
            logger.error("Authentication service not available")
            return False
        except Exception as e:
            logger.error(f"Authentication failed: {e}")
            return False

    def _setup_local_processors(self):
        """Register one processor per supported local format (plus GPU if requested)."""
        local_processors = [
            PDFProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled),
            DOCXProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            TXTProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            ExcelProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            HTMLProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            PPTXProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            ImageProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled),
            URLProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
        ]

        # Add the GPU processor if GPU preference was specified.
        if self.gpu:
            logger.info("GPU preference specified - adding GPU processor with Nanonets OCR")
            gpu_processor = GPUProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled)
            local_processors.append(gpu_processor)

        self.processors.extend(local_processors)

    def extract(self, file_path: str) -> ConversionResult:
        """Convert a file to internal format.

        Args:
            file_path: Path to the file to extract

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            UnsupportedFormatError: If the format is not supported
            ConversionError: If conversion fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Find the appropriate processor for this file type.
        processor = self._get_processor(file_path)
        if not processor:
            raise UnsupportedFormatError(f"No processor found for file: {file_path}")

        logger.info(f"Using processor {processor.__class__.__name__} for {file_path}")

        return processor.process(file_path)

    def convert_with_output_type(self, file_path: str, output_type: str) -> ConversionResult:
        """Convert a file with specific output type for cloud processing.

        Args:
            file_path: Path to the file to extract
            output_type: Desired output type (markdown, flat-json, html)

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            UnsupportedFormatError: If the format is not supported
            ConversionError: If conversion fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # For cloud mode, create a processor bound to the requested output type.
        if self.cloud_mode:
            cloud_processor = CloudProcessor(
                api_key=self.api_key,
                output_type=output_type,
                model_type=self.model,
                preserve_layout=self.preserve_layout,
                include_images=self.include_images,
                api_key_pool=self.api_key_pool,
                local_fallback_processor=self.local_gpu_processor
            )
            if cloud_processor.can_process(file_path):
                logger.info(f"Using cloud processor with output_type={output_type} for {file_path}")
                return cloud_processor.process(file_path)

        # Fallback to regular conversion for local mode.
        return self.extract(file_path)

    def extract_url(self, url: str) -> ConversionResult:
        """Convert a URL to internal format.

        Args:
            url: URL to extract

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If conversion fails or cloud mode is active
        """
        # Cloud mode doesn't support URL conversion.
        if self.cloud_mode:
            raise ConversionError("URL conversion is not supported in cloud mode. Use local mode for URL processing.")

        # Find the URL processor among the registered processors.
        url_processor = None
        for processor in self.processors:
            if isinstance(processor, URLProcessor):
                url_processor = processor
                break

        if not url_processor:
            raise ConversionError("URL processor not available")

        logger.info(f"Converting URL: {url}")
        return url_processor.process(url)

    def extract_text(self, text: str) -> ConversionResult:
        """Convert plain text to internal format.

        Args:
            text: Plain text to extract

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If cloud mode is active
        """
        # Cloud mode doesn't support text conversion.
        if self.cloud_mode:
            raise ConversionError("Text conversion is not supported in cloud mode. Use local mode for text processing.")

        metadata = {
            "content_type": "text",
            "processor": "TextConverter",
            "preserve_layout": self.preserve_layout
        }

        return ConversionResult(text, metadata)

    def is_cloud_enabled(self) -> bool:
        """Check if cloud processing is enabled and configured.

        Returns:
            True if cloud processing is available
        """
        return self.cloud_mode and (bool(self.api_key) or self.api_key_pool.has_available_keys())

    def get_processing_mode(self) -> str:
        """Get the current processing mode.

        Returns:
            String describing the current processing mode
        """
        pool_stats = self.api_key_pool.get_pool_stats()
        if self.cloud_mode and pool_stats["available"] > 0:
            return f"cloud ({pool_stats['available']} key(s))"
        elif self.cloud_mode and self.local_gpu_processor:
            return "cloud (local fallback ready)"
        elif self.gpu:
            return "gpu_forced"
        elif should_use_gpu_processor():
            return "gpu_auto"
        else:
            return "cloud"

    def get_api_key_pool_stats(self) -> dict:
        """Get API key pool statistics.

        Returns:
            Dictionary with pool statistics
        """
        return self.api_key_pool.get_pool_stats()

    def _get_processor(self, file_path: str):
        """Get the appropriate processor for the file.

        Args:
            file_path: Path to the file

        Returns:
            Processor that can handle the file, or None if none found
        """
        _, ext = os.path.splitext(file_path.lower())

        gpu_available = should_use_gpu_processor()

        # Prefer the GPU processor when the format supports it and either GPU
        # was explicitly requested or one is available.
        # (Original condition `self.gpu or (gpu_available and not self.gpu)`
        # is logically equivalent to `self.gpu or gpu_available`.)
        if ext in self._GPU_SUPPORTED_FORMATS and (self.gpu or gpu_available):
            for processor in self.processors:
                if isinstance(processor, GPUProcessor):
                    if self.gpu:
                        logger.info(f"Using GPU processor with Nanonets OCR for {file_path} (GPU preference specified)")
                    else:
                        logger.info(f"Using GPU processor with Nanonets OCR for {file_path} (GPU available and format supported)")
                    return processor

        # Fallback to normal processor selection.
        for processor in self.processors:
            if processor.can_process(file_path):
                # Skip GPU processor in fallback mode to avoid infinite loops.
                if isinstance(processor, GPUProcessor):
                    continue
                logger.info(f"Using {processor.__class__.__name__} for {file_path}")
                return processor
        return None

    def get_supported_formats(self) -> List[str]:
        """Get list of supported file formats.

        Returns:
            List of supported file extensions
        """
        formats = []
        # Map each registered processor type to the extensions it handles.
        # (The previous `hasattr(processor, 'can_process')` guard was always
        # true for registered processors and has been dropped.)
        for processor in self.processors:
            if isinstance(processor, PDFProcessor):
                formats.extend(['.pdf'])
            elif isinstance(processor, DOCXProcessor):
                formats.extend(['.docx', '.doc'])
            elif isinstance(processor, TXTProcessor):
                formats.extend(['.txt', '.text'])
            elif isinstance(processor, ExcelProcessor):
                formats.extend(['.xlsx', '.xls', '.csv'])
            elif isinstance(processor, HTMLProcessor):
                formats.extend(['.html', '.htm'])
            elif isinstance(processor, PPTXProcessor):
                formats.extend(['.ppt', '.pptx'])
            elif isinstance(processor, ImageProcessor):
                formats.extend(['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'])
            elif isinstance(processor, URLProcessor):
                formats.append('URLs')
            elif isinstance(processor, CloudProcessor):
                # Cloud processor supports many formats; avoid duplicates here.
                pass
            elif isinstance(processor, GPUProcessor):
                # GPU processor supports all image formats and PDFs.
                formats.extend(self._GPU_SUPPORTED_FORMATS)

        return list(set(formats))  # Remove duplicates
|
docstrange/pipeline/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Pipeline package for document processing and OCR."""
|
docstrange/pipeline/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (204 Bytes). View file
|
|
|
docstrange/pipeline/__pycache__/ocr_service.cpython-310.pyc
ADDED
|
Binary file (5.79 kB). View file
|
|
|
docstrange/pipeline/layout_detector.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Layout detection and markdown generation for document processing."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import logging
|
| 5 |
+
from typing import List, Dict, Tuple
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class LayoutElement:
    """A positioned piece of document content (text plus its bounding box).

    Attributes:
        text: The textual content of the element.
        x, y: Top-left corner of the element on the page.
        width, height: Size of the element in pixels.
        element_type: Semantic category of the element (defaults to "text").
        confidence: Recognition confidence reported by the OCR engine.
        bbox: Cached (left, top, right, bottom) box derived from the above.
    """

    def __init__(self, text: str, x: int, y: int, width: int, height: int,
                 element_type: str = "text", confidence: float = 0.0):
        self.text = text
        self.x = x
        self.y = y
        self.width = width
        self.height = height
        self.element_type = element_type
        self.confidence = confidence
        # Pre-compute the (x1, y1, x2, y2) corners so callers can index cheaply.
        self.bbox = (x, y, x + width, y + height)

    def area(self) -> int:
        """Return the covered area in square pixels."""
        return self.width * self.height

    def center_y(self) -> float:
        """Return the vertical midpoint of the element."""
        return self.y + self.height / 2

    def center_x(self) -> float:
        """Return the horizontal midpoint of the element."""
        return self.x + self.width / 2
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class LayoutDetector:
    """Turns OCR text blocks into structured markdown.

    Combines positional clustering with text heuristics to recover
    paragraphs, headings, list items and simple tables from raw OCR output.
    """

    def __init__(self):
        """Set up the heuristics used by the layout analysis."""
        # Page-region thresholds (fractions of page height).
        self._header_threshold = 0.15   # top 15% of page considered header area
        self._footer_threshold = 0.85   # bottom 15% of page considered footer area
        # Blocks this many times taller than average may be headings.
        self._heading_height_threshold = 1.5
        # Regexes that mark the start of a list item.
        self._list_patterns = [
            r'^\d+\.',        # numbered list
            r'^[•·▪▫◦‣⁃]',    # bullet points
            r'^[-*+]',        # markdown list markers
            r'^[a-zA-Z]\.',   # lettered list
        ]

    def convert_to_structured_markdown(self, text_blocks: List["LayoutElement"], image_size: Tuple[int, int]) -> str:
        """Render OCR blocks as markdown with headings, lists and tables.

        Args:
            text_blocks: OCR elements to convert (sorted in place).
            image_size: (width, height) of the source page, forwarded to
                the grouping step.

        Returns:
            Markdown text with paragraphs separated by blank lines.
        """
        if not text_blocks:
            return ""

        # Reading order: top-to-bottom, ties broken left-to-right.
        text_blocks.sort(key=lambda b: (b.y, b.x))

        parts = []
        for para in self._group_into_paragraphs_advanced(text_blocks, image_size):
            if not para:
                continue
            kind = self._classify_paragraph(para)
            if kind == "heading":
                depth = self._determine_heading_level_from_text(para)
                parts.append(f"{'#' * depth} {para}")
            elif kind == "list_item":
                parts.append(f"- {para}")
            elif kind == "table_row":
                parts.append(self._format_table_row(para))
            else:
                parts.append(para)

        return '\n\n'.join(parts)

    def _group_into_paragraphs_advanced(self, text_blocks: List["LayoutElement"], image_size: Tuple[int, int]) -> List[str]:
        """Cluster blocks into paragraph strings by vertical proximity.

        A block joins the current paragraph while its y is within ~1.5x the
        average block height of the paragraph's anchor line.
        """
        if not text_blocks:
            return []

        # Size-relative gap threshold based on average block height.
        heights = [b.height for b in text_blocks]
        avg_height = np.mean(heights) if heights else 20
        gap_limit = 1.5 * avg_height

        paragraphs: List[str] = []
        bucket: List["LayoutElement"] = []
        anchor_y = text_blocks[0].y

        for block in text_blocks:
            if abs(block.y - anchor_y) <= gap_limit:
                bucket.append(block)
            else:
                # Gap too large: flush the finished paragraph, start a new one.
                joined = self._join_paragraph_text_advanced(bucket)
                if joined:
                    paragraphs.append(joined)
                bucket = [block]
                anchor_y = block.y

        # Flush the trailing paragraph.
        if bucket:
            joined = self._join_paragraph_text_advanced(bucket)
            if joined:
                paragraphs.append(joined)

        return paragraphs

    def _join_paragraph_text_advanced(self, text_blocks: List["LayoutElement"]) -> str:
        """Merge a paragraph's blocks into one string with sane spacing."""
        if not text_blocks:
            return ""

        # Re-establish reading order within the paragraph (sorts in place).
        text_blocks.sort(key=lambda b: (b.y, b.x))

        pieces = [b.text.strip() for b in text_blocks if b.text.strip()]
        if not pieces:
            return ""

        out = pieces[0]
        for piece in pieces[1:]:
            last = out[-1] if out else ""
            first = piece[0] if piece else ""
            # Glue directly where a space would look wrong: before
            # punctuation, inside brackets, or across a hyphen break.
            if (first in ',.!?;:'
                    or last in '([{'
                    or first in ')]}'
                    or (last == '-' and first.isalpha())):
                out += piece
            else:
                out += " " + piece

        return self._post_process_text(out).strip()

    def _post_process_text(self, text: str) -> str:
        """Clean up common OCR artifacts in joined text."""
        # In running text a pipe is almost always a misread capital I.
        text = text.replace('|', 'I')

        # Deliberately left alone: '0'->'o' / '1'->'l' swaps would corrupt
        # genuine numbers (e.g. "100", "2024").

        # Normalise whitespace and sentence spacing.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

        # Strip characters outside the expected printable set.
        text = re.sub(r'[^\w\s.,!?;:()[\]{}"\'-]', '', text)

        return text

    def _classify_paragraph(self, text: str) -> str:
        """Label text as "heading", "list_item", "table_row" or "text"."""
        text = text.strip()

        if self._is_list_item(text):
            return "list_item"

        if self._is_table_row(text):
            return "table_row"

        # Short fragments that end in a period or are all-caps read as headings.
        if len(text.split()) <= 5 and (text.endswith('.') or text.isupper()):
            return "heading"

        return "text"

    def _determine_heading_level_from_text(self, text: str) -> int:
        """Map heading length to markdown level (shorter = more prominent)."""
        word_count = len(text.strip().split())
        if word_count <= 3:
            return 1
        if word_count <= 5:
            return 2
        return 3

    def _is_list_item(self, text: str) -> bool:
        """Return True when the text starts like a bullet/numbered list entry."""
        stripped = text.strip()
        return any(re.match(pattern, stripped) for pattern in self._list_patterns)

    def _is_table_row(self, text: str) -> bool:
        """Heuristic check for tabular content."""
        # Explicit separators are the strongest signal.
        if '|' in text or '\t' in text:
            return True

        # Several words with double-space runs often mean aligned columns.
        if len(text.split()) >= 4 and '  ' in text:
            return True

        return False

    def _format_table_row(self, text: str) -> str:
        """Render a detected table row as a markdown table line."""
        # Split on whichever separator the row actually uses.
        if '|' in text:
            raw_cells = text.split('|')
        elif '\t' in text:
            raw_cells = text.split('\t')
        else:
            raw_cells = re.split(r'\s{2,}', text)

        cells = [cell.strip() for cell in raw_cells]
        return '| ' + ' | '.join(cells) + ' |'

    def join_text_properly(self, texts: List[str]) -> str:
        """Join word/phrase fragments into a sentence with smart spacing."""
        if not texts:
            return ""

        fragments = [t.strip() for t in texts if t.strip()]
        if not fragments:
            return ""

        out = fragments[0]
        for frag in fragments[1:]:
            last = out[-1] if out else ""
            first = frag[0] if frag else ""
            # No space before punctuation or closing brackets, nor after
            # opening brackets.
            if first in ',.!?;:' or last in '([{' or first in ')]}':
                out += frag
            else:
                out += " " + frag

        return out.strip()

    def create_layout_element_from_block(self, block_data: List[Dict]) -> "LayoutElement":
        """Build a single LayoutElement spanning a block of OCR word dicts.

        Each dict is expected to carry 'text', 'x', 'y', 'width', 'height',
        'conf', 'line_num' and 'word_num' keys (Tesseract-style output).
        """
        if not block_data:
            return LayoutElement("", 0, 0, 0, 0)

        # Reading order within the block (sorts in place).
        block_data.sort(key=lambda w: (w['line_num'], w['word_num']))

        xs = [w['x'] for w in block_data]
        ys = [w['y'] for w in block_data]
        left = min(xs)
        top = min(ys)
        right = max(w['x'] + w['width'] for w in block_data)
        bottom = max(w['y'] + w['height'] for w in block_data)
        confs = [w['conf'] for w in block_data]

        return LayoutElement(
            text=self.join_text_properly([w['text'] for w in block_data]),
            x=left,
            y=top,
            width=right - left,
            height=bottom - top,
            element_type="text",
            confidence=np.mean(confs) if confs else 0.0,
        )
|
docstrange/pipeline/model_downloader.py
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model downloader utility for downloading pre-trained models from Hugging Face."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Optional
|
| 7 |
+
import requests
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from ..utils.gpu_utils import is_gpu_available, get_gpu_info
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ModelDownloader:
    """Downloads pre-trained models from Hugging Face or Nanonets S3.

    Models are fetched into a local cache directory (default:
    ``~/.cache/docstrange/models``).  Nanonets-hosted S3 archives are tried
    first; Hugging Face is the fallback (or the primary source when the
    ``document_extractor_PREFER_HF`` environment variable is ``"true"``).
    The heavy Nanonets OCR model is only handled when a GPU is available.
    """

    # Nanonets S3 model URLs (primary source)
    S3_BASE_URL = "https://public-vlms.s3-us-west-2.amazonaws.com/llm-data-extractor"

    # Model configurations with both S3 and HuggingFace sources
    LAYOUT_MODEL = {
        "s3_url": f"{S3_BASE_URL}/layout-model-v2.2.0.tar.gz",
        "repo_id": "ds4sd/docling-models",
        "revision": "v2.2.0",
        "model_path": "model_artifacts/layout",
        "cache_folder": "layout"
    }

    TABLE_MODEL = {
        "s3_url": f"{S3_BASE_URL}/tableformer-model-v2.2.0.tar.gz",
        "repo_id": "ds4sd/docling-models",
        "revision": "v2.2.0",
        "model_path": "model_artifacts/tableformer",
        "cache_folder": "tableformer"
    }

    # Nanonets OCR model configuration (only used when a GPU is present)
    NANONETS_OCR_MODEL = {
        "s3_url": f"{S3_BASE_URL}/Nanonets-OCR-s.tar.gz",
        "repo_id": "nanonets/Nanonets-OCR-s",
        "revision": "main",
        "cache_folder": "nanonets-ocr",
    }

    # Note: EasyOCR downloads its own models automatically, no need for custom model

    def __init__(self, cache_dir: Optional[Path] = None):
        """Initialize the model downloader.

        Args:
            cache_dir: Directory to cache downloaded models.  Defaults to
                ``~/.cache/docstrange/models``.
        """
        if cache_dir is None:
            cache_dir = Path.home() / ".cache" / "docstrange" / "models"

        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"Model cache directory: {self.cache_dir}")

    def download_models(self, force: bool = False, progress: bool = True) -> Path:
        """Download all required models.

        The layout and table models are always fetched; the Nanonets OCR
        model is only fetched when a GPU is available.

        Args:
            force: Force re-download even if models exist.
            progress: Show download progress.

        Returns:
            Path to the models directory.
        """
        logger.info("Downloading pre-trained models...")

        # Auto-detect GPU for Nanonets model
        gpu_available = is_gpu_available()
        logger.info(f"GPU available: {gpu_available}")
        if gpu_available:
            logger.info("GPU detected - including Nanonets OCR model")
        else:
            logger.info("No GPU detected - skipping Nanonets OCR model (cloud mode)")

        models_to_download = [
            ("Layout Model", self.LAYOUT_MODEL),
            ("Table Structure Model", self.TABLE_MODEL)
        ]

        # Add Nanonets OCR model only if GPU is available
        if gpu_available:
            models_to_download.append(("Nanonets OCR Model", self.NANONETS_OCR_MODEL))

        for model_name, model_config in models_to_download:
            logger.info(f"Downloading {model_name}...")
            self._download_model(model_config, force, progress)

        logger.info("All models downloaded successfully!")
        return self.cache_dir

    def _download_model(self, model_config: dict, force: bool, progress: bool):
        """Download a specific model, preferring S3 with a HF fallback.

        Args:
            model_config: Model configuration dictionary.
            force: Force re-download.
            progress: Show progress.
        """
        model_dir = self.cache_dir / model_config["cache_folder"]

        # Fix: require a NON-EMPTY directory before skipping.  The directory
        # is created below before downloading, so a previously failed
        # download would otherwise leave an empty dir that permanently
        # short-circuits future attempts (matches _download_from_hf's check).
        if model_dir.exists() and any(model_dir.iterdir()) and not force:
            logger.info(f"Model already exists at {model_dir}")
            return

        # Create model directory
        model_dir.mkdir(parents=True, exist_ok=True)

        success = False

        # Check if user prefers Hugging Face via environment variable
        prefer_hf = os.environ.get("document_extractor_PREFER_HF", "false").lower() == "true"

        # Try S3 first (Nanonets hosted models) unless user prefers HF
        if not prefer_hf and "s3_url" in model_config:
            try:
                logger.info(f"Downloading from Nanonets S3: {model_config['s3_url']}")
                self._download_from_s3(
                    s3_url=model_config["s3_url"],
                    local_dir=model_dir,
                    force=force,
                    progress=progress
                )
                success = True
                logger.info("Successfully downloaded from Nanonets S3")
            except Exception as e:
                logger.warning(f"S3 download failed: {e}")
                logger.info("Falling back to Hugging Face...")

        # Fallback to Hugging Face if S3 fails
        if not success:
            self._download_from_hf(
                repo_id=model_config["repo_id"],
                revision=model_config["revision"],
                local_dir=model_dir,
                force=force,
                progress=progress
            )

    def _download_from_hf(self, repo_id: str, revision: str, local_dir: Path,
                          force: bool, progress: bool):
        """Download model from Hugging Face via ``snapshot_download``.

        Authentication (401) failures are logged with remediation steps and
        swallowed so the library can continue with basic OCR capabilities.

        Args:
            repo_id: Hugging Face repository ID.
            revision: Git revision/tag.
            local_dir: Local directory to save model.
            force: Force re-download.
            progress: Show progress.
        """
        try:
            from huggingface_hub import snapshot_download
            from huggingface_hub.utils import disable_progress_bars
            import huggingface_hub

            if not progress:
                disable_progress_bars()

            # Check if models are already downloaded (non-empty dir).
            if local_dir.exists() and any(local_dir.iterdir()):
                logger.info(f"Model {repo_id} already exists at {local_dir}")
                return

            # Try to download with current authentication
            try:
                download_path = snapshot_download(
                    repo_id=repo_id,
                    force_download=force,
                    local_dir=str(local_dir),
                    revision=revision,
                    token=None,  # Use default token if available
                )
                logger.info(f"Successfully downloaded {repo_id} to {download_path}")

            except huggingface_hub.errors.HfHubHTTPError as e:
                if "401" in str(e) or "Unauthorized" in str(e):
                    logger.warning(
                        f"Authentication failed for {repo_id}. This model may require a Hugging Face token.\n"
                        "To fix this:\n"
                        "1. Create a free account at https://huggingface.co/\n"
                        "2. Generate a token at https://huggingface.co/settings/tokens\n"
                        "3. Set it as environment variable: export HF_TOKEN='your_token_here'\n"
                        "4. Or run: huggingface-cli login\n\n"
                        "The library will continue with basic OCR capabilities."
                    )
                    # Don't raise the error, just log it and continue
                    return
                else:
                    raise

        except ImportError:
            logger.error("huggingface_hub not available. Please install it: pip install huggingface_hub")
            raise
        except Exception as e:
            logger.error(f"Failed to download model {repo_id}: {e}")
            # Don't raise for authentication errors - allow fallback processing
            if "401" not in str(e) and "Unauthorized" not in str(e):
                raise

    def get_model_path(self, model_type: str) -> Optional[Path]:
        """Get the path to a specific model.

        Args:
            model_type: Type of model ('layout', 'table', 'nanonets-ocr').

        Returns:
            Path to the model directory, or None if not found.
        """
        model_mapping = {
            'layout': self.LAYOUT_MODEL["cache_folder"],
            'table': self.TABLE_MODEL["cache_folder"],
            'nanonets-ocr': self.NANONETS_OCR_MODEL["cache_folder"]
        }

        if model_type not in model_mapping:
            logger.error(f"Unknown model type: {model_type}")
            return None

        model_path = self.cache_dir / model_mapping[model_type]

        if not model_path.exists():
            logger.warning(f"Model {model_type} not found at {model_path}")
            return None

        return model_path

    def are_models_cached(self) -> bool:
        """Check if all required models are cached.

        The Nanonets OCR model is only required when a GPU is available.

        Returns:
            True if all required models are cached, False otherwise.
        """
        layout_path = self.get_model_path('layout')
        table_path = self.get_model_path('table')

        if layout_path is None or table_path is None:
            return False

        # Only check for Nanonets model if GPU is available
        if is_gpu_available():
            return self.get_model_path('nanonets-ocr') is not None

        return True

    def _download_from_s3(self, s3_url: str, local_dir: Path, force: bool, progress: bool):
        """Download and extract a model archive from Nanonets S3.

        Args:
            s3_url: S3 URL of the model archive (``.tar.gz``).
            local_dir: Local directory to extract the model into.
            force: Force re-download (caching is handled by the caller).
            progress: Show a download progress bar.
        """
        import tarfile
        import tempfile

        # Fix: a timeout prevents the whole pipeline from hanging forever on
        # a stalled connection (10s to connect, 300s per read of the stream).
        response = requests.get(s3_url, stream=True, timeout=(10, 300))
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))

        with tempfile.NamedTemporaryFile(suffix='.tar.gz', delete=False) as tmp_file:
            if progress and total_size > 0:
                with tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            tmp_file.write(chunk)
                            pbar.update(len(chunk))
            else:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        tmp_file.write(chunk)

            tmp_file_path = tmp_file.name

        try:
            # NOTE(security): extractall trusts archive member paths; the
            # archives come from Nanonets' own bucket, but a malicious
            # archive could write outside local_dir (path traversal).
            logger.info(f"Extracting model to {local_dir}")
            with tarfile.open(tmp_file_path, 'r:gz') as tar:
                tar.extractall(path=local_dir)

            logger.info("Model extraction completed successfully")

        finally:
            # Clean up temporary file
            try:
                os.unlink(tmp_file_path)
            except OSError:
                pass

    def get_cache_info(self) -> dict:
        """Get information about cached models.

        Returns:
            Dictionary with the cache directory, GPU info and per-model
            cache status.
        """
        info = {
            'cache_dir': str(self.cache_dir),
            'gpu_info': get_gpu_info(),
            'models': {}
        }

        # Always check layout and table models
        for model_type in ['layout', 'table']:
            path = self.get_model_path(model_type)
            info['models'][model_type] = {
                'cached': path is not None,
                'path': str(path) if path else None
            }

        # Only check Nanonets model if GPU is available
        if is_gpu_available():
            path = self.get_model_path('nanonets-ocr')
            info['models']['nanonets-ocr'] = {
                'cached': path is not None,
                'path': str(path) if path else None,
                'gpu_required': True
            }
        else:
            info['models']['nanonets-ocr'] = {
                'cached': False,
                'path': None,
                'gpu_required': True,
                'skipped': 'No GPU available'
            }

        return info
|
docstrange/pipeline/nanonets_processor.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Neural Document Processor using Nanonets OCR for superior document understanding."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
from typing import Optional
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from PIL import Image
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class NanonetsDocumentProcessor:
|
| 13 |
+
"""Neural Document Processor using Nanonets OCR model."""
|
| 14 |
+
|
| 15 |
+
    def __init__(self, cache_dir: Optional[Path] = None):
        """Initialize the Neural Document Processor with Nanonets OCR.

        Args:
            cache_dir: Optional override for the model cache directory;
                passed through to the model loader.
        """
        logger.info("Initializing Neural Document Processor with Nanonets OCR...")

        # Initialize models
        # Loads model, tokenizer and processor; raises if they are unavailable.
        self._initialize_models(cache_dir)

        logger.info("Neural Document Processor initialized successfully")
|
| 23 |
+
|
| 24 |
+
def _initialize_models(self, cache_dir: Optional[Path] = None):
|
| 25 |
+
"""Initialize Nanonets OCR model from local cache."""
|
| 26 |
+
try:
|
| 27 |
+
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
|
| 28 |
+
from .model_downloader import ModelDownloader
|
| 29 |
+
|
| 30 |
+
# Get model downloader instance
|
| 31 |
+
model_downloader = ModelDownloader(cache_dir)
|
| 32 |
+
|
| 33 |
+
# Get the path to the locally cached Nanonets model
|
| 34 |
+
model_path = model_downloader.get_model_path('nanonets-ocr')
|
| 35 |
+
|
| 36 |
+
if model_path is None:
|
| 37 |
+
raise RuntimeError(
|
| 38 |
+
"Failed to download Nanonets OCR model. "
|
| 39 |
+
"Please ensure you have sufficient disk space and internet connection."
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
# The actual model files are in a subdirectory with the same name
|
| 43 |
+
actual_model_path = model_path / "Nanonets-OCR-ss"
|
| 44 |
+
|
| 45 |
+
if not actual_model_path.exists():
|
| 46 |
+
raise RuntimeError(
|
| 47 |
+
f"Model files not found at expected path: {actual_model_path}"
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
logger.info(f"Loading Nanonets OCR model from local cache: {actual_model_path}")
|
| 51 |
+
|
| 52 |
+
# Load model from local path
|
| 53 |
+
self.model = AutoModelForImageTextToText.from_pretrained(
|
| 54 |
+
str(actual_model_path),
|
| 55 |
+
torch_dtype="auto",
|
| 56 |
+
device_map="auto",
|
| 57 |
+
local_files_only=True # Use only local files
|
| 58 |
+
)
|
| 59 |
+
self.model.eval()
|
| 60 |
+
|
| 61 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 62 |
+
str(actual_model_path),
|
| 63 |
+
local_files_only=True
|
| 64 |
+
)
|
| 65 |
+
self.processor = AutoProcessor.from_pretrained(
|
| 66 |
+
str(actual_model_path),
|
| 67 |
+
local_files_only=True
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
logger.info("Nanonets OCR model loaded successfully from local cache")
|
| 71 |
+
|
| 72 |
+
except ImportError as e:
|
| 73 |
+
logger.error(f"Transformers library not available: {e}")
|
| 74 |
+
raise ImportError(
|
| 75 |
+
"Transformers library is required for Nanonets OCR. "
|
| 76 |
+
"Please install it: pip install transformers"
|
| 77 |
+
)
|
| 78 |
+
except Exception as e:
|
| 79 |
+
logger.error(f"Failed to initialize Nanonets OCR model: {e}")
|
| 80 |
+
raise
|
| 81 |
+
|
| 82 |
+
def extract_text(self, image_path: str) -> str:
|
| 83 |
+
"""Extract text from image using Nanonets OCR."""
|
| 84 |
+
try:
|
| 85 |
+
if not os.path.exists(image_path):
|
| 86 |
+
logger.error(f"Image file does not exist: {image_path}")
|
| 87 |
+
return ""
|
| 88 |
+
|
| 89 |
+
return self._extract_text_with_nanonets(image_path)
|
| 90 |
+
|
| 91 |
+
except Exception as e:
|
| 92 |
+
logger.error(f"Nanonets OCR extraction failed: {e}")
|
| 93 |
+
return ""
|
| 94 |
+
|
| 95 |
+
def extract_text_with_layout(self, image_path: str) -> str:
|
| 96 |
+
"""Extract text with layout awareness using Nanonets OCR.
|
| 97 |
+
|
| 98 |
+
Note: Nanonets OCR already provides layout-aware extraction,
|
| 99 |
+
so this method returns the same result as extract_text().
|
| 100 |
+
"""
|
| 101 |
+
return self.extract_text(image_path)
|
| 102 |
+
|
| 103 |
+
def _extract_text_with_nanonets(self, image_path: str, max_new_tokens: int = 4096) -> str:
|
| 104 |
+
"""Extract text using Nanonets OCR model."""
|
| 105 |
+
try:
|
| 106 |
+
prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
|
| 107 |
+
|
| 108 |
+
image = Image.open(image_path)
|
| 109 |
+
messages = [
|
| 110 |
+
{"role": "system", "content": "You are a helpful assistant."},
|
| 111 |
+
{"role": "user", "content": [
|
| 112 |
+
{"type": "image", "image": f"file://{image_path}"},
|
| 113 |
+
{"type": "text", "text": prompt},
|
| 114 |
+
]},
|
| 115 |
+
]
|
| 116 |
+
|
| 117 |
+
text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 118 |
+
inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
|
| 119 |
+
inputs = inputs.to(self.model.device)
|
| 120 |
+
|
| 121 |
+
output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
|
| 122 |
+
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
|
| 123 |
+
|
| 124 |
+
output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
| 125 |
+
return output_text[0]
|
| 126 |
+
|
| 127 |
+
except Exception as e:
|
| 128 |
+
logger.error(f"Nanonets OCR extraction failed: {e}")
|
| 129 |
+
return ""
|
docstrange/pipeline/neural_document_processor.py
ADDED
|
@@ -0,0 +1,644 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Neural Document Processor using docling's pre-trained models for superior document understanding."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import platform
|
| 6 |
+
import sys
|
| 7 |
+
from typing import Optional, List, Dict, Any, Tuple
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from PIL import Image
|
| 10 |
+
import numpy as np
|
| 11 |
+
|
| 12 |
+
# macOS-specific NumPy compatibility fix
if platform.system() == "Darwin":
    try:
        import numpy as np
    except ImportError:
        pass
    else:
        # NumPy 2.x on macOS is known to misbehave with PyTorch; flip the
        # compatibility switches and warn the user up front.
        if hasattr(np, '__version__') and np.__version__.startswith('2'):
            os.environ['NUMPY_EXPERIMENTAL_ARRAY_FUNCTION'] = '0'
            os.environ['PYTORCH_NUMPY_COMPATIBILITY'] = '1'
            logger = logging.getLogger(__name__)
            logger.warning(
                "NumPy 2.x detected on macOS. This may cause compatibility issues. "
                "Consider downgrading to NumPy 1.x: pip install 'numpy<2.0.0'"
            )
|
| 29 |
+
|
| 30 |
+
# Runtime NumPy version check
|
| 31 |
+
def _check_numpy_version():
|
| 32 |
+
"""Check NumPy version and warn about compatibility issues."""
|
| 33 |
+
try:
|
| 34 |
+
import numpy as np
|
| 35 |
+
version = np.__version__
|
| 36 |
+
if version.startswith('2'):
|
| 37 |
+
logger = logging.getLogger(__name__)
|
| 38 |
+
logger.error(
|
| 39 |
+
f"NumPy {version} detected. This library requires NumPy 1.x for compatibility "
|
| 40 |
+
"with docling models. Please downgrade NumPy:\n"
|
| 41 |
+
"pip install 'numpy<2.0.0'\n"
|
| 42 |
+
"or\n"
|
| 43 |
+
"pip install --upgrade llm-data-extractor"
|
| 44 |
+
)
|
| 45 |
+
if platform.system() == "Darwin":
|
| 46 |
+
logger.error(
|
| 47 |
+
"On macOS, NumPy 2.x is known to cause crashes with PyTorch. "
|
| 48 |
+
"Downgrading to NumPy 1.x is strongly recommended."
|
| 49 |
+
)
|
| 50 |
+
return False
|
| 51 |
+
return True
|
| 52 |
+
except ImportError:
|
| 53 |
+
return True
|
| 54 |
+
|
| 55 |
+
from .model_downloader import ModelDownloader
|
| 56 |
+
from .layout_detector import LayoutDetector
|
| 57 |
+
|
| 58 |
+
logger = logging.getLogger(__name__)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class NeuralDocumentProcessor:
|
| 62 |
+
"""Neural Document Processor using docling's pre-trained models."""
|
| 63 |
+
|
| 64 |
+
def __init__(self, cache_dir: Optional[Path] = None):
|
| 65 |
+
"""Initialize the Neural Document Processor."""
|
| 66 |
+
logger.info("Initializing Neural Document Processor...")
|
| 67 |
+
|
| 68 |
+
# Check NumPy version compatibility
|
| 69 |
+
if not _check_numpy_version():
|
| 70 |
+
raise RuntimeError(
|
| 71 |
+
"Incompatible NumPy version detected. Please downgrade to NumPy 1.x: "
|
| 72 |
+
"pip install 'numpy<2.0.0'"
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
# Initialize model downloader
|
| 76 |
+
self.model_downloader = ModelDownloader(cache_dir)
|
| 77 |
+
|
| 78 |
+
# Initialize layout detector
|
| 79 |
+
self.layout_detector = LayoutDetector()
|
| 80 |
+
|
| 81 |
+
# Initialize models
|
| 82 |
+
self._initialize_models()
|
| 83 |
+
|
| 84 |
+
logger.info("Neural Document Processor initialized successfully")
|
| 85 |
+
|
| 86 |
+
def _initialize_models(self):
|
| 87 |
+
"""Initialize all required models."""
|
| 88 |
+
try:
|
| 89 |
+
# Initialize model paths
|
| 90 |
+
self._initialize_model_paths()
|
| 91 |
+
|
| 92 |
+
# Initialize docling neural models
|
| 93 |
+
self._initialize_docling_models()
|
| 94 |
+
|
| 95 |
+
except Exception as e:
|
| 96 |
+
logger.error(f"Failed to initialize models: {e}")
|
| 97 |
+
raise
|
| 98 |
+
|
| 99 |
+
    def _initialize_model_paths(self):
        """Initialize paths to downloaded models.

        Ensures the docling layout and tableformer models are present in the
        local cache (downloading when missing), then resolves
        ``self.layout_model_path`` / ``self.table_model_path`` to the
        directories that actually hold the artifacts. On a download failure
        that looks like a Hugging Face authentication problem, sets
        ``self._use_fallback_mode = True`` and returns early instead of
        raising, so the processor degrades to basic processing.

        Raises:
            ValueError: on a non-authentication download failure, or when
                models are still missing and fallback mode was not enabled.
            FileNotFoundError: when the cached directories lack the expected
                model files after all alternative locations were checked.
        """
        # NOTE(review): builds a fresh ModelDownloader() here instead of
        # reusing self.model_downloader from __init__, so a custom cache_dir
        # passed to the constructor is ignored by this path — confirm intended.
        from .model_downloader import ModelDownloader

        downloader = ModelDownloader()

        # Check if models exist, if not download them
        layout_path = downloader.get_model_path('layout')
        table_path = downloader.get_model_path('table')

        # If any model is missing, download all models
        if not layout_path or not table_path:
            logger.info("Some models are missing. Downloading all required models...")
            logger.info(f"Models will be cached at: {downloader.cache_dir}")
            try:
                downloader.download_models(force=False, progress=True)
                # Get paths again after download
                layout_path = downloader.get_model_path('layout')
                table_path = downloader.get_model_path('table')

                # Check if download was successful
                if layout_path and table_path:
                    logger.info("Model download completed successfully!")
                else:
                    logger.warning("Some models may not have downloaded successfully due to authentication issues.")
                    logger.info("Falling back to basic document processing without advanced neural models.")
                    # Set flags to indicate fallback mode
                    self._use_fallback_mode = True
                    return

            except Exception as e:
                logger.warning(f"Failed to download models: {e}")
                # Authentication failures are recoverable (fallback mode);
                # anything else is fatal.
                if "401" in str(e) or "Unauthorized" in str(e) or "Authentication" in str(e):
                    logger.info(
                        "Model download failed due to authentication. Using basic document processing.\n"
                        "For enhanced features, please set up Hugging Face authentication:\n"
                        "1. Create account at https://huggingface.co/\n"
                        "2. Generate token at https://huggingface.co/settings/tokens\n"
                        "3. Run: huggingface-cli login"
                    )
                    self._use_fallback_mode = True
                    return
                else:
                    raise ValueError(f"Failed to download required models: {e}")
        else:
            logger.info("All required models found in cache.")

        # Set fallback mode flag
        self._use_fallback_mode = False

        # Set model paths
        self.layout_model_path = layout_path
        self.table_model_path = table_path

        if not self.layout_model_path or not self.table_model_path:
            if hasattr(self, '_use_fallback_mode') and self._use_fallback_mode:
                logger.info("Running in fallback mode without advanced neural models")
                return
            else:
                raise ValueError("One or more required models not found")

        # The models are downloaded with the full repository structure
        # The entire repo is downloaded to each cache folder, so we need to navigate to the specific model paths
        # Layout model is in layout/model_artifacts/layout/
        # Table model is in tableformer/model_artifacts/tableformer/accurate/
        # Note: EasyOCR downloads its own models automatically

        # Check if the expected structure exists, if not use the cache folder directly
        layout_artifacts = self.layout_model_path / "model_artifacts" / "layout"
        table_artifacts = self.table_model_path / "model_artifacts" / "tableformer" / "accurate"

        if layout_artifacts.exists():
            self.layout_model_path = layout_artifacts
        else:
            # Fallback: use the cache folder directly
            logger.warning(f"Expected layout model structure not found, using cache folder directly")

        if table_artifacts.exists():
            self.table_model_path = table_artifacts
        else:
            # Fallback: use the cache folder directly
            logger.warning(f"Expected table model structure not found, using cache folder directly")

        logger.info(f"Layout model path: {self.layout_model_path}")
        logger.info(f"Table model path: {self.table_model_path}")
        logger.info("EasyOCR will download its own models automatically")

        # Verify model files exist (with more flexible checking)
        layout_model_file = self.layout_model_path / "model.safetensors"
        table_config_file = self.table_model_path / "tm_config.json"

        if not layout_model_file.exists():
            # Try alternative locations
            alt_layout_file = self.layout_model_path / "layout" / "model.safetensors"
            if alt_layout_file.exists():
                self.layout_model_path = self.layout_model_path / "layout"
                layout_model_file = alt_layout_file
            else:
                raise FileNotFoundError(f"Missing layout model file. Checked: {layout_model_file}, {alt_layout_file}")

        if not table_config_file.exists():
            # Try alternative locations
            alt_table_file = self.table_model_path / "tableformer" / "accurate" / "tm_config.json"
            if alt_table_file.exists():
                self.table_model_path = self.table_model_path / "tableformer" / "accurate"
                table_config_file = alt_table_file
            else:
                raise FileNotFoundError(f"Missing table config file. Checked: {table_config_file}, {alt_table_file}")
|
| 207 |
+
|
| 208 |
+
def _initialize_docling_models(self):
|
| 209 |
+
"""Initialize docling's pre-trained models."""
|
| 210 |
+
# Check if we're in fallback mode
|
| 211 |
+
if hasattr(self, '_use_fallback_mode') and self._use_fallback_mode:
|
| 212 |
+
logger.info("Skipping docling models initialization - running in fallback mode")
|
| 213 |
+
self.use_advanced_models = False
|
| 214 |
+
self.layout_predictor = None
|
| 215 |
+
self.table_predictor = None
|
| 216 |
+
self.ocr_reader = None
|
| 217 |
+
return
|
| 218 |
+
|
| 219 |
+
try:
|
| 220 |
+
# Import docling models
|
| 221 |
+
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
| 222 |
+
from docling_ibm_models.tableformer.common import read_config
|
| 223 |
+
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
| 224 |
+
import easyocr
|
| 225 |
+
|
| 226 |
+
# Initialize layout model
|
| 227 |
+
self.layout_predictor = LayoutPredictor(
|
| 228 |
+
artifact_path=str(self.layout_model_path),
|
| 229 |
+
device='cpu',
|
| 230 |
+
num_threads=4
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
# Initialize table structure model
|
| 234 |
+
tm_config = read_config(str(self.table_model_path / "tm_config.json"))
|
| 235 |
+
tm_config["model"]["save_dir"] = str(self.table_model_path)
|
| 236 |
+
self.table_predictor = TFPredictor(tm_config, 'cpu', 4)
|
| 237 |
+
|
| 238 |
+
# Initialize OCR model
|
| 239 |
+
self.ocr_reader = easyocr.Reader(['en'])
|
| 240 |
+
|
| 241 |
+
self.use_advanced_models = True
|
| 242 |
+
logger.info("Docling neural models initialized successfully")
|
| 243 |
+
|
| 244 |
+
except ImportError as e:
|
| 245 |
+
logger.error(f"Docling models not available: {e}")
|
| 246 |
+
raise
|
| 247 |
+
except Exception as e:
|
| 248 |
+
error_msg = str(e)
|
| 249 |
+
if "NumPy" in error_msg or "numpy" in error_msg.lower():
|
| 250 |
+
logger.error(
|
| 251 |
+
f"NumPy compatibility error: {error_msg}\n"
|
| 252 |
+
"This is likely due to NumPy 2.x incompatibility. Please downgrade:\n"
|
| 253 |
+
"pip install 'numpy<2.0.0'"
|
| 254 |
+
)
|
| 255 |
+
if platform.system() == "Darwin":
|
| 256 |
+
logger.error(
|
| 257 |
+
"On macOS, NumPy 2.x is known to cause crashes with PyTorch. "
|
| 258 |
+
"Downgrading to NumPy 1.x is required."
|
| 259 |
+
)
|
| 260 |
+
else:
|
| 261 |
+
logger.error(f"Failed to initialize docling models: {e}")
|
| 262 |
+
raise
|
| 263 |
+
|
| 264 |
+
def extract_text(self, image_path: str) -> str:
|
| 265 |
+
"""Extract text from image using neural OCR."""
|
| 266 |
+
try:
|
| 267 |
+
if not os.path.exists(image_path):
|
| 268 |
+
logger.error(f"Image file does not exist: {image_path}")
|
| 269 |
+
return ""
|
| 270 |
+
|
| 271 |
+
return self._extract_text_advanced(image_path)
|
| 272 |
+
|
| 273 |
+
except Exception as e:
|
| 274 |
+
logger.error(f"OCR extraction failed: {e}")
|
| 275 |
+
return ""
|
| 276 |
+
|
| 277 |
+
def extract_text_with_layout(self, image_path: str) -> str:
|
| 278 |
+
"""Extract text with layout awareness using neural models."""
|
| 279 |
+
try:
|
| 280 |
+
if not os.path.exists(image_path):
|
| 281 |
+
logger.error(f"Image file does not exist: {image_path}")
|
| 282 |
+
return ""
|
| 283 |
+
|
| 284 |
+
return self._extract_text_with_layout_advanced(image_path)
|
| 285 |
+
|
| 286 |
+
except Exception as e:
|
| 287 |
+
logger.error(f"Layout-aware OCR extraction failed: {e}")
|
| 288 |
+
return ""
|
| 289 |
+
|
| 290 |
+
def _extract_text_advanced(self, image_path: str) -> str:
|
| 291 |
+
"""Extract text using docling's advanced models."""
|
| 292 |
+
try:
|
| 293 |
+
with Image.open(image_path) as img:
|
| 294 |
+
if img.mode != 'RGB':
|
| 295 |
+
img = img.convert('RGB')
|
| 296 |
+
|
| 297 |
+
results = self.ocr_reader.readtext(img)
|
| 298 |
+
texts = []
|
| 299 |
+
for (bbox, text, confidence) in results:
|
| 300 |
+
if confidence > 0.5:
|
| 301 |
+
texts.append(text)
|
| 302 |
+
|
| 303 |
+
return ' '.join(texts)
|
| 304 |
+
|
| 305 |
+
except Exception as e:
|
| 306 |
+
logger.error(f"Advanced OCR extraction failed: {e}")
|
| 307 |
+
return ""
|
| 308 |
+
|
| 309 |
+
    def _extract_text_with_layout_advanced(self, image_path: str) -> str:
        """Extract text with layout awareness using docling's neural models.

        Pipeline: run the layout predictor over the page, OCR each detected
        region, bucket regions into text blocks vs. table blocks, run tables
        through the structure model, then stitch everything back together as
        markdown in reading order. Returns "" on any failure.
        """
        try:
            with Image.open(image_path) as img:
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                # Get layout predictions using neural model
                layout_results = list(self.layout_predictor.predict(img))

                # Process layout results and extract text
                text_blocks = []
                table_blocks = []

                for pred in layout_results:
                    # Normalize the predicted label to snake_case for matching.
                    label = pred.get('label', '').lower().replace(' ', '_').replace('-', '_')

                    # Construct bbox from l, t, r, b
                    if all(k in pred for k in ['l', 't', 'r', 'b']):
                        bbox = [pred['l'], pred['t'], pred['r'], pred['b']]
                    else:
                        # Fall back to alternative bbox keys; skip boxless preds.
                        bbox = pred.get('bbox') or pred.get('box')
                        if not bbox:
                            continue

                    # Extract text from this region using OCR
                    region_text = self._extract_text_from_region(img, bbox)

                    # Drop empty regions and low-confidence predictions (< 0.5).
                    if not region_text or pred.get('confidence', 1.0) < 0.5:
                        continue

                    from .layout_detector import LayoutElement

                    # Handle different element types
                    if label in ['table', 'document_index']:
                        # Process tables separately
                        table_blocks.append({
                            'text': region_text,
                            'bbox': bbox,
                            'label': label,
                            'confidence': pred.get('confidence', 1.0)
                        })
                    elif label in ['title', 'section_header', 'subtitle_level_1']:
                        # Headers
                        text_blocks.append(LayoutElement(
                            text=region_text,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            element_type='heading',
                            confidence=pred.get('confidence', 1.0)
                        ))
                    elif label in ['list_item']:
                        # List items
                        text_blocks.append(LayoutElement(
                            text=region_text,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            element_type='list_item',
                            confidence=pred.get('confidence', 1.0)
                        ))
                    else:
                        # Regular text/paragraphs
                        text_blocks.append(LayoutElement(
                            text=region_text,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            element_type='paragraph',
                            confidence=pred.get('confidence', 1.0)
                        ))

                # Sort by position (top to bottom, left to right)
                text_blocks.sort(key=lambda x: (x.y, x.x))

                # Process tables using table structure model
                processed_tables = self._process_tables_with_structure_model(img, table_blocks)

                # Convert to markdown with proper structure
                return self._convert_to_structured_markdown_advanced(text_blocks, processed_tables, img.size)

        except Exception as e:
            logger.error(f"Advanced layout-aware OCR failed: {e}")
            return ""
|
| 397 |
+
|
| 398 |
+
    def _process_tables_with_structure_model(self, img: Image.Image, table_blocks: List[Dict]) -> List[Dict]:
        """Process tables using the table structure model.

        For each detected table block: crop the table out of the page, run
        the TableFormer predictor on the crop, OCR each predicted cell, and
        organize the cell texts into a grid via ``_organize_table_data``.
        A block that fails at any step degrades to a ``simple_table`` entry
        carrying its raw OCR text, so one bad table never aborts the page.
        """
        processed_tables = []

        for table_block in table_blocks:
            try:
                # Extract table region
                bbox = table_block['bbox']
                x1, y1, x2, y2 = bbox
                table_region = img.crop((x1, y1, x2, y2))

                # Convert to numpy array
                table_np = np.array(table_region)

                # Create page input in the format expected by docling table structure model
                page_input = {
                    "width": table_np.shape[1],
                    "height": table_np.shape[0],
                    "image": table_np,
                    "tokens": []  # Empty tokens since we're not using cell matching
                }

                # The bbox coordinates should be relative to the table region
                table_bbox = [0, 0, x2-x1, y2-y1]

                # Predict table structure
                tf_output = self.table_predictor.multi_table_predict(page_input, [table_bbox], do_matching=False)
                table_out = tf_output[0] if isinstance(tf_output, list) else tf_output

                # Extract table data: OCR each predicted cell crop in order.
                table_data = []
                tf_responses = table_out.get("tf_responses", []) if isinstance(table_out, dict) else []

                for element in tf_responses:
                    if isinstance(element, dict) and "bbox" in element:
                        cell_bbox = element["bbox"]
                        # Handle bbox as dict with keys l, t, r, b
                        if isinstance(cell_bbox, dict) and all(k in cell_bbox for k in ["l", "t", "r", "b"]):
                            cell_x1 = cell_bbox["l"]
                            cell_y1 = cell_bbox["t"]
                            cell_x2 = cell_bbox["r"]
                            cell_y2 = cell_bbox["b"]
                            cell_region = table_region.crop((cell_x1, cell_y1, cell_x2, cell_y2))
                            cell_np = np.array(cell_region)
                            cell_text = self._extract_text_from_region_numpy(cell_np)
                            table_data.append(cell_text)
                        elif isinstance(cell_bbox, list) and len(cell_bbox) == 4:
                            # Same extraction for a plain [x1, y1, x2, y2] list bbox.
                            cell_x1, cell_y1, cell_x2, cell_y2 = cell_bbox
                            cell_region = table_region.crop((cell_x1, cell_y1, cell_x2, cell_y2))
                            cell_np = np.array(cell_region)
                            cell_text = self._extract_text_from_region_numpy(cell_np)
                            table_data.append(cell_text)
                        else:
                            # Unrecognized bbox format — intentionally skipped.
                            pass
                    else:
                        # Response without a usable bbox — intentionally skipped.
                        pass

                # Organize table data into rows and columns
                processed_table = self._organize_table_data(table_data, table_out if isinstance(table_out, dict) else {})
                # Preserve the original bbox from the table block
                processed_table['bbox'] = table_block['bbox']
                processed_tables.append(processed_table)

            except Exception as e:
                logger.error(f"Failed to process table: {e}")
                # Fallback to simple table extraction
                processed_tables.append({
                    'type': 'simple_table',
                    'text': table_block['text'],
                    'bbox': table_block['bbox']
                })

        return processed_tables
|
| 471 |
+
|
| 472 |
+
def _extract_text_from_region_numpy(self, region_np: np.ndarray) -> str:
|
| 473 |
+
"""Extract text from numpy array region."""
|
| 474 |
+
try:
|
| 475 |
+
results = self.ocr_reader.readtext(region_np)
|
| 476 |
+
texts = []
|
| 477 |
+
for (_, text, confidence) in results:
|
| 478 |
+
if confidence > 0.5:
|
| 479 |
+
texts.append(text)
|
| 480 |
+
return ' '.join(texts)
|
| 481 |
+
except Exception as e:
|
| 482 |
+
logger.error(f"Failed to extract text from numpy region: {e}")
|
| 483 |
+
return ""
|
| 484 |
+
|
| 485 |
+
def _organize_table_data(self, table_data: list, table_out: dict) -> dict:
|
| 486 |
+
"""Organize table data into proper structure using row/col indices from tf_responses."""
|
| 487 |
+
try:
|
| 488 |
+
tf_responses = table_out.get("tf_responses", []) if isinstance(table_out, dict) else []
|
| 489 |
+
num_rows = table_out.get("predict_details", {}).get("num_rows", 0)
|
| 490 |
+
num_cols = table_out.get("predict_details", {}).get("num_cols", 0)
|
| 491 |
+
|
| 492 |
+
# Build empty grid
|
| 493 |
+
grid = [["" for _ in range(num_cols)] for _ in range(num_rows)]
|
| 494 |
+
|
| 495 |
+
# Place cell texts in the correct grid positions
|
| 496 |
+
for idx, element in enumerate(tf_responses):
|
| 497 |
+
row = element.get("start_row_offset_idx", 0)
|
| 498 |
+
col = element.get("start_col_offset_idx", 0)
|
| 499 |
+
# Use the extracted text if available, else fallback to element text
|
| 500 |
+
text = table_data[idx] if idx < len(table_data) else element.get("text", "")
|
| 501 |
+
grid[row][col] = text
|
| 502 |
+
|
| 503 |
+
return {
|
| 504 |
+
'type': 'structured_table',
|
| 505 |
+
'grid': grid,
|
| 506 |
+
'num_rows': num_rows,
|
| 507 |
+
'num_cols': num_cols
|
| 508 |
+
}
|
| 509 |
+
except Exception as e:
|
| 510 |
+
logger.error(f"Failed to organize table data: {e}")
|
| 511 |
+
return {
|
| 512 |
+
'type': 'simple_table',
|
| 513 |
+
'data': table_data
|
| 514 |
+
}
|
| 515 |
+
|
| 516 |
+
def _convert_table_to_markdown(self, table: dict) -> str:
|
| 517 |
+
"""Convert structured table to markdown format."""
|
| 518 |
+
if table['type'] != 'structured_table':
|
| 519 |
+
return f"**Table:** {table.get('text', '')}"
|
| 520 |
+
grid = table['grid']
|
| 521 |
+
if not grid or not grid[0]:
|
| 522 |
+
return ""
|
| 523 |
+
|
| 524 |
+
# Find the first non-empty row to use as header
|
| 525 |
+
header_row = None
|
| 526 |
+
for row in grid:
|
| 527 |
+
if any(cell.strip() for cell in row):
|
| 528 |
+
header_row = row
|
| 529 |
+
break
|
| 530 |
+
|
| 531 |
+
if not header_row:
|
| 532 |
+
return ""
|
| 533 |
+
|
| 534 |
+
# Use the header row as is (preserve all columns)
|
| 535 |
+
header_cells = [cell.strip() if cell else "" for cell in header_row]
|
| 536 |
+
|
| 537 |
+
markdown_lines = []
|
| 538 |
+
markdown_lines.append("| " + " | ".join(header_cells) + " |")
|
| 539 |
+
markdown_lines.append("|" + "|".join(["---"] * len(header_cells)) + "|")
|
| 540 |
+
|
| 541 |
+
# Add data rows (skip the header row)
|
| 542 |
+
header_index = grid.index(header_row)
|
| 543 |
+
for row in grid[header_index + 1:]:
|
| 544 |
+
cells = [cell.strip() if cell else "" for cell in row]
|
| 545 |
+
markdown_lines.append("| " + " | ".join(cells) + " |")
|
| 546 |
+
|
| 547 |
+
return '\n'.join(markdown_lines)
|
| 548 |
+
|
| 549 |
+
def _convert_to_structured_markdown_advanced(self, text_blocks: List, processed_tables: List[Dict], img_size: Tuple[int, int]) -> str:
|
| 550 |
+
"""Convert text blocks and tables to structured markdown."""
|
| 551 |
+
markdown_parts = []
|
| 552 |
+
|
| 553 |
+
# Sort all elements by position
|
| 554 |
+
all_elements = []
|
| 555 |
+
|
| 556 |
+
# Add text blocks
|
| 557 |
+
for block in text_blocks:
|
| 558 |
+
all_elements.append({
|
| 559 |
+
'type': 'text',
|
| 560 |
+
'element': block,
|
| 561 |
+
'y': block.y,
|
| 562 |
+
'x': block.x
|
| 563 |
+
})
|
| 564 |
+
|
| 565 |
+
# Add tables
|
| 566 |
+
for table in processed_tables:
|
| 567 |
+
if 'bbox' in table:
|
| 568 |
+
all_elements.append({
|
| 569 |
+
'type': 'table',
|
| 570 |
+
'element': table,
|
| 571 |
+
'y': table['bbox'][1],
|
| 572 |
+
'x': table['bbox'][0]
|
| 573 |
+
})
|
| 574 |
+
else:
|
| 575 |
+
logger.warning(f"Table has no bbox, skipping: {table}")
|
| 576 |
+
|
| 577 |
+
# Sort by position
|
| 578 |
+
all_elements.sort(key=lambda x: (x['y'], x['x']))
|
| 579 |
+
|
| 580 |
+
# Convert to markdown
|
| 581 |
+
for element in all_elements:
|
| 582 |
+
if element['type'] == 'text':
|
| 583 |
+
block = element['element']
|
| 584 |
+
text = block.text.strip()
|
| 585 |
+
if not text:
|
| 586 |
+
continue
|
| 587 |
+
|
| 588 |
+
if block.element_type == 'heading':
|
| 589 |
+
# Determine heading level based on font size/position
|
| 590 |
+
level = self._determine_heading_level(block)
|
| 591 |
+
markdown_parts.append(f"{'#' * level} {text}")
|
| 592 |
+
markdown_parts.append("")
|
| 593 |
+
elif block.element_type == 'list_item':
|
| 594 |
+
markdown_parts.append(f"- {text}")
|
| 595 |
+
else:
|
| 596 |
+
markdown_parts.append(text)
|
| 597 |
+
markdown_parts.append("")
|
| 598 |
+
|
| 599 |
+
elif element['type'] == 'table':
|
| 600 |
+
table = element['element']
|
| 601 |
+
if table['type'] == 'structured_table':
|
| 602 |
+
# Convert structured table to markdown
|
| 603 |
+
table_md = self._convert_table_to_markdown(table)
|
| 604 |
+
markdown_parts.append(table_md)
|
| 605 |
+
markdown_parts.append("")
|
| 606 |
+
else:
|
| 607 |
+
# Simple table
|
| 608 |
+
markdown_parts.append(f"**Table:** {table.get('text', '')}")
|
| 609 |
+
markdown_parts.append("")
|
| 610 |
+
|
| 611 |
+
return '\n'.join(markdown_parts)
|
| 612 |
+
|
| 613 |
+
def _determine_heading_level(self, block) -> int:
|
| 614 |
+
"""Determine heading level based on font size and position."""
|
| 615 |
+
# Simple heuristic: larger text or positioned at top = higher level
|
| 616 |
+
if block.y < 100: # Near top of page
|
| 617 |
+
return 1
|
| 618 |
+
elif block.height > 30: # Large text
|
| 619 |
+
return 2
|
| 620 |
+
else:
|
| 621 |
+
return 3
|
| 622 |
+
|
| 623 |
+
def _extract_text_from_region(self, img: Image.Image, bbox: List[float]) -> str:
    """OCR the sub-area of *img* described by *bbox* and return its text.

    *bbox* is ``(x1, y1, x2, y2)`` in image pixel coordinates.  Detections
    with confidence <= 0.5 are dropped.  On any failure the error is logged
    and an empty string is returned instead of raising.
    """
    try:
        # Crop the requested area and hand it to easyocr as a numpy array.
        left, top, right, bottom = bbox
        cropped = np.array(img.crop((left, top, right, bottom)))

        # Each detection is (bbox, text, confidence); keep only the
        # confident ones, joined into a single space-separated line.
        detections = self.ocr_reader.readtext(cropped)
        kept = [word for (_, word, score) in detections if score > 0.5]
        return ' '.join(kept)
    except Exception as exc:
        logger.error(f"Failed to extract text from region: {exc}")
        return ""
|
docstrange/pipeline/ocr_service.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OCR Service abstraction for neural document processing."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from abc import ABC, abstractmethod
|
| 6 |
+
from typing import List, Dict, Any, Optional
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class OCRService(ABC):
    """Common interface implemented by every OCR backend."""

    @abstractmethod
    def extract_text(self, image_path: str) -> str:
        """Run plain OCR over an image file.

        Args:
            image_path: Path to the image file.

        Returns:
            The recognized text as a plain string.
        """

    @abstractmethod
    def extract_text_with_layout(self, image_path: str) -> str:
        """Run layout-aware OCR over an image file.

        Args:
            image_path: Path to the image file.

        Returns:
            The recognized content rendered as markdown.
        """
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class NanonetsOCRService(OCRService):
    """Nanonets OCR implementation using NanonetsDocumentProcessor."""

    def __init__(self):
        """Initialize the service and its underlying Nanonets processor."""
        # Imported lazily so the heavy model stack is only pulled in when
        # this backend is actually selected.
        from .nanonets_processor import NanonetsDocumentProcessor
        self._processor = NanonetsDocumentProcessor()
        logger.info("NanonetsOCRService initialized")

    @property
    def model(self):
        """The underlying Nanonets model."""
        return self._processor.model

    @property
    def processor(self):
        """The underlying Nanonets processor object."""
        return self._processor.processor

    @property
    def tokenizer(self):
        """The underlying Nanonets tokenizer."""
        return self._processor.tokenizer

    def _validate_image(self, image_path: str) -> bool:
        """Return True if *image_path* exists and PIL can open it.

        Failures are logged rather than raised, matching the service's
        "return empty string on error" contract.
        """
        if not os.path.exists(image_path):
            logger.error(f"Image file does not exist: {image_path}")
            return False
        try:
            from PIL import Image
            with Image.open(image_path) as img:
                logger.info(f"Image loaded successfully: {img.size} {img.mode}")
            return True
        except Exception as e:
            logger.error(f"Failed to load image: {e}")
            return False

    def extract_text(self, image_path: str) -> str:
        """Extract plain text using Nanonets OCR.

        Args:
            image_path: Path to the image file.

        Returns:
            The stripped extracted text, or "" on any failure.
        """
        # Previously a second, outer try/except wrapped this body; it was
        # unreachable (the inner handlers already return "") and duplicated
        # the same log message, so it has been removed.
        if not self._validate_image(image_path):
            return ""
        try:
            text = self._processor.extract_text(image_path)
            logger.info(f"Extracted text length: {len(text)}")
            return text.strip()
        except Exception as e:
            logger.error(f"Nanonets OCR extraction failed: {e}")
            return ""

    def extract_text_with_layout(self, image_path: str) -> str:
        """Extract layout-aware (markdown) text using Nanonets OCR.

        Args:
            image_path: Path to the image file.

        Returns:
            The stripped markdown output, or "" on any failure.
        """
        if not self._validate_image(image_path):
            return ""
        try:
            text = self._processor.extract_text_with_layout(image_path)
            logger.info(f"Layout-aware extracted text length: {len(text)}")
            return text.strip()
        except Exception as e:
            logger.error(f"Nanonets OCR layout-aware extraction failed: {e}")
            return ""
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class NeuralOCRService(OCRService):
    """Neural OCR implementation using docling's pre-trained models."""

    def __init__(self):
        """Initialize the service and its underlying docling processor."""
        # Imported lazily so the model stack is only pulled in when this
        # backend is actually selected.
        from .neural_document_processor import NeuralDocumentProcessor
        self._processor = NeuralDocumentProcessor()
        logger.info("NeuralOCRService initialized")

    def _validate_image(self, image_path: str) -> bool:
        """Return True if *image_path* exists and PIL can open it.

        Failures are logged rather than raised, matching the service's
        "return empty string on error" contract.
        """
        if not os.path.exists(image_path):
            logger.error(f"Image file does not exist: {image_path}")
            return False
        try:
            from PIL import Image
            with Image.open(image_path) as img:
                logger.info(f"Image loaded successfully: {img.size} {img.mode}")
            return True
        except Exception as e:
            logger.error(f"Failed to load image: {e}")
            return False

    def extract_text(self, image_path: str) -> str:
        """Extract plain text using Neural OCR (docling models).

        Args:
            image_path: Path to the image file.

        Returns:
            The stripped extracted text, or "" on any failure.
        """
        # Previously a second, outer try/except wrapped this body; it was
        # unreachable (the inner handlers already return "") and duplicated
        # the same log message, so it has been removed.
        if not self._validate_image(image_path):
            return ""
        try:
            text = self._processor.extract_text(image_path)
            logger.info(f"Extracted text length: {len(text)}")
            return text.strip()
        except Exception as e:
            logger.error(f"Neural OCR extraction failed: {e}")
            return ""

    def extract_text_with_layout(self, image_path: str) -> str:
        """Extract layout-aware (markdown) text using Neural OCR.

        Args:
            image_path: Path to the image file.

        Returns:
            The stripped markdown output, or "" on any failure.
        """
        if not self._validate_image(image_path):
            return ""
        try:
            text = self._processor.extract_text_with_layout(image_path)
            logger.info(f"Layout-aware extracted text length: {len(text)}")
            return text.strip()
        except Exception as e:
            logger.error(f"Neural OCR layout-aware extraction failed: {e}")
            return ""
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
class OCRServiceFactory:
|
| 191 |
+
"""Factory for creating OCR services based on configuration."""
|
| 192 |
+
|
| 193 |
+
@staticmethod
|
| 194 |
+
def create_service(provider: str = None) -> OCRService:
|
| 195 |
+
"""Create OCR service based on provider configuration.
|
| 196 |
+
|
| 197 |
+
Args:
|
| 198 |
+
provider: OCR provider name (defaults to config)
|
| 199 |
+
|
| 200 |
+
Returns:
|
| 201 |
+
OCRService instance
|
| 202 |
+
"""
|
| 203 |
+
from docstrange.config import InternalConfig
|
| 204 |
+
|
| 205 |
+
if provider is None:
|
| 206 |
+
provider = getattr(InternalConfig, 'ocr_provider', 'nanonets')
|
| 207 |
+
|
| 208 |
+
if provider.lower() == 'nanonets':
|
| 209 |
+
return NanonetsOCRService()
|
| 210 |
+
elif provider.lower() == 'neural':
|
| 211 |
+
return NeuralOCRService()
|
| 212 |
+
else:
|
| 213 |
+
raise ValueError(f"Unsupported OCR provider: {provider}")
|
| 214 |
+
|
| 215 |
+
@staticmethod
|
| 216 |
+
def get_available_providers() -> List[str]:
|
| 217 |
+
"""Get list of available OCR providers.
|
| 218 |
+
|
| 219 |
+
Returns:
|
| 220 |
+
List of available provider names
|
| 221 |
+
"""
|
| 222 |
+
return ['nanonets', 'neural']
|
docstrange/processors/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Processors for different file types."""
|
| 2 |
+
|
| 3 |
+
from .pdf_processor import PDFProcessor
|
| 4 |
+
from .docx_processor import DOCXProcessor
|
| 5 |
+
from .txt_processor import TXTProcessor
|
| 6 |
+
from .excel_processor import ExcelProcessor
|
| 7 |
+
from .url_processor import URLProcessor
|
| 8 |
+
from .html_processor import HTMLProcessor
|
| 9 |
+
from .pptx_processor import PPTXProcessor
|
| 10 |
+
from .image_processor import ImageProcessor
|
| 11 |
+
from .cloud_processor import CloudProcessor, CloudConversionResult
|
| 12 |
+
from .gpu_processor import GPUProcessor, GPUConversionResult
|
| 13 |
+
|
| 14 |
+
__all__ = [
|
| 15 |
+
"PDFProcessor",
|
| 16 |
+
"DOCXProcessor",
|
| 17 |
+
"TXTProcessor",
|
| 18 |
+
"ExcelProcessor",
|
| 19 |
+
"URLProcessor",
|
| 20 |
+
"HTMLProcessor",
|
| 21 |
+
"PPTXProcessor",
|
| 22 |
+
"ImageProcessor",
|
| 23 |
+
"CloudProcessor",
|
| 24 |
+
"CloudConversionResult",
|
| 25 |
+
"GPUProcessor",
|
| 26 |
+
"GPUConversionResult"
|
| 27 |
+
]
|
docstrange/processors/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (859 Bytes). View file
|
|
|
docstrange/processors/__pycache__/base.cpython-310.pyc
ADDED
|
Binary file (3.01 kB). View file
|
|
|
docstrange/processors/__pycache__/cloud_processor.cpython-310.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
docstrange/processors/__pycache__/docx_processor.cpython-310.pyc
ADDED
|
Binary file (5.52 kB). View file
|
|
|
docstrange/processors/__pycache__/excel_processor.cpython-310.pyc
ADDED
|
Binary file (5.49 kB). View file
|
|
|
docstrange/processors/__pycache__/gpu_processor.cpython-310.pyc
ADDED
|
Binary file (14 kB). View file
|
|
|
docstrange/processors/__pycache__/html_processor.cpython-310.pyc
ADDED
|
Binary file (2.36 kB). View file
|
|
|
docstrange/processors/__pycache__/image_processor.cpython-310.pyc
ADDED
|
Binary file (3.84 kB). View file
|
|
|
docstrange/processors/__pycache__/pdf_processor.cpython-310.pyc
ADDED
|
Binary file (4.54 kB). View file
|
|
|
docstrange/processors/__pycache__/pptx_processor.cpython-310.pyc
ADDED
|
Binary file (4.22 kB). View file
|
|
|
docstrange/processors/__pycache__/txt_processor.cpython-310.pyc
ADDED
|
Binary file (2.92 kB). View file
|
|
|
docstrange/processors/__pycache__/url_processor.cpython-310.pyc
ADDED
|
Binary file (8.74 kB). View file
|
|
|
docstrange/processors/base.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Base processor class for document conversion."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from abc import ABC, abstractmethod
|
| 6 |
+
from typing import Any, Dict, Optional
|
| 7 |
+
|
| 8 |
+
from ..result import ConversionResult
|
| 9 |
+
from docstrange.config import InternalConfig
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class BaseProcessor(ABC):
    """Base class for all document processors."""

    def __init__(self, preserve_layout: bool = True, include_images: bool = False,
                 ocr_enabled: bool = True, use_markdownify: Optional[bool] = None):
        """Initialize the processor.

        Args:
            preserve_layout: Whether to preserve document layout
            include_images: Whether to include images in output
            ocr_enabled: Whether to enable OCR for image processing
            use_markdownify: Whether to use markdownify for HTML->Markdown
                conversion.  Defaults to the current value of
                ``InternalConfig.use_markdownify``.
        """
        self.preserve_layout = preserve_layout
        self.include_images = include_images
        self.ocr_enabled = ocr_enabled
        # Resolve the config value at call time.  The previous signature used
        # `use_markdownify=InternalConfig.use_markdownify`, which froze the
        # value at class-definition time, so later changes to the config were
        # silently ignored by new processors.
        self.use_markdownify = (InternalConfig.use_markdownify
                                if use_markdownify is None else use_markdownify)

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """

    @abstractmethod
    def process(self, file_path: str) -> ConversionResult:
        """Process the file and return a conversion result.

        Args:
            file_path: Path to the file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If processing fails
        """

    def _common_metadata(self) -> Dict[str, Any]:
        """Metadata fields shared by the success and failure paths."""
        return {
            "processor": self.__class__.__name__,
            "preserve_layout": self.preserve_layout,
            "include_images": self.include_images,
            "ocr_enabled": self.ocr_enabled,
        }

    def get_metadata(self, file_path: str) -> Dict[str, Any]:
        """Get metadata about the file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary containing file metadata; on failure, only the
            processor-level fields are returned.
        """
        try:
            file_stat = os.stat(file_path)
            # Ensure file_path is a string for splitext (callers may pass Path)
            file_path_str = str(file_path)
            return {
                "file_size": file_stat.st_size,
                "file_extension": os.path.splitext(file_path_str)[1].lower(),
                "file_name": os.path.basename(file_path_str),
                **self._common_metadata(),
            }
        except Exception as e:
            logger.warning(f"Failed to get metadata for {file_path}: {e}")
            return self._common_metadata()
|
docstrange/processors/cloud_processor.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Cloud processor for Nanonets API integration with API key pool rotation and local fallback."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import requests
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
import time
|
| 8 |
+
from typing import Dict, Any, Optional, List
|
| 9 |
+
|
| 10 |
+
from .base import BaseProcessor
|
| 11 |
+
from ..result import ConversionResult
|
| 12 |
+
from ..exceptions import ConversionError
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
# Default reset time for rate-limited keys (1 hour)
|
| 17 |
+
DEFAULT_RATE_LIMIT_RESET = 3600
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class CloudConversionResult(ConversionResult):
    """Enhanced ConversionResult for cloud mode with lazy API calls, key rotation, and local fallback."""

    def __init__(self, file_path: str, cloud_processor: 'CloudProcessor', metadata: Optional[Dict[str, Any]] = None,
                 api_key_pool=None, local_fallback_processor=None):
        # Initialize with empty content - we'll make API calls on demand
        super().__init__("", metadata)
        self.file_path = file_path
        self.cloud_processor = cloud_processor
        self.api_key_pool = api_key_pool
        self.local_fallback_processor = local_fallback_processor  # GPU processor or None
        self._cached_outputs = {}  # Cache API responses by output type
        self._used_fallback = False  # Track if we fell back to local processing

    def _get_cloud_output(self, output_type: str, specified_fields: Optional[list] = None, json_schema: Optional[dict] = None) -> str:
        """Get output from cloud API for specific type, with caching, key rotation, and local fallback.

        Flow: validate the requested output type, check the per-instance
        cache, then loop over available API keys until one succeeds; when
        every key is rate-limited or fails, fall back to local conversion.
        """
        # Validate output type
        valid_output_types = ["markdown", "flat-json", "html", "csv", "specified-fields", "specified-json"]
        if output_type not in valid_output_types:
            logger.warning(f"Invalid output type '{output_type}' for cloud API. Using 'markdown'.")
            output_type = "markdown"

        # Create cache key based on output type and parameters
        # NOTE(review): hash(str(json_schema)) is only stable within one
        # interpreter run (PYTHONHASHSEED) — fine for this in-memory cache,
        # but do not persist these keys.
        cache_key = output_type
        if specified_fields:
            cache_key += f"_fields_{','.join(specified_fields)}"
        if json_schema:
            cache_key += f"_schema_{hash(str(json_schema))}"

        if cache_key in self._cached_outputs:
            return self._cached_outputs[cache_key]

        # If we already fell back to local, skip cloud
        if self._used_fallback:
            return self._convert_locally(output_type)

        # Try cloud API with key rotation
        last_error = None  # kept for debugging; not included in the final log line
        keys_tried = set()

        while True:
            # Get next available key from pool
            current_key = None
            if self.api_key_pool:
                current_key = self.api_key_pool.get_next_key()

            # Also try the processor's own key if set
            if not current_key and self.cloud_processor.api_key:
                current_key = self.cloud_processor.api_key

            if not current_key:
                logger.info("No API keys available, falling back to local processing")
                return self._convert_locally(output_type)

            # Don't try the same key twice in one cycle
            if current_key in keys_tried:
                logger.info("All API keys rate limited, falling back to local processing")
                return self._convert_locally(output_type)

            keys_tried.add(current_key)

            try:
                # Prepare headers
                headers = {}
                if current_key:
                    headers['Authorization'] = f'Bearer {current_key}'

                # Prepare file for upload; the handle must stay open for the
                # duration of the streamed multipart POST below.
                with open(self.file_path, 'rb') as file:
                    files = {
                        'file': (os.path.basename(self.file_path), file, self.cloud_processor._get_content_type(self.file_path))
                    }

                    data = {
                        'output_type': output_type
                    }

                    # Add model_type if specified
                    if self.cloud_processor.model_type:
                        data['model_type'] = self.cloud_processor.model_type

                    # Add field extraction parameters
                    if output_type == "specified-fields" and specified_fields:
                        data['specified_fields'] = ','.join(specified_fields)
                    elif output_type == "specified-json" and json_schema:
                        data['json_schema'] = json.dumps(json_schema)

                    log_prefix = f"API key {current_key[:8]}..." if current_key else "no auth"
                    logger.info(f"Making cloud API call ({log_prefix}) for {output_type} on {self.file_path}")

                    # Make API request
                    response = requests.post(
                        self.cloud_processor.api_url,
                        headers=headers,
                        files=files,
                        data=data,
                        timeout=300
                    )

                # Handle rate limiting (429) - mark key as limited and try next
                if response.status_code == 429:
                    # Mark this key as rate limited in the pool
                    if self.api_key_pool:
                        self.api_key_pool.mark_key_rate_limited(current_key, DEFAULT_RATE_LIMIT_RESET)

                    # Also mark the processor's key if it matches
                    if self.cloud_processor.api_key == current_key:
                        logger.warning(f"Processor API key rate limited, will try pool keys")

                    logger.warning(f"API key {current_key[:8]}... rate limited, trying next key...")
                    last_error = f"Rate limited (429)"
                    continue

                response.raise_for_status()
                result_data = response.json()

                # Extract content from response
                content = self.cloud_processor._extract_content_from_response(result_data)

                # Cache the result
                self._cached_outputs[cache_key] = content
                return content

            except requests.exceptions.HTTPError as e:
                # String-match on the message because raise_for_status() does
                # not expose the status code on the exception directly here.
                if '429' in str(e):
                    if self.api_key_pool:
                        self.api_key_pool.mark_key_rate_limited(current_key, DEFAULT_RATE_LIMIT_RESET)
                    logger.warning(f"API key {current_key[:8]}... rate limited (HTTPError), trying next key...")
                    last_error = str(e)
                    continue
                else:
                    logger.error(f"Cloud API HTTP error: {e}")
                    last_error = str(e)
                    break
            except Exception as e:
                # Network errors, JSON decode errors, file-open errors, etc.
                logger.error(f"Cloud API call failed: {e}")
                last_error = str(e)
                break

        # All keys exhausted, fall back to local processing
        logger.warning(f"All API keys rate limited or failed. Falling back to local Docling processing.")
        self._used_fallback = True
        return self._convert_locally(output_type)

    def _convert_locally(self, output_type: str) -> str:
        """Fallback to local Docling/GPU conversion methods.

        Marks the instance as fallen-back so subsequent calls skip the
        cloud path entirely.
        """
        self._used_fallback = True

        # Try the local fallback processor (GPU processor with Docling models)
        if self.local_fallback_processor:
            try:
                logger.info(f"Using local Docling processor for fallback on {self.file_path}")
                local_result = self.local_fallback_processor.process(self.file_path)

                if output_type == "html":
                    return local_result.extract_html()
                elif output_type == "flat-json":
                    return json.dumps(local_result.extract_data(), indent=2)
                elif output_type == "csv":
                    return local_result.extract_csv(include_all_tables=True)
                else:
                    # markdown and the field-extraction types degrade to markdown
                    return local_result.extract_markdown()
            except Exception as e:
                logger.error(f"Local Docling fallback also failed: {e}")

        # Last resort: use parent class methods
        # (operates on self.content, which is "" unless a cloud call succeeded)
        if output_type == "html":
            return super().extract_html()
        elif output_type == "flat-json":
            return json.dumps(super().extract_data(), indent=2)
        elif output_type == "csv":
            return super().extract_csv(include_all_tables=True)
        else:
            return self.content

    def extract_markdown(self) -> str:
        """Export as markdown."""
        return self._get_cloud_output("markdown")

    def extract_html(self) -> str:
        """Export as HTML."""
        return self._get_cloud_output("html")

    def extract_data(self, specified_fields: Optional[list] = None, json_schema: Optional[dict] = None) -> Dict[str, Any]:
        """Export as structured JSON with optional field extraction.

        Args:
            specified_fields: Optional list of specific fields to extract
            json_schema: Optional JSON schema defining fields and types to extract

        Returns:
            Structured JSON with extracted data; on parse failure, a dict
            with ``format == "json_parse_error"`` carrying the raw content
            and the error message.
        """
        try:
            if specified_fields:
                # Request specified fields extraction
                content = self._get_cloud_output("specified-fields", specified_fields=specified_fields)
                extracted_data = json.loads(content)
                return {
                    "extracted_fields": extracted_data,
                    "format": "specified_fields"
                }

            elif json_schema:
                # Request JSON schema extraction
                content = self._get_cloud_output("specified-json", json_schema=json_schema)
                extracted_data = json.loads(content)
                return {
                    "structured_data": extracted_data,
                    "format": "structured_json"
                }

            else:
                # Standard JSON extraction
                json_content = self._get_cloud_output("flat-json")
                parsed_content = json.loads(json_content)
                return {
                    "document": parsed_content,
                    "format": "cloud_flat_json"
                }

        except Exception as e:
            logger.error(f"Failed to parse JSON content: {e}")
            return {
                # 'content' is only bound when the API call itself succeeded,
                # hence the locals() guard.
                "document": {"raw_content": content if 'content' in locals() else ""},
                "format": "json_parse_error",
                "error": str(e)
            }

    def extract_text(self) -> str:
        """Export as plain text."""
        # For text output, we can try markdown first and then extract to text
        try:
            return self._get_cloud_output("markdown")
        except Exception as e:
            logger.error(f"Failed to get text output: {e}")
            return ""

    def extract_csv(self, table_index: int = 0, include_all_tables: bool = False) -> str:
        """Export tables as CSV format.

        Args:
            table_index: Which table to export (0-based index). Default is 0 (first table).
            include_all_tables: If True, export all tables with separators. Default is False.

        Returns:
            CSV formatted string of the table(s)

        Raises:
            ValueError: If no tables are found or table_index is out of range

        Note:
            The cloud API returns its own CSV rendering; table_index and
            include_all_tables are currently not forwarded to the request.
        """
        return self._get_cloud_output("csv")
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
class CloudProcessor(BaseProcessor):
    """Processor for cloud-based document conversion using Nanonets API with API key pool rotation."""

    def __init__(self, api_key: Optional[str] = None, output_type: Optional[str] = None,
                 model_type: Optional[str] = None,
                 specified_fields: Optional[list] = None, json_schema: Optional[dict] = None,
                 api_key_pool=None, local_fallback_processor=None, **kwargs):
        """Initialize the cloud processor.

        Args:
            api_key: API key for cloud processing (optional - uses rate-limited free tier without key)
            output_type: Output type for cloud processing (markdown, flat-json, html, csv, specified-fields, specified-json)
            model_type: Model type for cloud processing (gemini, openapi, nanonets)
            specified_fields: List of fields to extract (for specified-fields output type)
            json_schema: JSON schema defining fields and types to extract (for specified-json output type)
            api_key_pool: ApiKeyPool instance for key rotation
            local_fallback_processor: Local processor (GPU/Docling) for fallback when all keys exhausted
        """
        # BUGFIX (typing): output_type defaulted to None but was annotated
        # plain `str`; implicit Optional is rejected by modern type checkers
        # (PEP 484), so the annotation is now Optional[str].
        super().__init__(**kwargs)
        self.api_key = api_key
        self.output_type = output_type
        self.model_type = model_type
        self.specified_fields = specified_fields
        self.json_schema = json_schema
        self.api_key_pool = api_key_pool
        self.local_fallback_processor = local_fallback_processor
        # Single endpoint used for all extraction requests.
        self.api_url = "https://extraction-api.nanonets.com/extract"

        # Don't validate output_type during initialization - it will be validated during processing
        # This prevents warnings during DocumentExtractor initialization

    def can_process(self, file_path: str) -> bool:
        """Check if the processor can handle the file.

        Args:
            file_path: Path to the file to check.

        Returns:
            True when the file extension is one the cloud API accepts.
        """
        # Cloud processor supports most common document formats
        # API key is optional - without it, uses rate-limited free tier
        supported_extensions = {
            '.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt',
            '.txt', '.html', '.htm', '.png', '.jpg', '.jpeg', '.gif',
            '.bmp', '.tiff', '.tif'
        }

        _, ext = os.path.splitext(file_path.lower())
        return ext in supported_extensions

    def process(self, file_path: str) -> CloudConversionResult:
        """Create a lazy CloudConversionResult that will make API calls on demand with key rotation.

        Args:
            file_path: Path to the file to process

        Returns:
            CloudConversionResult that makes API calls when output methods are called

        Raises:
            ConversionError: If file doesn't exist
        """
        if not os.path.exists(file_path):
            raise ConversionError(f"File not found: {file_path}")

        # Create metadata without making any API calls - the actual request
        # is deferred until an output method is invoked on the result.
        metadata = {
            'source_file': file_path,
            'processing_mode': 'cloud',
            'api_provider': 'nanonets',
            'file_size': os.path.getsize(file_path),
            'model_type': self.model_type,
            'has_api_key': bool(self.api_key),
            'key_rotation': True,
            'local_fallback': self.local_fallback_processor is not None
        }

        if self.api_key:
            logger.info(f"Created cloud extractor for {file_path} with API key pool rotation")
        else:
            logger.info(f"Created cloud extractor for {file_path} without API key - will use pool + local fallback")

        # Return lazy result with key pool and local fallback
        return CloudConversionResult(
            file_path=file_path,
            cloud_processor=self,
            metadata=metadata,
            api_key_pool=self.api_key_pool,
            local_fallback_processor=self.local_fallback_processor
        )

    def _extract_content_from_response(self, response_data: Dict[str, Any]) -> str:
        """Extract content from API response.

        Args:
            response_data: Parsed JSON body returned by the API.

        Returns:
            The 'content' field when present, otherwise the whole response
            serialized as pretty-printed JSON (best-effort fallback).
        """
        try:
            # API always returns content in the 'content' field
            if 'content' in response_data:
                return response_data['content']

            # Fallback: return whole response as JSON if no content field
            logger.warning("No 'content' field found in API response, returning full response")
            return json.dumps(response_data, indent=2)

        except Exception as e:
            logger.error(f"Failed to extract content from API response: {e}")
            return json.dumps(response_data, indent=2)

    def _get_content_type(self, file_path: str) -> str:
        """Get content type for file upload.

        Args:
            file_path: Path whose extension determines the MIME type.

        Returns:
            MIME type string; 'application/octet-stream' for unknown extensions.
        """
        _, ext = os.path.splitext(file_path.lower())

        content_types = {
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.doc': 'application/msword',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.xls': 'application/vnd.ms-excel',
            '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            '.ppt': 'application/vnd.ms-powerpoint',
            '.txt': 'text/plain',
            '.html': 'text/html',
            '.htm': 'text/html',
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.bmp': 'image/bmp',
            '.tiff': 'image/tiff',
            '.tif': 'image/tiff'
        }

        return content_types.get(ext, 'application/octet-stream')
|
docstrange/processors/docx_processor.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DOCX file processor."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Dict, Any
|
| 5 |
+
|
| 6 |
+
from .base import BaseProcessor
|
| 7 |
+
from ..result import ConversionResult
|
| 8 |
+
from ..exceptions import ConversionError, FileNotFoundError
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DOCXProcessor(BaseProcessor):
    """Processor for Microsoft Word DOCX and DOC files."""

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in ['.docx', '.doc']

    def process(self, file_path: str) -> ConversionResult:
        """Process the DOCX file and return a conversion result.

        Args:
            file_path: Path to the DOCX file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Initialize metadata
        metadata = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "processor": "DOCXProcessor"
        }

        # Dispatch on extension: legacy .doc goes through pandoc,
        # .docx through python-docx.
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())

        if ext == '.doc':
            return self._process_doc_file(file_path, metadata)
        else:
            return self._process_docx_file(file_path, metadata)

    def _process_doc_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .doc files using pypandoc.

        Args:
            file_path: Path to the legacy .doc file.
            metadata: Base metadata dict, extended in place.

        Returns:
            ConversionResult with pandoc-produced markdown.

        Raises:
            ConversionError: If pypandoc is missing or conversion fails.
        """
        try:
            import pypandoc

            # Convert .doc to markdown using pandoc
            content = pypandoc.convert_file(file_path, 'markdown')

            metadata.update({
                "file_type": "doc",
                "extractor": "pypandoc"
            })

            # Clean up the content
            content = self._clean_content(content)

            return ConversionResult(content, metadata)

        except ImportError:
            raise ConversionError("pypandoc is required for .doc file processing. Install it with: pip install pypandoc")
        except Exception as e:
            raise ConversionError(f"Failed to process .doc file {file_path}: {str(e)}")

    def _process_docx_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .docx files using python-docx with improved table extraction.

        Args:
            file_path: Path to the .docx file.
            metadata: Base metadata dict, extended in place.

        Returns:
            ConversionResult with markdown built from paragraphs and tables.

        Raises:
            ConversionError: If python-docx is missing or extraction fails.
        """
        try:
            from docx import Document

            content_parts = []
            doc = Document(file_path)

            metadata.update({
                "paragraph_count": len(doc.paragraphs),
                "section_count": len(doc.sections),
                "file_type": "docx",
                "extractor": "python-docx"
            })

            # Extract text from paragraphs
            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    # Map Word heading styles to markdown heading levels
                    if paragraph.style.name.startswith('Heading'):
                        level = paragraph.style.name.replace('Heading ', '')
                        try:
                            level_num = int(level)
                            # Cap at h6 - markdown has no deeper heading level
                            content_parts.append(f"\n{'#' * min(level_num, 6)} {paragraph.text}\n")
                        except ValueError:
                            # Non-numeric heading style name: default to h2
                            content_parts.append(f"\n## {paragraph.text}\n")
                    else:
                        content_parts.append(paragraph.text)

            # Extract text from tables (improved)
            for table_idx, table in enumerate(doc.tables):
                # Check if preserve_layout is available (from base class or config)
                preserve_layout = getattr(self, 'preserve_layout', False)
                if preserve_layout:
                    content_parts.append(f"\n### Table {table_idx+1}\n")

                # Gather all rows
                rows = table.rows
                if not rows:
                    continue

                # Detect merged cells (optional warning); markdown tables
                # cannot represent vertical/horizontal spans.
                merged_warning = False
                for row in rows:
                    for cell in row.cells:
                        if len(cell._tc.xpath('.//w:vMerge')) > 0 or len(cell._tc.xpath('.//w:gridSpan')) > 0:
                            merged_warning = True
                            break
                    if merged_warning:
                        break
                if merged_warning:
                    content_parts.append("*Warning: Table contains merged cells which may not render correctly in markdown.*\n")

                # Row limit for large tables
                row_limit = 20
                if len(rows) > row_limit:
                    content_parts.append(f"*Table truncated to first {row_limit} rows out of {len(rows)} total.*\n")

                # Build table data
                table_data = []
                for i, row in enumerate(rows):
                    if i >= row_limit:
                        break
                    row_data = [cell.text.strip().replace('\n', ' ') for cell in row.cells]
                    table_data.append(row_data)

                # Ensure all rows have the same number of columns
                # (default=0 guards the degenerate case of no collected rows)
                max_cols = max((len(r) for r in table_data), default=0)
                for r in table_data:
                    while len(r) < max_cols:
                        r.append("")

                # Markdown table: first row as header
                if table_data:
                    header = table_data[0]
                    separator = ["---"] * len(header)
                    content_parts.append("| " + " | ".join(header) + " |")
                    content_parts.append("| " + " | ".join(separator) + " |")
                    for row in table_data[1:]:
                        content_parts.append("| " + " | ".join(row) + " |")
                    content_parts.append("")

            content = '\n'.join(content_parts)
            content = self._clean_content(content)
            return ConversionResult(content, metadata)
        except ImportError:
            raise ConversionError("python-docx is required for .docx file processing. Install it with: pip install python-docx")
        except Exception as e:
            raise ConversionError(f"Failed to process .docx file {file_path}: {str(e)}")

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted Word content.

        Args:
            content: Raw Word text content

        Returns:
            Cleaned text content
        """
        # Remove excessive whitespace and normalize; drop blank lines
        lines = content.split('\n')
        cleaned_lines = []

        for line in lines:
            # Collapse internal whitespace runs to single spaces
            line = ' '.join(line.split())
            if line.strip():
                cleaned_lines.append(line)

        # Re-join, inserting a blank line before each markdown header.
        # BUGFIX: the previous str.replace('## ', '\n## ') pass corrupted
        # deeper headings ('### Title' became '#\n## Title' because '## '
        # matches inside '### '); checking the leading '#' per line keeps
        # heading levels intact.
        spaced_lines = []
        for line in cleaned_lines:
            if line.startswith('#'):
                spaced_lines.append('')
            spaced_lines.append(line)

        return '\n'.join(spaced_lines).strip()
|
docstrange/processors/excel_processor.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Excel file processor."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
|
| 7 |
+
from .base import BaseProcessor
|
| 8 |
+
from ..result import ConversionResult
|
| 9 |
+
from ..exceptions import ConversionError, FileNotFoundError
|
| 10 |
+
|
| 11 |
+
# Configure logging
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ExcelProcessor(BaseProcessor):
    """Processor for Excel files (XLSX, XLS) and CSV files."""

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in ['.xlsx', '.xls', '.csv']

    def process(self, file_path: str) -> ConversionResult:
        """Process the Excel file and return a conversion result.

        Args:
            file_path: Path to the Excel file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Dispatch on extension - CSV gets a single-table path, workbooks
        # get per-sheet extraction.
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())

        if ext == '.csv':
            return self._process_csv(file_path)
        else:
            return self._process_excel(file_path)

    def _process_csv(self, file_path: str) -> ConversionResult:
        """Process a CSV file and return a conversion result.

        Args:
            file_path: Path to the CSV file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If pandas is missing or parsing fails.
        """
        try:
            import pandas as pd

            df = pd.read_csv(file_path)
            content_parts = []

            content_parts.append(f"# CSV Data: {os.path.basename(file_path)}")
            content_parts.append("")

            # Convert DataFrame to markdown table
            table_md = self._dataframe_to_markdown(df, pd)
            content_parts.append(table_md)

            metadata = {
                "row_count": len(df),
                "column_count": len(df.columns),
                "columns": df.columns.tolist(),
                "extractor": "pandas"
            }

            content = '\n'.join(content_parts)

            return ConversionResult(content, metadata)

        except ImportError:
            raise ConversionError("pandas is required for CSV processing. Install it with: pip install pandas")
        except Exception as e:
            raise ConversionError(f"Failed to process CSV file {file_path}: {str(e)}")

    def _process_excel(self, file_path: str) -> ConversionResult:
        """Process an Excel file and return a conversion result.

        Args:
            file_path: Path to the Excel file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If pandas/openpyxl are missing or parsing fails.
        """
        try:
            import pandas as pd

            # BUGFIX: the pd.ExcelFile handle was never closed, leaking the
            # underlying file descriptor; the context manager releases it
            # even when a sheet fails to parse.
            with pd.ExcelFile(file_path) as excel_file:
                sheet_names = excel_file.sheet_names

                metadata = {
                    "sheet_count": len(sheet_names),
                    "sheet_names": sheet_names,
                    "extractor": "pandas"
                }

                content_parts = []

                for sheet_name in sheet_names:
                    # parse() reuses the already-open workbook instead of
                    # re-reading the file for every sheet
                    df = excel_file.parse(sheet_name)
                    if not df.empty:
                        content_parts.append(f"\n## Sheet: {sheet_name}")
                        content_parts.append("")

                        # Convert DataFrame to markdown table
                        table_md = self._dataframe_to_markdown(df, pd)
                        content_parts.append(table_md)
                        content_parts.append("")

                        # Add metadata for this sheet
                        metadata.update({
                            f"sheet_{sheet_name}_rows": len(df),
                            f"sheet_{sheet_name}_columns": len(df.columns),
                            f"sheet_{sheet_name}_columns_list": df.columns.tolist()
                        })

            content = '\n'.join(content_parts)

            return ConversionResult(content, metadata)

        except ImportError:
            raise ConversionError("pandas and openpyxl are required for Excel processing. Install them with: pip install pandas openpyxl")
        except Exception as e:
            if isinstance(e, (FileNotFoundError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process Excel file {file_path}: {str(e)}")

    def _dataframe_to_markdown(self, df, pd) -> str:
        """Convert pandas DataFrame to markdown table.

        Args:
            df: pandas DataFrame
            pd: pandas module reference

        Returns:
            Markdown table string
        """
        if df.empty:
            return "*No data available*"

        # Convert DataFrame to markdown table
        markdown_parts = []

        # Header
        markdown_parts.append("| " + " | ".join(str(col) for col in df.columns) + " |")
        markdown_parts.append("| " + " | ".join(["---"] * len(df.columns)) + " |")

        # Data rows; NaN cells render as empty strings
        for _, row in df.iterrows():
            row_data = []
            for cell in row:
                if pd.isna(cell):
                    row_data.append("")
                else:
                    row_data.append(str(cell))
            markdown_parts.append("| " + " | ".join(row_data) + " |")

        return "\n".join(markdown_parts)

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted Excel content.

        Args:
            content: Raw Excel text content

        Returns:
            Cleaned text content
        """
        # Remove excessive whitespace and normalize; drop blank lines
        cleaned_lines = []
        for line in content.split('\n'):
            line = ' '.join(line.split())
            if line.strip():
                cleaned_lines.append(line)

        # Insert a blank line before each markdown header.
        # BUGFIX: the previous chained str.replace('# ', ...) then
        # ('## ', ...) mangled h2 headings ('## Sheet' became '#\n# Sheet'
        # because '# ' matches inside '## '); checking the leading '#'
        # per line keeps heading levels intact.
        spaced_lines = []
        for line in cleaned_lines:
            if line.startswith('#'):
                spaced_lines.append('')
            spaced_lines.append(line)

        return '\n'.join(spaced_lines).strip()
|
docstrange/processors/gpu_processor.py
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GPU processor with OCR capabilities for images and PDFs."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import tempfile
|
| 7 |
+
import re
|
| 8 |
+
from typing import Dict, Any, List, Optional
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from .base import BaseProcessor
|
| 12 |
+
from ..result import ConversionResult
|
| 13 |
+
from ..exceptions import ConversionError, FileNotFoundError
|
| 14 |
+
from ..pipeline.ocr_service import OCRServiceFactory
|
| 15 |
+
|
| 16 |
+
# Configure logging
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class GPUConversionResult(ConversionResult):
|
| 21 |
+
"""Enhanced ConversionResult for GPU processing with Nanonets OCR capabilities."""
|
| 22 |
+
|
| 23 |
+
    def __init__(self, content: str, metadata: Optional[Dict[str, Any]] = None,
                 gpu_processor: Optional['GPUProcessor'] = None, file_path: Optional[str] = None,
                 ocr_provider: str = "nanonets"):
        """Build a GPU-backed conversion result.

        Args:
            content: Extracted document content.
            metadata: Optional metadata dict; GPU-specific keys are added
                below when absent.
            gpu_processor: Processor kept so output methods can re-use the
                loaded model (e.g. for structured JSON extraction).
            file_path: Source file path, needed for model-based re-extraction.
            ocr_provider: Name of the OCR backend used (default "nanonets").
        """
        super().__init__(content, metadata)
        self.gpu_processor = gpu_processor
        self.file_path = file_path
        self.ocr_provider = ocr_provider

        # Add GPU-specific metadata
        # NOTE(review): this re-assigns self.metadata after the super() call
        # when None was passed; assumes ConversionResult does not build its
        # own default dict we would be discarding - confirm against base class.
        if metadata is None:
            self.metadata = {}

        # Ensure GPU-specific metadata is present (only fill keys the caller
        # did not already provide)
        if 'processing_mode' not in self.metadata:
            self.metadata['processing_mode'] = 'gpu'
        if 'ocr_provider' not in self.metadata:
            self.metadata['ocr_provider'] = ocr_provider
        if 'gpu_processing' not in self.metadata:
            self.metadata['gpu_processing'] = True
|
| 42 |
+
|
| 43 |
+
def get_ocr_info(self) -> Dict[str, Any]:
|
| 44 |
+
"""Get information about the OCR processing used.
|
| 45 |
+
|
| 46 |
+
Returns:
|
| 47 |
+
Dictionary with OCR processing information
|
| 48 |
+
"""
|
| 49 |
+
return {
|
| 50 |
+
'ocr_provider': self.ocr_provider,
|
| 51 |
+
'processing_mode': 'gpu',
|
| 52 |
+
'file_path': self.file_path,
|
| 53 |
+
'gpu_processor_available': self.gpu_processor is not None
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
def extract_markdown(self) -> str:
|
| 57 |
+
"""Export as markdown without GPU processing metadata."""
|
| 58 |
+
return self.content
|
| 59 |
+
|
| 60 |
+
    def extract_html(self) -> str:
        """Export as HTML with GPU processing styling.

        Returns:
            The base HTML from the parent class with a "GPU Processed"
            banner injected right after the opening <body> tag; if no
            <body> tag is found the base HTML is returned unchanged.
        """
        # Get the base HTML from parent class
        html_content = super().extract_html()

        # Add GPU processing indicator
        gpu_indicator = f"""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 8px; margin-bottom: 2rem; text-align: center;">
            <strong>🚀 GPU Processed</strong> - Enhanced with {self.ocr_provider} OCR
        </div>
        """

        # Insert the indicator after the opening body tag
        # (find() returns -1 when there is no <body> tag, in which case the
        # banner is skipped below)
        body_start = html_content.find('<body')
        if body_start != -1:
            # body_end is positioned just past '>' of the <body ...> tag
            body_end = html_content.find('>', body_start) + 1
            return html_content[:body_end] + gpu_indicator + html_content[body_end:]

        return html_content
|
| 79 |
+
|
| 80 |
+
def extract_data(self) -> Dict[str, Any]:
|
| 81 |
+
"""Export as structured JSON using Nanonets model with specific prompt."""
|
| 82 |
+
logger.debug(f"GPUConversionResult.extract_data() called for {self.file_path}")
|
| 83 |
+
|
| 84 |
+
try:
|
| 85 |
+
# If we have a GPU processor and file path, use the model to extract JSON
|
| 86 |
+
if self.gpu_processor and self.file_path and os.path.exists(self.file_path):
|
| 87 |
+
logger.info("Using Nanonets model for JSON extraction")
|
| 88 |
+
return self._extract_json_with_model()
|
| 89 |
+
else:
|
| 90 |
+
logger.info("Using fallback JSON conversion")
|
| 91 |
+
# Fallback to base JSON conversion
|
| 92 |
+
return self._convert_to_base_json()
|
| 93 |
+
except Exception as e:
|
| 94 |
+
logger.warning(f"Failed to extract JSON with model: {e}. Using fallback conversion.")
|
| 95 |
+
return self._convert_to_base_json()
|
| 96 |
+
|
| 97 |
+
def _extract_json_with_model(self) -> Dict[str, Any]:
    """Extract structured JSON using Nanonets model with specific prompt.

    Runs the Nanonets OCR vision-language model over the source image,
    prompting it to emit a single JSON object describing the document.
    Returns a dict with the parsed payload under ``"document"`` plus
    GPU-processing metadata.  Any failure is logged and re-raised so the
    caller (``extract_data``) can fall back to the base conversion.
    """
    try:
        from PIL import Image
        from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText

        # Get the model from the GPU processor's OCR service
        ocr_service = self.gpu_processor._get_ocr_service()

        # Access the model components from the OCR service.  Reusing the
        # already-loaded components avoids a second (slow) model load.
        if hasattr(ocr_service, 'processor') and hasattr(ocr_service, 'model') and hasattr(ocr_service, 'tokenizer'):
            model = ocr_service.model
            processor = ocr_service.processor
            tokenizer = ocr_service.tokenizer
        else:
            # Fallback: load model directly
            model_path = "nanonets/Nanonets-OCR-s"
            model = AutoModelForImageTextToText.from_pretrained(
                model_path,
                torch_dtype="auto",
                device_map="auto"
            )
            model.eval()
            processor = AutoProcessor.from_pretrained(model_path)
            # NOTE(review): tokenizer is loaded here but never used below —
            # decoding goes through processor.batch_decode; confirm it can
            # be dropped.
            tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Define the JSON extraction prompt
        prompt = """Extract all information from the above document and return it as a valid JSON object.

Instructions:
- The output should be a single JSON object.
- Keys should be meaningful field names.
- If multiple similar blocks (like invoice items or line items), return a list of JSON objects under a key.
- Use strings for all values.
- Wrap page numbers using: "page_number": "1"
- Wrap watermarks using: "watermark": "CONFIDENTIAL"
- Use ☐ and ☑ for checkboxes.

Example:
{
  "Name": "John Doe",
  "Invoice Number": "INV-4567",
  "Amount Due": "$123.45",
  "Items": [
    {"Description": "Widget A", "Price": "$20"},
    {"Description": "Widget B", "Price": "$30"}
  ],
  "page_number": "1",
  "watermark": "CONFIDENTIAL"
}"""

        # Load the image
        image = Image.open(self.file_path)

        # Prepare messages for the model (chat-style: image + instruction text)
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": [
                {"type": "image", "image": f"file://{self.file_path}"},
                {"type": "text", "text": prompt},
            ]},
        ]

        # Apply chat template and process
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
        inputs = inputs.to(model.device)

        # Generate JSON response (greedy decoding; no sampling for determinism)
        output_ids = model.generate(**inputs, max_new_tokens=15000, do_sample=False)
        # Slice off the prompt tokens from each sequence, keeping only the
        # newly generated tail.  The comprehension deliberately shadows
        # output_ids with the per-sequence tensor.
        generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]

        json_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
        logger.debug(f"Generated JSON text: {json_text[:200]}...")

        # Try to parse the JSON response with improved parsing
        def try_parse_json(text):
            # Best-effort parse: strict first, then a loose cleanup pass
            # (quote bare keys, single → double quotes), finally wrap the
            # raw model output so callers always get a dict.
            try:
                return json.loads(text)
            except json.JSONDecodeError:
                # Try cleaning and reparsing
                try:
                    text = re.sub(r"(\w+):", r'"\1":', text)  # wrap keys
                    text = text.replace("'", '"')  # replace single quotes
                    return json.loads(text)
                except (json.JSONDecodeError, Exception):
                    return {"raw_text": text}

        # Parse the JSON
        extracted_data = try_parse_json(json_text)

        # Create the result structure
        result = {
            "document": extracted_data,
            "format": "gpu_structured_json",
            "gpu_processing_info": {
                'ocr_provider': self.ocr_provider,
                'processing_mode': 'gpu',
                'file_path': self.file_path,
                'gpu_processor_available': self.gpu_processor is not None,
                'json_extraction_method': 'nanonets_model'
            }
        }

        return result

    except Exception as e:
        logger.error(f"Failed to extract JSON with model: {e}")
        raise
|
| 206 |
+
|
| 207 |
+
def _convert_to_base_json(self) -> Dict[str, Any]:
    """Build the fallback JSON payload from the base conversion.

    Delegates the actual conversion to the parent class, then annotates
    the payload with GPU processing metadata and the GPU format marker.
    """
    converted = super().extract_data()

    # Record how this result was produced so consumers can tell the
    # fallback path apart from model-based extraction.
    converted['gpu_processing_info'] = {
        'ocr_provider': self.ocr_provider,
        'processing_mode': 'gpu',
        'file_path': self.file_path,
        'gpu_processor_available': self.gpu_processor is not None,
        'json_extraction_method': 'fallback_conversion',
    }
    converted['format'] = 'gpu_structured_json'
    return converted
|
| 225 |
+
|
| 226 |
+
def extract_text(self) -> str:
    """Return the extracted content as plain text.

    Unlike the HTML/JSON exporters, this adds no GPU-processing header
    or decoration — the raw content is handed back untouched.
    """
    plain_text = self.content
    return plain_text
|
| 229 |
+
|
| 230 |
+
def get_processing_stats(self) -> Dict[str, Any]:
    """Summarize how this result was produced.

    Returns:
        Dictionary with processing statistics: processing mode, OCR
        provider, source path, simple content metrics, and — when
        non-empty — the result metadata.
    """
    text = self.content
    summary = {
        'processing_mode': 'gpu',
        'ocr_provider': self.ocr_provider,
        'file_path': self.file_path,
        'content_length': len(text),
        'word_count': len(text.split()),
        'line_count': len(text.split('\n')),
        'gpu_processor_available': self.gpu_processor is not None,
    }

    # Attach metadata only when something was recorded.
    if self.metadata:
        summary['metadata'] = self.metadata

    return summary
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
class GPUProcessor(BaseProcessor):
    """Processor for image files and PDFs with Nanonets OCR capabilities.

    Images are OCR'd directly; PDFs are rasterized page-by-page (via
    pdf2image) and each page is OCR'd.  Results are returned as
    GPUConversionResult objects that keep a back-reference to this
    processor so model-based JSON extraction can reuse the loaded model.
    """

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None, ocr_service=None):
        """Initialize the processor.

        Args:
            preserve_layout: Ask the OCR service for layout-aware text.
            include_images: Passed through to the base processor.
            ocr_enabled: When False, processing returns empty content.
            use_markdownify: Passed through to the base processor.
            ocr_service: Optional pre-built OCR service (e.g. for tests);
                when None, the Nanonets service is created lazily.
        """
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        self._ocr_service = ocr_service

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif', '.pdf']

    def _get_ocr_service(self):
        """Return the OCR service, creating the Nanonets one on first use."""
        if self._ocr_service is not None:
            return self._ocr_service
        # Use Nanonets OCR service by default
        self._ocr_service = OCRServiceFactory.create_service('nanonets')
        return self._ocr_service

    def process(self, file_path: str) -> GPUConversionResult:
        """Process image file or PDF with OCR capabilities.

        Args:
            file_path: Path to the image file or PDF

        Returns:
            GPUConversionResult with extracted content

        Raises:
            ConversionError: If processing fails for any reason
                (including a missing file, which is wrapped too).
        """
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")

            # Dispatch on extension: PDFs are rasterized first, images
            # go straight to OCR.
            file_path_str = str(file_path)
            _, ext = os.path.splitext(file_path_str.lower())

            if ext == '.pdf':
                logger.info(f"Processing PDF file: {file_path}")
                return self._process_pdf(file_path)
            else:
                logger.info(f"Processing image file: {file_path}")
                return self._process_image(file_path)

        except Exception as e:
            logger.error(f"Failed to process file {file_path}: {e}")
            raise ConversionError(f"GPU processing failed: {e}")

    def _process_image(self, file_path: str) -> GPUConversionResult:
        """Process image file with OCR capabilities.

        Args:
            file_path: Path to the image file

        Returns:
            GPUConversionResult with extracted content
        """
        # Get OCR service
        ocr_service = self._get_ocr_service()

        # Extract text with layout awareness if enabled
        if self.ocr_enabled and self.preserve_layout:
            logger.info("Extracting text with layout awareness using Nanonets OCR")
            extracted_text = ocr_service.extract_text_with_layout(file_path)
        elif self.ocr_enabled:
            logger.info("Extracting text without layout awareness using Nanonets OCR")
            extracted_text = ocr_service.extract_text(file_path)
        else:
            logger.warning("OCR is disabled, returning empty content")
            extracted_text = ""

        # Create GPU result; gpu_processor=self lets the result reuse
        # the loaded model for structured JSON extraction later.
        result = GPUConversionResult(
            content=extracted_text,
            metadata={
                'file_path': file_path,
                'file_type': 'image',
                'ocr_enabled': self.ocr_enabled,
                'preserve_layout': self.preserve_layout,
                'ocr_provider': 'nanonets'
            },
            gpu_processor=self,
            file_path=file_path,
            ocr_provider='nanonets'
        )

        logger.info(f"Image processing completed. Extracted {len(extracted_text)} characters")
        return result

    def _process_pdf(self, file_path: str) -> GPUConversionResult:
        """Process PDF file by converting to images and using OCR.

        Args:
            file_path: Path to the PDF file

        Returns:
            GPUConversionResult with extracted content

        Raises:
            ConversionError: If conversion or page processing fails.
        """
        try:
            # Convert PDF to images
            image_paths = self._convert_pdf_to_images(file_path)

            if not image_paths:
                logger.warning("No pages could be extracted from PDF")
                return GPUConversionResult(
                    content="",
                    metadata={
                        'file_path': file_path,
                        'file_type': 'pdf',
                        'ocr_enabled': self.ocr_enabled,
                        'preserve_layout': self.preserve_layout,
                        'ocr_provider': 'nanonets',
                        'pages_processed': 0
                    },
                    gpu_processor=self,
                    file_path=file_path,
                    ocr_provider='nanonets'
                )

            # Process each page with OCR
            all_texts = []
            ocr_service = self._get_ocr_service()

            for i, image_path in enumerate(image_paths):
                logger.info(f"Processing PDF page {i+1}/{len(image_paths)}")

                try:
                    if self.ocr_enabled and self.preserve_layout:
                        page_text = ocr_service.extract_text_with_layout(image_path)
                    elif self.ocr_enabled:
                        page_text = ocr_service.extract_text(image_path)
                    else:
                        page_text = ""

                    # Add page header and content if there's text
                    if page_text.strip():
                        # Add page header (markdown style)
                        all_texts.append(f"\n## Page {i+1}\n\n")
                        all_texts.append(page_text)

                        # Add horizontal rule after content (except for last page)
                        if i < len(image_paths) - 1:
                            all_texts.append("\n\n---\n\n")

                except Exception as e:
                    logger.error(f"Failed to process page {i+1}: {e}")
                    # Add error page with markdown formatting so the
                    # failure is visible in the output instead of silent.
                    all_texts.append(f"\n## Page {i+1}\n\n*Error processing this page: {e}*\n\n")
                    if i < len(image_paths) - 1:
                        all_texts.append("---\n\n")

                finally:
                    # Clean up the temporary image created by
                    # _convert_pdf_to_images for this page.
                    try:
                        os.unlink(image_path)
                    except OSError:
                        pass

            # Combine all page texts
            combined_text = ''.join(all_texts)

            # Create result
            result = GPUConversionResult(
                content=combined_text,
                metadata={
                    'file_path': file_path,
                    'file_type': 'pdf',
                    'ocr_enabled': self.ocr_enabled,
                    'preserve_layout': self.preserve_layout,
                    'ocr_provider': 'nanonets',
                    'pages_processed': len(image_paths)
                },
                gpu_processor=self,
                file_path=file_path,
                ocr_provider='nanonets'
            )

            logger.info(f"PDF processing completed. Processed {len(image_paths)} pages, extracted {len(combined_text)} characters")
            return result

        except Exception as e:
            logger.error(f"Failed to process PDF {file_path}: {e}")
            raise ConversionError(f"PDF processing failed: {e}")

    def _convert_pdf_to_images(self, pdf_path: str) -> List[str]:
        """Convert PDF pages to temporary PNG images.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            List of paths to temporary image files.  The caller is
            responsible for deleting them (see _process_pdf).

        Raises:
            ConversionError: If pdf2image is missing or conversion fails.
        """
        try:
            from pdf2image import convert_from_path
            from ..config import InternalConfig

            # Get DPI from config
            dpi = getattr(InternalConfig, 'pdf_image_dpi', 300)

            # Convert PDF pages to images using pdf2image
            images = convert_from_path(pdf_path, dpi=dpi)
            image_paths = []

            # Save each page.  NamedTemporaryFile(delete=False) replaces the
            # deprecated, race-prone tempfile.mktemp(): the file is created
            # atomically, then closed so PIL can write to the path.
            for image in images:
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                    persistent_image_path = tmp.name
                image.save(persistent_image_path, 'PNG')
                image_paths.append(persistent_image_path)

            logger.info(f"Converted PDF to {len(image_paths)} images")
            return image_paths

        except ImportError:
            logger.error("pdf2image not available. Please install it: pip install pdf2image")
            raise ConversionError("pdf2image is required for PDF processing")
        except Exception as e:
            logger.error(f"Failed to extract PDF to images: {e}")
            raise ConversionError(f"PDF to image conversion failed: {e}")

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by running a dummy prediction.

        Best-effort: failures are logged, never raised.
        """
        try:
            from docstrange.pipeline.ocr_service import OCRServiceFactory
            ocr_service = OCRServiceFactory.create_service('nanonets')
            # Create a blank image for testing.  Close the temp handle
            # before writing/reading so the flow also works on Windows,
            # where an open file cannot be re-opened or unlinked.
            from PIL import Image
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                tmp_path = tmp.name
            try:
                img = Image.new('RGB', (100, 100), color='white')
                img.save(tmp_path)
                ocr_service.extract_text_with_layout(tmp_path)
            finally:
                os.unlink(tmp_path)
            logger.info("Nanonets OCR models pre-downloaded and cached.")
        except Exception as e:
            logger.error(f"Failed to pre-download Nanonets OCR models: {e}")
|
docstrange/processors/html_processor.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HTML file processor."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
|
| 7 |
+
from .base import BaseProcessor
|
| 8 |
+
from ..result import ConversionResult
|
| 9 |
+
from ..exceptions import ConversionError, FileNotFoundError
|
| 10 |
+
|
| 11 |
+
# Configure logging
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class HTMLProcessor(BaseProcessor):
    """Processor for HTML files using markdownify for conversion."""

    def can_process(self, file_path: str) -> bool:
        """Report whether this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Normalize first: callers may hand us Path objects.
        extension = os.path.splitext(str(file_path).lower())[1]
        return extension in ('.html', '.htm')

    def process(self, file_path: str) -> ConversionResult:
        """Convert an HTML file to markdown inside a ConversionResult.

        Args:
            file_path: Path to the HTML file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            # markdownify is an optional dependency; surface a clear
            # install hint rather than a bare ImportError.
            try:
                from markdownify import markdownify as md
            except ImportError:
                raise ConversionError("markdownify is required for HTML processing. Install it with: pip install markdownify")

            metadata = self.get_metadata(file_path)
            with open(file_path, 'r', encoding='utf-8') as handle:
                markup = handle.read()
            markdown_text = md(markup, heading_style="ATX")
            return ConversionResult(markdown_text, metadata)
        except (FileNotFoundError, ConversionError):
            # Already meaningful — propagate untouched.
            raise
        except Exception as e:
            raise ConversionError(f"Failed to process HTML file {file_path}: {str(e)}")
|
docstrange/processors/image_processor.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Image file processor with OCR capabilities."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
|
| 7 |
+
from .base import BaseProcessor
|
| 8 |
+
from ..result import ConversionResult
|
| 9 |
+
from ..exceptions import ConversionError, FileNotFoundError
|
| 10 |
+
from ..pipeline.ocr_service import OCRServiceFactory
|
| 11 |
+
|
| 12 |
+
# Configure logging
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ImageProcessor(BaseProcessor):
    """Processor for image files (JPG, PNG, etc.) with OCR capabilities."""

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None, ocr_service=None):
        """Initialize the processor.

        Args:
            preserve_layout: Ask the OCR service for layout-aware text.
            include_images: Passed through to the base processor.
            ocr_enabled: When False, processing returns empty content.
            use_markdownify: Passed through to the base processor.
            ocr_service: Optional pre-built OCR service (lets PDFProcessor
                share one service across pages); created lazily when None.
        """
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        self._ocr_service = ocr_service

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif']

    def _get_ocr_service(self):
        """Return the OCR service, creating the default one on first use."""
        if self._ocr_service is not None:
            return self._ocr_service
        self._ocr_service = OCRServiceFactory.create_service()
        return self._ocr_service

    def process(self, file_path: str) -> ConversionResult:
        """Process image file with OCR capabilities.

        Args:
            file_path: Path to the image file

        Returns:
            ConversionResult with extracted content

        Raises:
            ConversionError: If processing fails (missing files included).
        """
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"Image file not found: {file_path}")

            logger.info(f"Processing image file: {file_path}")

            # Get OCR service
            ocr_service = self._get_ocr_service()

            # Extract text with layout awareness if enabled
            if self.ocr_enabled and self.preserve_layout:
                logger.info("Extracting text with layout awareness")
                extracted_text = ocr_service.extract_text_with_layout(file_path)
            elif self.ocr_enabled:
                logger.info("Extracting text without layout awareness")
                extracted_text = ocr_service.extract_text(file_path)
            else:
                logger.warning("OCR is disabled, returning empty content")
                extracted_text = ""

            # Create result
            result = ConversionResult(
                content=extracted_text,
                metadata={
                    'file_path': file_path,
                    'file_type': 'image',
                    'ocr_enabled': self.ocr_enabled,
                    'preserve_layout': self.preserve_layout
                }
            )

            logger.info(f"Image processing completed. Extracted {len(extracted_text)} characters")
            return result

        except Exception as e:
            logger.error(f"Failed to process image file {file_path}: {e}")
            raise ConversionError(f"Image processing failed: {e}")

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by running a dummy prediction.

        Best-effort: failures are logged, never raised.
        """
        try:
            # Bug fix: previously imported OCRServiceFactory from
            # docstrange.services.ocr_service, a module that does not exist
            # (the factory lives in docstrange.pipeline.ocr_service, already
            # imported at the top of this module) — so pre-download always
            # failed silently with an ImportError in the log.
            ocr_service = OCRServiceFactory.create_service()
            # Create a blank image for testing.  Close the temp handle
            # before writing/reading so this also works on Windows.
            from PIL import Image
            import tempfile
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                tmp_path = tmp.name
            try:
                img = Image.new('RGB', (100, 100), color='white')
                img.save(tmp_path)
                ocr_service.extract_text_with_layout(tmp_path)
            finally:
                os.unlink(tmp_path)
            logger.info("OCR models pre-downloaded and cached.")
        except Exception as e:
            logger.error(f"Failed to pre-download OCR models: {e}")
|
docstrange/processors/pdf_processor.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF file processor with OCR support for scanned PDFs."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
import tempfile
|
| 6 |
+
from typing import Dict, Any, List, Tuple
|
| 7 |
+
|
| 8 |
+
from .base import BaseProcessor
|
| 9 |
+
from .image_processor import ImageProcessor
|
| 10 |
+
from ..result import ConversionResult
|
| 11 |
+
from ..exceptions import ConversionError, FileNotFoundError
|
| 12 |
+
from ..config import InternalConfig
|
| 13 |
+
from ..pipeline.ocr_service import OCRServiceFactory, NeuralOCRService
|
| 14 |
+
|
| 15 |
+
# Configure logging
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class PDFProcessor(BaseProcessor):
    """Processor for PDF files using PDF-to-image conversion with OCR."""

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None):
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        # Create a shared OCR service instance for all pages
        # NOTE(review): constructed eagerly, so building a PDFProcessor may
        # trigger model setup — confirm NeuralOCRService initializes lazily.
        shared_ocr_service = NeuralOCRService()
        self._image_processor = ImageProcessor(
            preserve_layout=preserve_layout,
            include_images=include_images,
            ocr_enabled=ocr_enabled,
            use_markdownify=use_markdownify,
            ocr_service=shared_ocr_service
        )

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext == '.pdf'

    def process(self, file_path: str) -> ConversionResult:
        """Process PDF file with OCR capabilities.

        Args:
            file_path: Path to the PDF file

        Returns:
            ConversionResult with extracted content

        Raises:
            ConversionError: If processing fails (missing file included,
                since the broad except below wraps it).
        """
        try:
            from ..config import InternalConfig
            pdf_to_image_enabled = InternalConfig.pdf_to_image_enabled
        except (ImportError, AttributeError):
            # Fallback if config is not available
            pdf_to_image_enabled = True
            logger.warning("InternalConfig not available, defaulting to pdf_to_image_enabled = True")

        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"PDF file not found: {file_path}")

            logger.info(f"Processing PDF file: {file_path}")
            # NOTE(review): pdf_to_image_enabled is only logged — the code
            # below always takes the OCR path regardless of its value.
            logger.info(f"pdf_to_image_enabled = {pdf_to_image_enabled}")

            # Always use OCR-based processing (pdf2image + OCR)
            logger.info("Using OCR-based PDF processing with pdf2image")
            return self._process_with_ocr(file_path)

        except Exception as e:
            logger.error(f"Failed to process PDF file {file_path}: {e}")
            raise ConversionError(f"PDF processing failed: {e}")

    def _process_with_ocr(self, file_path: str) -> ConversionResult:
        """Process PDF using OCR after converting pages to images.

        Each page is rasterized to a temporary PNG, run through the shared
        ImageProcessor, and the page texts are joined with markdown page
        headers.  Temp files are always removed, even on per-page failure.
        """
        try:
            from pdf2image import convert_from_path
            from ..config import InternalConfig

            # Get DPI from config
            dpi = getattr(InternalConfig, 'pdf_image_dpi', 300)

            # Convert PDF pages to images using pdf2image
            images = convert_from_path(file_path, dpi=dpi)
            page_count = len(images)
            all_content = []

            for page_num, image in enumerate(images):
                # Save to temporary file for OCR processing
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                    image.save(tmp.name, 'PNG')
                    temp_image_path = tmp.name

                try:
                    # Process the page image
                    page_result = self._image_processor.process(temp_image_path)
                    page_content = page_result.content

                    # Skip blank pages so the output has no empty sections.
                    if page_content.strip():
                        all_content.append(f"## Page {page_num + 1}\n\n{page_content}")

                finally:
                    # Clean up temporary file
                    os.unlink(temp_image_path)

            content = "\n\n".join(all_content) if all_content else "No content extracted from PDF"

            return ConversionResult(
                content=content,
                metadata={
                    'file_path': file_path,
                    'file_type': 'pdf',
                    'pages': page_count,
                    'extraction_method': 'ocr'
                }
            )

        except ImportError:
            logger.error("pdf2image not available. Please install it: pip install pdf2image")
            raise ConversionError("pdf2image is required for PDF processing")
        except Exception as e:
            logger.error(f"OCR-based PDF processing failed: {e}")
            raise ConversionError(f"OCR-based PDF processing failed: {e}")

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by running a dummy prediction."""
        try:
            # Use ImageProcessor's predownload method
            ImageProcessor.predownload_ocr_models()
        except Exception as e:
            logger.error(f"Failed to pre-download OCR models: {e}")
|
docstrange/processors/pptx_processor.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PowerPoint file processor."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
|
| 7 |
+
from .base import BaseProcessor
|
| 8 |
+
from ..result import ConversionResult
|
| 9 |
+
from ..exceptions import ConversionError, FileNotFoundError
|
| 10 |
+
|
| 11 |
+
# Configure logging
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class PPTXProcessor(BaseProcessor):
    """Processor for PowerPoint files (PPT, PPTX).

    Legacy binary .ppt files are converted via pypandoc, while .pptx files
    are read natively with python-pptx and flattened to plain text one slide
    at a time.
    """

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Coerce to str so Path-like objects are handled too.
        extension = os.path.splitext(str(file_path).lower())[1]
        return extension in ('.ppt', '.pptx')

    def process(self, file_path: str) -> ConversionResult:
        """Process the PowerPoint file and return a conversion result.

        Args:
            file_path: Path to the PowerPoint file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        metadata = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "processor": "PPTXProcessor"
        }

        # Dispatch on the extension: legacy .ppt needs pandoc, .pptx is
        # handled natively.
        extension = os.path.splitext(str(file_path).lower())[1]
        handler = self._process_ppt_file if extension == '.ppt' else self._process_pptx_file
        return handler(file_path, metadata)

    def _process_ppt_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .ppt files using pypandoc."""
        try:
            import pypandoc

            # pandoc does the heavy lifting for the binary .ppt format.
            text = pypandoc.convert_file(file_path, 'markdown')

            metadata.update({
                "file_type": "ppt",
                "extractor": "pypandoc"
            })

            return ConversionResult(self._clean_content(text), metadata)

        except ImportError:
            raise ConversionError("pypandoc is required for .ppt file processing. Install it with: pip install pypandoc")
        except Exception as e:
            raise ConversionError(f"Failed to process .ppt file {file_path}: {str(e)}")

    def _process_pptx_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .pptx files using python-pptx."""
        try:
            from pptx import Presentation

            deck = Presentation(file_path)
            parts = []

            metadata.update({
                "slide_count": len(deck.slides),
                "file_type": "pptx",
                "extractor": "python-pptx"
            })

            # preserve_layout may be injected by the base class or config.
            preserve_layout = getattr(self, 'preserve_layout', False)

            for index, slide in enumerate(deck.slides, 1):
                if preserve_layout:
                    parts.append(f"\n## Slide {index}\n")

                # Collect the non-empty text from every shape on the slide.
                texts = [
                    shape.text.strip()
                    for shape in slide.shapes
                    if hasattr(shape, "text") and shape.text.strip()
                ]

                if texts:
                    parts.extend(texts)
                    parts.append("")  # Add spacing between slides

            cleaned = self._clean_content("\n\n".join(parts))
            return ConversionResult(cleaned, metadata)

        except ImportError:
            raise ConversionError("python-pptx is required for .pptx file processing. Install it with: pip install python-pptx")
        except Exception as e:
            if isinstance(e, (FileNotFoundError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process .pptx file {file_path}: {str(e)}")

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted PowerPoint content.

        Args:
            content: Raw PowerPoint text content

        Returns:
            Cleaned text content
        """
        # Collapse runs of whitespace within each line and drop blank lines.
        kept = [' '.join(line.split()) for line in content.split('\n') if line.split()]
        content = '\n'.join(kept)

        # Insert a blank line before each slide header.
        return content.replace('## Slide', '\n## Slide').strip()
|
docstrange/processors/txt_processor.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Text file processor."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Dict, Any
|
| 5 |
+
|
| 6 |
+
from .base import BaseProcessor
|
| 7 |
+
from ..result import ConversionResult
|
| 8 |
+
from ..exceptions import ConversionError, FileNotFoundError
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TXTProcessor(BaseProcessor):
    """Processor for plain text files.

    Reads .txt/.text files, trying a sequence of common encodings, and
    returns the cleaned text plus basic line/word statistics as metadata.
    """

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Coerce to str so Path-like objects are handled too.
        extension = os.path.splitext(str(file_path).lower())[1]
        return extension in ('.txt', '.text')

    def process(self, file_path: str) -> ConversionResult:
        """Process the text file and return a conversion result.

        Args:
            file_path: Path to the text file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            # Try common encodings in order until one decodes cleanly.
            content = None
            used_encoding = None
            for candidate in ('utf-8', 'latin-1', 'cp1252', 'iso-8859-1'):
                try:
                    with open(file_path, 'r', encoding=candidate) as handle:
                        content = handle.read()
                    used_encoding = candidate
                    break
                except UnicodeDecodeError:
                    continue

            if content is None:
                raise ConversionError(f"Could not decode file {file_path} with any supported encoding")

            content = self._clean_content(content)

            metadata = self.get_metadata(file_path)
            metadata.update({
                "encoding": used_encoding,
                "line_count": len(content.split('\n')),
                "word_count": len(content.split())
            })

            return ConversionResult(content, metadata)

        except Exception as e:
            # Let our own exception types propagate untouched.
            if isinstance(e, (FileNotFoundError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process text file {file_path}: {str(e)}")

    def _clean_content(self, content: str) -> str:
        """Clean up the text content.

        Args:
            content: Raw text content

        Returns:
            Cleaned text content
        """
        # Strip trailing whitespace from every line.
        stripped = [line.rstrip() for line in content.split('\n')]

        # Trim blank lines from both ends without touching interior ones.
        start, end = 0, len(stripped)
        while start < end and not stripped[start].strip():
            start += 1
        while end > start and not stripped[end - 1].strip():
            end -= 1

        return '\n'.join(stripped[start:end])
|
docstrange/processors/url_processor.py
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""URL processor for handling web pages and file downloads."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import tempfile
|
| 6 |
+
from typing import Dict, Any, Optional
|
| 7 |
+
from urllib.parse import urlparse
|
| 8 |
+
|
| 9 |
+
from .base import BaseProcessor
|
| 10 |
+
from ..result import ConversionResult
|
| 11 |
+
from ..exceptions import ConversionError, NetworkError
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class URLProcessor(BaseProcessor):
    """Processor for URLs and web pages.

    URLs that point at downloadable files (detected from the path extension
    or the Content-Type header) are downloaded to a temporary file and routed
    through DocumentExtractor; every other URL is fetched and scraped as an
    HTML web page.
    """

    # Browser-like User-Agent shared by all requests so servers do not
    # reject us as an unknown bot.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check (or URL)

        Returns:
            True if this processor can handle the file
        """
        # Check if it looks like a URL
        return self._is_url(file_path)

    def process(self, file_path: str) -> ConversionResult:
        """Process the URL and return a conversion result.

        Args:
            file_path: URL to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            NetworkError: If network operations fail
            ConversionError: If processing fails
        """
        try:
            import requests

            # First, check if this URL points to a file
            file_info = self._detect_file_from_url(file_path)

            if file_info:
                # This is a file URL, download and process it
                return self._process_file_url(file_path, file_info)
            else:
                # This is a web page, process it as HTML
                return self._process_web_page(file_path)

        except ImportError:
            raise ConversionError("requests and beautifulsoup4 are required for URL processing. Install them with: pip install requests beautifulsoup4")
        except requests.RequestException as e:
            raise NetworkError(f"Failed to fetch URL {file_path}: {str(e)}")
        except Exception as e:
            if isinstance(e, (NetworkError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process URL {file_path}: {str(e)}")

    def _detect_file_from_url(self, url: str) -> Optional[Dict[str, Any]]:
        """Detect if a URL points to a file and return file information.

        Detection is two-phase: first by extension on the URL path, then
        (if inconclusive) by the Content-Type of a HEAD request.

        Args:
            url: URL to check

        Returns:
            File info dict if it's a file URL, None otherwise
        """
        try:
            import requests

            # Check URL path for file extensions
            parsed_url = urlparse(url)
            path = parsed_url.path.lower()

            # Common file extensions
            file_extensions = {
                '.pdf': 'pdf',
                '.doc': 'doc',
                '.docx': 'docx',
                '.txt': 'txt',
                '.md': 'markdown',
                '.html': 'html',
                '.htm': 'html',
                '.xlsx': 'xlsx',
                '.xls': 'xls',
                '.csv': 'csv',
                '.ppt': 'ppt',
                '.pptx': 'pptx',
                '.jpg': 'image',
                '.jpeg': 'image',
                '.png': 'image',
                '.gif': 'image',
                '.bmp': 'image',
                '.tiff': 'image',
                '.tif': 'image',
                '.webp': 'image'
            }

            # Check for file extension in URL path
            for ext, file_type in file_extensions.items():
                if path.endswith(ext):
                    return {
                        'file_type': file_type,
                        'extension': ext,
                        'filename': os.path.basename(path) or f"downloaded_file{ext}"
                    }

            # If no extension in URL, check content-type header
            try:
                # Make a HEAD request to check content-type
                response = requests.head(url, headers=self._HEADERS, timeout=10, allow_redirects=True)

                if response.status_code == 200:
                    content_type = response.headers.get('content-type', '').lower()
                    # Content-Type may carry MIME parameters (e.g.
                    # "image/png; charset=binary"); keep the bare media type
                    # for exact lookups below.
                    base_type = content_type.split(';', 1)[0].strip()

                    # Check for file content types
                    if 'application/pdf' in content_type:
                        return {'file_type': 'pdf', 'extension': '.pdf', 'filename': 'downloaded_file.pdf'}
                    elif 'application/msword' in content_type or 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' in content_type:
                        ext = '.docx' if 'openxmlformats' in content_type else '.doc'
                        return {'file_type': 'doc' if ext == '.doc' else 'docx', 'extension': ext, 'filename': f'downloaded_file{ext}'}
                    elif 'application/vnd.ms-excel' in content_type or 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' in content_type:
                        ext = '.xlsx' if 'openxmlformats' in content_type else '.xls'
                        return {'file_type': 'xlsx' if ext == '.xlsx' else 'xls', 'extension': ext, 'filename': f'downloaded_file{ext}'}
                    elif 'application/vnd.ms-powerpoint' in content_type or 'application/vnd.openxmlformats-officedocument.presentationml.presentation' in content_type:
                        ext = '.pptx' if 'openxmlformats' in content_type else '.ppt'
                        return {'file_type': 'pptx' if ext == '.pptx' else 'ppt', 'extension': ext, 'filename': f'downloaded_file{ext}'}
                    elif 'text/plain' in content_type:
                        return {'file_type': 'txt', 'extension': '.txt', 'filename': 'downloaded_file.txt'}
                    elif 'text/markdown' in content_type:
                        return {'file_type': 'markdown', 'extension': '.md', 'filename': 'downloaded_file.md'}
                    elif 'text/html' in content_type:
                        # HTML could be a web page or a file, check if it's likely a file
                        if 'attachment' in response.headers.get('content-disposition', '').lower():
                            return {'file_type': 'html', 'extension': '.html', 'filename': 'downloaded_file.html'}
                        # If it's HTML but not an attachment, treat as web page
                        return None
                    elif any(img_type in content_type for img_type in ['image/jpeg', 'image/png', 'image/gif', 'image/bmp', 'image/tiff', 'image/webp']):
                        # Determine extension from content type
                        ext_map = {
                            'image/jpeg': '.jpg',
                            'image/png': '.png',
                            'image/gif': '.gif',
                            'image/bmp': '.bmp',
                            'image/tiff': '.tiff',
                            'image/webp': '.webp'
                        }
                        # Look up the bare media type: an exact match on the
                        # raw header value would miss whenever parameters are
                        # present and wrongly fall back to '.jpg'.
                        ext = ext_map.get(base_type, '.jpg')
                        return {'file_type': 'image', 'extension': ext, 'filename': f'downloaded_file{ext}'}

            except requests.RequestException:
                # If HEAD request fails, assume it's a web page
                pass

        except Exception:
            pass

        return None

    def _process_file_url(self, url: str, file_info: Dict[str, Any]) -> ConversionResult:
        """Download and process a file from URL.

        Args:
            url: URL to download from
            file_info: Information about the file

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If the download or the downstream processing fails
        """
        try:
            import requests
            from ..extractor import DocumentExtractor

            # Stream the download so large files are never held in memory,
            # and close the connection deterministically when done.
            with requests.get(url, headers=self._HEADERS, timeout=60, stream=True) as response:
                response.raise_for_status()
                content_type = response.headers.get('content-type', '')

                # Create a temporary file
                with tempfile.NamedTemporaryFile(delete=False, suffix=file_info['extension']) as temp_file:
                    # Write the downloaded content and track size
                    content_length = 0
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # Filter out keep-alive chunks
                            temp_file.write(chunk)
                            content_length += len(chunk)

                    temp_file_path = temp_file.name

            try:
                # Process the downloaded file using the appropriate processor
                extractor = DocumentExtractor()
                result = extractor.extract(temp_file_path)

                # Add URL metadata to the result
                result.metadata.update({
                    "source_url": url,
                    "downloaded_filename": file_info['filename'],
                    "content_type": content_type,
                    "content_length": content_length
                })

                return result

            finally:
                # Clean up the temporary file
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

        except Exception as e:
            raise ConversionError(f"Failed to download and process file from URL {url}: {str(e)}")

    def _process_web_page(self, url: str) -> ConversionResult:
        """Process a web page URL.

        Args:
            url: URL to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If fetching or parsing the page fails
        """
        try:
            from bs4 import BeautifulSoup
            import requests

            # Fetch the web page
            response = requests.get(url, headers=self._HEADERS, timeout=30)
            response.raise_for_status()

            # Parse the HTML
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            # Extract text content
            content_parts = []

            # Get title
            title = soup.find('title')
            if title:
                content_parts.append(f"# {title.get_text().strip()}\n")

            # Get main content
            main_content = self._extract_main_content(soup)
            if main_content:
                content_parts.append(main_content)
            else:
                # Fallback to body text
                body = soup.find('body')
                if body:
                    content_parts.append(body.get_text())

            content = '\n'.join(content_parts)

            # Clean up the content
            content = self._clean_content(content)

            metadata = {
                "url": url,
                "status_code": response.status_code,
                "content_type": response.headers.get('content-type', ''),
                "content_length": len(response.content),
                "processor": self.__class__.__name__
            }

            return ConversionResult(content, metadata)

        except Exception as e:
            raise ConversionError(f"Failed to process web page {url}: {str(e)}")

    def _is_url(self, text: str) -> bool:
        """Check if the text looks like a URL.

        Args:
            text: Text to check

        Returns:
            True if text looks like a URL
        """
        try:
            result = urlparse(text)
            # Both a scheme (http/https/...) and a host are required.
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    def _extract_main_content(self, soup) -> str:
        """Extract main content from the HTML.

        Args:
            soup: BeautifulSoup object

        Returns:
            Extracted main content
        """
        # Try to find main content areas, most specific semantics first.
        main_selectors = [
            'main',
            '[role="main"]',
            '.main-content',
            '.content',
            '#content',
            'article',
            '.post-content',
            '.entry-content'
        ]

        for selector in main_selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text()

        # If no main content found, return empty string
        return ""

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted web content.

        Args:
            content: Raw web text content

        Returns:
            Cleaned text content
        """
        # Remove excessive whitespace and normalize: collapse runs of
        # whitespace within each line and drop blank lines.
        cleaned_lines = [' '.join(line.split()) for line in content.split('\n') if line.split()]

        # Join lines and add proper spacing
        content = '\n'.join(cleaned_lines)

        # Add a blank line before markdown headers. A plain str.replace on
        # '# ' would also match inside '## ' (turning H2 headers into the
        # broken '#\n# '), so anchor the match to the start of a line.
        content = re.sub(r'(?m)^(#{1,6} )', r'\n\1', content)

        return content.strip()
|
docstrange/result.py
ADDED
|
@@ -0,0 +1,1143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Conversion result class for handling different output formats."""
|
| 2 |
+
|
| 3 |
+
import csv
|
| 4 |
+
import io
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
import re
|
| 8 |
+
from typing import Any, Dict, List, Optional, Union
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class MarkdownToJSONParser:
|
| 14 |
+
"""Comprehensive markdown to structured JSON parser."""
|
| 15 |
+
|
| 16 |
+
def __init__(self):
|
| 17 |
+
"""Initialize the parser."""
|
| 18 |
+
# Compile regex patterns for better performance
|
| 19 |
+
self.header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
|
| 20 |
+
self.list_item_pattern = re.compile(r'^(\s*)[*\-+]\s+(.+)$', re.MULTILINE)
|
| 21 |
+
self.ordered_list_pattern = re.compile(r'^(\s*)\d+\.\s+(.+)$', re.MULTILINE)
|
| 22 |
+
self.code_block_pattern = re.compile(r'```(\w+)?\n(.*?)```', re.DOTALL)
|
| 23 |
+
self.inline_code_pattern = re.compile(r'`([^`]+)`')
|
| 24 |
+
self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
|
| 25 |
+
self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
|
| 26 |
+
self.table_pattern = re.compile(r'\|(.+)\|\s*\n\|[-\s|:]+\|\s*\n((?:\|.+\|\s*\n?)*)', re.MULTILINE)
|
| 27 |
+
self.blockquote_pattern = re.compile(r'^>\s+(.+)$', re.MULTILINE)
|
| 28 |
+
self.bold_pattern = re.compile(r'\*\*(.+?)\*\*')
|
| 29 |
+
self.italic_pattern = re.compile(r'\*(.+?)\*')
|
| 30 |
+
|
| 31 |
+
def parse(self, markdown_text: str) -> Dict[str, Any]:
|
| 32 |
+
"""Parse markdown text into structured JSON.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
markdown_text: The markdown content to parse
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
Structured JSON representation
|
| 39 |
+
"""
|
| 40 |
+
if not markdown_text or not markdown_text.strip():
|
| 41 |
+
return {
|
| 42 |
+
"document": {
|
| 43 |
+
"sections": [],
|
| 44 |
+
"metadata": {"total_sections": 0}
|
| 45 |
+
}
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
lines = markdown_text.split('\n')
|
| 49 |
+
sections = []
|
| 50 |
+
current_section = None
|
| 51 |
+
current_content = []
|
| 52 |
+
|
| 53 |
+
for line in lines:
|
| 54 |
+
line = line.rstrip()
|
| 55 |
+
|
| 56 |
+
# Check if this is a header
|
| 57 |
+
header_match = self.header_pattern.match(line)
|
| 58 |
+
if header_match:
|
| 59 |
+
# Save previous section if exists
|
| 60 |
+
if current_section is not None:
|
| 61 |
+
current_section['content'] = self._parse_content('\n'.join(current_content))
|
| 62 |
+
sections.append(current_section)
|
| 63 |
+
|
| 64 |
+
# Start new section
|
| 65 |
+
header_level = len(header_match.group(1))
|
| 66 |
+
header_text = header_match.group(2).strip()
|
| 67 |
+
|
| 68 |
+
current_section = {
|
| 69 |
+
"title": header_text,
|
| 70 |
+
"level": header_level,
|
| 71 |
+
"type": "section",
|
| 72 |
+
"content": {}
|
| 73 |
+
}
|
| 74 |
+
current_content = []
|
| 75 |
+
else:
|
| 76 |
+
# Add to current content
|
| 77 |
+
if line.strip() or current_content: # Keep empty lines only if we have content
|
| 78 |
+
current_content.append(line)
|
| 79 |
+
|
| 80 |
+
# Don't forget the last section
|
| 81 |
+
if current_section is not None:
|
| 82 |
+
current_section['content'] = self._parse_content('\n'.join(current_content))
|
| 83 |
+
sections.append(current_section)
|
| 84 |
+
elif current_content:
|
| 85 |
+
# Handle content without any headers
|
| 86 |
+
sections.append({
|
| 87 |
+
"title": "Content",
|
| 88 |
+
"level": 1,
|
| 89 |
+
"type": "section",
|
| 90 |
+
"content": self._parse_content('\n'.join(current_content))
|
| 91 |
+
})
|
| 92 |
+
|
| 93 |
+
# Create hierarchical structure
|
| 94 |
+
structured_sections = self._create_hierarchy(sections)
|
| 95 |
+
|
| 96 |
+
return {
|
| 97 |
+
"document": {
|
| 98 |
+
"sections": structured_sections,
|
| 99 |
+
"metadata": {
|
| 100 |
+
"total_sections": len(sections),
|
| 101 |
+
"max_heading_level": max([s.get('level', 1) for s in sections]) if sections else 0,
|
| 102 |
+
"has_tables": any('tables' in s.get('content', {}) for s in sections),
|
| 103 |
+
"has_code_blocks": any('code_blocks' in s.get('content', {}) for s in sections),
|
| 104 |
+
"has_lists": any('lists' in s.get('content', {}) for s in sections),
|
| 105 |
+
"has_images": any('images' in s.get('content', {}) for s in sections)
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
def _parse_content(self, content: str) -> Dict[str, Any]:
|
| 111 |
+
"""Parse content within a section into structured components."""
|
| 112 |
+
if not content.strip():
|
| 113 |
+
return {}
|
| 114 |
+
|
| 115 |
+
result = {}
|
| 116 |
+
|
| 117 |
+
# Extract and parse different content types
|
| 118 |
+
paragraphs = self._extract_paragraphs(content)
|
| 119 |
+
if paragraphs:
|
| 120 |
+
result['paragraphs'] = paragraphs
|
| 121 |
+
|
| 122 |
+
lists = self._extract_lists(content)
|
| 123 |
+
if lists:
|
| 124 |
+
result['lists'] = lists
|
| 125 |
+
|
| 126 |
+
code_blocks = self._extract_code_blocks(content)
|
| 127 |
+
if code_blocks:
|
| 128 |
+
result['code_blocks'] = code_blocks
|
| 129 |
+
|
| 130 |
+
tables = self._extract_tables(content)
|
| 131 |
+
if tables:
|
| 132 |
+
result['tables'] = tables
|
| 133 |
+
|
| 134 |
+
images = self._extract_images(content)
|
| 135 |
+
if images:
|
| 136 |
+
result['images'] = images
|
| 137 |
+
|
| 138 |
+
links = self._extract_links(content)
|
| 139 |
+
if links:
|
| 140 |
+
result['links'] = links
|
| 141 |
+
|
| 142 |
+
blockquotes = self._extract_blockquotes(content)
|
| 143 |
+
if blockquotes:
|
| 144 |
+
result['blockquotes'] = blockquotes
|
| 145 |
+
|
| 146 |
+
return result
|
| 147 |
+
|
| 148 |
+
def _extract_paragraphs(self, content: str) -> List[str]:
|
| 149 |
+
"""Extract paragraphs from content."""
|
| 150 |
+
# Remove code blocks, tables, lists, etc. to get clean paragraphs
|
| 151 |
+
clean_content = content
|
| 152 |
+
|
| 153 |
+
# Remove code blocks
|
| 154 |
+
clean_content = self.code_block_pattern.sub('', clean_content)
|
| 155 |
+
|
| 156 |
+
# Remove tables (simplified)
|
| 157 |
+
clean_content = re.sub(r'\|.*\|', '', clean_content)
|
| 158 |
+
|
| 159 |
+
# Remove list items
|
| 160 |
+
clean_content = self.list_item_pattern.sub('', clean_content)
|
| 161 |
+
clean_content = self.ordered_list_pattern.sub('', clean_content)
|
| 162 |
+
|
| 163 |
+
# Remove blockquotes
|
| 164 |
+
clean_content = self.blockquote_pattern.sub('', clean_content)
|
| 165 |
+
|
| 166 |
+
# Split into paragraphs and clean
|
| 167 |
+
paragraphs = []
|
| 168 |
+
for para in clean_content.split('\n\n'):
|
| 169 |
+
para = para.strip()
|
| 170 |
+
if para and not para.startswith('#'):
|
| 171 |
+
# Clean up markdown formatting for paragraphs
|
| 172 |
+
para = self._clean_inline_formatting(para)
|
| 173 |
+
paragraphs.append(para)
|
| 174 |
+
|
| 175 |
+
return paragraphs
|
| 176 |
+
|
| 177 |
+
def _extract_lists(self, content: str) -> List[Dict[str, Any]]:
|
| 178 |
+
"""Extract lists from content."""
|
| 179 |
+
lists = []
|
| 180 |
+
lines = content.split('\n')
|
| 181 |
+
current_list = None
|
| 182 |
+
|
| 183 |
+
for line in lines:
|
| 184 |
+
line = line.rstrip()
|
| 185 |
+
|
| 186 |
+
# Check for unordered list
|
| 187 |
+
unordered_match = self.list_item_pattern.match(line)
|
| 188 |
+
if unordered_match:
|
| 189 |
+
indent_level = len(unordered_match.group(1)) // 2
|
| 190 |
+
item_text = self._clean_inline_formatting(unordered_match.group(2))
|
| 191 |
+
|
| 192 |
+
if current_list is None or current_list['type'] != 'unordered':
|
| 193 |
+
if current_list:
|
| 194 |
+
lists.append(current_list)
|
| 195 |
+
current_list = {'type': 'unordered', 'items': []}
|
| 196 |
+
|
| 197 |
+
current_list['items'].append({
|
| 198 |
+
'text': item_text,
|
| 199 |
+
'level': indent_level
|
| 200 |
+
})
|
| 201 |
+
continue
|
| 202 |
+
|
| 203 |
+
# Check for ordered list
|
| 204 |
+
ordered_match = self.ordered_list_pattern.match(line)
|
| 205 |
+
if ordered_match:
|
| 206 |
+
indent_level = len(ordered_match.group(1)) // 2
|
| 207 |
+
item_text = self._clean_inline_formatting(ordered_match.group(2))
|
| 208 |
+
|
| 209 |
+
if current_list is None or current_list['type'] != 'ordered':
|
| 210 |
+
if current_list:
|
| 211 |
+
lists.append(current_list)
|
| 212 |
+
current_list = {'type': 'ordered', 'items': []}
|
| 213 |
+
|
| 214 |
+
current_list['items'].append({
|
| 215 |
+
'text': item_text,
|
| 216 |
+
'level': indent_level
|
| 217 |
+
})
|
| 218 |
+
continue
|
| 219 |
+
|
| 220 |
+
# If we hit a non-list line and have a current list, save it
|
| 221 |
+
if current_list and line.strip():
|
| 222 |
+
lists.append(current_list)
|
| 223 |
+
current_list = None
|
| 224 |
+
|
| 225 |
+
# Don't forget the last list
|
| 226 |
+
if current_list:
|
| 227 |
+
lists.append(current_list)
|
| 228 |
+
|
| 229 |
+
return lists
|
| 230 |
+
|
| 231 |
+
def _extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
|
| 232 |
+
"""Extract code blocks from content."""
|
| 233 |
+
code_blocks = []
|
| 234 |
+
|
| 235 |
+
for match in self.code_block_pattern.finditer(content):
|
| 236 |
+
language = match.group(1) or 'text'
|
| 237 |
+
code = match.group(2).strip()
|
| 238 |
+
|
| 239 |
+
code_blocks.append({
|
| 240 |
+
'language': language,
|
| 241 |
+
'code': code
|
| 242 |
+
})
|
| 243 |
+
|
| 244 |
+
return code_blocks
|
| 245 |
+
|
| 246 |
+
def _extract_tables(self, content: str) -> List[Dict[str, Any]]:
|
| 247 |
+
"""Extract tables from content."""
|
| 248 |
+
tables = []
|
| 249 |
+
|
| 250 |
+
for match in self.table_pattern.finditer(content):
|
| 251 |
+
header_row = match.group(1).strip()
|
| 252 |
+
body_rows = match.group(2).strip()
|
| 253 |
+
|
| 254 |
+
# Parse header
|
| 255 |
+
headers = [cell.strip() for cell in header_row.split('|') if cell.strip()]
|
| 256 |
+
|
| 257 |
+
# Parse body rows
|
| 258 |
+
rows = []
|
| 259 |
+
for row_line in body_rows.split('\n'):
|
| 260 |
+
if row_line.strip() and '|' in row_line:
|
| 261 |
+
cells = [cell.strip() for cell in row_line.split('|') if cell.strip()]
|
| 262 |
+
if cells:
|
| 263 |
+
rows.append(cells)
|
| 264 |
+
|
| 265 |
+
if headers and rows:
|
| 266 |
+
tables.append({
|
| 267 |
+
'headers': headers,
|
| 268 |
+
'rows': rows,
|
| 269 |
+
'columns': len(headers)
|
| 270 |
+
})
|
| 271 |
+
|
| 272 |
+
return tables
|
| 273 |
+
|
| 274 |
+
def _extract_images(self, content: str) -> List[Dict[str, str]]:
|
| 275 |
+
"""Extract images from content."""
|
| 276 |
+
images = []
|
| 277 |
+
|
| 278 |
+
for match in self.image_pattern.finditer(content):
|
| 279 |
+
alt_text = match.group(1)
|
| 280 |
+
url = match.group(2)
|
| 281 |
+
|
| 282 |
+
images.append({
|
| 283 |
+
'alt_text': alt_text,
|
| 284 |
+
'url': url
|
| 285 |
+
})
|
| 286 |
+
|
| 287 |
+
return images
|
| 288 |
+
|
| 289 |
+
def _extract_links(self, content: str) -> List[Dict[str, str]]:
|
| 290 |
+
"""Extract links from content."""
|
| 291 |
+
links = []
|
| 292 |
+
|
| 293 |
+
for match in self.link_pattern.finditer(content):
|
| 294 |
+
text = match.group(1)
|
| 295 |
+
url = match.group(2)
|
| 296 |
+
|
| 297 |
+
links.append({
|
| 298 |
+
'text': text,
|
| 299 |
+
'url': url
|
| 300 |
+
})
|
| 301 |
+
|
| 302 |
+
return links
|
| 303 |
+
|
| 304 |
+
def _extract_blockquotes(self, content: str) -> List[str]:
|
| 305 |
+
"""Extract blockquotes from content."""
|
| 306 |
+
blockquotes = []
|
| 307 |
+
|
| 308 |
+
for match in self.blockquote_pattern.finditer(content):
|
| 309 |
+
quote_text = match.group(1).strip()
|
| 310 |
+
blockquotes.append(quote_text)
|
| 311 |
+
|
| 312 |
+
return blockquotes
|
| 313 |
+
|
| 314 |
+
def _clean_inline_formatting(self, text: str) -> str:
|
| 315 |
+
"""Clean inline markdown formatting from text."""
|
| 316 |
+
# Remove bold
|
| 317 |
+
text = self.bold_pattern.sub(r'\1', text)
|
| 318 |
+
# Remove italic
|
| 319 |
+
text = self.italic_pattern.sub(r'\1', text)
|
| 320 |
+
# Remove inline code
|
| 321 |
+
text = self.inline_code_pattern.sub(r'\1', text)
|
| 322 |
+
|
| 323 |
+
return text.strip()
|
| 324 |
+
|
| 325 |
+
def _create_hierarchy(self, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
| 326 |
+
"""Create hierarchical structure from flat sections list."""
|
| 327 |
+
if not sections:
|
| 328 |
+
return []
|
| 329 |
+
|
| 330 |
+
result = []
|
| 331 |
+
stack = []
|
| 332 |
+
|
| 333 |
+
for section in sections:
|
| 334 |
+
level = section['level']
|
| 335 |
+
|
| 336 |
+
# Pop from stack until we find a parent at appropriate level
|
| 337 |
+
while stack and stack[-1]['level'] >= level:
|
| 338 |
+
stack.pop()
|
| 339 |
+
|
| 340 |
+
# If we have a parent, add this section as a subsection
|
| 341 |
+
if stack:
|
| 342 |
+
parent = stack[-1]
|
| 343 |
+
if 'subsections' not in parent:
|
| 344 |
+
parent['subsections'] = []
|
| 345 |
+
parent['subsections'].append(section)
|
| 346 |
+
else:
|
| 347 |
+
# This is a top-level section
|
| 348 |
+
result.append(section)
|
| 349 |
+
|
| 350 |
+
# Add this section to the stack
|
| 351 |
+
stack.append(section)
|
| 352 |
+
|
| 353 |
+
return result
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
class MarkdownToHTMLConverter:
|
| 357 |
+
"""Comprehensive markdown to HTML extractor."""
|
| 358 |
+
|
| 359 |
+
def __init__(self):
|
| 360 |
+
"""Initialize the extractor."""
|
| 361 |
+
# Compile regex patterns for better performance
|
| 362 |
+
self.header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
|
| 363 |
+
self.bold_pattern = re.compile(r'\*\*(.+?)\*\*')
|
| 364 |
+
self.italic_pattern = re.compile(r'\*(.+?)\*')
|
| 365 |
+
self.bold_italic_pattern = re.compile(r'\*\*\*(.+?)\*\*\*')
|
| 366 |
+
self.strikethrough_pattern = re.compile(r'~~(.+?)~~')
|
| 367 |
+
self.inline_code_pattern = re.compile(r'`([^`]+)`')
|
| 368 |
+
self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
|
| 369 |
+
self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
|
| 370 |
+
self.horizontal_rule_pattern = re.compile(r'^---+$', re.MULTILINE)
|
| 371 |
+
self.blockquote_pattern = re.compile(r'^>\s+(.+)$', re.MULTILINE)
|
| 372 |
+
|
| 373 |
+
def extract(self, markdown_text: str) -> str:
|
| 374 |
+
"""Convert markdown text to HTML.
|
| 375 |
+
|
| 376 |
+
Args:
|
| 377 |
+
markdown_text: The markdown content to extract
|
| 378 |
+
|
| 379 |
+
Returns:
|
| 380 |
+
HTML string
|
| 381 |
+
"""
|
| 382 |
+
html = markdown_text
|
| 383 |
+
|
| 384 |
+
# Process code blocks first (before other inline processing)
|
| 385 |
+
html = self._process_code_blocks(html)
|
| 386 |
+
|
| 387 |
+
# Process tables
|
| 388 |
+
html = self._process_tables(html)
|
| 389 |
+
|
| 390 |
+
# Process horizontal rules
|
| 391 |
+
html = self._process_horizontal_rules(html)
|
| 392 |
+
|
| 393 |
+
# Process blockquotes
|
| 394 |
+
html = self._process_blockquotes(html)
|
| 395 |
+
|
| 396 |
+
# Process headers
|
| 397 |
+
html = self._process_headers(html)
|
| 398 |
+
|
| 399 |
+
# Process lists
|
| 400 |
+
html = self._process_lists(html)
|
| 401 |
+
|
| 402 |
+
# Process inline elements
|
| 403 |
+
html = self._process_inline_elements(html)
|
| 404 |
+
|
| 405 |
+
# Process paragraphs
|
| 406 |
+
html = self._process_paragraphs(html)
|
| 407 |
+
|
| 408 |
+
return html
|
| 409 |
+
|
| 410 |
+
def _process_code_blocks(self, text: str) -> str:
|
| 411 |
+
"""Process fenced code blocks."""
|
| 412 |
+
# Handle ```code blocks```
|
| 413 |
+
def replace_code_block(match):
|
| 414 |
+
language = match.group(1) or ''
|
| 415 |
+
code = match.group(2)
|
| 416 |
+
lang_class = f' class="language-{language}"' if language else ''
|
| 417 |
+
return f'<pre><code{lang_class}>{self._escape_html(code)}</code></pre>'
|
| 418 |
+
|
| 419 |
+
text = re.sub(r'```(\w+)?\n(.*?)\n```', replace_code_block, text, flags=re.DOTALL)
|
| 420 |
+
|
| 421 |
+
# Handle indented code blocks (4 spaces or tab)
|
| 422 |
+
lines = text.split('\n')
|
| 423 |
+
in_code_block = False
|
| 424 |
+
code_lines = []
|
| 425 |
+
result_lines = []
|
| 426 |
+
|
| 427 |
+
for line in lines:
|
| 428 |
+
if line.startswith(' ') or line.startswith('\t'):
|
| 429 |
+
if not in_code_block:
|
| 430 |
+
in_code_block = True
|
| 431 |
+
code_lines = [line.lstrip()]
|
| 432 |
+
else:
|
| 433 |
+
code_lines.append(line.lstrip())
|
| 434 |
+
else:
|
| 435 |
+
if in_code_block:
|
| 436 |
+
# End code block
|
| 437 |
+
code_content = '\n'.join(code_lines)
|
| 438 |
+
result_lines.append(f'<pre><code>{self._escape_html(code_content)}</code></pre>')
|
| 439 |
+
code_lines = []
|
| 440 |
+
in_code_block = False
|
| 441 |
+
result_lines.append(line)
|
| 442 |
+
|
| 443 |
+
if in_code_block:
|
| 444 |
+
code_content = '\n'.join(code_lines)
|
| 445 |
+
result_lines.append(f'<pre><code>{self._escape_html(code_content)}</code></pre>')
|
| 446 |
+
|
| 447 |
+
return '\n'.join(result_lines)
|
| 448 |
+
|
| 449 |
+
def _process_tables(self, text: str) -> str:
|
| 450 |
+
"""Process markdown tables."""
|
| 451 |
+
lines = text.split('\n')
|
| 452 |
+
result_lines = []
|
| 453 |
+
i = 0
|
| 454 |
+
|
| 455 |
+
while i < len(lines):
|
| 456 |
+
line = lines[i]
|
| 457 |
+
|
| 458 |
+
# Check if this line looks like a table header
|
| 459 |
+
if '|' in line and i + 1 < len(lines) and '|' in lines[i + 1]:
|
| 460 |
+
# Check if next line is separator
|
| 461 |
+
next_line = lines[i + 1]
|
| 462 |
+
if re.match(r'^\s*\|[\s\-:|]+\|\s*$', next_line):
|
| 463 |
+
# This is a table
|
| 464 |
+
table_lines = [line]
|
| 465 |
+
j = i + 1
|
| 466 |
+
|
| 467 |
+
# Collect all table rows
|
| 468 |
+
while j < len(lines) and '|' in lines[j]:
|
| 469 |
+
table_lines.append(lines[j])
|
| 470 |
+
j += 1
|
| 471 |
+
|
| 472 |
+
# Convert table to HTML
|
| 473 |
+
html_table = self._convert_table_to_html(table_lines)
|
| 474 |
+
result_lines.append(html_table)
|
| 475 |
+
i = j
|
| 476 |
+
continue
|
| 477 |
+
|
| 478 |
+
result_lines.append(line)
|
| 479 |
+
i += 1
|
| 480 |
+
|
| 481 |
+
return '\n'.join(result_lines)
|
| 482 |
+
|
| 483 |
+
def _convert_table_to_html(self, table_lines: List[str]) -> str:
|
| 484 |
+
"""Convert table lines to HTML table."""
|
| 485 |
+
if len(table_lines) < 2:
|
| 486 |
+
return table_lines[0] if table_lines else ''
|
| 487 |
+
|
| 488 |
+
html_parts = ['<table>']
|
| 489 |
+
|
| 490 |
+
# Process header
|
| 491 |
+
header_cells = [cell.strip() for cell in table_lines[0].split('|')[1:-1]]
|
| 492 |
+
html_parts.append('<thead><tr>')
|
| 493 |
+
for cell in header_cells:
|
| 494 |
+
html_parts.append(f'<th>{self._escape_html(cell)}</th>')
|
| 495 |
+
html_parts.append('</tr></thead>')
|
| 496 |
+
|
| 497 |
+
# Process body (skip separator line)
|
| 498 |
+
html_parts.append('<tbody>')
|
| 499 |
+
for line in table_lines[2:]:
|
| 500 |
+
cells = [cell.strip() for cell in line.split('|')[1:-1]]
|
| 501 |
+
html_parts.append('<tr>')
|
| 502 |
+
for cell in cells:
|
| 503 |
+
html_parts.append(f'<td>{self._escape_html(cell)}</td>')
|
| 504 |
+
html_parts.append('</tr>')
|
| 505 |
+
html_parts.append('</tbody>')
|
| 506 |
+
|
| 507 |
+
html_parts.append('</table>')
|
| 508 |
+
return '\n'.join(html_parts)
|
| 509 |
+
|
| 510 |
+
def _process_horizontal_rules(self, text: str) -> str:
|
| 511 |
+
"""Process horizontal rules."""
|
| 512 |
+
return self.horizontal_rule_pattern.sub('<hr>', text)
|
| 513 |
+
|
| 514 |
+
def _process_blockquotes(self, text: str) -> str:
|
| 515 |
+
"""Process blockquotes."""
|
| 516 |
+
lines = text.split('\n')
|
| 517 |
+
result_lines = []
|
| 518 |
+
i = 0
|
| 519 |
+
|
| 520 |
+
while i < len(lines):
|
| 521 |
+
line = lines[i]
|
| 522 |
+
|
| 523 |
+
if line.startswith('> '):
|
| 524 |
+
# Start blockquote
|
| 525 |
+
quote_lines = [line[2:]] # Remove '> '
|
| 526 |
+
j = i + 1
|
| 527 |
+
|
| 528 |
+
# Collect all quote lines
|
| 529 |
+
while j < len(lines) and (lines[j].startswith('> ') or lines[j].strip() == ''):
|
| 530 |
+
if lines[j].startswith('> '):
|
| 531 |
+
quote_lines.append(lines[j][2:])
|
| 532 |
+
else:
|
| 533 |
+
quote_lines.append('')
|
| 534 |
+
j += 1
|
| 535 |
+
|
| 536 |
+
# Convert to HTML
|
| 537 |
+
quote_content = '\n'.join(quote_lines)
|
| 538 |
+
quote_html = self._process_inline_elements(quote_content)
|
| 539 |
+
result_lines.append(f'<blockquote>{quote_html}</blockquote>')
|
| 540 |
+
i = j
|
| 541 |
+
continue
|
| 542 |
+
|
| 543 |
+
result_lines.append(line)
|
| 544 |
+
i += 1
|
| 545 |
+
|
| 546 |
+
return '\n'.join(result_lines)
|
| 547 |
+
|
| 548 |
+
def _process_headers(self, text: str) -> str:
|
| 549 |
+
"""Process markdown headers."""
|
| 550 |
+
def replace_header(match):
|
| 551 |
+
level = len(match.group(1))
|
| 552 |
+
content = match.group(2)
|
| 553 |
+
return f'<h{level}>{self._escape_html(content)}</h{level}>'
|
| 554 |
+
|
| 555 |
+
return self.header_pattern.sub(replace_header, text)
|
| 556 |
+
|
| 557 |
+
def _process_lists(self, text: str) -> str:
|
| 558 |
+
"""Process ordered and unordered lists."""
|
| 559 |
+
lines = text.split('\n')
|
| 560 |
+
result_lines = []
|
| 561 |
+
i = 0
|
| 562 |
+
|
| 563 |
+
while i < len(lines):
|
| 564 |
+
line = lines[i]
|
| 565 |
+
|
| 566 |
+
# Check for unordered list
|
| 567 |
+
if re.match(r'^[\s]*[-*+]\s+', line):
|
| 568 |
+
list_lines = self._collect_list_items(lines, i, r'^[\s]*[-*+]\s+')
|
| 569 |
+
html_list = self._convert_list_to_html(list_lines, 'ul')
|
| 570 |
+
result_lines.append(html_list)
|
| 571 |
+
i += len(list_lines)
|
| 572 |
+
continue
|
| 573 |
+
|
| 574 |
+
# Check for ordered list
|
| 575 |
+
elif re.match(r'^[\s]*\d+\.\s+', line):
|
| 576 |
+
list_lines = self._collect_list_items(lines, i, r'^[\s]*\d+\.\s+')
|
| 577 |
+
html_list = self._convert_list_to_html(list_lines, 'ol')
|
| 578 |
+
result_lines.append(html_list)
|
| 579 |
+
i += len(list_lines)
|
| 580 |
+
continue
|
| 581 |
+
|
| 582 |
+
result_lines.append(line)
|
| 583 |
+
i += 1
|
| 584 |
+
|
| 585 |
+
return '\n'.join(result_lines)
|
| 586 |
+
|
| 587 |
+
def _collect_list_items(self, lines: List[str], start_idx: int, pattern: str) -> List[str]:
|
| 588 |
+
"""Collect consecutive list items."""
|
| 589 |
+
items = []
|
| 590 |
+
i = start_idx
|
| 591 |
+
|
| 592 |
+
while i < len(lines):
|
| 593 |
+
line = lines[i]
|
| 594 |
+
if re.match(pattern, line):
|
| 595 |
+
items.append(line)
|
| 596 |
+
i += 1
|
| 597 |
+
elif line.strip() == '':
|
| 598 |
+
# Empty line might be part of list item
|
| 599 |
+
items.append(line)
|
| 600 |
+
i += 1
|
| 601 |
+
else:
|
| 602 |
+
break
|
| 603 |
+
|
| 604 |
+
return items
|
| 605 |
+
|
| 606 |
+
def _convert_list_to_html(self, list_lines: List[str], list_type: str) -> str:
|
| 607 |
+
"""Convert list lines to HTML list."""
|
| 608 |
+
html_parts = [f'<{list_type}>']
|
| 609 |
+
|
| 610 |
+
for line in list_lines:
|
| 611 |
+
if line.strip() == '':
|
| 612 |
+
continue
|
| 613 |
+
|
| 614 |
+
# Extract list item content
|
| 615 |
+
if list_type == 'ul':
|
| 616 |
+
content = re.sub(r'^[\s]*[-*+]\s+', '', line)
|
| 617 |
+
else:
|
| 618 |
+
content = re.sub(r'^[\s]*\d+\.\s+', '', line)
|
| 619 |
+
|
| 620 |
+
# Process inline elements in list item
|
| 621 |
+
content = self._process_inline_elements(content)
|
| 622 |
+
html_parts.append(f'<li>{content}</li>')
|
| 623 |
+
|
| 624 |
+
html_parts.append(f'</{list_type}>')
|
| 625 |
+
return '\n'.join(html_parts)
|
| 626 |
+
|
| 627 |
+
def _process_inline_elements(self, text: str) -> str:
|
| 628 |
+
"""Process inline markdown elements."""
|
| 629 |
+
# Process bold and italic (order matters)
|
| 630 |
+
text = self.bold_italic_pattern.sub(r'<strong><em>\1</em></strong>', text)
|
| 631 |
+
text = self.bold_pattern.sub(r'<strong>\1</strong>', text)
|
| 632 |
+
text = self.italic_pattern.sub(r'<em>\1</em>', text)
|
| 633 |
+
|
| 634 |
+
# Process strikethrough
|
| 635 |
+
text = self.strikethrough_pattern.sub(r'<del>\1</del>', text)
|
| 636 |
+
|
| 637 |
+
# Process inline code
|
| 638 |
+
text = self.inline_code_pattern.sub(r'<code>\1</code>', text)
|
| 639 |
+
|
| 640 |
+
# Process links
|
| 641 |
+
text = self.link_pattern.sub(r'<a href="\2">\1</a>', text)
|
| 642 |
+
|
| 643 |
+
# Process images
|
| 644 |
+
text = self.image_pattern.sub(r'<img src="\2" alt="\1">', text)
|
| 645 |
+
|
| 646 |
+
return text
|
| 647 |
+
|
| 648 |
+
def _process_paragraphs(self, text: str) -> str:
|
| 649 |
+
"""Process paragraphs by wrapping non-empty lines in <p> tags."""
|
| 650 |
+
lines = text.split('\n')
|
| 651 |
+
result_lines = []
|
| 652 |
+
current_paragraph = []
|
| 653 |
+
|
| 654 |
+
for line in lines:
|
| 655 |
+
if line.strip() == '':
|
| 656 |
+
if current_paragraph:
|
| 657 |
+
# End current paragraph
|
| 658 |
+
paragraph_content = ' '.join(current_paragraph)
|
| 659 |
+
result_lines.append(f'<p>{paragraph_content}</p>')
|
| 660 |
+
current_paragraph = []
|
| 661 |
+
else:
|
| 662 |
+
# Check if line is already an HTML block element
|
| 663 |
+
if re.match(r'^<(h[1-6]|p|div|blockquote|pre|table|ul|ol|li|hr)', line.strip()):
|
| 664 |
+
# Flush current paragraph if any
|
| 665 |
+
if current_paragraph:
|
| 666 |
+
paragraph_content = ' '.join(current_paragraph)
|
| 667 |
+
result_lines.append(f'<p>{paragraph_content}</p>')
|
| 668 |
+
current_paragraph = []
|
| 669 |
+
result_lines.append(line)
|
| 670 |
+
else:
|
| 671 |
+
current_paragraph.append(line)
|
| 672 |
+
|
| 673 |
+
# Handle any remaining paragraph
|
| 674 |
+
if current_paragraph:
|
| 675 |
+
paragraph_content = ' '.join(current_paragraph)
|
| 676 |
+
result_lines.append(f'<p>{paragraph_content}</p>')
|
| 677 |
+
|
| 678 |
+
return '\n'.join(result_lines)
|
| 679 |
+
|
| 680 |
+
def _escape_html(self, text: str) -> str:
|
| 681 |
+
"""Escape HTML special characters."""
|
| 682 |
+
return (text.replace('&', '&')
|
| 683 |
+
.replace('<', '<')
|
| 684 |
+
.replace('>', '>')
|
| 685 |
+
.replace('"', '"')
|
| 686 |
+
.replace("'", '''))
|
| 687 |
+
|
| 688 |
+
|
| 689 |
+
class ConversionResult:
|
| 690 |
+
"""Result object with methods to export to different formats."""
|
| 691 |
+
|
| 692 |
+
def __init__(self, content: str, metadata: Optional[Dict[str, Any]] = None):
|
| 693 |
+
"""Initialize the conversion result.
|
| 694 |
+
|
| 695 |
+
Args:
|
| 696 |
+
content: The converted content as string
|
| 697 |
+
metadata: Optional metadata about the conversion
|
| 698 |
+
"""
|
| 699 |
+
self.content = content
|
| 700 |
+
self.metadata = metadata or {}
|
| 701 |
+
self._html_converter = MarkdownToHTMLConverter()
|
| 702 |
+
self._json_parser = MarkdownToJSONParser()
|
| 703 |
+
|
| 704 |
+
def extract_markdown(self) -> str:
|
| 705 |
+
"""Export as markdown.
|
| 706 |
+
|
| 707 |
+
Returns:
|
| 708 |
+
The content formatted as markdown
|
| 709 |
+
"""
|
| 710 |
+
return self.content
|
| 711 |
+
|
| 712 |
+
def extract_html(self) -> str:
|
| 713 |
+
"""Export as HTML.
|
| 714 |
+
|
| 715 |
+
Returns:
|
| 716 |
+
The content formatted as HTML
|
| 717 |
+
"""
|
| 718 |
+
# Convert markdown content to HTML using the comprehensive extractor
|
| 719 |
+
html_content = self._html_converter.extract(self.content)
|
| 720 |
+
|
| 721 |
+
# Wrap in HTML structure with Nanonets design system
|
| 722 |
+
return f"""<!DOCTYPE html>
|
| 723 |
+
<html lang="en">
|
| 724 |
+
<head>
|
| 725 |
+
<meta charset="UTF-8">
|
| 726 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 727 |
+
<title>Converted Document</title>
|
| 728 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 729 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 730 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
|
| 731 |
+
<style>
|
| 732 |
+
* {{
|
| 733 |
+
box-sizing: border-box;
|
| 734 |
+
}}
|
| 735 |
+
|
| 736 |
+
body {{
|
| 737 |
+
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
| 738 |
+
line-height: 1.6;
|
| 739 |
+
color: #1F2129;
|
| 740 |
+
background-color: #FFFFFF;
|
| 741 |
+
margin: 0;
|
| 742 |
+
padding: 2rem;
|
| 743 |
+
max-width: 1200px;
|
| 744 |
+
margin: 0 auto;
|
| 745 |
+
}}
|
| 746 |
+
|
| 747 |
+
.content {{
|
| 748 |
+
background: #FFFFFF;
|
| 749 |
+
padding: 2rem;
|
| 750 |
+
border-radius: 8px;
|
| 751 |
+
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
|
| 752 |
+
}}
|
| 753 |
+
|
| 754 |
+
/* Typography */
|
| 755 |
+
h1, h2, h3, h4, h5, h6 {{
|
| 756 |
+
font-family: 'Inter', sans-serif;
|
| 757 |
+
color: #1D2554;
|
| 758 |
+
margin-top: 2rem;
|
| 759 |
+
margin-bottom: 1rem;
|
| 760 |
+
font-weight: 600;
|
| 761 |
+
line-height: 1.3;
|
| 762 |
+
}}
|
| 763 |
+
|
| 764 |
+
h1 {{ font-size: 48px; letter-spacing: -0.02em; margin-top: 0; }}
|
| 765 |
+
h2 {{ font-size: 36px; letter-spacing: -0.01em; }}
|
| 766 |
+
h3 {{ font-size: 24px; }}
|
| 767 |
+
h4 {{ font-size: 20px; }}
|
| 768 |
+
h5 {{ font-size: 16px; }}
|
| 769 |
+
h6 {{ font-size: 14px; }}
|
| 770 |
+
|
| 771 |
+
p {{
|
| 772 |
+
font-size: 16px;
|
| 773 |
+
line-height: 1.6;
|
| 774 |
+
margin-bottom: 1rem;
|
| 775 |
+
color: #1F2129;
|
| 776 |
+
}}
|
| 777 |
+
|
| 778 |
+
/* Lists */
|
| 779 |
+
ul, ol {{
|
| 780 |
+
margin: 1rem 0;
|
| 781 |
+
padding-left: 2rem;
|
| 782 |
+
}}
|
| 783 |
+
|
| 784 |
+
li {{
|
| 785 |
+
margin-bottom: 0.5rem;
|
| 786 |
+
line-height: 1.6;
|
| 787 |
+
}}
|
| 788 |
+
|
| 789 |
+
/* Code */
|
| 790 |
+
code {{
|
| 791 |
+
background-color: #F8FAFF;
|
| 792 |
+
color: #3A4DB2;
|
| 793 |
+
padding: 0.2rem 0.4rem;
|
| 794 |
+
border-radius: 4px;
|
| 795 |
+
font-family: 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas, 'Courier New', monospace;
|
| 796 |
+
font-size: 0.9em;
|
| 797 |
+
border: 1px solid #EAEDFF;
|
| 798 |
+
}}
|
| 799 |
+
|
| 800 |
+
pre {{
|
| 801 |
+
background-color: #F8FAFF;
|
| 802 |
+
border: 1px solid #EAEDFF;
|
| 803 |
+
border-radius: 8px;
|
| 804 |
+
padding: 1.5rem;
|
| 805 |
+
overflow-x: auto;
|
| 806 |
+
margin: 1.5rem 0;
|
| 807 |
+
}}
|
| 808 |
+
|
| 809 |
+
pre code {{
|
| 810 |
+
background: none;
|
| 811 |
+
border: none;
|
| 812 |
+
padding: 0;
|
| 813 |
+
color: #1F2129;
|
| 814 |
+
}}
|
| 815 |
+
|
| 816 |
+
/* Tables */
|
| 817 |
+
table {{
|
| 818 |
+
border-collapse: collapse;
|
| 819 |
+
width: 100%;
|
| 820 |
+
margin: 1.5rem 0;
|
| 821 |
+
border-radius: 8px;
|
| 822 |
+
overflow: hidden;
|
| 823 |
+
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
|
| 824 |
+
}}
|
| 825 |
+
|
| 826 |
+
th, td {{
|
| 827 |
+
border: 1px solid #EAEDFF;
|
| 828 |
+
padding: 0.75rem;
|
| 829 |
+
text-align: left;
|
| 830 |
+
vertical-align: top;
|
| 831 |
+
}}
|
| 832 |
+
|
| 833 |
+
th {{
|
| 834 |
+
background-color: #F2F4FF;
|
| 835 |
+
color: #1D2554;
|
| 836 |
+
font-weight: 600;
|
| 837 |
+
font-size: 14px;
|
| 838 |
+
}}
|
| 839 |
+
|
| 840 |
+
td {{
|
| 841 |
+
background-color: #FFFFFF;
|
| 842 |
+
font-size: 14px;
|
| 843 |
+
}}
|
| 844 |
+
|
| 845 |
+
tr:nth-child(even) td {{
|
| 846 |
+
background-color: #F8FAFF;
|
| 847 |
+
}}
|
| 848 |
+
|
| 849 |
+
/* Links */
|
| 850 |
+
a {{
|
| 851 |
+
color: #546FFF;
|
| 852 |
+
text-decoration: none;
|
| 853 |
+
border-bottom: 1px solid transparent;
|
| 854 |
+
transition: border-bottom-color 0.2s ease;
|
| 855 |
+
}}
|
| 856 |
+
|
| 857 |
+
a:hover {{
|
| 858 |
+
border-bottom-color: #546FFF;
|
| 859 |
+
}}
|
| 860 |
+
|
| 861 |
+
/* Images */
|
| 862 |
+
img {{
|
| 863 |
+
max-width: 100%;
|
| 864 |
+
height: auto;
|
| 865 |
+
border-radius: 8px;
|
| 866 |
+
margin: 1rem 0;
|
| 867 |
+
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
|
| 868 |
+
}}
|
| 869 |
+
|
| 870 |
+
/* Blockquotes */
|
| 871 |
+
blockquote {{
|
| 872 |
+
border-left: 4px solid #546FFF;
|
| 873 |
+
margin: 1.5rem 0;
|
| 874 |
+
padding: 1rem 1.5rem;
|
| 875 |
+
background-color: #F8FAFF;
|
| 876 |
+
border-radius: 0 8px 8px 0;
|
| 877 |
+
font-style: italic;
|
| 878 |
+
}}
|
| 879 |
+
|
| 880 |
+
blockquote p {{
|
| 881 |
+
margin: 0;
|
| 882 |
+
color: #3A4DB2;
|
| 883 |
+
}}
|
| 884 |
+
|
| 885 |
+
/* Horizontal rules */
|
| 886 |
+
hr {{
|
| 887 |
+
border: none;
|
| 888 |
+
height: 1px;
|
| 889 |
+
background-color: #EAEDFF;
|
| 890 |
+
margin: 2rem 0;
|
| 891 |
+
}}
|
| 892 |
+
|
| 893 |
+
/* Emphasis */
|
| 894 |
+
strong {{
|
| 895 |
+
font-weight: 600;
|
| 896 |
+
color: #1D2554;
|
| 897 |
+
}}
|
| 898 |
+
|
| 899 |
+
em {{
|
| 900 |
+
font-style: italic;
|
| 901 |
+
color: #3A4DB2;
|
| 902 |
+
}}
|
| 903 |
+
|
| 904 |
+
del {{
|
| 905 |
+
text-decoration: line-through;
|
| 906 |
+
color: #676767;
|
| 907 |
+
}}
|
| 908 |
+
|
| 909 |
+
/* Responsive design */
|
| 910 |
+
@media (max-width: 768px) {{
|
| 911 |
+
body {{
|
| 912 |
+
padding: 1rem;
|
| 913 |
+
}}
|
| 914 |
+
|
| 915 |
+
.content {{
|
| 916 |
+
padding: 1rem;
|
| 917 |
+
}}
|
| 918 |
+
|
| 919 |
+
h1 {{ font-size: 32px; }}
|
| 920 |
+
h2 {{ font-size: 28px; }}
|
| 921 |
+
h3 {{ font-size: 20px; }}
|
| 922 |
+
|
| 923 |
+
table {{
|
| 924 |
+
font-size: 12px;
|
| 925 |
+
}}
|
| 926 |
+
|
| 927 |
+
th, td {{
|
| 928 |
+
padding: 0.5rem;
|
| 929 |
+
}}
|
| 930 |
+
}}
|
| 931 |
+
</style>
|
| 932 |
+
</head>
|
| 933 |
+
<body>
|
| 934 |
+
<div class="content">
|
| 935 |
+
{html_content}
|
| 936 |
+
</div>
|
| 937 |
+
</body>
|
| 938 |
+
</html>"""
|
| 939 |
+
|
| 940 |
+
def extract_data(self, specified_fields: Optional[list] = None, json_schema: Optional[dict] = None,
                 ollama_url: str = "http://localhost:11434", ollama_model: str = "llama3.2") -> Dict[str, Any]:
    """Convert content to JSON format.

    Extraction is attempted in priority order, each stage falling through
    to the next on failure rather than raising:
      1. Targeted Ollama extraction, when ``specified_fields`` or
         ``json_schema`` is provided.
      2. Whole-document Ollama conversion to structured JSON.
      3. The built-in ``self._json_parser`` as the final fallback.
    A top-level failure returns an error dictionary instead of raising.

    Args:
        specified_fields: List of specific fields to extract (uses Ollama)
        json_schema: JSON schema to conform to (uses Ollama)
        ollama_url: Ollama server URL for local processing
        ollama_model: Model name for local processing

    Returns:
        Dictionary containing the JSON representation. The ``format`` key
        records which extraction path produced the result
        ("local_specified_fields", "local_json_schema",
        "ollama_structured_json", "structured_json", or "error").
    """
    try:
        # If specific fields or schema are requested, use Ollama extraction
        if specified_fields or json_schema:
            try:
                # Imported lazily so Ollama support remains optional.
                from docstrange.services import OllamaFieldExtractor
                extractor = OllamaFieldExtractor(base_url=ollama_url, model=ollama_model)

                if extractor.is_available():
                    if specified_fields:
                        extracted_data = extractor.extract_fields(self.content, specified_fields)
                        return {
                            "extracted_fields": extracted_data,
                            "requested_fields": specified_fields,
                            **self.metadata,
                            "format": "local_specified_fields",
                            "extractor": "ollama"
                        }
                    elif json_schema:
                        extracted_data = extractor.extract_with_schema(self.content, json_schema)
                        return {
                            "extracted_data": extracted_data,
                            "schema": json_schema,
                            **self.metadata,
                            "format": "local_json_schema",
                            "extractor": "ollama"
                        }
                else:
                    # NOTE: falls through to the general Ollama pass below,
                    # which re-checks availability before the parser fallback.
                    logger.warning("Ollama not available for field extraction, falling back to standard parsing")
            except Exception as e:
                logger.warning(f"Ollama extraction failed: {e}, falling back to standard parsing")

        # For general JSON conversion, try Ollama first for better document understanding
        try:
            from docstrange.services import OllamaFieldExtractor
            extractor = OllamaFieldExtractor(base_url=ollama_url, model=ollama_model)

            if extractor.is_available():
                # Ask Ollama to extract the entire document to structured JSON
                document_json = extractor.extract_document_json(self.content)
                return {
                    **document_json,
                    **self.metadata,
                    "format": "ollama_structured_json",
                    "extractor": "ollama"
                }
            else:
                logger.info("Ollama not available, using fallback JSON parser")
        except Exception as e:
            logger.warning(f"Ollama document conversion failed: {e}, using fallback parser")

        # Fallback to original parsing logic
        parsed_content = self._json_parser.parse(self.content)
        return {
            **parsed_content,
            **self.metadata,
            "format": "structured_json"
        }

    except Exception as e:
        # Never raise: surface the failure as a structured error payload.
        logger.error(f"JSON conversion failed: {e}")
        return {
            "error": f"Failed to extract to JSON: {str(e)}",
            "raw_content": self.content,
            **self.metadata,
            "format": "error"
        }
|
| 1019 |
+
|
| 1020 |
+
def extract_text(self) -> str:
    """Return the converted document as plain text.

    Returns:
        str: the raw converted content, unchanged.
    """
    plain_text = self.content
    return plain_text
|
| 1027 |
+
|
| 1028 |
+
def extract_csv(self, table_index: int = 0, include_all_tables: bool = False) -> str:
    """Export tables as CSV format.

    Args:
        table_index: Which table to export (0-based index). Default is 0 (first table).
        include_all_tables: If True, export all tables with separators. Default is False.

    Returns:
        CSV formatted string of the table(s)

    Raises:
        ValueError: If no tables are found or table_index is out of range
    """
    # Parse the content to extract tables from the structured JSON form.
    json_data = self.extract_data()

    tables = []

    def _collect_tables(sections):
        """Accumulate every 'tables' payload, recursing into subsections."""
        for section in sections:
            content = section.get('content', {})
            if 'tables' in content:
                tables.extend(content['tables'])
            if 'subsections' in section:
                _collect_tables(section['subsections'])

    if 'document' in json_data and 'sections' in json_data['document']:
        _collect_tables(json_data['document']['sections'])

    if not tables:
        # If no structured tables found, try to parse markdown tables directly
        tables = self._extract_markdown_tables_directly(self.content)

    if not tables:
        raise ValueError("No tables found in the document content")

    def _write_table(writer, table):
        """Write one table (optional header row, then data rows) to the CSV writer."""
        if 'headers' in table and table['headers']:
            writer.writerow(table['headers'])
        if 'rows' in table:
            for row in table['rows']:
                writer.writerow(row)

    csv_output = io.StringIO()
    writer = csv.writer(csv_output)

    if include_all_tables:
        # Export all tables, separated by a blank line, a banner, and a blank line.
        for i, table in enumerate(tables):
            if i > 0:
                writer.writerow([])
                writer.writerow([f"=== Table {i + 1} ==="])
                writer.writerow([])
            _write_table(writer, table)
    else:
        # Export a single table; reject indices outside [0, len).
        # Negative indices are invalid (they would silently index from the end).
        if table_index < 0 or table_index >= len(tables):
            raise ValueError(f"Table index {table_index} out of range. Found {len(tables)} table(s)")
        _write_table(writer, tables[table_index])

    return csv_output.getvalue()
|
| 1107 |
+
|
| 1108 |
+
def _extract_markdown_tables_directly(self, content: str) -> List[Dict[str, Any]]:
|
| 1109 |
+
"""Extract tables directly from markdown content as fallback."""
|
| 1110 |
+
tables = []
|
| 1111 |
+
table_pattern = re.compile(r'\|(.+)\|\s*\n\|[-\s|:]+\|\s*\n((?:\|.+\|\s*\n?)*)', re.MULTILINE)
|
| 1112 |
+
|
| 1113 |
+
for match in table_pattern.finditer(content):
|
| 1114 |
+
header_row = match.group(1).strip()
|
| 1115 |
+
body_rows = match.group(2).strip()
|
| 1116 |
+
|
| 1117 |
+
# Parse header
|
| 1118 |
+
headers = [cell.strip() for cell in header_row.split('|') if cell.strip()]
|
| 1119 |
+
|
| 1120 |
+
# Parse body rows
|
| 1121 |
+
rows = []
|
| 1122 |
+
for row_line in body_rows.split('\n'):
|
| 1123 |
+
if row_line.strip() and '|' in row_line:
|
| 1124 |
+
cells = [cell.strip() for cell in row_line.split('|') if cell.strip()]
|
| 1125 |
+
if cells:
|
| 1126 |
+
rows.append(cells)
|
| 1127 |
+
|
| 1128 |
+
if headers and rows:
|
| 1129 |
+
tables.append({
|
| 1130 |
+
'headers': headers,
|
| 1131 |
+
'rows': rows,
|
| 1132 |
+
'columns': len(headers)
|
| 1133 |
+
})
|
| 1134 |
+
|
| 1135 |
+
return tables
|
| 1136 |
+
|
| 1137 |
+
def __str__(self) -> str:
|
| 1138 |
+
"""String representation of the result."""
|
| 1139 |
+
return self.content
|
| 1140 |
+
|
| 1141 |
+
def __repr__(self) -> str:
|
| 1142 |
+
"""Representation of the result object."""
|
| 1143 |
+
return f"ConversionResult(content='{self.content[:50]}...', metadata={self.metadata})"
|
docstrange/services/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Services for authentication, API key pooling, and local LLM processing."""
|
| 2 |
+
|
| 3 |
+
from .ollama_service import OllamaFieldExtractor
|
| 4 |
+
from .api_key_pool import (
|
| 5 |
+
ApiKeyPool,
|
| 6 |
+
get_pool,
|
| 7 |
+
add_api_key,
|
| 8 |
+
remove_api_key,
|
| 9 |
+
list_api_keys,
|
| 10 |
+
get_available_key,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"OllamaFieldExtractor",
|
| 15 |
+
"ApiKeyPool",
|
| 16 |
+
"get_pool",
|
| 17 |
+
"add_api_key",
|
| 18 |
+
"remove_api_key",
|
| 19 |
+
"list_api_keys",
|
| 20 |
+
"get_available_key",
|
| 21 |
+
]
|
docstrange/services/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (511 Bytes). View file
|
|
|
docstrange/services/__pycache__/api_key_pool.cpython-310.pyc
ADDED
|
Binary file (8.41 kB). View file
|
|
|
docstrange/services/__pycache__/ollama_service.cpython-310.pyc
ADDED
|
Binary file (8.45 kB). View file
|
|
|
docstrange/services/api_key_pool.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
API Key Pool Manager for DocStrange.
|
| 3 |
+
|
| 4 |
+
Manages a pool of Nanonets API keys with automatic rotation on rate limit (429).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import time
|
| 10 |
+
import threading
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Optional, List, Dict, Any
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class KeyStatus:
    """Lifecycle states for a pooled API key (plain string constants)."""
    ACTIVE = "active"            # key can be used
    RATE_LIMITED = "rate_limited"  # key hit a 429 and is waiting for its reset time
    EXPIRED = "expired"          # key is permanently unusable


class ApiKeyEntry:
    """Represents a single API key in the pool with its state."""

    def __init__(self, key: str, source: str = "manual"):
        self.key = key
        self.source = source  # "manual", "env", "config", "credentials"
        self.status = KeyStatus.ACTIVE
        self.rate_limited_at = None  # epoch time the rate limit was observed
        self.reset_at = None  # When the rate limit resets (epoch time)
        self.requests_made = 0  # total uses recorded via record_use()
        self.last_used = None  # epoch time of the most recent use

    def mark_rate_limited(self, reset_after_seconds: int = 3600):
        """Mark this key as rate-limited for ``reset_after_seconds`` seconds.

        Takes a single timestamp so ``rate_limited_at`` and ``reset_at`` are
        exactly ``reset_after_seconds`` apart (the previous implementation
        called time.time() twice, allowing the two values to drift).
        """
        now = time.time()
        self.status = KeyStatus.RATE_LIMITED
        self.rate_limited_at = now
        self.reset_at = now + reset_after_seconds
        logger.warning(f"API key {self.key[:8]}... rate limited, resets at {self.reset_at}")

    def is_available(self) -> bool:
        """Check if this key is available for use.

        A rate-limited key whose reset time has passed is transparently
        promoted back to ACTIVE. EXPIRED keys are never available.
        """
        if self.status == KeyStatus.ACTIVE:
            return True
        if self.status == KeyStatus.RATE_LIMITED and self.reset_at:
            if time.time() >= self.reset_at:
                self.status = KeyStatus.ACTIVE
                self.rate_limited_at = None
                self.reset_at = None
                return True
        return False

    def record_use(self):
        """Record that this key was used."""
        self.requests_made += 1
        self.last_used = time.time()
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class ApiKeyPool:
    """
    Manages a pool of API keys with automatic rotation.

    When a key hits rate limit (429), it's marked as unavailable and the next
    key in the pool is tried. When all keys are exhausted, signals fallback.
    """

    # Class-level singleton slot and the lock guarding its creation.
    _instance = None
    _lock = threading.Lock()

    def __init__(self):
        # Pool entries plus a round-robin cursor; _lock_pool guards both.
        self._keys: List[ApiKeyEntry] = []
        self._current_index = 0
        self._lock_pool = threading.Lock()
        self._config_path = Path.home() / ".docstrange" / "api_keys.json"
        self._load_config()

    @classmethod
    def get_instance(cls) -> "ApiKeyPool":
        """Get singleton instance (double-checked locking)."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    def _load_config(self):
        """Load API keys from the config file and the NANONETS_API_KEYS env var."""
        try:
            if self._config_path.exists():
                with open(self._config_path, 'r') as f:
                    config = json.load(f)

                keys = config.get("api_keys", [])
                for key_entry in keys:
                    # Entries may be bare strings or {"key": ..., "source": ...} dicts.
                    if isinstance(key_entry, str):
                        self.add_key(key_entry, source="config")
                    elif isinstance(key_entry, dict) and "key" in key_entry:
                        self.add_key(key_entry["key"], source=key_entry.get("source", "config"))

                logger.info(f"Loaded {len(self._keys)} API keys from config")
        except Exception as e:
            logger.warning(f"Failed to load API key config: {e}")

        # Also check environment variable for a comma-separated list of keys
        env_keys = os.environ.get('NANONETS_API_KEYS', '')
        if env_keys:
            for key in env_keys.split(','):
                key = key.strip()
                if key:
                    self.add_key(key, source="env")

    def save_config(self):
        """Save API keys to the config file with owner-only permissions."""
        try:
            config_dir = self._config_path.parent
            config_dir.mkdir(parents=True, exist_ok=True)

            keys_data = [{"key": entry.key, "source": entry.source} for entry in self._keys]

            # Create/truncate the file with mode 0o600 up front so the keys
            # are never world-readable, not even between write and chmod.
            fd = os.open(self._config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
            with os.fdopen(fd, 'w') as f:
                json.dump({"api_keys": keys_data}, f, indent=2)

            # O_CREAT's mode only applies to newly created files; tighten
            # permissions on files that already existed as well.
            os.chmod(self._config_path, 0o600)
            logger.info(f"Saved {len(keys_data)} API keys to config")
        except Exception as e:
            logger.error(f"Failed to save API key config: {e}")

    def add_key(self, key: str, source: str = "manual") -> bool:
        """Add an API key to the pool. Returns False for duplicates."""
        with self._lock_pool:
            # Check for duplicates
            for entry in self._keys:
                if entry.key == key:
                    return False

            self._keys.append(ApiKeyEntry(key, source))
            logger.info(f"Added API key from {source} to pool (total: {len(self._keys)})")
            return True

    def remove_key(self, key: str) -> bool:
        """Remove an API key from the pool. Returns True if it was present."""
        with self._lock_pool:
            for i, entry in enumerate(self._keys):
                if entry.key == key:
                    self._keys.pop(i)
                    return True
            return False

    def get_next_key(self) -> Optional[str]:
        """
        Get the next available API key.

        Scans round-robin from the last-used position so usage spreads
        across the pool. Returns None if all keys are rate-limited.
        """
        with self._lock_pool:
            if not self._keys:
                return None

            # Try to find an available key starting from current index
            total_keys = len(self._keys)
            for i in range(total_keys):
                idx = (self._current_index + i) % total_keys
                if self._keys[idx].is_available():
                    self._current_index = idx
                    self._keys[idx].record_use()
                    return self._keys[idx].key

            return None

    def mark_key_rate_limited(self, key: str, reset_after_seconds: int = 3600):
        """Mark a specific key as rate-limited."""
        with self._lock_pool:
            for entry in self._keys:
                if entry.key == key:
                    entry.mark_rate_limited(reset_after_seconds)
                    break

    def has_available_keys(self) -> bool:
        """Check if any API keys are available."""
        with self._lock_pool:
            return any(k.is_available() for k in self._keys)

    def get_pool_stats(self) -> Dict[str, Any]:
        """Get statistics about the key pool (counts and total requests)."""
        with self._lock_pool:
            stats = {
                "total_keys": len(self._keys),
                "available": 0,
                "rate_limited": 0,
                "total_requests": 0
            }
            for key in self._keys:
                if key.is_available():
                    stats["available"] += 1
                else:
                    stats["rate_limited"] += 1
                stats["total_requests"] += key.requests_made
            return stats

    def get_all_keys(self) -> List[str]:
        """Get all API keys (masked for display)."""
        with self._lock_pool:
            return [f"{k.key[:8]}...{k.key[-4:]}" if len(k.key) > 12 else "***" for k in self._keys]
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# Convenience functions
|
| 214 |
+
def get_pool() -> ApiKeyPool:
    """Get the API key pool singleton.

    Returns:
        The process-wide ApiKeyPool instance (created on first call).
    """
    return ApiKeyPool.get_instance()
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def add_api_key(key: str):
    """Add an API key to the pool and persist the updated pool.

    The config file is only rewritten when the key was actually new;
    duplicate keys are ignored without touching disk.

    Args:
        key: The API key string to add.
    """
    pool = get_pool()
    if pool.add_key(key):
        pool.save_config()
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def remove_api_key(key: str):
    """Remove an API key from the pool and persist the updated pool.

    The config file is only rewritten when the key was actually present;
    removing an unknown key is a no-op.

    Args:
        key: The API key string to remove.
    """
    pool = get_pool()
    if pool.remove_key(key):
        pool.save_config()
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def list_api_keys() -> List[str]:
    """Return every API key in the pool, masked for safe display."""
    return get_pool().get_all_keys()
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def get_available_key() -> Optional[str]:
    """Return the next usable API key, or None when all are rate-limited."""
    pool = get_pool()
    return pool.get_next_key()
|