arjunbhargav212 commited on
Commit
5b14aa2
·
verified ·
1 Parent(s): dc23f92

Upload 63 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. docstrange/WEB_INTERFACE.md +168 -0
  2. docstrange/__init__.py +34 -0
  3. docstrange/__pycache__/__init__.cpython-310.pyc +0 -0
  4. docstrange/__pycache__/config.cpython-310.pyc +0 -0
  5. docstrange/__pycache__/exceptions.cpython-310.pyc +0 -0
  6. docstrange/__pycache__/extractor.cpython-310.pyc +0 -0
  7. docstrange/__pycache__/result.cpython-310.pyc +0 -0
  8. docstrange/__pycache__/web_app.cpython-310.pyc +0 -0
  9. docstrange/cli.py +643 -0
  10. docstrange/config.py +15 -0
  11. docstrange/exceptions.py +25 -0
  12. docstrange/extractor.py +431 -0
  13. docstrange/pipeline/__init__.py +1 -0
  14. docstrange/pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
  15. docstrange/pipeline/__pycache__/ocr_service.cpython-310.pyc +0 -0
  16. docstrange/pipeline/layout_detector.py +329 -0
  17. docstrange/pipeline/model_downloader.py +331 -0
  18. docstrange/pipeline/nanonets_processor.py +129 -0
  19. docstrange/pipeline/neural_document_processor.py +644 -0
  20. docstrange/pipeline/ocr_service.py +222 -0
  21. docstrange/processors/__init__.py +27 -0
  22. docstrange/processors/__pycache__/__init__.cpython-310.pyc +0 -0
  23. docstrange/processors/__pycache__/base.cpython-310.pyc +0 -0
  24. docstrange/processors/__pycache__/cloud_processor.cpython-310.pyc +0 -0
  25. docstrange/processors/__pycache__/docx_processor.cpython-310.pyc +0 -0
  26. docstrange/processors/__pycache__/excel_processor.cpython-310.pyc +0 -0
  27. docstrange/processors/__pycache__/gpu_processor.cpython-310.pyc +0 -0
  28. docstrange/processors/__pycache__/html_processor.cpython-310.pyc +0 -0
  29. docstrange/processors/__pycache__/image_processor.cpython-310.pyc +0 -0
  30. docstrange/processors/__pycache__/pdf_processor.cpython-310.pyc +0 -0
  31. docstrange/processors/__pycache__/pptx_processor.cpython-310.pyc +0 -0
  32. docstrange/processors/__pycache__/txt_processor.cpython-310.pyc +0 -0
  33. docstrange/processors/__pycache__/url_processor.cpython-310.pyc +0 -0
  34. docstrange/processors/base.py +87 -0
  35. docstrange/processors/cloud_processor.py +399 -0
  36. docstrange/processors/docx_processor.py +202 -0
  37. docstrange/processors/excel_processor.py +208 -0
  38. docstrange/processors/gpu_processor.py +501 -0
  39. docstrange/processors/html_processor.py +65 -0
  40. docstrange/processors/image_processor.py +110 -0
  41. docstrange/processors/pdf_processor.py +141 -0
  42. docstrange/processors/pptx_processor.py +160 -0
  43. docstrange/processors/txt_processor.py +105 -0
  44. docstrange/processors/url_processor.py +361 -0
  45. docstrange/result.py +1143 -0
  46. docstrange/services/__init__.py +21 -0
  47. docstrange/services/__pycache__/__init__.cpython-310.pyc +0 -0
  48. docstrange/services/__pycache__/api_key_pool.cpython-310.pyc +0 -0
  49. docstrange/services/__pycache__/ollama_service.cpython-310.pyc +0 -0
  50. docstrange/services/api_key_pool.py +241 -0
docstrange/WEB_INTERFACE.md ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DocStrange Web Interface
2
+
3
+ A beautiful, modern web interface for the DocStrange document extraction library, inspired by the data-extraction-apis project design.
4
+
5
+ ## Features
6
+
7
+ - **Modern UI**: Clean, responsive design with drag-and-drop file upload
8
+ - **Multiple Formats**: Support for PDF, Word, Excel, PowerPoint, images, and more
9
+ - **Output Options**: Convert to Markdown, HTML, JSON, CSV, or Flat JSON
10
+ - **Real-time Processing**: Live extraction with progress indicators
11
+ - **Download Results**: Save extracted content in various formats
12
+ - **Mobile Friendly**: Responsive design that works on all devices
13
+
14
+ ## Quick Start
15
+
16
+ ### 1. Install Dependencies
17
+
18
+ ```bash
19
+ pip install docstrange[web]
20
+ ```
21
+
22
+ ### 2. Start the Web Interface
23
+
24
+ ```bash
25
+ docstrange web
26
+ ```
27
+
28
+ ### 3. Open Your Browser
29
+
30
+ Navigate to: http://localhost:8000
31
+
32
+ ## Usage
33
+
34
+ ### File Upload
35
+
36
+ 1. **Drag & Drop**: Simply drag your file onto the upload area
37
+ 2. **Click to Browse**: Click the upload area to select a file from your computer
38
+ 3. **Supported Formats**: PDF, Word (.docx, .doc), Excel (.xlsx, .xls), PowerPoint (.pptx, .ppt), HTML, CSV, Text, Images (PNG, JPG, TIFF, BMP)
39
+
40
+ ### Output Format Selection
41
+
42
+ Choose from multiple output formats:
43
+
44
+ - **Markdown**: Clean, structured markdown text
45
+ - **HTML**: Formatted HTML with styling
46
+ - **JSON**: Structured JSON data
47
+ - **CSV**: Table data in CSV format
48
+ - **Flat JSON**: Simplified JSON structure
49
+
50
+ ### Results View
51
+
52
+ After processing, you can:
53
+
54
+ - **Preview**: View formatted content in the preview tab
55
+ - **Raw Output**: See the raw extracted text
56
+ - **Download**: Save results as text or JSON files
57
+
58
+ ## API Endpoints
59
+
60
+ The web interface also provides REST API endpoints:
61
+
62
+ ### Health Check
63
+ ```
64
+ GET /api/health
65
+ ```
66
+
67
+ ### Get Supported Formats
68
+ ```
69
+ GET /api/supported-formats
70
+ ```
71
+
72
+ ### Extract Document
73
+ ```
74
+ POST /api/extract
75
+ Content-Type: multipart/form-data
76
+
77
+ Parameters:
78
+ - file: The document file to extract
79
+ - output_format: markdown, html, json, csv, flat-json
80
+ ```
81
+
82
+ ## Configuration
83
+
84
+ ### Environment Variables
85
+
86
+ - `FLASK_ENV`: Set to `development` for debug mode (deprecated since Flask 2.3; prefer `FLASK_DEBUG=1`)
87
+ - `MAX_CONTENT_LENGTH`: Maximum file size (default: 100MB)
88
+
89
+ ### Customization
90
+
91
+ The web interface uses a modular design system:
92
+
93
+ - **CSS Variables**: Easy theming via CSS custom properties
94
+ - **Responsive Design**: Mobile-first approach
95
+ - **Component-based**: Reusable UI components
96
+
97
+ ## Development
98
+
99
+ ### Running in Development Mode
100
+
101
+ ```bash
102
+ # Install development dependencies
103
+ pip install -e .
104
+
105
+ # Start with debug mode
106
+ python -m docstrange.web_app
107
+ ```
108
+
109
+ ### File Structure
110
+
111
+ ```
112
+ docstrange/
113
+ ├── web_app.py # Flask application
114
+ ├── templates/
115
+ │ └── index.html # Main HTML template
116
+ └── static/
117
+ ├── styles.css # Design system CSS
118
+ └── script.js # Frontend JavaScript
119
+ ```
120
+
121
+ ### Testing
122
+
123
+ ```bash
124
+ # Run the test script
125
+ python test_web_interface.py
126
+ ```
127
+
128
+ ## Troubleshooting
129
+
130
+ ### Common Issues
131
+
132
+ 1. **Port Already in Use**
133
+ ```bash
134
+ # Use a different port
135
+ docstrange web --port 8080
136
+ ```
137
+
138
+ 2. **File Upload Fails**
139
+ - Check file size (max 100MB)
140
+ - Verify file format is supported
141
+ - Ensure proper file permissions
142
+
143
+ 3. **Extraction Errors**
144
+ - Check console logs for detailed error messages
145
+ - Verify document is not corrupted
146
+ - Try different output formats
147
+
148
+ ### Logs
149
+
150
+ The web interface logs to the console. Check for:
151
+ - File upload events
152
+ - Processing status
153
+ - Error messages
154
+ - API request details
155
+
156
+ ## Contributing
157
+
158
+ To contribute to the web interface:
159
+
160
+ 1. Fork the repository
161
+ 2. Create a feature branch
162
+ 3. Make your changes
163
+ 4. Test thoroughly
164
+ 5. Submit a pull request
165
+
166
+ ## License
167
+
168
+ This web interface is part of the DocStrange project and is licensed under the MIT License.
docstrange/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Data Extractor - Extract structured data from any document into LLM-ready formats.
3
+ """
4
+
5
+ from .extractor import DocumentExtractor
6
+ from .result import ConversionResult
7
+ from .processors import GPUConversionResult, CloudConversionResult
8
+ from .exceptions import ConversionError, UnsupportedFormatError
9
+ from .config import InternalConfig
10
+ from .services.api_key_pool import (
11
+ ApiKeyPool,
12
+ get_pool,
13
+ add_api_key,
14
+ remove_api_key,
15
+ list_api_keys,
16
+ get_available_key,
17
+ )
18
+
19
+ __version__ = "1.1.5"
20
+ __all__ = [
21
+ "DocumentExtractor",
22
+ "ConversionResult",
23
+ "GPUConversionResult",
24
+ "CloudConversionResult",
25
+ "ConversionError",
26
+ "UnsupportedFormatError",
27
+ "InternalConfig",
28
+ "ApiKeyPool",
29
+ "get_pool",
30
+ "add_api_key",
31
+ "remove_api_key",
32
+ "list_api_keys",
33
+ "get_available_key",
34
+ ]
docstrange/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (842 Bytes). View file
 
docstrange/__pycache__/config.cpython-310.pyc ADDED
Binary file (426 Bytes). View file
 
docstrange/__pycache__/exceptions.cpython-310.pyc ADDED
Binary file (964 Bytes). View file
 
docstrange/__pycache__/extractor.cpython-310.pyc ADDED
Binary file (11.7 kB). View file
 
docstrange/__pycache__/result.cpython-310.pyc ADDED
Binary file (28 kB). View file
 
docstrange/__pycache__/web_app.cpython-310.pyc ADDED
Binary file (21.2 kB). View file
 
docstrange/cli.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Command-line interface for docstrange."""
2
+
3
+ import argparse
4
+ import sys
5
+ import os
6
+ import json
7
+ from pathlib import Path
8
+ from typing import List
9
+
10
+ from .extractor import DocumentExtractor
11
+ from .exceptions import ConversionError, UnsupportedFormatError, FileNotFoundError
12
+ from . import __version__
13
+
14
+
15
def print_version():
    """Print version information."""
    banner = (
        f"docstrange v{__version__}",
        "Convert any document, text, or URL into LLM-ready data format",
        "with advanced intelligent document processing capabilities.",
    )
    for line in banner:
        print(line)
20
+
21
+
22
def print_supported_formats(extractor: DocumentExtractor):
    """Print supported formats in a nice format."""
    print("Supported input formats:")
    print()

    formats = extractor.get_supported_formats()

    # Fixed display order: category label paired with the extensions it owns.
    groupings = (
        ("Documents", {'.pdf', '.docx', '.doc', '.txt', '.text'}),
        ("Data Files", {'.xlsx', '.xls', '.csv'}),
        ("Presentations", {'.ppt', '.pptx'}),
        ("Web", {'URLs'}),
        ("Images", {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'}),
        ("Web Files", {'.html', '.htm'}),
    )

    for category, members in groupings:
        matched = [fmt for fmt in formats if fmt in members]
        if matched:
            print(f"  {category}:")
            for fmt in matched:
                print(f"    - {fmt}")
            print()
45
+
46
+
47
def process_single_input(extractor: DocumentExtractor, input_item: str, output_format: str, verbose: bool = False) -> dict:
    """Process a single input item and return result with metadata."""
    if verbose:
        print(f"Processing: {input_item}", file=sys.stderr)

    def _failure(message: str) -> dict:
        # Uniform shape for every failed outcome.
        return {"success": False, "error": message, "input_item": input_item}

    try:
        if input_item.startswith(('http://', 'https://')):
            # URL input
            if extractor.cloud_mode:
                raise ConversionError("URL processing is not supported in cloud mode. Use local mode for URLs.")
            result, input_type = extractor.extract_url(input_item), "URL"
        elif os.path.exists(input_item):
            # Existing file on disk
            result, input_type = extractor.extract(input_item), "File"
        else:
            # Anything else is treated as raw text
            if extractor.cloud_mode:
                raise ConversionError("Text processing is not supported in cloud mode. Use local mode for text.")
            result, input_type = extractor.extract_text(input_item), "Text"
    except FileNotFoundError:
        return _failure("File not found")
    except UnsupportedFormatError:
        return _failure("Unsupported format")
    except ConversionError as e:
        return _failure(f"Conversion error: {e}")
    except Exception as e:
        return _failure(f"Unexpected error: {e}")

    return {
        "success": True,
        "result": result,
        "input_type": input_type,
        "input_item": input_item,
    }
101
+
102
+
103
def _print_token_summary(token: str) -> None:
    """Print the masked access token and the cache confirmation line."""
    print(f"🔑 Access Token: {token[:12]}...{token[-4:]}")
    print("💾 Credentials cached securely")


def handle_login(force_reauth: bool = False) -> int:
    """Handle login command.

    Args:
        force_reauth: When True, discard cached credentials and re-authenticate.

    Returns:
        0 on successful authentication, 1 on any failure.
    """
    try:
        from .services.auth_service import get_authenticated_token

        print("\n🔐 DocStrange Authentication")
        print("=" * 50)

        token = get_authenticated_token(force_reauth=force_reauth)
        if not token:
            print("❌ Authentication failed.")
            return 1

        print("✅ Authentication successful!")

        # Best-effort: show who is logged in, from the cached credentials.
        # Failures here are informational only and never fail the login.
        try:
            from .services.auth_service import AuthService
            cached_creds = AuthService().get_cached_credentials()
            if cached_creds and cached_creds.get('auth0_direct'):
                print(f"👤 Logged in as: {cached_creds.get('user_email', 'Unknown')}")
                print(f"👤 Name: {cached_creds.get('user_name', 'Unknown')}")
                print("🔐 Via: Auth0 Google Login")
        except Exception:
            pass
        # Previously these two lines were duplicated in three branches.
        _print_token_summary(token)

        print("\n💡 You can now use DocStrange cloud features without specifying --api-key")
        print("🌐 Your CLI is authenticated with the same Google account used on docstrange.nanonets.com")
        return 0
    except ImportError:
        print("❌ Authentication service not available.", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"❌ Authentication error: {e}", file=sys.stderr)
        return 1
146
+
147
+
148
def handle_logout() -> int:
    """Handle logout command.

    Returns:
        0 when cached credentials were cleared, 1 when the auth service is
        unavailable or clearing failed.
    """
    try:
        from .services.auth_service import clear_auth
        clear_auth()
    except ImportError:
        print("❌ Authentication service not available.", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"❌ Error clearing credentials: {e}", file=sys.stderr)
        return 1

    # Success messages cannot raise, so they live outside the try block.
    print("✅ Logged out successfully.")
    print("💾 Cached authentication credentials cleared.")
    return 0
163
+
164
+
165
def _print_pool_counts(stats: dict) -> None:
    """Print the four standard counters from an ApiKeyPool.get_pool_stats() dict."""
    print(f"Total keys: {stats['total_keys']}")
    print(f"Available: {stats['available']}")
    print(f"Rate limited: {stats['rate_limited']}")
    print(f"Total requests: {stats['total_requests']}")


def handle_api_keys_command(argv: list) -> int:
    """Handle API key management commands.

    Usage:
        docstrange api-keys list
        docstrange api-keys add <key>
        docstrange api-keys remove <key>
        docstrange api-keys stats

    Args:
        argv: Sub-command arguments only, e.g. ["add", "<key>"].
            An empty list defaults to "list".

    Returns:
        0 on success, 1 on usage errors or unknown sub-commands.
    """
    from .services.api_key_pool import ApiKeyPool

    pool = ApiKeyPool.get_instance()

    if not argv or argv[0] == "list":
        keys = pool.get_all_keys()
        stats = pool.get_pool_stats()
        print("\n🔑 API Key Pool")
        print("=" * 40)
        _print_pool_counts(stats)
        print()
        if keys:
            print("Keys:")
            for i, masked in enumerate(keys, 1):
                print(f"  {i}. {masked}")
        else:
            print("No API keys configured.")
            print("\n💡 Add keys with: docstrange api-keys add <key>")
            print("💡 Or set NANONETS_API_KEYS env var (comma-separated)")
        return 0

    if argv[0] == "add":
        if len(argv) < 2:
            print("❌ Usage: docstrange api-keys add <key>", file=sys.stderr)
            return 1
        key = argv[1]
        if pool.add_key(key, source="cli"):
            pool.save_config()
            print(f"✅ API key added: {key[:8]}...{key[-4:]}")
        else:
            # Adding a duplicate is not an error; just inform the user.
            print("⚠️ API key already exists in pool")
        return 0

    if argv[0] == "remove":
        if len(argv) < 2:
            print("❌ Usage: docstrange api-keys remove <key>", file=sys.stderr)
            return 1
        key = argv[1]
        if pool.remove_key(key):
            pool.save_config()
            print(f"✅ API key removed: {key[:8]}...{key[-4:]}")
            return 0
        print("❌ API key not found in pool", file=sys.stderr)
        return 1

    if argv[0] == "stats":
        print("\n📊 API Key Pool Statistics")
        print("=" * 40)
        _print_pool_counts(pool.get_pool_stats())
        return 0

    print(f"❌ Unknown api-keys command: {argv[0]}", file=sys.stderr)
    print("Usage: docstrange api-keys [list|add|remove|stats]", file=sys.stderr)
    return 1
238
+
239
+
240
def _load_json_schema(path: str) -> dict:
    """Read and parse the JSON schema file at *path*.

    Prints an error and exits with status 1 on failure (mirrors the original
    inline behavior, which was duplicated in two branches).
    """
    try:
        with open(path, 'r') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading JSON schema: {e}", file=sys.stderr)
        sys.exit(1)


def main():
    """Main CLI function.

    Returns:
        Process exit code: 0 when all inputs succeeded, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Convert documents to LLM-ready formats with intelligent document processing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Authentication (browser-based login)
  docstrange login                         # One-click browser login
  docstrange login --reauth                # Force re-authentication

  # API Key Management
  docstrange api-keys list                 # List all configured API keys
  docstrange api-keys add <key>            # Add an API key to the rotation pool
  docstrange api-keys remove <key>         # Remove an API key
  docstrange api-keys stats                # Show pool usage statistics

  # Start web interface
  docstrange web                           # Start web interface at http://localhost:8000

  # Convert a PDF to markdown (default cloud mode)
  docstrange document.pdf

  # Convert with free API key with increased limits
  docstrange document.pdf --api-key YOUR_API_KEY

  # Convert with multiple API keys for automatic rotation
  docstrange document.pdf --api-keys KEY1 KEY2 KEY3

  # Force local GPU processing
  docstrange document.pdf --gpu-mode

  # Convert to different output formats
  docstrange document.pdf --output html
  docstrange document.pdf --output json
  docstrange document.pdf --output csv     # Extract tables as CSV

  # Use specific model for cloud processing
  docstrange document.pdf --model gemini
  docstrange document.pdf --model openapi --output json
  docstrange document.pdf --model nanonets --output csv

  # Convert a URL (works in all modes)
  docstrange https://example.com --output html

  # Convert plain text (works in all modes)
  docstrange "Hello world" --output json

  # Convert multiple files
  docstrange file1.pdf file2.docx file3.xlsx --output markdown

  # Extract specific fields using cloud processing
  docstrange invoice.pdf --output json --extract-fields invoice_number total_amount vendor_name

  # Extract using JSON schema with cloud processing
  docstrange document.pdf --output json --json-schema schema.json

  # Save output to file
  docstrange document.pdf --output-file output.md

  # Use environment variable for API key
  export NANONETS_API_KEY=your_api_key
  docstrange document.pdf

  # List supported formats
  docstrange --list-formats

  # Show version
  docstrange --version
        """
    )

    parser.add_argument(
        "input",
        nargs="*",
        help="Input file(s), URL(s), or text to extract"
    )

    parser.add_argument(
        "--output", "-o",
        choices=["markdown", "html", "json", "text", "csv"],
        default="markdown",
        help="Output format (default: markdown)"
    )

    # Processing mode arguments
    parser.add_argument(
        "--gpu-mode",
        action="store_true",
        help="Force local GPU processing (disables cloud mode, requires GPU)"
    )

    parser.add_argument(
        "--api-key",
        help="API key for increased cloud access (get it free from https://app.nanonets.com/#/keys)"
    )

    parser.add_argument(
        "--api-keys",
        nargs="+",
        help="Multiple API keys for automatic rotation when one hits rate limit"
    )

    parser.add_argument(
        "--model",
        choices=["gemini", "openapi", "nanonets"],
        help="Model to use for cloud processing (gemini, openapi, nanonets)"
    )

    parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama server URL for local field extraction (default: http://localhost:11434)"
    )

    parser.add_argument(
        "--ollama-model",
        default="llama3.2",
        help="Ollama model for local field extraction (default: llama3.2)"
    )

    parser.add_argument(
        "--extract-fields",
        nargs="+",
        help="Extract specific fields using cloud processing (e.g., --extract-fields invoice_number total_amount)"
    )

    parser.add_argument(
        "--json-schema",
        help="JSON schema file for structured extraction using cloud processing"
    )

    # NOTE(review): store_true with default=True makes this flag a no-op;
    # kept as-is for interface compatibility.
    parser.add_argument(
        "--preserve-layout",
        action="store_true",
        default=True,
        help="Preserve document layout (default: True)"
    )

    parser.add_argument(
        "--include-images",
        action="store_true",
        help="Include images in output"
    )

    parser.add_argument(
        "--ocr-enabled",
        action="store_true",
        help="Enable intelligent document processing for images and PDFs"
    )

    parser.add_argument(
        "--output-file", "-f",
        help="Output file path (if not specified, prints to stdout)"
    )

    parser.add_argument(
        "--list-formats",
        action="store_true",
        help="List supported input formats and exit"
    )

    parser.add_argument(
        "--version",
        action="store_true",
        help="Show version information and exit"
    )

    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output"
    )

    parser.add_argument(
        "--login",
        action="store_true",
        help="Perform browser-based authentication login"
    )

    parser.add_argument(
        "--reauth",
        action="store_true",
        help="Force re-authentication (use with --login)"
    )

    parser.add_argument(
        "--logout",
        action="store_true",
        help="Clear cached authentication credentials"
    )

    args = parser.parse_args()

    # Handle version flag
    if args.version:
        print_version()
        return 0

    # Handle list formats flag
    if args.list_formats:
        # Create an extractor only to query the supported formats
        extractor = DocumentExtractor(
            api_key=args.api_key,
            model=args.model,
            gpu=args.gpu_mode
        )
        print_supported_formats(extractor)
        return 0

    # Handle authentication commands ("login" as first positional argument).
    # argparse has already consumed --reauth into args.reauth, so there is
    # no need to rescan sys.argv.
    if args.input and args.input[0] == "login":
        return handle_login(args.reauth)

    # Handle API key management commands.
    # BUGFIX: pass only the sub-command arguments (e.g. ["add", "<key>"]).
    # Previously sys.argv[1:] was passed, so argv[0] was always "api-keys"
    # and every sub-command (including a bare "api-keys") was rejected as
    # unknown by handle_api_keys_command.
    if args.input and args.input[0] == "api-keys":
        return handle_api_keys_command(args.input[1:])

    # Handle web command
    if args.input and args.input[0] == "web":
        try:
            from .web_app import run_web_app
            print("Starting DocStrange web interface...")
            print("Open your browser and go to: http://localhost:8000")
            print("Press Ctrl+C to stop the server")
            run_web_app(host='0.0.0.0', port=8000, debug=False)
            return 0
        except ImportError:
            print("❌ Web interface not available. Install Flask: pip install Flask", file=sys.stderr)
            return 1

    # Handle login/logout flags
    if args.login or args.logout:
        if args.logout:
            return handle_logout()
        else:
            return handle_login(args.reauth)

    # Check if input is provided
    if not args.input:
        parser.error("No input specified. Please provide file(s), URL(s), or text to extract.")

    # Cloud mode is default. Without login/API key it's limited calls.
    # Use 'docstrange login' (recommended) or --api-key for 10k docs/month for free.

    # Initialize extractor
    extractor = DocumentExtractor(
        api_key=args.api_key,
        api_keys=args.api_keys,
        model=args.model,
        gpu=args.gpu_mode
    )

    if args.verbose:
        mode = "local" if args.gpu_mode else "cloud"
        print(f"Initialized extractor in {mode} mode:")
        print(f" - Output format: {args.output}")
        if mode == "cloud":
            pool_stats = extractor.get_api_key_pool_stats()
            print(f" - API Key Pool: {pool_stats['available']}/{pool_stats['total_keys']} keys available")
            if args.model:
                print(f" - Model: {args.model}")
        else:
            print(f" - Local processing: GPU")
        print()

    # Process inputs
    results = []
    errors = []

    for i, input_item in enumerate(args.input, 1):
        if args.verbose and len(args.input) > 1:
            print(f"[{i}/{len(args.input)}] Processing: {input_item}", file=sys.stderr)

        outcome = process_single_input(extractor, input_item, args.output, args.verbose)

        if outcome["success"]:
            results.append(outcome["result"])
            if not args.verbose:
                print(f"Processing ... : {input_item}", file=sys.stderr)
        else:
            errors.append(outcome)
            print(f"❌ Failed: {input_item} - {outcome['error']}", file=sys.stderr)

    # Check if we have any successful results
    if not results:
        print("❌ No files were successfully processed.", file=sys.stderr)
        if errors:
            print("Errors encountered:", file=sys.stderr)
            for error in errors:
                print(f" - {error['input_item']}: {error['error']}", file=sys.stderr)
        return 1

    # Generate output
    if len(results) == 1:
        # Single result
        result = results[0]
        if args.output == "markdown":
            output_content = result.extract_markdown()
        elif args.output == "html":
            output_content = result.extract_html()
        elif args.output == "json":
            # Handle field extraction if specified
            json_schema = _load_json_schema(args.json_schema) if args.json_schema else None
            try:
                result_json = result.extract_data(
                    specified_fields=args.extract_fields,
                    json_schema=json_schema,
                )
                output_content = json.dumps(result_json, indent=2)
            except Exception as e:
                print(f"Error during JSON extraction: {e}", file=sys.stderr)
                sys.exit(1)
        elif args.output == "csv":
            try:
                output_content = result.extract_csv(include_all_tables=True)
            except ValueError as e:
                print(f"Error: {e}", file=sys.stderr)
                sys.exit(1)
        else:  # text
            output_content = result.extract_text()
    else:
        # Multiple results - combine them
        if args.output == "markdown":
            output_content = "\n\n---\n\n".join(r.extract_markdown() for r in results)
        elif args.output == "html":
            output_content = "\n\n<hr>\n\n".join(r.extract_html() for r in results)
        elif args.output == "json":
            # Handle field extraction for multiple results
            json_schema = _load_json_schema(args.json_schema) if args.json_schema else None
            try:
                extracted_results = []
                for r in results:
                    result_json = r.extract_data(
                        specified_fields=args.extract_fields,
                        json_schema=json_schema,
                    )
                    extracted_results.append(result_json)

                combined_json = {
                    "results": extracted_results,
                    "count": len(results),
                    "errors": [{"input": e["input_item"], "error": e["error"]} for e in errors] if errors else []
                }
                output_content = json.dumps(combined_json, indent=2)
            except Exception as e:
                print(f"Error during JSON extraction: {e}", file=sys.stderr)
                sys.exit(1)
        elif args.output == "csv":
            csv_outputs = []
            for i, r in enumerate(results):
                try:
                    csv_content = r.extract_csv(include_all_tables=True)
                    if csv_content.strip():
                        csv_outputs.append(f"=== File {i + 1} ===\n{csv_content}")
                except ValueError:
                    # Skip files without tables
                    continue
            if not csv_outputs:
                print("Error: No tables found in any of the input files", file=sys.stderr)
                sys.exit(1)
            output_content = "\n\n".join(csv_outputs)
        else:  # text
            output_content = "\n\n---\n\n".join(r.extract_text() for r in results)

    # Write output
    if args.output_file:
        try:
            with open(args.output_file, 'w', encoding='utf-8') as f:
                f.write(output_content)
            print(f"✅ Output written to: {args.output_file}", file=sys.stderr)
        except Exception as e:
            print(f"❌ Failed to write output file: {e}", file=sys.stderr)
            return 1
    else:
        print(output_content)

    # Summary
    if args.verbose or len(args.input) > 1:
        print(f"\nSummary: {len(results)} successful, {len(errors)} failed", file=sys.stderr)

    return 0 if not errors else 1


if __name__ == "__main__":
    sys.exit(main())
docstrange/config.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # docstrange/config.py
2
+
3
class InternalConfig:
    """Internal feature flags and defaults (not exposed to end users).

    All settings are plain class attributes; the class is read directly and
    never instantiated.
    """

    # Use markdownify for HTML-to-Markdown conversion.
    use_markdownify = True
    # OCR provider to use ('neural' selects the docling-based models).
    ocr_provider = 'neural'

    # PDF processing configuration
    # Convert PDF pages to images before running OCR.
    pdf_to_image_enabled = True
    # Rendering DPI for PDF-to-image conversion.
    pdf_image_dpi = 300
    # Scale factor applied to rendered pages for better OCR accuracy.
    pdf_image_scale = 2.0

    # Add other internal config options here as needed
    # e.g. default_ocr_lang = 'en'
    # e.g. enable_layout_aware_ocr = True
docstrange/exceptions.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Custom exceptions for the LLM Data Converter library."""


class ConversionError(Exception):
    """Raised when document conversion fails."""
    pass


class UnsupportedFormatError(Exception):
    """Raised when the input format is not supported."""
    pass


class DocumentNotFoundError(FileNotFoundError):
    """Raised when the input file is not found.

    Subclasses the *builtin* ``FileNotFoundError`` (itself an ``OSError``)
    so that callers who catch the builtin — or a generic ``OSError`` —
    still catch this error even though the module-level alias below
    shadows the builtin name for code importing it from this module.
    """
    pass


class NetworkError(Exception):
    """Raised when network operations fail (e.g., URL fetching)."""
    pass


# Backwards compatibility alias (deprecated: use DocumentNotFoundError instead).
# This intentionally shadows the builtin name in modules that do
# `from .exceptions import FileNotFoundError`; since DocumentNotFoundError is a
# subclass of the builtin, `except FileNotFoundError` on either name works.
FileNotFoundError = DocumentNotFoundError
docstrange/extractor.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main extractor class for handling document conversion."""
2
+
3
+ import os
4
+ import logging
5
+ from typing import List, Optional
6
+
7
+ from .processors import (
8
+ PDFProcessor,
9
+ DOCXProcessor,
10
+ TXTProcessor,
11
+ ExcelProcessor,
12
+ URLProcessor,
13
+ HTMLProcessor,
14
+ PPTXProcessor,
15
+ ImageProcessor,
16
+ CloudProcessor,
17
+ GPUProcessor,
18
+ )
19
+ from .result import ConversionResult
20
+ from .exceptions import ConversionError, UnsupportedFormatError, FileNotFoundError
21
+ from .utils.gpu_utils import should_use_gpu_processor
22
+ from .services.api_key_pool import ApiKeyPool
23
+
24
+ # Configure logging
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class DocumentExtractor:
    """Main class for converting documents to LLM-ready formats.

    Processing mode is decided once, at construction time:

    * cloud (default): a single ``CloudProcessor`` backed by a shared
      ``ApiKeyPool``, with a local ``GPUProcessor`` kept ready as fallback
      when a GPU is detected.
    * gpu (``gpu=True``): local processors only, including a
      ``GPUProcessor``; construction fails fast when no GPU is present.
    """

    def __init__(
        self,
        preserve_layout: bool = True,
        include_images: bool = True,
        ocr_enabled: bool = True,
        api_key: Optional[str] = None,
        api_keys: Optional[List[str]] = None,
        model: Optional[str] = None,
        gpu: bool = False
    ):
        """Initialize the file extractor.

        Args:
            preserve_layout: Whether to preserve document layout
            include_images: Whether to include images in output
            ocr_enabled: Whether to enable OCR for image and PDF processing
            api_key: Single API key for cloud processing. Prefer 'docstrange login' for 10k docs/month
            api_keys: List of API keys for automatic rotation when one hits rate limit
            model: Model to use for cloud processing (gemini, openapi) - only for cloud mode
            gpu: Force local GPU processing (disables cloud mode, requires GPU)

        Raises:
            RuntimeError: If ``gpu=True`` but no compatible GPU is detected.

        Note:
            - Cloud mode is default unless gpu is specified
            - Multiple api_keys enable automatic rotation on rate limit
            - Without login/API key, limited calls per day
            - For 10k docs/month, run 'docstrange login' (recommended) or use API keys
        """
        self.preserve_layout = preserve_layout
        self.include_images = include_images
        self.api_key = api_key
        self.api_keys_list = api_keys or []
        self.model = model
        self.gpu = gpu

        # Determine processing mode.
        # Cloud mode is default unless GPU preference is explicitly set.
        self.cloud_mode = not self.gpu

        # Check GPU availability if GPU preference is set
        if self.gpu and not should_use_gpu_processor():
            raise RuntimeError(
                "GPU preference specified but no GPU is available. "
                "Please ensure CUDA is installed and a compatible GPU is present."
            )

        # Default to True if not explicitly set.
        # NOTE(review): the signature default is already True, so None only
        # reaches here when a caller passes ocr_enabled=None explicitly.
        if ocr_enabled is None:
            self.ocr_enabled = True
        else:
            self.ocr_enabled = ocr_enabled

        # Initialize API key pool (shared singleton across extractor instances)
        self.api_key_pool = ApiKeyPool.get_instance()

        # Add provided keys to the pool
        if api_key:
            self.api_key_pool.add_key(api_key, source="constructor")
        for key in self.api_keys_list:
            self.api_key_pool.add_key(key, source="constructor_list")

        # Try to get API key from environment if not provided
        if self.cloud_mode and not self.api_key:
            # Comma-separated list of keys takes priority.
            env_keys = os.environ.get('NANONETS_API_KEYS', '')
            if env_keys:
                for key in env_keys.split(','):
                    key = key.strip()
                    if key:
                        self.api_key_pool.add_key(key, source="env")

            # Also check single env var for backward compat
            single_key = os.environ.get('NANONETS_API_KEY')
            if single_key:
                self.api_key_pool.add_key(single_key, source="env_single")

            # If still no API keys, try to get from cached credentials
            # (saved by a previous 'docstrange login').
            if not self.api_key_pool.has_available_keys():
                try:
                    from .services.auth_service import get_authenticated_token
                    cached_token = get_authenticated_token(force_reauth=False)
                    if cached_token:
                        self.api_key_pool.add_key(cached_token, source="cached_credentials")
                        logger.info("Added cached authentication credentials to API key pool")
                except ImportError:
                    logger.debug("Authentication service not available")
                except Exception as e:
                    logger.warning(f"Could not retrieve cached credentials: {e}")

        # Pre-create local GPU processor for fallback (if available).
        # Best-effort: failure here only disables the local fallback path.
        self.local_gpu_processor = None
        if should_use_gpu_processor():
            try:
                self.local_gpu_processor = GPUProcessor(
                    preserve_layout=preserve_layout,
                    include_images=include_images,
                    ocr_enabled=ocr_enabled
                )
                logger.info("Local GPU processor available for fallback")
            except Exception as e:
                logger.warning(f"Could not initialize local GPU processor: {e}")

        # Initialize processors
        self.processors = []

        if self.cloud_mode:
            # Cloud mode setup with key pool and local fallback
            cloud_processor = CloudProcessor(
                api_key=self.api_key,  # Can be None, pool will be used
                model_type=self.model,
                preserve_layout=preserve_layout,
                include_images=include_images,
                api_key_pool=self.api_key_pool,
                local_fallback_processor=self.local_gpu_processor
            )
            self.processors.append(cloud_processor)

            pool_stats = self.api_key_pool.get_pool_stats()
            if pool_stats["available"] > 0:
                logger.info(f"Cloud processing enabled with {pool_stats['available']} API key(s) in pool")
            else:
                logger.info("Cloud processing enabled without API keys - will use local fallback when needed")
        else:
            # Local mode setup
            logger.info("Local processing mode enabled")
            self._setup_local_processors()

    def authenticate(self, force_reauth: bool = False) -> bool:
        """
        Perform browser-based authentication and update API key.

        On success the token is stored on this extractor, added to the key
        pool, and pushed onto every processor exposing an ``api_key`` attribute.

        Args:
            force_reauth: Force re-authentication even if cached credentials exist

        Returns:
            True if authentication successful, False otherwise
        """
        try:
            from .services.auth_service import get_authenticated_token

            token = get_authenticated_token(force_reauth=force_reauth)
            if token:
                self.api_key = token

                # Add to pool and update cloud processor
                self.api_key_pool.add_key(token, source="authenticated")
                for processor in self.processors:
                    if hasattr(processor, 'api_key'):
                        processor.api_key = token
                        logger.info("Updated processor with new authentication token")

                return True
            else:
                return False

        except ImportError:
            logger.error("Authentication service not available")
            return False
        except Exception as e:
            logger.error(f"Authentication failed: {e}")
            return False

    def _setup_local_processors(self) -> None:
        """Setup local processors based on GPU preferences."""
        local_processors = [
            PDFProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled),
            DOCXProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            TXTProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            ExcelProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            HTMLProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            PPTXProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
            ImageProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled),
            URLProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images),
        ]

        # Add GPU processor if GPU preference is specified
        if self.gpu:
            logger.info("GPU preference specified - adding GPU processor with Nanonets OCR")
            gpu_processor = GPUProcessor(preserve_layout=self.preserve_layout, include_images=self.include_images, ocr_enabled=self.ocr_enabled)
            local_processors.append(gpu_processor)

        self.processors.extend(local_processors)

    def extract(self, file_path: str) -> ConversionResult:
        """Convert a file to internal format.

        Args:
            file_path: Path to the file to extract

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            UnsupportedFormatError: If the format is not supported
            ConversionError: If conversion fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Find the appropriate processor
        processor = self._get_processor(file_path)
        if not processor:
            raise UnsupportedFormatError(f"No processor found for file: {file_path}")

        logger.info(f"Using processor {processor.__class__.__name__} for {file_path}")

        # Process the file
        return processor.process(file_path)

    def convert_with_output_type(self, file_path: str, output_type: str) -> ConversionResult:
        """Convert a file with specific output type for cloud processing.

        In cloud mode a fresh ``CloudProcessor`` is built with the requested
        output type; in local mode this degrades to a plain :meth:`extract`.

        Args:
            file_path: Path to the file to extract
            output_type: Desired output type (markdown, flat-json, html)

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            UnsupportedFormatError: If the format is not supported
            ConversionError: If conversion fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # For cloud mode, create a processor with the specific output type
        if self.cloud_mode:
            cloud_processor = CloudProcessor(
                api_key=self.api_key,
                output_type=output_type,
                model_type=self.model,
                preserve_layout=self.preserve_layout,
                include_images=self.include_images,
                api_key_pool=self.api_key_pool,
                local_fallback_processor=self.local_gpu_processor
            )
            if cloud_processor.can_process(file_path):
                logger.info(f"Using cloud processor with output_type={output_type} for {file_path}")
                return cloud_processor.process(file_path)

        # Fallback to regular conversion for local mode
        return self.extract(file_path)

    def extract_url(self, url: str) -> ConversionResult:
        """Convert a URL to internal format.

        Args:
            url: URL to extract

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If conversion fails, if cloud mode is active,
                or if no URL processor is registered.
        """
        # Cloud mode doesn't support URL conversion
        if self.cloud_mode:
            raise ConversionError("URL conversion is not supported in cloud mode. Use local mode for URL processing.")

        # Find the URL processor
        url_processor = None
        for processor in self.processors:
            if isinstance(processor, URLProcessor):
                url_processor = processor
                break

        if not url_processor:
            raise ConversionError("URL processor not available")

        logger.info(f"Converting URL: {url}")
        return url_processor.process(url)

    def extract_text(self, text: str) -> ConversionResult:
        """Convert plain text to internal format.

        Args:
            text: Plain text to extract

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If cloud mode is active (text conversion is
                local-only).
        """
        # Cloud mode doesn't support text conversion
        if self.cloud_mode:
            raise ConversionError("Text conversion is not supported in cloud mode. Use local mode for text processing.")

        metadata = {
            "content_type": "text",
            "processor": "TextConverter",
            "preserve_layout": self.preserve_layout
        }

        return ConversionResult(text, metadata)

    def is_cloud_enabled(self) -> bool:
        """Check if cloud processing is enabled and configured.

        Returns:
            True if cloud processing is available
        """
        return self.cloud_mode and (bool(self.api_key) or self.api_key_pool.has_available_keys())

    def get_processing_mode(self) -> str:
        """Get the current processing mode.

        Returns:
            String describing the current processing mode
        """
        pool_stats = self.api_key_pool.get_pool_stats()
        if self.cloud_mode and pool_stats["available"] > 0:
            return f"cloud ({pool_stats['available']} key(s))"
        elif self.cloud_mode and self.local_gpu_processor:
            return "cloud (local fallback ready)"
        elif self.gpu:
            return "gpu_forced"
        elif should_use_gpu_processor():
            return "gpu_auto"
        else:
            return "cloud"

    def get_api_key_pool_stats(self) -> dict:
        """Get API key pool statistics.

        Returns:
            Dictionary with pool statistics
        """
        return self.api_key_pool.get_pool_stats()

    def _get_processor(self, file_path: str):
        """Get the appropriate processor for the file.

        Prefers the GPU processor for GPU-capable formats when a GPU is
        requested or auto-detected, then falls back to the first registered
        processor whose ``can_process`` accepts the file.

        Args:
            file_path: Path to the file

        Returns:
            Processor that can handle the file, or None if none found
        """
        # Define GPU-supported formats
        gpu_supported_formats = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif', '.pdf']

        # Check file extension
        _, ext = os.path.splitext(file_path.lower())

        # Check if GPU processor should be used for this file type
        gpu_available = should_use_gpu_processor()

        # Try GPU processor only if format is supported AND (gpu OR auto-gpu).
        # NOTE(review): the condition reduces to (self.gpu or gpu_available);
        # kept verbatim for traceability.
        if ext in gpu_supported_formats and (self.gpu or (gpu_available and not self.gpu)):
            for processor in self.processors:
                if isinstance(processor, GPUProcessor):
                    if self.gpu:
                        logger.info(f"Using GPU processor with Nanonets OCR for {file_path} (GPU preference specified)")
                    else:
                        logger.info(f"Using GPU processor with Nanonets OCR for {file_path} (GPU available and format supported)")
                    return processor

        # Fallback to normal processor selection
        for processor in self.processors:
            if processor.can_process(file_path):
                # Skip GPU processor in fallback mode to avoid infinite loops
                if isinstance(processor, GPUProcessor):
                    continue
                logger.info(f"Using {processor.__class__.__name__} for {file_path}")
                return processor
        return None

    def get_supported_formats(self) -> List[str]:
        """Get list of supported file formats.

        Returns:
            List of supported file extensions
        """
        formats = []
        for processor in self.processors:
            if hasattr(processor, 'can_process'):
                # This is a simplified way to get formats
                # In a real implementation, you might want to store this info
                if isinstance(processor, PDFProcessor):
                    formats.extend(['.pdf'])
                elif isinstance(processor, DOCXProcessor):
                    formats.extend(['.docx', '.doc'])
                elif isinstance(processor, TXTProcessor):
                    formats.extend(['.txt', '.text'])
                elif isinstance(processor, ExcelProcessor):
                    formats.extend(['.xlsx', '.xls', '.csv'])
                elif isinstance(processor, HTMLProcessor):
                    formats.extend(['.html', '.htm'])
                elif isinstance(processor, PPTXProcessor):
                    formats.extend(['.ppt', '.pptx'])
                elif isinstance(processor, ImageProcessor):
                    formats.extend(['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif'])
                elif isinstance(processor, URLProcessor):
                    formats.append('URLs')
                elif isinstance(processor, CloudProcessor):
                    # Cloud processor supports many formats, but we don't want duplicates
                    pass
                elif isinstance(processor, GPUProcessor):
                    # GPU processor supports all image formats and PDFs
                    formats.extend(['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif', '.pdf'])

        return list(set(formats))  # Remove duplicates
docstrange/pipeline/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Pipeline package for document processing and OCR."""
docstrange/pipeline/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (204 Bytes). View file
 
docstrange/pipeline/__pycache__/ocr_service.cpython-310.pyc ADDED
Binary file (5.79 kB). View file
 
docstrange/pipeline/layout_detector.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Layout detection and markdown generation for document processing."""
2
+
3
+ import re
4
+ import logging
5
+ from typing import List, Dict, Tuple
6
+ import numpy as np
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class LayoutElement:
    """Represents a layout element with position and content.

    Coordinates follow image convention: ``x`` grows rightwards and ``y``
    grows downwards, with ``(x, y)`` the top-left corner of the element.
    """

    def __init__(self, text: str, x: int, y: int, width: int, height: int,
                 element_type: str = "text", confidence: float = 0.0):
        """Initialize the element.

        Args:
            text: Recognized text content of the element.
            x: Left coordinate in pixels.
            y: Top coordinate in pixels.
            width: Element width in pixels.
            height: Element height in pixels.
            element_type: Semantic kind of the element (default "text").
            confidence: OCR confidence score for the text.
        """
        self.text = text
        self.x = x
        self.y = y
        self.width = width
        self.height = height
        self.element_type = element_type
        self.confidence = confidence
        # Cached bounding box (x1, y1, x2, y2); derived once at construction
        # and not refreshed if the coordinate fields are mutated afterwards.
        self.bbox = (x, y, x + width, y + height)

    def area(self) -> int:
        """Calculate area of the element."""
        return self.width * self.height

    def center_y(self) -> float:
        """Get center Y coordinate."""
        return self.y + self.height / 2

    def center_x(self) -> float:
        """Get center X coordinate."""
        return self.x + self.width / 2

    def __repr__(self) -> str:
        """Unambiguous debug representation (added for debuggability)."""
        return (f"{type(self).__name__}(text={self.text!r}, x={self.x}, "
                f"y={self.y}, width={self.width}, height={self.height}, "
                f"element_type={self.element_type!r}, confidence={self.confidence})")
36
+
37
+
38
class LayoutDetector:
    """Handles layout detection and markdown generation.

    Works on OCR output: groups positioned text blocks into paragraphs,
    classifies each paragraph (heading / list item / table row / text) with
    simple text heuristics, and renders the result as markdown.
    """

    def __init__(self):
        """Initialize the layout detector."""
        # Layout detection parameters
        self._header_threshold = 0.15  # Top 15% of page considered header area
        self._footer_threshold = 0.85  # Bottom 15% of page considered footer area
        self._heading_height_threshold = 1.5  # Relative height for heading detection
        # NOTE(review): the three thresholds above are not referenced by the
        # methods visible in this class; classification is purely text-based.
        self._list_patterns = [
            r'^\d+\.',  # Numbered list
            r'^[•·▪▫◦‣⁃]',  # Bullet points
            r'^[-*+]',  # Markdown list markers
            r'^[a-zA-Z]\.',  # Lettered list
        ]

    def convert_to_structured_markdown(self, text_blocks: List[LayoutElement], image_size: Tuple[int, int]) -> str:
        """Convert text blocks to structured markdown with proper hierarchy.

        Args:
            text_blocks: Positioned OCR text elements; sorted in place.
            image_size: (width, height) of the source page image.

        Returns:
            Markdown string with paragraphs separated by blank lines.
        """
        if not text_blocks:
            return ""

        # Sort blocks by vertical position (top to bottom), then horizontal (left to right)
        text_blocks.sort(key=lambda x: (x.y, x.x))

        # Group blocks into paragraphs based on vertical spacing and text analysis
        paragraphs = self._group_into_paragraphs_advanced(text_blocks, image_size)

        # Convert paragraphs to markdown
        markdown_parts = []

        for paragraph in paragraphs:
            if paragraph:
                # Determine if this paragraph is a heading, list, or regular text
                paragraph_type = self._classify_paragraph(paragraph)

                if paragraph_type == "heading":
                    level = self._determine_heading_level_from_text(paragraph)
                    markdown_parts.append(f"{'#' * level} {paragraph}")
                elif paragraph_type == "list_item":
                    markdown_parts.append(f"- {paragraph}")
                elif paragraph_type == "table_row":
                    markdown_parts.append(self._format_table_row(paragraph))
                else:
                    markdown_parts.append(paragraph)

        return '\n\n'.join(markdown_parts)

    def _group_into_paragraphs_advanced(self, text_blocks: List[LayoutElement], image_size: Tuple[int, int]) -> List[str]:
        """Advanced paragraph grouping using multiple heuristics.

        Blocks whose vertical positions differ by less than 1.5x the average
        text height are merged into one paragraph.

        NOTE(review): ``image_size`` is currently unused here.
        """
        if not text_blocks:
            return []

        # Calculate average text height for relative sizing
        heights = [block.height for block in text_blocks]
        avg_height = np.mean(heights) if heights else 20

        # Group by proximity and text characteristics
        paragraphs = []
        current_paragraph = []
        current_y = text_blocks[0].y
        paragraph_threshold = 1.5 * avg_height  # Dynamic threshold based on text size

        for block in text_blocks:
            # Check if this block is part of the same paragraph.
            # current_y is the anchor y of the paragraph's first block, not a
            # running average — a tall, slanted column can exceed the threshold.
            if abs(block.y - current_y) <= paragraph_threshold:
                current_paragraph.append(block)
            else:
                # Start new paragraph
                if current_paragraph:
                    paragraph_text = self._join_paragraph_text_advanced(current_paragraph)
                    if paragraph_text:
                        paragraphs.append(paragraph_text)
                current_paragraph = [block]
                current_y = block.y

        # Add the last paragraph
        if current_paragraph:
            paragraph_text = self._join_paragraph_text_advanced(current_paragraph)
            if paragraph_text:
                paragraphs.append(paragraph_text)

        return paragraphs

    def _join_paragraph_text_advanced(self, text_blocks: List[LayoutElement]) -> str:
        """Join text blocks into a coherent paragraph with better text processing."""
        if not text_blocks:
            return ""

        # Sort blocks by reading order (left to right, top to bottom)
        text_blocks.sort(key=lambda x: (x.y, x.x))

        # Extract and clean text
        texts = []
        for block in text_blocks:
            text = block.text.strip()
            if text:
                texts.append(text)

        if not texts:
            return ""

        # Join with smart spacing
        result = ""
        for i, text in enumerate(texts):
            if i == 0:
                result = text
            else:
                # Check if we need a space before this text
                prev_char = result[-1] if result else ""
                curr_char = text[0] if text else ""

                # Don't add space before punctuation
                if curr_char in ',.!?;:':
                    result += text
                # Don't add space after opening parenthesis/bracket
                elif prev_char in '([{':
                    result += text
                # Don't add space before closing parenthesis/bracket
                elif curr_char in ')]}':
                    result += text
                # Don't add space before common punctuation
                # NOTE(review): dead branch — ';' and ':' are already matched
                # by the first condition above.
                elif curr_char in ';:':
                    result += text
                # Handle hyphenation
                elif prev_char == '-' and curr_char.isalpha():
                    result += text
                else:
                    result += " " + text

        # Post-process the text
        result = self._post_process_text(result)

        return result.strip()

    def _post_process_text(self, text: str) -> str:
        """Post-process text to improve readability.

        WARNING: the '|' -> 'I' substitution also removes pipe characters
        that downstream table-row detection looks for; paragraphs passing
        through here can only be detected as tables via spacing/tabs.
        """
        # Fix common OCR issues
        text = text.replace('|', 'I')  # Common OCR mistake

        # Note: We intentionally do NOT replace '0' with 'o' or '1' with 'l'
        # as this would corrupt numeric data (e.g., "100" -> "ool", "2024" -> "oool")

        # Fix spacing issues
        text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single space
        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)  # Fix sentence spacing

        # Fix common OCR artifacts
        text = re.sub(r'[^\w\s.,!?;:()[\]{}"\'-]', '', text)  # Remove strange characters

        return text

    def _classify_paragraph(self, text: str) -> str:
        """Classify a paragraph as heading, list item, table row, or regular text.

        Returns one of: "list_item", "table_row", "heading", "text".
        """
        text = text.strip()

        # Check if it's a list item
        if self._is_list_item(text):
            return "list_item"

        # Check if it's a table row
        if self._is_table_row(text):
            return "table_row"

        # Check if it's a heading (short text, ends with period, or all caps)
        if len(text.split()) <= 5 and (text.endswith('.') or text.isupper()):
            return "heading"

        return "text"

    def _determine_heading_level_from_text(self, text: str) -> int:
        """Determine heading level based on text characteristics.

        Shorter headings map to higher levels: <=3 words -> h1,
        <=5 words -> h2, otherwise h3.
        """
        text = text.strip()

        # Short text is likely a higher level heading
        if len(text.split()) <= 3:
            return 1
        elif len(text.split()) <= 5:
            return 2
        else:
            return 3

    def _is_list_item(self, text: str) -> bool:
        """Check if text is a list item."""
        text = text.strip()
        for pattern in self._list_patterns:
            if re.match(pattern, text):
                return True
        return False

    def _is_table_row(self, text: str) -> bool:
        """Check if text might be a table row."""
        # Simple heuristic: if text contains multiple tab-separated or pipe-separated parts
        if '|' in text or '\t' in text:
            return True

        # Check for regular spacing that might indicate table columns
        words = text.split()
        if len(words) >= 4:  # More words likely indicate table data
            # Check if there are multiple spaces between words (indicating columns).
            # NOTE(review): the literal below must be TWO spaces; a single
            # space would classify nearly every 4+ word line as a table row.
            if '  ' in text:  # Double spaces often indicate column separation
                return True

        return False

    def _format_table_row(self, text: str) -> str:
        """Format text as a table row."""
        # Split by common table separators
        if '|' in text:
            cells = [cell.strip() for cell in text.split('|')]
        elif '\t' in text:
            cells = [cell.strip() for cell in text.split('\t')]
        else:
            # Try to split by multiple spaces
            cells = [cell.strip() for cell in re.split(r'\s{2,}', text)]

        # Format as markdown table row
        return '| ' + ' | '.join(cells) + ' |'

    def join_text_properly(self, texts: List[str]) -> str:
        """Join text words into proper sentences and paragraphs.

        Like :meth:`_join_paragraph_text_advanced`'s joining pass, but
        operates on raw strings and skips the post-processing step.
        """
        if not texts:
            return ""

        # Clean and join text
        cleaned_texts = []
        for text in texts:
            # Remove extra whitespace
            text = text.strip()
            if text:
                cleaned_texts.append(text)

        if not cleaned_texts:
            return ""

        # Join with spaces, but be smart about punctuation
        result = ""
        for i, text in enumerate(cleaned_texts):
            if i == 0:
                result = text
            else:
                # Check if we need a space before this word
                prev_char = result[-1] if result else ""
                curr_char = text[0] if text else ""

                # Don't add space before punctuation
                if curr_char in ',.!?;:':
                    result += text
                # Don't add space after opening parenthesis/bracket
                elif prev_char in '([{':
                    result += text
                # Don't add space before closing parenthesis/bracket
                elif curr_char in ')]}':
                    result += text
                else:
                    result += " " + text

        return result.strip()

    def create_layout_element_from_block(self, block_data: List[Dict]) -> LayoutElement:
        """Create a LayoutElement from a block of text data.

        Args:
            block_data: Word-level OCR dicts, each with keys 'text', 'x',
                'y', 'width', 'height', 'conf', 'line_num', 'word_num'
                (Tesseract-style TSV fields).

        Returns:
            A single LayoutElement covering the union bounding box of all
            words, with text joined in reading order and confidence averaged.
        """
        if not block_data:
            return LayoutElement("", 0, 0, 0, 0)

        # Sort by line_num and word_num to maintain reading order
        block_data.sort(key=lambda x: (x['line_num'], x['word_num']))

        # Extract text and position information
        texts = [item['text'] for item in block_data]
        x_coords = [item['x'] for item in block_data]
        y_coords = [item['y'] for item in block_data]
        widths = [item['width'] for item in block_data]
        heights = [item['height'] for item in block_data]
        confidences = [item['conf'] for item in block_data]

        # Calculate bounding box
        min_x = min(x_coords)
        min_y = min(y_coords)
        max_x = max(x + w for x, w in zip(x_coords, widths))
        max_y = max(y + h for y, h in zip(y_coords, heights))

        # Join text with proper spacing
        text = self.join_text_properly(texts)

        return LayoutElement(
            text=text,
            x=min_x,
            y=min_y,
            width=max_x - min_x,
            height=max_y - min_y,
            element_type="text",
            confidence=np.mean(confidences) if confidences else 0.0
        )
docstrange/pipeline/model_downloader.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model downloader utility for downloading pre-trained models from Hugging Face."""
2
+
3
+ import logging
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Optional
7
+ import requests
8
+ from tqdm import tqdm
9
+ from ..utils.gpu_utils import is_gpu_available, get_gpu_info
10
+
11
logger = logging.getLogger(__name__)


class ModelDownloader:
    """Downloads pre-trained models from Nanonets S3 (primary) or Hugging Face (fallback).

    Models are cached under ``cache_dir`` (default
    ``~/.cache/docstrange/models``), one sub-folder per model.
    """

    # Nanonets S3 model URLs (primary source)
    S3_BASE_URL = "https://public-vlms.s3-us-west-2.amazonaws.com/llm-data-extractor"

    # (connect, read) timeouts in seconds for S3 downloads. Without a timeout,
    # requests can block forever on a stalled connection.
    DOWNLOAD_TIMEOUT = (30, 300)

    # Model configurations with both S3 and HuggingFace sources
    LAYOUT_MODEL = {
        "s3_url": f"{S3_BASE_URL}/layout-model-v2.2.0.tar.gz",
        "repo_id": "ds4sd/docling-models",
        "revision": "v2.2.0",
        "model_path": "model_artifacts/layout",
        "cache_folder": "layout"
    }

    TABLE_MODEL = {
        "s3_url": f"{S3_BASE_URL}/tableformer-model-v2.2.0.tar.gz",
        "repo_id": "ds4sd/docling-models",
        "revision": "v2.2.0",
        "model_path": "model_artifacts/tableformer",
        "cache_folder": "tableformer"
    }

    # Nanonets OCR model configuration (only downloaded when a GPU is present)
    NANONETS_OCR_MODEL = {
        "s3_url": f"{S3_BASE_URL}/Nanonets-OCR-s.tar.gz",
        "repo_id": "nanonets/Nanonets-OCR-s",
        "revision": "main",
        "cache_folder": "nanonets-ocr",
    }

    # Note: EasyOCR downloads its own models automatically, no need for custom model

    def __init__(self, cache_dir: Optional[Path] = None):
        """Initialize the model downloader.

        Args:
            cache_dir: Directory to cache downloaded models. Defaults to
                ``~/.cache/docstrange/models``; created if missing.
        """
        if cache_dir is None:
            cache_dir = Path.home() / ".cache" / "docstrange" / "models"

        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"Model cache directory: {self.cache_dir}")

    def download_models(self, force: bool = False, progress: bool = True) -> Path:
        """Download all required models.

        The Nanonets OCR model is only included when a GPU is detected,
        since it is not used in CPU/cloud mode.

        Args:
            force: Force re-download even if models exist
            progress: Show download progress

        Returns:
            Path to the models directory
        """
        logger.info("Downloading pre-trained models...")

        # Auto-detect GPU for Nanonets model
        gpu_available = is_gpu_available()
        logger.info(f"GPU available: {gpu_available}")
        if gpu_available:
            logger.info("GPU detected - including Nanonets OCR model")
        else:
            logger.info("No GPU detected - skipping Nanonets OCR model (cloud mode)")

        models_to_download = [
            ("Layout Model", self.LAYOUT_MODEL),
            ("Table Structure Model", self.TABLE_MODEL)
        ]

        # Add Nanonets OCR model only if GPU is available
        if gpu_available:
            models_to_download.append(("Nanonets OCR Model", self.NANONETS_OCR_MODEL))

        for model_name, model_config in models_to_download:
            logger.info(f"Downloading {model_name}...")
            self._download_model(model_config, force, progress)

        logger.info("All models downloaded successfully!")
        return self.cache_dir

    def _download_model(self, model_config: dict, force: bool, progress: bool):
        """Download a specific model, trying S3 first with an HF fallback.

        Args:
            model_config: Model configuration dictionary
            force: Force re-download
            progress: Show progress
        """
        model_dir = self.cache_dir / model_config["cache_folder"]

        if model_dir.exists() and not force:
            logger.info(f"Model already exists at {model_dir}")
            return

        # Create model directory
        model_dir.mkdir(parents=True, exist_ok=True)

        success = False

        # Check if user prefers Hugging Face via environment variable
        prefer_hf = os.environ.get("document_extractor_PREFER_HF", "false").lower() == "true"

        # Try S3 first (Nanonets hosted models) unless user prefers HF
        if not prefer_hf and "s3_url" in model_config:
            try:
                logger.info(f"Downloading from Nanonets S3: {model_config['s3_url']}")
                self._download_from_s3(
                    s3_url=model_config["s3_url"],
                    local_dir=model_dir,
                    force=force,
                    progress=progress
                )
                success = True
                logger.info("Successfully downloaded from Nanonets S3")
            except Exception as e:
                logger.warning(f"S3 download failed: {e}")
                logger.info("Falling back to Hugging Face...")

        # Fallback to Hugging Face if S3 fails
        if not success:
            self._download_from_hf(
                repo_id=model_config["repo_id"],
                revision=model_config["revision"],
                local_dir=model_dir,
                force=force,
                progress=progress
            )

    def _download_from_hf(self, repo_id: str, revision: str, local_dir: Path,
                          force: bool, progress: bool):
        """Download model from Hugging Face using docling's logic.

        Authentication (401) failures are logged but NOT raised, so the
        library can continue with basic OCR capabilities.

        Args:
            repo_id: Hugging Face repository ID
            revision: Git revision/tag
            local_dir: Local directory to save model
            force: Force re-download
            progress: Show progress
        """
        try:
            from huggingface_hub import snapshot_download
            from huggingface_hub.utils import disable_progress_bars
            import huggingface_hub

            if not progress:
                disable_progress_bars()

            # Check if models are already downloaded
            if local_dir.exists() and any(local_dir.iterdir()):
                logger.info(f"Model {repo_id} already exists at {local_dir}")
                return

            # Try to download with current authentication
            try:
                download_path = snapshot_download(
                    repo_id=repo_id,
                    force_download=force,
                    local_dir=str(local_dir),
                    revision=revision,
                    token=None,  # Use default token if available
                )
                logger.info(f"Successfully downloaded {repo_id} to {download_path}")

            except huggingface_hub.errors.HfHubHTTPError as e:
                if "401" in str(e) or "Unauthorized" in str(e):
                    logger.warning(
                        f"Authentication failed for {repo_id}. This model may require a Hugging Face token.\n"
                        "To fix this:\n"
                        "1. Create a free account at https://huggingface.co/\n"
                        "2. Generate a token at https://huggingface.co/settings/tokens\n"
                        "3. Set it as environment variable: export HF_TOKEN='your_token_here'\n"
                        "4. Or run: huggingface-cli login\n\n"
                        "The library will continue with basic OCR capabilities."
                    )
                    # Don't raise the error, just log it and continue
                    return
                else:
                    raise

        except ImportError:
            logger.error("huggingface_hub not available. Please install it: pip install huggingface_hub")
            raise
        except Exception as e:
            logger.error(f"Failed to download model {repo_id}: {e}")
            # Don't raise for authentication errors - allow fallback processing
            if "401" not in str(e) and "Unauthorized" not in str(e):
                raise

    def get_model_path(self, model_type: str) -> Optional[Path]:
        """Get the path to a specific model.

        Args:
            model_type: Type of model ('layout', 'table', 'nanonets-ocr')

        Returns:
            Path to the model directory, or None if unknown or not cached
        """
        model_mapping = {
            'layout': self.LAYOUT_MODEL["cache_folder"],
            'table': self.TABLE_MODEL["cache_folder"],
            'nanonets-ocr': self.NANONETS_OCR_MODEL["cache_folder"]
        }

        if model_type not in model_mapping:
            logger.error(f"Unknown model type: {model_type}")
            return None

        model_path = self.cache_dir / model_mapping[model_type]

        if not model_path.exists():
            logger.warning(f"Model {model_type} not found at {model_path}")
            return None

        return model_path

    def are_models_cached(self) -> bool:
        """Check if all required models are cached.

        The Nanonets OCR model is only required when a GPU is available.

        Returns:
            True if all required models are cached, False otherwise
        """
        layout_path = self.get_model_path('layout')
        table_path = self.get_model_path('table')

        # Only check for Nanonets model if GPU is available
        if is_gpu_available():
            nanonets_path = self.get_model_path('nanonets-ocr')
            return layout_path is not None and table_path is not None and nanonets_path is not None
        else:
            return layout_path is not None and table_path is not None

    def _download_from_s3(self, s3_url: str, local_dir: Path, force: bool, progress: bool):
        """Download and extract a model archive from Nanonets S3.

        Args:
            s3_url: S3 URL of the model archive (.tar.gz)
            local_dir: Local directory to extract model into
            force: Force re-download
            progress: Show a progress bar
        """
        import tarfile
        import tempfile

        # Stream the archive to a temp file. Timeout prevents an indefinite
        # hang on a stalled connection.
        response = requests.get(s3_url, stream=True, timeout=self.DOWNLOAD_TIMEOUT)
        response.raise_for_status()

        total_size = int(response.headers.get('content-length', 0))

        with tempfile.NamedTemporaryFile(suffix='.tar.gz', delete=False) as tmp_file:
            if progress and total_size > 0:
                with tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            tmp_file.write(chunk)
                            pbar.update(len(chunk))
            else:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        tmp_file.write(chunk)

            tmp_file_path = tmp_file.name

        try:
            # Extract the archive
            logger.info(f"Extracting model to {local_dir}")
            with tarfile.open(tmp_file_path, 'r:gz') as tar:
                # Guard against path traversal ("tar slip"): refuse any member
                # that would resolve outside the target directory.
                base = local_dir.resolve()
                for member in tar.getmembers():
                    target = (base / member.name).resolve()
                    if target != base and base not in target.parents:
                        raise RuntimeError(f"Unsafe path in model archive: {member.name}")
                tar.extractall(path=local_dir)

            logger.info("Model extraction completed successfully")

        finally:
            # Clean up temporary file
            try:
                os.unlink(tmp_file_path)
            except OSError:
                pass

    def get_cache_info(self) -> dict:
        """Get information about cached models.

        Returns:
            Dictionary with cache directory, GPU info, and per-model status
        """
        info = {
            'cache_dir': str(self.cache_dir),
            'gpu_info': get_gpu_info(),
            'models': {}
        }

        # Always check layout and table models
        for model_type in ['layout', 'table']:
            path = self.get_model_path(model_type)
            info['models'][model_type] = {
                'cached': path is not None,
                'path': str(path) if path else None
            }

        # Only check Nanonets model if GPU is available
        if is_gpu_available():
            path = self.get_model_path('nanonets-ocr')
            info['models']['nanonets-ocr'] = {
                'cached': path is not None,
                'path': str(path) if path else None,
                'gpu_required': True
            }
        else:
            info['models']['nanonets-ocr'] = {
                'cached': False,
                'path': None,
                'gpu_required': True,
                'skipped': 'No GPU available'
            }

        return info
docstrange/pipeline/nanonets_processor.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Neural Document Processor using Nanonets OCR for superior document understanding."""
2
+
3
+ import logging
4
+ import os
5
+ from typing import Optional
6
+ from pathlib import Path
7
+ from PIL import Image
8
+
9
logger = logging.getLogger(__name__)


class NanonetsDocumentProcessor:
    """Neural Document Processor using the Nanonets OCR model.

    Loads the locally cached Nanonets-OCR-s model via transformers and
    exposes simple text-extraction entry points that return "" on failure.
    """

    def __init__(self, cache_dir: Optional[Path] = None):
        """Initialize the Neural Document Processor with Nanonets OCR.

        Args:
            cache_dir: Optional model cache directory passed to ModelDownloader.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the cached model files cannot be located.
        """
        logger.info("Initializing Neural Document Processor with Nanonets OCR...")

        # Initialize models
        self._initialize_models(cache_dir)

        logger.info("Neural Document Processor initialized successfully")

    def _initialize_models(self, cache_dir: Optional[Path] = None):
        """Initialize Nanonets OCR model from local cache."""
        try:
            from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
            from .model_downloader import ModelDownloader

            # Get model downloader instance
            model_downloader = ModelDownloader(cache_dir)

            # Get the path to the locally cached Nanonets model
            model_path = model_downloader.get_model_path('nanonets-ocr')

            if model_path is None:
                raise RuntimeError(
                    "Failed to download Nanonets OCR model. "
                    "Please ensure you have sufficient disk space and internet connection."
                )

            # The model files live in a subdirectory of the cache folder.
            # Accept both the historical "Nanonets-OCR-ss" name and
            # "Nanonets-OCR-s" (the S3 archive / HF repo name), so either
            # extraction layout works.
            actual_model_path = model_path / "Nanonets-OCR-ss"
            if not actual_model_path.exists():
                alt_model_path = model_path / "Nanonets-OCR-s"
                if alt_model_path.exists():
                    actual_model_path = alt_model_path

            if not actual_model_path.exists():
                raise RuntimeError(
                    f"Model files not found at expected path: {actual_model_path}"
                )

            logger.info(f"Loading Nanonets OCR model from local cache: {actual_model_path}")

            # Load model from local path only (no network access).
            self.model = AutoModelForImageTextToText.from_pretrained(
                str(actual_model_path),
                torch_dtype="auto",
                device_map="auto",
                local_files_only=True  # Use only local files
            )
            self.model.eval()

            self.tokenizer = AutoTokenizer.from_pretrained(
                str(actual_model_path),
                local_files_only=True
            )
            self.processor = AutoProcessor.from_pretrained(
                str(actual_model_path),
                local_files_only=True
            )

            logger.info("Nanonets OCR model loaded successfully from local cache")

        except ImportError as e:
            logger.error(f"Transformers library not available: {e}")
            raise ImportError(
                "Transformers library is required for Nanonets OCR. "
                "Please install it: pip install transformers"
            )
        except Exception as e:
            logger.error(f"Failed to initialize Nanonets OCR model: {e}")
            raise

    def extract_text(self, image_path: str) -> str:
        """Extract text from image using Nanonets OCR.

        Returns "" (never raises) for missing files or OCR failures.
        """
        try:
            if not os.path.exists(image_path):
                logger.error(f"Image file does not exist: {image_path}")
                return ""

            return self._extract_text_with_nanonets(image_path)

        except Exception as e:
            logger.error(f"Nanonets OCR extraction failed: {e}")
            return ""

    def extract_text_with_layout(self, image_path: str) -> str:
        """Extract text with layout awareness using Nanonets OCR.

        Note: Nanonets OCR already provides layout-aware extraction,
        so this method returns the same result as extract_text().
        """
        return self.extract_text(image_path)

    def _extract_text_with_nanonets(self, image_path: str, max_new_tokens: int = 4096) -> str:
        """Extract text using the Nanonets OCR model.

        Builds a chat-style prompt (tables as HTML, equations as LaTeX,
        tagged watermarks/page numbers), runs greedy generation, and decodes
        only the newly generated tokens.
        """
        try:
            prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""

            image = Image.open(image_path)
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": [
                    {"type": "image", "image": f"file://{image_path}"},
                    {"type": "text", "text": prompt},
                ]},
            ]

            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
            inputs = inputs.to(self.model.device)

            # do_sample=False gives deterministic (greedy) decoding.
            output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
            # Strip the prompt tokens so only generated text is decoded.
            generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]

            output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            return output_text[0]

        except Exception as e:
            logger.error(f"Nanonets OCR extraction failed: {e}")
            return ""
docstrange/pipeline/neural_document_processor.py ADDED
@@ -0,0 +1,644 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Neural Document Processor using docling's pre-trained models for superior document understanding."""
2
+
3
+ import logging
4
+ import os
5
+ import platform
6
+ import sys
7
+ from typing import Optional, List, Dict, Any, Tuple
8
+ from pathlib import Path
9
+ from PIL import Image
10
+ import numpy as np
11
+
12
# macOS-specific NumPy compatibility fix: on Darwin with NumPy 2.x, set
# best-effort compatibility environment variables and warn the user.
# NOTE(review): whether these env vars are honored depends on the installed
# NumPy/PyTorch versions — confirm they still have an effect.
if platform.system() == "Darwin":
    try:
        import numpy as np
        # Check if we're on NumPy 2.x (string-prefix check on the version).
        if hasattr(np, '__version__') and np.__version__.startswith('2'):
            # Set environment variable to use NumPy 1.x compatibility mode
            os.environ['NUMPY_EXPERIMENTAL_ARRAY_FUNCTION'] = '0'
            # Also set this for PyTorch compatibility
            os.environ['PYTORCH_NUMPY_COMPATIBILITY'] = '1'
            logger = logging.getLogger(__name__)
            logger.warning(
                "NumPy 2.x detected on macOS. This may cause compatibility issues. "
                "Consider downgrading to NumPy 1.x: pip install 'numpy<2.0.0'"
            )
    except ImportError:
        # NumPy not installed: nothing to patch; later imports will report it.
        pass
29
+
30
+ # Runtime NumPy version check
31
+ def _check_numpy_version():
32
+ """Check NumPy version and warn about compatibility issues."""
33
+ try:
34
+ import numpy as np
35
+ version = np.__version__
36
+ if version.startswith('2'):
37
+ logger = logging.getLogger(__name__)
38
+ logger.error(
39
+ f"NumPy {version} detected. This library requires NumPy 1.x for compatibility "
40
+ "with docling models. Please downgrade NumPy:\n"
41
+ "pip install 'numpy<2.0.0'\n"
42
+ "or\n"
43
+ "pip install --upgrade llm-data-extractor"
44
+ )
45
+ if platform.system() == "Darwin":
46
+ logger.error(
47
+ "On macOS, NumPy 2.x is known to cause crashes with PyTorch. "
48
+ "Downgrading to NumPy 1.x is strongly recommended."
49
+ )
50
+ return False
51
+ return True
52
+ except ImportError:
53
+ return True
54
+
55
+ from .model_downloader import ModelDownloader
56
+ from .layout_detector import LayoutDetector
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
+
61
+ class NeuralDocumentProcessor:
62
+ """Neural Document Processor using docling's pre-trained models."""
63
+
64
    def __init__(self, cache_dir: Optional[Path] = None):
        """Initialize the Neural Document Processor.

        Args:
            cache_dir: Optional model cache directory, forwarded to
                ModelDownloader (defaults to its standard location).

        Raises:
            RuntimeError: If an incompatible NumPy 2.x install is detected.
        """
        logger.info("Initializing Neural Document Processor...")

        # Check NumPy version compatibility before touching any models.
        if not _check_numpy_version():
            raise RuntimeError(
                "Incompatible NumPy version detected. Please downgrade to NumPy 1.x: "
                "pip install 'numpy<2.0.0'"
            )

        # Initialize model downloader (resolves/downloads cached models).
        self.model_downloader = ModelDownloader(cache_dir)

        # Initialize layout detector
        self.layout_detector = LayoutDetector()

        # Initialize models (paths first, then the docling neural models).
        self._initialize_models()

        logger.info("Neural Document Processor initialized successfully")
85
+
86
    def _initialize_models(self):
        """Initialize all required models.

        Runs the two setup phases in order — resolve/download model paths,
        then load the docling neural models — and re-raises any failure
        after logging it.
        """
        try:
            # Initialize model paths
            self._initialize_model_paths()

            # Initialize docling neural models
            self._initialize_docling_models()

        except Exception as e:
            logger.error(f"Failed to initialize models: {e}")
            raise
98
+
99
    def _initialize_model_paths(self):
        """Initialize paths to downloaded models.

        Resolves (downloading if needed) the layout and table model
        directories, then navigates into the expected artifact subfolders.
        On authentication failures this sets ``self._use_fallback_mode``
        instead of raising, so basic processing can continue.

        Raises:
            ValueError: If download fails for a non-authentication reason.
            FileNotFoundError: If expected model files are missing in every
                checked location.
        """
        from .model_downloader import ModelDownloader

        downloader = ModelDownloader()

        # Check if models exist, if not download them
        layout_path = downloader.get_model_path('layout')
        table_path = downloader.get_model_path('table')

        # If any model is missing, download all models
        if not layout_path or not table_path:
            logger.info("Some models are missing. Downloading all required models...")
            logger.info(f"Models will be cached at: {downloader.cache_dir}")
            try:
                downloader.download_models(force=False, progress=True)
                # Get paths again after download
                layout_path = downloader.get_model_path('layout')
                table_path = downloader.get_model_path('table')

                # Check if download was successful
                if layout_path and table_path:
                    logger.info("Model download completed successfully!")
                else:
                    logger.warning("Some models may not have downloaded successfully due to authentication issues.")
                    logger.info("Falling back to basic document processing without advanced neural models.")
                    # Set flags to indicate fallback mode
                    self._use_fallback_mode = True
                    return

            except Exception as e:
                logger.warning(f"Failed to download models: {e}")
                # Auth errors degrade to fallback mode; anything else is fatal.
                if "401" in str(e) or "Unauthorized" in str(e) or "Authentication" in str(e):
                    logger.info(
                        "Model download failed due to authentication. Using basic document processing.\n"
                        "For enhanced features, please set up Hugging Face authentication:\n"
                        "1. Create account at https://huggingface.co/\n"
                        "2. Generate token at https://huggingface.co/settings/tokens\n"
                        "3. Run: huggingface-cli login"
                    )
                    self._use_fallback_mode = True
                    return
                else:
                    raise ValueError(f"Failed to download required models: {e}")
        else:
            logger.info("All required models found in cache.")

        # Set fallback mode flag
        self._use_fallback_mode = False

        # Set model paths
        self.layout_model_path = layout_path
        self.table_model_path = table_path

        if not self.layout_model_path or not self.table_model_path:
            if hasattr(self, '_use_fallback_mode') and self._use_fallback_mode:
                logger.info("Running in fallback mode without advanced neural models")
                return
            else:
                raise ValueError("One or more required models not found")

        # The models are downloaded with the full repository structure
        # The entire repo is downloaded to each cache folder, so we need to navigate to the specific model paths
        # Layout model is in layout/model_artifacts/layout/
        # Table model is in tableformer/model_artifacts/tableformer/accurate/
        # Note: EasyOCR downloads its own models automatically

        # Check if the expected structure exists, if not use the cache folder directly
        layout_artifacts = self.layout_model_path / "model_artifacts" / "layout"
        table_artifacts = self.table_model_path / "model_artifacts" / "tableformer" / "accurate"

        if layout_artifacts.exists():
            self.layout_model_path = layout_artifacts
        else:
            # Fallback: use the cache folder directly
            logger.warning(f"Expected layout model structure not found, using cache folder directly")

        if table_artifacts.exists():
            self.table_model_path = table_artifacts
        else:
            # Fallback: use the cache folder directly
            logger.warning(f"Expected table model structure not found, using cache folder directly")

        logger.info(f"Layout model path: {self.layout_model_path}")
        logger.info(f"Table model path: {self.table_model_path}")
        logger.info("EasyOCR will download its own models automatically")

        # Verify model files exist (with more flexible checking)
        layout_model_file = self.layout_model_path / "model.safetensors"
        table_config_file = self.table_model_path / "tm_config.json"

        if not layout_model_file.exists():
            # Try alternative locations (one extra nesting level)
            alt_layout_file = self.layout_model_path / "layout" / "model.safetensors"
            if alt_layout_file.exists():
                self.layout_model_path = self.layout_model_path / "layout"
                layout_model_file = alt_layout_file
            else:
                raise FileNotFoundError(f"Missing layout model file. Checked: {layout_model_file}, {alt_layout_file}")

        if not table_config_file.exists():
            # Try alternative locations (one extra nesting level)
            alt_table_file = self.table_model_path / "tableformer" / "accurate" / "tm_config.json"
            if alt_table_file.exists():
                self.table_model_path = self.table_model_path / "tableformer" / "accurate"
                table_config_file = alt_table_file
            else:
                raise FileNotFoundError(f"Missing table config file. Checked: {table_config_file}, {alt_table_file}")
207
+
208
    def _initialize_docling_models(self):
        """Initialize docling's pre-trained models.

        In fallback mode all predictors are set to None and
        ``use_advanced_models`` to False. Otherwise loads the layout
        predictor, the tableformer predictor, and an EasyOCR reader
        (English), all on CPU with 4 threads.
        """
        # Check if we're in fallback mode
        if hasattr(self, '_use_fallback_mode') and self._use_fallback_mode:
            logger.info("Skipping docling models initialization - running in fallback mode")
            self.use_advanced_models = False
            self.layout_predictor = None
            self.table_predictor = None
            self.ocr_reader = None
            return

        try:
            # Import docling models lazily so fallback mode never needs them.
            from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
            from docling_ibm_models.tableformer.common import read_config
            from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
            import easyocr

            # Initialize layout model
            self.layout_predictor = LayoutPredictor(
                artifact_path=str(self.layout_model_path),
                device='cpu',
                num_threads=4
            )

            # Initialize table structure model
            tm_config = read_config(str(self.table_model_path / "tm_config.json"))
            tm_config["model"]["save_dir"] = str(self.table_model_path)
            self.table_predictor = TFPredictor(tm_config, 'cpu', 4)

            # Initialize OCR model (EasyOCR fetches its own weights on demand)
            self.ocr_reader = easyocr.Reader(['en'])

            self.use_advanced_models = True
            logger.info("Docling neural models initialized successfully")

        except ImportError as e:
            logger.error(f"Docling models not available: {e}")
            raise
        except Exception as e:
            error_msg = str(e)
            # Give NumPy-2.x failures an actionable message before re-raising.
            if "NumPy" in error_msg or "numpy" in error_msg.lower():
                logger.error(
                    f"NumPy compatibility error: {error_msg}\n"
                    "This is likely due to NumPy 2.x incompatibility. Please downgrade:\n"
                    "pip install 'numpy<2.0.0'"
                )
                if platform.system() == "Darwin":
                    logger.error(
                        "On macOS, NumPy 2.x is known to cause crashes with PyTorch. "
                        "Downgrading to NumPy 1.x is required."
                    )
            else:
                logger.error(f"Failed to initialize docling models: {e}")
            raise
263
+
264
+ def extract_text(self, image_path: str) -> str:
265
+ """Extract text from image using neural OCR."""
266
+ try:
267
+ if not os.path.exists(image_path):
268
+ logger.error(f"Image file does not exist: {image_path}")
269
+ return ""
270
+
271
+ return self._extract_text_advanced(image_path)
272
+
273
+ except Exception as e:
274
+ logger.error(f"OCR extraction failed: {e}")
275
+ return ""
276
+
277
+ def extract_text_with_layout(self, image_path: str) -> str:
278
+ """Extract text with layout awareness using neural models."""
279
+ try:
280
+ if not os.path.exists(image_path):
281
+ logger.error(f"Image file does not exist: {image_path}")
282
+ return ""
283
+
284
+ return self._extract_text_with_layout_advanced(image_path)
285
+
286
+ except Exception as e:
287
+ logger.error(f"Layout-aware OCR extraction failed: {e}")
288
+ return ""
289
+
290
+ def _extract_text_advanced(self, image_path: str) -> str:
291
+ """Extract text using docling's advanced models."""
292
+ try:
293
+ with Image.open(image_path) as img:
294
+ if img.mode != 'RGB':
295
+ img = img.convert('RGB')
296
+
297
+ results = self.ocr_reader.readtext(img)
298
+ texts = []
299
+ for (bbox, text, confidence) in results:
300
+ if confidence > 0.5:
301
+ texts.append(text)
302
+
303
+ return ' '.join(texts)
304
+
305
+ except Exception as e:
306
+ logger.error(f"Advanced OCR extraction failed: {e}")
307
+ return ""
308
+
309
    def _extract_text_with_layout_advanced(self, image_path: str) -> str:
        """Extract text with layout awareness using docling's neural models.

        Pipeline: run the layout predictor over the image, OCR each detected
        region, classify regions into headings/list items/paragraphs/tables,
        run the table structure model on table regions, then render
        everything to structured markdown. Returns "" on any failure.
        """
        try:
            with Image.open(image_path) as img:
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                # Get layout predictions using neural model
                layout_results = list(self.layout_predictor.predict(img))

                # Process layout results and extract text
                text_blocks = []
                table_blocks = []

                for pred in layout_results:
                    # Normalize label to snake_case for the comparisons below.
                    label = pred.get('label', '').lower().replace(' ', '_').replace('-', '_')

                    # Construct bbox from l, t, r, b (docling's field names);
                    # fall back to a 'bbox'/'box' field, skipping if absent.
                    if all(k in pred for k in ['l', 't', 'r', 'b']):
                        bbox = [pred['l'], pred['t'], pred['r'], pred['b']]
                    else:
                        bbox = pred.get('bbox') or pred.get('box')
                        if not bbox:
                            continue

                    # Extract text from this region using OCR
                    region_text = self._extract_text_from_region(img, bbox)

                    # Drop empty regions and low-confidence predictions.
                    if not region_text or pred.get('confidence', 1.0) < 0.5:
                        continue

                    from .layout_detector import LayoutElement

                    # Handle different element types
                    if label in ['table', 'document_index']:
                        # Process tables separately (structure model below)
                        table_blocks.append({
                            'text': region_text,
                            'bbox': bbox,
                            'label': label,
                            'confidence': pred.get('confidence', 1.0)
                        })
                    elif label in ['title', 'section_header', 'subtitle_level_1']:
                        # Headers
                        text_blocks.append(LayoutElement(
                            text=region_text,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            element_type='heading',
                            confidence=pred.get('confidence', 1.0)
                        ))
                    elif label in ['list_item']:
                        # List items
                        text_blocks.append(LayoutElement(
                            text=region_text,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            element_type='list_item',
                            confidence=pred.get('confidence', 1.0)
                        ))
                    else:
                        # Regular text/paragraphs
                        text_blocks.append(LayoutElement(
                            text=region_text,
                            x=bbox[0],
                            y=bbox[1],
                            width=bbox[2] - bbox[0],
                            height=bbox[3] - bbox[1],
                            element_type='paragraph',
                            confidence=pred.get('confidence', 1.0)
                        ))

                # Sort by position (top to bottom, left to right)
                text_blocks.sort(key=lambda x: (x.y, x.x))

                # Process tables using table structure model
                processed_tables = self._process_tables_with_structure_model(img, table_blocks)

                # Convert to markdown with proper structure
                return self._convert_to_structured_markdown_advanced(text_blocks, processed_tables, img.size)

        except Exception as e:
            logger.error(f"Advanced layout-aware OCR failed: {e}")
            return ""
397
+
398
+ def _process_tables_with_structure_model(self, img: Image.Image, table_blocks: List[Dict]) -> List[Dict]:
399
+ """Process tables using the table structure model."""
400
+ processed_tables = []
401
+
402
+ for table_block in table_blocks:
403
+ try:
404
+ # Extract table region
405
+ bbox = table_block['bbox']
406
+ x1, y1, x2, y2 = bbox
407
+ table_region = img.crop((x1, y1, x2, y2))
408
+
409
+ # Convert to numpy array
410
+ table_np = np.array(table_region)
411
+
412
+ # Create page input in the format expected by docling table structure model
413
+ page_input = {
414
+ "width": table_np.shape[1],
415
+ "height": table_np.shape[0],
416
+ "image": table_np,
417
+ "tokens": [] # Empty tokens since we're not using cell matching
418
+ }
419
+
420
+ # The bbox coordinates should be relative to the table region
421
+ table_bbox = [0, 0, x2-x1, y2-y1]
422
+
423
+ # Predict table structure
424
+ tf_output = self.table_predictor.multi_table_predict(page_input, [table_bbox], do_matching=False)
425
+ table_out = tf_output[0] if isinstance(tf_output, list) else tf_output
426
+
427
+ # Extract table data
428
+ table_data = []
429
+ tf_responses = table_out.get("tf_responses", []) if isinstance(table_out, dict) else []
430
+
431
+ for element in tf_responses:
432
+ if isinstance(element, dict) and "bbox" in element:
433
+ cell_bbox = element["bbox"]
434
+ # Handle bbox as dict with keys l, t, r, b
435
+ if isinstance(cell_bbox, dict) and all(k in cell_bbox for k in ["l", "t", "r", "b"]):
436
+ cell_x1 = cell_bbox["l"]
437
+ cell_y1 = cell_bbox["t"]
438
+ cell_x2 = cell_bbox["r"]
439
+ cell_y2 = cell_bbox["b"]
440
+ cell_region = table_region.crop((cell_x1, cell_y1, cell_x2, cell_y2))
441
+ cell_np = np.array(cell_region)
442
+ cell_text = self._extract_text_from_region_numpy(cell_np)
443
+ table_data.append(cell_text)
444
+ elif isinstance(cell_bbox, list) and len(cell_bbox) == 4:
445
+ cell_x1, cell_y1, cell_x2, cell_y2 = cell_bbox
446
+ cell_region = table_region.crop((cell_x1, cell_y1, cell_x2, cell_y2))
447
+ cell_np = np.array(cell_region)
448
+ cell_text = self._extract_text_from_region_numpy(cell_np)
449
+ table_data.append(cell_text)
450
+ else:
451
+ pass
452
+ else:
453
+ pass
454
+
455
+ # Organize table data into rows and columns
456
+ processed_table = self._organize_table_data(table_data, table_out if isinstance(table_out, dict) else {})
457
+ # Preserve the original bbox from the table block
458
+ processed_table['bbox'] = table_block['bbox']
459
+ processed_tables.append(processed_table)
460
+
461
+ except Exception as e:
462
+ logger.error(f"Failed to process table: {e}")
463
+ # Fallback to simple table extraction
464
+ processed_tables.append({
465
+ 'type': 'simple_table',
466
+ 'text': table_block['text'],
467
+ 'bbox': table_block['bbox']
468
+ })
469
+
470
+ return processed_tables
471
+
472
+ def _extract_text_from_region_numpy(self, region_np: np.ndarray) -> str:
473
+ """Extract text from numpy array region."""
474
+ try:
475
+ results = self.ocr_reader.readtext(region_np)
476
+ texts = []
477
+ for (_, text, confidence) in results:
478
+ if confidence > 0.5:
479
+ texts.append(text)
480
+ return ' '.join(texts)
481
+ except Exception as e:
482
+ logger.error(f"Failed to extract text from numpy region: {e}")
483
+ return ""
484
+
485
+ def _organize_table_data(self, table_data: list, table_out: dict) -> dict:
486
+ """Organize table data into proper structure using row/col indices from tf_responses."""
487
+ try:
488
+ tf_responses = table_out.get("tf_responses", []) if isinstance(table_out, dict) else []
489
+ num_rows = table_out.get("predict_details", {}).get("num_rows", 0)
490
+ num_cols = table_out.get("predict_details", {}).get("num_cols", 0)
491
+
492
+ # Build empty grid
493
+ grid = [["" for _ in range(num_cols)] for _ in range(num_rows)]
494
+
495
+ # Place cell texts in the correct grid positions
496
+ for idx, element in enumerate(tf_responses):
497
+ row = element.get("start_row_offset_idx", 0)
498
+ col = element.get("start_col_offset_idx", 0)
499
+ # Use the extracted text if available, else fallback to element text
500
+ text = table_data[idx] if idx < len(table_data) else element.get("text", "")
501
+ grid[row][col] = text
502
+
503
+ return {
504
+ 'type': 'structured_table',
505
+ 'grid': grid,
506
+ 'num_rows': num_rows,
507
+ 'num_cols': num_cols
508
+ }
509
+ except Exception as e:
510
+ logger.error(f"Failed to organize table data: {e}")
511
+ return {
512
+ 'type': 'simple_table',
513
+ 'data': table_data
514
+ }
515
+
516
+ def _convert_table_to_markdown(self, table: dict) -> str:
517
+ """Convert structured table to markdown format."""
518
+ if table['type'] != 'structured_table':
519
+ return f"**Table:** {table.get('text', '')}"
520
+ grid = table['grid']
521
+ if not grid or not grid[0]:
522
+ return ""
523
+
524
+ # Find the first non-empty row to use as header
525
+ header_row = None
526
+ for row in grid:
527
+ if any(cell.strip() for cell in row):
528
+ header_row = row
529
+ break
530
+
531
+ if not header_row:
532
+ return ""
533
+
534
+ # Use the header row as is (preserve all columns)
535
+ header_cells = [cell.strip() if cell else "" for cell in header_row]
536
+
537
+ markdown_lines = []
538
+ markdown_lines.append("| " + " | ".join(header_cells) + " |")
539
+ markdown_lines.append("|" + "|".join(["---"] * len(header_cells)) + "|")
540
+
541
+ # Add data rows (skip the header row)
542
+ header_index = grid.index(header_row)
543
+ for row in grid[header_index + 1:]:
544
+ cells = [cell.strip() if cell else "" for cell in row]
545
+ markdown_lines.append("| " + " | ".join(cells) + " |")
546
+
547
+ return '\n'.join(markdown_lines)
548
+
549
+ def _convert_to_structured_markdown_advanced(self, text_blocks: List, processed_tables: List[Dict], img_size: Tuple[int, int]) -> str:
550
+ """Convert text blocks and tables to structured markdown."""
551
+ markdown_parts = []
552
+
553
+ # Sort all elements by position
554
+ all_elements = []
555
+
556
+ # Add text blocks
557
+ for block in text_blocks:
558
+ all_elements.append({
559
+ 'type': 'text',
560
+ 'element': block,
561
+ 'y': block.y,
562
+ 'x': block.x
563
+ })
564
+
565
+ # Add tables
566
+ for table in processed_tables:
567
+ if 'bbox' in table:
568
+ all_elements.append({
569
+ 'type': 'table',
570
+ 'element': table,
571
+ 'y': table['bbox'][1],
572
+ 'x': table['bbox'][0]
573
+ })
574
+ else:
575
+ logger.warning(f"Table has no bbox, skipping: {table}")
576
+
577
+ # Sort by position
578
+ all_elements.sort(key=lambda x: (x['y'], x['x']))
579
+
580
+ # Convert to markdown
581
+ for element in all_elements:
582
+ if element['type'] == 'text':
583
+ block = element['element']
584
+ text = block.text.strip()
585
+ if not text:
586
+ continue
587
+
588
+ if block.element_type == 'heading':
589
+ # Determine heading level based on font size/position
590
+ level = self._determine_heading_level(block)
591
+ markdown_parts.append(f"{'#' * level} {text}")
592
+ markdown_parts.append("")
593
+ elif block.element_type == 'list_item':
594
+ markdown_parts.append(f"- {text}")
595
+ else:
596
+ markdown_parts.append(text)
597
+ markdown_parts.append("")
598
+
599
+ elif element['type'] == 'table':
600
+ table = element['element']
601
+ if table['type'] == 'structured_table':
602
+ # Convert structured table to markdown
603
+ table_md = self._convert_table_to_markdown(table)
604
+ markdown_parts.append(table_md)
605
+ markdown_parts.append("")
606
+ else:
607
+ # Simple table
608
+ markdown_parts.append(f"**Table:** {table.get('text', '')}")
609
+ markdown_parts.append("")
610
+
611
+ return '\n'.join(markdown_parts)
612
+
613
+ def _determine_heading_level(self, block) -> int:
614
+ """Determine heading level based on font size and position."""
615
+ # Simple heuristic: larger text or positioned at top = higher level
616
+ if block.y < 100: # Near top of page
617
+ return 1
618
+ elif block.height > 30: # Large text
619
+ return 2
620
+ else:
621
+ return 3
622
+
623
+ def _extract_text_from_region(self, img: Image.Image, bbox: List[float]) -> str:
624
+ """Extract text from a specific region of the image."""
625
+ try:
626
+ # Crop the region
627
+ x1, y1, x2, y2 = bbox
628
+ region = img.crop((x1, y1, x2, y2))
629
+
630
+ # Convert PIL image to numpy array for easyocr
631
+ region_np = np.array(region)
632
+
633
+ # Use OCR on the region
634
+ results = self.ocr_reader.readtext(region_np)
635
+ texts = []
636
+ for (_, text, confidence) in results:
637
+ if confidence > 0.5:
638
+ texts.append(text)
639
+
640
+ return ' '.join(texts)
641
+
642
+ except Exception as e:
643
+ logger.error(f"Failed to extract text from region: {e}")
644
+ return ""
docstrange/pipeline/ocr_service.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OCR Service abstraction for neural document processing."""
2
+
3
+ import os
4
+ import logging
5
+ from abc import ABC, abstractmethod
6
+ from typing import List, Dict, Any, Optional
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class OCRService(ABC):
    """Abstract interface implemented by every OCR backend."""

    @abstractmethod
    def extract_text(self, image_path: str) -> str:
        """Run plain OCR over the image at ``image_path``.

        Args:
            image_path: Path to the image file.

        Returns:
            Extracted text as a string.
        """
        pass

    @abstractmethod
    def extract_text_with_layout(self, image_path: str) -> str:
        """Run layout-aware OCR over the image at ``image_path``.

        Args:
            image_path: Path to the image file.

        Returns:
            Layout-aware extracted text as markdown.
        """
        pass
39
class NanonetsOCRService(OCRService):
    """Nanonets OCR implementation using NanonetsDocumentProcessor."""

    def __init__(self):
        """Initialize the service and its underlying document processor."""
        from .nanonets_processor import NanonetsDocumentProcessor
        self._processor = NanonetsDocumentProcessor()
        logger.info("NanonetsOCRService initialized")

    @property
    def model(self):
        """Get the Nanonets model."""
        return self._processor.model

    @property
    def processor(self):
        """Get the Nanonets processor."""
        return self._processor.processor

    @property
    def tokenizer(self):
        """Get the Nanonets tokenizer."""
        return self._processor.tokenizer

    def _validate_image(self, image_path: str) -> bool:
        """Return True if ``image_path`` exists and PIL can open it.

        Logs the specific failure and returns False otherwise.  Previously
        this validation was copy-pasted into both extract methods.
        """
        if not os.path.exists(image_path):
            logger.error(f"Image file does not exist: {image_path}")
            return False
        try:
            from PIL import Image
            with Image.open(image_path) as img:
                logger.info(f"Image loaded successfully: {img.size} {img.mode}")
            return True
        except Exception as e:
            logger.error(f"Failed to load image: {e}")
            return False

    def extract_text(self, image_path: str) -> str:
        """Extract text using Nanonets OCR.

        Returns "" (with the failure logged) on any validation or
        extraction error — callers never see an exception.
        """
        if not self._validate_image(image_path):
            return ""
        # The previous outer try/except was redundant: it caught the same
        # exceptions with the same log message as this handler.
        try:
            text = self._processor.extract_text(image_path)
            logger.info(f"Extracted text length: {len(text)}")
            return text.strip()
        except Exception as e:
            logger.error(f"Nanonets OCR extraction failed: {e}")
            return ""

    def extract_text_with_layout(self, image_path: str) -> str:
        """Extract text with layout awareness using Nanonets OCR.

        Returns layout-aware markdown, or "" (with the failure logged) on
        any validation or extraction error.
        """
        if not self._validate_image(image_path):
            return ""
        try:
            text = self._processor.extract_text_with_layout(image_path)
            logger.info(f"Layout-aware extracted text length: {len(text)}")
            return text.strip()
        except Exception as e:
            logger.error(f"Nanonets OCR layout-aware extraction failed: {e}")
            return ""
122
class NeuralOCRService(OCRService):
    """Neural OCR implementation using docling's pre-trained models."""

    def __init__(self):
        """Initialize the service and its underlying document processor."""
        from .neural_document_processor import NeuralDocumentProcessor
        self._processor = NeuralDocumentProcessor()
        logger.info("NeuralOCRService initialized")

    def _validate_image(self, image_path: str) -> bool:
        """Return True if ``image_path`` exists and PIL can open it.

        Logs the specific failure and returns False otherwise.  Previously
        this validation was copy-pasted into both extract methods.
        """
        if not os.path.exists(image_path):
            logger.error(f"Image file does not exist: {image_path}")
            return False
        try:
            from PIL import Image
            with Image.open(image_path) as img:
                logger.info(f"Image loaded successfully: {img.size} {img.mode}")
            return True
        except Exception as e:
            logger.error(f"Failed to load image: {e}")
            return False

    def extract_text(self, image_path: str) -> str:
        """Extract text using Neural OCR (docling models).

        Returns "" (with the failure logged) on any validation or
        extraction error — callers never see an exception.
        """
        if not self._validate_image(image_path):
            return ""
        # The previous outer try/except was redundant: it caught the same
        # exceptions with the same log message as this handler.
        try:
            text = self._processor.extract_text(image_path)
            logger.info(f"Extracted text length: {len(text)}")
            return text.strip()
        except Exception as e:
            logger.error(f"Neural OCR extraction failed: {e}")
            return ""

    def extract_text_with_layout(self, image_path: str) -> str:
        """Extract text with layout awareness using Neural OCR.

        Returns layout-aware markdown, or "" (with the failure logged) on
        any validation or extraction error.
        """
        if not self._validate_image(image_path):
            return ""
        try:
            text = self._processor.extract_text_with_layout(image_path)
            logger.info(f"Layout-aware extracted text length: {len(text)}")
            return text.strip()
        except Exception as e:
            logger.error(f"Neural OCR layout-aware extraction failed: {e}")
            return ""
190
class OCRServiceFactory:
    """Factory for creating OCR services based on configuration."""

    @staticmethod
    def create_service(provider: str = None) -> OCRService:
        """Create an OCR service for ``provider``.

        Args:
            provider: OCR provider name; when None, falls back to
                ``InternalConfig.ocr_provider`` (default 'nanonets').

        Returns:
            OCRService instance.

        Raises:
            ValueError: If the provider name is not recognized.
        """
        from docstrange.config import InternalConfig

        if provider is None:
            provider = getattr(InternalConfig, 'ocr_provider', 'nanonets')

        name = provider.lower()
        if name == 'nanonets':
            return NanonetsOCRService()
        if name == 'neural':
            return NeuralOCRService()
        raise ValueError(f"Unsupported OCR provider: {provider}")

    @staticmethod
    def get_available_providers() -> List[str]:
        """Return the provider names accepted by ``create_service``."""
        return ['nanonets', 'neural']
docstrange/processors/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Processors for different file types."""

# Concrete processors, one per supported input format.
from .pdf_processor import PDFProcessor
from .docx_processor import DOCXProcessor
from .txt_processor import TXTProcessor
from .excel_processor import ExcelProcessor
from .url_processor import URLProcessor
from .html_processor import HTMLProcessor
from .pptx_processor import PPTXProcessor
from .image_processor import ImageProcessor
# Remote / accelerated processors together with their result wrappers.
from .cloud_processor import CloudProcessor, CloudConversionResult
from .gpu_processor import GPUProcessor, GPUConversionResult

# Public API of the processors package.
__all__ = [
    "PDFProcessor",
    "DOCXProcessor",
    "TXTProcessor",
    "ExcelProcessor",
    "URLProcessor",
    "HTMLProcessor",
    "PPTXProcessor",
    "ImageProcessor",
    "CloudProcessor",
    "CloudConversionResult",
    "GPUProcessor",
    "GPUConversionResult"
]
docstrange/processors/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (859 Bytes). View file
 
docstrange/processors/__pycache__/base.cpython-310.pyc ADDED
Binary file (3.01 kB). View file
 
docstrange/processors/__pycache__/cloud_processor.cpython-310.pyc ADDED
Binary file (11.4 kB). View file
 
docstrange/processors/__pycache__/docx_processor.cpython-310.pyc ADDED
Binary file (5.52 kB). View file
 
docstrange/processors/__pycache__/excel_processor.cpython-310.pyc ADDED
Binary file (5.49 kB). View file
 
docstrange/processors/__pycache__/gpu_processor.cpython-310.pyc ADDED
Binary file (14 kB). View file
 
docstrange/processors/__pycache__/html_processor.cpython-310.pyc ADDED
Binary file (2.36 kB). View file
 
docstrange/processors/__pycache__/image_processor.cpython-310.pyc ADDED
Binary file (3.84 kB). View file
 
docstrange/processors/__pycache__/pdf_processor.cpython-310.pyc ADDED
Binary file (4.54 kB). View file
 
docstrange/processors/__pycache__/pptx_processor.cpython-310.pyc ADDED
Binary file (4.22 kB). View file
 
docstrange/processors/__pycache__/txt_processor.cpython-310.pyc ADDED
Binary file (2.92 kB). View file
 
docstrange/processors/__pycache__/url_processor.cpython-310.pyc ADDED
Binary file (8.74 kB). View file
 
docstrange/processors/base.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base processor class for document conversion."""
2
+
3
+ import os
4
+ import logging
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any, Dict, Optional
7
+
8
+ from ..result import ConversionResult
9
+ from docstrange.config import InternalConfig
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class BaseProcessor(ABC):
    """Base class for all document processors."""

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: Optional[bool] = None):
        """Initialize the processor.

        Args:
            preserve_layout: Whether to preserve document layout
            include_images: Whether to include images in output
            ocr_enabled: Whether to enable OCR for image processing
            use_markdownify: Whether to use markdownify for HTML->Markdown
                conversion.  Defaults to ``InternalConfig.use_markdownify``,
                resolved at call time.
        """
        self.preserve_layout = preserve_layout
        self.include_images = include_images
        self.ocr_enabled = ocr_enabled
        # The previous signature default (``= InternalConfig.use_markdownify``)
        # was evaluated once at import time, so later changes to the config
        # were silently ignored.  Resolve the default lazily instead.
        if use_markdownify is None:
            use_markdownify = InternalConfig.use_markdownify
        self.use_markdownify = use_markdownify

    @abstractmethod
    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        pass

    @abstractmethod
    def process(self, file_path: str) -> "ConversionResult":
        """Process the file and return a conversion result.

        Args:
            file_path: Path to the file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If processing fails
        """
        pass

    def get_metadata(self, file_path: str) -> Dict[str, Any]:
        """Get metadata about the file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary containing file metadata.  On any stat failure the
            file-specific keys are omitted and only processor settings are
            returned (the failure is logged, not raised).
        """
        try:
            file_stat = os.stat(file_path)
            # Ensure file_path is a string for splitext (it may be a Path).
            file_path_str = str(file_path)
            return {
                "file_size": file_stat.st_size,
                "file_extension": os.path.splitext(file_path_str)[1].lower(),
                "file_name": os.path.basename(file_path_str),
                "processor": self.__class__.__name__,
                "preserve_layout": self.preserve_layout,
                "include_images": self.include_images,
                "ocr_enabled": self.ocr_enabled
            }
        except Exception as e:
            logger.warning(f"Failed to get metadata for {file_path}: {e}")
            return {
                "processor": self.__class__.__name__,
                "preserve_layout": self.preserve_layout,
                "include_images": self.include_images,
                "ocr_enabled": self.ocr_enabled
            }
docstrange/processors/cloud_processor.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Cloud processor for Nanonets API integration with API key pool rotation and local fallback."""
2
+
3
+ import os
4
+ import requests
5
+ import json
6
+ import logging
7
+ import time
8
+ from typing import Dict, Any, Optional, List
9
+
10
+ from .base import BaseProcessor
11
+ from ..result import ConversionResult
12
+ from ..exceptions import ConversionError
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Default reset time for rate-limited keys (1 hour)
17
+ DEFAULT_RATE_LIMIT_RESET = 3600
18
+
19
+
20
+ class CloudConversionResult(ConversionResult):
21
+ """Enhanced ConversionResult for cloud mode with lazy API calls, key rotation, and local fallback."""
22
+
23
+ def __init__(self, file_path: str, cloud_processor: 'CloudProcessor', metadata: Optional[Dict[str, Any]] = None,
24
+ api_key_pool=None, local_fallback_processor=None):
25
+ # Initialize with empty content - we'll make API calls on demand
26
+ super().__init__("", metadata)
27
+ self.file_path = file_path
28
+ self.cloud_processor = cloud_processor
29
+ self.api_key_pool = api_key_pool
30
+ self.local_fallback_processor = local_fallback_processor # GPU processor or None
31
+ self._cached_outputs = {} # Cache API responses by output type
32
+ self._used_fallback = False # Track if we fell back to local processing
33
+
34
+ def _get_cloud_output(self, output_type: str, specified_fields: Optional[list] = None, json_schema: Optional[dict] = None) -> str:
35
+ """Get output from cloud API for specific type, with caching, key rotation, and local fallback."""
36
+ # Validate output type
37
+ valid_output_types = ["markdown", "flat-json", "html", "csv", "specified-fields", "specified-json"]
38
+ if output_type not in valid_output_types:
39
+ logger.warning(f"Invalid output type '{output_type}' for cloud API. Using 'markdown'.")
40
+ output_type = "markdown"
41
+
42
+ # Create cache key based on output type and parameters
43
+ cache_key = output_type
44
+ if specified_fields:
45
+ cache_key += f"_fields_{','.join(specified_fields)}"
46
+ if json_schema:
47
+ cache_key += f"_schema_{hash(str(json_schema))}"
48
+
49
+ if cache_key in self._cached_outputs:
50
+ return self._cached_outputs[cache_key]
51
+
52
+ # If we already fell back to local, skip cloud
53
+ if self._used_fallback:
54
+ return self._convert_locally(output_type)
55
+
56
+ # Try cloud API with key rotation
57
+ last_error = None
58
+ keys_tried = set()
59
+
60
+ while True:
61
+ # Get next available key from pool
62
+ current_key = None
63
+ if self.api_key_pool:
64
+ current_key = self.api_key_pool.get_next_key()
65
+
66
+ # Also try the processor's own key if set
67
+ if not current_key and self.cloud_processor.api_key:
68
+ current_key = self.cloud_processor.api_key
69
+
70
+ if not current_key:
71
+ logger.info("No API keys available, falling back to local processing")
72
+ return self._convert_locally(output_type)
73
+
74
+ # Don't try the same key twice in one cycle
75
+ if current_key in keys_tried:
76
+ logger.info("All API keys rate limited, falling back to local processing")
77
+ return self._convert_locally(output_type)
78
+
79
+ keys_tried.add(current_key)
80
+
81
+ try:
82
+ # Prepare headers
83
+ headers = {}
84
+ if current_key:
85
+ headers['Authorization'] = f'Bearer {current_key}'
86
+
87
+ # Prepare file for upload
88
+ with open(self.file_path, 'rb') as file:
89
+ files = {
90
+ 'file': (os.path.basename(self.file_path), file, self.cloud_processor._get_content_type(self.file_path))
91
+ }
92
+
93
+ data = {
94
+ 'output_type': output_type
95
+ }
96
+
97
+ # Add model_type if specified
98
+ if self.cloud_processor.model_type:
99
+ data['model_type'] = self.cloud_processor.model_type
100
+
101
+ # Add field extraction parameters
102
+ if output_type == "specified-fields" and specified_fields:
103
+ data['specified_fields'] = ','.join(specified_fields)
104
+ elif output_type == "specified-json" and json_schema:
105
+ data['json_schema'] = json.dumps(json_schema)
106
+
107
+ log_prefix = f"API key {current_key[:8]}..." if current_key else "no auth"
108
+ logger.info(f"Making cloud API call ({log_prefix}) for {output_type} on {self.file_path}")
109
+
110
+ # Make API request
111
+ response = requests.post(
112
+ self.cloud_processor.api_url,
113
+ headers=headers,
114
+ files=files,
115
+ data=data,
116
+ timeout=300
117
+ )
118
+
119
+ # Handle rate limiting (429) - mark key as limited and try next
120
+ if response.status_code == 429:
121
+ # Mark this key as rate limited in the pool
122
+ if self.api_key_pool:
123
+ self.api_key_pool.mark_key_rate_limited(current_key, DEFAULT_RATE_LIMIT_RESET)
124
+
125
+ # Also mark the processor's key if it matches
126
+ if self.cloud_processor.api_key == current_key:
127
+ logger.warning(f"Processor API key rate limited, will try pool keys")
128
+
129
+ logger.warning(f"API key {current_key[:8]}... rate limited, trying next key...")
130
+ last_error = f"Rate limited (429)"
131
+ continue
132
+
133
+ response.raise_for_status()
134
+ result_data = response.json()
135
+
136
+ # Extract content from response
137
+ content = self.cloud_processor._extract_content_from_response(result_data)
138
+
139
+ # Cache the result
140
+ self._cached_outputs[cache_key] = content
141
+ return content
142
+
143
+ except requests.exceptions.HTTPError as e:
144
+ if '429' in str(e):
145
+ if self.api_key_pool:
146
+ self.api_key_pool.mark_key_rate_limited(current_key, DEFAULT_RATE_LIMIT_RESET)
147
+ logger.warning(f"API key {current_key[:8]}... rate limited (HTTPError), trying next key...")
148
+ last_error = str(e)
149
+ continue
150
+ else:
151
+ logger.error(f"Cloud API HTTP error: {e}")
152
+ last_error = str(e)
153
+ break
154
+ except Exception as e:
155
+ logger.error(f"Cloud API call failed: {e}")
156
+ last_error = str(e)
157
+ break
158
+
159
+ # All keys exhausted, fall back to local processing
160
+ logger.warning(f"All API keys rate limited or failed. Falling back to local Docling processing.")
161
+ self._used_fallback = True
162
+ return self._convert_locally(output_type)
163
+
164
+ def _convert_locally(self, output_type: str) -> str:
165
+ """Fallback to local Docling/GPU conversion methods."""
166
+ self._used_fallback = True
167
+
168
+ # Try the local fallback processor (GPU processor with Docling models)
169
+ if self.local_fallback_processor:
170
+ try:
171
+ logger.info(f"Using local Docling processor for fallback on {self.file_path}")
172
+ local_result = self.local_fallback_processor.process(self.file_path)
173
+
174
+ if output_type == "html":
175
+ return local_result.extract_html()
176
+ elif output_type == "flat-json":
177
+ return json.dumps(local_result.extract_data(), indent=2)
178
+ elif output_type == "csv":
179
+ return local_result.extract_csv(include_all_tables=True)
180
+ else:
181
+ return local_result.extract_markdown()
182
+ except Exception as e:
183
+ logger.error(f"Local Docling fallback also failed: {e}")
184
+
185
+ # Last resort: use parent class methods
186
+ if output_type == "html":
187
+ return super().extract_html()
188
+ elif output_type == "flat-json":
189
+ return json.dumps(super().extract_data(), indent=2)
190
+ elif output_type == "csv":
191
+ return super().extract_csv(include_all_tables=True)
192
+ else:
193
+ return self.content
194
+
195
+ def extract_markdown(self) -> str:
196
+ """Export as markdown."""
197
+ return self._get_cloud_output("markdown")
198
+
199
+ def extract_html(self) -> str:
200
+ """Export as HTML."""
201
+ return self._get_cloud_output("html")
202
+
203
+ def extract_data(self, specified_fields: Optional[list] = None, json_schema: Optional[dict] = None) -> Dict[str, Any]:
204
+ """Export as structured JSON with optional field extraction.
205
+
206
+ Args:
207
+ specified_fields: Optional list of specific fields to extract
208
+ json_schema: Optional JSON schema defining fields and types to extract
209
+
210
+ Returns:
211
+ Structured JSON with extracted data
212
+ """
213
+ try:
214
+ if specified_fields:
215
+ # Request specified fields extraction
216
+ content = self._get_cloud_output("specified-fields", specified_fields=specified_fields)
217
+ extracted_data = json.loads(content)
218
+ return {
219
+ "extracted_fields": extracted_data,
220
+ "format": "specified_fields"
221
+ }
222
+
223
+ elif json_schema:
224
+ # Request JSON schema extraction
225
+ content = self._get_cloud_output("specified-json", json_schema=json_schema)
226
+ extracted_data = json.loads(content)
227
+ return {
228
+ "structured_data": extracted_data,
229
+ "format": "structured_json"
230
+ }
231
+
232
+ else:
233
+ # Standard JSON extraction
234
+ json_content = self._get_cloud_output("flat-json")
235
+ parsed_content = json.loads(json_content)
236
+ return {
237
+ "document": parsed_content,
238
+ "format": "cloud_flat_json"
239
+ }
240
+
241
+ except Exception as e:
242
+ logger.error(f"Failed to parse JSON content: {e}")
243
+ return {
244
+ "document": {"raw_content": content if 'content' in locals() else ""},
245
+ "format": "json_parse_error",
246
+ "error": str(e)
247
+ }
248
+
249
+
250
+
251
+ def extract_text(self) -> str:
252
+ """Export as plain text."""
253
+ # For text output, we can try markdown first and then extract to text
254
+ try:
255
+ return self._get_cloud_output("markdown")
256
+ except Exception as e:
257
+ logger.error(f"Failed to get text output: {e}")
258
+ return ""
259
+
260
    def extract_csv(self, table_index: int = 0, include_all_tables: bool = False) -> str:
        """Export tables as CSV format via the cloud "csv" endpoint.

        NOTE(review): the cloud endpoint returns its own CSV rendering of the
        document; `table_index` and `include_all_tables` are accepted for
        interface compatibility with the base class but are NOT applied
        here — confirm whether per-table selection should happen client-side.

        Args:
            table_index: Which table to export (0-based index). Currently
                ignored by the cloud path. Default is 0 (first table).
            include_all_tables: If True, export all tables with separators.
                Currently ignored by the cloud path. Default is False.

        Returns:
            CSV formatted string as produced by the cloud service
        """
        return self._get_cloud_output("csv")
274
+
275
+
276
class CloudProcessor(BaseProcessor):
    """Processor for cloud-based document conversion using Nanonets API with API key pool rotation."""

    def __init__(self, api_key: Optional[str] = None, output_type: Optional[str] = None, model_type: Optional[str] = None,
                 specified_fields: Optional[list] = None, json_schema: Optional[dict] = None,
                 api_key_pool=None, local_fallback_processor=None, **kwargs):
        """Initialize the cloud processor.

        Args:
            api_key: API key for cloud processing (optional - uses rate-limited free tier without key)
            output_type: Output type for cloud processing (markdown, flat-json, html, csv, specified-fields, specified-json)
            model_type: Model type for cloud processing (gemini, openapi, nanonets)
            specified_fields: List of fields to extract (for specified-fields output type)
            json_schema: JSON schema defining fields and types to extract (for specified-json output type)
            api_key_pool: ApiKeyPool instance for key rotation
            local_fallback_processor: Local processor (GPU/Docling) for fallback when all keys exhausted
        """
        super().__init__(**kwargs)
        self.api_key = api_key
        self.output_type = output_type
        self.model_type = model_type
        self.specified_fields = specified_fields
        self.json_schema = json_schema
        self.api_key_pool = api_key_pool
        self.local_fallback_processor = local_fallback_processor
        self.api_url = "https://extraction-api.nanonets.com/extract"

        # Don't validate output_type during initialization - it will be validated during processing
        # This prevents warnings during DocumentExtractor initialization

    def can_process(self, file_path: str) -> bool:
        """Check if the processor can handle the file.

        API key is optional - without it, the rate-limited free tier is used.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the file extension is one the cloud API accepts
        """
        # Cloud processor supports most common document formats
        supported_extensions = {
            '.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt',
            '.txt', '.html', '.htm', '.png', '.jpg', '.jpeg', '.gif',
            '.bmp', '.tiff', '.tif'
        }

        # Coerce to str first so Path objects work, consistent with the
        # other processors (DOCXProcessor, ExcelProcessor, GPUProcessor).
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in supported_extensions

    def process(self, file_path: str) -> CloudConversionResult:
        """Create a lazy CloudConversionResult that will make API calls on demand with key rotation.

        No network traffic happens here; the returned result object performs
        the actual API calls when one of its output methods is invoked.

        Args:
            file_path: Path to the file to process

        Returns:
            CloudConversionResult that makes API calls when output methods are called

        Raises:
            ConversionError: If file doesn't exist
        """
        if not os.path.exists(file_path):
            raise ConversionError(f"File not found: {file_path}")

        # Create metadata without making any API calls
        metadata = {
            'source_file': file_path,
            'processing_mode': 'cloud',
            'api_provider': 'nanonets',
            'file_size': os.path.getsize(file_path),
            'model_type': self.model_type,
            'has_api_key': bool(self.api_key),
            'key_rotation': True,
            'local_fallback': self.local_fallback_processor is not None
        }

        if self.api_key:
            logger.info(f"Created cloud extractor for {file_path} with API key pool rotation")
        else:
            logger.info(f"Created cloud extractor for {file_path} without API key - will use pool + local fallback")

        # Return lazy result with key pool and local fallback
        return CloudConversionResult(
            file_path=file_path,
            cloud_processor=self,
            metadata=metadata,
            api_key_pool=self.api_key_pool,
            local_fallback_processor=self.local_fallback_processor
        )

    def _extract_content_from_response(self, response_data: Dict[str, Any]) -> str:
        """Extract content from API response.

        Args:
            response_data: Parsed JSON body returned by the extraction API

        Returns:
            The 'content' field of the response, or the full response
            serialized as JSON when that field is missing or extraction fails
        """
        try:
            # API always returns content in the 'content' field
            if 'content' in response_data:
                return response_data['content']

            # Fallback: return whole response as JSON if no content field
            logger.warning("No 'content' field found in API response, returning full response")
            return json.dumps(response_data, indent=2)

        except Exception as e:
            logger.error(f"Failed to extract content from API response: {e}")
            return json.dumps(response_data, indent=2)

    def _get_content_type(self, file_path: str) -> str:
        """Get the MIME content type to use when uploading the file.

        Args:
            file_path: Path whose extension selects the MIME type

        Returns:
            A MIME type string; 'application/octet-stream' for unknown extensions
        """
        _, ext = os.path.splitext(str(file_path).lower())

        content_types = {
            '.pdf': 'application/pdf',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.doc': 'application/msword',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.xls': 'application/vnd.ms-excel',
            '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            '.ppt': 'application/vnd.ms-powerpoint',
            '.txt': 'text/plain',
            '.html': 'text/html',
            '.htm': 'text/html',
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.bmp': 'image/bmp',
            '.tiff': 'image/tiff',
            '.tif': 'image/tiff'
        }

        return content_types.get(ext, 'application/octet-stream')
docstrange/processors/docx_processor.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DOCX file processor."""
2
+
3
+ import os
4
+ from typing import Dict, Any
5
+
6
+ from .base import BaseProcessor
7
+ from ..result import ConversionResult
8
+ from ..exceptions import ConversionError, FileNotFoundError
9
+
10
+
11
class DOCXProcessor(BaseProcessor):
    """Processor for Microsoft Word DOCX and DOC files."""

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in ['.docx', '.doc']

    def process(self, file_path: str) -> ConversionResult:
        """Process the DOCX file and return a conversion result.

        Args:
            file_path: Path to the DOCX file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Initialize metadata
        metadata = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "processor": "DOCXProcessor"
        }

        # Legacy .doc goes through pandoc; .docx through python-docx
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())

        if ext == '.doc':
            return self._process_doc_file(file_path, metadata)
        else:
            return self._process_docx_file(file_path, metadata)

    def _process_doc_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .doc files using pypandoc.

        Args:
            file_path: Path to the .doc file
            metadata: Metadata dict to enrich and attach to the result

        Returns:
            ConversionResult with markdown produced by pandoc

        Raises:
            ConversionError: If pypandoc is missing or conversion fails
        """
        try:
            import pypandoc

            # Convert .doc to markdown using pandoc
            content = pypandoc.convert_file(file_path, 'markdown')

            metadata.update({
                "file_type": "doc",
                "extractor": "pypandoc"
            })

            # Clean up the content
            content = self._clean_content(content)

            return ConversionResult(content, metadata)

        except ImportError:
            raise ConversionError("pypandoc is required for .doc file processing. Install it with: pip install pypandoc")
        except Exception as e:
            raise ConversionError(f"Failed to process .doc file {file_path}: {str(e)}")

    def _process_docx_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .docx files using python-docx with improved table extraction.

        Paragraphs are emitted first (headings mapped to markdown '#'
        levels), then each table is rendered as a markdown table with the
        first row as header. Tables with merged cells get a warning note and
        large tables are truncated to 20 rows.

        Args:
            file_path: Path to the .docx file
            metadata: Metadata dict to enrich and attach to the result

        Returns:
            ConversionResult with the extracted markdown content

        Raises:
            ConversionError: If python-docx is missing or extraction fails
        """
        try:
            from docx import Document

            content_parts = []
            doc = Document(file_path)

            metadata.update({
                "paragraph_count": len(doc.paragraphs),
                "section_count": len(doc.sections),
                "file_type": "docx",
                "extractor": "python-docx"
            })

            # Extract text from paragraphs
            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    # Check if this is a heading
                    if paragraph.style.name.startswith('Heading'):
                        level = paragraph.style.name.replace('Heading ', '')
                        try:
                            level_num = int(level)
                            content_parts.append(f"\n{'#' * min(level_num, 6)} {paragraph.text}\n")
                        except ValueError:
                            # Style names like "Heading Char" have no numeric level
                            content_parts.append(f"\n## {paragraph.text}\n")
                    else:
                        content_parts.append(paragraph.text)

            # Extract text from tables (improved)
            for table_idx, table in enumerate(doc.tables):
                # Check if preserve_layout is available (from base class or config)
                preserve_layout = getattr(self, 'preserve_layout', False)
                if preserve_layout:
                    content_parts.append(f"\n### Table {table_idx+1}\n")

                # Gather all rows
                rows = table.rows
                if not rows:
                    continue

                # Detect merged cells (optional warning); vMerge/gridSpan in
                # the underlying XML mark vertical/horizontal merges
                merged_warning = False
                for row in rows:
                    for cell in row.cells:
                        if len(cell._tc.xpath('.//w:vMerge')) > 0 or len(cell._tc.xpath('.//w:gridSpan')) > 0:
                            merged_warning = True
                            break
                    if merged_warning:
                        break
                if merged_warning:
                    content_parts.append("*Warning: Table contains merged cells which may not render correctly in markdown.*\n")

                # Row limit for large tables
                row_limit = 20
                if len(rows) > row_limit:
                    content_parts.append(f"*Table truncated to first {row_limit} rows out of {len(rows)} total.*\n")

                # Build table data
                table_data = []
                for i, row in enumerate(rows):
                    if i >= row_limit:
                        break
                    row_data = [cell.text.strip().replace('\n', ' ') for cell in row.cells]
                    table_data.append(row_data)

                # Ensure all rows have the same number of columns
                max_cols = max(len(r) for r in table_data)
                for r in table_data:
                    while len(r) < max_cols:
                        r.append("")

                # Markdown table: first row as header
                if table_data:
                    header = table_data[0]
                    separator = ["---"] * len(header)
                    content_parts.append("| " + " | ".join(header) + " |")
                    content_parts.append("| " + " | ".join(separator) + " |")
                    for row in table_data[1:]:
                        content_parts.append("| " + " | ".join(row) + " |")
                    content_parts.append("")

            content = '\n'.join(content_parts)
            content = self._clean_content(content)
            return ConversionResult(content, metadata)
        except ImportError:
            raise ConversionError("python-docx is required for .docx file processing. Install it with: pip install python-docx")
        except Exception as e:
            raise ConversionError(f"Failed to process .docx file {file_path}: {str(e)}")

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted Word content.

        Collapses runs of whitespace, drops empty lines, and re-inserts a
        blank line before each markdown heading.

        BUGFIX: the previous implementation chained
        ``content.replace('## ', '\\n## ')`` followed by
        ``content.replace('### ', '\\n### ')``; since '### ' contains '## '
        at offset 1, level-3 headings were mangled into ``'#\\n## '``.
        Spacing is now applied per-line, only to lines that start a heading.

        Args:
            content: Raw Word text content

        Returns:
            Cleaned text content
        """
        cleaned_lines = []

        for line in content.split('\n'):
            # Remove excessive whitespace
            line = ' '.join(line.split())
            if line:
                # Add spacing before headings (any level) without corrupting
                # '#' characters that appear inside the heading marker itself
                if line.startswith('#') and cleaned_lines:
                    cleaned_lines.append('')
                cleaned_lines.append(line)

        return '\n'.join(cleaned_lines).strip()
docstrange/processors/excel_processor.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Excel file processor."""
2
+
3
+ import os
4
+ import logging
5
+ from typing import Dict, Any
6
+
7
+ from .base import BaseProcessor
8
+ from ..result import ConversionResult
9
+ from ..exceptions import ConversionError, FileNotFoundError
10
+
11
+ # Configure logging
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class ExcelProcessor(BaseProcessor):
    """Processor for Excel files (XLSX, XLS) and CSV files."""

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if this processor can handle the file
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in ['.xlsx', '.xls', '.csv']

    def process(self, file_path: str) -> ConversionResult:
        """Process the Excel file and return a conversion result.

        Args:
            file_path: Path to the Excel file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Route on extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())

        if ext == '.csv':
            return self._process_csv(file_path)
        else:
            return self._process_excel(file_path)

    def _process_csv(self, file_path: str) -> ConversionResult:
        """Process a CSV file and return a conversion result.

        Args:
            file_path: Path to the CSV file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If pandas is missing or parsing fails
        """
        try:
            import pandas as pd

            df = pd.read_csv(file_path)
            content_parts = []

            content_parts.append(f"# CSV Data: {os.path.basename(file_path)}")
            content_parts.append("")

            # Convert DataFrame to markdown table
            table_md = self._dataframe_to_markdown(df, pd)
            content_parts.append(table_md)

            metadata = {
                "row_count": len(df),
                "column_count": len(df.columns),
                "columns": df.columns.tolist(),
                "extractor": "pandas"
            }

            content = '\n'.join(content_parts)

            return ConversionResult(content, metadata)

        except ImportError:
            raise ConversionError("pandas is required for CSV processing. Install it with: pip install pandas")
        except Exception as e:
            # Re-raise our own exception types unwrapped, consistent with
            # _process_excel, so callers can catch the documented types
            if isinstance(e, (FileNotFoundError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process CSV file {file_path}: {str(e)}")

    def _process_excel(self, file_path: str) -> ConversionResult:
        """Process an Excel file and return a conversion result.

        Every non-empty sheet is rendered as its own markdown section with
        per-sheet row/column counts recorded in the metadata.

        Args:
            file_path: Path to the Excel file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            ConversionError: If pandas/openpyxl are missing or parsing fails
        """
        try:
            import pandas as pd

            excel_file = pd.ExcelFile(file_path)
            sheet_names = excel_file.sheet_names

            metadata = {
                "sheet_count": len(sheet_names),
                "sheet_names": sheet_names,
                "extractor": "pandas"
            }

            content_parts = []

            for sheet_name in sheet_names:
                df = pd.read_excel(file_path, sheet_name=sheet_name)
                if not df.empty:
                    content_parts.append(f"\n## Sheet: {sheet_name}")
                    content_parts.append("")

                    # Convert DataFrame to markdown table
                    table_md = self._dataframe_to_markdown(df, pd)
                    content_parts.append(table_md)
                    content_parts.append("")

                    # Add metadata for this sheet
                    metadata.update({
                        f"sheet_{sheet_name}_rows": len(df),
                        f"sheet_{sheet_name}_columns": len(df.columns),
                        f"sheet_{sheet_name}_columns_list": df.columns.tolist()
                    })

            content = '\n'.join(content_parts)

            return ConversionResult(content, metadata)

        except ImportError:
            raise ConversionError("pandas and openpyxl are required for Excel processing. Install them with: pip install pandas openpyxl")
        except Exception as e:
            if isinstance(e, (FileNotFoundError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process Excel file {file_path}: {str(e)}")

    def _dataframe_to_markdown(self, df, pd) -> str:
        """Convert pandas DataFrame to markdown table.

        Args:
            df: pandas DataFrame
            pd: pandas module reference

        Returns:
            Markdown table string ("*No data available*" for empty frames)
        """
        if df.empty:
            return "*No data available*"

        # Convert DataFrame to markdown table
        markdown_parts = []

        # Header
        markdown_parts.append("| " + " | ".join(str(col) for col in df.columns) + " |")
        markdown_parts.append("| " + " | ".join(["---"] * len(df.columns)) + " |")

        # Data rows; NaN cells become empty strings
        for _, row in df.iterrows():
            row_data = []
            for cell in row:
                if pd.isna(cell):
                    row_data.append("")
                else:
                    row_data.append(str(cell))
            markdown_parts.append("| " + " | ".join(row_data) + " |")

        return "\n".join(markdown_parts)

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted Excel content.

        Collapses runs of whitespace, drops empty lines, and re-inserts a
        blank line before each markdown heading.

        BUGFIX: the previous implementation chained
        ``content.replace('# ', '\\n# ')`` followed by
        ``content.replace('## ', '\\n## ')``; since '## ' contains '# ' at
        offset 1, level-2 headings were mangled into ``'#\\n# '``.
        Spacing is now applied per-line, only to heading lines.

        Args:
            content: Raw Excel text content

        Returns:
            Cleaned text content
        """
        cleaned_lines = []

        for line in content.split('\n'):
            # Remove excessive whitespace
            line = ' '.join(line.split())
            if line:
                # Add spacing before headings (any level)
                if line.startswith('#') and cleaned_lines:
                    cleaned_lines.append('')
                cleaned_lines.append(line)

        return '\n'.join(cleaned_lines).strip()
docstrange/processors/gpu_processor.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GPU processor with OCR capabilities for images and PDFs."""
2
+
3
+ import os
4
+ import json
5
+ import logging
6
+ import tempfile
7
+ import re
8
+ from typing import Dict, Any, List, Optional
9
+ from pathlib import Path
10
+
11
+ from .base import BaseProcessor
12
+ from ..result import ConversionResult
13
+ from ..exceptions import ConversionError, FileNotFoundError
14
+ from ..pipeline.ocr_service import OCRServiceFactory
15
+
16
+ # Configure logging
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class GPUConversionResult(ConversionResult):
    """Enhanced ConversionResult for GPU processing with Nanonets OCR capabilities.

    Wraps locally-OCR'd content and, when a live GPUProcessor and source file
    are available, can re-run the Nanonets vision-language model to produce
    structured JSON on demand (see extract_data).
    """

    def __init__(self, content: str, metadata: Optional[Dict[str, Any]] = None,
                 gpu_processor: Optional['GPUProcessor'] = None, file_path: Optional[str] = None,
                 ocr_provider: str = "nanonets"):
        """Initialize the result.

        Args:
            content: Extracted text/markdown content.
            metadata: Optional metadata dict; GPU-specific defaults are filled in.
            gpu_processor: Processor that produced this result (enables model-based JSON).
            file_path: Source file path (needed to re-open the image for JSON extraction).
            ocr_provider: Name of the OCR provider used.
        """
        super().__init__(content, metadata)
        self.gpu_processor = gpu_processor
        self.file_path = file_path
        self.ocr_provider = ocr_provider

        # Add GPU-specific metadata
        if metadata is None:
            self.metadata = {}

        # Ensure GPU-specific metadata is present without clobbering caller-supplied values
        if 'processing_mode' not in self.metadata:
            self.metadata['processing_mode'] = 'gpu'
        if 'ocr_provider' not in self.metadata:
            self.metadata['ocr_provider'] = ocr_provider
        if 'gpu_processing' not in self.metadata:
            self.metadata['gpu_processing'] = True

    def get_ocr_info(self) -> Dict[str, Any]:
        """Get information about the OCR processing used.

        Returns:
            Dictionary with OCR processing information
        """
        return {
            'ocr_provider': self.ocr_provider,
            'processing_mode': 'gpu',
            'file_path': self.file_path,
            'gpu_processor_available': self.gpu_processor is not None
        }

    def extract_markdown(self) -> str:
        """Export as markdown without GPU processing metadata."""
        return self.content

    def extract_html(self) -> str:
        """Export as HTML with GPU processing styling.

        Returns the base HTML with a "GPU Processed" banner injected right
        after the opening <body> tag (unchanged when no <body> is found).
        """
        # Get the base HTML from parent class
        html_content = super().extract_html()

        # Add GPU processing indicator
        gpu_indicator = f"""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 8px; margin-bottom: 2rem; text-align: center;">
            <strong>🚀 GPU Processed</strong> - Enhanced with {self.ocr_provider} OCR
        </div>
        """

        # Insert the indicator after the opening body tag
        body_start = html_content.find('<body')
        if body_start != -1:
            body_end = html_content.find('>', body_start) + 1
            return html_content[:body_end] + gpu_indicator + html_content[body_end:]

        return html_content

    def extract_data(self) -> Dict[str, Any]:
        """Export as structured JSON using Nanonets model with specific prompt.

        Falls back to the base-class JSON conversion when the model path is
        unavailable (no processor, missing file) or raises.
        """
        logger.debug(f"GPUConversionResult.extract_data() called for {self.file_path}")

        try:
            # If we have a GPU processor and file path, use the model to extract JSON
            if self.gpu_processor and self.file_path and os.path.exists(self.file_path):
                logger.info("Using Nanonets model for JSON extraction")
                return self._extract_json_with_model()
            else:
                logger.info("Using fallback JSON conversion")
                # Fallback to base JSON conversion
                return self._convert_to_base_json()
        except Exception as e:
            logger.warning(f"Failed to extract JSON with model: {e}. Using fallback conversion.")
            return self._convert_to_base_json()

    def _extract_json_with_model(self) -> Dict[str, Any]:
        """Extract structured JSON using Nanonets model with specific prompt.

        Reuses the OCR service's loaded model/processor/tokenizer when
        available; otherwise loads "nanonets/Nanonets-OCR-s" directly.

        Raises:
            Exception: Propagates any model/IO failure so the caller can fall back.
        """
        try:
            from PIL import Image
            from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText

            # Get the model from the GPU processor's OCR service
            ocr_service = self.gpu_processor._get_ocr_service()

            # Access the model components from the OCR service
            if hasattr(ocr_service, 'processor') and hasattr(ocr_service, 'model') and hasattr(ocr_service, 'tokenizer'):
                model = ocr_service.model
                processor = ocr_service.processor
                tokenizer = ocr_service.tokenizer
            else:
                # Fallback: load model directly
                model_path = "nanonets/Nanonets-OCR-s"
                model = AutoModelForImageTextToText.from_pretrained(
                    model_path,
                    torch_dtype="auto",
                    device_map="auto"
                )
                model.eval()
                processor = AutoProcessor.from_pretrained(model_path)
                tokenizer = AutoTokenizer.from_pretrained(model_path)

            # Define the JSON extraction prompt
            prompt = """Extract all information from the above document and return it as a valid JSON object.

Instructions:
- The output should be a single JSON object.
- Keys should be meaningful field names.
- If multiple similar blocks (like invoice items or line items), return a list of JSON objects under a key.
- Use strings for all values.
- Wrap page numbers using: "page_number": "1"
- Wrap watermarks using: "watermark": "CONFIDENTIAL"
- Use ☐ and ☑ for checkboxes.

Example:
{
  "Name": "John Doe",
  "Invoice Number": "INV-4567",
  "Amount Due": "$123.45",
  "Items": [
    {"Description": "Widget A", "Price": "$20"},
    {"Description": "Widget B", "Price": "$30"}
  ],
  "page_number": "1",
  "watermark": "CONFIDENTIAL"
}"""

            # Load the image
            image = Image.open(self.file_path)

            # Prepare messages for the model
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": [
                    {"type": "image", "image": f"file://{self.file_path}"},
                    {"type": "text", "text": prompt},
                ]},
            ]

            # Apply chat template and process
            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
            inputs = inputs.to(model.device)

            # Generate JSON response (deterministic: do_sample=False)
            output_ids = model.generate(**inputs, max_new_tokens=15000, do_sample=False)
            # Strip the prompt tokens from each generated sequence
            generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]

            json_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
            logger.debug(f"Generated JSON text: {json_text[:200]}...")

            # Try to parse the JSON response with improved parsing
            def try_parse_json(text):
                try:
                    return json.loads(text)
                except json.JSONDecodeError:
                    # Try cleaning and reparsing
                    # NOTE(review): this regex/quote repair is best-effort and can
                    # corrupt values containing ':' or quotes — confirm acceptable
                    try:
                        text = re.sub(r"(\w+):", r'"\1":', text)  # wrap keys
                        text = text.replace("'", '"')  # replace single quotes
                        return json.loads(text)
                    except (json.JSONDecodeError, Exception):
                        return {"raw_text": text}

            # Parse the JSON
            extracted_data = try_parse_json(json_text)

            # Create the result structure
            result = {
                "document": extracted_data,
                "format": "gpu_structured_json",
                "gpu_processing_info": {
                    'ocr_provider': self.ocr_provider,
                    'processing_mode': 'gpu',
                    'file_path': self.file_path,
                    'gpu_processor_available': self.gpu_processor is not None,
                    'json_extraction_method': 'nanonets_model'
                }
            }

            return result

        except Exception as e:
            logger.error(f"Failed to extract JSON with model: {e}")
            raise

    def _convert_to_base_json(self) -> Dict[str, Any]:
        """Fallback to base JSON conversion method."""
        # Get the base JSON from parent class
        base_json = super().extract_data()

        # Add GPU-specific metadata
        base_json['gpu_processing_info'] = {
            'ocr_provider': self.ocr_provider,
            'processing_mode': 'gpu',
            'file_path': self.file_path,
            'gpu_processor_available': self.gpu_processor is not None,
            'json_extraction_method': 'fallback_conversion'
        }

        # Update the format to indicate GPU processing
        base_json['format'] = 'gpu_structured_json'

        return base_json

    def extract_text(self) -> str:
        """Export as plain text without GPU processing header."""
        return self.content

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics and information.

        Returns:
            Dictionary with processing statistics
        """
        stats = {
            'processing_mode': 'gpu',
            'ocr_provider': self.ocr_provider,
            'file_path': self.file_path,
            'content_length': len(self.content),
            'word_count': len(self.content.split()),
            'line_count': len(self.content.split('\n')),
            'gpu_processor_available': self.gpu_processor is not None
        }

        # Add metadata if available
        if self.metadata:
            stats['metadata'] = self.metadata

        return stats
251
+
252
+
253
+ class GPUProcessor(BaseProcessor):
254
+ """Processor for image files and PDFs with Nanonets OCR capabilities."""
255
+
256
    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: Optional[bool] = None, ocr_service=None):
        """Initialize the GPU processor.

        Args:
            preserve_layout: Whether OCR should keep the page layout.
            include_images: Whether images are included in the output.
            ocr_enabled: Whether OCR is performed at all.
            use_markdownify: Passed through to BaseProcessor; None means "use its default".
            ocr_service: Optional pre-built OCR service; when None, a Nanonets
                service is created lazily on first use (see _get_ocr_service).
        """
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        self._ocr_service = ocr_service
259
+
260
+ def can_process(self, file_path: str) -> bool:
261
+ """Check if this processor can handle the given file.
262
+
263
+ Args:
264
+ file_path: Path to the file to check
265
+
266
+ Returns:
267
+ True if this processor can handle the file
268
+ """
269
+ if not os.path.exists(file_path):
270
+ return False
271
+
272
+ # Check file extension - ensure file_path is a string
273
+ file_path_str = str(file_path)
274
+ _, ext = os.path.splitext(file_path_str.lower())
275
+ return ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif', '.pdf']
276
+
277
    def _get_ocr_service(self):
        """Return the OCR service, creating and caching a Nanonets one on first use."""
        if self._ocr_service is not None:
            return self._ocr_service
        # Use Nanonets OCR service by default
        self._ocr_service = OCRServiceFactory.create_service('nanonets')
        return self._ocr_service
284
+
285
+ def process(self, file_path: str) -> GPUConversionResult:
286
+ """Process image file or PDF with OCR capabilities.
287
+
288
+ Args:
289
+ file_path: Path to the image file or PDF
290
+
291
+ Returns:
292
+ GPUConversionResult with extracted content
293
+ """
294
+ try:
295
+ if not os.path.exists(file_path):
296
+ raise FileNotFoundError(f"File not found: {file_path}")
297
+
298
+ # Check file type
299
+ file_path_str = str(file_path)
300
+ _, ext = os.path.splitext(file_path_str.lower())
301
+
302
+ if ext == '.pdf':
303
+ logger.info(f"Processing PDF file: {file_path}")
304
+ return self._process_pdf(file_path)
305
+ else:
306
+ logger.info(f"Processing image file: {file_path}")
307
+ return self._process_image(file_path)
308
+
309
+ except Exception as e:
310
+ logger.error(f"Failed to process file {file_path}: {e}")
311
+ raise ConversionError(f"GPU processing failed: {e}")
312
+
313
+ def _process_image(self, file_path: str) -> GPUConversionResult:
314
+ """Process image file with OCR capabilities.
315
+
316
+ Args:
317
+ file_path: Path to the image file
318
+
319
+ Returns:
320
+ GPUConversionResult with extracted content
321
+ """
322
+ # Get OCR service
323
+ ocr_service = self._get_ocr_service()
324
+
325
+ # Extract text with layout awareness if enabled
326
+ if self.ocr_enabled and self.preserve_layout:
327
+ logger.info("Extracting text with layout awareness using Nanonets OCR")
328
+ extracted_text = ocr_service.extract_text_with_layout(file_path)
329
+ elif self.ocr_enabled:
330
+ logger.info("Extracting text without layout awareness using Nanonets OCR")
331
+ extracted_text = ocr_service.extract_text(file_path)
332
+ else:
333
+ logger.warning("OCR is disabled, returning empty content")
334
+ extracted_text = ""
335
+
336
+ # Create GPU result
337
+ result = GPUConversionResult(
338
+ content=extracted_text,
339
+ metadata={
340
+ 'file_path': file_path,
341
+ 'file_type': 'image',
342
+ 'ocr_enabled': self.ocr_enabled,
343
+ 'preserve_layout': self.preserve_layout,
344
+ 'ocr_provider': 'nanonets'
345
+ },
346
+ gpu_processor=self,
347
+ file_path=file_path,
348
+ ocr_provider='nanonets'
349
+ )
350
+
351
+ logger.info(f"Image processing completed. Extracted {len(extracted_text)} characters")
352
+ return result
353
+
354
    def _process_pdf(self, file_path: str) -> GPUConversionResult:
        """Process PDF file by converting each page to an image and running OCR.

        Pages are OCR'd independently; a failing page is recorded inline as a
        markdown error note rather than aborting the whole document.

        Args:
            file_path: Path to the PDF file

        Returns:
            GPUConversionResult with extracted content

        Raises:
            ConversionError: If conversion or OCR fails at the document level.
        """
        try:
            # Render every page to a temporary PNG (see _convert_pdf_to_images).
            image_paths = self._convert_pdf_to_images(file_path)

            if not image_paths:
                # Empty PDF: return an empty result rather than raising.
                logger.warning("No pages could be extracted from PDF")
                return GPUConversionResult(
                    content="",
                    metadata={
                        'file_path': file_path,
                        'file_type': 'pdf',
                        'ocr_enabled': self.ocr_enabled,
                        'preserve_layout': self.preserve_layout,
                        'ocr_provider': 'nanonets',
                        'pages_processed': 0
                    },
                    gpu_processor=self,
                    file_path=file_path,
                    ocr_provider='nanonets'
                )

            # Process each page with OCR, accumulating markdown fragments.
            all_texts = []
            ocr_service = self._get_ocr_service()

            for i, image_path in enumerate(image_paths):
                logger.info(f"Processing PDF page {i+1}/{len(image_paths)}")

                try:
                    if self.ocr_enabled and self.preserve_layout:
                        page_text = ocr_service.extract_text_with_layout(image_path)
                    elif self.ocr_enabled:
                        page_text = ocr_service.extract_text(image_path)
                    else:
                        page_text = ""

                    # Only emit a page section when the page produced text.
                    if page_text.strip():
                        # Add page header (markdown style)
                        all_texts.append(f"\n## Page {i+1}\n\n")
                        all_texts.append(page_text)

                        # Horizontal rule between pages (not after the last).
                        if i < len(image_paths) - 1:
                            all_texts.append("\n\n---\n\n")

                except Exception as e:
                    # Per-page failure: keep going, but record the error inline
                    # so the output still accounts for every page.
                    logger.error(f"Failed to process page {i+1}: {e}")
                    all_texts.append(f"\n## Page {i+1}\n\n*Error processing this page: {e}*\n\n")
                    if i < len(image_paths) - 1:
                        all_texts.append("---\n\n")

                finally:
                    # Always delete this page's temporary image, even on error.
                    try:
                        os.unlink(image_path)
                    except OSError:
                        pass

            # Combine all page fragments (separators are already embedded).
            combined_text = ''.join(all_texts)

            result = GPUConversionResult(
                content=combined_text,
                metadata={
                    'file_path': file_path,
                    'file_type': 'pdf',
                    'ocr_enabled': self.ocr_enabled,
                    'preserve_layout': self.preserve_layout,
                    'ocr_provider': 'nanonets',
                    'pages_processed': len(image_paths)
                },
                gpu_processor=self,
                file_path=file_path,
                ocr_provider='nanonets'
            )

            logger.info(f"PDF processing completed. Processed {len(image_paths)} pages, extracted {len(combined_text)} characters")
            return result

        except Exception as e:
            logger.error(f"Failed to process PDF {file_path}: {e}")
            raise ConversionError(f"PDF processing failed: {e}")
449
+ def _convert_pdf_to_images(self, pdf_path: str) -> List[str]:
450
+ """Convert PDF pages to images.
451
+
452
+ Args:
453
+ pdf_path: Path to the PDF file
454
+
455
+ Returns:
456
+ List of paths to temporary image files
457
+ """
458
+ try:
459
+ from pdf2image import convert_from_path
460
+ from ..config import InternalConfig
461
+
462
+ # Get DPI from config
463
+ dpi = getattr(InternalConfig, 'pdf_image_dpi', 300)
464
+
465
+ # Convert PDF pages to images using pdf2image
466
+ images = convert_from_path(pdf_path, dpi=dpi)
467
+ image_paths = []
468
+
469
+ # Save each image to a temporary file
470
+ for page_num, image in enumerate(images):
471
+ persistent_image_path = tempfile.mktemp(suffix='.png')
472
+ image.save(persistent_image_path, 'PNG')
473
+ image_paths.append(persistent_image_path)
474
+
475
+ logger.info(f"Converted PDF to {len(image_paths)} images")
476
+ return image_paths
477
+
478
+ except ImportError:
479
+ logger.error("pdf2image not available. Please install it: pip install pdf2image")
480
+ raise ConversionError("pdf2image is required for PDF processing")
481
+ except Exception as e:
482
+ logger.error(f"Failed to extract PDF to images: {e}")
483
+ raise ConversionError(f"PDF to image conversion failed: {e}")
484
+
485
+ @staticmethod
486
+ def predownload_ocr_models():
487
+ """Pre-download OCR models by running a dummy prediction."""
488
+ try:
489
+ from docstrange.pipeline.ocr_service import OCRServiceFactory
490
+ ocr_service = OCRServiceFactory.create_service('nanonets')
491
+ # Create a blank image for testing
492
+ from PIL import Image
493
+ import tempfile
494
+ with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
495
+ img = Image.new('RGB', (100, 100), color='white')
496
+ img.save(tmp.name)
497
+ ocr_service.extract_text_with_layout(tmp.name)
498
+ os.unlink(tmp.name)
499
+ logger.info("Nanonets OCR models pre-downloaded and cached.")
500
+ except Exception as e:
501
+ logger.error(f"Failed to pre-download Nanonets OCR models: {e}")
docstrange/processors/html_processor.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HTML file processor."""
2
+
3
+ import os
4
+ import logging
5
+ from typing import Dict, Any
6
+
7
+ from .base import BaseProcessor
8
+ from ..result import ConversionResult
9
+ from ..exceptions import ConversionError, FileNotFoundError
10
+
11
+ # Configure logging
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class HTMLProcessor(BaseProcessor):
    """Processor for HTML files using markdownify for conversion."""

    def can_process(self, file_path: str) -> bool:
        """Return True when *file_path* is an existing .html/.htm file.

        Args:
            file_path: Path to the file to check.

        Returns:
            True if this processor can handle the file.
        """
        if not os.path.exists(file_path):
            return False

        # Case-insensitive extension test; str() guards Path-like inputs.
        extension = os.path.splitext(str(file_path).lower())[1]
        return extension in ('.html', '.htm')

    def process(self, file_path: str) -> ConversionResult:
        """Convert an HTML file to markdown.

        Args:
            file_path: Path to the HTML file to process.

        Returns:
            ConversionResult containing the markdown content.

        Raises:
            FileNotFoundError: If the file doesn't exist.
            ConversionError: If markdownify is missing or conversion fails.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            try:
                from markdownify import markdownify as md
            except ImportError:
                raise ConversionError("markdownify is required for HTML processing. Install it with: pip install markdownify")

            metadata = self.get_metadata(file_path)
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw_html = handle.read()
            # ATX headings (#, ##, ...) keep the output consistent with the
            # rest of the library's markdown.
            return ConversionResult(md(raw_html, heading_style="ATX"), metadata)
        except Exception as e:
            # Our own exception types pass through untouched; everything else
            # is wrapped so callers see a single failure type.
            if isinstance(e, (FileNotFoundError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process HTML file {file_path}: {str(e)}")
docstrange/processors/image_processor.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image file processor with OCR capabilities."""
2
+
3
+ import os
4
+ import logging
5
+ from typing import Dict, Any
6
+
7
+ from .base import BaseProcessor
8
+ from ..result import ConversionResult
9
+ from ..exceptions import ConversionError, FileNotFoundError
10
+ from ..pipeline.ocr_service import OCRServiceFactory
11
+
12
+ # Configure logging
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class ImageProcessor(BaseProcessor):
    """Processor for image files (JPG, PNG, etc.) with OCR capabilities."""

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None, ocr_service=None):
        """Initialize the image processor.

        Args:
            preserve_layout: Keep document layout when extracting text.
            include_images: Include image content in the output.
            ocr_enabled: Run OCR on the image; when False, empty content is returned.
            use_markdownify: Forwarded to the base processor; None means use its default.
            ocr_service: Optional pre-built OCR service (e.g. shared across PDF
                pages by PDFProcessor); when None, one is created lazily.
        """
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        # Cached OCR service instance (lazily created; see _get_ocr_service).
        self._ocr_service = ocr_service

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the file exists and has a supported image extension
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif']

    def _get_ocr_service(self):
        """Return the cached OCR service, creating the default one on first use."""
        if self._ocr_service is not None:
            return self._ocr_service
        self._ocr_service = OCRServiceFactory.create_service()
        return self._ocr_service

    def process(self, file_path: str) -> ConversionResult:
        """Process image file with OCR capabilities.

        Args:
            file_path: Path to the image file

        Returns:
            ConversionResult with extracted content

        Raises:
            ConversionError: If processing fails (a missing file is wrapped
                into a ConversionError by the broad handler below).
        """
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"Image file not found: {file_path}")

            logger.info(f"Processing image file: {file_path}")

            ocr_service = self._get_ocr_service()

            # Extract text with layout awareness if enabled.
            if self.ocr_enabled and self.preserve_layout:
                logger.info("Extracting text with layout awareness")
                extracted_text = ocr_service.extract_text_with_layout(file_path)
            elif self.ocr_enabled:
                logger.info("Extracting text without layout awareness")
                extracted_text = ocr_service.extract_text(file_path)
            else:
                logger.warning("OCR is disabled, returning empty content")
                extracted_text = ""

            result = ConversionResult(
                content=extracted_text,
                metadata={
                    'file_path': file_path,
                    'file_type': 'image',
                    'ocr_enabled': self.ocr_enabled,
                    'preserve_layout': self.preserve_layout
                }
            )

            logger.info(f"Image processing completed. Extracted {len(extracted_text)} characters")
            return result

        except Exception as e:
            logger.error(f"Failed to process image file {file_path}: {e}")
            raise ConversionError(f"Image processing failed: {e}")

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models by running a dummy prediction.

        Failures are logged, never raised: pre-downloading is best-effort.
        """
        try:
            # Bug fix: use the OCRServiceFactory imported at module level from
            # ..pipeline.ocr_service. The previous import of
            # 'docstrange.services.ocr_service' referenced a non-existent
            # module, so the warm-up always failed and the error was swallowed.
            ocr_service = OCRServiceFactory.create_service()
            # Create a blank image for testing
            from PIL import Image
            import tempfile
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                img = Image.new('RGB', (100, 100), color='white')
                img.save(tmp.name)
                ocr_service.extract_text_with_layout(tmp.name)
                os.unlink(tmp.name)
            logger.info("OCR models pre-downloaded and cached.")
        except Exception as e:
            logger.error(f"Failed to pre-download OCR models: {e}")
docstrange/processors/pdf_processor.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF file processor with OCR support for scanned PDFs."""
2
+
3
+ import os
4
+ import logging
5
+ import tempfile
6
+ from typing import Dict, Any, List, Tuple
7
+
8
+ from .base import BaseProcessor
9
+ from .image_processor import ImageProcessor
10
+ from ..result import ConversionResult
11
+ from ..exceptions import ConversionError, FileNotFoundError
12
+ from ..config import InternalConfig
13
+ from ..pipeline.ocr_service import OCRServiceFactory, NeuralOCRService
14
+
15
+ # Configure logging
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class PDFProcessor(BaseProcessor):
    """Processor for PDF files using PDF-to-image conversion with OCR."""

    def __init__(self, preserve_layout: bool = True, include_images: bool = False, ocr_enabled: bool = True, use_markdownify: bool = None):
        """Initialize the PDF processor.

        A single OCR service instance is created here and shared by the
        per-page ImageProcessor, so OCR models are loaded only once per PDF.

        Args:
            preserve_layout: Keep document layout when extracting text.
            include_images: Include image content in the output.
            ocr_enabled: Run OCR on rendered pages.
            use_markdownify: Forwarded to the base processor; None means use its default.
        """
        super().__init__(preserve_layout, include_images, ocr_enabled, use_markdownify)
        # Create a shared OCR service instance for all pages.
        shared_ocr_service = NeuralOCRService()
        self._image_processor = ImageProcessor(
            preserve_layout=preserve_layout,
            include_images=include_images,
            ocr_enabled=ocr_enabled,
            use_markdownify=use_markdownify,
            ocr_service=shared_ocr_service
        )

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the file exists and has a .pdf extension
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext == '.pdf'

    def process(self, file_path: str) -> ConversionResult:
        """Process PDF file with OCR capabilities.

        Args:
            file_path: Path to the PDF file

        Returns:
            ConversionResult with extracted content

        Raises:
            ConversionError: If processing fails (a missing file is wrapped
                into a ConversionError by the broad handler below).
        """
        # Note: the previous version fetched InternalConfig.pdf_to_image_enabled
        # with an elaborate fallback but never used it to branch — OCR-based
        # processing is the only supported path, so the dead lookup is removed.
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"PDF file not found: {file_path}")

            logger.info(f"Processing PDF file: {file_path}")
            logger.info("Using OCR-based PDF processing with pdf2image")
            return self._process_with_ocr(file_path)

        except Exception as e:
            logger.error(f"Failed to process PDF file {file_path}: {e}")
            raise ConversionError(f"PDF processing failed: {e}")

    def _process_with_ocr(self, file_path: str) -> ConversionResult:
        """Process PDF using OCR after converting pages to images."""
        try:
            from pdf2image import convert_from_path

            # Render resolution from config (imported at module level);
            # 300 DPI is the default.
            dpi = getattr(InternalConfig, 'pdf_image_dpi', 300)

            # Convert PDF pages to images using pdf2image.
            images = convert_from_path(file_path, dpi=dpi)
            page_count = len(images)
            all_content = []

            for page_num, image in enumerate(images):
                # Save to a temporary file so ImageProcessor can OCR the page.
                with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
                    image.save(tmp.name, 'PNG')
                    temp_image_path = tmp.name

                try:
                    page_result = self._image_processor.process(temp_image_path)
                    page_content = page_result.content

                    if page_content.strip():
                        all_content.append(f"## Page {page_num + 1}\n\n{page_content}")

                finally:
                    # Always remove the temp image, even if OCR failed.
                    os.unlink(temp_image_path)

            content = "\n\n".join(all_content) if all_content else "No content extracted from PDF"

            return ConversionResult(
                content=content,
                metadata={
                    'file_path': file_path,
                    'file_type': 'pdf',
                    'pages': page_count,
                    'extraction_method': 'ocr'
                }
            )

        except ImportError:
            logger.error("pdf2image not available. Please install it: pip install pdf2image")
            raise ConversionError("pdf2image is required for PDF processing")
        except Exception as e:
            logger.error(f"OCR-based PDF processing failed: {e}")
            raise ConversionError(f"OCR-based PDF processing failed: {e}")

    @staticmethod
    def predownload_ocr_models():
        """Pre-download OCR models via ImageProcessor's warm-up routine."""
        try:
            # Use ImageProcessor's predownload method.
            ImageProcessor.predownload_ocr_models()
        except Exception as e:
            logger.error(f"Failed to pre-download OCR models: {e}")
docstrange/processors/pptx_processor.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PowerPoint file processor."""
2
+
3
+ import os
4
+ import logging
5
+ from typing import Dict, Any
6
+
7
+ from .base import BaseProcessor
8
+ from ..result import ConversionResult
9
+ from ..exceptions import ConversionError, FileNotFoundError
10
+
11
+ # Configure logging
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class PPTXProcessor(BaseProcessor):
    """Processor for PowerPoint files (PPT, PPTX).

    Legacy .ppt files are converted via pypandoc; modern .pptx files are read
    directly with python-pptx.
    """

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the file exists and has a .ppt/.pptx extension
        """
        if not os.path.exists(file_path):
            return False

        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        return ext in ['.ppt', '.pptx']

    def process(self, file_path: str) -> ConversionResult:
        """Process the PowerPoint file and return a conversion result.

        Args:
            file_path: Path to the PowerPoint file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Base metadata shared by both processing paths; each path adds its
        # own file_type/extractor fields.
        metadata = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "processor": "PPTXProcessor"
        }

        # Dispatch on extension: legacy binary .ppt vs XML-based .pptx.
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())

        if ext == '.ppt':
            return self._process_ppt_file(file_path, metadata)
        else:
            return self._process_pptx_file(file_path, metadata)

    def _process_ppt_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process legacy .ppt files by converting to markdown with pypandoc.

        Args:
            file_path: Path to the .ppt file
            metadata: Base metadata dict; updated in place with extractor info

        Raises:
            ConversionError: If pypandoc is missing or conversion fails.
        """
        try:
            import pypandoc

            # Convert .ppt to markdown using pandoc
            content = pypandoc.convert_file(file_path, 'markdown')

            metadata.update({
                "file_type": "ppt",
                "extractor": "pypandoc"
            })

            # Normalize whitespace before returning.
            content = self._clean_content(content)

            return ConversionResult(content, metadata)

        except ImportError:
            raise ConversionError("pypandoc is required for .ppt file processing. Install it with: pip install pypandoc")
        except Exception as e:
            raise ConversionError(f"Failed to process .ppt file {file_path}: {str(e)}")

    def _process_pptx_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .pptx files by extracting shape text with python-pptx.

        Args:
            file_path: Path to the .pptx file
            metadata: Base metadata dict; updated in place with slide count
                and extractor info

        Raises:
            ConversionError: If python-pptx is missing or extraction fails.
        """
        try:
            from pptx import Presentation

            content_parts = []
            prs = Presentation(file_path)

            metadata.update({
                "slide_count": len(prs.slides),
                "file_type": "pptx",
                "extractor": "python-pptx"
            })

            # preserve_layout comes from the base class when set; per-slide
            # headers are only emitted when layout preservation is on.
            preserve_layout = getattr(self, 'preserve_layout', False)

            for slide_num, slide in enumerate(prs.slides, 1):
                if preserve_layout:
                    content_parts.append(f"\n## Slide {slide_num}\n")

                slide_content = []

                # Only shapes exposing a non-empty .text contribute content
                # (pictures, charts etc. have no text attribute).
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_content.append(shape.text.strip())

                if slide_content:
                    content_parts.extend(slide_content)
                    content_parts.append("")  # Add spacing between slides

            content = "\n\n".join(content_parts)

            # Normalize whitespace; also collapses the blank spacers above.
            content = self._clean_content(content)

            return ConversionResult(content, metadata)

        except ImportError:
            raise ConversionError("python-pptx is required for .pptx file processing. Install it with: pip install python-pptx")
        except Exception as e:
            if isinstance(e, (FileNotFoundError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process .pptx file {file_path}: {str(e)}")

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted PowerPoint content.

        Collapses runs of whitespace within each line, drops empty lines,
        and re-adds a blank line before each slide header.

        Args:
            content: Raw PowerPoint text content

        Returns:
            Cleaned text content
        """
        # Remove excessive whitespace and normalize
        lines = content.split('\n')
        cleaned_lines = []

        for line in lines:
            # split()/join collapses any run of whitespace to single spaces.
            line = ' '.join(line.split())
            if line.strip():
                cleaned_lines.append(line)

        # Join lines and add proper spacing
        content = '\n'.join(cleaned_lines)

        # Re-insert a blank line before slide headers removed above.
        content = content.replace('## Slide', '\n## Slide')

        return content.strip()
docstrange/processors/txt_processor.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text file processor."""
2
+
3
+ import os
4
+ from typing import Dict, Any
5
+
6
+ from .base import BaseProcessor
7
+ from ..result import ConversionResult
8
+ from ..exceptions import ConversionError, FileNotFoundError
9
+
10
+
11
class TXTProcessor(BaseProcessor):
    """Processor for plain text files."""

    def can_process(self, file_path: str) -> bool:
        """Return True when *file_path* is an existing .txt/.text file.

        Args:
            file_path: Path to the file to check.

        Returns:
            True if this processor can handle the file.
        """
        if not os.path.exists(file_path):
            return False

        # Case-insensitive extension test; str() guards Path-like inputs.
        extension = os.path.splitext(str(file_path).lower())[1]
        return extension in ('.txt', '.text')

    def process(self, file_path: str) -> ConversionResult:
        """Read a plain-text file, trying several encodings in order.

        Args:
            file_path: Path to the text file to process.

        Returns:
            ConversionResult containing the cleaned content plus encoding,
            line-count and word-count metadata.

        Raises:
            FileNotFoundError: If the file doesn't exist.
            ConversionError: If the file cannot be decoded or read.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            content = None
            # utf-8 first; latin-1/cp1252/iso-8859-1 act as permissive fallbacks.
            for encoding in ('utf-8', 'latin-1', 'cp1252', 'iso-8859-1'):
                try:
                    with open(file_path, 'r', encoding=encoding) as handle:
                        content = handle.read()
                except UnicodeDecodeError:
                    continue
                break

            if content is None:
                raise ConversionError(f"Could not decode file {file_path} with any supported encoding")

            content = self._clean_content(content)

            metadata = self.get_metadata(file_path)
            metadata.update({
                "encoding": encoding,
                "line_count": len(content.split('\n')),
                "word_count": len(content.split())
            })

            return ConversionResult(content, metadata)

        except Exception as e:
            # Our own exception types pass through; everything else is wrapped.
            if isinstance(e, (FileNotFoundError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process text file {file_path}: {str(e)}")

    def _clean_content(self, content: str) -> str:
        """Strip trailing whitespace per line and blank lines at both ends.

        Args:
            content: Raw text content.

        Returns:
            Cleaned text content.
        """
        stripped = [line.rstrip() for line in content.split('\n')]

        # Trim whitespace-only lines from the front and back without
        # touching interior blank lines.
        start = 0
        end = len(stripped)
        while start < end and not stripped[start].strip():
            start += 1
        while end > start and not stripped[end - 1].strip():
            end -= 1

        return '\n'.join(stripped[start:end])
docstrange/processors/url_processor.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """URL processor for handling web pages and file downloads."""
2
+
3
+ import os
4
+ import re
5
+ import tempfile
6
+ from typing import Dict, Any, Optional
7
+ from urllib.parse import urlparse
8
+
9
+ from .base import BaseProcessor
10
+ from ..result import ConversionResult
11
+ from ..exceptions import ConversionError, NetworkError
12
+
13
+
14
+ class URLProcessor(BaseProcessor):
15
+ """Processor for URLs and web pages."""
16
+
17
    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check (or URL)

        Returns:
            True if this processor can handle the file
        """
        # Delegates to self._is_url (defined elsewhere in this class):
        # anything that looks like a URL is claimed by this processor.
        return self._is_url(file_path)
28
+
29
    def process(self, file_path: str) -> ConversionResult:
        """Process the URL and return a conversion result.

        File-like URLs (PDFs, Office documents, images, ...) are downloaded
        and dispatched to the matching file processor; everything else is
        treated as a web page and converted from HTML.

        Args:
            file_path: URL to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            NetworkError: If network operations fail
            ConversionError: If processing fails
        """
        try:
            import requests

            # First, check whether this URL points to a downloadable file.
            file_info = self._detect_file_from_url(file_path)

            if file_info:
                # File URL: download it and process with the right processor.
                return self._process_file_url(file_path, file_info)
            else:
                # Otherwise treat the URL as a web page (HTML).
                return self._process_web_page(file_path)

        except ImportError:
            raise ConversionError("requests and beautifulsoup4 are required for URL processing. Install them with: pip install requests beautifulsoup4")
        # NOTE: 'requests' below is only resolvable when the import above
        # succeeded; an ImportError is matched by the preceding clause before
        # this one is ever evaluated, so this is safe.
        except requests.RequestException as e:
            raise NetworkError(f"Failed to fetch URL {file_path}: {str(e)}")
        except Exception as e:
            # Our own exception types pass through; everything else is wrapped.
            if isinstance(e, (NetworkError, ConversionError)):
                raise
            raise ConversionError(f"Failed to process URL {file_path}: {str(e)}")
63
+
64
    def _detect_file_from_url(self, url: str) -> Optional[Dict[str, Any]]:
        """Detect if a URL points to a file and return file information.

        Detection is two-stage: first the URL path's extension is checked;
        failing that, a HEAD request inspects the Content-Type header.
        This method never raises — any failure means "treat as web page".

        Args:
            url: URL to check

        Returns:
            File info dict ({'file_type', 'extension', 'filename'}) if it's a
            file URL, None otherwise (i.e. treat the URL as a web page)
        """
        try:
            import requests

            # Stage 1: look for a known file extension in the URL path.
            parsed_url = urlparse(url)
            path = parsed_url.path.lower()

            # Extension -> internal file_type label used by the downloader.
            file_extensions = {
                '.pdf': 'pdf',
                '.doc': 'doc',
                '.docx': 'docx',
                '.txt': 'txt',
                '.md': 'markdown',
                '.html': 'html',
                '.htm': 'html',
                '.xlsx': 'xlsx',
                '.xls': 'xls',
                '.csv': 'csv',
                '.ppt': 'ppt',
                '.pptx': 'pptx',
                '.jpg': 'image',
                '.jpeg': 'image',
                '.png': 'image',
                '.gif': 'image',
                '.bmp': 'image',
                '.tiff': 'image',
                '.tif': 'image',
                '.webp': 'image'
            }

            # Check for file extension in URL path
            for ext, file_type in file_extensions.items():
                if path.endswith(ext):
                    return {
                        'file_type': file_type,
                        'extension': ext,
                        'filename': os.path.basename(path) or f"downloaded_file{ext}"
                    }

            # Stage 2: no extension in the URL — probe the server's
            # Content-Type header with a cheap HEAD request.
            try:
                # Browser-like User-Agent: some servers reject unknown clients.
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
                }

                # Make a HEAD request to check content-type
                response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)

                if response.status_code == 200:
                    content_type = response.headers.get('content-type', '').lower()

                    # Map Content-Type values to file info; the 'openxmlformats'
                    # marker distinguishes modern Office formats from legacy ones.
                    if 'application/pdf' in content_type:
                        return {'file_type': 'pdf', 'extension': '.pdf', 'filename': 'downloaded_file.pdf'}
                    elif 'application/msword' in content_type or 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' in content_type:
                        ext = '.docx' if 'openxmlformats' in content_type else '.doc'
                        return {'file_type': 'doc' if ext == '.doc' else 'docx', 'extension': ext, 'filename': f'downloaded_file{ext}'}
                    elif 'application/vnd.ms-excel' in content_type or 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' in content_type:
                        ext = '.xlsx' if 'openxmlformats' in content_type else '.xls'
                        return {'file_type': 'xlsx' if ext == '.xlsx' else 'xls', 'extension': ext, 'filename': f'downloaded_file{ext}'}
                    elif 'application/vnd.ms-powerpoint' in content_type or 'application/vnd.openxmlformats-officedocument.presentationml.presentation' in content_type:
                        ext = '.pptx' if 'openxmlformats' in content_type else '.ppt'
                        return {'file_type': 'pptx' if ext == '.pptx' else 'ppt', 'extension': ext, 'filename': f'downloaded_file{ext}'}
                    elif 'text/plain' in content_type:
                        return {'file_type': 'txt', 'extension': '.txt', 'filename': 'downloaded_file.txt'}
                    elif 'text/markdown' in content_type:
                        return {'file_type': 'markdown', 'extension': '.md', 'filename': 'downloaded_file.md'}
                    elif 'text/html' in content_type:
                        # HTML could be a web page or a file download; only a
                        # Content-Disposition attachment counts as a file.
                        if 'attachment' in response.headers.get('content-disposition', '').lower():
                            return {'file_type': 'html', 'extension': '.html', 'filename': 'downloaded_file.html'}
                        # If it's HTML but not an attachment, treat as web page
                        return None
                    elif any(img_type in content_type for img_type in ['image/jpeg', 'image/png', 'image/gif', 'image/bmp', 'image/tiff', 'image/webp']):
                        # Determine extension from content type
                        ext_map = {
                            'image/jpeg': '.jpg',
                            'image/png': '.png',
                            'image/gif': '.gif',
                            'image/bmp': '.bmp',
                            'image/tiff': '.tiff',
                            'image/webp': '.webp'
                        }
                        ext = ext_map.get(content_type, '.jpg')
                        return {'file_type': 'image', 'extension': ext, 'filename': f'downloaded_file{ext}'}

            except requests.RequestException:
                # If the HEAD request fails, fall through and assume web page.
                pass

        except Exception:
            # Detection is best-effort by design; never let it break process().
            pass

        return None
168
+
169
+ def _process_file_url(self, url: str, file_info: Dict[str, Any]) -> ConversionResult:
170
+ """Download and process a file from URL.
171
+
172
+ Args:
173
+ url: URL to download from
174
+ file_info: Information about the file
175
+
176
+ Returns:
177
+ ConversionResult containing the processed content
178
+ """
179
+ try:
180
+ import requests
181
+ from ..extractor import DocumentExtractor
182
+
183
+ # Download the file
184
+ headers = {
185
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
186
+ }
187
+
188
+ response = requests.get(url, headers=headers, timeout=60, stream=True)
189
+ response.raise_for_status()
190
+
191
+ # Create a temporary file
192
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_info['extension']) as temp_file:
193
+ # Write the downloaded content and track size
194
+ content_length = 0
195
+ for chunk in response.iter_content(chunk_size=8192):
196
+ if chunk: # Filter out keep-alive chunks
197
+ temp_file.write(chunk)
198
+ content_length += len(chunk)
199
+
200
+ temp_file_path = temp_file.name
201
+
202
+ try:
203
+ # Process the downloaded file using the appropriate processor
204
+ extractor = DocumentExtractor()
205
+ result = extractor.extract(temp_file_path)
206
+
207
+ # Add URL metadata to the result
208
+ result.metadata.update({
209
+ "source_url": url,
210
+ "downloaded_filename": file_info['filename'],
211
+ "content_type": response.headers.get('content-type', ''),
212
+ "content_length": content_length
213
+ })
214
+
215
+ return result
216
+
217
+ finally:
218
+ # Clean up the temporary file
219
+ try:
220
+ os.unlink(temp_file_path)
221
+ except OSError:
222
+ pass
223
+
224
+ except Exception as e:
225
+ raise ConversionError(f"Failed to download and process file from URL {url}: {str(e)}")
226
+
227
+ def _process_web_page(self, url: str) -> ConversionResult:
228
+ """Process a web page URL.
229
+
230
+ Args:
231
+ url: URL to process
232
+
233
+ Returns:
234
+ ConversionResult containing the processed content
235
+ """
236
+ try:
237
+ from bs4 import BeautifulSoup
238
+ import requests
239
+
240
+ # Fetch the web page
241
+ headers = {
242
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
243
+ }
244
+
245
+ response = requests.get(url, headers=headers, timeout=30)
246
+ response.raise_for_status()
247
+
248
+ # Parse the HTML
249
+ soup = BeautifulSoup(response.content, 'html.parser')
250
+
251
+ # Remove script and style elements
252
+ for script in soup(["script", "style"]):
253
+ script.decompose()
254
+
255
+ # Extract text content
256
+ content_parts = []
257
+
258
+ # Get title
259
+ title = soup.find('title')
260
+ if title:
261
+ content_parts.append(f"# {title.get_text().strip()}\n")
262
+
263
+ # Get main content
264
+ main_content = self._extract_main_content(soup)
265
+ if main_content:
266
+ content_parts.append(main_content)
267
+ else:
268
+ # Fallback to body text
269
+ body = soup.find('body')
270
+ if body:
271
+ content_parts.append(body.get_text())
272
+
273
+ content = '\n'.join(content_parts)
274
+
275
+ # Clean up the content
276
+ content = self._clean_content(content)
277
+
278
+ metadata = {
279
+ "url": url,
280
+ "status_code": response.status_code,
281
+ "content_type": response.headers.get('content-type', ''),
282
+ "content_length": len(response.content),
283
+ "processor": self.__class__.__name__
284
+ }
285
+
286
+ return ConversionResult(content, metadata)
287
+
288
+ except Exception as e:
289
+ raise ConversionError(f"Failed to process web page {url}: {str(e)}")
290
+
291
+ def _is_url(self, text: str) -> bool:
292
+ """Check if the text looks like a URL.
293
+
294
+ Args:
295
+ text: Text to check
296
+
297
+ Returns:
298
+ True if text looks like a URL
299
+ """
300
+ try:
301
+ result = urlparse(text)
302
+ return all([result.scheme, result.netloc])
303
+ except Exception:
304
+ return False
305
+
306
+ def _extract_main_content(self, soup) -> str:
307
+ """Extract main content from the HTML.
308
+
309
+ Args:
310
+ soup: BeautifulSoup object
311
+
312
+ Returns:
313
+ Extracted main content
314
+ """
315
+ # Try to find main content areas
316
+ main_selectors = [
317
+ 'main',
318
+ '[role="main"]',
319
+ '.main-content',
320
+ '.content',
321
+ '#content',
322
+ 'article',
323
+ '.post-content',
324
+ '.entry-content'
325
+ ]
326
+
327
+ for selector in main_selectors:
328
+ element = soup.select_one(selector)
329
+ if element:
330
+ return element.get_text()
331
+
332
+ # If no main content found, return empty string
333
+ return ""
334
+
335
+ def _clean_content(self, content: str) -> str:
336
+ """Clean up the extracted web content.
337
+
338
+ Args:
339
+ content: Raw web text content
340
+
341
+ Returns:
342
+ Cleaned text content
343
+ """
344
+ # Remove excessive whitespace and normalize
345
+ lines = content.split('\n')
346
+ cleaned_lines = []
347
+
348
+ for line in lines:
349
+ # Remove excessive whitespace
350
+ line = ' '.join(line.split())
351
+ if line.strip():
352
+ cleaned_lines.append(line)
353
+
354
+ # Join lines and add proper spacing
355
+ content = '\n'.join(cleaned_lines)
356
+
357
+ # Add spacing around headers
358
+ content = content.replace('# ', '\n# ')
359
+ content = content.replace('## ', '\n## ')
360
+
361
+ return content.strip()
docstrange/result.py ADDED
@@ -0,0 +1,1143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Conversion result class for handling different output formats."""
2
+
3
+ import csv
4
+ import io
5
+ import json
6
+ import logging
7
+ import re
8
+ from typing import Any, Dict, List, Optional, Union
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class MarkdownToJSONParser:
    """Comprehensive markdown to structured JSON parser.

    Splits a markdown document into header-delimited sections, parses each
    section's body into typed components (paragraphs, lists, tables, code
    blocks, images, links, blockquotes) and nests sections by heading level.
    """

    def __init__(self):
        """Initialize the parser."""
        # Compile regex patterns for better performance
        self.header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
        self.list_item_pattern = re.compile(r'^(\s*)[*\-+]\s+(.+)$', re.MULTILINE)
        self.ordered_list_pattern = re.compile(r'^(\s*)\d+\.\s+(.+)$', re.MULTILINE)
        self.code_block_pattern = re.compile(r'```(\w+)?\n(.*?)```', re.DOTALL)
        self.inline_code_pattern = re.compile(r'`([^`]+)`')
        self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
        self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
        self.table_pattern = re.compile(r'\|(.+)\|\s*\n\|[-\s|:]+\|\s*\n((?:\|.+\|\s*\n?)*)', re.MULTILINE)
        self.blockquote_pattern = re.compile(r'^>\s+(.+)$', re.MULTILINE)
        self.bold_pattern = re.compile(r'\*\*(.+?)\*\*')
        self.italic_pattern = re.compile(r'\*(.+?)\*')

    def parse(self, markdown_text: str) -> Dict[str, Any]:
        """Parse markdown text into structured JSON.

        Args:
            markdown_text: The markdown content to parse

        Returns:
            Structured JSON representation
        """
        if not markdown_text or not markdown_text.strip():
            return {
                "document": {
                    "sections": [],
                    "metadata": {"total_sections": 0}
                }
            }

        lines = markdown_text.split('\n')
        sections = []
        current_section = None
        current_content = []

        for line in lines:
            line = line.rstrip()

            # Check if this is a header
            header_match = self.header_pattern.match(line)
            if header_match:
                # Save previous section if exists
                if current_section is not None:
                    current_section['content'] = self._parse_content('\n'.join(current_content))
                    sections.append(current_section)

                # Start new section
                header_level = len(header_match.group(1))
                header_text = header_match.group(2).strip()

                current_section = {
                    "title": header_text,
                    "level": header_level,
                    "type": "section",
                    "content": {}
                }
                current_content = []
            else:
                # Add to current content
                if line.strip() or current_content:  # Keep empty lines only if we have content
                    current_content.append(line)

        # Don't forget the last section
        if current_section is not None:
            current_section['content'] = self._parse_content('\n'.join(current_content))
            sections.append(current_section)
        elif current_content:
            # Handle content without any headers
            sections.append({
                "title": "Content",
                "level": 1,
                "type": "section",
                "content": self._parse_content('\n'.join(current_content))
            })

        # Create hierarchical structure
        structured_sections = self._create_hierarchy(sections)

        return {
            "document": {
                "sections": structured_sections,
                "metadata": {
                    "total_sections": len(sections),
                    "max_heading_level": max([s.get('level', 1) for s in sections]) if sections else 0,
                    "has_tables": any('tables' in s.get('content', {}) for s in sections),
                    "has_code_blocks": any('code_blocks' in s.get('content', {}) for s in sections),
                    "has_lists": any('lists' in s.get('content', {}) for s in sections),
                    "has_images": any('images' in s.get('content', {}) for s in sections)
                }
            }
        }

    def _parse_content(self, content: str) -> Dict[str, Any]:
        """Parse content within a section into structured components."""
        if not content.strip():
            return {}

        result = {}

        # Extract and parse different content types; empty categories are
        # omitted so section content stays compact.
        paragraphs = self._extract_paragraphs(content)
        if paragraphs:
            result['paragraphs'] = paragraphs

        lists = self._extract_lists(content)
        if lists:
            result['lists'] = lists

        code_blocks = self._extract_code_blocks(content)
        if code_blocks:
            result['code_blocks'] = code_blocks

        tables = self._extract_tables(content)
        if tables:
            result['tables'] = tables

        images = self._extract_images(content)
        if images:
            result['images'] = images

        links = self._extract_links(content)
        if links:
            result['links'] = links

        blockquotes = self._extract_blockquotes(content)
        if blockquotes:
            result['blockquotes'] = blockquotes

        return result

    def _extract_paragraphs(self, content: str) -> List[str]:
        """Extract paragraphs from content."""
        # Remove code blocks, tables, lists, etc. to get clean paragraphs
        clean_content = content

        # Remove code blocks
        clean_content = self.code_block_pattern.sub('', clean_content)

        # Remove tables (simplified)
        clean_content = re.sub(r'\|.*\|', '', clean_content)

        # Remove list items
        clean_content = self.list_item_pattern.sub('', clean_content)
        clean_content = self.ordered_list_pattern.sub('', clean_content)

        # Remove blockquotes
        clean_content = self.blockquote_pattern.sub('', clean_content)

        # Split into paragraphs and clean
        paragraphs = []
        for para in clean_content.split('\n\n'):
            para = para.strip()
            if para and not para.startswith('#'):
                # Clean up markdown formatting for paragraphs
                para = self._clean_inline_formatting(para)
                paragraphs.append(para)

        return paragraphs

    def _extract_lists(self, content: str) -> List[Dict[str, Any]]:
        """Extract lists from content."""
        lists = []
        lines = content.split('\n')
        current_list = None

        for line in lines:
            line = line.rstrip()

            # Check for unordered list
            unordered_match = self.list_item_pattern.match(line)
            if unordered_match:
                # Two spaces of indentation == one nesting level.
                indent_level = len(unordered_match.group(1)) // 2
                item_text = self._clean_inline_formatting(unordered_match.group(2))

                if current_list is None or current_list['type'] != 'unordered':
                    if current_list:
                        lists.append(current_list)
                    current_list = {'type': 'unordered', 'items': []}

                current_list['items'].append({
                    'text': item_text,
                    'level': indent_level
                })
                continue

            # Check for ordered list
            ordered_match = self.ordered_list_pattern.match(line)
            if ordered_match:
                indent_level = len(ordered_match.group(1)) // 2
                item_text = self._clean_inline_formatting(ordered_match.group(2))

                if current_list is None or current_list['type'] != 'ordered':
                    if current_list:
                        lists.append(current_list)
                    current_list = {'type': 'ordered', 'items': []}

                current_list['items'].append({
                    'text': item_text,
                    'level': indent_level
                })
                continue

            # If we hit a non-list line and have a current list, save it
            if current_list and line.strip():
                lists.append(current_list)
                current_list = None

        # Don't forget the last list
        if current_list:
            lists.append(current_list)

        return lists

    def _extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
        """Extract fenced code blocks from content."""
        code_blocks = []

        for match in self.code_block_pattern.finditer(content):
            language = match.group(1) or 'text'
            code = match.group(2).strip()

            code_blocks.append({
                'language': language,
                'code': code
            })

        return code_blocks

    def _extract_tables(self, content: str) -> List[Dict[str, Any]]:
        """Extract tables from content."""
        tables = []

        for match in self.table_pattern.finditer(content):
            header_row = match.group(1).strip()
            body_rows = match.group(2).strip()

            # Parse header
            headers = [cell.strip() for cell in header_row.split('|') if cell.strip()]

            # Parse body rows
            rows = []
            for row_line in body_rows.split('\n'):
                if row_line.strip() and '|' in row_line:
                    cells = [cell.strip() for cell in row_line.split('|') if cell.strip()]
                    if cells:
                        rows.append(cells)

            if headers and rows:
                tables.append({
                    'headers': headers,
                    'rows': rows,
                    'columns': len(headers)
                })

        return tables

    def _extract_images(self, content: str) -> List[Dict[str, str]]:
        """Extract images from content."""
        images = []

        for match in self.image_pattern.finditer(content):
            alt_text = match.group(1)
            url = match.group(2)

            images.append({
                'alt_text': alt_text,
                'url': url
            })

        return images

    def _extract_links(self, content: str) -> List[Dict[str, str]]:
        """Extract hyperlinks from content.

        Image syntax (``![alt](url)``) contains the link syntax as a suffix,
        so images are stripped first. Previously every image was also
        reported as a link, duplicating it across both categories.
        """
        # Remove image occurrences so they are not double-counted as links.
        link_content = self.image_pattern.sub('', content)

        links = []

        for match in self.link_pattern.finditer(link_content):
            text = match.group(1)
            url = match.group(2)

            links.append({
                'text': text,
                'url': url
            })

        return links

    def _extract_blockquotes(self, content: str) -> List[str]:
        """Extract blockquotes from content."""
        blockquotes = []

        for match in self.blockquote_pattern.finditer(content):
            quote_text = match.group(1).strip()
            blockquotes.append(quote_text)

        return blockquotes

    def _clean_inline_formatting(self, text: str) -> str:
        """Clean inline markdown formatting from text."""
        # Remove bold
        text = self.bold_pattern.sub(r'\1', text)
        # Remove italic
        text = self.italic_pattern.sub(r'\1', text)
        # Remove inline code
        text = self.inline_code_pattern.sub(r'\1', text)

        return text.strip()

    def _create_hierarchy(self, sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create hierarchical structure from flat sections list.

        Uses a stack of open sections: each section becomes a subsection of
        the nearest preceding section with a smaller heading level.
        """
        if not sections:
            return []

        result = []
        stack = []

        for section in sections:
            level = section['level']

            # Pop from stack until we find a parent at appropriate level
            while stack and stack[-1]['level'] >= level:
                stack.pop()

            # If we have a parent, add this section as a subsection
            if stack:
                parent = stack[-1]
                if 'subsections' not in parent:
                    parent['subsections'] = []
                parent['subsections'].append(section)
            else:
                # This is a top-level section
                result.append(section)

            # Add this section to the stack
            stack.append(section)

        return result
354
+
355
+
356
class MarkdownToHTMLConverter:
    """Comprehensive markdown to HTML extractor.

    Converts block elements (code blocks, tables, headers, lists,
    blockquotes, horizontal rules) first, then inline elements, then wraps
    remaining text runs in paragraph tags.
    """

    def __init__(self):
        """Initialize the extractor."""
        # Compile regex patterns for better performance
        self.header_pattern = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
        self.bold_pattern = re.compile(r'\*\*(.+?)\*\*')
        self.italic_pattern = re.compile(r'\*(.+?)\*')
        self.bold_italic_pattern = re.compile(r'\*\*\*(.+?)\*\*\*')
        self.strikethrough_pattern = re.compile(r'~~(.+?)~~')
        self.inline_code_pattern = re.compile(r'`([^`]+)`')
        self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
        self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
        self.horizontal_rule_pattern = re.compile(r'^---+$', re.MULTILINE)
        self.blockquote_pattern = re.compile(r'^>\s+(.+)$', re.MULTILINE)

    def extract(self, markdown_text: str) -> str:
        """Convert markdown text to HTML.

        Args:
            markdown_text: The markdown content to extract

        Returns:
            HTML string
        """
        html = markdown_text

        # Process code blocks first (before other inline processing)
        html = self._process_code_blocks(html)

        # Process tables
        html = self._process_tables(html)

        # Process horizontal rules
        html = self._process_horizontal_rules(html)

        # Process blockquotes
        html = self._process_blockquotes(html)

        # Process headers
        html = self._process_headers(html)

        # Process lists
        html = self._process_lists(html)

        # Process inline elements
        html = self._process_inline_elements(html)

        # Process paragraphs
        html = self._process_paragraphs(html)

        return html

    def _process_code_blocks(self, text: str) -> str:
        """Process fenced code blocks."""
        # Handle ```code blocks```
        def replace_code_block(match):
            language = match.group(1) or ''
            code = match.group(2)
            lang_class = f' class="language-{language}"' if language else ''
            return f'<pre><code{lang_class}>{self._escape_html(code)}</code></pre>'

        text = re.sub(r'```(\w+)?\n(.*?)\n```', replace_code_block, text, flags=re.DOTALL)

        # Handle indented code blocks (4 spaces or tab)
        lines = text.split('\n')
        in_code_block = False
        code_lines = []
        result_lines = []

        for line in lines:
            if line.startswith('    ') or line.startswith('\t'):
                if not in_code_block:
                    in_code_block = True
                    code_lines = [line.lstrip()]
                else:
                    code_lines.append(line.lstrip())
            else:
                if in_code_block:
                    # End code block
                    code_content = '\n'.join(code_lines)
                    result_lines.append(f'<pre><code>{self._escape_html(code_content)}</code></pre>')
                    code_lines = []
                    in_code_block = False
                result_lines.append(line)

        if in_code_block:
            code_content = '\n'.join(code_lines)
            result_lines.append(f'<pre><code>{self._escape_html(code_content)}</code></pre>')

        return '\n'.join(result_lines)

    def _process_tables(self, text: str) -> str:
        """Process markdown tables."""
        lines = text.split('\n')
        result_lines = []
        i = 0

        while i < len(lines):
            line = lines[i]

            # Check if this line looks like a table header
            if '|' in line and i + 1 < len(lines) and '|' in lines[i + 1]:
                # Check if next line is separator
                next_line = lines[i + 1]
                if re.match(r'^\s*\|[\s\-:|]+\|\s*$', next_line):
                    # This is a table
                    table_lines = [line]
                    j = i + 1

                    # Collect all table rows
                    while j < len(lines) and '|' in lines[j]:
                        table_lines.append(lines[j])
                        j += 1

                    # Convert table to HTML
                    html_table = self._convert_table_to_html(table_lines)
                    result_lines.append(html_table)
                    i = j
                    continue

            result_lines.append(line)
            i += 1

        return '\n'.join(result_lines)

    def _convert_table_to_html(self, table_lines: List[str]) -> str:
        """Convert table lines to HTML table."""
        if len(table_lines) < 2:
            return table_lines[0] if table_lines else ''

        html_parts = ['<table>']

        # Process header
        header_cells = [cell.strip() for cell in table_lines[0].split('|')[1:-1]]
        html_parts.append('<thead><tr>')
        for cell in header_cells:
            html_parts.append(f'<th>{self._escape_html(cell)}</th>')
        html_parts.append('</tr></thead>')

        # Process body (skip separator line)
        html_parts.append('<tbody>')
        for line in table_lines[2:]:
            cells = [cell.strip() for cell in line.split('|')[1:-1]]
            html_parts.append('<tr>')
            for cell in cells:
                html_parts.append(f'<td>{self._escape_html(cell)}</td>')
            html_parts.append('</tr>')
        html_parts.append('</tbody>')

        html_parts.append('</table>')
        return '\n'.join(html_parts)

    def _process_horizontal_rules(self, text: str) -> str:
        """Process horizontal rules."""
        return self.horizontal_rule_pattern.sub('<hr>', text)

    def _process_blockquotes(self, text: str) -> str:
        """Process blockquotes."""
        lines = text.split('\n')
        result_lines = []
        i = 0

        while i < len(lines):
            line = lines[i]

            if line.startswith('> '):
                # Start blockquote
                quote_lines = [line[2:]]  # Remove '> '
                j = i + 1

                # Collect all quote lines (blank lines stay inside the quote)
                while j < len(lines) and (lines[j].startswith('> ') or lines[j].strip() == ''):
                    if lines[j].startswith('> '):
                        quote_lines.append(lines[j][2:])
                    else:
                        quote_lines.append('')
                    j += 1

                # Convert to HTML
                quote_content = '\n'.join(quote_lines)
                quote_html = self._process_inline_elements(quote_content)
                result_lines.append(f'<blockquote>{quote_html}</blockquote>')
                i = j
                continue

            result_lines.append(line)
            i += 1

        return '\n'.join(result_lines)

    def _process_headers(self, text: str) -> str:
        """Process markdown headers."""
        def replace_header(match):
            level = len(match.group(1))
            content = match.group(2)
            return f'<h{level}>{self._escape_html(content)}</h{level}>'

        return self.header_pattern.sub(replace_header, text)

    def _process_lists(self, text: str) -> str:
        """Process ordered and unordered lists."""
        lines = text.split('\n')
        result_lines = []
        i = 0

        while i < len(lines):
            line = lines[i]

            # Check for unordered list
            if re.match(r'^[\s]*[-*+]\s+', line):
                list_lines = self._collect_list_items(lines, i, r'^[\s]*[-*+]\s+')
                html_list = self._convert_list_to_html(list_lines, 'ul')
                result_lines.append(html_list)
                i += len(list_lines)
                continue

            # Check for ordered list
            elif re.match(r'^[\s]*\d+\.\s+', line):
                list_lines = self._collect_list_items(lines, i, r'^[\s]*\d+\.\s+')
                html_list = self._convert_list_to_html(list_lines, 'ol')
                result_lines.append(html_list)
                i += len(list_lines)
                continue

            result_lines.append(line)
            i += 1

        return '\n'.join(result_lines)

    def _collect_list_items(self, lines: List[str], start_idx: int, pattern: str) -> List[str]:
        """Collect consecutive list items."""
        items = []
        i = start_idx

        while i < len(lines):
            line = lines[i]
            if re.match(pattern, line):
                items.append(line)
                i += 1
            elif line.strip() == '':
                # Empty line might be part of list item
                items.append(line)
                i += 1
            else:
                break

        return items

    def _convert_list_to_html(self, list_lines: List[str], list_type: str) -> str:
        """Convert list lines to HTML list."""
        html_parts = [f'<{list_type}>']

        for line in list_lines:
            if line.strip() == '':
                continue

            # Extract list item content
            if list_type == 'ul':
                content = re.sub(r'^[\s]*[-*+]\s+', '', line)
            else:
                content = re.sub(r'^[\s]*\d+\.\s+', '', line)

            # Process inline elements in list item
            content = self._process_inline_elements(content)
            html_parts.append(f'<li>{content}</li>')

        html_parts.append(f'</{list_type}>')
        return '\n'.join(html_parts)

    def _process_inline_elements(self, text: str) -> str:
        """Process inline markdown elements."""
        # Process bold and italic (order matters)
        text = self.bold_italic_pattern.sub(r'<strong><em>\1</em></strong>', text)
        text = self.bold_pattern.sub(r'<strong>\1</strong>', text)
        text = self.italic_pattern.sub(r'<em>\1</em>', text)

        # Process strikethrough
        text = self.strikethrough_pattern.sub(r'<del>\1</del>', text)

        # Process inline code
        text = self.inline_code_pattern.sub(r'<code>\1</code>', text)

        # Process images BEFORE links: image syntax ![alt](url) contains the
        # link syntax [alt](url) as a suffix, so converting links first
        # turned every image into '!<a href=...>' instead of an <img> tag.
        text = self.image_pattern.sub(r'<img src="\2" alt="\1">', text)

        # Process links
        text = self.link_pattern.sub(r'<a href="\2">\1</a>', text)

        return text

    def _process_paragraphs(self, text: str) -> str:
        """Process paragraphs by wrapping non-empty lines in <p> tags."""
        lines = text.split('\n')
        result_lines = []
        current_paragraph = []

        for line in lines:
            if line.strip() == '':
                if current_paragraph:
                    # End current paragraph
                    paragraph_content = ' '.join(current_paragraph)
                    result_lines.append(f'<p>{paragraph_content}</p>')
                    current_paragraph = []
            else:
                # Check if line is already an HTML block element
                if re.match(r'^<(h[1-6]|p|div|blockquote|pre|table|ul|ol|li|hr)', line.strip()):
                    # Flush current paragraph if any
                    if current_paragraph:
                        paragraph_content = ' '.join(current_paragraph)
                        result_lines.append(f'<p>{paragraph_content}</p>')
                        current_paragraph = []
                    result_lines.append(line)
                else:
                    current_paragraph.append(line)

        # Handle any remaining paragraph
        if current_paragraph:
            paragraph_content = ' '.join(current_paragraph)
            result_lines.append(f'<p>{paragraph_content}</p>')

        return '\n'.join(result_lines)

    def _escape_html(self, text: str) -> str:
        """Escape HTML special characters."""
        return (text.replace('&', '&amp;')
                   .replace('<', '&lt;')
                   .replace('>', '&gt;')
                   .replace('"', '&quot;')
                   .replace("'", '&#39;'))
687
+
688
+
689
class ConversionResult:
    """Result object with methods to export converted content to different formats.

    Wraps the markdown produced by a converter and exposes export helpers
    (markdown, HTML, JSON, plain text, CSV).
    """

    def __init__(self, content: str, metadata: Optional[Dict[str, Any]] = None):
        """Initialize the conversion result.

        Args:
            content: The converted content as string
            metadata: Optional metadata about the conversion
        """
        self.content = content
        self.metadata = metadata or {}
        self._html_converter = MarkdownToHTMLConverter()
        self._json_parser = MarkdownToJSONParser()

    def extract_markdown(self) -> str:
        """Export as markdown.

        Returns:
            The content formatted as markdown
        """
        return self.content

    def extract_html(self) -> str:
        """Export as HTML.

        Returns:
            The content formatted as HTML
        """
        # Convert markdown content to HTML using the comprehensive extractor
        html_content = self._html_converter.extract(self.content)

        # Wrap in HTML structure with Nanonets design system.
        # NOTE: literal CSS braces are doubled ({{ }}) because this is an f-string.
        return f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Converted Document</title>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
    <style>
        * {{
            box-sizing: border-box;
        }}

        body {{
            font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.6;
            color: #1F2129;
            background-color: #FFFFFF;
            margin: 0;
            padding: 2rem;
            max-width: 1200px;
            margin: 0 auto;
        }}

        .content {{
            background: #FFFFFF;
            padding: 2rem;
            border-radius: 8px;
            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
        }}

        /* Typography */
        h1, h2, h3, h4, h5, h6 {{
            font-family: 'Inter', sans-serif;
            color: #1D2554;
            margin-top: 2rem;
            margin-bottom: 1rem;
            font-weight: 600;
            line-height: 1.3;
        }}

        h1 {{ font-size: 48px; letter-spacing: -0.02em; margin-top: 0; }}
        h2 {{ font-size: 36px; letter-spacing: -0.01em; }}
        h3 {{ font-size: 24px; }}
        h4 {{ font-size: 20px; }}
        h5 {{ font-size: 16px; }}
        h6 {{ font-size: 14px; }}

        p {{
            font-size: 16px;
            line-height: 1.6;
            margin-bottom: 1rem;
            color: #1F2129;
        }}

        /* Lists */
        ul, ol {{
            margin: 1rem 0;
            padding-left: 2rem;
        }}

        li {{
            margin-bottom: 0.5rem;
            line-height: 1.6;
        }}

        /* Code */
        code {{
            background-color: #F8FAFF;
            color: #3A4DB2;
            padding: 0.2rem 0.4rem;
            border-radius: 4px;
            font-family: 'SF Mono', Monaco, 'Cascadia Code', 'Roboto Mono', Consolas, 'Courier New', monospace;
            font-size: 0.9em;
            border: 1px solid #EAEDFF;
        }}

        pre {{
            background-color: #F8FAFF;
            border: 1px solid #EAEDFF;
            border-radius: 8px;
            padding: 1.5rem;
            overflow-x: auto;
            margin: 1.5rem 0;
        }}

        pre code {{
            background: none;
            border: none;
            padding: 0;
            color: #1F2129;
        }}

        /* Tables */
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 1.5rem 0;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
        }}

        th, td {{
            border: 1px solid #EAEDFF;
            padding: 0.75rem;
            text-align: left;
            vertical-align: top;
        }}

        th {{
            background-color: #F2F4FF;
            color: #1D2554;
            font-weight: 600;
            font-size: 14px;
        }}

        td {{
            background-color: #FFFFFF;
            font-size: 14px;
        }}

        tr:nth-child(even) td {{
            background-color: #F8FAFF;
        }}

        /* Links */
        a {{
            color: #546FFF;
            text-decoration: none;
            border-bottom: 1px solid transparent;
            transition: border-bottom-color 0.2s ease;
        }}

        a:hover {{
            border-bottom-color: #546FFF;
        }}

        /* Images */
        img {{
            max-width: 100%;
            height: auto;
            border-radius: 8px;
            margin: 1rem 0;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
        }}

        /* Blockquotes */
        blockquote {{
            border-left: 4px solid #546FFF;
            margin: 1.5rem 0;
            padding: 1rem 1.5rem;
            background-color: #F8FAFF;
            border-radius: 0 8px 8px 0;
            font-style: italic;
        }}

        blockquote p {{
            margin: 0;
            color: #3A4DB2;
        }}

        /* Horizontal rules */
        hr {{
            border: none;
            height: 1px;
            background-color: #EAEDFF;
            margin: 2rem 0;
        }}

        /* Emphasis */
        strong {{
            font-weight: 600;
            color: #1D2554;
        }}

        em {{
            font-style: italic;
            color: #3A4DB2;
        }}

        del {{
            text-decoration: line-through;
            color: #676767;
        }}

        /* Responsive design */
        @media (max-width: 768px) {{
            body {{
                padding: 1rem;
            }}

            .content {{
                padding: 1rem;
            }}

            h1 {{ font-size: 32px; }}
            h2 {{ font-size: 28px; }}
            h3 {{ font-size: 20px; }}

            table {{
                font-size: 12px;
            }}

            th, td {{
                padding: 0.5rem;
            }}
        }}
    </style>
</head>
<body>
    <div class="content">
        {html_content}
    </div>
</body>
</html>"""

    def extract_data(self, specified_fields: Optional[list] = None, json_schema: Optional[dict] = None,
                     ollama_url: str = "http://localhost:11434", ollama_model: str = "llama3.2") -> Dict[str, Any]:
        """Convert content to JSON format.

        Args:
            specified_fields: List of specific fields to extract (uses Ollama)
            json_schema: JSON schema to conform to (uses Ollama)
            ollama_url: Ollama server URL for local processing
            ollama_model: Model name for local processing

        Returns:
            Dictionary containing the JSON representation
        """
        try:
            # If specific fields or schema are requested, use Ollama extraction
            if specified_fields or json_schema:
                try:
                    from docstrange.services import OllamaFieldExtractor
                    extractor = OllamaFieldExtractor(base_url=ollama_url, model=ollama_model)

                    if extractor.is_available():
                        if specified_fields:
                            extracted_data = extractor.extract_fields(self.content, specified_fields)
                            return {
                                "extracted_fields": extracted_data,
                                "requested_fields": specified_fields,
                                **self.metadata,
                                "format": "local_specified_fields",
                                "extractor": "ollama"
                            }
                        elif json_schema:
                            extracted_data = extractor.extract_with_schema(self.content, json_schema)
                            return {
                                "extracted_data": extracted_data,
                                "schema": json_schema,
                                **self.metadata,
                                "format": "local_json_schema",
                                "extractor": "ollama"
                            }
                    else:
                        logger.warning("Ollama not available for field extraction, falling back to standard parsing")
                except Exception as e:
                    logger.warning(f"Ollama extraction failed: {e}, falling back to standard parsing")

            # For general JSON conversion, try Ollama first for better document understanding
            try:
                from docstrange.services import OllamaFieldExtractor
                extractor = OllamaFieldExtractor(base_url=ollama_url, model=ollama_model)

                if extractor.is_available():
                    # Ask Ollama to extract the entire document to structured JSON
                    document_json = extractor.extract_document_json(self.content)
                    return {
                        **document_json,
                        **self.metadata,
                        "format": "ollama_structured_json",
                        "extractor": "ollama"
                    }
                else:
                    logger.info("Ollama not available, using fallback JSON parser")
            except Exception as e:
                logger.warning(f"Ollama document conversion failed: {e}, using fallback parser")

            # Fallback to original parsing logic
            parsed_content = self._json_parser.parse(self.content)
            return {
                **parsed_content,
                **self.metadata,
                "format": "structured_json"
            }

        except Exception as e:
            logger.error(f"JSON conversion failed: {e}")
            return {
                "error": f"Failed to extract to JSON: {str(e)}",
                "raw_content": self.content,
                **self.metadata,
                "format": "error"
            }

    def extract_text(self) -> str:
        """Export as plain text.

        Returns:
            The content as plain text
        """
        return self.content

    def extract_csv(self, table_index: int = 0, include_all_tables: bool = False) -> str:
        """Export tables as CSV format.

        Args:
            table_index: Which table to export (0-based index). Default is 0 (first table).
            include_all_tables: If True, export all tables with separators. Default is False.

        Returns:
            CSV formatted string of the table(s)

        Raises:
            ValueError: If no tables are found or table_index is out of range
        """
        # Parse the content to extract tables
        json_data = self.extract_data()

        # Extract all tables from all sections
        tables = []

        def extract_tables_from_sections(sections):
            for section in sections:
                content = section.get('content', {})
                if 'tables' in content:
                    tables.extend(content['tables'])
                # Recursively check subsections
                if 'subsections' in section:
                    extract_tables_from_sections(section['subsections'])

        if 'document' in json_data and 'sections' in json_data['document']:
            extract_tables_from_sections(json_data['document']['sections'])

        if not tables:
            # If no structured tables found, try to parse markdown tables directly
            tables = self._extract_markdown_tables_directly(self.content)

        if not tables:
            raise ValueError("No tables found in the document content")

        if include_all_tables:
            # Export all tables with separators
            csv_output = io.StringIO()
            writer = csv.writer(csv_output)

            for i, table in enumerate(tables):
                if i > 0:
                    # Add separator between tables
                    writer.writerow([])
                    writer.writerow([f"=== Table {i + 1} ==="])
                    writer.writerow([])

                # Write table headers if available
                if 'headers' in table and table['headers']:
                    writer.writerow(table['headers'])

                # Write table rows
                if 'rows' in table:
                    for row in table['rows']:
                        writer.writerow(row)

            return csv_output.getvalue()
        else:
            # Export specific table
            if table_index >= len(tables):
                raise ValueError(f"Table index {table_index} out of range. Found {len(tables)} table(s)")

            table = tables[table_index]
            csv_output = io.StringIO()
            writer = csv.writer(csv_output)

            # Write table headers if available
            if 'headers' in table and table['headers']:
                writer.writerow(table['headers'])

            # Write table rows
            if 'rows' in table:
                for row in table['rows']:
                    writer.writerow(row)

            return csv_output.getvalue()

    @staticmethod
    def _split_table_row(row_line: str) -> List[str]:
        """Split a full markdown table row (``| a | b |``) into stripped cells.

        Unlike a naive truthiness filter, this preserves empty *interior*
        cells so column alignment is kept; only the empty boundary cells
        produced by the leading and trailing pipes are dropped.
        """
        cells = [cell.strip() for cell in row_line.strip().split('|')]
        if cells and cells[0] == '':
            cells = cells[1:]
        if cells and cells[-1] == '':
            cells = cells[:-1]
        return cells

    def _extract_markdown_tables_directly(self, content: str) -> List[Dict[str, Any]]:
        """Extract tables directly from markdown content as fallback.

        Fix over the previous version: empty cells are no longer discarded,
        which used to silently shift every following cell one column left.
        """
        tables = []
        table_pattern = re.compile(r'\|(.+)\|\s*\n\|[-\s|:]+\|\s*\n((?:\|.+\|\s*\n?)*)', re.MULTILINE)

        for match in table_pattern.finditer(content):
            header_row = match.group(1).strip()
            body_rows = match.group(2).strip()

            # Parse header; the regex group already excludes the boundary
            # pipes, so a plain split keeps interior empty cells in place.
            headers = [cell.strip() for cell in header_row.split('|')]

            # Parse body rows (full lines including boundary pipes)
            rows = []
            for row_line in body_rows.split('\n'):
                if row_line.strip() and '|' in row_line:
                    cells = self._split_table_row(row_line)
                    if cells:
                        rows.append(cells)

            if any(headers) and rows:
                tables.append({
                    'headers': headers,
                    'rows': rows,
                    'columns': len(headers)
                })

        return tables

    def __str__(self) -> str:
        """String representation of the result."""
        return self.content

    def __repr__(self) -> str:
        """Representation of the result object."""
        return f"ConversionResult(content='{self.content[:50]}...', metadata={self.metadata})"
docstrange/services/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Services for authentication, API key pooling, and local LLM processing."""
2
+
3
+ from .ollama_service import OllamaFieldExtractor
4
+ from .api_key_pool import (
5
+ ApiKeyPool,
6
+ get_pool,
7
+ add_api_key,
8
+ remove_api_key,
9
+ list_api_keys,
10
+ get_available_key,
11
+ )
12
+
13
+ __all__ = [
14
+ "OllamaFieldExtractor",
15
+ "ApiKeyPool",
16
+ "get_pool",
17
+ "add_api_key",
18
+ "remove_api_key",
19
+ "list_api_keys",
20
+ "get_available_key",
21
+ ]
docstrange/services/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (511 Bytes). View file
 
docstrange/services/__pycache__/api_key_pool.cpython-310.pyc ADDED
Binary file (8.41 kB). View file
 
docstrange/services/__pycache__/ollama_service.cpython-310.pyc ADDED
Binary file (8.45 kB). View file
 
docstrange/services/api_key_pool.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ API Key Pool Manager for DocStrange.
3
+
4
+ Manages a pool of Nanonets API keys with automatic rotation on rate limit (429).
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import time
10
+ import threading
11
+ from pathlib import Path
12
+ from typing import Optional, List, Dict, Any
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class KeyStatus:
    """String constants for the lifecycle state of a pooled API key.

    Stored on :class:`ApiKeyEntry.status` and compared by string equality.
    """

    ACTIVE = "active"  # key may be used for requests
    RATE_LIMITED = "rate_limited"  # key hit a 429; unusable until its reset time passes
    EXPIRED = "expired"  # NOTE(review): never assigned in this module — presumably reserved for permanently invalid keys; confirm before relying on it
22
+
23
+
24
class ApiKeyEntry:
    """Represents a single API key in the pool with its state.

    Tracks availability (active vs. rate-limited), usage counters, and the
    epoch time at which a rate limit lifts.
    """

    def __init__(self, key: str, source: str = "manual"):
        self.key = key
        self.source = source  # "manual", "env", "config", "credentials"
        self.status = KeyStatus.ACTIVE
        self.rate_limited_at: Optional[float] = None  # epoch time the 429 was observed
        self.reset_at: Optional[float] = None  # epoch time when the rate limit resets
        self.requests_made = 0
        self.last_used: Optional[float] = None

    def mark_rate_limited(self, reset_after_seconds: int = 3600) -> None:
        """Mark this key as rate-limited for *reset_after_seconds* seconds.

        Reads the clock once so ``rate_limited_at`` and ``reset_at`` are
        always exactly ``reset_after_seconds`` apart (the previous version
        called ``time.time()`` twice, leaving the two values skewed).
        """
        now = time.time()
        self.status = KeyStatus.RATE_LIMITED
        self.rate_limited_at = now
        self.reset_at = now + reset_after_seconds
        logger.warning(f"API key {self.key[:8]}... rate limited, resets at {self.reset_at}")

    def is_available(self) -> bool:
        """Check if this key is available for use.

        A rate-limited key is lazily reactivated here once its reset time
        has passed, so callers never need to clear the state themselves.
        """
        if self.status == KeyStatus.ACTIVE:
            return True
        if self.status == KeyStatus.RATE_LIMITED and self.reset_at:
            if time.time() >= self.reset_at:
                self.status = KeyStatus.ACTIVE
                self.rate_limited_at = None
                self.reset_at = None
                return True
        return False

    def record_use(self) -> None:
        """Record that this key was used."""
        self.requests_made += 1
        self.last_used = time.time()
59
+
60
+
61
class ApiKeyPool:
    """
    Manages a pool of API keys with automatic rotation.

    When a key hits rate limit (429), it's marked as unavailable and the next
    key in the pool is tried. When all keys are exhausted, signals fallback.
    """

    _instance = None
    _lock = threading.Lock()  # guards singleton creation only

    def __init__(self):
        self._keys: List[ApiKeyEntry] = []
        self._current_index = 0
        self._lock_pool = threading.Lock()  # guards all mutations of _keys/_current_index
        self._config_path = Path.home() / ".docstrange" / "api_keys.json"
        self._load_config()

    @classmethod
    def get_instance(cls) -> "ApiKeyPool":
        """Get singleton instance (double-checked locking)."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    def _load_config(self):
        """Load API keys from the config file and the environment.

        Config entries may be bare strings or ``{"key": ..., "source": ...}``
        dicts; the NANONETS_API_KEYS env var may hold a comma-separated list.
        Failures are logged and ignored so a corrupt config never blocks startup.
        """
        try:
            if self._config_path.exists():
                with open(self._config_path, 'r') as f:
                    config = json.load(f)

                keys = config.get("api_keys", [])
                for key_entry in keys:
                    if isinstance(key_entry, str):
                        self.add_key(key_entry, source="config")
                    elif isinstance(key_entry, dict) and "key" in key_entry:
                        self.add_key(key_entry["key"], source=key_entry.get("source", "config"))

                logger.info(f"Loaded {len(self._keys)} API keys from config")
        except Exception as e:
            logger.warning(f"Failed to load API key config: {e}")

        # Also check environment variable for a comma-separated list of keys
        env_keys = os.environ.get('NANONETS_API_KEYS', '')
        if env_keys:
            for key in env_keys.split(','):
                key = key.strip()
                if key:
                    self.add_key(key, source="env")

    def save_config(self):
        """Save API keys to the config file with owner-only permissions.

        The file is created with mode 0o600 from the start (via ``os.open``)
        instead of being chmod-ed after the write, so the keys are never
        momentarily world-readable on first save.
        """
        try:
            config_dir = self._config_path.parent
            # parents=True: tolerate missing intermediate directories too.
            config_dir.mkdir(parents=True, exist_ok=True)

            keys_data = []
            for entry in self._keys:
                keys_data.append({
                    "key": entry.key,
                    "source": entry.source
                })

            fd = os.open(self._config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
            with os.fdopen(fd, 'w') as f:
                json.dump({"api_keys": keys_data}, f, indent=2)

            # Tighten permissions of files created before this fix as well.
            os.chmod(self._config_path, 0o600)
            logger.info(f"Saved {len(keys_data)} API keys to config")
        except Exception as e:
            logger.error(f"Failed to save API key config: {e}")

    def add_key(self, key: str, source: str = "manual") -> bool:
        """Add an API key to the pool. Returns False on duplicates."""
        with self._lock_pool:
            # Check for duplicates
            for entry in self._keys:
                if entry.key == key:
                    return False

            self._keys.append(ApiKeyEntry(key, source))
            logger.info(f"Added API key from {source} to pool (total: {len(self._keys)})")
            return True

    def remove_key(self, key: str) -> bool:
        """Remove an API key from the pool. Returns True if it was present."""
        with self._lock_pool:
            for i, entry in enumerate(self._keys):
                if entry.key == key:
                    self._keys.pop(i)
                    return True
            return False

    def get_next_key(self) -> Optional[str]:
        """
        Get the next available API key.

        Returns None if all keys are rate-limited.
        """
        with self._lock_pool:
            if not self._keys:
                return None

            # Try to find an available key starting from current index
            total_keys = len(self._keys)
            for i in range(total_keys):
                idx = (self._current_index + i) % total_keys
                if self._keys[idx].is_available():
                    self._current_index = idx
                    self._keys[idx].record_use()
                    return self._keys[idx].key

            return None

    def mark_key_rate_limited(self, key: str, reset_after_seconds: int = 3600):
        """Mark a specific key as rate-limited."""
        with self._lock_pool:
            for entry in self._keys:
                if entry.key == key:
                    entry.mark_rate_limited(reset_after_seconds)
                    break

    def has_available_keys(self) -> bool:
        """Check if any API keys are available."""
        with self._lock_pool:
            return any(k.is_available() for k in self._keys)

    def get_pool_stats(self) -> Dict[str, Any]:
        """Get statistics about the key pool."""
        with self._lock_pool:
            stats = {
                "total_keys": len(self._keys),
                "available": 0,
                "rate_limited": 0,
                "total_requests": 0
            }
            for key in self._keys:
                if key.is_available():
                    stats["available"] += 1
                else:
                    stats["rate_limited"] += 1
                stats["total_requests"] += key.requests_made
            return stats

    def get_all_keys(self) -> List[str]:
        """Get all API keys (masked for display)."""
        with self._lock_pool:
            return [f"{k.key[:8]}...{k.key[-4:]}" if len(k.key) > 12 else "***" for k in self._keys]
211
+
212
+
213
+ # Convenience functions
214
def get_pool() -> ApiKeyPool:
    """Return the process-wide :class:`ApiKeyPool` singleton."""
    singleton = ApiKeyPool.get_instance()
    return singleton
217
+
218
+
219
def add_api_key(key: str):
    """Add an API key to the pool and persist the updated pool to disk."""
    manager = get_pool()
    manager.add_key(key)
    manager.save_config()
224
+
225
+
226
def remove_api_key(key: str):
    """Remove an API key from the pool and persist the updated pool to disk."""
    manager = get_pool()
    manager.remove_key(key)
    manager.save_config()
231
+
232
+
233
def list_api_keys() -> List[str]:
    """List all API keys in masked display form."""
    return get_pool().get_all_keys()
237
+
238
+
239
def get_available_key() -> Optional[str]:
    """Return the next usable API key, or None when every key is rate-limited."""
    pool = get_pool()
    return pool.get_next_key()