File size: 10,643 Bytes
5f3e9f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
"""Vision AI client for extracting text from images.

Supports:
- Handwritten & low-resolution images (auto-enhanced)
- PDF pages, once rendered to image files elsewhere (e.g. via PyMuPDF)
- Printed text, equations, diagrams
"""
import os
import io
import base64
from openai import OpenAI
from PIL import Image, ImageEnhance, ImageFilter
import sys

# Make the `config` directory that sits two levels above this file's directory
# importable, so the plain `from config import ...` below resolves.
config_path = os.path.join(os.path.dirname(__file__), '..', '..', 'config')
if config_path not in sys.path:
    # Prepend (index 0) so this directory is searched before the rest of sys.path.
    sys.path.insert(0, config_path)
from config import API_KEY, API_URL

# Vision model — document OCR specialist (tried first for every image)
MODEL_VISION = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"

# Fallback model, tried when the primary raises or returns too little text
MODEL_VISION_FALLBACK = "meta/llama-3.2-90b-vision-instruct"


# ─── Image Preprocessing ────────────────────────────────────────────

def preprocess_image(image_path, target_min_width=1500):
    """
    Preprocess image for better OCR accuracy on handwritten / low-res content.

    Steps:
    1. Flatten any transparency onto a white background and convert to RGB
    2. Upscale if too small (< target_min_width)
    3. Enhance contrast for faded/handwritten text
    4. Sharpen to crisp up edges
    5. Slightly brighten dark images
    6. Encode as PNG bytes for the API

    If any step fails, the original file is sent unmodified instead
    (best-effort fallback) and its MIME type is guessed from the extension,
    defaulting to image/jpeg.

    Args:
        image_path: Path to the image file on disk.
        target_min_width: Images narrower than this many pixels are upscaled
            to this width, preserving the aspect ratio.

    Returns: (base64_encoded_data, mime_type)

    Raises:
        OSError: If preprocessing failed and the file cannot be read either.
    """
    try:
        # The context manager releases the underlying file handle. convert()
        # returns a fully loaded copy, so `img` stays usable after the file
        # is closed.
        with Image.open(image_path) as source:
            if source.mode in ('RGBA', 'P', 'LA'):
                # Go through RGBA so the alpha mask is honoured for every
                # transparent mode (palette and grey+alpha included), not
                # only for RGBA; transparent areas become white paper.
                rgba = source.convert('RGBA')
                img = Image.new('RGB', rgba.size, (255, 255, 255))
                img.paste(rgba, mask=rgba.split()[3])
            else:
                img = source.convert('RGB')
        original_size = img.size

        # 2. Upscale small images
        width, height = img.size
        if width < target_min_width:
            scale = target_min_width / width
            # round() instead of int(): float error must not leave the result
            # one pixel short of the target; never collapse a side to 0.
            new_width = max(1, round(width * scale))
            new_height = max(1, round(height * scale))
            img = img.resize((new_width, new_height), Image.LANCZOS)
            print(f"  πŸ“ Upscaled: {original_size} β†’ {img.size}")

        # 3. Enhance contrast (helps with faded/handwritten text)
        img = ImageEnhance.Contrast(img).enhance(1.4)  # 1.4x contrast boost

        # 4. Enhance sharpness (crisp text edges)
        img = ImageEnhance.Sharpness(img).enhance(1.8)  # 1.8x sharpness boost

        # 5. Slight brightness boost for dark images
        img = ImageEnhance.Brightness(img).enhance(1.1)  # Gentle brightness lift

        # 6. Encode as PNG. PNG is lossless, so no JPEG-style `quality`
        #    setting applies here.
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')

        encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
        print(f"  ✨ Preprocessed: contrast=1.4x, sharpness=1.8x, brightness=1.1x")

        return encoded, 'image/png'

    except Exception as e:
        # Deliberately broad: preprocessing is an optimisation, so any failure
        # degrades to sending the untouched file rather than aborting.
        print(f"  ⚠️ Preprocessing failed, using original: {e}")
        # Fallback: just read the raw file
        with open(image_path, 'rb') as f:
            raw_data = base64.b64encode(f.read()).decode('utf-8')
        ext = os.path.splitext(image_path)[1].lower()
        mime_type = {
            '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
            '.png': 'image/png', '.gif': 'image/gif',
            '.webp': 'image/webp'
        }.get(ext, 'image/jpeg')
        return raw_data, mime_type


# ─── OCR Prompt ──────────────────────────────────────────────────────

# Instruction block sent to the vision model with every image. It tells the
# model to act as a verbatim transcriber (never answering or summarising) and
# to emit markdown headings. The separators are real em dashes; the previous
# text carried mis-decoded bytes in their place, which were sent to the model.
OCR_SYSTEM_PROMPT = """You are a high-precision OCR transcription system. Your ONLY task is to read and transcribe ALL visible text from the provided image EXACTLY as it appears — word for word, character for character.

ABSOLUTE RULES (violation = failure):
1. TRANSCRIBE VERBATIM — copy every word exactly as printed or written
2. NEVER answer, solve, explain, or interpret questions — just copy them
3. NEVER summarize, paraphrase, or skip any content
4. Preserve ALL multiple-choice options: (A), (B), (C), (D) — copy them all
5. Preserve ALL mathematical equations, formulas, subscripts, superscripts
6. Preserve ALL question numbers, marks allocations like [2], [3], (3x8=24)
7. Preserve ALL section headers: Group A, Group B, Group C
8. Preserve ALL instructions like "Attempt all questions", "Full Marks: 75"
9. For handwritten text, transcribe your best reading — mark unclear words with [?]
10. Include EVERY line from top to bottom — miss nothing
11. Use markdown: # for titles, ## for sections, ### for subsections
12. Output ONLY the raw transcribed text — zero commentary

Think of yourself as a SCANNER that converts images to text. You do not think, interpret, or respond — you only copy."""


def _build_user_prompt(user_instructions="", is_handwritten=False):
    """Build the per-image user prompt that accompanies the OCR rules.

    Args:
        user_instructions: Optional free-text context about the document.
            When non-empty it is added as a "Document context: ..." paragraph;
            when empty the paragraph is omitted entirely, leaving no blank
            gap in the prompt.
        is_handwritten: When True, append a note telling the model the page
            may be handwritten and to mark uncertain words with [?].

    Returns:
        The prompt text, with paragraphs separated by one blank line.
    """
    critical = (
        "CRITICAL: Do NOT answer or solve any questions. "
        "Do NOT skip any options (A/B/C/D). "
        "Copy ALL text verbatim — every question, every option, "
        "every mark, every instruction."
    )
    if is_handwritten:
        # The note continues the CRITICAL paragraph on its own line.
        critical += (
            "\nNOTE: This may contain HANDWRITTEN text. Read carefully and "
            "transcribe your best interpretation. "
            "Mark uncertain words with [?]."
        )

    paragraphs = [
        "TRANSCRIBE every single word from this image exactly as written/printed.",
        critical,
    ]
    if user_instructions:
        paragraphs.append(f"Document context: {user_instructions}")
    paragraphs.append("Begin transcription from the very top of the page:")

    return "\n\n".join(paragraphs)


# ─── Extraction Functions ────────────────────────────────────────────

def extract_text_from_image(image_path, user_instructions=""):
    """
    Extract text from a single image using vision AI with preprocessing.

    The image is preprocessed once, then sent to MODEL_VISION and, if that
    raises or returns too little text, to MODEL_VISION_FALLBACK.

    Args:
        image_path: Path to image file
        user_instructions: Optional context about the image

    Returns:
        dict with 'raw_text' and 'metadata' (model, image_file,
        character_count, word_count, preprocessed), or None on failure —
        including when the image file cannot be read at all.
    """
    import traceback

    # Responses at or below this length are treated as a failed read.
    min_chars = 20

    print("=" * 60)
    print(f"πŸ‘οΈ Extracting text from: {os.path.basename(image_path)}")

    # Preprocess image for better OCR
    print("  πŸ”§ Preprocessing image...")
    try:
        image_data, mime_type = preprocess_image(image_path)
    except OSError as e:
        # preprocess_image re-reads the raw file as its fallback; if even that
        # fails (missing/unreadable file) honour the "None on failure"
        # contract instead of propagating and aborting a whole batch.
        print(f"  Could not read image {image_path!r}: {e}")
        return None

    user_prompt = _build_user_prompt(user_instructions)

    # The request payload is identical for every model, so build it once.
    # The OCR rules travel inside the user turn rather than a system message.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": OCR_SYSTEM_PROMPT + "\n\n" + user_prompt
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{image_data}"
                    }
                }
            ]
        }
    ]

    # Try primary model, then fallback
    for model in [MODEL_VISION, MODEL_VISION_FALLBACK]:
        try:
            client = OpenAI(base_url=API_URL, api_key=API_KEY)
            print(f"  πŸ” Using model: {model}")

            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.1,
                max_tokens=16384
            )

            # `content` can be None (e.g. an empty/refused reply); treat that
            # as "no text" rather than crashing on .strip().
            raw_text = (completion.choices[0].message.content or "").strip()
        except Exception as e:
            # Broad on purpose: any API/transport error moves on to the next model.
            print(f"  ⚠️ Model {model} failed: {e}")
            if model == MODEL_VISION_FALLBACK:
                # Only the last attempt gets a full traceback, to keep logs short.
                traceback.print_exc()
            continue

        if len(raw_text) > min_chars:
            print(f"  βœ… Extracted {len(raw_text)} characters")
            print(f"  πŸ“ Preview: {raw_text[:80]}...")
            print("=" * 60)

            return {
                'raw_text': raw_text,
                'metadata': {
                    'model': model,
                    'image_file': os.path.basename(image_path),
                    'character_count': len(raw_text),
                    'word_count': len(raw_text.split()),
                    # NOTE: always True; preprocess_image does not report
                    # when it fell back to the raw file.
                    'preprocessed': True
                }
            }

        print(f"  ⚠️ Model returned too little text ({len(raw_text)} chars), trying next model...")

    print("  ❌ All models failed for this image")
    return None


def extract_text_from_multiple_images(image_paths, user_instructions=""):
    """
    Extract text from multiple images — processes each image individually
    for maximum accuracy, then combines results.

    Pages whose extraction fails are skipped. When more than one image is
    given, each page's text is prefixed with a "--- Page N ---" marker (N is
    the 1-based position in image_paths) and pages are joined by blank lines.

    Args:
        image_paths: Iterable of paths to image files
        user_instructions: Optional context about the document

    Returns:
        dict with 'raw_text' (combined text) and 'metadata' (model,
        models_used, image_count, pages_extracted, character_count,
        word_count, preprocessed), or None if no page yielded text.
    """
    # Accept any iterable (generator, tuple, ...); the count is needed up front.
    image_paths = list(image_paths)
    total = len(image_paths)

    print("=" * 60)
    print(f"πŸ‘οΈ Extracting text from {total} images...")

    # Always process page by page for best results
    all_text_parts = []
    models_used = []  # distinct models that actually produced text, in order

    for page_number, path in enumerate(image_paths, start=1):
        print(f"\nπŸ“„ Processing image {page_number}/{total}...")
        result = extract_text_from_image(path, user_instructions)
        page_text = result.get('raw_text') if result else None

        if not page_text:
            print(f"  ⚠️ Page {page_number}: extraction returned empty")
            continue

        if total > 1:
            all_text_parts.append(f"--- Page {page_number} ---\n{page_text}")
        else:
            all_text_parts.append(page_text)
        print(f"  βœ… Page {page_number}: {len(page_text)} chars")

        page_model = (result.get('metadata') or {}).get('model')
        if page_model and page_model not in models_used:
            models_used.append(page_model)

    if not all_text_parts:
        print("❌ Failed to extract text from any image")
        print("=" * 60)
        return None

    combined_text = "\n\n".join(all_text_parts)
    print(f"\nβœ… Total extracted: {len(combined_text)} characters from {len(all_text_parts)}/{total} pages")
    print("=" * 60)

    return {
        'raw_text': combined_text,
        'metadata': {
            # Report what really ran: a single name when every page used the
            # same model, otherwise a comma-separated list. Falls back to the
            # primary model name only if no page reported one.
            'model': ", ".join(models_used) if models_used else MODEL_VISION,
            'models_used': models_used,
            'image_count': total,
            'pages_extracted': len(all_text_parts),
            'character_count': len(combined_text),
            'word_count': len(combined_text.split()),
            'preprocessed': True
        }
    }