Spaces:
Running
Running
| """Vision AI client for extracting text from images. | |
| Supports: | |
| - Handwritten & low-resolution images (auto-enhanced) | |
| - PDF pages (via PyMuPDF) | |
| - Printed text, equations, diagrams | |
| """ | |
| import os | |
| import io | |
| import base64 | |
| from openai import OpenAI | |
| from PIL import Image, ImageEnhance, ImageFilter | |
| import sys | |
| # Add config to path | |
| config_path = os.path.join(os.path.dirname(__file__), '..', '..', 'config') | |
| if config_path not in sys.path: | |
| sys.path.insert(0, config_path) | |
| from config import API_KEY, API_URL | |
| # Vision model β document OCR specialist | |
| MODEL_VISION = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1" | |
| # Fallback model if primary unavailable | |
| MODEL_VISION_FALLBACK = "meta/llama-3.2-90b-vision-instruct" | |
| # βββ Image Preprocessing ββββββββββββββββββββββββββββββββββββββββββββ | |
| def preprocess_image(image_path, target_min_width=1500): | |
| """ | |
| Preprocess image for better OCR accuracy on handwritten / low-res content. | |
| Steps: | |
| 1. Upscale if too small (< target_min_width) | |
| 2. Enhance contrast for faded/handwritten text | |
| 3. Sharpen to crisp up edges | |
| 4. Convert to high-quality PNG bytes for API | |
| Returns: (base64_encoded_data, mime_type) | |
| """ | |
| try: | |
| img = Image.open(image_path) | |
| original_size = img.size | |
| # Convert to RGB if needed (some PNGs have alpha) | |
| if img.mode in ('RGBA', 'P', 'LA'): | |
| background = Image.new('RGB', img.size, (255, 255, 255)) | |
| if img.mode == 'RGBA': | |
| background.paste(img, mask=img.split()[3]) | |
| else: | |
| background.paste(img) | |
| img = background | |
| elif img.mode != 'RGB': | |
| img = img.convert('RGB') | |
| # 1. Upscale small images | |
| width, height = img.size | |
| if width < target_min_width: | |
| scale = target_min_width / width | |
| new_width = int(width * scale) | |
| new_height = int(height * scale) | |
| img = img.resize((new_width, new_height), Image.LANCZOS) | |
| print(f" π Upscaled: {original_size} β {img.size}") | |
| # 2. Enhance contrast (helps with faded/handwritten text) | |
| enhancer = ImageEnhance.Contrast(img) | |
| img = enhancer.enhance(1.4) # 1.4x contrast boost | |
| # 3. Enhance sharpness (crisp text edges) | |
| enhancer = ImageEnhance.Sharpness(img) | |
| img = enhancer.enhance(1.8) # 1.8x sharpness boost | |
| # 4. Slight brightness boost for dark images | |
| enhancer = ImageEnhance.Brightness(img) | |
| img = enhancer.enhance(1.1) # Gentle brightness lift | |
| # 5. Convert to high-quality PNG bytes | |
| buffer = io.BytesIO() | |
| img.save(buffer, format='PNG', quality=95) | |
| buffer.seek(0) | |
| encoded = base64.b64encode(buffer.read()).decode('utf-8') | |
| print(f" β¨ Preprocessed: contrast=1.4x, sharpness=1.8x, brightness=1.1x") | |
| return encoded, 'image/png' | |
| except Exception as e: | |
| print(f" β οΈ Preprocessing failed, using original: {e}") | |
| # Fallback: just read the raw file | |
| with open(image_path, 'rb') as f: | |
| raw_data = base64.b64encode(f.read()).decode('utf-8') | |
| ext = os.path.splitext(image_path)[1].lower() | |
| mime_type = { | |
| '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', | |
| '.png': 'image/png', '.gif': 'image/gif', | |
| '.webp': 'image/webp' | |
| }.get(ext, 'image/jpeg') | |
| return raw_data, mime_type | |
| # βββ OCR Prompt ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| OCR_SYSTEM_PROMPT = """You are a high-precision OCR transcription system. Your ONLY task is to read and transcribe ALL visible text from the provided image EXACTLY as it appears β word for word, character for character. | |
| ABSOLUTE RULES (violation = failure): | |
| 1. TRANSCRIBE VERBATIM β copy every word exactly as printed or written | |
| 2. NEVER answer, solve, explain, or interpret questions β just copy them | |
| 3. NEVER summarize, paraphrase, or skip any content | |
| 4. Preserve ALL multiple-choice options: (A), (B), (C), (D) β copy them all | |
| 5. Preserve ALL mathematical equations, formulas, subscripts, superscripts | |
| 6. Preserve ALL question numbers, marks allocations like [2], [3], (3x8=24) | |
| 7. Preserve ALL section headers: Group A, Group B, Group C | |
| 8. Preserve ALL instructions like "Attempt all questions", "Full Marks: 75" | |
| 9. For handwritten text, transcribe your best reading β mark unclear words with [?] | |
| 10. Include EVERY line from top to bottom β miss nothing | |
| 11. Use markdown: # for titles, ## for sections, ### for subsections | |
| 12. Output ONLY the raw transcribed text β zero commentary | |
| Think of yourself as a SCANNER that converts images to text. You do not think, interpret, or respond β you only copy.""" | |
| def _build_user_prompt(user_instructions="", is_handwritten=False): | |
| """Build the user prompt.""" | |
| handwritten_note = "" | |
| if is_handwritten: | |
| handwritten_note = "\nNOTE: This may contain HANDWRITTEN text. Read carefully and transcribe your best interpretation. Mark uncertain words with [?]." | |
| return f"""TRANSCRIBE every single word from this image exactly as written/printed. | |
| CRITICAL: Do NOT answer or solve any questions. Do NOT skip any options (A/B/C/D). Copy ALL text verbatim β every question, every option, every mark, every instruction.{handwritten_note} | |
| {f"Document context: {user_instructions}" if user_instructions else ""} | |
| Begin transcription from the very top of the page:""" | |
| # βββ Extraction Functions ββββββββββββββββββββββββββββββββββββββββββββ | |
| def extract_text_from_image(image_path, user_instructions=""): | |
| """ | |
| Extract text from a single image using vision AI with preprocessing. | |
| Args: | |
| image_path: Path to image file | |
| user_instructions: Optional context about the image | |
| Returns: | |
| dict with raw_text and metadata, or None on failure | |
| """ | |
| print("=" * 60) | |
| print(f"ποΈ Extracting text from: {os.path.basename(image_path)}") | |
| # Preprocess image for better OCR | |
| print(" π§ Preprocessing image...") | |
| image_data, mime_type = preprocess_image(image_path) | |
| user_prompt = _build_user_prompt(user_instructions) | |
| # Try primary model, then fallback | |
| for model in [MODEL_VISION, MODEL_VISION_FALLBACK]: | |
| try: | |
| client = OpenAI(base_url=API_URL, api_key=API_KEY) | |
| print(f" π Using model: {model}") | |
| completion = client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| { | |
| "role": "user", | |
| "content": [ | |
| { | |
| "type": "text", | |
| "text": OCR_SYSTEM_PROMPT + "\n\n" + user_prompt | |
| }, | |
| { | |
| "type": "image_url", | |
| "image_url": { | |
| "url": f"data:{mime_type};base64,{image_data}" | |
| } | |
| } | |
| ] | |
| } | |
| ], | |
| temperature=0.1, | |
| max_tokens=16384 | |
| ) | |
| raw_text = completion.choices[0].message.content.strip() | |
| if raw_text and len(raw_text) > 20: | |
| print(f" β Extracted {len(raw_text)} characters") | |
| print(f" π Preview: {raw_text[:80]}...") | |
| print("=" * 60) | |
| return { | |
| 'raw_text': raw_text, | |
| 'metadata': { | |
| 'model': model, | |
| 'image_file': os.path.basename(image_path), | |
| 'character_count': len(raw_text), | |
| 'word_count': len(raw_text.split()), | |
| 'preprocessed': True | |
| } | |
| } | |
| else: | |
| print(f" β οΈ Model returned too little text ({len(raw_text)} chars), trying next model...") | |
| continue | |
| except Exception as e: | |
| print(f" β οΈ Model {model} failed: {e}") | |
| if model == MODEL_VISION_FALLBACK: | |
| import traceback | |
| traceback.print_exc() | |
| continue | |
| print(" β All models failed for this image") | |
| return None | |
| def extract_text_from_multiple_images(image_paths, user_instructions=""): | |
| """ | |
| Extract text from multiple images β processes each image individually | |
| for maximum accuracy, then combines results. | |
| Args: | |
| image_paths: List of paths to image files | |
| user_instructions: Optional context about the document | |
| """ | |
| print("=" * 60) | |
| print(f"ποΈ Extracting text from {len(image_paths)} images...") | |
| # Always process page by page for best results | |
| all_text_parts = [] | |
| total = len(image_paths) | |
| for i, path in enumerate(image_paths): | |
| print(f"\nπ Processing image {i+1}/{total}...") | |
| result = extract_text_from_image(path, user_instructions) | |
| if result and result.get('raw_text'): | |
| if total > 1: | |
| all_text_parts.append(f"--- Page {i+1} ---\n{result['raw_text']}") | |
| else: | |
| all_text_parts.append(result['raw_text']) | |
| print(f" β Page {i+1}: {len(result['raw_text'])} chars") | |
| else: | |
| print(f" β οΈ Page {i+1}: extraction returned empty") | |
| if all_text_parts: | |
| combined_text = "\n\n".join(all_text_parts) | |
| print(f"\nβ Total extracted: {len(combined_text)} characters from {len(all_text_parts)}/{total} pages") | |
| print("=" * 60) | |
| return { | |
| 'raw_text': combined_text, | |
| 'metadata': { | |
| 'model': MODEL_VISION, | |
| 'image_count': total, | |
| 'pages_extracted': len(all_text_parts), | |
| 'character_count': len(combined_text), | |
| 'word_count': len(combined_text.split()), | |
| 'preprocessed': True | |
| } | |
| } | |
| print("β Failed to extract text from any image") | |
| print("=" * 60) | |
| return None | |