# YT-AI-Automation / backend/src/core/vision_client.py
# Last synced by github-actions ("Sync Docker Space", commit 5f3e9f5)
"""Vision AI client for extracting text from images.
Supports:
- Handwritten & low-resolution images (auto-enhanced)
- PDF pages (via PyMuPDF)
- Printed text, equations, diagrams
"""
import os
import io
import base64
from openai import OpenAI
from PIL import Image, ImageEnhance, ImageFilter
import sys
# Add config to path
config_path = os.path.join(os.path.dirname(__file__), '..', '..', 'config')
if config_path not in sys.path:
sys.path.insert(0, config_path)
from config import API_KEY, API_URL
# Vision model β€” document OCR specialist
MODEL_VISION = "nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
# Fallback model if primary unavailable
MODEL_VISION_FALLBACK = "meta/llama-3.2-90b-vision-instruct"
# ─── Image Preprocessing ────────────────────────────────────────────
def preprocess_image(image_path, target_min_width=1500,
                     contrast=1.4, sharpness=1.8, brightness=1.1):
    """
    Preprocess an image for better OCR accuracy on handwritten / low-res content.

    Steps:
        1. Flatten any transparency onto a white background
        2. Upscale if narrower than ``target_min_width``
        3. Boost contrast, sharpness, and brightness (tunable keyword args)
        4. Encode as PNG bytes and base64 them for the API

    Args:
        image_path: Path to the image file.
        target_min_width: Minimum width in pixels; narrower images are upscaled.
        contrast: Contrast enhancement factor (1.0 = no change).
        sharpness: Sharpness enhancement factor (1.0 = no change).
        brightness: Brightness enhancement factor (1.0 = no change).

    Returns:
        (base64_encoded_data, mime_type) tuple. If preprocessing fails for any
        reason, falls back to the raw file bytes with a mime type guessed from
        the file extension.
    """
    try:
        img = Image.open(image_path)
        original_size = img.size
        # Flatten alpha onto white so transparent regions read as blank paper.
        # Converting to RGBA first handles 'RGBA', 'LA' and palette ('P')
        # images uniformly; the previous per-mode paste dropped the alpha
        # mask for 'LA' images.
        if img.mode in ('RGBA', 'P', 'LA'):
            rgba = img.convert('RGBA')
            background = Image.new('RGB', rgba.size, (255, 255, 255))
            background.paste(rgba, mask=rgba.split()[3])
            img = background
        elif img.mode != 'RGB':
            img = img.convert('RGB')
        # 1. Upscale small images (LANCZOS keeps text edges smooth).
        width, height = img.size
        if width < target_min_width:
            scale = target_min_width / width
            new_width = int(width * scale)
            new_height = int(height * scale)
            img = img.resize((new_width, new_height), Image.LANCZOS)
            print(f" πŸ“ Upscaled: {original_size} β†’ {img.size}")
        # 2. Enhance contrast (helps with faded/handwritten text)
        img = ImageEnhance.Contrast(img).enhance(contrast)
        # 3. Enhance sharpness (crisp text edges)
        img = ImageEnhance.Sharpness(img).enhance(sharpness)
        # 4. Gentle brightness lift for dark scans
        img = ImageEnhance.Brightness(img).enhance(brightness)
        # 5. Encode as PNG. PNG is lossless, so the old quality=95 kwarg was
        # silently ignored by Pillow's PNG writer; optimize=True actually
        # shrinks the payload instead.
        buffer = io.BytesIO()
        img.save(buffer, format='PNG', optimize=True)
        encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
        print(f" ✨ Preprocessed: contrast={contrast}x, sharpness={sharpness}x, brightness={brightness}x")
        return encoded, 'image/png'
    except Exception as e:
        print(f" ⚠️ Preprocessing failed, using original: {e}")
        # Fallback: just read the raw file and guess the mime type.
        with open(image_path, 'rb') as f:
            raw_data = base64.b64encode(f.read()).decode('utf-8')
        ext = os.path.splitext(image_path)[1].lower()
        mime_type = {
            '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
            '.png': 'image/png', '.gif': 'image/gif',
            '.webp': 'image/webp'
        }.get(ext, 'image/jpeg')
        return raw_data, mime_type
# ─── OCR Prompt ──────────────────────────────────────────────────────
# System prompt sent with every OCR request. Kept as one module-level constant
# so every extraction path enforces identical verbatim-transcription rules.
# NOTE: this string is runtime behavior (it shapes model output) — edit with care.
OCR_SYSTEM_PROMPT = """You are a high-precision OCR transcription system. Your ONLY task is to read and transcribe ALL visible text from the provided image EXACTLY as it appears β€” word for word, character for character.
ABSOLUTE RULES (violation = failure):
1. TRANSCRIBE VERBATIM β€” copy every word exactly as printed or written
2. NEVER answer, solve, explain, or interpret questions β€” just copy them
3. NEVER summarize, paraphrase, or skip any content
4. Preserve ALL multiple-choice options: (A), (B), (C), (D) β€” copy them all
5. Preserve ALL mathematical equations, formulas, subscripts, superscripts
6. Preserve ALL question numbers, marks allocations like [2], [3], (3x8=24)
7. Preserve ALL section headers: Group A, Group B, Group C
8. Preserve ALL instructions like "Attempt all questions", "Full Marks: 75"
9. For handwritten text, transcribe your best reading β€” mark unclear words with [?]
10. Include EVERY line from top to bottom β€” miss nothing
11. Use markdown: # for titles, ## for sections, ### for subsections
12. Output ONLY the raw transcribed text β€” zero commentary
Think of yourself as a SCANNER that converts images to text. You do not think, interpret, or respond β€” you only copy."""
def _build_user_prompt(user_instructions="", is_handwritten=False):
"""Build the user prompt."""
handwritten_note = ""
if is_handwritten:
handwritten_note = "\nNOTE: This may contain HANDWRITTEN text. Read carefully and transcribe your best interpretation. Mark uncertain words with [?]."
return f"""TRANSCRIBE every single word from this image exactly as written/printed.
CRITICAL: Do NOT answer or solve any questions. Do NOT skip any options (A/B/C/D). Copy ALL text verbatim β€” every question, every option, every mark, every instruction.{handwritten_note}
{f"Document context: {user_instructions}" if user_instructions else ""}
Begin transcription from the very top of the page:"""
# ─── Extraction Functions ────────────────────────────────────────────
def extract_text_from_image(image_path, user_instructions=""):
    """
    Extract text from a single image using vision AI with preprocessing.

    Tries MODEL_VISION first, then MODEL_VISION_FALLBACK. A response of 20
    characters or fewer is treated as a failed extraction and triggers the
    next model. This function never raises; it returns None on total failure.

    Args:
        image_path: Path to image file
        user_instructions: Optional context about the image
    Returns:
        dict with raw_text and metadata, or None on failure
    """
    print("=" * 60)
    print(f"πŸ‘οΈ Extracting text from: {os.path.basename(image_path)}")
    # Preprocess image for better OCR
    print(" πŸ”§ Preprocessing image...")
    image_data, mime_type = preprocess_image(image_path)
    user_prompt = _build_user_prompt(user_instructions)
    # Build the client once — no need to reconstruct it per model attempt.
    # Kept inside try/except to preserve the never-raises contract.
    try:
        client = OpenAI(base_url=API_URL, api_key=API_KEY)
    except Exception as e:
        print(f" ⚠️ Could not create API client: {e}")
        print(" ❌ All models failed for this image")
        return None
    # Try primary model, then fallback
    for model in [MODEL_VISION, MODEL_VISION_FALLBACK]:
        try:
            print(f" πŸ” Using model: {model}")
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": OCR_SYSTEM_PROMPT + "\n\n" + user_prompt
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{image_data}"
                                }
                            }
                        ]
                    }
                ],
                temperature=0.1,  # near-deterministic transcription
                max_tokens=16384
            )
            # content can legitimately be None (e.g. refusal / filtered
            # response) — treat that as empty rather than raising
            # AttributeError on .strip().
            raw_text = (completion.choices[0].message.content or "").strip()
            if len(raw_text) > 20:
                print(f" βœ… Extracted {len(raw_text)} characters")
                print(f" πŸ“ Preview: {raw_text[:80]}...")
                print("=" * 60)
                return {
                    'raw_text': raw_text,
                    'metadata': {
                        'model': model,
                        'image_file': os.path.basename(image_path),
                        'character_count': len(raw_text),
                        'word_count': len(raw_text.split()),
                        'preprocessed': True
                    }
                }
            else:
                print(f" ⚠️ Model returned too little text ({len(raw_text)} chars), trying next model...")
                continue
        except Exception as e:
            print(f" ⚠️ Model {model} failed: {e}")
            # Only dump a full traceback once the last resort has failed too.
            if model == MODEL_VISION_FALLBACK:
                import traceback
                traceback.print_exc()
            continue
    print(" ❌ All models failed for this image")
    return None
def extract_text_from_multiple_images(image_paths, user_instructions=""):
    """Run OCR over a batch of images page by page and merge the results.

    Each page is extracted individually (maximizes per-page accuracy); when
    more than one page is supplied, pages are labeled with '--- Page N ---'
    markers before being joined.

    Args:
        image_paths: List of paths to image files
        user_instructions: Optional context about the document

    Returns:
        dict with combined 'raw_text' and 'metadata', or None when no page
        produced any text.
    """
    print("=" * 60)
    print(f"πŸ‘οΈ Extracting text from {len(image_paths)} images...")
    total = len(image_paths)
    extracted_pages = []
    # Process every page on its own; collect whatever succeeds.
    for i, path in enumerate(image_paths):
        print(f"\nπŸ“„ Processing image {i+1}/{total}...")
        result = extract_text_from_image(path, user_instructions)
        page_text = result.get('raw_text') if result else None
        if not page_text:
            print(f" ⚠️ Page {i+1}: extraction returned empty")
            continue
        if total > 1:
            extracted_pages.append(f"--- Page {i+1} ---\n{page_text}")
        else:
            extracted_pages.append(page_text)
        print(f" βœ… Page {i+1}: {len(page_text)} chars")
    # Guard clause: nothing extracted at all.
    if not extracted_pages:
        print("❌ Failed to extract text from any image")
        print("=" * 60)
        return None
    combined_text = "\n\n".join(extracted_pages)
    print(f"\nβœ… Total extracted: {len(combined_text)} characters from {len(extracted_pages)}/{total} pages")
    print("=" * 60)
    return {
        'raw_text': combined_text,
        'metadata': {
            'model': MODEL_VISION,
            'image_count': total,
            'pages_extracted': len(extracted_pages),
            'character_count': len(combined_text),
            'word_count': len(combined_text.split()),
            'preprocessed': True
        }
    }