"""OCR-based readability features. Fallback to heuristic when OCR unavailable.""" import numpy as np from PIL import Image from typing import Dict def _compute_text_heuristic(img: Image.Image) -> Dict[str, float]: """ Fallback heuristic: detect high-contrast regions as proxy for text presence. Returns reasonable defaults when OCR is unavailable. """ gray = np.array(img.convert("L")).astype(np.float32) # Edge density is proxy for text edges = np.abs(np.diff(gray, axis=1, append=gray[:, -1:])) + np.abs(np.diff(gray, axis=0, append=gray[-1:, :])) text_proxy = float(edges.mean()) / (gray.max() + 1e-8) return { "text_coverage": 0.0, "avg_ocr_confidence": 0.0, "word_count": 0, "text_density": 0.0, "avg_text_height_ratio": 0.0, "has_text": False, "text_proxy": text_proxy, } def compute_ocr_features(img: Image.Image, ocr_results: list = None) -> Dict[str, float]: """ Compute OCR-based readability features. Args: img: PIL Image ocr_results: List of (bbox, text, confidence) tuples from EasyOCR. If None, uses heuristic fallback. Returns dict with keys: text_coverage, avg_ocr_confidence, word_count, text_density, avg_text_height_ratio, has_text, text_proxy """ if ocr_results is None: return _compute_text_heuristic(img) total_area = img.width * img.height if not ocr_results: return { "text_coverage": 0.0, "avg_ocr_confidence": 0.0, "word_count": 0, "text_density": 0.0, "avg_text_height_ratio": 0.0, "has_text": False, "text_proxy": 0.0, } text_area = 0.0 total_conf = 0.0 total_height = 0.0 for result in ocr_results: # result[0] is bbox: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] # result[1] is text string # result[2] is confidence bbox = result[0] conf = result[2] # Approximate area from bbox xs = [p[0] for p in bbox] ys = [p[1] for p in bbox] area = (max(xs) - min(xs)) * (max(ys) - min(ys)) text_area += area total_conf += conf total_height += max(ys) - min(ys) num_detections = len(ocr_results) avg_height = total_height / num_detections return { "text_coverage": float(text_area / (total_area + 1e-8)), "avg_ocr_confidence": float(total_conf / num_detections), "word_count": num_detections, "text_density": float(num_detections / (total_area / 10000 + 1e-8)), "avg_text_height_ratio": float(avg_height / (img.height + 1e-8)), "has_text": True, "text_proxy": float(total_conf / num_detections), }