"""Local OCR engine: runs PaddleOCR on an image and exports the result to .docx."""
import os
import sys
import cv2
import numpy as np
from paddleocr import PPStructure, PaddleOCR
from paddleocr.ppstructure.recovery.recovery_to_doc import convert_info_docx
from docx import Document
from docx.shared import Pt, Inches, Emu


class LocalOCREngine:
    """Wrap a PaddleOCR engine and turn its output into structured lines and a docx.

    The raw engine output is normalised into a list of region dicts of the form
    ``{'type': ..., 'bbox': [x1, y1, x2, y2], 'res': [...]}`` and then flattened
    into per-region ``lines`` (``{'text': ..., 'bbox': ...}``) with overlapping
    duplicates removed.
    """

    def __init__(self, use_gpu=False, lang='ch'):
        """Create the underlying PaddleOCR engine.

        Args:
            use_gpu: Forwarded to ``PaddleOCR(use_gpu=...)``.
            lang: Forwarded to ``PaddleOCR(lang=...)``.

        Raises:
            Exception: Whatever ``PaddleOCR(...)`` raises is logged and re-raised.
        """
        self.use_gpu = use_gpu
        self.lang = lang

        # Initialize PaddleOCR (Fallback to non-structure engine to fix crash)
        print(f"Initializing PaddleOCR (v4) with lang={lang}...")
        try:
            self.table_engine = PaddleOCR(
                show_log=True,
                use_gpu=use_gpu,
                lang=lang,
                ocr_version='PP-OCRv4',
                use_angle_cls=False,
            )
            print("✓ Initialized PaddleOCR (v4) successfully.")
        except Exception as e:
            print(f"PaddleOCR init failed: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    # ------------------------------------------------------------------
    # Geometry / format helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _polygon_to_bbox(box):
        """Return the axis-aligned ``[x1, y1, x2, y2]`` enclosing polygon points."""
        pts = np.array(box)
        x1, y1 = np.min(pts, axis=0)
        x2, y2 = np.max(pts, axis=0)
        return [float(x1), float(y1), float(x2), float(y2)]

    @staticmethod
    def _extract_text(text_info):
        """Return the text of a ``(text, conf)`` pair, or ``str(text_info)`` otherwise."""
        if isinstance(text_info, (list, tuple)):
            return text_info[0]
        return str(text_info)

    @classmethod
    def _coerce_bbox(cls, box):
        """Convert a 4-point polygon or a flat 4-value box to ``[x1, y1, x2, y2]``.

        Returns ``None`` when ``box`` is not a 4-element sequence.
        """
        if isinstance(box, np.ndarray):
            box = box.tolist()
        if not isinstance(box, (list, tuple)) or len(box) != 4:
            return None
        # Polygon: [[x, y], [x, y], [x, y], [x, y]] (points may be lists or tuples).
        if isinstance(box[0], (list, tuple)):
            return cls._polygon_to_bbox(box)
        # Flat list: [x1, y1, x2, y2].
        return [float(v) for v in box]

    @classmethod
    def _extract_bbox(cls, data):
        """Read a box from the first non-empty of ``bbox``/``text_region``/``poly``.

        Returns ``None`` if ``data`` is not a dict, no key holds a non-empty
        value, or the value cannot be interpreted as a box.
        """
        if not isinstance(data, dict):
            return None
        for key in ('bbox', 'text_region', 'poly'):
            box = data.get(key)
            if box is None:
                continue
            # ndarray truthiness is ambiguous, so convert before testing emptiness.
            if isinstance(box, np.ndarray):
                box = box.tolist()
            if box:
                return cls._coerce_bbox(box)
        return None

    @staticmethod
    def _bbox_iou(box1, box2):
        """Return intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

        Missing boxes, disjoint boxes and boxes whose union area is not
        positive (degenerate zero-area boxes) all yield ``0.0``.
        """
        if not box1 or not box2:
            return 0.0

        x_left = max(box1[0], box2[0])
        y_top = max(box1[1], box2[1])
        x_right = min(box1[2], box2[2])
        y_bottom = min(box1[3], box2[3])
        if x_right < x_left or y_bottom < y_top:
            return 0.0

        intersection_area = (x_right - x_left) * (y_bottom - y_top)
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union_area = area1 + area2 - intersection_area
        # Two touching zero-area boxes would otherwise divide by zero.
        if union_area <= 0:
            return 0.0
        return intersection_area / float(union_area)

    @classmethod
    def _format_lines(cls, lines, debug=False):
        """Convert ``[box, text_info]`` pairs into ``{'text', 'bbox'}`` dicts.

        Entries that are not sequences of at least two elements are skipped.
        When ``debug`` is true the first three converted entries are printed.
        """
        formatted_res = []
        for i, line in enumerate(lines):
            if not isinstance(line, (list, tuple)) or len(line) < 2:
                continue
            text = cls._extract_text(line[1])
            bbox = cls._polygon_to_bbox(line[0])
            formatted_res.append({'text': text, 'bbox': bbox})
            if debug and i < 3:  # Log first 3 for debugging
                print(f"DEBUG: Line {i}: text='{text}', bbox=[{bbox[0]:.1f},{bbox[1]:.1f},{bbox[2]:.1f},{bbox[3]:.1f}]")
        return formatted_res

    @classmethod
    def _to_structure_format(cls, result, w, h):
        """Normalise raw engine output into a list of region dicts.

        Handled shapes:
          * tuple whose first element is a list of ``[box, (text, conf)]`` pairs;
          * tuple of ``(boxes, texts_and_scores, ...)`` with ndarray boxes;
          * plain list of ``[box, (text, conf)]`` pairs;
          * ``None`` (treated as no result).
        Anything else (e.g. a list of region dicts) is returned unchanged.
        Recognised line formats are wrapped in a single full-image text region.
        """
        if result is None:
            return []

        def as_region(formatted_res):
            return [{'type': 'text', 'bbox': [0, 0, w, h], 'res': formatted_res}]

        if isinstance(result, tuple) and len(result) > 0:
            head = result[0]
            head_len = len(head) if hasattr(head, '__len__') else 'N/A'
            print(f"DEBUG: PaddleOCR returned tuple with {len(result)} elements")
            print(f"DEBUG: result[0] type = {type(head)}, length = {head_len}")

            if not (isinstance(head, list) and len(head) > 0):
                print("DEBUG: result[0] is not a list or is empty")
                return []

            first_item = head[0]
            print(f"DEBUG: first_item type = {type(first_item)}")

            if isinstance(first_item, (list, tuple)) and len(first_item) >= 2:
                # Format: [[box, (text, conf)], ...]
                print(f"DEBUG: Found {len(head)} lines with box+text format")
                formatted_res = cls._format_lines(head, debug=True)
                print(f"DEBUG: Converted to {len(formatted_res)} formatted results")
                return as_region(formatted_res)

            if isinstance(first_item, np.ndarray):
                # Format: tuple([box_array, ...], [text_info, ...], [scores, ...])
                # Boxes and texts are in SEPARATE lists - need to zip them!
                print("DEBUG: Boxes and texts are separate - zipping them together")
                texts_and_scores = result[1] if len(result) > 1 else []
                if texts_and_scores is None:
                    texts_and_scores = []
                print(f"DEBUG: Found {len(head)} boxes and {len(texts_and_scores)} text items")
                formatted_res = cls._format_lines(list(zip(head, texts_and_scores)), debug=True)
                print(f"DEBUG: Successfully converted {len(formatted_res)} lines!")
                return as_region(formatted_res)

            print(f"DEBUG: Unexpected first_item type: {type(first_item)}")
            return []

        if isinstance(result, list) and len(result) > 0 and not isinstance(result[0], dict):
            print("DEBUG: Converting list PaddleOCR format to PPStructure format")
            return as_region(cls._format_lines(result))

        return result

    @classmethod
    def _region_to_item(cls, region):
        """Flatten one region dict into ``{'type', 'bbox', 'html', 'lines'}``."""
        region_type = region.get('type', '').lower()
        res = region.get('res', {})
        item = {
            'type': region_type,
            'bbox': region.get('bbox'),  # [x1, y1, x2, y2]
            'html': None,
            'lines': [],
        }

        if region_type == 'table':
            if isinstance(res, dict):
                item['html'] = res.get('html')
                # Table cells/lines
                table_res = res.get('cell', []) or res.get('content', [])
                if isinstance(table_res, list):
                    for cell in table_res:
                        item['lines'].append({
                            'text': cell.get('text', '') if isinstance(cell, dict) else str(cell),
                            'bbox': cls._extract_bbox(cell),
                        })
        elif isinstance(res, list):
            # Regular text region
            for line in res:
                if isinstance(line, dict):
                    item['lines'].append({
                        'text': line.get('text', ''),
                        'bbox': cls._extract_bbox(line),
                    })
                elif isinstance(line, (list, tuple)) and len(line) >= 2:
                    # PaddleOCR format: [bbox, (text, conf)]
                    item['lines'].append({
                        'text': cls._extract_text(line[1]),
                        'bbox': cls._coerce_bbox(line[0]),
                    })
        return item

    @classmethod
    def _dedupe_lines(cls, lines, iou_threshold=0.15):
        """Drop lines whose box overlaps an already-kept line above ``iou_threshold``.

        Lines are visited in order. When a line overlaps a kept one, the kept
        entry survives and takes the longer of the two texts. Lines without a
        bbox are always kept.
        """
        unique_lines = []
        for line in lines:
            if not line.get('bbox'):
                unique_lines.append(line)
                continue

            is_duplicate = False
            for existing in unique_lines:
                if existing.get('bbox') and cls._bbox_iou(line['bbox'], existing['bbox']) > iou_threshold:
                    if len(line['text']) > len(existing['text']):
                        existing['text'] = line['text']  # Keep longer text
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique_lines.append(line)
        return unique_lines

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def custom_convert_to_docx(self, processed_output, save_path, img_width_px, img_height_px):
        """Write a Word document whose paragraphs mimic the image layout.

        Each line with a bbox becomes one paragraph: the left indent encodes
        its x position and ``space_before`` the vertical gap to the previous
        line, both scaled from image pixels to an A4 page.

        Args:
            processed_output: Items with a ``lines`` list of ``{'text', 'bbox'}``.
            save_path: Destination path of the .docx file.
            img_width_px: Source image width in pixels (must be non-zero).
            img_height_px: Source image height in pixels (must be non-zero).

        Returns:
            ``save_path``.
        """
        doc = Document()

        # A4 Page dimensions in EMU (roughly 8.27in x 11.69in); 1 inch = 914400 EMU
        emu_per_inch = 914400
        page_width_emu = int(8.27 * emu_per_inch)
        page_height_emu = int(11.69 * emu_per_inch)
        # Cap on space_before to avoid huge gaps failing docx
        max_space_before_emu = 1000000

        # Scale factor from pixels to EMU
        scale_x = page_width_emu / img_width_px
        scale_y = page_height_emu / img_height_px

        # Collect every positioned line across all regions.
        all_lines = [
            {'text': line['text'], 'bbox': line['bbox']}
            for item in processed_output
            for line in item.get('lines', [])
            if line.get('bbox')
        ]
        # Sort by Y then X to maintain natural reading order.
        all_lines.sort(key=lambda entry: (entry['bbox'][1], entry['bbox'][0]))

        last_y = 0
        for line in all_lines:
            bbox = line['bbox']  # x1, y1, x2, y2
            x_emu = int(bbox[0] * scale_x)
            y_emu = int(bbox[1] * scale_y)

            # Vertical gap from the bottom of the previous line (never negative).
            delta_y = max(0, y_emu - last_y)

            p = doc.add_paragraph()
            p.paragraph_format.left_indent = Emu(x_emu)
            # space_before simulates the vertical position of the line.
            p.paragraph_format.space_before = Emu(min(delta_y, max_space_before_emu))

            run = p.add_run(line['text'])
            run.font.size = Pt(9)
            run.font.name = 'Arial'

            # Update last_y based on the bottom of the current line
            last_y = int(bbox[3] * scale_y)

        doc.save(save_path)
        return save_path

    def process_image(self, img_path_or_array, save_folder="./output", img_name="result"):
        """Run OCR on an image and produce structured lines plus a docx file.

        Args:
            img_path_or_array: Image file path, or an already-loaded image array.
            save_folder: Output directory; created if missing.
            img_name: Base name for outputs. When a path is given it is
                replaced by the file's name without its extension.

        Returns:
            Dict with ``processed_output`` (per-region items), ``raw_result``
            (normalised region list), ``docx_path`` (``None`` if both docx
            strategies failed) and ``metadata`` (width, height, image_name).

        Raises:
            ValueError: If the image could not be loaded.
        """
        os.makedirs(save_folder, exist_ok=True)

        if isinstance(img_path_or_array, str):
            img = cv2.imread(img_path_or_array)
            # splitext keeps dots inside the stem ("scan.v2.png" -> "scan.v2").
            img_name = os.path.splitext(os.path.basename(img_path_or_array))[0]
        else:
            img = img_path_or_array

        if img is None:
            raise ValueError("Image could not be loaded.")

        h, w = img.shape[:2]

        # Run the engine
        result = self.table_engine(img)

        # Compatibility Fix: Convert PaddleOCR format to PPStructure format if needed
        print(f"DEBUG: result type = {type(result)}")
        result = self._to_structure_format(result, w, h)

        # Sort regions by y-coordinate to ensure reading order
        print(f"DEBUG: About to sort. result type = {type(result)}, length = {len(result) if isinstance(result, list) else 'N/A'}")
        if isinstance(result, list) and len(result) > 0:
            print(f"DEBUG: result[0] = {result[0] if isinstance(result[0], dict) else 'NOT DICT'}")

        sorted_res = sorted(result, key=lambda region: (region['bbox'][1], region['bbox'][0]))

        processed_output = []
        for region in sorted_res:
            item = self._region_to_item(region)
            # Filter out overlapping lines (EXTREMELY aggressive threshold)
            item['lines'] = self._dedupe_lines(item['lines'])
            processed_output.append(item)

        # --- Layout Recovery (Absolute Positioning Docx) ---
        docx_path = None
        custom_docx_path = os.path.join(save_folder, f"{img_name}_layout.docx")

        # 1. Try Native PaddleOCR Recovery (Better for Tables/Structure)
        try:
            print(f"Attempting native docx recovery for {img_name}...")
            convert_info_docx(img, result, save_folder, img_name)
            # convert_info_docx is expected to save as "{img_name}_ocr.docx"
            built_in_file = os.path.join(save_folder, f"{img_name}_ocr.docx")
            if os.path.exists(built_in_file):
                docx_path = built_in_file
                print(f"✓ Native recovery successful: {docx_path}")
        except Exception as e:
            print(f"Native recovery failed: {e}")

        # 2. Fallback to Custom Layout (if native failed or didn't produce file)
        if not docx_path:
            try:
                print("Falling back to custom layout engine...")
                self.custom_convert_to_docx(processed_output, custom_docx_path, img_width_px=w, img_height_px=h)
                docx_path = custom_docx_path
            except Exception as e:
                print(f"Custom layout docx generation also failed: {e}")

        # Prepare metadata for frontend scaling
        metadata = {
            'width': w,
            'height': h,
            'image_name': img_name,
        }

        return {
            "processed_output": processed_output,
            "raw_result": result,
            "docx_path": docx_path,
            "metadata": metadata,
        }

    def regenerate_docx_from_result(self, result, img_path_or_array, save_folder="./output", img_name="edited_result"):
        """Rebuild the docx from an existing (possibly text-edited) result.

        Uses ``convert_info_docx`` only; there is no custom-layout fallback here.

        Args:
            result: Region list in the same structure ``process_image`` returns
                as ``raw_result``.
            img_path_or_array: Image file path, or an already-loaded image array.
            save_folder: Output directory; created if missing.
            img_name: Base name of the generated file.

        Returns:
            Path of the generated docx, or ``None`` if generation failed or no
            file was produced.

        Raises:
            ValueError: If the image could not be loaded.
        """
        if isinstance(img_path_or_array, str):
            img = cv2.imread(img_path_or_array)
        else:
            img = img_path_or_array

        if img is None:
            raise ValueError("Image could not be loaded for regeneration.")

        print(f"Regenerating docx for {img_name} with edited data...")

        docx_path = None
        try:
            # Same folder guarantee as process_image; kept inside the try so a
            # failure here is reported the same way as a conversion failure.
            os.makedirs(save_folder, exist_ok=True)
            # result structure MUST retain the original format:
            # [{'type': 'text', 'bbox': [x,y,w,h], 'res': [{'text': '...', 'bbox': [...]}, ...]}]
            convert_info_docx(img, result, save_folder, img_name)

            # convert_info_docx is expected to save as "{img_name}_ocr.docx"
            built_in_file = os.path.join(save_folder, f"{img_name}_ocr.docx")
            if os.path.exists(built_in_file):
                docx_path = built_in_file
                print(f"✓ Regeneration successful: {docx_path}")
        except Exception as e:
            print(f"Regeneration failed: {e}")
            return None

        return docx_path


if __name__ == "__main__":
    # Test initialization
    engine = LocalOCREngine()
    print("Engine Test Complete.")