""" Watermark Remover Module Removes text watermarks from PDF pages using image processing Optimized for file size and quality """ import io import fitz # PyMuPDF import numpy as np from PIL import Image from concurrent.futures import ThreadPoolExecutor import time try: import cv2 CV2_AVAILABLE = True except ImportError: CV2_AVAILABLE = False def remove_watermark_from_pdf( pdf_bytes: bytes, watermark_text: str = "Educated Nepal", method: str = "inpaint", intensity: int = 50, dpi: int = 150, jpeg_quality: int = 75, max_workers: int = 4 ) -> bytes: """ Remove watermark from PDF pages with optimized output size. Uses JPEG compression to keep file size small. Args: pdf_bytes: Input PDF as bytes watermark_text: Text to remove (not used in current methods) method: 'inpaint', 'threshold', or 'color' intensity: 0-100, higher = more aggressive removal dpi: Resolution for processing (lower = smaller file, 100-150 recommended) jpeg_quality: JPEG quality 10-100 (lower = smaller file, 60-80 recommended) max_workers: Parallel processing threads """ if not CV2_AVAILABLE: raise ImportError("OpenCV not installed. Run: pip install opencv-python-headless") start_time = time.time() original_size = len(pdf_bytes) doc = fitz.open(stream=pdf_bytes, filetype="pdf") output_doc = fitz.open() # Get original page sizes to maintain dimensions page_sizes = [(doc[i].rect.width, doc[i].rect.height) for i in range(len(doc))] def process_page(page_num): page = doc[page_num] orig_width, orig_height = page_sizes[page_num] # Render at specified DPI mat = fitz.Matrix(dpi / 72, dpi / 72) pix = page.get_pixmap(matrix=mat) # Convert to numpy array directly (faster than PNG encoding) img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n) # Convert RGB to BGR for OpenCV if needed if pix.n == 4: # RGBA img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR) elif pix.n == 3: # RGB img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Process to remove watermark if method == "inpaint": result = remove_by_inpainting(img, intensity) elif method == "threshold": result = remove_by_threshold(img, intensity) elif method == "color": result = remove_by_color(img, intensity) else: result = remove_by_inpainting(img, intensity) # Convert back to RGB for PIL result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB) # Encode as JPEG with specified quality pil_img = Image.fromarray(result_rgb) jpeg_buffer = io.BytesIO() pil_img.save(jpeg_buffer, format='JPEG', quality=jpeg_quality, optimize=True) jpeg_bytes = jpeg_buffer.getvalue() return page_num, jpeg_bytes, (orig_width, orig_height) # Process pages in parallel with ThreadPoolExecutor(max_workers=max_workers) as executor: results = list(executor.map(lambda i: process_page(i), range(len(doc)))) # Sort by page number results.sort(key=lambda x: x[0]) # Create output PDF with original page sizes for page_num, jpeg_bytes, (orig_width, orig_height) in results: # Create page with original dimensions pdf_page = output_doc.new_page(width=orig_width, height=orig_height) # Insert image to fill the page rect = fitz.Rect(0, 0, orig_width, orig_height) pdf_page.insert_image(rect, stream=jpeg_bytes) # Save with maximum compression output_bytes = output_doc.tobytes(deflate=True, garbage=4, clean=True) page_count = len(doc) doc.close() output_doc.close() elapsed = time.time() - start_time output_size = len(output_bytes) ratio = output_size / original_size if original_size > 0 else 1 print(f"Watermark removal: {page_count} pages in {elapsed:.1f}s, " f"{original_size/1024:.0f}KB -> {output_size/1024:.0f}KB ({ratio:.1%})") return output_bytes def remove_by_inpainting(img: np.ndarray, intensity: int) -> np.ndarray: """Remove watermark using inpainting - best for handwritten notes.""" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # Adaptive threshold based on intensity # Higher intensity = lower threshold = more aggressive thresh_value = 220 - int(intensity * 0.4) # Range: 220 to 180 # Find light gray areas (watermark) _, mask = cv2.threshold(gray, thresh_value, 255, cv2.THRESH_BINARY) # Exclude pure white (paper) and very light areas white_mask = gray > 248 mask[white_mask] = 0 # Also exclude dark areas (actual content) dark_mask = gray < 200 mask[dark_mask] = 0 # Small dilation to cover watermark edges kernel = np.ones((2, 2), np.uint8) mask = cv2.dilate(mask, kernel, iterations=1) # Inpaint - use TELEA for better results result = cv2.inpaint(img, mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA) return result def remove_by_threshold(img: np.ndarray, intensity: int) -> np.ndarray: """Remove watermark by converting light gray to white - fast method.""" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # Threshold: higher intensity = more aggressive thresh_value = 230 - int(intensity * 0.5) # Range: 230 to 180 # Create mask for light gray areas mask = (gray > thresh_value) & (gray < 250) result = img.copy() result[mask] = [255, 255, 255] return result def remove_by_color(img: np.ndarray, intensity: int) -> np.ndarray: """Remove watermark by targeting gray color range.""" hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # Target low saturation (gray) and high value (light) lower_gray = np.array([0, 0, 200 - intensity // 2]) upper_gray = np.array([180, 40 + intensity // 3, 250]) mask = cv2.inRange(hsv, lower_gray, upper_gray) # Don't remove dark content mask[gray < 150] = 0 # Don't remove pure white mask[gray > 250] = 0 kernel = np.ones((2, 2), np.uint8) mask = cv2.dilate(mask, kernel, iterations=1) result = img.copy() result[mask > 0] = [255, 255, 255] return result def preview_single_page( pdf_bytes: bytes, page_num: int = 0, method: str = "inpaint", intensity: int = 50, dpi: int = 100 ) -> tuple[bytes, bytes]: """ Preview watermark removal on a single page. Returns (original_jpeg, processed_jpeg) for comparison. """ if not CV2_AVAILABLE: raise ImportError("OpenCV not installed") doc = fitz.open(stream=pdf_bytes, filetype="pdf") if page_num >= len(doc): page_num = 0 page = doc[page_num] mat = fitz.Matrix(dpi / 72, dpi / 72) pix = page.get_pixmap(matrix=mat) # Convert to numpy img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n) if pix.n == 4: img_bgr = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR) img_rgb = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB) else: img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) img_rgb = img # Original as JPEG pil_original = Image.fromarray(img_rgb) orig_buffer = io.BytesIO() pil_original.save(orig_buffer, format='JPEG', quality=85) original_bytes = orig_buffer.getvalue() # Process if method == "inpaint": result = remove_by_inpainting(img_bgr, intensity) elif method == "threshold": result = remove_by_threshold(img_bgr, intensity) else: result = remove_by_color(img_bgr, intensity) # Processed as JPEG result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB) pil_processed = Image.fromarray(result_rgb) proc_buffer = io.BytesIO() pil_processed.save(proc_buffer, format='JPEG', quality=85) processed_bytes = proc_buffer.getvalue() doc.close() return original_bytes, processed_bytes