Spaces:

shiva0013
/

pdftools

Sleeping

File size: 8,265 Bytes

32a841c

"""
Watermark Remover Module
Removes text watermarks from PDF pages using image processing
Optimized for file size and quality
"""

import io
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import time

try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False


def remove_watermark_from_pdf(
    pdf_bytes: bytes,
    watermark_text: str = "Educated Nepal",
    method: str = "inpaint",
    intensity: int = 50,
    dpi: int = 150,
    jpeg_quality: int = 75,
    max_workers: int = 4
) -> bytes:
    """
    Remove watermark from PDF pages with optimized output size.

    Every page is rendered to a bitmap at ``dpi``, cleaned with the selected
    filter, re-encoded as JPEG and placed on a new page that has the same
    dimensions as the source page. Each output page therefore contains only
    the processed image.

    Args:
        pdf_bytes: Input PDF as bytes
        watermark_text: Text to remove (not used in current methods)
        method: 'inpaint', 'threshold', or 'color'; any other value falls
            back to 'inpaint'
        intensity: 0-100, higher = more aggressive removal
        dpi: Resolution for processing (lower = smaller file, 100-150 recommended)
        jpeg_quality: JPEG quality 10-100 (lower = smaller file, 60-80 recommended)
        max_workers: Parallel processing threads (values below 1 are treated as 1)

    Returns:
        The rebuilt PDF as bytes.

    Raises:
        ImportError: If OpenCV could not be imported.
    """
    if not CV2_AVAILABLE:
        raise ImportError("OpenCV not installed. Run: pip install opencv-python-headless")

    # Function-scope import: the lock below is the only user of this module.
    import threading

    start_time = time.time()
    original_size = len(pdf_bytes)

    # Resolve the filter once instead of re-testing `method` for every page.
    remove = {
        "inpaint": remove_by_inpainting,
        "threshold": remove_by_threshold,
        "color": remove_by_color,
    }.get(method, remove_by_inpainting)

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        page_count = len(doc)

        # Get original page sizes to maintain dimensions (one page load each)
        page_sizes = []
        for i in range(page_count):
            page_rect = doc[i].rect
            page_sizes.append((page_rect.width, page_rect.height))

        mat = fitz.Matrix(dpi / 72, dpi / 72)

        # PyMuPDF does not support concurrent use from several threads, so
        # every call that touches `doc` is serialized behind this lock. The
        # OpenCV / JPEG work runs outside the lock so pages can still be
        # processed concurrently.
        render_lock = threading.Lock()

        def process_page(page_num):
            with render_lock:
                page = doc[page_num]
                pix = page.get_pixmap(matrix=mat)
                channels = pix.n
                # Convert to numpy array directly (faster than PNG encoding)
                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, channels
                )
                # Drop the MuPDF-backed objects while the lock is still held;
                # `img` keeps its own reference to the sample bytes.
                del pix, page

            # Convert RGB to BGR for OpenCV if needed
            if channels == 4:  # RGBA
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            elif channels == 3:  # RGB
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            # Process to remove watermark
            result = remove(img, intensity)

            # Convert back to RGB for PIL and encode as JPEG
            result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
            jpeg_buffer = io.BytesIO()
            Image.fromarray(result_rgb).save(
                jpeg_buffer, format='JPEG', quality=jpeg_quality, optimize=True
            )
            return jpeg_buffer.getvalue()

        # executor.map yields results in input order, so no re-sorting is needed.
        with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:
            jpeg_pages = list(executor.map(process_page, range(page_count)))
    finally:
        doc.close()

    output_doc = fitz.open()
    try:
        # Create output PDF with original page sizes
        for jpeg_bytes, (orig_width, orig_height) in zip(jpeg_pages, page_sizes):
            pdf_page = output_doc.new_page(width=orig_width, height=orig_height)

            # Insert image to fill the page
            rect = fitz.Rect(0, 0, orig_width, orig_height)
            pdf_page.insert_image(rect, stream=jpeg_bytes)

        # Save with maximum compression
        output_bytes = output_doc.tobytes(deflate=True, garbage=4, clean=True)
    finally:
        output_doc.close()

    elapsed = time.time() - start_time
    output_size = len(output_bytes)
    ratio = output_size / original_size if original_size > 0 else 1
    print(f"Watermark removal: {page_count} pages in {elapsed:.1f}s, "
          f"{original_size/1024:.0f}KB -> {output_size/1024:.0f}KB ({ratio:.1%})")

    return output_bytes


def remove_by_inpainting(img: np.ndarray, intensity: int) -> np.ndarray:
    """Inpaint over light-gray pixels of a BGR image and return the result.

    Pixels brighter than an intensity-dependent cutoff are selected, pure
    white paper and dark content are removed from the selection, and the
    remaining area is filled in from its surroundings.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Higher intensity lowers the cutoff: 220 at intensity 0, 180 at 100.
    cutoff = 220 - int(intensity * 0.4)
    watermark_mask = cv2.threshold(grayscale, cutoff, 255, cv2.THRESH_BINARY)[1]

    # Keep paper (> 248) and actual content (< 200) out of the selection.
    # NOTE(review): because everything below 200 is cleared here, cutoffs
    # under 200 (intensity of about 53 and up) all select the same pixels.
    watermark_mask[(grayscale > 248) | (grayscale < 200)] = 0

    # Grow the selection slightly so watermark edges are covered too.
    watermark_mask = cv2.dilate(
        watermark_mask, np.ones((2, 2), np.uint8), iterations=1
    )

    return cv2.inpaint(
        img, watermark_mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA
    )


def remove_by_threshold(img: np.ndarray, intensity: int) -> np.ndarray:
    """Paint light-gray pixels of a BGR image white and return a new image.

    A pixel is replaced when its gray level lies strictly between an
    intensity-dependent lower bound and 250. The input is not modified.
    """
    luminance = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Higher intensity lowers the bound: 230 at intensity 0, 180 at 100.
    lower_bound = 230 - int(intensity * 0.5)

    cleaned = img.copy()
    cleaned[(luminance < 250) & (luminance > lower_bound)] = [255, 255, 255]
    return cleaned


def remove_by_color(img: np.ndarray, intensity: int) -> np.ndarray:
    """Whiten low-saturation, bright pixels of a BGR image; return a new image.

    Pixels are selected by an HSV range whose bounds widen with intensity.
    Dark content (gray < 150) and near-white paper (gray > 250) are removed
    from the selection before it is dilated and painted white.
    """
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    brightness = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Higher intensity admits darker values and more saturated pixels.
    min_value = 200 - intensity // 2
    max_saturation = 40 + intensity // 3
    selection = cv2.inRange(
        hsv_img,
        np.array([0, 0, min_value]),
        np.array([180, max_saturation, 250]),
    )

    # Protect dark content and pure white in a single pass.
    selection[(brightness < 150) | (brightness > 250)] = 0

    selection = cv2.dilate(selection, np.ones((2, 2), np.uint8), iterations=1)

    cleaned = img.copy()
    cleaned[selection > 0] = [255, 255, 255]
    return cleaned


def preview_single_page(
    pdf_bytes: bytes,
    page_num: int = 0,
    method: str = "inpaint",
    intensity: int = 50,
    dpi: int = 100
) -> tuple[bytes, bytes]:
    """
    Preview watermark removal on a single page.

    Args:
        pdf_bytes: Input PDF as bytes
        page_num: Page index to preview; an index at or past the page count
            falls back to the first page
        method: 'inpaint', 'threshold', or 'color'; any other value falls
            back to 'inpaint', matching remove_watermark_from_pdf
        intensity: Passed through to the selected removal function
        dpi: Resolution used to render the page

    Returns:
        (original_jpeg, processed_jpeg) for comparison, both encoded at
        JPEG quality 85.

    Raises:
        ImportError: If OpenCV could not be imported.
    """
    if not CV2_AVAILABLE:
        raise ImportError("OpenCV not installed")

    def to_jpeg(rgb_array):
        # Encode an RGB array as JPEG bytes at the preview quality.
        buffer = io.BytesIO()
        Image.fromarray(rgb_array).save(buffer, format='JPEG', quality=85)
        return buffer.getvalue()

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        if page_num >= len(doc):
            page_num = 0

        page = doc[page_num]
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat)

        # Convert to numpy
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

        if pix.n == 4:
            img_bgr = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            img_rgb = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
        else:
            img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            img_rgb = img

        # Original as JPEG
        original_bytes = to_jpeg(img_rgb)

        # Process; unknown methods use inpainting so the preview shows what
        # remove_watermark_from_pdf would produce for the same arguments.
        if method == "threshold":
            result = remove_by_threshold(img_bgr, intensity)
        elif method == "color":
            result = remove_by_color(img_bgr, intensity)
        else:
            result = remove_by_inpainting(img_bgr, intensity)

        # Processed as JPEG
        processed_bytes = to_jpeg(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
    finally:
        # Close the document even when rendering or processing fails.
        doc.close()

    return original_bytes, processed_bytes