File size: 8,265 Bytes
32a841c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
"""
Watermark Remover Module
Removes text watermarks from PDF pages using image processing
Optimized for file size and quality
"""

import io
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import time

try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False


def remove_watermark_from_pdf(
    pdf_bytes: bytes,
    watermark_text: str = "Educated Nepal",
    method: str = "inpaint",
    intensity: int = 50,
    dpi: int = 150,
    jpeg_quality: int = 75,
    max_workers: int = 4
) -> bytes:
    """
    Remove watermark from PDF pages with optimized output size.

    Every page is rasterized, cleaned with the selected image-processing
    method, re-encoded as JPEG and placed on a new page that has the same
    dimensions as the source page. The output is therefore an image-only PDF.

    Args:
        pdf_bytes: Input PDF as bytes
        watermark_text: Text to remove (not used in current methods)
        method: 'inpaint', 'threshold', or 'color'; any other value falls
            back to 'inpaint'
        intensity: 0-100, higher = more aggressive removal
        dpi: Resolution for processing (lower = smaller file, 100-150 recommended)
        jpeg_quality: JPEG quality 10-100 (lower = smaller file, 60-80 recommended)
        max_workers: Parallel processing threads (values below 1 are treated as 1)

    Returns:
        The processed PDF as bytes.

    Raises:
        ImportError: If OpenCV is not installed.
    """
    if not CV2_AVAILABLE:
        raise ImportError("OpenCV not installed. Run: pip install opencv-python-headless")

    start_time = time.time()
    original_size = len(pdf_bytes)

    removers = {
        "inpaint": remove_by_inpainting,
        "threshold": remove_by_threshold,
        "color": remove_by_color,
    }
    remover = removers.get(method, remove_by_inpainting)

    # ThreadPoolExecutor rejects max_workers <= 0; degrade to a single worker.
    workers = max(1, max_workers)
    zoom = fitz.Matrix(dpi / 72, dpi / 72)

    def render_page(page):
        """Rasterize one page at the requested DPI and return it as a BGR array."""
        pix = page.get_pixmap(matrix=zoom)
        # Build the array straight from the raw samples (no PNG round trip).
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
        if pix.n == 4:  # RGBA
            return cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
        if pix.n == 3:  # RGB
            return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        return img

    def clean_and_encode(img):
        """Remove the watermark from a BGR image and return JPEG bytes."""
        result = remover(img, intensity)
        result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
        jpeg_buffer = io.BytesIO()
        Image.fromarray(result_rgb).save(
            jpeg_buffer, format='JPEG', quality=jpeg_quality, optimize=True
        )
        return jpeg_buffer.getvalue()

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        output_doc = fitz.open()
        try:
            page_count = len(doc)
            with ThreadPoolExecutor(max_workers=workers) as executor:
                # PyMuPDF objects must not be used from several threads, so
                # pages are rendered here on the calling thread and only the
                # OpenCV/JPEG stage is handed to the pool. Working in batches
                # of `workers` pages bounds how many raw bitmaps are alive.
                for batch_start in range(0, page_count, workers):
                    batch_end = min(batch_start + workers, page_count)
                    sizes = []
                    images = []
                    for page_index in range(batch_start, batch_end):
                        page = doc[page_index]
                        sizes.append((page.rect.width, page.rect.height))
                        images.append(render_page(page))

                    # executor.map yields results in submission order, so the
                    # output pages stay in document order without sorting.
                    encoded = executor.map(clean_and_encode, images)
                    for (orig_width, orig_height), jpeg_bytes in zip(sizes, encoded):
                        pdf_page = output_doc.new_page(width=orig_width, height=orig_height)
                        rect = fitz.Rect(0, 0, orig_width, orig_height)
                        pdf_page.insert_image(rect, stream=jpeg_bytes)

            # Save with maximum compression
            output_bytes = output_doc.tobytes(deflate=True, garbage=4, clean=True)
        finally:
            output_doc.close()
    finally:
        # Close the source document even when rendering or saving fails.
        doc.close()

    elapsed = time.time() - start_time
    output_size = len(output_bytes)
    ratio = output_size / original_size if original_size > 0 else 1
    print(f"Watermark removal: {page_count} pages in {elapsed:.1f}s, "
          f"{original_size/1024:.0f}KB -> {output_size/1024:.0f}KB ({ratio:.1%})")

    return output_bytes


def remove_by_inpainting(img: np.ndarray, intensity: int) -> np.ndarray:
    """Remove watermark using inpainting - best for handwritten notes.

    Pixels whose gray level lies above an intensity-dependent cutoff and at
    or below 248 are treated as watermark and reconstructed from their
    surroundings with OpenCV's TELEA inpainting.

    Args:
        img: BGR image as produced by OpenCV.
        intensity: 0-100, higher = lower cutoff = more aggressive removal.
            Values outside that range are clamped.

    Returns:
        A new BGR image with the masked pixels inpainted.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Clamp so the cutoff stays inside 220..180. The cutoff is the only thing
    # protecting darker (real) content, so it must not drop below 180.
    level = min(max(intensity, 0), 100)
    thresh_value = 220 - int(level * 0.4)  # Range: 220 to 180

    # Everything brighter than the cutoff is a watermark candidate ...
    _, mask = cv2.threshold(gray, thresh_value, 255, cv2.THRESH_BINARY)

    # ... except pure white (paper) and very light areas.
    mask[gray > 248] = 0

    # NOTE: darker pixels are already excluded by the threshold itself. A
    # separate fixed "gray < 200" exclusion used to sit here, but it overrode
    # the cutoff and made every intensity above 50 behave almost the same.

    if not mask.any():
        # Nothing to repair; skip the comparatively expensive inpaint call.
        return img.copy()

    # Small dilation to cover watermark edges
    kernel = np.ones((2, 2), np.uint8)
    mask = cv2.dilate(mask, kernel, iterations=1)

    # Inpaint - use TELEA for better results
    return cv2.inpaint(img, mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)


def remove_by_threshold(img: np.ndarray, intensity: int) -> np.ndarray:
    """Whiten light-gray pixels - the fast watermark removal method.

    Args:
        img: BGR image.
        intensity: Higher values lower the gray cutoff (230 at 0, 180 at 100),
            so more pixels are whitened.

    Returns:
        A copy of ``img`` in which every pixel whose gray level is strictly
        between the cutoff and 250 has been set to white.
    """
    cutoff = 230 - int(intensity * 0.5)
    luminance = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Watermark candidates: brighter than the cutoff but not (near) pure white.
    is_watermark = np.logical_and(luminance > cutoff, luminance < 250)

    cleaned = img.copy()
    cleaned[is_watermark] = [255, 255, 255]
    return cleaned


def remove_by_color(img: np.ndarray, intensity: int) -> np.ndarray:
    """Whiten pixels that fall inside a light, low-saturation HSV range.

    Args:
        img: BGR image.
        intensity: Widens the targeted range: the minimum value (brightness)
            drops by ``intensity // 2`` and the maximum saturation rises by
            ``intensity // 3``.

    Returns:
        A copy of ``img`` with the selected pixels (plus a one-step 2x2
        dilation of the selection) set to white.
    """
    luminance = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Any hue, low saturation (gray), high value (light).
    range_low = np.array([0, 0, 200 - intensity // 2])
    range_high = np.array([180, 40 + intensity // 3, 250])
    selection = cv2.inRange(hsv_img, range_low, range_high)

    # Keep dark content (< 150) and pure white paper (> 250) untouched.
    selection[(luminance < 150) | (luminance > 250)] = 0

    # Grow the selection slightly so watermark edges are covered too.
    selection = cv2.dilate(selection, np.ones((2, 2), np.uint8), iterations=1)

    cleaned = img.copy()
    cleaned[selection > 0] = [255, 255, 255]
    return cleaned


def preview_single_page(
    pdf_bytes: bytes,
    page_num: int = 0,
    method: str = "inpaint",
    intensity: int = 50,
    dpi: int = 100
) -> tuple[bytes, bytes]:
    """
    Preview watermark removal on a single page.

    Args:
        pdf_bytes: Input PDF as bytes
        page_num: Zero-based page index; an index at or beyond the page count
            falls back to the first page
        method: 'inpaint', 'threshold', or 'color'; any other value falls back
            to 'inpaint', matching remove_watermark_from_pdf so the preview
            shows what the full run would produce
        intensity: 0-100, higher = more aggressive removal
        dpi: Resolution used to render the page

    Returns:
        (original_jpeg, processed_jpeg) for comparison, both encoded at
        JPEG quality 85.

    Raises:
        ImportError: If OpenCV is not installed.
    """
    if not CV2_AVAILABLE:
        raise ImportError("OpenCV not installed")

    def to_jpeg(rgb_array):
        """Encode an RGB array as JPEG bytes at quality 85."""
        buffer = io.BytesIO()
        Image.fromarray(rgb_array).save(buffer, format='JPEG', quality=85)
        return buffer.getvalue()

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        if page_num >= len(doc):
            page_num = 0

        page = doc[page_num]
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat)

        # Convert to numpy
        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

        if pix.n == 4:
            img_bgr = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            img_rgb = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
        else:
            img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            img_rgb = img

        # Original as JPEG
        original_bytes = to_jpeg(img_rgb)

        # Process; unknown methods use inpainting, like the full-document run.
        if method == "threshold":
            result = remove_by_threshold(img_bgr, intensity)
        elif method == "color":
            result = remove_by_color(img_bgr, intensity)
        else:
            result = remove_by_inpainting(img_bgr, intensity)

        # Processed as JPEG
        processed_bytes = to_jpeg(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
    finally:
        # Release the document even if rendering or processing raised.
        doc.close()

    return original_bytes, processed_bytes