Spaces:

shiva0013
/

pdftools

Sleeping

App Files Community

pdftools / watermark_remover.py

Shivakafle038

PDF Tools Web App - compress, convert, watermark removal

32a841c 4 months ago

raw

history blame contribute delete

8.27 kB

	"""
	Watermark Remover Module
	Removes text watermarks from PDF pages using image processing
	Optimized for file size and quality
	"""

	import io
	import fitz # PyMuPDF
	import numpy as np
	from PIL import Image
	from concurrent.futures import ThreadPoolExecutor
	import time

	try:
	import cv2
	CV2_AVAILABLE = True
	except ImportError:
	CV2_AVAILABLE = False


	def remove_watermark_from_pdf(
	    pdf_bytes: bytes,
	    watermark_text: str = "Educated Nepal",
	    method: str = "inpaint",
	    intensity: int = 50,
	    dpi: int = 150,
	    jpeg_quality: int = 75,
	    max_workers: int = 4
	) -> bytes:
	    """
	    Remove a light watermark from every page of a PDF with optimized output size.

	    Each page is rendered to a raster image, cleaned with the selected
	    method, re-encoded as JPEG and placed on a new page with the same
	    dimensions as the original page. The output pages contain only
	    these images.

	    Args:
	        pdf_bytes: Input PDF as bytes
	        watermark_text: Text to remove (not used in current methods)
	        method: 'inpaint', 'threshold', or 'color'; any other value is
	            treated as 'inpaint'
	        intensity: 0-100, higher = more aggressive removal
	        dpi: Resolution for processing (lower = smaller file, 100-150 recommended)
	        jpeg_quality: JPEG quality 10-100 (lower = smaller file, 60-80 recommended)
	        max_workers: Parallel processing threads

	    Returns:
	        The processed PDF as bytes.

	    Raises:
	        ImportError: If OpenCV is not installed.
	    """
	    if not CV2_AVAILABLE:
	        raise ImportError("OpenCV not installed. Run: pip install opencv-python-headless")

	    # Imported here because only this function needs it.
	    import threading

	    start_time = time.time()
	    original_size = len(pdf_bytes)

	    # Unknown method names fall back to inpainting.
	    removers = {
	        "inpaint": remove_by_inpainting,
	        "threshold": remove_by_threshold,
	        "color": remove_by_color,
	    }
	    remover = removers.get(method, remove_by_inpainting)

	    zoom = fitz.Matrix(dpi / 72, dpi / 72)

	    # PyMuPDF does not support concurrent use from several threads, so every
	    # call that touches the shared document is serialized with this lock.
	    # The OpenCV / PIL work happens outside the lock and still runs in parallel.
	    render_lock = threading.Lock()

	    # Context managers close both documents even when a page fails.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc, fitz.open() as output_doc:
	        page_count = len(doc)

	        # Original page sizes (in points), read before any worker starts.
	        page_sizes = [(page.rect.width, page.rect.height) for page in doc]

	        def process_page(page_num):
	            """Render one page, clean it and return it as JPEG bytes."""
	            with render_lock:
	                page = doc[page_num]
	                pix = page.get_pixmap(matrix=zoom)
	                shape = (pix.height, pix.width, pix.n)
	                samples = pix.samples
	                # Release the MuPDF-backed objects while still holding the
	                # lock so that their cleanup is serialized as well.
	                del pix, page

	            # Wrap the raw samples directly (no intermediate PNG encoding).
	            img = np.frombuffer(samples, dtype=np.uint8).reshape(shape)

	            # OpenCV expects BGR channel order.
	            if shape[2] == 4:  # RGBA
	                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
	            elif shape[2] == 3:  # RGB
	                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

	            result = remover(img, intensity)

	            # Back to RGB for PIL, then encode as JPEG.
	            result_rgb = cv2.cvtColor(result, cv2.COLOR_BGR2RGB)
	            jpeg_buffer = io.BytesIO()
	            Image.fromarray(result_rgb).save(
	                jpeg_buffer, format='JPEG', quality=jpeg_quality, optimize=True
	            )
	            return jpeg_buffer.getvalue()

	        # executor.map yields results in input order, so no sorting is needed.
	        with ThreadPoolExecutor(max_workers=max_workers) as executor:
	            page_jpegs = list(executor.map(process_page, range(page_count)))

	        # Rebuild the PDF: one full-page image per original page.
	        for (orig_width, orig_height), jpeg_bytes in zip(page_sizes, page_jpegs):
	            pdf_page = output_doc.new_page(width=orig_width, height=orig_height)
	            rect = fitz.Rect(0, 0, orig_width, orig_height)
	            pdf_page.insert_image(rect, stream=jpeg_bytes)

	        # Save with maximum compression
	        output_bytes = output_doc.tobytes(deflate=True, garbage=4, clean=True)

	    elapsed = time.time() - start_time
	    output_size = len(output_bytes)
	    ratio = output_size / original_size if original_size > 0 else 1
	    print(f"Watermark removal: {page_count} pages in {elapsed:.1f}s, "
	          f"{original_size/1024:.0f}KB -> {output_size/1024:.0f}KB ({ratio:.1%})")

	    return output_bytes


	def remove_by_inpainting(img: np.ndarray, intensity: int) -> np.ndarray:
	    """
	    Remove watermark using inpainting - best for handwritten notes.

	    Pixels whose gray level lies in the light-gray band are treated as
	    watermark and reconstructed from their surroundings.

	    Args:
	        img: Image array that OpenCV can convert with COLOR_BGR2GRAY.
	        intensity: 0 gives a cut-off of 220, 100 gives 180. Pixels darker
	            than 200 are always excluded, so raising intensity past
	            roughly 50 no longer widens the selection.

	    Returns:
	        The inpainted image.
	    """
	    paper_level = 248    # brighter than this counts as blank paper
	    content_level = 200  # darker than this counts as real content

	    luminance = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	    # Higher intensity lowers the cut-off, selecting more pixels.
	    cutoff = 220 - int(intensity * 0.4)
	    watermark_mask = cv2.threshold(luminance, cutoff, 255, cv2.THRESH_BINARY)[1]

	    # Keep only the light-gray band: drop paper-white and dark pixels.
	    outside_band = (luminance > paper_level) | (luminance < content_level)
	    watermark_mask[outside_band] = 0

	    # Grow the mask slightly so watermark edges are covered too.
	    watermark_mask = cv2.dilate(
	        watermark_mask, np.ones((2, 2), np.uint8), iterations=1
	    )

	    return cv2.inpaint(img, watermark_mask, inpaintRadius=3, flags=cv2.INPAINT_TELEA)


	def remove_by_threshold(img: np.ndarray, intensity: int) -> np.ndarray:
	    """
	    Remove watermark by converting light gray to white - fast method.

	    Args:
	        img: Image array that OpenCV can convert with COLOR_BGR2GRAY.
	        intensity: 0 gives a lower bound of 230, 100 gives 180; a lower
	            bound selects more pixels.

	    Returns:
	        A copy of img in which every pixel whose gray level is strictly
	        between the lower bound and 250 is set to white.
	    """
	    luminance = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	    lower_bound = 230 - int(intensity * 0.5)

	    # Light gray only: above the bound, but below near-white paper.
	    is_light_gray = np.logical_and(luminance > lower_bound, luminance < 250)

	    cleaned = img.copy()
	    cleaned[is_light_gray] = (255, 255, 255)
	    return cleaned


	def remove_by_color(img: np.ndarray, intensity: int) -> np.ndarray:
	    """
	    Remove watermark by targeting gray color range.

	    Selects pixels that are weakly saturated and bright in HSV space,
	    excludes dark and near-white pixels, and paints the rest white.

	    Args:
	        img: Image array that OpenCV can convert with COLOR_BGR2HSV.
	        intensity: Higher values lower the minimum brightness and raise
	            the maximum saturation that still counts as watermark.

	    Returns:
	        A copy of img with the selected pixels set to white.
	    """
	    luminance = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

	    # Any hue, low saturation (gray), high value (light).
	    min_value = 200 - intensity // 2
	    max_saturation = 40 + intensity // 3
	    hsv_low = np.array([0, 0, min_value])
	    hsv_high = np.array([180, max_saturation, 250])
	    selection = cv2.inRange(hsv_img, hsv_low, hsv_high)

	    # Protect dark content and pure-white paper.
	    protected = (luminance < 150) | (luminance > 250)
	    selection[protected] = 0

	    # Grow the selection slightly to cover watermark edges.
	    selection = cv2.dilate(selection, np.ones((2, 2), np.uint8), iterations=1)

	    cleaned = img.copy()
	    cleaned[selection != 0] = (255, 255, 255)
	    return cleaned


	def preview_single_page(
	    pdf_bytes: bytes,
	    page_num: int = 0,
	    method: str = "inpaint",
	    intensity: int = 50,
	    dpi: int = 100
	) -> tuple[bytes, bytes]:
	    """
	    Preview watermark removal on a single page.

	    Args:
	        pdf_bytes: Input PDF as bytes
	        page_num: Zero-based page index; values at or beyond the page
	            count fall back to the first page
	        method: 'inpaint', 'threshold', or 'color'; any other value is
	            treated as 'inpaint', matching remove_watermark_from_pdf
	        intensity: 0-100, higher = more aggressive removal
	        dpi: Resolution used to render the preview

	    Returns:
	        (original_jpeg, processed_jpeg) for comparison.

	    Raises:
	        ImportError: If OpenCV is not installed.
	    """
	    if not CV2_AVAILABLE:
	        raise ImportError("OpenCV not installed")

	    def to_jpeg(rgb_array):
	        """Encode an RGB array as JPEG bytes at preview quality."""
	        buffer = io.BytesIO()
	        Image.fromarray(rgb_array).save(buffer, format='JPEG', quality=85)
	        return buffer.getvalue()

	    # The context manager closes the document even if rendering fails.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        if page_num >= len(doc):
	            page_num = 0

	        page = doc[page_num]
	        mat = fitz.Matrix(dpi / 72, dpi / 72)
	        pix = page.get_pixmap(matrix=mat)

	        # Wrap the raw samples as a (height, width, channels) array.
	        channels = pix.n
	        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
	            pix.height, pix.width, channels
	        )

	    if channels == 4:
	        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
	        img_rgb = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
	    else:
	        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
	        img_rgb = img

	    # Original as JPEG
	    original_bytes = to_jpeg(img_rgb)

	    # Use the same fallback as the full conversion so that the preview
	    # shows what remove_watermark_from_pdf would actually produce.
	    if method == "threshold":
	        result = remove_by_threshold(img_bgr, intensity)
	    elif method == "color":
	        result = remove_by_color(img_bgr, intensity)
	    else:
	        result = remove_by_inpainting(img_bgr, intensity)

	    # Processed as JPEG
	    processed_bytes = to_jpeg(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))

	    return original_bytes, processed_bytes