Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- app.py +79 -0
- examples/chart_good1.png +3 -0
- px_image2pptx/__init__.py +23 -0
- px_image2pptx/assemble.py +470 -0
- px_image2pptx/cli.py +100 -0
- px_image2pptx/inpaint.py +84 -0
- px_image2pptx/ocr.py +105 -0
- px_image2pptx/pipeline.py +136 -0
- px_image2pptx/textmask.py +167 -0
- requirements.txt +9 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
examples/chart_good1.png filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gradio demo for px-image2pptx — deploy on Hugging Face Spaces."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
|
| 8 |
+
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from PIL import Image
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def convert(image_path, lang):
    """Run the image→PPTX pipeline; return (bg preview, pptx path, summary md)."""
    # Deferred import: heavy OCR/inpaint dependencies load on first request only.
    from px_image2pptx import image_to_pptx

    # Convert WebP to PNG if needed (PaddleOCR doesn't support WebP)
    src = Image.open(image_path)
    if src.format == "WEBP" or image_path.lower().endswith(".webp"):
        png_path = image_path.rsplit(".", 1)[0] + ".png"
        src.save(png_path)
        image_path = png_path

    tmpdir = tempfile.mkdtemp()
    out_pptx = os.path.join(tmpdir, "output.pptx")
    work_dir = os.path.join(tmpdir, "work")

    report = image_to_pptx(image_path, out_pptx, lang=lang, work_dir=work_dir)

    # Load the inpainted background for preview (absent if inpainting was skipped).
    bg_path = os.path.join(work_dir, "background.png")
    bg_preview = Image.open(bg_path) if os.path.exists(bg_path) else None

    summary = (
        f"**Text boxes:** {report['text_boxes']} \n"
        f"**OCR regions:** {report['ocr_regions']} \n"
        f"**Slide size:** {report['slide_size']['width_inches']}x"
        f"{report['slide_size']['height_inches']}\" \n"
        f"**Timings:** {report.get('timings', {})}"
    )

    return bg_preview, out_pptx, summary
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Gradio UI: a single-function interface wrapping `convert`.
demo = gr.Interface(
    fn=convert,
    inputs=[
        gr.Image(type="filepath", label="Input image (slide, poster, infographic)"),
        gr.Radio(
            choices=["auto", "en", "ch"],
            value="auto",
            label="OCR language",
            info="auto = Chinese model (handles both Chinese & English)",
        ),
    ],
    outputs=[
        gr.Image(label="Inpainted background (text removed)"),
        gr.File(label="Download .pptx"),
        gr.Markdown(label="Report"),
    ],
    title="px-image2pptx",
    description=(
        "Convert a static image to an editable PowerPoint file. "
        "OCR detects text, classical CV builds a text mask, LAMA inpaints "
        "the background clean, and python-pptx reconstructs editable text boxes.\n\n"
        "For a full browser-based editor, visit [pxGenius.ai](https://pxgenius.ai)."
    ),
    examples=[
        ["examples/chart_good1.png", "auto"],
    ],
    # Each example runs the full OCR + inpaint pipeline — too slow to pre-cache.
    cache_examples=False,
)


if __name__ == "__main__":
    demo.launch()
|
examples/chart_good1.png
ADDED
|
Git LFS Details
|
px_image2pptx/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""px-image2pptx -- Convert static images to editable PowerPoint slides.
|
| 2 |
+
|
| 3 |
+
Pipeline: image → OCR → textmask → mask-clip → inpaint → PPTX assembly.
|
| 4 |
+
|
| 5 |
+
OCR detects text regions. Textmask detects text ink pixels. Mask-clip ANDs
|
| 6 |
+
them so only OCR-confirmed text is masked. LAMA inpaints the masked regions.
|
| 7 |
+
PPTX assembly places editable text boxes over the clean background.
|
| 8 |
+
|
| 9 |
+
Quick start::
|
| 10 |
+
|
| 11 |
+
from px_image2pptx import image_to_pptx
|
| 12 |
+
image_to_pptx("slide.png", "output.pptx")
|
| 13 |
+
|
| 14 |
+
Or from the command line::
|
| 15 |
+
|
| 16 |
+
px-image2pptx slide.png -o output.pptx
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
__version__ = "0.1.0"
|
| 20 |
+
|
| 21 |
+
from px_image2pptx.pipeline import image_to_pptx
|
| 22 |
+
|
| 23 |
+
__all__ = ["image_to_pptx"]
|
px_image2pptx/assemble.py
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PPTX assembly — place text boxes and background onto editable slides.
|
| 2 |
+
|
| 3 |
+
Pure python-pptx assembly: no ML models, no LLM calls. Takes OCR regions,
|
| 4 |
+
background image, and optional tight mask → produces editable .pptx.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
from PIL import Image, ImageFont
|
| 13 |
+
from pptx import Presentation
|
| 14 |
+
from pptx.util import Inches, Pt, Emu
|
| 15 |
+
from pptx.dml.color import RGBColor
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ── Coordinate mapping ────────────────────────────────────────
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def px_to_emu(px: float, px_per_inch: float) -> int:
    """Convert a pixel distance to EMU (914400 EMU per inch)."""
    inches = px / px_per_inch
    return int(inches * 914400)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class SlideMapper:
|
| 27 |
+
"""Maps image pixel coordinates to slide EMU coordinates."""
|
| 28 |
+
|
| 29 |
+
def __init__(self, img_w: int, img_h: int, slide_w_inches: float | None = None):
|
| 30 |
+
self.img_w = img_w
|
| 31 |
+
self.img_h = img_h
|
| 32 |
+
aspect = img_w / img_h
|
| 33 |
+
|
| 34 |
+
if slide_w_inches and slide_w_inches > 0:
|
| 35 |
+
self.slide_w = slide_w_inches
|
| 36 |
+
self.slide_h = slide_w_inches / aspect
|
| 37 |
+
elif aspect > 1.5:
|
| 38 |
+
self.slide_w, self.slide_h = 13.333, 7.5
|
| 39 |
+
elif aspect > 1.2:
|
| 40 |
+
self.slide_w, self.slide_h = 10.0, 7.5
|
| 41 |
+
else:
|
| 42 |
+
self.slide_w = 10.0
|
| 43 |
+
self.slide_h = 10.0 / aspect
|
| 44 |
+
|
| 45 |
+
self.ppi = img_w / self.slide_w
|
| 46 |
+
|
| 47 |
+
def to_emu(self, px: float) -> int:
|
| 48 |
+
return px_to_emu(px, self.ppi)
|
| 49 |
+
|
| 50 |
+
def bbox_to_emu(self, x1, y1, x2, y2):
|
| 51 |
+
return (self.to_emu(x1), self.to_emu(y1),
|
| 52 |
+
self.to_emu(x2 - x1), self.to_emu(y2 - y1))
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ── Font measurement ──────────────────────────────────────────
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def _load_reference_font():
    """Load a system sans-serif font for text width measurement.

    Returns (font, True) on the first candidate that loads,
    or (None, False) when no system font is available.
    """
    # macOS paths first, then the common Linux DejaVu location.
    candidates = (
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Helvetica.ttc",
        "/Library/Fonts/Arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for path in candidates:
        try:
            font = ImageFont.truetype(path, 72)
        except Exception:
            continue
        return font, True
    return None, False
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Cached at import time: the reference font (rendered at 72pt) and an
# availability flag, consumed by estimate_text_width_pt below.
_REF_FONT, _HAS_FONT = _load_reference_font()
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _is_cjk(ch: str) -> bool:
|
| 78 |
+
cp = ord(ch)
|
| 79 |
+
return (0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF or
|
| 80 |
+
0x3000 <= cp <= 0x303F or 0xFF00 <= cp <= 0xFFEF or
|
| 81 |
+
0xF900 <= cp <= 0xFAFF or 0x2E80 <= cp <= 0x2EFF or
|
| 82 |
+
0x31C0 <= cp <= 0x31EF)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def estimate_text_width_pt(text: str, font_pt: float) -> float:
    """Measure rendered text width in points.

    Hybrid: PIL font metrics for Latin runs, a flat 1.0 em per CJK glyph.
    Falls back to a per-character heuristic when no system font loaded.
    """
    if _HAS_FONT:
        total = 0.0
        run: list[str] = []

        def flush_run():
            nonlocal total
            if run:
                # Reference font is rendered at 72pt; scale to font_pt.
                total += _REF_FONT.getlength("".join(run)) * (font_pt / 72.0)
                run.clear()

        for ch in text:
            if _is_cjk(ch):
                flush_run()
                total += font_pt * 1.0  # CJK glyphs occupy ~1 em
            else:
                run.append(ch)
        flush_run()
        return total

    # Fallback heuristic: fixed em fractions per character class.
    def char_w(ch: str) -> float:
        if _is_cjk(ch):
            return font_pt * 1.0
        if ch == " ":
            return font_pt * 0.25
        return font_pt * 0.50

    return sum((char_w(ch) for ch in text), 0.0)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def autoscale_font(
    text: str,
    bbox_w_px: float,
    bbox_h_px: float,
    ppi: float,
    min_pt: int = 8,
    max_pt: int = 72,
) -> int:
    """Auto-scale font size to fill 90-94% of bbox width.

    Starts from the pixel line height, shrinks until the longest line fits
    under the 94% ceiling, then grows toward the 90% floor without crossing
    the ceiling. Both loops are bounded to 40 steps as a safety valve.
    """
    # Initial guess: a point size matching the pixel line height.
    line_h_pt = (bbox_h_px / ppi) * 72
    pt = max(min_pt, min(max_pt, round(line_h_pt)))

    bbox_w_pt = (bbox_w_px / ppi) * 72
    rows = text.split("\n")
    longest = max(rows, key=len) if rows else text

    ceiling = bbox_w_pt * 0.94
    floor = bbox_w_pt * 0.90

    # Shrink until the longest line fits.
    for _ in range(40):
        if pt <= min_pt or estimate_text_width_pt(longest, pt) <= ceiling:
            break
        pt = max(min_pt, pt - 1)

    # Grow to fill, stopping before the next step would overflow the ceiling.
    for _ in range(40):
        if pt >= max_pt or estimate_text_width_pt(longest, pt) >= floor:
            break
        if estimate_text_width_pt(longest, pt + 1) > ceiling:
            break
        pt = min(max_pt, pt + 1)

    return pt
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# ── Text grouping ─────────────────────────────────────────────
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def group_text_lines(
    regions: list[dict],
    y_threshold: float = 0.6,
    x_gap_factor: float = 3.0,
) -> list[list[dict]]:
    """Merge word-level OCR regions into line-level groups.

    Two-pass: cluster by vertical proximity first, then split each cluster
    at large horizontal gaps so left/right columns don't merge.
    """
    if not regions:
        return []

    # Annotate with temporary center-y / height keys (removed at the end).
    for reg in regions:
        box = reg["bbox"]
        reg["_cy"] = (box["y1"] + box["y2"]) / 2
        reg["_h"] = box["y2"] - box["y1"]

    by_cy = sorted(regions, key=lambda reg: reg["_cy"])

    # Pass 1: vertical grouping — a region joins the running row when its
    # center sits within y_threshold row-heights of the row's mean center.
    rows: list[list[dict]] = []
    row = [by_cy[0]]
    for reg in by_cy[1:]:
        mean_cy = sum(item["_cy"] for item in row) / len(row)
        row_h = max(item["_h"] for item in row)
        if abs(reg["_cy"] - mean_cy) < row_h * y_threshold:
            row.append(reg)
        else:
            rows.append(row)
            row = [reg]
    rows.append(row)

    # Pass 2: split each row where the horizontal gap between neighbours
    # exceeds x_gap_factor times the row's median height.
    result: list[list[dict]] = []
    for row in rows:
        row.sort(key=lambda reg: reg["bbox"]["x1"])
        if len(row) <= 1:
            result.append(row)
            continue
        heights = sorted(item["_h"] for item in row)
        median_h = heights[len(heights) // 2]
        gap_limit = median_h * x_gap_factor

        seg = [row[0]]
        for reg in row[1:]:
            if reg["bbox"]["x1"] - seg[-1]["bbox"]["x2"] > gap_limit:
                result.append(seg)
                seg = [reg]
            else:
                seg.append(reg)
        result.append(seg)

    # Strip the temporary keys so callers see their dicts unchanged.
    for reg in regions:
        reg.pop("_cy", None)
        reg.pop("_h", None)

    return result
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def group_bbox(group: list[dict]) -> tuple[int, int, int, int]:
    """Return the union bounding box (x1, y1, x2, y2) of a region group."""
    boxes = [r["bbox"] for r in group]
    return (
        min(b["x1"] for b in boxes),
        min(b["y1"] for b in boxes),
        max(b["x2"] for b in boxes),
        max(b["y2"] for b in boxes),
    )
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def group_to_text(group: list[dict]) -> str:
    """Convert a group of OCR regions to display text.

    Regions are re-clustered into visual rows by vertical center; each row
    is ordered left-to-right, words joined with spaces, rows with newlines.
    """
    if not group:
        return ""

    # Temporary layout keys (removed before returning).
    for reg in group:
        box = reg["bbox"]
        reg["_cy"] = (box["y1"] + box["y2"]) / 2
        reg["_h"] = box["y2"] - box["y1"]

    ordered = sorted(group, key=lambda reg: reg["_cy"])
    rows: list[list[dict]] = []
    row = [ordered[0]]
    for reg in ordered[1:]:
        mean_cy = sum(item["_cy"] for item in row) / len(row)
        row_h = max(item["_h"] for item in row)
        if abs(reg["_cy"] - mean_cy) < row_h * 0.6:
            row.append(reg)
        else:
            rows.append(row)
            row = [reg]
    rows.append(row)

    rendered = []
    for row in rows:
        row.sort(key=lambda reg: reg["bbox"]["x1"])
        rendered.append(" ".join(reg["text"] for reg in row))

    for reg in group:
        reg.pop("_cy", None)
        reg.pop("_h", None)

    return "\n".join(rendered)
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
# ── Text color detection ──────────────────────────────────────
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _local_bg_color(crop: np.ndarray, border: int = 2) -> np.ndarray:
|
| 265 |
+
h, w = crop.shape[:2]
|
| 266 |
+
if h < border * 2 + 1 or w < border * 2 + 1:
|
| 267 |
+
return np.median(crop.reshape(-1, 3), axis=0)
|
| 268 |
+
pixels = np.concatenate([
|
| 269 |
+
crop[:border, :].reshape(-1, 3),
|
| 270 |
+
crop[-border:, :].reshape(-1, 3),
|
| 271 |
+
crop[border:-border, :border].reshape(-1, 3),
|
| 272 |
+
crop[border:-border, -border:].reshape(-1, 3),
|
| 273 |
+
])
|
| 274 |
+
return np.median(pixels, axis=0)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def detect_text_color(
    img_rgb: np.ndarray,
    tight_mask: np.ndarray,
    x1: int, y1: int, x2: int, y2: int,
    default: tuple[int, int, int] = (0x33, 0x33, 0x33),
    min_contrast: float = 40,
) -> tuple[int, int, int]:
    """Detect text color from original image using tight mask.

    Strategy 1: median of tight-mask ink pixels (dark text on light bg).
    Strategy 2: if that color ≈ background, sample pixels most different
    from the background (handles white text on dark bg where the textmask
    misses the text). Falls back to *default* when both strategies fail.
    """
    h, w = img_rgb.shape[:2]
    # Clamp the bbox to the image bounds.
    bx1, by1 = max(0, int(x1)), max(0, int(y1))
    bx2, by2 = min(w, int(x2)), min(h, int(y2))
    if bx2 <= bx1 or by2 <= by1:
        return default

    crop = img_rgb[by1:by2, bx1:bx2]
    mask_crop = tight_mask[by1:by2, bx1:bx2]
    bg = _local_bg_color(crop)

    # Strategy 1: median of pixels the tight mask marks as ink.
    ink = crop[mask_crop > 128]
    if len(ink) >= 3:
        med = np.median(ink, axis=0)
        contrast = float(((med - bg.astype(float)) ** 2).sum() ** 0.5)
        if contrast >= min_contrast:
            rgb = med.astype(int)
            return (int(rgb[0]), int(rgb[1]), int(rgb[2]))

    # Strategy 2: the ~20% of pixels farthest from the background color.
    flat = crop.reshape(-1, 3).astype(float)
    dists = np.sqrt(((flat - bg.astype(float)) ** 2).sum(axis=1))
    cutoff = np.percentile(dists, 80)
    far = flat[dists >= max(cutoff, min_contrast * 0.5)]
    if len(far) >= 3:
        med = np.median(far, axis=0).astype(int)
        return (int(med[0]), int(med[1]), int(med[2]))

    return default
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
# ── Background detection ────────────��─────────────────────────
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def detect_bg_color(image_path: str, border_px: int = 20) -> tuple[int, ...] | None:
    """Sample border pixels to detect a solid background color.

    Returns (r, g, b) when the border ring has low variance, else None
    (meaning the border looks like a gradient/photo, not a flat fill).
    """
    img = np.array(Image.open(image_path).convert("RGB"))
    h, w = img.shape[:2]
    strips = (
        img[:border_px, :],
        img[-border_px:, :],
        img[border_px:-border_px, :border_px],
        img[border_px:-border_px, -border_px:],
    )
    ring = np.concatenate([s.reshape(-1, 3) for s in strips])
    if ring.std(axis=0).mean() >= 25:
        return None
    return tuple(np.median(ring, axis=0).astype(int))
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
# ── PPTX assembly ─────────────────────────────────────────────
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def assemble_pptx(
    image_path: str,
    ocr_regions: list[dict],
    output_path: str,
    background_path: str | None = None,
    tight_mask: np.ndarray | None = None,
    min_font: int = 8,
    max_font: int = 72,
    slide_w_inches: float | None = None,
) -> dict:
    """Assemble an editable PPTX from OCR regions and background.

    Args:
        image_path: Original input image.
        ocr_regions: List of OCR region dicts with bbox and text.
        output_path: Where to save the .pptx file.
        background_path: Inpainted background image (or None for solid bg).
        tight_mask: Pre-dilation text mask for color detection (H, W), uint8.
        min_font: Minimum font size in points.
        max_font: Maximum font size in points.
        slide_w_inches: Override slide width (auto-detected from aspect ratio).

    Returns:
        Report dict with assembly statistics.
    """
    img = Image.open(image_path)
    img_w, img_h = img.size
    # Pixel↔EMU mapping; slide size auto-picked from aspect unless overridden.
    mapper = SlideMapper(img_w, img_h, slide_w_inches)

    # Create presentation
    prs = Presentation()
    prs.slide_width = Inches(mapper.slide_w)
    prs.slide_height = Inches(mapper.slide_h)
    slide = prs.slides.add_slide(prs.slide_layouts[6])  # Blank

    # Background: prefer the inpainted image; otherwise fall back to a solid
    # fill (when the border looks flat) or the original image as-is.
    if background_path and Path(background_path).exists():
        slide.shapes.add_picture(
            background_path, Emu(0), Emu(0),
            Inches(mapper.slide_w), Inches(mapper.slide_h),
        )
        bg_mode = "inpainted"
    else:
        bg_color = detect_bg_color(image_path)
        if bg_color:
            fill = slide.background.fill
            fill.solid()
            fill.fore_color.rgb = RGBColor(*bg_color)
            bg_mode = f"solid rgb{bg_color}"
        else:
            slide.shapes.add_picture(
                image_path, Emu(0), Emu(0),
                Inches(mapper.slide_w), Inches(mapper.slide_h),
            )
            bg_mode = "original"

    # Load image array for color detection (only needed when a tight mask
    # is available to pick ink pixels from).
    img_rgb = None
    if tight_mask is not None:
        img_rgb = np.array(Image.open(image_path).convert("RGB"))

    # Group OCR regions into lines
    text_groups = group_text_lines(ocr_regions)

    # Add text boxes: one textbox per grouped line/segment.
    count = 0
    for group in text_groups:
        x1, y1, x2, y2 = group_bbox(group)
        text = group_to_text(group)
        if not text.strip():
            continue

        left, top, width, height = mapper.bbox_to_emu(x1, y1, x2, y2)
        # Expand the box by a 2px pad on every side so glyphs don't clip.
        pad = mapper.to_emu(2)
        left = max(0, left - pad)
        top = max(0, top - pad)
        width += pad * 2
        height += pad * 2

        txBox = slide.shapes.add_textbox(left, top, width, height)
        tf = txBox.text_frame
        tf.word_wrap = True
        # Zero internal margins: the bbox already matches the pixel extents.
        tf.margin_left = tf.margin_right = tf.margin_top = tf.margin_bottom = Emu(0)

        # Font size: scale from the median region height within the group.
        bbox_w = x2 - x1
        region_heights = [r["bbox"]["y2"] - r["bbox"]["y1"] for r in group]
        line_h = sorted(region_heights)[len(region_heights) // 2]
        font_size = autoscale_font(text, bbox_w, line_h, mapper.ppi, min_font, max_font)

        # Font color: sample from the original image when a tight mask
        # exists; otherwise a neutral dark gray (#333333).
        if img_rgb is not None and tight_mask is not None:
            r, g, b = detect_text_color(img_rgb, tight_mask, x1, y1, x2, y2)
        else:
            r, g, b = 0x33, 0x33, 0x33
        color = RGBColor(r, g, b)

        # One paragraph per visual line, all sharing size and color.
        lines = text.split("\n")
        p = tf.paragraphs[0]
        p.text = lines[0]
        p.font.size = Pt(font_size)
        p.font.color.rgb = color
        for line in lines[1:]:
            p = tf.add_paragraph()
            p.text = line
            p.font.size = Pt(font_size)
            p.font.color.rgb = color

        count += 1

    prs.save(output_path)

    return {
        "image_size": {"width": img_w, "height": img_h},
        "slide_size": {
            "width_inches": round(mapper.slide_w, 2),
            "height_inches": round(mapper.slide_h, 2),
        },
        "ppi": round(mapper.ppi, 1),
        "background": bg_mode,
        "text_boxes": count,
        "ocr_regions": len(ocr_regions),
    }
|
px_image2pptx/cli.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Command-line interface for px-image2pptx.
|
| 2 |
+
|
| 3 |
+
Usage::
|
| 4 |
+
|
| 5 |
+
# Full pipeline (OCR + textmask + inpaint + PPTX)
|
| 6 |
+
px-image2pptx slide.png -o output.pptx
|
| 7 |
+
|
| 8 |
+
# With pre-computed OCR
|
| 9 |
+
px-image2pptx slide.png -o output.pptx --ocr-json text_regions.json
|
| 10 |
+
|
| 11 |
+
# Skip inpainting (solid background or use original)
|
| 12 |
+
px-image2pptx slide.png -o output.pptx --skip-inpaint
|
| 13 |
+
|
| 14 |
+
# Chinese slide
|
| 15 |
+
px-image2pptx slide.png -o output.pptx --lang ch
|
| 16 |
+
|
| 17 |
+
# Keep intermediate files
|
| 18 |
+
px-image2pptx slide.png -o output.pptx --work-dir ./debug/
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import argparse
|
| 24 |
+
import sys
|
| 25 |
+
import time
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _parse_args(argv=None):
|
| 29 |
+
parser = argparse.ArgumentParser(
|
| 30 |
+
prog="px-image2pptx",
|
| 31 |
+
description="Convert static images to editable PowerPoint slides.",
|
| 32 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 33 |
+
epilog="""\
|
| 34 |
+
examples:
|
| 35 |
+
px-image2pptx slide.png -o output.pptx
|
| 36 |
+
px-image2pptx slide.png -o output.pptx --lang ch
|
| 37 |
+
px-image2pptx slide.png -o output.pptx --skip-inpaint
|
| 38 |
+
px-image2pptx slide.png -o output.pptx --ocr-json ocr.json
|
| 39 |
+
px-image2pptx slide.png -o output.pptx --work-dir ./debug/
|
| 40 |
+
""",
|
| 41 |
+
)
|
| 42 |
+
parser.add_argument("image", help="Input image (PNG/JPG/WebP)")
|
| 43 |
+
parser.add_argument("-o", "--output", default="output.pptx",
|
| 44 |
+
help="Output PPTX path (default: output.pptx)")
|
| 45 |
+
parser.add_argument("--ocr-json", default=None,
|
| 46 |
+
help="Pre-computed OCR JSON (skips OCR step)")
|
| 47 |
+
parser.add_argument("--lang", default="auto", choices=["auto", "en", "ch"],
|
| 48 |
+
help="OCR language (default: auto-detect)")
|
| 49 |
+
parser.add_argument("--sensitivity", type=float, default=16,
|
| 50 |
+
help="Textmask sensitivity (default: 16)")
|
| 51 |
+
parser.add_argument("--dilation", type=int, default=12,
|
| 52 |
+
help="Textmask dilation pixels (default: 12)")
|
| 53 |
+
parser.add_argument("--min-font", type=int, default=8,
|
| 54 |
+
help="Minimum font size in points (default: 8)")
|
| 55 |
+
parser.add_argument("--max-font", type=int, default=72,
|
| 56 |
+
help="Maximum font size in points (default: 72)")
|
| 57 |
+
parser.add_argument("--skip-inpaint", action="store_true",
|
| 58 |
+
help="Skip LAMA inpainting (use original or solid bg)")
|
| 59 |
+
parser.add_argument("--work-dir", default=None,
|
| 60 |
+
help="Directory for intermediate files")
|
| 61 |
+
return parser.parse_args(argv)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def main(argv=None):
    """CLI entry point: run the pipeline and print a summary report."""
    args = _parse_args(argv)

    # Deferred import keeps `--help` fast and avoids loading heavy deps.
    from px_image2pptx.pipeline import image_to_pptx

    start = time.time()
    report = image_to_pptx(
        image_path=args.image,
        output_path=args.output,
        ocr_json=args.ocr_json,
        lang=args.lang,
        sensitivity=args.sensitivity,
        dilation=args.dilation,
        min_font=args.min_font,
        max_font=args.max_font,
        skip_inpaint=args.skip_inpaint,
        work_dir=args.work_dir,
    )
    elapsed = time.time() - start

    print(f"Saved: {args.output}")
    print(f"  Text boxes: {report['text_boxes']}")
    print(f"  OCR regions: {report['ocr_regions']}")
    print(f"  Background: {report['background']}")
    print(f"  Slide: {report['slide_size']['width_inches']}x"
          f"{report['slide_size']['height_inches']}\"")
    # Append per-stage timings on the same line when available.
    print(f"  Time: {elapsed:.1f}s", end="")
    if "timings" in report:
        stages = ", ".join(f"{k}={v}s" for k, v in report["timings"].items())
        print(f" ({stages})")
    else:
        print()
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Allow running as a plain script: `python -m px_image2pptx.cli ...`.
if __name__ == "__main__":
    main()
|
px_image2pptx/inpaint.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LAMA neural inpainting — reconstruct masked regions.
|
| 2 |
+
|
| 3 |
+
Requires the optional ``inpaint`` extra: ``pip install px-image2pptx[inpaint]``.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
from PIL import Image
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _ensure_lama():
    """Import LAMA dependencies, raising a helpful error if not installed."""
    try:
        import torch
        from simple_lama_inpainting.models.model import (
            download_model, LAMA_MODEL_URL, prepare_img_and_mask,
        )
    except ImportError:
        # `from None` hides the noisy import traceback — the message is enough.
        raise ImportError(
            "LAMA inpainting requires PyTorch and simple-lama-inpainting.\n"
            "Install with:\n pip install px-image2pptx[inpaint]"
        ) from None
    return torch, download_model, LAMA_MODEL_URL, prepare_img_and_mask
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def inpaint(
    image: np.ndarray,
    mask: np.ndarray,
) -> np.ndarray:
    """Reconstruct the masked regions of *image* with the LAMA model.

    Args:
        image: RGB uint8 array of shape (H, W, 3).
        mask: Grayscale uint8 array of shape (H, W); 255 marks pixels to fill.

    Returns:
        RGB uint8 array of shape (H, W, 3) with masked areas inpainted.
    """
    torch, download_model, LAMA_MODEL_URL, prepare_img_and_mask = _ensure_lama()

    # Prefer Apple's Metal backend, then CUDA, then fall back to CPU.
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    weights_path = download_model(LAMA_MODEL_URL)
    lama = torch.jit.load(weights_path, map_location=device)
    lama.eval()
    lama.to(device)

    img_tensor, mask_tensor = prepare_img_and_mask(
        Image.fromarray(image), Image.fromarray(mask), device,
    )

    with torch.inference_mode():
        out = lama(img_tensor, mask_tensor)
        raw = out[0].permute(1, 2, 0).detach().cpu().numpy()
        result = np.clip(raw * 255, 0, 255).astype(np.uint8)

    return result
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def inpaint_file(
    image_path: str,
    mask_path: str,
    output_path: str,
) -> str:
    """Inpaint an image file with a mask file and save the result.

    Args:
        image_path: Path to the RGB input image.
        mask_path: Path to the grayscale mask image (255 = inpaint).
        output_path: Destination path for the inpainted image.

    Returns:
        The output path.

    Raises:
        FileNotFoundError: If either input file is missing or unreadable.
    """
    import cv2

    # cv2.imread silently returns None on failure; fail loudly here instead
    # of letting cvtColor raise a cryptic assertion later.
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        raise FileNotFoundError(f"Cannot read mask: {mask_path}")

    image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    result = inpaint(image, mask)

    result_bgr = cv2.cvtColor(result, cv2.COLOR_RGB2BGR)
    cv2.imwrite(output_path, result_bgr)
    return output_path
|
px_image2pptx/ocr.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OCR text detection using PaddleOCR.
|
| 2 |
+
|
| 3 |
+
Detects text regions with bounding boxes, text content, and confidence scores.
|
| 4 |
+
Requires the optional ``ocr`` extra: ``pip install px-image2pptx[ocr]``.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
import cv2
|
| 14 |
+
import numpy as np
|
| 15 |
+
from PIL import Image, ImageDraw
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _ensure_paddleocr():
|
| 19 |
+
"""Import PaddleOCR, raising a helpful error if not installed."""
|
| 20 |
+
try:
|
| 21 |
+
from paddleocr import PaddleOCR
|
| 22 |
+
return PaddleOCR
|
| 23 |
+
except ImportError:
|
| 24 |
+
raise ImportError(
|
| 25 |
+
"PaddleOCR is required for OCR. Install with:\n"
|
| 26 |
+
" pip install px-image2pptx[ocr]"
|
| 27 |
+
) from None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def run_ocr(image_path: str | Path, lang: str = "ch") -> list[dict]:
    """Run PaddleOCR on an image and return structured text regions.

    Args:
        image_path: Path to the input image.
        lang: OCR language (default "ch"). Use "en" for English only.

    Returns:
        One dict per detected line, each with:
        - id: sequential int
        - text: recognized string
        - confidence: recognition score rounded to 4 decimals
        - bbox: axis-aligned box {"x1", "y1", "x2", "y2"} derived from the
          detection polygon
    """
    import os
    # Skip PaddleX's online model-source check (offline-friendly).
    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"

    PaddleOCR = _ensure_paddleocr()

    engine = PaddleOCR(
        lang=lang,
        use_textline_orientation=False,
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
    )

    regions: list[dict] = []
    for page in engine.predict(str(image_path)):
        detections = zip(
            page.get("dt_polys", []),
            page.get("rec_texts", []),
            page.get("rec_scores", []),
        )
        for poly, text, score in detections:
            # Collapse the detection polygon to its axis-aligned bounds.
            xs = [pt[0] for pt in poly]
            ys = [pt[1] for pt in poly]
            regions.append({
                "id": len(regions),
                "text": text,
                "confidence": round(float(score), 4),
                "bbox": {
                    "x1": int(min(xs)),
                    "y1": int(min(ys)),
                    "x2": int(max(xs)),
                    "y2": int(max(ys)),
                },
            })

    return regions
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def save_ocr_json(regions: list[dict], path: str | Path) -> None:
|
| 83 |
+
"""Save OCR regions to JSON file."""
|
| 84 |
+
with open(path, "w") as f:
|
| 85 |
+
json.dump({"text_regions": regions}, f, indent=2, ensure_ascii=False)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def load_ocr_json(path: str | Path) -> list[dict]:
|
| 89 |
+
"""Load OCR regions from JSON file."""
|
| 90 |
+
with open(path) as f:
|
| 91 |
+
data = json.load(f)
|
| 92 |
+
return data.get("text_regions", [])
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def draw_ocr_overlay(image_path: str | Path, regions: list[dict]) -> Image.Image:
    """Render OCR bounding boxes onto the image for visual inspection.

    Each region is drawn twice: a solid red outline plus a translucent
    red fill, so overlapping boxes stay distinguishable.
    """
    canvas = Image.open(image_path).convert("RGB")
    painter = ImageDraw.Draw(canvas, "RGBA")
    for region in regions:
        box = region["bbox"]
        rect = [box["x1"], box["y1"], box["x2"], box["y2"]]
        painter.rectangle(rect, outline=(255, 50, 50), width=3)
        painter.rectangle(rect, fill=(255, 50, 50, 40))
    return canvas
|
px_image2pptx/pipeline.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""End-to-end pipeline: image → editable PPTX.
|
| 2 |
+
|
| 3 |
+
Orchestrates: OCR → textmask → mask-clip → inpaint → PPTX assembly.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import time
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import cv2
|
| 13 |
+
import numpy as np
|
| 14 |
+
from PIL import Image
|
| 15 |
+
|
| 16 |
+
from px_image2pptx.assemble import assemble_pptx
|
| 17 |
+
from px_image2pptx.textmask import compute_masks
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def image_to_pptx(
    image_path: str | Path,
    output_path: str | Path = "output.pptx",
    *,
    ocr_json: str | Path | None = None,
    lang: str = "auto",
    sensitivity: float = 16,
    dilation: int = 12,
    mask_padding: int = 15,
    min_font: int = 8,
    max_font: int = 72,
    skip_inpaint: bool = False,
    work_dir: str | Path | None = None,
) -> dict:
    """Convert a static image to an editable PPTX.

    Pipeline: OCR -> text ink mask -> clip mask to OCR boxes -> dilate ->
    LAMA inpaint (optional) -> PPTX assembly.

    Args:
        image_path: Input image (PNG/JPG/WebP).
        output_path: Where to save the .pptx file.
        ocr_json: Pre-computed OCR JSON (skips the OCR step if provided).
        lang: OCR language ("en", "ch", or "auto" to detect).
        sensitivity: Textmask sensitivity (lower = more aggressive).
        dilation: Textmask dilation in pixels.
        mask_padding: Padding around OCR bboxes for mask clipping.
        min_font: Minimum font size in points.
        max_font: Maximum font size in points.
        skip_inpaint: If True, skip inpainting (use original as background).
        work_dir: Directory for intermediate files; intermediates are only
            written when this is given.

    Returns:
        Report dict with pipeline statistics, including per-step "timings".

    Raises:
        FileNotFoundError: If the input image cannot be read.
    """
    image_path = str(image_path)
    output_path = str(output_path)
    timings: dict[str, float] = {}

    # Intermediates are only written when the caller asks for a work dir.
    save_intermediates = work_dir is not None
    if save_intermediates:
        wdir = Path(work_dir)
        wdir.mkdir(parents=True, exist_ok=True)

    # Step 1: OCR (or load pre-computed regions)
    t0 = time.time()
    if ocr_json:
        from px_image2pptx.ocr import load_ocr_json
        ocr_regions = load_ocr_json(ocr_json)
    else:
        from px_image2pptx.ocr import run_ocr, save_ocr_json

        # "ch" model handles both Chinese and English, so use it as default
        ocr_lang = "ch" if lang == "auto" else lang
        ocr_regions = run_ocr(image_path, lang=ocr_lang)

        if save_intermediates:
            save_ocr_json(ocr_regions, wdir / "text_regions.json")
    timings["ocr"] = round(time.time() - t0, 2)

    # Step 2: Textmask -> clip to OCR -> dilate
    t0 = time.time()
    image_bgr = cv2.imread(image_path)
    if image_bgr is None:
        # cv2.imread returns None instead of raising; fail with a clear error
        # rather than a cryptic crash inside compute_masks.
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    tight_mask, clipped_mask, dilated_mask = compute_masks(
        image_bgr, ocr_regions,
        sensitivity=sensitivity, dilation=dilation, padding=mask_padding,
    )
    if save_intermediates:
        Image.fromarray(tight_mask).save(str(wdir / "tight_mask.png"))
        Image.fromarray(clipped_mask).save(str(wdir / "clipped_mask.png"))
        Image.fromarray(dilated_mask).save(str(wdir / "mask.png"))
    timings["textmask"] = round(time.time() - t0, 2)

    # Step 3: Inpaint (temp background file is cleaned up in the finally
    # block below, even if assembly fails).
    background_path = None
    _temp_bg_path = None
    try:
        if not skip_inpaint:
            t0 = time.time()
            from px_image2pptx.inpaint import inpaint

            image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
            result = inpaint(image_rgb, dilated_mask)

            if save_intermediates:
                background_path = str(wdir / "background.png")
            else:
                import os
                import tempfile
                # mkstemp + close: NamedTemporaryFile would keep an open
                # handle, which leaks an fd and breaks reuse/unlink on
                # Windows while the handle is alive.
                fd, _temp_bg_path = tempfile.mkstemp(suffix=".png")
                os.close(fd)
                background_path = _temp_bg_path
            Image.fromarray(result).save(background_path)
            timings["inpaint"] = round(time.time() - t0, 2)

        # Step 4: Assemble PPTX
        t0 = time.time()
        report = assemble_pptx(
            image_path=image_path,
            ocr_regions=ocr_regions,
            output_path=output_path,
            background_path=background_path,
            tight_mask=tight_mask,
            min_font=min_font,
            max_font=max_font,
        )
        timings["assemble"] = round(time.time() - t0, 2)
    finally:
        # Clean up temp background file
        if _temp_bg_path is not None:
            import os
            if os.path.exists(_temp_bg_path):
                os.unlink(_temp_bg_path)

    report["timings"] = timings
    if save_intermediates:
        report["work_dir"] = str(wdir)
        with open(wdir / "report.json", "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)

    return report
|
px_image2pptx/textmask.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Text ink detection using classical computer vision.
|
| 2 |
+
|
| 3 |
+
Detects text pixels directly from image using adaptive thresholding,
|
| 4 |
+
connected component filtering, and Canny edge reinforcement. No ML model.
|
| 5 |
+
|
| 6 |
+
Returns both a tight mask (actual ink pixels, for color sampling) and a
|
| 7 |
+
dilated mask (for inpainting with safe coverage).
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import cv2
|
| 13 |
+
import numpy as np
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def detect_text_ink(
    image: np.ndarray,
    block_size: int = 25,
    sensitivity: float = 16,
    max_component_pct: float = 2.0,
    min_component_area: int = 8,
    max_density: float = 0.9,
    max_density_area: int = 500,
    edge_neighborhood: int = 15,
    min_final_area: int = 10,
) -> np.ndarray:
    """Detect text ink pixels with adaptive thresholding + component filtering.

    Args:
        image: BGR uint8 array of shape (H, W, 3).
        block_size: Adaptive-threshold window size; forced odd and >= 3.
        sensitivity: Adaptive threshold C constant; higher = less sensitive.
        max_component_pct: Components larger than this % of the image area
            are rejected as non-text.
        min_component_area: Components smaller than this (px) are noise.
        max_density: With max_density_area: large, near-solid components are
            treated as filled blobs rather than glyph strokes.
        max_density_area: Minimum area before the density filter applies.
        edge_neighborhood: Radius (px) for Canny edge reinforcement around
            already-detected text.
        min_final_area: Final cleanup drops components below this area.

    Returns:
        uint8 mask (H, W): 255 where text ink was detected, 0 elsewhere.
    """
    h, w = image.shape[:2]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Adaptive threshold needs an odd window of at least 3: `| 1` bumps even
    # values up by one, max() enforces the floor.
    block_size = max(3, block_size | 1)

    # Stage 1: intersect adaptive and Otsu thresholds (both inverted), so a
    # pixel counts as ink only when both methods agree.
    adaptive = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,
        blockSize=block_size, C=sensitivity,
    )
    _, otsu = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU,
    )
    dual = cv2.bitwise_and(adaptive, otsu)

    # Stage 2: close tiny gaps so fragmented strokes connect.
    candidates = cv2.morphologyEx(
        dual, cv2.MORPH_CLOSE,
        cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)),
    )

    # Stage 3: keep only components that plausibly look like glyphs.
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
        candidates, connectivity=8,
    )
    area_cap = h * w * (max_component_pct / 100.0)
    text_mask = np.zeros((h, w), dtype=np.uint8)
    for label in range(1, num_labels):
        _x, _y, comp_w, comp_h, area = stats[label]
        spans_image = comp_w > w * 0.3 and comp_h > h * 0.3
        fill_ratio = area / max(comp_w * comp_h, 1)
        solid_blob = fill_ratio > max_density and area > max_density_area
        if (area > area_cap or area < min_component_area
                or spans_image or solid_blob):
            continue
        text_mask[labels == label] = 255

    # Stage 4: pull in Canny edges that fall near confirmed text, recovering
    # faint stroke fragments the thresholds missed.
    edges = cv2.Canny(gray, 80, 200)
    reach = edge_neighborhood * 2 + 1
    near_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (reach, reach))
    near_text = cv2.dilate(text_mask, near_kernel)
    text_mask = cv2.bitwise_or(text_mask, cv2.bitwise_and(edges, near_text))

    # Stage 5: fill pinholes, then drop sub-threshold specks.
    text_mask = cv2.morphologyEx(
        text_mask, cv2.MORPH_CLOSE,
        cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)),
    )
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
        text_mask, connectivity=8,
    )
    clean = np.zeros((h, w), dtype=np.uint8)
    for label in range(1, num_labels):
        if stats[label, cv2.CC_STAT_AREA] >= min_final_area:
            clean[labels == label] = 255

    return clean
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def dilate_mask(mask: np.ndarray, dilation_px: int) -> np.ndarray:
    """Grow a binary mask by *dilation_px* pixels with an elliptical kernel.

    Returns an untouched copy when the radius is non-positive or the mask is
    entirely empty, since dilation would be a no-op either way.
    """
    if dilation_px <= 0 or not np.any(mask):
        return mask.copy()
    diameter = dilation_px * 2 + 1
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (diameter, diameter))
    return cv2.dilate(mask, ellipse, iterations=1)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def clip_mask_to_ocr(
    mask: np.ndarray,
    ocr_regions: list[dict],
    padding: int = 15,
) -> np.ndarray:
    """Keep only mask pixels inside padded OCR bounding boxes.

    Builds a rectangle mask from the OCR bboxes (each grown by *padding* and
    clamped to the image bounds) and intersects it with *mask*, so
    detections outside confirmed text regions — illustrations, borders,
    icons — are discarded.
    """
    height, width = mask.shape[:2]
    allowed = np.zeros_like(mask)

    for region in ocr_regions:
        box = region["bbox"]
        top = max(0, box["y1"] - padding)
        left = max(0, box["x1"] - padding)
        bottom = min(height, box["y2"] + padding)
        right = min(width, box["x2"] + padding)
        allowed[top:bottom, left:right] = 255

    return np.minimum(mask, allowed)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def compute_masks(
    image_bgr: np.ndarray,
    ocr_regions: list[dict],
    sensitivity: float = 16,
    dilation: int = 12,
    padding: int = 15,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Run the full text-masking pipeline on one image.

    Stages: detect raw ink pixels, intersect them with padded OCR boxes,
    then dilate the survivors for safe inpainting coverage.

    Returns:
        (tight_mask, clipped_mask, dilated_mask)
        - tight_mask: raw ink pixels (for color sampling)
        - clipped_mask: tight mask AND-ed with OCR bboxes
        - dilated_mask: clipped mask grown by *dilation* px (for inpainting)
    """
    ink = detect_text_ink(image_bgr, sensitivity=sensitivity)
    confirmed = clip_mask_to_ocr(ink, ocr_regions, padding=padding)
    grown = dilate_mask(confirmed, dilation)
    return ink, confirmed, grown
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Pillow>=10.0
|
| 2 |
+
numpy>=1.24
|
| 3 |
+
opencv-python-headless>=4.8
|
| 4 |
+
python-pptx>=0.6.21
|
| 5 |
+
paddleocr>=3.0
|
| 6 |
+
paddlepaddle>=3.0
|
| 7 |
+
simple-lama-inpainting>=0.1.0
|
| 8 |
+
torch>=2.0
|
| 9 |
+
gradio>=4.0
|