pxGenius committed on
Commit
f7d9770
·
verified ·
1 Parent(s): 7dd4487

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/chart_good1.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio demo for px-image2pptx — deploy on Hugging Face Spaces."""
2
+
3
+ import os
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
8
+
9
+ import gradio as gr
10
+ from PIL import Image
11
+
12
+
13
def convert(image_path, lang):
    """Run the image→PPTX pipeline for the Gradio UI.

    Returns a (background preview image or None, pptx file path,
    markdown summary) tuple matching the three output components.
    """
    from px_image2pptx import image_to_pptx

    # Convert WebP to PNG if needed (PaddleOCR doesn't support WebP)
    source = Image.open(image_path)
    if source.format == "WEBP" or image_path.lower().endswith(".webp"):
        converted_path = image_path.rsplit(".", 1)[0] + ".png"
        source.save(converted_path)
        image_path = converted_path

    scratch = tempfile.mkdtemp()
    pptx_path = os.path.join(scratch, "output.pptx")
    intermediates_dir = os.path.join(scratch, "work")

    report = image_to_pptx(
        image_path,
        pptx_path,
        lang=lang,
        work_dir=intermediates_dir,
    )

    # The pipeline writes the inpainted background into the work dir;
    # surface it as a preview when it exists.
    background = os.path.join(intermediates_dir, "background.png")
    preview = Image.open(background) if os.path.exists(background) else None

    summary = (
        f"**Text boxes:** {report['text_boxes']} \n"
        f"**OCR regions:** {report['ocr_regions']} \n"
        f"**Slide size:** {report['slide_size']['width_inches']}x"
        f"{report['slide_size']['height_inches']}\" \n"
        f"**Timings:** {report.get('timings', {})}"
    )

    return preview, pptx_path, summary
47
+
48
+
49
# Gradio UI: one input image + language choice in; background preview,
# downloadable .pptx, and a markdown report out.
demo = gr.Interface(
    fn=convert,
    inputs=[
        gr.Image(type="filepath", label="Input image (slide, poster, infographic)"),
        gr.Radio(
            choices=["auto", "en", "ch"],
            value="auto",
            label="OCR language",
            info="auto = Chinese model (handles both Chinese & English)",
        ),
    ],
    outputs=[
        gr.Image(label="Inpainted background (text removed)"),
        gr.File(label="Download .pptx"),
        gr.Markdown(label="Report"),
    ],
    title="px-image2pptx",
    description=(
        "Convert a static image to an editable PowerPoint file. "
        "OCR detects text, classical CV builds a text mask, LAMA inpaints "
        "the background clean, and python-pptx reconstructs editable text boxes.\n\n"
        "For a full browser-based editor, visit [pxGenius.ai](https://pxgenius.ai)."
    ),
    examples=[
        ["examples/chart_good1.png", "auto"],
    ],
    # Running the example means a full OCR + inpaint pass; don't pre-compute.
    cache_examples=False,
)
77
+
78
+ if __name__ == "__main__":
79
+ demo.launch()
examples/chart_good1.png ADDED

Git LFS Details

  • SHA256: cb74fedf598eaca6b5684f8fbf88a1a39897cdeaedcc50667879ec95bb0f60f4
  • Pointer size: 131 Bytes
  • Size of remote file: 105 kB
px_image2pptx/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """px-image2pptx -- Convert static images to editable PowerPoint slides.
2
+
3
+ Pipeline: image → OCR → textmask → mask-clip → inpaint → PPTX assembly.
4
+
5
+ OCR detects text regions. Textmask detects text ink pixels. Mask-clip ANDs
6
+ them so only OCR-confirmed text is masked. LAMA inpaints the masked regions.
7
+ PPTX assembly places editable text boxes over the clean background.
8
+
9
+ Quick start::
10
+
11
+ from px_image2pptx import image_to_pptx
12
+ image_to_pptx("slide.png", "output.pptx")
13
+
14
+ Or from the command line::
15
+
16
+ px-image2pptx slide.png -o output.pptx
17
+ """
18
+
19
+ __version__ = "0.1.0"
20
+
21
+ from px_image2pptx.pipeline import image_to_pptx
22
+
23
+ __all__ = ["image_to_pptx"]
px_image2pptx/assemble.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PPTX assembly — place text boxes and background onto editable slides.
2
+
3
+ Pure python-pptx assembly: no ML models, no LLM calls. Takes OCR regions,
4
+ background image, and optional tight mask → produces editable .pptx.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ import numpy as np
12
+ from PIL import Image, ImageFont
13
+ from pptx import Presentation
14
+ from pptx.util import Inches, Pt, Emu
15
+ from pptx.dml.color import RGBColor
16
+
17
+
18
+ # ── Coordinate mapping ────────────────────────────────────────
19
+
20
+
21
def px_to_emu(px: float, px_per_inch: float) -> int:
    """Convert image pixels to EMU (914400 per inch)."""
    inches = px / px_per_inch
    return int(inches * 914400)
24
+
25
+
26
+ class SlideMapper:
27
+ """Maps image pixel coordinates to slide EMU coordinates."""
28
+
29
+ def __init__(self, img_w: int, img_h: int, slide_w_inches: float | None = None):
30
+ self.img_w = img_w
31
+ self.img_h = img_h
32
+ aspect = img_w / img_h
33
+
34
+ if slide_w_inches and slide_w_inches > 0:
35
+ self.slide_w = slide_w_inches
36
+ self.slide_h = slide_w_inches / aspect
37
+ elif aspect > 1.5:
38
+ self.slide_w, self.slide_h = 13.333, 7.5
39
+ elif aspect > 1.2:
40
+ self.slide_w, self.slide_h = 10.0, 7.5
41
+ else:
42
+ self.slide_w = 10.0
43
+ self.slide_h = 10.0 / aspect
44
+
45
+ self.ppi = img_w / self.slide_w
46
+
47
+ def to_emu(self, px: float) -> int:
48
+ return px_to_emu(px, self.ppi)
49
+
50
+ def bbox_to_emu(self, x1, y1, x2, y2):
51
+ return (self.to_emu(x1), self.to_emu(y1),
52
+ self.to_emu(x2 - x1), self.to_emu(y2 - y1))
53
+
54
+
55
+ # ── Font measurement ──────────────────────────────────────────
56
+
57
+
58
def _load_reference_font():
    """Load a system sans-serif font for text width measurement.

    Returns (font, True) for the first loadable candidate, else (None, False).
    Loaded at 72pt so measured widths scale linearly via (pt / 72).
    """
    # macOS locations first, then the common Linux DejaVu path.
    candidates = (
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Helvetica.ttc",
        "/Library/Fonts/Arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for candidate in candidates:
        try:
            font = ImageFont.truetype(candidate, 72)
        except Exception:
            continue
        return font, True
    return None, False
72
+
73
+
74
# Loaded once at import time: (reference font or None, whether one was found).
_REF_FONT, _HAS_FONT = _load_reference_font()
75
+
76
+
77
+ def _is_cjk(ch: str) -> bool:
78
+ cp = ord(ch)
79
+ return (0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF or
80
+ 0x3000 <= cp <= 0x303F or 0xFF00 <= cp <= 0xFFEF or
81
+ 0xF900 <= cp <= 0xFAFF or 0x2E80 <= cp <= 0x2EFF or
82
+ 0x31C0 <= cp <= 0x31EF)
83
+
84
+
85
def estimate_text_width_pt(text: str, font_pt: float) -> float:
    """Measure rendered text width in points.

    Hybrid: PIL font metrics for Latin, 1.0x em for CJK. Falls back to a
    per-character heuristic when no reference font could be loaded.
    """
    if not _HAS_FONT:
        # Heuristic fallback: full em for CJK, quarter em for spaces,
        # half em for everything else.
        total = 0.0
        for ch in text:
            if _is_cjk(ch):
                total += font_pt * 1.0
            elif ch == " ":
                total += font_pt * 0.25
            else:
                total += font_pt * 0.50
        return total

    # Measure contiguous non-CJK runs with the reference font (loaded at
    # 72pt, hence the font_pt / 72 scale); CJK glyphs count as one em each.
    total = 0.0
    run: list[str] = []

    def _flush_run() -> float:
        if not run:
            return 0.0
        measured = _REF_FONT.getlength("".join(run))
        run.clear()
        return measured * (font_pt / 72.0)

    for ch in text:
        if _is_cjk(ch):
            total += _flush_run()
            total += font_pt * 1.0
        else:
            run.append(ch)
    total += _flush_run()
    return total
120
+
121
+
122
def autoscale_font(
    text: str,
    bbox_w_px: float,
    bbox_h_px: float,
    ppi: float,
    min_pt: int = 8,
    max_pt: int = 72,
) -> int:
    """Auto-scale font size to fill 90-94% of bbox width.

    Starts from the line height converted to points (clamped to the
    [min_pt, max_pt] range), then shrinks to fit and grows to fill.
    """
    line_h_pt = (bbox_h_px / ppi) * 72
    size = max(min_pt, min(max_pt, round(line_h_pt)))

    bbox_w_pt = (bbox_w_px / ppi) * 72
    # Only the longest line constrains horizontal fit.
    lines = text.split("\n")
    longest = max(lines, key=len) if lines else text

    ceiling = bbox_w_pt * 0.94
    floor = bbox_w_pt * 0.90

    # Shrink until the longest line fits within 94% of the box width.
    for _ in range(40):
        if size <= min_pt or estimate_text_width_pt(longest, size) <= ceiling:
            break
        size = max(min_pt, size - 1)

    # Grow toward 90% fill, stopping early if one more point would
    # overshoot the 94% ceiling.
    for _ in range(40):
        if size >= max_pt or estimate_text_width_pt(longest, size) >= floor:
            break
        if estimate_text_width_pt(longest, size + 1) > ceiling:
            break
        size = min(max_pt, size + 1)

    return size
153
+
154
+
155
+ # ── Text grouping ─────────────────────────────────────────────
156
+
157
+
158
def group_text_lines(
    regions: list[dict],
    y_threshold: float = 0.6,
    x_gap_factor: float = 3.0,
) -> list[list[dict]]:
    """Merge word-level OCR regions into line-level groups.

    Two-pass: group by vertical proximity, then split by horizontal gaps
    to prevent merging left/right columns. The input dicts are annotated
    with temporary keys that are stripped before returning.
    """
    if not regions:
        return []

    # Cache vertical center and height on each region.
    for region in regions:
        box = region["bbox"]
        region["_cy"] = (box["y1"] + box["y2"]) / 2
        region["_h"] = box["y2"] - box["y1"]

    by_center = sorted(regions, key=lambda r: r["_cy"])

    # Pass 1: sweep top-to-bottom; a region joins the current row when its
    # center is within y_threshold of the row's tallest-member height.
    rows: list[list[dict]] = []
    row = [by_center[0]]
    for region in by_center[1:]:
        row_cy = sum(item["_cy"] for item in row) / len(row)
        row_h = max(item["_h"] for item in row)
        if abs(region["_cy"] - row_cy) < row_h * y_threshold:
            row.append(region)
        else:
            rows.append(row)
            row = [region]
    rows.append(row)

    # Pass 2: within each row (sorted left-to-right), break segments at
    # horizontal gaps wider than x_gap_factor times the median height.
    grouped: list[list[dict]] = []
    for row in rows:
        row.sort(key=lambda r: r["bbox"]["x1"])
        if len(row) <= 1:
            grouped.append(row)
            continue
        heights = sorted(item["_h"] for item in row)
        median_h = heights[len(heights) // 2]
        gap_limit = median_h * x_gap_factor

        segment = [row[0]]
        for region in row[1:]:
            gap = region["bbox"]["x1"] - segment[-1]["bbox"]["x2"]
            if gap > gap_limit:
                grouped.append(segment)
                segment = [region]
            else:
                segment.append(region)
        grouped.append(segment)

    # Strip the temporary annotations.
    for region in regions:
        region.pop("_cy", None)
        region.pop("_h", None)

    return grouped
217
+
218
+
219
def group_bbox(group: list[dict]) -> tuple[int, int, int, int]:
    """Return the union bounding box (x1, y1, x2, y2) of a region group."""
    boxes = [r["bbox"] for r in group]
    return (
        min(b["x1"] for b in boxes),
        min(b["y1"] for b in boxes),
        max(b["x2"] for b in boxes),
        max(b["y2"] for b in boxes),
    )
225
+
226
+
227
def group_to_text(group: list[dict]) -> str:
    """Convert a group of OCR regions to display text.

    Regions are re-clustered into visual rows (same 0.6-height vertical
    tolerance used for grouping); each row is joined left-to-right with
    spaces, and rows are joined with newlines.
    """
    if not group:
        return ""

    # Temporary geometry annotations, removed before returning.
    for region in group:
        box = region["bbox"]
        region["_cy"] = (box["y1"] + box["y2"]) / 2
        region["_h"] = box["y2"] - box["y1"]

    ordered = sorted(group, key=lambda r: r["_cy"])
    rows: list[list[dict]] = []
    row = [ordered[0]]
    for region in ordered[1:]:
        row_cy = sum(item["_cy"] for item in row) / len(row)
        row_h = max(item["_h"] for item in row)
        if abs(region["_cy"] - row_cy) < row_h * 0.6:
            row.append(region)
        else:
            rows.append(row)
            row = [region]
    rows.append(row)

    rendered = []
    for row in rows:
        row.sort(key=lambda r: r["bbox"]["x1"])
        rendered.append(" ".join(item["text"] for item in row))

    for region in group:
        region.pop("_cy", None)
        region.pop("_h", None)

    return "\n".join(rendered)
259
+
260
+
261
+ # ── Text color detection ──────────────────────────────────────
262
+
263
+
264
+ def _local_bg_color(crop: np.ndarray, border: int = 2) -> np.ndarray:
265
+ h, w = crop.shape[:2]
266
+ if h < border * 2 + 1 or w < border * 2 + 1:
267
+ return np.median(crop.reshape(-1, 3), axis=0)
268
+ pixels = np.concatenate([
269
+ crop[:border, :].reshape(-1, 3),
270
+ crop[-border:, :].reshape(-1, 3),
271
+ crop[border:-border, :border].reshape(-1, 3),
272
+ crop[border:-border, -border:].reshape(-1, 3),
273
+ ])
274
+ return np.median(pixels, axis=0)
275
+
276
+
277
+ def detect_text_color(
278
+ img_rgb: np.ndarray,
279
+ tight_mask: np.ndarray,
280
+ x1: int, y1: int, x2: int, y2: int,
281
+ default: tuple[int, int, int] = (0x33, 0x33, 0x33),
282
+ min_contrast: float = 40,
283
+ ) -> tuple[int, int, int]:
284
+ """Detect text color from original image using tight mask.
285
+
286
+ Strategy 1: median of tight-mask ink pixels (dark text on light bg).
287
+ Strategy 2: if color ≈ background, sample pixels most different from bg
288
+ (handles white text on dark bg where textmask misses the text).
289
+ """
290
+ h, w = img_rgb.shape[:2]
291
+ bx1, by1 = max(0, int(x1)), max(0, int(y1))
292
+ bx2, by2 = min(w, int(x2)), min(h, int(y2))
293
+ if bx2 <= bx1 or by2 <= by1:
294
+ return default
295
+
296
+ crop = img_rgb[by1:by2, bx1:bx2]
297
+ mask_crop = tight_mask[by1:by2, bx1:bx2]
298
+ bg = _local_bg_color(crop)
299
+
300
+ # Strategy 1: tight mask ink pixels
301
+ ink_pixels = crop[mask_crop > 128]
302
+ if len(ink_pixels) >= 3:
303
+ median = np.median(ink_pixels, axis=0)
304
+ dist = float(((median - bg.astype(float)) ** 2).sum() ** 0.5)
305
+ if dist >= min_contrast:
306
+ c = median.astype(int)
307
+ return (int(c[0]), int(c[1]), int(c[2]))
308
+
309
+ # Strategy 2: pixels most different from background
310
+ flat = crop.reshape(-1, 3).astype(float)
311
+ dists = np.sqrt(((flat - bg.astype(float)) ** 2).sum(axis=1))
312
+ threshold = np.percentile(dists, 80)
313
+ far_pixels = flat[dists >= max(threshold, min_contrast * 0.5)]
314
+
315
+ if len(far_pixels) >= 3:
316
+ median = np.median(far_pixels, axis=0).astype(int)
317
+ return (int(median[0]), int(median[1]), int(median[2]))
318
+
319
+ return default
320
+
321
+
322
+ # ── Background detection ────────────��─────────────────────────
323
+
324
+
325
def detect_bg_color(image_path: str, border_px: int = 20) -> tuple[int, ...] | None:
    """Sample border pixels to detect solid background color.

    Returns (r, g, b) if low variance, else None.
    """
    pixels = np.array(Image.open(image_path).convert("RGB"))
    h, w = pixels.shape[:2]
    # Collect a border_px-wide frame: full top/bottom strips plus the left
    # and right edges of the remaining middle rows.
    frame = np.concatenate([
        pixels[:border_px, :].reshape(-1, 3),
        pixels[-border_px:, :].reshape(-1, 3),
        pixels[border_px:-border_px, :border_px].reshape(-1, 3),
        pixels[border_px:-border_px, -border_px:].reshape(-1, 3),
    ])
    # "Solid" means the mean per-channel spread across the frame is small.
    if frame.std(axis=0).mean() < 25:
        return tuple(np.median(frame, axis=0).astype(int))
    return None
343
+
344
+
345
+ # ── PPTX assembly ─────────────────────────────────────────────
346
+
347
+
348
def assemble_pptx(
    image_path: str,
    ocr_regions: list[dict],
    output_path: str,
    background_path: str | None = None,
    tight_mask: np.ndarray | None = None,
    min_font: int = 8,
    max_font: int = 72,
    slide_w_inches: float | None = None,
) -> dict:
    """Assemble an editable PPTX from OCR regions and background.

    Args:
        image_path: Original input image.
        ocr_regions: List of OCR region dicts with bbox and text.
        output_path: Where to save the .pptx file.
        background_path: Inpainted background image (or None for solid bg).
        tight_mask: Pre-dilation text mask for color detection (H, W), uint8.
        min_font: Minimum font size in points.
        max_font: Maximum font size in points.
        slide_w_inches: Override slide width (auto-detected from aspect ratio).

    Returns:
        Report dict with assembly statistics.
    """
    img = Image.open(image_path)
    img_w, img_h = img.size
    mapper = SlideMapper(img_w, img_h, slide_w_inches)

    # Create presentation sized to match the source image's aspect ratio.
    prs = Presentation()
    prs.slide_width = Inches(mapper.slide_w)
    prs.slide_height = Inches(mapper.slide_h)
    slide = prs.slides.add_slide(prs.slide_layouts[6])  # Blank

    # Background: prefer the inpainted image; otherwise fall back to a
    # detected solid fill, and finally to the original image itself.
    if background_path and Path(background_path).exists():
        slide.shapes.add_picture(
            background_path, Emu(0), Emu(0),
            Inches(mapper.slide_w), Inches(mapper.slide_h),
        )
        bg_mode = "inpainted"
    else:
        bg_color = detect_bg_color(image_path)
        if bg_color:
            fill = slide.background.fill
            fill.solid()
            fill.fore_color.rgb = RGBColor(*bg_color)
            bg_mode = f"solid rgb{bg_color}"
        else:
            slide.shapes.add_picture(
                image_path, Emu(0), Emu(0),
                Inches(mapper.slide_w), Inches(mapper.slide_h),
            )
            bg_mode = "original"

    # Load image array for color detection (only needed with a tight mask)
    img_rgb = None
    if tight_mask is not None:
        img_rgb = np.array(Image.open(image_path).convert("RGB"))

    # Group OCR regions into lines
    text_groups = group_text_lines(ocr_regions)

    # Add text boxes
    count = 0
    for group in text_groups:
        x1, y1, x2, y2 = group_bbox(group)
        text = group_to_text(group)
        if not text.strip():
            continue

        # Expand the box by a 2px-equivalent pad so glyphs are not clipped.
        left, top, width, height = mapper.bbox_to_emu(x1, y1, x2, y2)
        pad = mapper.to_emu(2)
        left = max(0, left - pad)
        top = max(0, top - pad)
        width += pad * 2
        height += pad * 2

        txBox = slide.shapes.add_textbox(left, top, width, height)
        tf = txBox.text_frame
        tf.word_wrap = True
        # Zero internal margins so text aligns with the detected bbox.
        tf.margin_left = tf.margin_right = tf.margin_top = tf.margin_bottom = Emu(0)

        # Font size: start from the group's median region height, then
        # shrink/grow to fit the box width.
        bbox_w = x2 - x1
        region_heights = [r["bbox"]["y2"] - r["bbox"]["y1"] for r in group]
        line_h = sorted(region_heights)[len(region_heights) // 2]
        font_size = autoscale_font(text, bbox_w, line_h, mapper.ppi, min_font, max_font)

        # Font color: sample from the original image when the tight mask is
        # available; otherwise default to dark gray (#333333).
        if img_rgb is not None and tight_mask is not None:
            r, g, b = detect_text_color(img_rgb, tight_mask, x1, y1, x2, y2)
        else:
            r, g, b = 0x33, 0x33, 0x33
        color = RGBColor(r, g, b)

        # One paragraph per text line; the frame's first paragraph already
        # exists, so reuse it for the first line.
        lines = text.split("\n")
        p = tf.paragraphs[0]
        p.text = lines[0]
        p.font.size = Pt(font_size)
        p.font.color.rgb = color
        for line in lines[1:]:
            p = tf.add_paragraph()
            p.text = line
            p.font.size = Pt(font_size)
            p.font.color.rgb = color

        count += 1

    prs.save(output_path)

    return {
        "image_size": {"width": img_w, "height": img_h},
        "slide_size": {
            "width_inches": round(mapper.slide_w, 2),
            "height_inches": round(mapper.slide_h, 2),
        },
        "ppi": round(mapper.ppi, 1),
        "background": bg_mode,
        "text_boxes": count,
        "ocr_regions": len(ocr_regions),
    }
px_image2pptx/cli.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Command-line interface for px-image2pptx.
2
+
3
+ Usage::
4
+
5
+ # Full pipeline (OCR + textmask + inpaint + PPTX)
6
+ px-image2pptx slide.png -o output.pptx
7
+
8
+ # With pre-computed OCR
9
+ px-image2pptx slide.png -o output.pptx --ocr-json text_regions.json
10
+
11
+ # Skip inpainting (solid background or use original)
12
+ px-image2pptx slide.png -o output.pptx --skip-inpaint
13
+
14
+ # Chinese slide
15
+ px-image2pptx slide.png -o output.pptx --lang ch
16
+
17
+ # Keep intermediate files
18
+ px-image2pptx slide.png -o output.pptx --work-dir ./debug/
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import sys
25
+ import time
26
+
27
+
28
+ def _parse_args(argv=None):
29
+ parser = argparse.ArgumentParser(
30
+ prog="px-image2pptx",
31
+ description="Convert static images to editable PowerPoint slides.",
32
+ formatter_class=argparse.RawDescriptionHelpFormatter,
33
+ epilog="""\
34
+ examples:
35
+ px-image2pptx slide.png -o output.pptx
36
+ px-image2pptx slide.png -o output.pptx --lang ch
37
+ px-image2pptx slide.png -o output.pptx --skip-inpaint
38
+ px-image2pptx slide.png -o output.pptx --ocr-json ocr.json
39
+ px-image2pptx slide.png -o output.pptx --work-dir ./debug/
40
+ """,
41
+ )
42
+ parser.add_argument("image", help="Input image (PNG/JPG/WebP)")
43
+ parser.add_argument("-o", "--output", default="output.pptx",
44
+ help="Output PPTX path (default: output.pptx)")
45
+ parser.add_argument("--ocr-json", default=None,
46
+ help="Pre-computed OCR JSON (skips OCR step)")
47
+ parser.add_argument("--lang", default="auto", choices=["auto", "en", "ch"],
48
+ help="OCR language (default: auto-detect)")
49
+ parser.add_argument("--sensitivity", type=float, default=16,
50
+ help="Textmask sensitivity (default: 16)")
51
+ parser.add_argument("--dilation", type=int, default=12,
52
+ help="Textmask dilation pixels (default: 12)")
53
+ parser.add_argument("--min-font", type=int, default=8,
54
+ help="Minimum font size in points (default: 8)")
55
+ parser.add_argument("--max-font", type=int, default=72,
56
+ help="Maximum font size in points (default: 72)")
57
+ parser.add_argument("--skip-inpaint", action="store_true",
58
+ help="Skip LAMA inpainting (use original or solid bg)")
59
+ parser.add_argument("--work-dir", default=None,
60
+ help="Directory for intermediate files")
61
+ return parser.parse_args(argv)
62
+
63
+
64
def main(argv=None):
    """CLI entry point: run the full pipeline and print a short report."""
    args = _parse_args(argv)

    # Imported lazily so `--help` stays fast and dependency-free.
    from px_image2pptx.pipeline import image_to_pptx

    started = time.time()
    report = image_to_pptx(
        image_path=args.image,
        output_path=args.output,
        ocr_json=args.ocr_json,
        lang=args.lang,
        sensitivity=args.sensitivity,
        dilation=args.dilation,
        min_font=args.min_font,
        max_font=args.max_font,
        skip_inpaint=args.skip_inpaint,
        work_dir=args.work_dir,
    )
    elapsed = time.time() - started

    print(f"Saved: {args.output}")
    print(f"  Text boxes: {report['text_boxes']}")
    print(f"  OCR regions: {report['ocr_regions']}")
    print(f"  Background: {report['background']}")
    print(f"  Slide: {report['slide_size']['width_inches']}x"
          f"{report['slide_size']['height_inches']}\"")
    # Leave the total-time line open so per-stage timings can be appended.
    print(f"  Time: {elapsed:.1f}s", end="")
    if "timings" in report:
        stages = ", ".join(f"{k}={v}s" for k, v in report["timings"].items())
        print(f" ({stages})")
    else:
        print()
97
+
98
+
99
+ if __name__ == "__main__":
100
+ main()
px_image2pptx/inpaint.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LAMA neural inpainting — reconstruct masked regions.
2
+
3
+ Requires the optional ``inpaint`` extra: ``pip install px-image2pptx[inpaint]``.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+
11
+
12
+ def _ensure_lama():
13
+ """Import LAMA dependencies, raising a helpful error if not installed."""
14
+ try:
15
+ import torch
16
+ from simple_lama_inpainting.models.model import (
17
+ download_model, LAMA_MODEL_URL, prepare_img_and_mask,
18
+ )
19
+ return torch, download_model, LAMA_MODEL_URL, prepare_img_and_mask
20
+ except ImportError:
21
+ raise ImportError(
22
+ "LAMA inpainting requires PyTorch and simple-lama-inpainting.\n"
23
+ "Install with:\n pip install px-image2pptx[inpaint]"
24
+ ) from None
25
+
26
+
27
def inpaint(
    image: np.ndarray,
    mask: np.ndarray,
) -> np.ndarray:
    """Inpaint masked regions of an image using LAMA.

    Args:
        image: RGB numpy array (H, W, 3), uint8.
        mask: Grayscale numpy array (H, W), uint8. 255 = inpaint.

    Returns:
        Inpainted RGB numpy array (H, W, 3), uint8.
    """
    torch, download_model, LAMA_MODEL_URL, prepare_img_and_mask = _ensure_lama()

    # Device preference: Apple MPS, then CUDA, then CPU.
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # The LAMA checkpoint is a TorchScript archive fetched (and cached)
    # by simple-lama-inpainting.
    model_path = download_model(LAMA_MODEL_URL)
    model = torch.jit.load(model_path, map_location=device)
    model.eval()
    model.to(device)

    img_t, mask_t = prepare_img_and_mask(
        Image.fromarray(image), Image.fromarray(mask), device
    )

    with torch.inference_mode():
        raw = model(img_t, mask_t)

    # Model output is CHW float in [0, 1]; convert back to HWC uint8.
    channels_last = raw[0].permute(1, 2, 0).detach().cpu().numpy()
    return np.clip(channels_last * 255, 0, 255).astype(np.uint8)
64
+
65
+
66
def inpaint_file(
    image_path: str,
    mask_path: str,
    output_path: str,
) -> str:
    """Inpaint an image file with a mask file, save result.

    Returns the output path.
    """
    import cv2

    # OpenCV loads BGR; the inpaint() core works in RGB, so convert at
    # both the read and write boundaries.
    rgb = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    gray_mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

    restored = inpaint(rgb, gray_mask)

    cv2.imwrite(output_path, cv2.cvtColor(restored, cv2.COLOR_RGB2BGR))
    return output_path
px_image2pptx/ocr.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OCR text detection using PaddleOCR.
2
+
3
+ Detects text regions with bounding boxes, text content, and confidence scores.
4
+ Requires the optional ``ocr`` extra: ``pip install px-image2pptx[ocr]``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ import cv2
14
+ import numpy as np
15
+ from PIL import Image, ImageDraw
16
+
17
+
18
+ def _ensure_paddleocr():
19
+ """Import PaddleOCR, raising a helpful error if not installed."""
20
+ try:
21
+ from paddleocr import PaddleOCR
22
+ return PaddleOCR
23
+ except ImportError:
24
+ raise ImportError(
25
+ "PaddleOCR is required for OCR. Install with:\n"
26
+ " pip install px-image2pptx[ocr]"
27
+ ) from None
28
+
29
+
30
def run_ocr(image_path: str | Path, lang: str = "ch") -> list[dict]:
    """Run PaddleOCR on an image and return structured text regions.

    Args:
        image_path: Path to the input image.
        lang: OCR language (default "ch"). Use "en" for English only.

    Returns:
        List of text region dicts, each with:
        - id: int
        - text: str
        - confidence: float
        - bbox: {"x1": int, "y1": int, "x2": int, "y2": int}
    """
    import os
    # Keep PaddleX from checking model sources online before inference.
    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"

    PaddleOCR = _ensure_paddleocr()

    engine = PaddleOCR(
        lang=lang,
        use_textline_orientation=False,
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
    )

    regions: list[dict] = []
    for page in engine.predict(str(image_path)):
        detections = zip(
            page.get("dt_polys", []),
            page.get("rec_texts", []),
            page.get("rec_scores", []),
        )
        for poly, text, conf in detections:
            xs = [point[0] for point in poly]
            ys = [point[1] for point in poly]
            regions.append({
                "id": len(regions),  # sequential across all pages
                "text": text,
                "confidence": round(float(conf), 4),
                "bbox": {
                    # Axis-aligned bbox of the (possibly rotated) polygon.
                    "x1": int(min(xs)),
                    "y1": int(min(ys)),
                    "x2": int(max(xs)),
                    "y2": int(max(ys)),
                },
            })

    return regions
80
+
81
+
82
+ def save_ocr_json(regions: list[dict], path: str | Path) -> None:
83
+ """Save OCR regions to JSON file."""
84
+ with open(path, "w") as f:
85
+ json.dump({"text_regions": regions}, f, indent=2, ensure_ascii=False)
86
+
87
+
88
+ def load_ocr_json(path: str | Path) -> list[dict]:
89
+ """Load OCR regions from JSON file."""
90
+ with open(path) as f:
91
+ data = json.load(f)
92
+ return data.get("text_regions", [])
93
+
94
+
95
def draw_ocr_overlay(image_path: str | Path, regions: list[dict]) -> Image.Image:
    """Draw OCR bounding boxes on image for visualization."""
    canvas = Image.open(image_path).convert("RGB")
    pen = ImageDraw.Draw(canvas, "RGBA")
    for region in regions:
        box = region["bbox"]
        corners = [box["x1"], box["y1"], box["x2"], box["y2"]]
        # Solid red outline plus a translucent red fill over each region.
        pen.rectangle(corners, outline=(255, 50, 50), width=3)
        pen.rectangle(corners, fill=(255, 50, 50, 40))
    return canvas
px_image2pptx/pipeline.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """End-to-end pipeline: image → editable PPTX.
2
+
3
+ Orchestrates: OCR → textmask → mask-clip → inpaint → PPTX assembly.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import cv2
13
+ import numpy as np
14
+ from PIL import Image
15
+
16
+ from px_image2pptx.assemble import assemble_pptx
17
+ from px_image2pptx.textmask import compute_masks
18
+
19
+
20
def image_to_pptx(
    image_path: str | Path,
    output_path: str | Path = "output.pptx",
    *,
    ocr_json: str | Path | None = None,
    lang: str = "auto",
    sensitivity: float = 16,
    dilation: int = 12,
    mask_padding: int = 15,
    min_font: int = 8,
    max_font: int = 72,
    skip_inpaint: bool = False,
    work_dir: str | Path | None = None,
) -> dict:
    """Convert a static image to an editable PPTX.

    Args:
        image_path: Input image (PNG/JPG/WebP).
        output_path: Where to save the .pptx file.
        ocr_json: Pre-computed OCR JSON (skip OCR step if provided).
        lang: OCR language ("en", "ch", or "auto" to detect).
        sensitivity: Textmask sensitivity (lower = more aggressive).
        dilation: Textmask dilation in pixels.
        mask_padding: Padding around OCR bboxes for mask clipping.
        min_font: Minimum font size in points.
        max_font: Maximum font size in points.
        skip_inpaint: If True, skip inpainting (use original as background).
        work_dir: Directory for intermediate files (default: temp dir).

    Returns:
        Report dict with pipeline statistics.
    """
    image_path = str(image_path)
    output_path = str(output_path)
    # Per-stage wall-clock seconds, attached to the report at the end.
    timings = {}

    # Work directory for intermediates (only created when explicitly requested)
    save_intermediates = work_dir is not None
    if save_intermediates:
        wdir = Path(work_dir)
        wdir.mkdir(parents=True, exist_ok=True)

    # Step 1: OCR — load pre-computed regions, or run PaddleOCR.
    # Imports are deferred so the optional OCR dependency is only required
    # on this path.
    t0 = time.time()
    if ocr_json:
        from px_image2pptx.ocr import load_ocr_json
        ocr_regions = load_ocr_json(ocr_json)
    else:
        from px_image2pptx.ocr import run_ocr, save_ocr_json

        # "ch" model handles both Chinese and English, so use it as default
        ocr_lang = "ch" if lang == "auto" else lang
        ocr_regions = run_ocr(image_path, lang=ocr_lang)

        if save_intermediates:
            save_ocr_json(ocr_regions, wdir / "text_regions.json")
    timings["ocr"] = round(time.time() - t0, 2)

    # Step 2: Textmask → clip to OCR → dilate
    # Produces: tight ink mask (color sampling), OCR-clipped mask, and the
    # dilated mask actually fed to the inpainter.
    t0 = time.time()
    image_bgr = cv2.imread(image_path)
    tight_mask, clipped_mask, dilated_mask = compute_masks(
        image_bgr, ocr_regions,
        sensitivity=sensitivity, dilation=dilation, padding=mask_padding,
    )
    if save_intermediates:
        Image.fromarray(tight_mask).save(str(wdir / "tight_mask.png"))
        Image.fromarray(clipped_mask).save(str(wdir / "clipped_mask.png"))
        Image.fromarray(dilated_mask).save(str(wdir / "mask.png"))
    timings["textmask"] = round(time.time() - t0, 2)

    # Step 3: Inpaint — optional; LAMA is imported lazily because it pulls
    # in PyTorch. The inpainted background is written either into the work
    # dir or into a temp file removed after assembly.
    background_path = None
    _temp_bg = None
    if not skip_inpaint:
        t0 = time.time()
        from px_image2pptx.inpaint import inpaint

        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        result = inpaint(image_rgb, dilated_mask)

        if save_intermediates:
            bg_path = str(wdir / "background.png")
            Image.fromarray(result).save(bg_path)
            background_path = bg_path
        else:
            import tempfile
            # delete=False so the file survives until assemble_pptx has
            # read it; unlinked explicitly below.
            _temp_bg = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
            Image.fromarray(result).save(_temp_bg.name)
            background_path = _temp_bg.name
        timings["inpaint"] = round(time.time() - t0, 2)

    # Step 4: Assemble PPTX
    t0 = time.time()
    report = assemble_pptx(
        image_path=image_path,
        ocr_regions=ocr_regions,
        output_path=output_path,
        background_path=background_path,
        tight_mask=tight_mask,
        min_font=min_font,
        max_font=max_font,
    )
    timings["assemble"] = round(time.time() - t0, 2)

    # Clean up temp background file
    if _temp_bg is not None:
        import os
        os.unlink(_temp_bg.name)

    report["timings"] = timings
    if save_intermediates:
        report["work_dir"] = str(wdir)
        with open(wdir / "report.json", "w") as f:
            json.dump(report, f, indent=2)

    return report
px_image2pptx/textmask.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text ink detection using classical computer vision.
2
+
3
+ Detects text pixels directly from image using adaptive thresholding,
4
+ connected component filtering, and Canny edge reinforcement. No ML model.
5
+
6
+ Returns both a tight mask (actual ink pixels, for color sampling) and a
7
+ dilated mask (for inpainting with safe coverage).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import cv2
13
+ import numpy as np
14
+
15
+
16
def detect_text_ink(
    image: np.ndarray,
    block_size: int = 25,
    sensitivity: float = 16,
    max_component_pct: float = 2.0,
    min_component_area: int = 8,
    max_density: float = 0.9,
    max_density_area: int = 500,
    edge_neighborhood: int = 15,
    min_final_area: int = 10,
) -> np.ndarray:
    """Locate text-ink pixels with classical CV (no ML model).

    Combines an adaptive threshold with Otsu's global threshold, filters
    the surviving connected components by size/shape/density, reinforces
    the survivors with nearby Canny edges, and finally removes residual
    specks below ``min_final_area``.

    Args:
        image: BGR numpy array (H, W, 3), uint8.
        block_size: Adaptive-threshold window (forced odd, minimum 3).
        sensitivity: Adaptive threshold ``C`` offset; higher = less sensitive.
        max_component_pct: Reject components larger than this % of the image.
        min_component_area: Reject components smaller than this (noise).
        max_density: Together with ``max_density_area``, rejects solid blobs
            whose bbox fill ratio exceeds this value.
        max_density_area: Minimum area before the density filter applies.
        edge_neighborhood: Radius in px for the Canny reinforcement window.
        min_final_area: Components below this size are dropped in cleanup.

    Returns:
        uint8 binary mask (H, W); 255 = text ink, 0 = background.
    """
    height, width = image.shape[:2]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Adaptive threshold demands an odd window of at least 3.
    block = block_size + 1 if block_size % 2 == 0 else block_size
    block = max(block, 3)

    # Step 1: keep only pixels that BOTH the local (adaptive) and global
    # (Otsu) inverse thresholds classify as dark ink.
    inv_adaptive = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,
        blockSize=block, C=sensitivity,
    )
    _, inv_otsu = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU,
    )
    dark = cv2.bitwise_and(inv_adaptive, inv_otsu)

    # Step 2: bridge tiny gaps between stroke fragments.
    dark = cv2.morphologyEx(
        dark, cv2.MORPH_CLOSE,
        cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)),
    )

    # Step 3: per-component filtering.
    n_comp, label_map, comp_stats, _ = cv2.connectedComponentsWithStats(
        dark, connectivity=8,
    )
    area_cap = height * width * (max_component_pct / 100.0)
    ink = np.zeros((height, width), dtype=np.uint8)

    for idx in range(1, n_comp):
        _cx, _cy, comp_w, comp_h, comp_area = comp_stats[idx]
        too_big = comp_area > area_cap
        too_small = comp_area < min_component_area
        spans_image = comp_w > width * 0.3 and comp_h > height * 0.3
        fill_ratio = comp_area / max(comp_w * comp_h, 1)
        solid_blob = fill_ratio > max_density and comp_area > max_density_area
        if too_big or too_small or spans_image or solid_blob:
            continue
        ink[label_map == idx] = 255

    # Step 4: pull in Canny edges that lie close to confirmed ink.
    canny = cv2.Canny(gray, 80, 200)
    diameter = edge_neighborhood * 2 + 1
    near_ink = cv2.dilate(
        ink,
        cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (diameter, diameter)),
    )
    ink = cv2.bitwise_or(ink, cv2.bitwise_and(canny, near_ink))

    # Step 5: close pinholes, then drop any leftover specks.
    ink = cv2.morphologyEx(
        ink, cv2.MORPH_CLOSE,
        cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)),
    )
    n_clean, clean_labels, clean_stats, _ = cv2.connectedComponentsWithStats(
        ink, connectivity=8,
    )
    result = np.zeros((height, width), dtype=np.uint8)
    for idx in range(1, n_clean):
        if clean_stats[idx, cv2.CC_STAT_AREA] >= min_final_area:
            result[clean_labels == idx] = 255

    return result
111
+
112
+
113
def dilate_mask(mask: np.ndarray, dilation_px: int) -> np.ndarray:
    """Grow a binary mask outward by ``dilation_px`` pixels.

    Returns an untouched copy when the radius is non-positive or the
    mask has no foreground pixels (nothing to grow).
    """
    no_work = dilation_px <= 0 or not np.any(mask)
    if no_work:
        return mask.copy()
    diameter = 2 * dilation_px + 1
    element = cv2.getStructuringElement(
        cv2.MORPH_ELLIPSE, (diameter, diameter),
    )
    return cv2.dilate(mask, element, iterations=1)
122
+
123
+
124
def clip_mask_to_ocr(
    mask: np.ndarray,
    ocr_regions: list[dict],
    padding: int = 15,
) -> np.ndarray:
    """Restrict a text mask to padded OCR bounding boxes.

    Builds a rectangle mask from the OCR bboxes — each expanded by
    ``padding`` and clamped to the image bounds — and intersects it with
    ``mask``, so only pixels inside confirmed text regions survive. This
    keeps illustrations, borders, and icons that the ink detector wrongly
    flagged out of the final mask.
    """
    rows, cols = mask.shape[:2]
    keep = np.zeros_like(mask)

    for region in ocr_regions:
        box = region["bbox"]
        top = max(0, box["y1"] - padding)
        left = max(0, box["x1"] - padding)
        bottom = min(rows, box["y2"] + padding)
        right = min(cols, box["x2"] + padding)
        keep[top:bottom, left:right] = 255

    return np.minimum(mask, keep)
147
+
148
+
149
def compute_masks(
    image_bgr: np.ndarray,
    ocr_regions: list[dict],
    sensitivity: float = 16,
    dilation: int = 12,
    padding: int = 15,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Run the complete textmask pipeline on one image.

    Pipeline: raw ink detection → intersection with padded OCR boxes →
    morphological dilation for safe inpainting coverage.

    Returns:
        ``(tight_mask, clipped_mask, dilated_mask)`` where ``tight_mask``
        holds raw ink pixels (used for color sampling), ``clipped_mask``
        is the ink limited to OCR bboxes, and ``dilated_mask`` adds a
        safety margin for inpainting.
    """
    ink = detect_text_ink(image_bgr, sensitivity=sensitivity)
    confined = clip_mask_to_ocr(ink, ocr_regions, padding=padding)
    expanded = dilate_mask(confined, dilation)
    return ink, confined, expanded
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Pillow>=10.0
2
+ numpy>=1.24
3
+ opencv-python-headless>=4.8
4
+ python-pptx>=0.6.21
5
+ paddleocr>=3.0
6
+ paddlepaddle>=3.0
7
+ simple-lama-inpainting>=0.1.0
8
+ torch>=2.0
9
+ gradio>=4.0