Spaces:

pxGenius
/

image2pptx

Running

mm committed on 8 days ago

Commit

6e513f7

1 Parent(s): 39f456e

Fix PaddlePaddle 3.3.0 PIR crash, cache models, add --max-inpaint-size

- Exclude paddlepaddle 3.3.0 (PIR regression, Paddle#77340)
- Disable PIR API as a safety net
- Cache PaddleOCR and LAMA models in memory across calls
- Add max_inpaint_size option to downscale large images before LAMA

Files changed (6) hide show

app.py +1 -0
px_image2pptx/cli.py +4 -0
px_image2pptx/inpaint.py +55 -14
px_image2pptx/ocr.py +17 -11
px_image2pptx/pipeline.py +4 -1
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import tempfile
 from pathlib import Path
 os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
 import gradio as gr
 from PIL import Image

 from pathlib import Path
 os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
+os.environ["FLAGS_enable_pir_api"] = "0"
 import gradio as gr
 from PIL import Image

px_image2pptx/cli.py CHANGED Viewed

@@ -56,6 +56,9 @@ examples:
                         help="Maximum font size in points (default: 72)")
     parser.add_argument("--skip-inpaint", action="store_true",
                         help="Skip LAMA inpainting (use original or solid bg)")
     parser.add_argument("--work-dir", default=None,
                         help="Directory for intermediate files")
     return parser.parse_args(argv)
@@ -77,6 +80,7 @@ def main(argv=None):
         min_font=args.min_font,
         max_font=args.max_font,
         skip_inpaint=args.skip_inpaint,
         work_dir=args.work_dir,
     )
     elapsed = time.time() - t0

                         help="Maximum font size in points (default: 72)")
     parser.add_argument("--skip-inpaint", action="store_true",
                         help="Skip LAMA inpainting (use original or solid bg)")
+    parser.add_argument("--max-inpaint-size", type=int, default=None,
+                        help="Downscale longer edge to N px before inpainting "
+                             "(e.g. 2048). Reduces time for large images.")
     parser.add_argument("--work-dir", default=None,
                         help="Directory for intermediate files")
     return parser.parse_args(argv)
         min_font=args.min_font,
         max_font=args.max_font,
         skip_inpaint=args.skip_inpaint,
+        max_inpaint_size=args.max_inpaint_size,
         work_dir=args.work_dir,
     )
     elapsed = time.time() - t0

px_image2pptx/inpaint.py CHANGED Viewed

@@ -24,20 +24,17 @@ def _ensure_lama():
         ) from None
-def inpaint(
-    image: np.ndarray,
-    mask: np.ndarray,
-) -> np.ndarray:
-    """Inpaint masked regions of an image using LAMA.
-    Args:
-        image: RGB numpy array (H, W, 3), uint8.
-        mask: Grayscale numpy array (H, W), uint8. 255 = inpaint.
-    Returns:
-        Inpainted RGB numpy array (H, W, 3), uint8.
-    """
-    torch, download_model, LAMA_MODEL_URL, prepare_img_and_mask = _ensure_lama()
     if torch.backends.mps.is_available():
         device = torch.device("mps")
@@ -51,8 +48,47 @@ def inpaint(
     model.eval()
     model.to(device)
-    pil_image = Image.fromarray(image)
-    pil_mask = Image.fromarray(mask)
     img_t, mask_t = prepare_img_and_mask(pil_image, pil_mask, device)
     with torch.inference_mode():
@@ -60,6 +96,11 @@ def inpaint(
         result = inpainted[0].permute(1, 2, 0).detach().cpu().numpy()
         result = np.clip(result * 255, 0, 255).astype(np.uint8)
     return result

         ) from None
+_cached_model = None
+_cached_device = None
+def _get_model():
+    """Return the cached LAMA model, loading it on first call."""
+    global _cached_model, _cached_device
+    if _cached_model is not None:
+        return _cached_model, _cached_device
+    torch, download_model, LAMA_MODEL_URL, _ = _ensure_lama()
     if torch.backends.mps.is_available():
         device = torch.device("mps")
     model.eval()
     model.to(device)
+    _cached_model = model
+    _cached_device = device
+    return model, device
+def inpaint(
+    image: np.ndarray,
+    mask: np.ndarray,
+    max_size: int | None = None,
+) -> np.ndarray:
+    """Inpaint masked regions of an image using LAMA.
+    Args:
+        image: RGB numpy array (H, W, 3), uint8.
+        mask: Grayscale numpy array (H, W), uint8. 255 = inpaint.
+        max_size: If set, downscale the longer edge to this many pixels
+            before LAMA inference, then upscale the result back.
+            Reduces memory and compute for large images.
+    Returns:
+        Inpainted RGB numpy array (H, W, 3), uint8, same size as input.
+    """
+    _, _, _, prepare_img_and_mask = _ensure_lama()
+    import torch
+    model, device = _get_model()
+    orig_h, orig_w = image.shape[:2]
+    scaled = False
+    if max_size and max(orig_h, orig_w) > max_size:
+        scale = max_size / max(orig_h, orig_w)
+        new_w = round(orig_w * scale)
+        new_h = round(orig_h * scale)
+        pil_image = Image.fromarray(image).resize((new_w, new_h), Image.LANCZOS)
+        pil_mask = Image.fromarray(mask).resize((new_w, new_h), Image.NEAREST)
+        scaled = True
+    else:
+        pil_image = Image.fromarray(image)
+        pil_mask = Image.fromarray(mask)
     img_t, mask_t = prepare_img_and_mask(pil_image, pil_mask, device)
     with torch.inference_mode():
         result = inpainted[0].permute(1, 2, 0).detach().cpu().numpy()
         result = np.clip(result * 255, 0, 255).astype(np.uint8)
+    if scaled:
+        result = np.array(
+            Image.fromarray(result).resize((orig_w, orig_h), Image.LANCZOS)
+        )
     return result

px_image2pptx/ocr.py CHANGED Viewed

@@ -27,6 +27,22 @@ def _ensure_paddleocr():
         ) from None
 def run_ocr(image_path: str | Path, lang: str = "ch") -> list[dict]:
     """Run PaddleOCR on an image and return structured text regions.
@@ -41,17 +57,7 @@ def run_ocr(image_path: str | Path, lang: str = "ch") -> list[dict]:
         - confidence: float
         - bbox: {"x1": int, "y1": int, "x2": int, "y2": int}
     """
-    import os
-    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
-    PaddleOCR = _ensure_paddleocr()
-    ocr = PaddleOCR(
-        lang=lang,
-        use_textline_orientation=False,
-        use_doc_orientation_classify=False,
-        use_doc_unwarping=False,
-    )
     results = list(ocr.predict(str(image_path)))
     regions = []

         ) from None
+_ocr_cache: dict[str, Any] = {}
+def _get_ocr(lang: str):
+    """Return a cached PaddleOCR instance for the given language."""
+    if lang not in _ocr_cache:
+        PaddleOCR = _ensure_paddleocr()
+        _ocr_cache[lang] = PaddleOCR(
+            lang=lang,
+            use_textline_orientation=False,
+            use_doc_orientation_classify=False,
+            use_doc_unwarping=False,
+        )
+    return _ocr_cache[lang]
 def run_ocr(image_path: str | Path, lang: str = "ch") -> list[dict]:
     """Run PaddleOCR on an image and return structured text regions.
         - confidence: float
         - bbox: {"x1": int, "y1": int, "x2": int, "y2": int}
     """
+    ocr = _get_ocr(lang)
     results = list(ocr.predict(str(image_path)))
     regions = []

px_image2pptx/pipeline.py CHANGED Viewed

@@ -29,6 +29,7 @@ def image_to_pptx(
     min_font: int = 8,
     max_font: int = 72,
     skip_inpaint: bool = False,
     work_dir: str | Path | None = None,
 ) -> dict:
     """Convert a static image to an editable PPTX.
@@ -44,6 +45,8 @@ def image_to_pptx(
         min_font: Minimum font size in points.
         max_font: Maximum font size in points.
         skip_inpaint: If True, skip inpainting (use original as background).
         work_dir: Directory for intermediate files (default: temp dir).
     Returns:
@@ -96,7 +99,7 @@ def image_to_pptx(
         from px_image2pptx.inpaint import inpaint
         image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
-        result = inpaint(image_rgb, dilated_mask)
         if save_intermediates:
             bg_path = str(wdir / "background.png")

     min_font: int = 8,
     max_font: int = 72,
     skip_inpaint: bool = False,
+    max_inpaint_size: int | None = None,
     work_dir: str | Path | None = None,
 ) -> dict:
     """Convert a static image to an editable PPTX.
         min_font: Minimum font size in points.
         max_font: Maximum font size in points.
         skip_inpaint: If True, skip inpainting (use original as background).
+        max_inpaint_size: If set, downscale the longer edge to this many
+            pixels before LAMA inpainting. Reduces time for large images.
         work_dir: Directory for intermediate files (default: temp dir).
     Returns:
         from px_image2pptx.inpaint import inpaint
         image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
+        result = inpaint(image_rgb, dilated_mask, max_size=max_inpaint_size)
         if save_intermediates:
             bg_path = str(wdir / "background.png")

requirements.txt CHANGED Viewed

@@ -3,7 +3,7 @@ numpy>=1.24
 opencv-python-headless>=4.8
 python-pptx>=0.6.21
 paddleocr>=3.0
-paddlepaddle>=3.0
 simple-lama-inpainting>=0.1.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch>=2.0

 opencv-python-headless>=4.8
 python-pptx>=0.6.21
 paddleocr>=3.0
+paddlepaddle>=3.0,!=3.3.0
 simple-lama-inpainting>=0.1.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch>=2.0