pxGenius committed on
Commit
f7d9770
·
verified ·
1 Parent(s): 7dd4487

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/chart_good1.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio demo for px-image2pptx — deploy on Hugging Face Spaces."""
2
+
3
+ import os
4
+ import tempfile
5
+ from pathlib import Path
6
+
7
+ os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
8
+
9
+ import gradio as gr
10
+ from PIL import Image
11
+
12
+
13
def convert(image_path, lang):
    """Run the image→PPTX pipeline for the Gradio UI.

    Returns a (background preview image or None, pptx file path,
    markdown summary) tuple matching the three output components.
    """
    from px_image2pptx import image_to_pptx

    # Convert WebP to PNG if needed (PaddleOCR doesn't support WebP)
    source = Image.open(image_path)
    if source.format == "WEBP" or image_path.lower().endswith(".webp"):
        converted_path = image_path.rsplit(".", 1)[0] + ".png"
        source.save(converted_path)
        image_path = converted_path

    scratch = tempfile.mkdtemp()
    pptx_path = os.path.join(scratch, "output.pptx")
    intermediates_dir = os.path.join(scratch, "work")

    report = image_to_pptx(
        image_path,
        pptx_path,
        lang=lang,
        work_dir=intermediates_dir,
    )

    # The pipeline writes the inpainted background into the work dir;
    # surface it as a preview when it exists.
    background = os.path.join(intermediates_dir, "background.png")
    preview = Image.open(background) if os.path.exists(background) else None

    summary = (
        f"**Text boxes:** {report['text_boxes']} \n"
        f"**OCR regions:** {report['ocr_regions']} \n"
        f"**Slide size:** {report['slide_size']['width_inches']}x"
        f"{report['slide_size']['height_inches']}\" \n"
        f"**Timings:** {report.get('timings', {})}"
    )

    return preview, pptx_path, summary
47
+
48
+
49
# Gradio UI: one input image + language choice in; background preview,
# downloadable .pptx, and a markdown report out.
demo = gr.Interface(
    fn=convert,
    inputs=[
        gr.Image(type="filepath", label="Input image (slide, poster, infographic)"),
        gr.Radio(
            choices=["auto", "en", "ch"],
            value="auto",
            label="OCR language",
            info="auto = Chinese model (handles both Chinese & English)",
        ),
    ],
    outputs=[
        gr.Image(label="Inpainted background (text removed)"),
        gr.File(label="Download .pptx"),
        gr.Markdown(label="Report"),
    ],
    title="px-image2pptx",
    description=(
        "Convert a static image to an editable PowerPoint file. "
        "OCR detects text, classical CV builds a text mask, LAMA inpaints "
        "the background clean, and python-pptx reconstructs editable text boxes.\n\n"
        "For a full browser-based editor, visit [pxGenius.ai](https://pxgenius.ai)."
    ),
    examples=[
        ["examples/chart_good1.png", "auto"],
    ],
    # Running the example means a full OCR + inpaint pass; don't pre-compute.
    cache_examples=False,
)
77
+
78
+ if __name__ == "__main__":
79
+ demo.launch()
examples/chart_good1.png ADDED

Git LFS Details

  • SHA256: cb74fedf598eaca6b5684f8fbf88a1a39897cdeaedcc50667879ec95bb0f60f4
  • Pointer size: 131 Bytes
  • Size of remote file: 105 kB
px_image2pptx/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """px-image2pptx -- Convert static images to editable PowerPoint slides.
2
+
3
+ Pipeline: image → OCR → textmask → mask-clip → inpaint → PPTX assembly.
4
+
5
+ OCR detects text regions. Textmask detects text ink pixels. Mask-clip ANDs
6
+ them so only OCR-confirmed text is masked. LAMA inpaints the masked regions.
7
+ PPTX assembly places editable text boxes over the clean background.
8
+
9
+ Quick start::
10
+
11
+ from px_image2pptx import image_to_pptx
12
+ image_to_pptx("slide.png", "output.pptx")
13
+
14
+ Or from the command line::
15
+
16
+ px-image2pptx slide.png -o output.pptx
17
+ """
18
+
19
+ __version__ = "0.1.0"
20
+
21
+ from px_image2pptx.pipeline import image_to_pptx
22
+
23
+ __all__ = ["image_to_pptx"]
px_image2pptx/assemble.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PPTX assembly — place text boxes and background onto editable slides.
2
+
3
+ Pure python-pptx assembly: no ML models, no LLM calls. Takes OCR regions,
4
+ background image, and optional tight mask → produces editable .pptx.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ import numpy as np
12
+ from PIL import Image, ImageFont
13
+ from pptx import Presentation
14
+ from pptx.util import Inches, Pt, Emu
15
+ from pptx.dml.color import RGBColor
16
+
17
+
18
+ # ── Coordinate mapping ────────────────────────────────────────
19
+
20
+
21
def px_to_emu(px: float, px_per_inch: float) -> int:
    """Convert image pixels to EMU (914400 per inch)."""
    inches = px / px_per_inch
    return int(inches * 914400)
24
+
25
+
26
+ class SlideMapper:
27
+ """Maps image pixel coordinates to slide EMU coordinates."""
28
+
29
+ def __init__(self, img_w: int, img_h: int, slide_w_inches: float | None = None):
30
+ self.img_w = img_w
31
+ self.img_h = img_h
32
+ aspect = img_w / img_h
33
+
34
+ if slide_w_inches and slide_w_inches > 0:
35
+ self.slide_w = slide_w_inches
36
+ self.slide_h = slide_w_inches / aspect
37
+ elif aspect > 1.5:
38
+ self.slide_w, self.slide_h = 13.333, 7.5
39
+ elif aspect > 1.2:
40
+ self.slide_w, self.slide_h = 10.0, 7.5
41
+ else:
42
+ self.slide_w = 10.0
43
+ self.slide_h = 10.0 / aspect
44
+
45
+ self.ppi = img_w / self.slide_w
46
+
47
+ def to_emu(self, px: float) -> int:
48
+ return px_to_emu(px, self.ppi)
49
+
50
+ def bbox_to_emu(self, x1, y1, x2, y2):
51
+ return (self.to_emu(x1), self.to_emu(y1),
52
+ self.to_emu(x2 - x1), self.to_emu(y2 - y1))
53
+
54
+
55
+ # ── Font measurement ──────────────────────────────────────────
56
+
57
+
58
def _load_reference_font():
    """Load a system sans-serif font for text width measurement.

    Returns (font, True) for the first loadable candidate, else (None, False).
    Loaded at 72pt so measured widths scale linearly via (pt / 72).
    """
    # macOS locations first, then the common Linux DejaVu path.
    candidates = (
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/System/Library/Fonts/Helvetica.ttc",
        "/Library/Fonts/Arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for candidate in candidates:
        try:
            font = ImageFont.truetype(candidate, 72)
        except Exception:
            continue
        return font, True
    return None, False
72
+
73
+
74
# Loaded once at import time: (reference font or None, whether one was found).
_REF_FONT, _HAS_FONT = _load_reference_font()
75
+
76
+
77
+ def _is_cjk(ch: str) -> bool:
78
+ cp = ord(ch)
79
+ return (0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF or
80
+ 0x3000 <= cp <= 0x303F or 0xFF00 <= cp <= 0xFFEF or
81
+ 0xF900 <= cp <= 0xFAFF or 0x2E80 <= cp <= 0x2EFF or
82
+ 0x31C0 <= cp <= 0x31EF)
83
+
84
+
85
def estimate_text_width_pt(text: str, font_pt: float) -> float:
    """Measure rendered text width in points.

    Hybrid: PIL font metrics for Latin, 1.0x em for CJK. Falls back to a
    per-character heuristic when no reference font could be loaded.
    """
    if not _HAS_FONT:
        # Heuristic fallback: full em for CJK, quarter em for spaces,
        # half em for everything else.
        total = 0.0
        for ch in text:
            if _is_cjk(ch):
                total += font_pt * 1.0
            elif ch == " ":
                total += font_pt * 0.25
            else:
                total += font_pt * 0.50
        return total

    # Measure contiguous non-CJK runs with the reference font (loaded at
    # 72pt, hence the font_pt / 72 scale); CJK glyphs count as one em each.
    total = 0.0
    run: list[str] = []

    def _flush_run() -> float:
        if not run:
            return 0.0
        measured = _REF_FONT.getlength("".join(run))
        run.clear()
        return measured * (font_pt / 72.0)

    for ch in text:
        if _is_cjk(ch):
            total += _flush_run()
            total += font_pt * 1.0
        else:
            run.append(ch)
    total += _flush_run()
    return total
120
+
121
+
122
def autoscale_font(
    text: str,
    bbox_w_px: float,
    bbox_h_px: float,
    ppi: float,
    min_pt: int = 8,
    max_pt: int = 72,
) -> int:
    """Auto-scale font size to fill 90-94% of bbox width.

    Starts from the line height converted to points (clamped to the
    [min_pt, max_pt] range), then shrinks to fit and grows to fill.
    """
    line_h_pt = (bbox_h_px / ppi) * 72
    size = max(min_pt, min(max_pt, round(line_h_pt)))

    bbox_w_pt = (bbox_w_px / ppi) * 72
    # Only the longest line constrains horizontal fit.
    lines = text.split("\n")
    longest = max(lines, key=len) if lines else text

    ceiling = bbox_w_pt * 0.94
    floor = bbox_w_pt * 0.90

    # Shrink until the longest line fits within 94% of the box width.
    for _ in range(40):
        if size <= min_pt or estimate_text_width_pt(longest, size) <= ceiling:
            break
        size = max(min_pt, size - 1)

    # Grow toward 90% fill, stopping early if one more point would
    # overshoot the 94% ceiling.
    for _ in range(40):
        if size >= max_pt or estimate_text_width_pt(longest, size) >= floor:
            break
        if estimate_text_width_pt(longest, size + 1) > ceiling:
            break
        size = min(max_pt, size + 1)

    return size
153
+
154
+
155
+ # ── Text grouping ─────────────────────────────────────────────
156
+
157
+
158
def group_text_lines(
    regions: list[dict],
    y_threshold: float = 0.6,
    x_gap_factor: float = 3.0,
) -> list[list[dict]]:
    """Merge word-level OCR regions into line-level groups.

    Two-pass: group by vertical proximity, then split by horizontal gaps
    to prevent merging left/right columns. The input dicts are annotated
    with temporary keys that are stripped before returning.
    """
    if not regions:
        return []

    # Cache vertical center and height on each region.
    for region in regions:
        box = region["bbox"]
        region["_cy"] = (box["y1"] + box["y2"]) / 2
        region["_h"] = box["y2"] - box["y1"]

    by_center = sorted(regions, key=lambda r: r["_cy"])

    # Pass 1: sweep top-to-bottom; a region joins the current row when its
    # center is within y_threshold of the row's tallest-member height.
    rows: list[list[dict]] = []
    row = [by_center[0]]
    for region in by_center[1:]:
        row_cy = sum(item["_cy"] for item in row) / len(row)
        row_h = max(item["_h"] for item in row)
        if abs(region["_cy"] - row_cy) < row_h * y_threshold:
            row.append(region)
        else:
            rows.append(row)
            row = [region]
    rows.append(row)

    # Pass 2: within each row (sorted left-to-right), break segments at
    # horizontal gaps wider than x_gap_factor times the median height.
    grouped: list[list[dict]] = []
    for row in rows:
        row.sort(key=lambda r: r["bbox"]["x1"])
        if len(row) <= 1:
            grouped.append(row)
            continue
        heights = sorted(item["_h"] for item in row)
        median_h = heights[len(heights) // 2]
        gap_limit = median_h * x_gap_factor

        segment = [row[0]]
        for region in row[1:]:
            gap = region["bbox"]["x1"] - segment[-1]["bbox"]["x2"]
            if gap > gap_limit:
                grouped.append(segment)
                segment = [region]
            else:
                segment.append(region)
        grouped.append(segment)

    # Strip the temporary annotations.
    for region in regions:
        region.pop("_cy", None)
        region.pop("_h", None)

    return grouped
217
+
218
+
219
def group_bbox(group: list[dict]) -> tuple[int, int, int, int]:
    """Return the union bounding box (x1, y1, x2, y2) of a region group."""
    boxes = [r["bbox"] for r in group]
    return (
        min(b["x1"] for b in boxes),
        min(b["y1"] for b in boxes),
        max(b["x2"] for b in boxes),
        max(b["y2"] for b in boxes),
    )
225
+
226
+
227
def group_to_text(group: list[dict]) -> str:
    """Convert a group of OCR regions to display text.

    Regions are re-clustered into visual rows (same 0.6-height vertical
    tolerance used for grouping); each row is joined left-to-right with
    spaces, and rows are joined with newlines.
    """
    if not group:
        return ""

    # Temporary geometry annotations, removed before returning.
    for region in group:
        box = region["bbox"]
        region["_cy"] = (box["y1"] + box["y2"]) / 2
        region["_h"] = box["y2"] - box["y1"]

    ordered = sorted(group, key=lambda r: r["_cy"])
    rows: list[list[dict]] = []
    row = [ordered[0]]
    for region in ordered[1:]:
        row_cy = sum(item["_cy"] for item in row) / len(row)
        row_h = max(item["_h"] for item in row)
        if abs(region["_cy"] - row_cy) < row_h * 0.6:
            row.append(region)
        else:
            rows.append(row)
            row = [region]
    rows.append(row)

    rendered = []
    for row in rows:
        row.sort(key=lambda r: r["bbox"]["x1"])
        rendered.append(" ".join(item["text"] for item in row))

    for region in group:
        region.pop("_cy", None)
        region.pop("_h", None)

    return "\n".join(rendered)
259
+
260
+
261
+ # ── Text color detection ──────────────────────────────────────
262
+
263
+
264
+ def _local_bg_color(crop: np.ndarray, border: int = 2) -> np.ndarray:
265
+ h, w = crop.shape[:2]
266
+ if h < border * 2 + 1 or w < border * 2 + 1:
267
+ return np.median(crop.reshape(-1, 3), axis=0)
268
+ pixels = np.concatenate([
269
+ crop[:border, :].reshape(-1, 3),
270
+ crop[-border:, :].reshape(-1, 3),
271
+ crop[border:-border, :border].reshape(-1, 3),
272
+ crop[border:-border, -border:].reshape(-1, 3),
273
+ ])
274
+ return np.median(pixels, axis=0)
275
+
276
+
277
+ def detect_text_color(
278
+ img_rgb: np.ndarray,
279
+ tight_mask: np.ndarray,
280
+ x1: int, y1: int, x2: int, y2: int,
281
+ default: tuple[int, int, int] = (0x33, 0x33, 0x33),
282
+ min_contrast: float = 40,
283
+ ) -> tuple[int, int, int]:
284
+ """Detect text color from original image using tight mask.
285
+
286
+ Strategy 1: median of tight-mask ink pixels (dark text on light bg).
287
+ Strategy 2: if color ≈ background, sample pixels most different from bg
288
+ (handles white text on dark bg where textmask misses the text).
289
+ """
290
+ h, w = img_rgb.shape[:2]
291
+ bx1, by1 = max(0, int(x1)), max(0, int(y1))
292
+ bx2, by2 = min(w, int(x2)), min(h, int(y2))
293
+ if bx2 <= bx1 or by2 <= by1:
294
+ return default
295
+
296
+ crop = img_rgb[by1:by2, bx1:bx2]
297
+ mask_crop = tight_mask[by1:by2, bx1:bx2]
298
+ bg = _local_bg_color(crop)
299
+
300
+ # Strategy 1: tight mask ink pixels
301
+ ink_pixels = crop[mask_crop > 128]
302
+ if len(ink_pixels) >= 3:
303
+ median = np.median(ink_pixels, axis=0)
304
+ dist = float(((median - bg.astype(float)) ** 2).sum() ** 0.5)
305
+ if dist >= min_contrast:
306
+ c = median.astype(int)
307
+ return (int(c[0]), int(c[1]), int(c[2]))
308
+
309
+ # Strategy 2: pixels most different from background
310
+ flat = crop.reshape(-1, 3).astype(float)
311
+ dists = np.sqrt(((flat - bg.astype(float)) ** 2).sum(axis=1))
312
+ threshold = np.percentile(dists, 80)
313
+ far_pixels = flat[dists >= max(threshold, min_contrast * 0.5)]
314
+
315
+ if len(far_pixels) >= 3:
316
+ median = np.median(far_pixels, axis=0).astype(int)
317
+ return (int(median[0]), int(median[1]), int(median[2]))
318
+
319
+ return default
320
+
321
+
322
+ # ── Background detection ────────────��─────────────────────────
323
+
324
+
325
def detect_bg_color(image_path: str, border_px: int = 20) -> tuple[int, ...] | None:
    """Sample border pixels to detect solid background color.

    Returns (r, g, b) if low variance, else None.
    """
    pixels = np.array(Image.open(image_path).convert("RGB"))
    h, w = pixels.shape[:2]
    # Collect a border_px-wide frame: full top/bottom strips plus the left
    # and right edges of the remaining middle rows.
    frame = np.concatenate([
        pixels[:border_px, :].reshape(-1, 3),
        pixels[-border_px:, :].reshape(-1, 3),
        pixels[border_px:-border_px, :border_px].reshape(-1, 3),
        pixels[border_px:-border_px, -border_px:].reshape(-1, 3),
    ])
    # "Solid" means the mean per-channel spread across the frame is small.
    if frame.std(axis=0).mean() < 25:
        return tuple(np.median(frame, axis=0).astype(int))
    return None
343
+
344
+
345
+ # ── PPTX assembly ─────────────────────────────────────────────
346
+
347
+
348
def assemble_pptx(
    image_path: str,
    ocr_regions: list[dict],
    output_path: str,
    background_path: str | None = None,
    tight_mask: np.ndarray | None = None,
    min_font: int = 8,
    max_font: int = 72,
    slide_w_inches: float | None = None,
) -> dict:
    """Assemble an editable PPTX from OCR regions and background.

    Args:
        image_path: Original input image.
        ocr_regions: List of OCR region dicts with bbox and text.
        output_path: Where to save the .pptx file.
        background_path: Inpainted background image (or None for solid bg).
        tight_mask: Pre-dilation text mask for color detection (H, W), uint8.
        min_font: Minimum font size in points.
        max_font: Maximum font size in points.
        slide_w_inches: Override slide width (auto-detected from aspect ratio).

    Returns:
        Report dict with assembly statistics.
    """
    img = Image.open(image_path)
    img_w, img_h = img.size
    mapper = SlideMapper(img_w, img_h, slide_w_inches)

    # Create presentation sized to match the source image's aspect ratio.
    prs = Presentation()
    prs.slide_width = Inches(mapper.slide_w)
    prs.slide_height = Inches(mapper.slide_h)
    slide = prs.slides.add_slide(prs.slide_layouts[6])  # Blank

    # Background: prefer the inpainted image; otherwise fall back to a
    # detected solid fill, and finally to the original image itself.
    if background_path and Path(background_path).exists():
        slide.shapes.add_picture(
            background_path, Emu(0), Emu(0),
            Inches(mapper.slide_w), Inches(mapper.slide_h),
        )
        bg_mode = "inpainted"
    else:
        bg_color = detect_bg_color(image_path)
        if bg_color:
            fill = slide.background.fill
            fill.solid()
            fill.fore_color.rgb = RGBColor(*bg_color)
            bg_mode = f"solid rgb{bg_color}"
        else:
            slide.shapes.add_picture(
                image_path, Emu(0), Emu(0),
                Inches(mapper.slide_w), Inches(mapper.slide_h),
            )
            bg_mode = "original"

    # Load image array for color detection (only needed with a tight mask)
    img_rgb = None
    if tight_mask is not None:
        img_rgb = np.array(Image.open(image_path).convert("RGB"))

    # Group OCR regions into lines
    text_groups = group_text_lines(ocr_regions)

    # Add text boxes
    count = 0
    for group in text_groups:
        x1, y1, x2, y2 = group_bbox(group)
        text = group_to_text(group)
        if not text.strip():
            continue

        # Expand the box by a 2px-equivalent pad so glyphs are not clipped.
        left, top, width, height = mapper.bbox_to_emu(x1, y1, x2, y2)
        pad = mapper.to_emu(2)
        left = max(0, left - pad)
        top = max(0, top - pad)
        width += pad * 2
        height += pad * 2

        txBox = slide.shapes.add_textbox(left, top, width, height)
        tf = txBox.text_frame
        tf.word_wrap = True
        # Zero internal margins so text aligns with the detected bbox.
        tf.margin_left = tf.margin_right = tf.margin_top = tf.margin_bottom = Emu(0)

        # Font size: start from the group's median region height, then
        # shrink/grow to fit the box width.
        bbox_w = x2 - x1
        region_heights = [r["bbox"]["y2"] - r["bbox"]["y1"] for r in group]
        line_h = sorted(region_heights)[len(region_heights) // 2]
        font_size = autoscale_font(text, bbox_w, line_h, mapper.ppi, min_font, max_font)

        # Font color: sample from the original image when the tight mask is
        # available; otherwise default to dark gray (#333333).
        if img_rgb is not None and tight_mask is not None:
            r, g, b = detect_text_color(img_rgb, tight_mask, x1, y1, x2, y2)
        else:
            r, g, b = 0x33, 0x33, 0x33
        color = RGBColor(r, g, b)

        # One paragraph per text line; the frame's first paragraph already
        # exists, so reuse it for the first line.
        lines = text.split("\n")
        p = tf.paragraphs[0]
        p.text = lines[0]
        p.font.size = Pt(font_size)
        p.font.color.rgb = color
        for line in lines[1:]:
            p = tf.add_paragraph()
            p.text = line
            p.font.size = Pt(font_size)
            p.font.color.rgb = color

        count += 1

    prs.save(output_path)

    return {
        "image_size": {"width": img_w, "height": img_h},
        "slide_size": {
            "width_inches": round(mapper.slide_w, 2),
            "height_inches": round(mapper.slide_h, 2),
        },
        "ppi": round(mapper.ppi, 1),
        "background": bg_mode,
        "text_boxes": count,
        "ocr_regions": len(ocr_regions),
    }
px_image2pptx/cli.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Command-line interface for px-image2pptx.
2
+
3
+ Usage::
4
+
5
+ # Full pipeline (OCR + textmask + inpaint + PPTX)
6
+ px-image2pptx slide.png -o output.pptx
7
+
8
+ # With pre-computed OCR
9
+ px-image2pptx slide.png -o output.pptx --ocr-json text_regions.json
10
+
11
+ # Skip inpainting (solid background or use original)
12
+ px-image2pptx slide.png -o output.pptx --skip-inpaint
13
+
14
+ # Chinese slide
15
+ px-image2pptx slide.png -o output.pptx --lang ch
16
+
17
+ # Keep intermediate files
18
+ px-image2pptx slide.png -o output.pptx --work-dir ./debug/
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import sys
25
+ import time
26
+
27
+
28
+ def _parse_args(argv=None):
29
+ parser = argparse.ArgumentParser(
30
+ prog="px-image2pptx",
31
+ description="Convert static images to editable PowerPoint slides.",
32
+ formatter_class=argparse.RawDescriptionHelpFormatter,
33
+ epilog="""\
34
+ examples:
35
+ px-image2pptx slide.png -o output.pptx
36
+ px-image2pptx slide.png -o output.pptx --lang ch
37
+ px-image2pptx slide.png -o output.pptx --skip-inpaint
38
+ px-image2pptx slide.png -o output.pptx --ocr-json ocr.json
39
+ px-image2pptx slide.png -o output.pptx --work-dir ./debug/
40
+ """,
41
+ )
42
+ parser.add_argument("image", help="Input image (PNG/JPG/WebP)")
43
+ parser.add_argument("-o", "--output", default="output.pptx",
44
+ help="Output PPTX path (default: output.pptx)")
45
+ parser.add_argument("--ocr-json", default=None,
46
+ help="Pre-computed OCR JSON (skips OCR step)")
47
+ parser.add_argument("--lang", default="auto", choices=["auto", "en", "ch"],
48
+ help="OCR language (default: auto-detect)")
49
+ parser.add_argument("--sensitivity", type=float, default=16,
50
+ help="Textmask sensitivity (default: 16)")
51
+ parser.add_argument("--dilation", type=int, default=12,
52
+ help="Textmask dilation pixels (default: 12)")
53
+ parser.add_argument("--min-font", type=int, default=8,
54
+ help="Minimum font size in points (default: 8)")
55
+ parser.add_argument("--max-font", type=int, default=72,
56
+ help="Maximum font size in points (default: 72)")
57
+ parser.add_argument("--skip-inpaint", action="store_true",
58
+ help="Skip LAMA inpainting (use original or solid bg)")
59
+ parser.add_argument("--work-dir", default=None,
60
+ help="Directory for intermediate files")
61
+ return parser.parse_args(argv)
62
+
63
+
64
def main(argv=None):
    """CLI entry point: run the full pipeline and print a short report."""
    args = _parse_args(argv)

    # Imported lazily so `--help` stays fast and dependency-free.
    from px_image2pptx.pipeline import image_to_pptx

    started = time.time()
    report = image_to_pptx(
        image_path=args.image,
        output_path=args.output,
        ocr_json=args.ocr_json,
        lang=args.lang,
        sensitivity=args.sensitivity,
        dilation=args.dilation,
        min_font=args.min_font,
        max_font=args.max_font,
        skip_inpaint=args.skip_inpaint,
        work_dir=args.work_dir,
    )
    elapsed = time.time() - started

    print(f"Saved: {args.output}")
    print(f"  Text boxes: {report['text_boxes']}")
    print(f"  OCR regions: {report['ocr_regions']}")
    print(f"  Background: {report['background']}")
    print(f"  Slide: {report['slide_size']['width_inches']}x"
          f"{report['slide_size']['height_inches']}\"")
    # Leave the total-time line open so per-stage timings can be appended.
    print(f"  Time: {elapsed:.1f}s", end="")
    if "timings" in report:
        stages = ", ".join(f"{k}={v}s" for k, v in report["timings"].items())
        print(f" ({stages})")
    else:
        print()
97
+
98
+
99
+ if __name__ == "__main__":
100
+ main()
px_image2pptx/inpaint.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LAMA neural inpainting — reconstruct masked regions.
2
+
3
+ Requires the optional ``inpaint`` extra: ``pip install px-image2pptx[inpaint]``.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+
11
+
12
+ def _ensure_lama():
13
+ """Import LAMA dependencies, raising a helpful error if not installed."""
14
+ try:
15
+ import torch
16
+ from simple_lama_inpainting.models.model import (
17
+ download_model, LAMA_MODEL_URL, prepare_img_and_mask,
18
+ )
19
+ return torch, download_model, LAMA_MODEL_URL, prepare_img_and_mask
20
+ except ImportError:
21
+ raise ImportError(
22
+ "LAMA inpainting requires PyTorch and simple-lama-inpainting.\n"
23
+ "Install with:\n pip install px-image2pptx[inpaint]"
24
+ ) from None
25
+
26
+
27
def inpaint(
    image: np.ndarray,
    mask: np.ndarray,
) -> np.ndarray:
    """Inpaint masked regions of an image using LAMA.

    Args:
        image: RGB numpy array (H, W, 3), uint8.
        mask: Grayscale numpy array (H, W), uint8. 255 = inpaint.

    Returns:
        Inpainted RGB numpy array (H, W, 3), uint8.
    """
    torch, download_model, LAMA_MODEL_URL, prepare_img_and_mask = _ensure_lama()

    # Device preference: Apple MPS, then CUDA, then CPU.
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # The LAMA checkpoint is a TorchScript archive fetched (and cached)
    # by simple-lama-inpainting.
    model_path = download_model(LAMA_MODEL_URL)
    model = torch.jit.load(model_path, map_location=device)
    model.eval()
    model.to(device)

    img_t, mask_t = prepare_img_and_mask(
        Image.fromarray(image), Image.fromarray(mask), device
    )

    with torch.inference_mode():
        raw = model(img_t, mask_t)

    # Model output is CHW float in [0, 1]; convert back to HWC uint8.
    channels_last = raw[0].permute(1, 2, 0).detach().cpu().numpy()
    return np.clip(channels_last * 255, 0, 255).astype(np.uint8)
64
+
65
+
66
def inpaint_file(
    image_path: str,
    mask_path: str,
    output_path: str,
) -> str:
    """Inpaint an image file with a mask file, save result.

    Returns the output path.
    """
    import cv2

    # OpenCV loads BGR; the inpaint() core works in RGB, so convert at
    # both the read and write boundaries.
    rgb = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    gray_mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

    restored = inpaint(rgb, gray_mask)

    cv2.imwrite(output_path, cv2.cvtColor(restored, cv2.COLOR_RGB2BGR))
    return output_path
px_image2pptx/ocr.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OCR text detection using PaddleOCR.
2
+
3
+ Detects text regions with bounding boxes, text content, and confidence scores.
4
+ Requires the optional ``ocr`` extra: ``pip install px-image2pptx[ocr]``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ import cv2
14
+ import numpy as np
15
+ from PIL import Image, ImageDraw
16
+
17
+
18
+ def _ensure_paddleocr():
19
+ """Import PaddleOCR, raising a helpful error if not installed."""
20
+ try:
21
+ from paddleocr import PaddleOCR
22
+ return PaddleOCR
23
+ except ImportError:
24
+ raise ImportError(
25
+ "PaddleOCR is required for OCR. Install with:\n"
26
+ " pip install px-image2pptx[ocr]"
27
+ ) from None
28
+
29
+
30
def run_ocr(image_path: str | Path, lang: str = "ch") -> list[dict]:
    """Run PaddleOCR on an image and return structured text regions.

    Args:
        image_path: Path to the input image.
        lang: OCR language (default "ch"). Use "en" for English only.

    Returns:
        List of text region dicts, each with:
        - id: int
        - text: str
        - confidence: float
        - bbox: {"x1": int, "y1": int, "x2": int, "y2": int}
    """
    import os
    # Keep PaddleX from checking model sources online before inference.
    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"

    PaddleOCR = _ensure_paddleocr()

    engine = PaddleOCR(
        lang=lang,
        use_textline_orientation=False,
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
    )

    regions: list[dict] = []
    for page in engine.predict(str(image_path)):
        detections = zip(
            page.get("dt_polys", []),
            page.get("rec_texts", []),
            page.get("rec_scores", []),
        )
        for poly, text, conf in detections:
            xs = [point[0] for point in poly]
            ys = [point[1] for point in poly]
            regions.append({
                "id": len(regions),  # sequential across all pages
                "text": text,
                "confidence": round(float(conf), 4),
                "bbox": {
                    # Axis-aligned bbox of the (possibly rotated) polygon.
                    "x1": int(min(xs)),
                    "y1": int(min(ys)),
                    "x2": int(max(xs)),
                    "y2": int(max(ys)),
                },
            })

    return regions
80
+
81
+
82
+ def save_ocr_json(regions: list[dict], path: str | Path) -> None:
83
+ """Save OCR regions to JSON file."""
84
+ with open(path, "w") as f:
85
+ json.dump({"text_regions": regions}, f, indent=2, ensure_ascii=False)
86
+
87
+
88
+ def load_ocr_json(path: str | Path) -> list[dict]:
89
+ """Load OCR regions from JSON file."""
90
+ with open(path) as f:
91
+ data = json.load(f)
92
+ return data.get("text_regions", [])
93
+
94
+
95
def draw_ocr_overlay(image_path: str | Path, regions: list[dict]) -> Image.Image:
    """Draw OCR bounding boxes on image for visualization."""
    canvas = Image.open(image_path).convert("RGB")
    pen = ImageDraw.Draw(canvas, "RGBA")
    for region in regions:
        box = region["bbox"]
        corners = [box["x1"], box["y1"], box["x2"], box["y2"]]
        # Solid red outline plus a translucent red fill over each region.
        pen.rectangle(corners, outline=(255, 50, 50), width=3)
        pen.rectangle(corners, fill=(255, 50, 50, 40))
    return canvas
px_image2pptx/pipeline.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """End-to-end pipeline: image → editable PPTX.
2
+
3
+ Orchestrates: OCR → textmask → mask-clip → inpaint → PPTX assembly.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import cv2
13
+ import numpy as np
14
+ from PIL import Image
15
+
16
+ from px_image2pptx.assemble import assemble_pptx
17
+ from px_image2pptx.textmask import compute_masks
18
+
19
+
20
def image_to_pptx(
    image_path: str | Path,
    output_path: str | Path = "output.pptx",
    *,
    ocr_json: str | Path | None = None,
    lang: str = "auto",
    sensitivity: float = 16,
    dilation: int = 12,
    mask_padding: int = 15,
    min_font: int = 8,
    max_font: int = 72,
    skip_inpaint: bool = False,
    work_dir: str | Path | None = None,
) -> dict:
    """Convert a static image to an editable PPTX.

    Args:
        image_path: Input image (PNG/JPG/WebP).
        output_path: Where to save the .pptx file.
        ocr_json: Pre-computed OCR JSON (skip OCR step if provided).
        lang: OCR language ("en", "ch", or "auto" to detect).
        sensitivity: Textmask sensitivity (lower = more aggressive).
        dilation: Textmask dilation in pixels.
        mask_padding: Padding around OCR bboxes for mask clipping.
        min_font: Minimum font size in points.
        max_font: Maximum font size in points.
        skip_inpaint: If True, skip inpainting (use original as background).
        work_dir: Directory for intermediate files (default: temp dir).

    Returns:
        Report dict with pipeline statistics.
    """
    image_path = str(image_path)
    output_path = str(output_path)
    # Per-stage wall-clock seconds, attached to the report at the end.
    timings = {}

    # Work directory for intermediates (only created when explicitly requested)
    save_intermediates = work_dir is not None
    if save_intermediates:
        wdir = Path(work_dir)
        wdir.mkdir(parents=True, exist_ok=True)

    # Step 1: OCR — load pre-computed regions, or run PaddleOCR.
    # Imports are deferred so the optional OCR dependency is only required
    # on this path.
    t0 = time.time()
    if ocr_json:
        from px_image2pptx.ocr import load_ocr_json
        ocr_regions = load_ocr_json(ocr_json)
    else:
        from px_image2pptx.ocr import run_ocr, save_ocr_json

        # "ch" model handles both Chinese and English, so use it as default
        ocr_lang = "ch" if lang == "auto" else lang
        ocr_regions = run_ocr(image_path, lang=ocr_lang)

        if save_intermediates:
            save_ocr_json(ocr_regions, wdir / "text_regions.json")
    timings["ocr"] = round(time.time() - t0, 2)

    # Step 2: Textmask → clip to OCR → dilate
    # Produces: tight ink mask (color sampling), OCR-clipped mask, and the
    # dilated mask actually fed to the inpainter.
    t0 = time.time()
    image_bgr = cv2.imread(image_path)
    tight_mask, clipped_mask, dilated_mask = compute_masks(
        image_bgr, ocr_regions,
        sensitivity=sensitivity, dilation=dilation, padding=mask_padding,
    )
    if save_intermediates:
        Image.fromarray(tight_mask).save(str(wdir / "tight_mask.png"))
        Image.fromarray(clipped_mask).save(str(wdir / "clipped_mask.png"))
        Image.fromarray(dilated_mask).save(str(wdir / "mask.png"))
    timings["textmask"] = round(time.time() - t0, 2)

    # Step 3: Inpaint — optional; LAMA is imported lazily because it pulls
    # in PyTorch. The inpainted background is written either into the work
    # dir or into a temp file removed after assembly.
    background_path = None
    _temp_bg = None
    if not skip_inpaint:
        t0 = time.time()
        from px_image2pptx.inpaint import inpaint

        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        result = inpaint(image_rgb, dilated_mask)

        if save_intermediates:
            bg_path = str(wdir / "background.png")
            Image.fromarray(result).save(bg_path)
            background_path = bg_path
        else:
            import tempfile
            # delete=False so the file survives until assemble_pptx has
            # read it; unlinked explicitly below.
            _temp_bg = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
            Image.fromarray(result).save(_temp_bg.name)
            background_path = _temp_bg.name
        timings["inpaint"] = round(time.time() - t0, 2)

    # Step 4: Assemble PPTX
    t0 = time.time()
    report = assemble_pptx(
        image_path=image_path,
        ocr_regions=ocr_regions,
        output_path=output_path,
        background_path=background_path,
        tight_mask=tight_mask,
        min_font=min_font,
        max_font=max_font,
    )
    timings["assemble"] = round(time.time() - t0, 2)

    # Clean up temp background file
    if _temp_bg is not None:
        import os
        os.unlink(_temp_bg.name)

    report["timings"] = timings
    if save_intermediates:
        report["work_dir"] = str(wdir)
        with open(wdir / "report.json", "w") as f:
            json.dump(report, f, indent=2)

    return report
px_image2pptx/textmask.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text ink detection using classical computer vision.
2
+
3
+ Detects text pixels directly from image using adaptive thresholding,
4
+ connected component filtering, and Canny edge reinforcement. No ML model.
5
+
6
+ Returns both a tight mask (actual ink pixels, for color sampling) and a
7
+ dilated mask (for inpainting with safe coverage).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import cv2
13
+ import numpy as np
14
+
15
+
16
def detect_text_ink(
    image: np.ndarray,
    block_size: int = 25,
    sensitivity: float = 16,
    max_component_pct: float = 2.0,
    min_component_area: int = 8,
    max_density: float = 0.9,
    max_density_area: int = 500,
    edge_neighborhood: int = 15,
    min_final_area: int = 10,
) -> np.ndarray:
    """Locate text-ink pixels with classical CV (no ML model).

    Combines an adaptive threshold with Otsu's global threshold, filters
    the surviving connected components by size/shape/density, reinforces
    the survivors with nearby Canny edges, and finally removes residual
    specks below ``min_final_area``.

    Args:
        image: BGR numpy array (H, W, 3), uint8.
        block_size: Adaptive-threshold window (forced odd, minimum 3).
        sensitivity: Adaptive threshold ``C`` offset; higher = less sensitive.
        max_component_pct: Reject components larger than this % of the image.
        min_component_area: Reject components smaller than this (noise).
        max_density: Together with ``max_density_area``, rejects solid blobs
            whose bbox fill ratio exceeds this value.
        max_density_area: Minimum area before the density filter applies.
        edge_neighborhood: Radius in px for the Canny reinforcement window.
        min_final_area: Components below this size are dropped in cleanup.

    Returns:
        uint8 binary mask (H, W); 255 = text ink, 0 = background.
    """
    height, width = image.shape[:2]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Adaptive threshold demands an odd window of at least 3.
    block = block_size + 1 if block_size % 2 == 0 else block_size
    block = max(block, 3)

    # Step 1: keep only pixels that BOTH the local (adaptive) and global
    # (Otsu) inverse thresholds classify as dark ink.
    inv_adaptive = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,
        blockSize=block, C=sensitivity,
    )
    _, inv_otsu = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU,
    )
    dark = cv2.bitwise_and(inv_adaptive, inv_otsu)

    # Step 2: bridge tiny gaps between stroke fragments.
    dark = cv2.morphologyEx(
        dark, cv2.MORPH_CLOSE,
        cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)),
    )

    # Step 3: per-component filtering.
    n_comp, label_map, comp_stats, _ = cv2.connectedComponentsWithStats(
        dark, connectivity=8,
    )
    area_cap = height * width * (max_component_pct / 100.0)
    ink = np.zeros((height, width), dtype=np.uint8)

    for idx in range(1, n_comp):
        _cx, _cy, comp_w, comp_h, comp_area = comp_stats[idx]
        too_big = comp_area > area_cap
        too_small = comp_area < min_component_area
        spans_image = comp_w > width * 0.3 and comp_h > height * 0.3
        fill_ratio = comp_area / max(comp_w * comp_h, 1)
        solid_blob = fill_ratio > max_density and comp_area > max_density_area
        if too_big or too_small or spans_image or solid_blob:
            continue
        ink[label_map == idx] = 255

    # Step 4: pull in Canny edges that lie close to confirmed ink.
    canny = cv2.Canny(gray, 80, 200)
    diameter = edge_neighborhood * 2 + 1
    near_ink = cv2.dilate(
        ink,
        cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (diameter, diameter)),
    )
    ink = cv2.bitwise_or(ink, cv2.bitwise_and(canny, near_ink))

    # Step 5: close pinholes, then drop any leftover specks.
    ink = cv2.morphologyEx(
        ink, cv2.MORPH_CLOSE,
        cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2)),
    )
    n_clean, clean_labels, clean_stats, _ = cv2.connectedComponentsWithStats(
        ink, connectivity=8,
    )
    result = np.zeros((height, width), dtype=np.uint8)
    for idx in range(1, n_clean):
        if clean_stats[idx, cv2.CC_STAT_AREA] >= min_final_area:
            result[clean_labels == idx] = 255

    return result
111
+
112
+
113
def dilate_mask(mask: np.ndarray, dilation_px: int) -> np.ndarray:
    """Grow a binary mask outward by ``dilation_px`` pixels.

    Returns an untouched copy when the radius is non-positive or the
    mask has no foreground pixels (nothing to grow).
    """
    no_work = dilation_px <= 0 or not np.any(mask)
    if no_work:
        return mask.copy()
    diameter = 2 * dilation_px + 1
    element = cv2.getStructuringElement(
        cv2.MORPH_ELLIPSE, (diameter, diameter),
    )
    return cv2.dilate(mask, element, iterations=1)
122
+
123
+
124
def clip_mask_to_ocr(
    mask: np.ndarray,
    ocr_regions: list[dict],
    padding: int = 15,
) -> np.ndarray:
    """Restrict a text mask to padded OCR bounding boxes.

    Builds a rectangle mask from the OCR bboxes — each expanded by
    ``padding`` and clamped to the image bounds — and intersects it with
    ``mask``, so only pixels inside confirmed text regions survive. This
    keeps illustrations, borders, and icons that the ink detector wrongly
    flagged out of the final mask.
    """
    rows, cols = mask.shape[:2]
    keep = np.zeros_like(mask)

    for region in ocr_regions:
        box = region["bbox"]
        top = max(0, box["y1"] - padding)
        left = max(0, box["x1"] - padding)
        bottom = min(rows, box["y2"] + padding)
        right = min(cols, box["x2"] + padding)
        keep[top:bottom, left:right] = 255

    return np.minimum(mask, keep)
147
+
148
+
149
def compute_masks(
    image_bgr: np.ndarray,
    ocr_regions: list[dict],
    sensitivity: float = 16,
    dilation: int = 12,
    padding: int = 15,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Run the complete textmask pipeline on one image.

    Pipeline: raw ink detection → intersection with padded OCR boxes →
    morphological dilation for safe inpainting coverage.

    Returns:
        ``(tight_mask, clipped_mask, dilated_mask)`` where ``tight_mask``
        holds raw ink pixels (used for color sampling), ``clipped_mask``
        is the ink limited to OCR bboxes, and ``dilated_mask`` adds a
        safety margin for inpainting.
    """
    ink = detect_text_ink(image_bgr, sensitivity=sensitivity)
    confined = clip_mask_to_ocr(ink, ocr_regions, padding=padding)
    expanded = dilate_mask(confined, dilation)
    return ink, confined, expanded
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Pillow>=10.0
2
+ numpy>=1.24
3
+ opencv-python-headless>=4.8
4
+ python-pptx>=0.6.21
5
+ paddleocr>=3.0
6
+ paddlepaddle>=3.0
7
+ simple-lama-inpainting>=0.1.0
8
+ torch>=2.0
9
+ gradio>=4.0