Harry Pham committed on
Commit
4fef2dd
Β·
0 Parent(s):

init space

Browse files
Files changed (3) hide show
  1. .gitignore +9 -0
  2. app.py +150 -0
  3. src/inference.py +307 -0
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Dataset/
2
+ outputs/
3
+ venv/
4
+ __pycache__/
5
+ *.pyc
6
+ *.pkl
7
+ *.h5
8
+ *.log
9
+ *.json
app.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py β€” Gradio demo
2
+ # Đặt ở root project: engineering-drawing-ai/app.py
3
+
4
+ import os, sys, json, tempfile
5
+ import gradio as gr
6
+ import cv2
7
+ import numpy as np
8
+ from PIL import Image
9
+
10
# Auto-download the model weights from the HuggingFace Hub when missing.
CHECKPOINT = "best.pt"  # local filename of the detection checkpoint
HF_REPO = "phamha/drawing-model-weights"  # <- update to the real repo id later
13
+
14
def ensure_weights():
    """Download the model checkpoint from the HuggingFace Hub if missing.

    Looks for CHECKPOINT in the working directory; when absent, fetches
    "best.pt" from HF_REPO into the current directory.

    Raises:
        Exception: re-raises whatever hf_hub_download raised, after logging,
            so the app fails fast instead of starting without weights.
    """
    if not os.path.exists(CHECKPOINT):
        print("[INFO] Downloading model weights from HuggingFace...")
        try:
            from huggingface_hub import hf_hub_download
            # NOTE: `local_dir_use_symlinks` is deprecated and ignored in
            # recent huggingface_hub releases; `local_dir` alone is enough.
            hf_hub_download(
                repo_id=HF_REPO,
                filename="best.pt",
                local_dir=".",
            )
            print("[INFO] Weights downloaded.")
        except Exception as e:
            print(f"[ERROR] Cannot download weights: {e}")
            raise

ensure_weights()
31
+
32
+ # Import pipeline SAU KHI Δ‘αΊ£m bαΊ£o weights tα»“n tαΊ‘i
33
+ sys.path.insert(0, ".")
34
+ from src.inference import run_pipeline
35
+
36
+
37
# ── Image processing ──────────────────────────────────────
def process(image: Image.Image):
    """Gradio handler: detect regions and OCR an uploaded drawing.

    Args:
        image: uploaded PIL image, or None when nothing was provided.

    Returns:
        (annotated RGB ndarray or None, JSON string, OCR summary or error
        text). On failure the first element is None and the third carries
        the error description.
    """
    if image is None:
        return None, "{}", "Chưa có ảnh."

    import shutil  # local import: only needed for temp-dir cleanup below

    # Isolated scratch dir so concurrent requests cannot collide.
    tmp_dir = tempfile.mkdtemp()
    tmp_path = os.path.join(tmp_dir, "input.jpg")
    image.save(tmp_path, quality=95)

    try:
        result, vis_path = run_pipeline(
            image_path = tmp_path,
            output_dir = tmp_dir,
            checkpoint = CHECKPOINT,
            conf_thresh = 0.3,
        )

        # Annotated result image (pipeline writes BGR to disk).
        vis_bgr = cv2.imread(vis_path)
        if vis_bgr is None:
            # Visualization missing/unreadable: report instead of crashing
            # inside cvtColor below.
            return None, "{}", f"Lỗi pipeline:\ncannot read visualization: {vis_path}"
        vis_rgb = cv2.cvtColor(vis_bgr, cv2.COLOR_BGR2RGB)

        # Clean JSON (drop crop_path — it points into the temp dir).
        clean_objs = []
        for obj in result["objects"]:
            clean_objs.append({
                "id": obj["id"],
                "class": obj["class"],
                "confidence": obj["confidence"],
                "bbox": obj["bbox"],
                "ocr_content": obj["ocr_content"],
            })
        json_str = json.dumps(
            {"image": result["image"], "objects": clean_objs},
            ensure_ascii=False, indent=2
        )

        # Human-readable OCR text for Note / Table regions.
        ocr_parts = []
        for obj in result["objects"]:
            content = obj.get("ocr_content")
            if not content:
                continue
            if isinstance(content, dict):  # Table result: {"rows", "text"}
                content = content.get("text", "")
            if not content.strip():
                continue
            sep = "─" * 46
            ocr_parts.append(
                f"{sep}\n"
                f"[{obj['class']} #{obj['id']}] conf={obj['confidence']}\n"
                f"{sep}\n{content}"
            )
        ocr_text = "\n\n".join(ocr_parts) or "Không phát hiện Note / Table."

        return vis_rgb, json_str, ocr_text
    except Exception:
        # Surface the full traceback in the UI rather than a blank failure.
        import traceback
        return None, "{}", f"Lỗi pipeline:\n{traceback.format_exc()}"
    finally:
        # Everything returned lives in memory, so the scratch dir can go
        # (fixes unbounded temp-dir accumulation, one dir per request).
        shutil.rmtree(tmp_dir, ignore_errors=True)
95
+
96
+
97
# ── Gradio UI ───────────────────────────────────────────────
# Layout: upload + button next to the annotated detection image,
# then the JSON output side by side with the extracted OCR text.
with gr.Blocks(title="Engineering Drawing Analyzer", theme=gr.themes.Soft()) as demo:

    # App header (user-facing text, intentionally in Vietnamese).
    gr.Markdown("""
# 🔧 Engineering Drawing Analyzer
**Tự động phát hiện và trích xuất văn bản từ bản vẽ kỹ thuật**

Hỗ trợ 3 loại vùng:
- 🟢 **PartDrawing** — vùng bản vẽ chi tiết
- 🟠 **Note** — ghi chú, chú thích
- 🔴 **Table** — bảng dữ liệu kỹ thuật
""")

    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Image(type="pil", label="📐 Upload bản vẽ kỹ thuật")
            btn = gr.Button("🔍 Detect & OCR", variant="primary", size="lg")

        with gr.Column(scale=1):
            out_img = gr.Image(label="✅ Kết quả detection")

    with gr.Row():
        with gr.Column(scale=1):
            out_json = gr.Code(
                language="json",
                label="📋 JSON output",
                lines=25,
            )
        with gr.Column(scale=1):
            out_ocr = gr.Textbox(
                label="📝 OCR content (Note & Table)",
                lines=25,
                max_lines=60,
            )

    # Wire the button to the processing handler defined above.
    btn.click(
        fn = process,
        inputs = [inp],
        outputs = [out_img, out_json, out_ocr],
    )

    # Footer with model / metric information.
    gr.Markdown("""
---
**Model:** RT-DETR-L fine-tuned | **OCR:** EasyOCR (vi+en) + PaddleOCR fallback
**mAP50:** 0.942 | **Dataset:** Engineering drawings (Vietnamese technical)
""")
143
+
144
+
145
if __name__ == "__main__":
    # Bind to all interfaces on the standard HF Spaces port.
    demo.launch(
        server_name = "0.0.0.0",
        server_port = 7860,
        share = False,  # set True for a temporary public link
    )
src/inference.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# src/inference.py
# ── torch.load patch — MUST RUN BEFORE ANY CHECKPOINT LOAD ──
import torch

# Ultralytics checkpoints contain full pickled objects, which newer
# torch releases reject under the weights_only=True default.  Wrap
# torch.load so callers who do not say otherwise get weights_only=False;
# an explicit weights_only= from the caller still wins.
_original_load = torch.load

def _load_full_objects(*args, **kwargs):
    """torch.load with weights_only defaulting to False."""
    if "weights_only" not in kwargs:
        kwargs["weights_only"] = False
    return _original_load(*args, **kwargs)

torch.load = _load_full_objects
# ───────────────────────────────────────────────────────────
10
+
11
+ import cv2
12
+ import json
13
+ import numpy as np
14
+ from pathlib import Path
15
+ from ultralytics import RTDETR
16
+
17
# ── Device ─────────────────────────────────────────────────
# Prefer Apple-silicon MPS when available, otherwise CPU.
# NOTE(review): CUDA is never probed here — confirm that is intended.
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"[INFO] Device: {DEVICE}")

# ── Class config ────────────────────────────────────────────
# Index order is assumed to match the checkpoint's label order
# (run_pipeline indexes CLASS_NAMES by the model's class id) — confirm
# against the training config.
CLASS_NAMES = ["note", "part-drawing", "table"]
# Raw class name -> display name used in JSON output and crop filenames.
CLASS_DISPLAY = {
    "note": "Note",
    "part-drawing": "PartDrawing",
    "table": "Table",
}
# Per-class box colors in BGR (OpenCV convention).
COLORS = {
    "note": (0, 165, 255),
    "part-drawing": (0, 200, 0),
    "table": (0, 0, 220),
}
33
+
34
# ───────────────────────────────────────────────────────────
# DETECTION MODEL
# ───────────────────────────────────────────────────────────
_det_model = None

def get_det_model(checkpoint: str = "best.pt") -> RTDETR:
    """Return the RT-DETR detector, loading it once and caching it.

    The checkpoint argument only matters on the first call; later calls
    return the cached instance regardless.
    """
    global _det_model
    if _det_model is not None:
        return _det_model
    print(f"[INFO] Loading detection model: {checkpoint}")
    _det_model = RTDETR(checkpoint)
    return _det_model
45
+
46
+
47
# ───────────────────────────────────────────────────────────
# OCR ENGINES
# ───────────────────────────────────────────────────────────
_easy_reader = None
_paddle_engine = None

def get_easy_reader():
    """Lazily build and cache the EasyOCR reader (Vietnamese + English)."""
    global _easy_reader
    if _easy_reader is not None:
        return _easy_reader
    import easyocr
    print("[INFO] Loading EasyOCR (vi + en)...")
    _easy_reader = easyocr.Reader(
        ["vi", "en"],
        gpu=False,
        verbose=False,
    )
    return _easy_reader
64
+
65
+
66
def get_paddle_engine():
    """Lazily build and cache the PaddleOCR engine (Vietnamese, CPU)."""
    global _paddle_engine
    if _paddle_engine is not None:
        return _paddle_engine
    from paddleocr import PaddleOCR
    print("[INFO] Loading PaddleOCR (vi)...")
    _paddle_engine = PaddleOCR(
        use_angle_cls=True,
        lang="vi",
        show_log=False,
        use_gpu=False,
    )
    return _paddle_engine
78
+
79
+
80
# ───────────────────────────────────────────────────────────
# PREPROCESSING
# ───────────────────────────────────────────────────────────
def preprocess_for_ocr(img_bgr: np.ndarray) -> np.ndarray:
    """Enhance a cropped region before OCR.

    Steps: upscale crops narrower than 800 px, grayscale, local-contrast
    boost (CLAHE), non-local-means denoise, then sharpen.  Returns a
    3-channel BGR image because the OCR engines take color input.
    """
    height, width = img_bgr.shape[:2]

    # Upscale small crops (heuristic 800 px minimum width).
    if width < 800:
        factor = 800 / width
        new_size = (int(width * factor), int(height * factor))
        img_bgr = cv2.resize(img_bgr, new_size, interpolation=cv2.INTER_CUBIC)

    work = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    work = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(work)
    work = cv2.fastNlMeansDenoising(
        work, h=15, templateWindowSize=7, searchWindowSize=21
    )
    # 3x3 sharpening convolution kernel.
    sharpen = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    work = cv2.filter2D(work, -1, sharpen)

    return cv2.cvtColor(work, cv2.COLOR_GRAY2BGR)
105
+
106
+
107
# ───────────────────────────────────────────────────────────
# OCR: NOTE
# ───────────────────────────────────────────────────────────
def ocr_note(img_path: str) -> str:
    """OCR a 'note' crop into plain text, one detected line per row.

    Tries EasyOCR first; when it raises or recognizes nothing, falls
    back to PaddleOCR.  Detections below 0.2 confidence are discarded.
    Returns "" if the image is unreadable or no text is found.
    """
    raw = cv2.imread(img_path)
    if raw is None:
        return ""

    prepared = preprocess_for_ocr(raw)

    # Primary engine: EasyOCR.
    try:
        detections = get_easy_reader().readtext(
            prepared, detail=1, paragraph=False, width_ths=0.7, height_ths=0.7
        )
        texts = [txt for (_, txt, score) in detections if score >= 0.2 and txt.strip()]
        if texts:
            return "\n".join(texts)
    except Exception as e:
        print(f"[WARN] EasyOCR note: {e}")

    # Fallback engine: PaddleOCR.
    try:
        paddle_out = get_paddle_engine().ocr(prepared, cls=True)
        if paddle_out and paddle_out[0]:
            return "\n".join(
                entry[1][0] for entry in paddle_out[0] if entry[1][1] >= 0.2
            )
    except Exception as e:
        print(f"[WARN] PaddleOCR note: {e}")

    return ""
138
+
139
+
140
# ───────────────────────────────────────────────────────────
# OCR: TABLE
# ───────────────────────────────────────────────────────────
def _group_rows(items: list) -> list:
    """Cluster OCR fragments into table rows by vertical position.

    Each item is a dict with "text", "x", "y".  Fragments whose y-gap to
    the previous fragment is below an adaptive threshold share a row;
    each row is then ordered left-to-right by x.  Returns a list of
    rows, each a list of text strings.
    """
    if not items:
        return []

    ordered = sorted(items, key=lambda it: it["y"])

    # Adaptive row threshold: 60% of the mean vertical gap, floored at
    # 8 px; fixed 12 px when only one fragment exists.
    ys = [it["y"] for it in ordered]
    gaps = [b - a for a, b in zip(ys, ys[1:])]
    thresh = max(8, (sum(gaps) / len(gaps)) * 0.6) if gaps else 12

    grouped = []
    current = [ordered[0]]
    for frag in ordered[1:]:
        if frag["y"] - current[-1]["y"] < thresh:
            current.append(frag)
        else:
            grouped.append(
                [f["text"] for f in sorted(current, key=lambda it: it["x"])]
            )
            current = [frag]
    grouped.append([f["text"] for f in sorted(current, key=lambda it: it["x"])])
    return grouped
165
+
166
+
167
def _poly_center(pts) -> tuple:
    """Return the (x, y) centroid of a polygon given as (x, y) pairs."""
    n = len(pts)
    return (sum(p[0] for p in pts) / n, sum(p[1] for p in pts) / n)


def ocr_table(img_path: str) -> dict:
    """OCR a 'table' crop into rows of cell text.

    Runs EasyOCR first; when it yields nothing (or raises), retries with
    PaddleOCR.  Fragments below 0.2 confidence are discarded and the
    rest are grouped into rows by _group_rows.

    Returns:
        dict with "rows" (list of lists of cell strings) and "text"
        (rows joined with " | " and newlines); both empty on failure.
    """
    img = cv2.imread(img_path)
    if img is None:
        return {"rows": [], "text": ""}

    img_proc = preprocess_for_ocr(img)
    items = []

    # EasyOCR
    try:
        reader = get_easy_reader()
        results = reader.readtext(img_proc, detail=1, paragraph=False,
                                  width_ths=0.5, height_ths=0.5)
        for (pts, text, conf) in results:
            if conf < 0.2 or not text.strip():
                continue
            # Was hardcoded /4 (assumed quad boxes); now any polygon size.
            cx, cy = _poly_center(pts)
            items.append({"text": text.strip(), "y": cy, "x": cx})
    except Exception as e:
        print(f"[WARN] EasyOCR table: {e}")

    # Fallback PaddleOCR (only when EasyOCR produced no fragments)
    if not items:
        try:
            ocr = get_paddle_engine()
            result = ocr.ocr(img_proc, cls=True)
            if result and result[0]:
                for line in result[0]:
                    pts, (text, conf) = line[0], line[1]
                    if conf < 0.2 or not text.strip():
                        continue
                    cx, cy = _poly_center(pts)
                    items.append({"text": text.strip(), "y": cy, "x": cx})
        except Exception as e:
            print(f"[WARN] PaddleOCR table: {e}")

    if not items:
        return {"rows": [], "text": ""}

    rows = _group_rows(items)
    return {
        "rows": rows,
        "text": "\n".join(" | ".join(r) for r in rows),
    }
217
+
218
+
219
# ───────────────────────────────────────────────────────────
# MAIN PIPELINE
# ───────────────────────────────────────────────────────────
def run_pipeline(
    image_path: str,
    output_dir: str = "outputs",
    checkpoint: str = "best.pt",
    conf_thresh: float = 0.3,
) -> tuple:
    """Detect regions in a drawing, OCR Note/Table crops, save artifacts.

    Args:
        image_path: path of the drawing image to analyze.
        output_dir: root directory for crops, visualization, and JSON.
        checkpoint: detection weights path passed to get_det_model
            (only honored on the first call — the model is cached).
        conf_thresh: minimum detection confidence to keep.

    Returns:
        (result, vis_path): result is {"image": name, "objects": [...]}
        where each object carries id/class/confidence/bbox/crop_path/
        ocr_content; vis_path is the annotated image written to disk.

    Raises:
        ValueError: when OpenCV cannot read the input image.
    """
    image_path = str(image_path)
    img_name = Path(image_path).name
    stem = Path(image_path).stem
    crop_dir = Path(output_dir) / stem / "crops"
    crop_dir.mkdir(parents=True, exist_ok=True)

    # 1. Detect regions with RT-DETR.
    model = get_det_model(checkpoint)
    results = model(image_path, imgsz=1024, conf=conf_thresh,
                    iou=0.5, device=DEVICE, verbose=False)

    img_bgr = cv2.imread(image_path)
    if img_bgr is None:
        raise ValueError(f"Không đọc được ảnh: {image_path}")

    objects = []

    for i, box in enumerate(results[0].boxes):
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
        cls_idx = int(box.cls[0])
        conf_val = round(float(box.conf[0]), 4)
        cls_raw = CLASS_NAMES[cls_idx]
        cls_show = CLASS_DISPLAY[cls_raw]

        # 2. Crop the detected region with a small padding margin,
        #    clamped to the image bounds.
        pad = 6
        crop = img_bgr[max(0,y1-pad):min(img_bgr.shape[0],y2+pad),
                       max(0,x1-pad):min(img_bgr.shape[1],x2+pad)]
        crop_path = str(crop_dir / f"{cls_show}_{i+1}.jpg")
        cv2.imwrite(crop_path, crop, [cv2.IMWRITE_JPEG_QUALITY, 95])

        # 3. OCR — only Note and Table regions carry extractable text;
        #    PartDrawing keeps ocr_content = None.
        ocr_content = None
        if cls_raw == "note":
            print(f"[OCR] Note #{i+1}...")
            ocr_content = ocr_note(crop_path)
            print(f"  → {repr(ocr_content[:80]) if ocr_content else 'EMPTY'}")
        elif cls_raw == "table":
            print(f"[OCR] Table #{i+1}...")
            ocr_content = ocr_table(crop_path)
            print(f"  → {repr(ocr_content.get('text','')[:80]) if ocr_content else 'EMPTY'}")

        objects.append({
            "id": i + 1,
            "class": cls_show,
            "confidence": conf_val,
            "bbox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
            "crop_path": crop_path,
            "ocr_content": ocr_content,
        })

        # 4. Draw the bbox plus a filled label banner above it.
        color = COLORS[cls_raw]
        cv2.rectangle(img_bgr, (x1, y1), (x2, y2), color, 2)
        label = f"{cls_show} {conf_val:.2f}"
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
        cv2.rectangle(img_bgr, (x1, y1-th-10), (x1+tw+8, y1), color, -1)
        cv2.putText(img_bgr, label, (x1+4, y1-4),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 2)

    # 5. Save the annotated visualization.
    vis_path = str(Path(output_dir) / stem / "result_vis.jpg")
    cv2.imwrite(vis_path, img_bgr)

    # 6. Save the JSON result.
    result = {"image": img_name, "objects": objects}
    json_path = str(Path(output_dir) / stem / "result.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"\n[✓] {len(objects)} objects | vis→{vis_path} | json→{json_path}")
    return result, vis_path
300
+
301
+
302
# ── CLI ──────────────────────────────────────────────────────
if __name__ == "__main__":
    import sys
    # Run the pipeline on the given image (default: test.jpg) and dump
    # the resulting JSON to stdout.
    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else "test.jpg"
    result, _vis = run_pipeline(target)
    print(json.dumps(result, ensure_ascii=False, indent=2))