Bengali detection model — mAP50=0.8790

Browse files

Files changed (8) hide show

.gitattributes +2 -0
README.md +69 -0
bengali_det.onnx +3 -0
bengali_det.pt +3 -0
dataset.yaml +6 -0
detection_results.png +3 -0
pipeline.py +144 -0
sample_pages.png +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+detection_results.png filter=lfs diff=lfs merge=lfs -text
+sample_pages.png filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+language: bn
+license: mit
+tags:
+  - object-detection
+  - ocr
+  - bengali
+  - yolov8
+  - document-understanding
+metrics:
+  - map
+---
+# Bengali OCR — Text Detection Model
+**Project:** DocReader BD — CSC4233 NLP, AIUB
+**Architecture:** YOLOv8n (~3.2M params)
+**Task:** Detect word-level bounding boxes in Bengali documents
+**Companion recognition model:** `Sarjinkhan2003/bengali-ocr-recognition`
+## Results
+| Metric | Value |
+|---|---|
+| mAP@0.5 | 0.8790 |
+| mAP@0.5:0.95 | 0.6344 |
+| Precision | 0.8722 |
+| Recall | 0.8519 |
+## Quick start — full pipeline
+```python
+# pip install ultralytics huggingface_hub torch torchvision Pillow
+# Download pipeline.py from this repo into your working directory first so it can be imported
+from pipeline import BengaliDocOCR
+# Load both detection + recognition from HuggingFace
+ocr = BengaliDocOCR.from_hub(device="cuda")  # or "cpu"
+# Run on a document
+result = ocr.read_document("bengali_doc.jpg")
+print(result["text"])            # full text
+for item in result["items"]:     # word-level
+    print(item["bbox"], item["text"])
+```
+## Detection only
+```python
+from ultralytics import YOLO
+from huggingface_hub import hf_hub_download
+det_path = hf_hub_download("Sarjinkhan2003/bengali-ocr-detection", "bengali_det.pt")
+model    = YOLO(det_path)
+results  = model.predict("doc.jpg", conf=0.25)
+for box in results[0].boxes:
+    print(box.xyxy[0].tolist(), box.conf[0].item())
+```
+## Files
+| File | Description |
+|---|---|
+| `bengali_det.pt` | YOLOv8 weights (PyTorch) |
+| `bengali_det.onnx` | ONNX export (CPU-friendly) |
+| `pipeline.py` | Combined detection + recognition pipeline |
+| `dataset.yaml` | Dataset config used for training |
+## Training data
+- BN-HTRd: real annotated Bengali handwritten document pages
+- 3,000 synthetic pages (auto-generated with Pillow)

bengali_det.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1fdc69233343dd24c1686dedf33f25f0ea5723b3cc30e9ce158e3bce3ea5826e
+size 12391968

bengali_det.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80aa25413c7ae2cec9c9ce9366b3df59ad05d8c81ba809f22c2e30d76e582ad6
+size 6217706

dataset.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+names:
+- word
+nc: 1
+path: /content/detection_data
+train: images/train
+val: images/val

detection_results.png ADDED Viewed

Git LFS Details

SHA256: 9969185671b19c09324534c1a710e94029bbb8549f06b3de90b857ddf2ebcc09
Pointer size: 132 Bytes
Size of remote file: 1.49 MB

pipeline.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+Bengali OCR — Full Pipeline
+Detection (YOLOv8) + Recognition (BengaliCRNN)
+Usage:
+    from pipeline import BengaliDocOCR
+    ocr = BengaliDocOCR.from_hub()
+    result = ocr.read_document("page.jpg")
+    print(result["text"])
+"""
+import json, os, torch
+from pathlib import Path
+from PIL import Image
+from torchvision import transforms
+from huggingface_hub import hf_hub_download
+DETECT_REPO = "Sarjinkhan2003/bengali-ocr-detection"  # Hub repo id that from_hub() downloads bengali_det.pt from
+RECOG_REPO  = "Sarjinkhan2003/bengali-ocr-recognition"  # Hub repo id that from_hub() downloads bengali_crnn.py, bengali_crnn.pth and vocab.json from
+class BengaliDocOCR:
+    """
+    Full Bengali document OCR pipeline.
+    Combines:
+      - YOLOv8n text detection
+      - LightCRNN text recognition
+    """
+    def __init__(self, det_model, rec_model, idx2char,
+                 img_h=64, img_w=256, device="cpu"):
+        self.det     = det_model
+        self.rec     = rec_model.to(device).eval()
+        self.idx2char= idx2char
+        self.device  = device
+        self.img_h   = img_h
+        self.img_w   = img_w
+        self.tf = transforms.Compose([
+            transforms.Grayscale(1),
+            transforms.Resize((img_h, img_w)),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5],[0.5])
+        ])
+    @classmethod
+    def from_hub(cls, device="cpu"):
+        """Download both models from HuggingFace and build pipeline."""
+        from ultralytics import YOLO
+        import importlib.util
+        # Detection model
+        det_path = hf_hub_download(DETECT_REPO, "bengali_det.pt")
+        det_model = YOLO(det_path)
+        # Recognition model
+        net_path   = hf_hub_download(RECOG_REPO, "bengali_crnn.py")
+        ckpt_path  = hf_hub_download(RECOG_REPO, "bengali_crnn.pth")
+        vocab_path = hf_hub_download(RECOG_REPO, "vocab.json")
+        spec = importlib.util.spec_from_file_location("bengali_crnn", net_path)
+        mod  = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+        vocab    = json.load(open(vocab_path, encoding="utf-8"))
+        idx2char = {int(k): v for k,v in vocab["idx2char"].items()}
+        rec_model = mod.Model(1, 256, 256, vocab["num_classes"])
+        ckpt = torch.load(ckpt_path, map_location=device)
+        rec_model.load_state_dict(ckpt["model_state_dict"])
+        return cls(det_model, rec_model, idx2char, device=device)
+    def _recognize(self, crop):
+        """Run recognition on a single cropped word image."""
+        tensor = self.tf(crop).unsqueeze(0).to(self.device)
+        with torch.no_grad():
+            out = self.rec(tensor)
+        _, preds = out.permute(1,0,2).max(2)
+        chars, prev = [], None
+        for p in preds[0].tolist():
+            if p != 0 and p != prev:
+                chars.append(self.idx2char.get(p, ""))
+            prev = p
+        return "".join(chars)
+    def _sort_boxes(self, boxes):
+        """
+        Sort detected boxes in reading order:
+        top-to-bottom, left-to-right within each row.
+        Rows are grouped by vertical proximity.
+        """
+        if not boxes:
+            return boxes
+        # Sort by y-center first
+        boxes_sorted = sorted(boxes, key=lambda b: (b[1]+b[3])/2)
+        if len(boxes_sorted) == 0:
+            return boxes_sorted
+        # Group into rows (boxes within LINE_THRESH of each other = same row)
+        line_thresh = max(10, (boxes_sorted[0][3] - boxes_sorted[0][1]) * 0.6)
+        rows, current_row = [], [boxes_sorted[0]]
+        for b in boxes_sorted[1:]:
+            cy_prev = (current_row[-1][1] + current_row[-1][3]) / 2
+            cy_curr = (b[1] + b[3]) / 2
+            if abs(cy_curr - cy_prev) < line_thresh:
+                current_row.append(b)
+            else:
+                rows.append(sorted(current_row, key=lambda b: b[0]))  # sort by x
+                current_row = [b]
+        rows.append(sorted(current_row, key=lambda b: b[0]))
+        return [b for row in rows for b in row]
+    def read_document(self, image_path, conf=0.25):
+        """
+        Full pipeline: detect → sort → recognize → assemble.
+        Returns dict:
+          text     : full document text string
+          items    : list of {"bbox": [x1,y1,x2,y2], "text": str, "conf": float}
+          pageCount: 1
+        """
+        img     = Image.open(image_path).convert("RGB")
+        results = self.det.predict(image_path, conf=conf, verbose=False)
+        boxes   = [box.xyxy[0].tolist() + [box.conf[0].item()]
+                   for box in results[0].boxes]
+        # Sort into reading order
+        boxes_xy = [[b[0],b[1],b[2],b[3]] for b in boxes]
+        sorted_boxes = self._sort_boxes(boxes_xy)
+        items, texts = [], []
+        for bbox in sorted_boxes:
+            x1, y1, x2, y2 = [int(v) for v in bbox]
+            crop = img.crop((x1, y1, x2, y2))
+            if crop.width < 4 or crop.height < 4:
+                continue
+            text = self._recognize(crop)
+            if text.strip():
+                items.append({"bbox": [x1,y1,x2,y2], "text": text})
+                texts.append(text)
+        return {
+            "text"      : " ".join(texts),
+            "items"     : items,
+            "pageCount" : 1
+        }

sample_pages.png ADDED Viewed

Git LFS Details

SHA256: 05cffadc7cac710a0974e8d49bdf64c35eb2b05401d8cd58afd1ef7df69f8bb5
Pointer size: 132 Bytes
Size of remote file: 1.03 MB