Sarjinkhan2003 committed on
Commit
f8800e2
·
verified ·
1 Parent(s): 50a12b9

Bengali detection model — mAP50=0.8790

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ detection_results.png filter=lfs diff=lfs merge=lfs -text
37
+ sample_pages.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: bn
3
+ license: mit
4
+ tags:
5
+ - object-detection
6
+ - ocr
7
+ - bengali
8
+ - yolov8
9
+ - document-understanding
10
+ metrics:
11
+ - map
12
+ ---
13
+
14
+ # Bengali OCR — Text Detection Model
15
+
16
+ **Project:** DocReader BD — CSC4233 NLP, AIUB
17
+ **Architecture:** YOLOv8n (~3.2M params)
18
+ **Task:** Detect word-level bounding boxes in Bengali documents
19
+ **Companion recognition model:** `Sarjinkhan2003/bengali-ocr-recognition`
20
+
21
+ ## Results
22
+
23
+ | Metric | Value |
24
+ |---|---|
25
+ | mAP@0.5 | 0.8790 |
26
+ | mAP@0.5:0.95 | 0.6344 |
27
+ | Precision | 0.8722 |
28
+ | Recall | 0.8519 |
29
+
30
+ ## Quick start — full pipeline
31
+
32
+ ```python
33
+ # pip install ultralytics huggingface_hub torch torchvision Pillow
34
+ from pipeline import BengaliDocOCR
35
+
36
+ # Load both detection + recognition from HuggingFace
37
+ ocr = BengaliDocOCR.from_hub(device="cuda") # or "cpu"
38
+
39
+ # Run on a document
40
+ result = ocr.read_document("bengali_doc.jpg")
41
+ print(result["text"]) # full text
42
+ for item in result["items"]: # word-level
43
+ print(item["bbox"], item["text"])
44
+ ```
45
+
46
+ ## Detection only
47
+
48
+ ```python
49
+ from ultralytics import YOLO
50
+ from huggingface_hub import hf_hub_download
51
+
52
+ det_path = hf_hub_download("Sarjinkhan2003/bengali-ocr-detection", "bengali_det.pt")
53
+ model = YOLO(det_path)
54
+ results = model.predict("doc.jpg", conf=0.25)
55
+ for box in results[0].boxes:
56
+ print(box.xyxy[0].tolist(), box.conf[0].item())
57
+ ```
58
+
59
+ ## Files
60
+ | File | Description |
61
+ |---|---|
62
+ | `bengali_det.pt` | YOLOv8 weights (PyTorch) |
63
+ | `bengali_det.onnx` | ONNX export (CPU-friendly) |
64
+ | `pipeline.py` | Combined detection + recognition pipeline |
65
+ | `dataset.yaml` | Dataset config used for training |
66
+
67
+ ## Training data
68
+ - BN-HTRd: real annotated Bengali handwritten document pages
69
+ - 3,000 synthetic pages (auto-generated with Pillow)
bengali_det.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fdc69233343dd24c1686dedf33f25f0ea5723b3cc30e9ce158e3bce3ea5826e
3
+ size 12391968
bengali_det.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80aa25413c7ae2cec9c9ce9366b3df59ad05d8c81ba809f22c2e30d76e582ad6
3
+ size 6217706
dataset.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ names:
2
+ - word
3
+ nc: 1
4
+ path: /content/detection_data
5
+ train: images/train
6
+ val: images/val
detection_results.png ADDED

Git LFS Details

  • SHA256: 9969185671b19c09324534c1a710e94029bbb8549f06b3de90b857ddf2ebcc09
  • Pointer size: 132 Bytes
  • Size of remote file: 1.49 MB
pipeline.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ Bengali OCR — Full Pipeline
4
+ Detection (YOLOv8) + Recognition (BengaliCRNN)
5
+
6
+ Usage:
7
+ from pipeline import BengaliDocOCR
8
+ ocr = BengaliDocOCR.from_hub()
9
+ result = ocr.read_document("page.jpg")
10
+ print(result["text"])
11
+ """
12
+ import json, os, torch
13
+ from pathlib import Path
14
+ from PIL import Image
15
+ from torchvision import transforms
16
+ from huggingface_hub import hf_hub_download
17
+
18
DETECT_REPO = "Sarjinkhan2003/bengali-ocr-detection"
RECOG_REPO = "Sarjinkhan2003/bengali-ocr-recognition"


class BengaliDocOCR:
    """
    Full Bengali document OCR pipeline.

    Combines:
      - a YOLO text detector that proposes word boxes
      - a CRNN-style recognizer that transcribes each cropped box
    """

    # Class index skipped while decoding the recognizer output.
    # NOTE(review): presumably the CTC blank symbol - confirm against the
    # recognition model's vocabulary.
    _CTC_BLANK = 0
    # Crops narrower or shorter than this many pixels are not recognized.
    _MIN_CROP_SIDE = 4

    def __init__(self, det_model, rec_model, idx2char,
                 img_h=64, img_w=256, device="cpu"):
        """
        Build a pipeline from already-loaded models.

        Args:
            det_model: detector exposing ``predict(source, conf=..., verbose=...)``.
            rec_model: recognition module; moved to ``device`` and put in
                eval mode here.
            idx2char: mapping from predicted class index to output string.
            img_h: height every word crop is resized to before recognition.
            img_w: width every word crop is resized to before recognition.
            device: torch device string used for the recognizer and its inputs.
        """
        self.det = det_model
        self.rec = rec_model.to(device).eval()
        self.idx2char = idx2char
        self.device = device
        self.img_h = img_h
        self.img_w = img_w
        # Crop preprocessing: single-channel, fixed size, scaled to [-1, 1].
        self.tf = transforms.Compose([
            transforms.Grayscale(1),
            transforms.Resize((img_h, img_w)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ])

    @classmethod
    def from_hub(cls, device="cpu"):
        """
        Download both models from the Hugging Face Hub and build the pipeline.

        Args:
            device: torch device string for the recognition model.

        Returns:
            A ready-to-use ``BengaliDocOCR`` instance.
        """
        from ultralytics import YOLO
        import importlib.util

        # Detection model
        det_path = hf_hub_download(DETECT_REPO, "bengali_det.pt")
        det_model = YOLO(det_path)

        # Recognition model: network definition, weights and vocabulary
        net_path = hf_hub_download(RECOG_REPO, "bengali_crnn.py")
        ckpt_path = hf_hub_download(RECOG_REPO, "bengali_crnn.pth")
        vocab_path = hf_hub_download(RECOG_REPO, "vocab.json")

        # SECURITY NOTE: this imports and executes Python source downloaded
        # from RECOG_REPO. Only use with a repository you trust.
        spec = importlib.util.spec_from_file_location("bengali_crnn", net_path)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)

        # Context manager so the vocab file handle is closed deterministically.
        with open(vocab_path, encoding="utf-8") as fh:
            vocab = json.load(fh)
        # JSON object keys are strings; the decoder looks indices up as ints.
        idx2char = {int(k): v for k, v in vocab["idx2char"].items()}

        # NOTE(review): the meaning of the positional arguments (1, 256, 256)
        # is defined by bengali_crnn.py, which is not visible here.
        rec_model = mod.Model(1, 256, 256, vocab["num_classes"])
        # SECURITY NOTE: torch.load unpickles the checkpoint; loading an
        # untrusted file can execute arbitrary code.
        ckpt = torch.load(ckpt_path, map_location=device)
        rec_model.load_state_dict(ckpt["model_state_dict"])

        return cls(det_model, rec_model, idx2char, device=device)

    def _recognize(self, crop):
        """
        Run recognition on a single cropped word image and return its text.

        Decoding is greedy: take the best class per step, collapse
        consecutive repeats, and drop the blank index.
        """
        tensor = self.tf(crop).unsqueeze(0).to(self.device)
        with torch.no_grad():
            out = self.rec(tensor)
        # NOTE(review): the permute assumes the recognizer returns a 3-D
        # tensor laid out as (steps, batch, classes) - confirm with the model.
        _, preds = out.permute(1, 0, 2).max(2)

        chars, prev = [], None
        for p in preds[0].tolist():
            if p != self._CTC_BLANK and p != prev:
                # Indices missing from the vocabulary decode to nothing.
                chars.append(self.idx2char.get(p, ""))
            prev = p
        return "".join(chars)

    def _sort_boxes(self, boxes):
        """
        Sort detected boxes in reading order.

        Boxes are ordered top-to-bottom by vertical centre, grouped into rows,
        and ordered left-to-right inside each row. Only indices 0-3
        (x1, y1, x2, y2) of every box are read, so boxes may carry extra
        trailing fields such as a confidence score.

        Args:
            boxes: list of ``[x1, y1, x2, y2, ...]`` sequences.

        Returns:
            A new flat list of the same boxes in reading order (the input
            itself when it is empty).
        """
        if not boxes:
            return boxes

        def y_center(b):
            return (b[1] + b[3]) / 2

        def x_left(b):
            return b[0]

        by_y = sorted(boxes, key=y_center)
        # Row tolerance: 60% of the top-most box's height, at least 10 px.
        line_thresh = max(10, (by_y[0][3] - by_y[0][1]) * 0.6)

        rows, current_row = [], [by_y[0]]
        for b in by_y[1:]:
            # Compare against the last box added to the row, not the row's
            # first box, so a gently sloping line stays in one row.
            if abs(y_center(b) - y_center(current_row[-1])) < line_thresh:
                current_row.append(b)
            else:
                rows.append(sorted(current_row, key=x_left))
                current_row = [b]
        rows.append(sorted(current_row, key=x_left))
        return [b for row in rows for b in row]

    def read_document(self, image_path, conf=0.25):
        """
        Full pipeline: detect -> sort -> recognize -> assemble.

        Args:
            image_path: path of the page image to read.
            conf: minimum detection confidence passed to the detector.

        Returns dict:
            text      : recognized words joined by single spaces
            items     : list of {"bbox": [x1,y1,x2,y2], "text": str, "conf": float}
            pageCount : 1
        """
        img = Image.open(image_path).convert("RGB")
        # Detect on the very image that is cropped below, so box coordinates
        # and crops share one pixel frame and the file is decoded only once.
        results = self.det.predict(img, conf=conf, verbose=False)
        detections = [box.xyxy[0].tolist() + [box.conf[0].item()]
                      for box in results[0].boxes]

        # The confidence rides along as a fifth field through sorting.
        ordered = self._sort_boxes(detections)

        items, texts = [], []
        for det in ordered:
            x1, y1, x2, y2 = (int(v) for v in det[:4])
            score = det[4]
            crop = img.crop((x1, y1, x2, y2))
            if crop.width < self._MIN_CROP_SIDE or crop.height < self._MIN_CROP_SIDE:
                continue
            text = self._recognize(crop)
            if text.strip():
                items.append({"bbox": [x1, y1, x2, y2],
                              "text": text,
                              "conf": score})
                texts.append(text)

        return {
            "text": " ".join(texts),
            "items": items,
            "pageCount": 1,
        }
sample_pages.png ADDED

Git LFS Details

  • SHA256: 05cffadc7cac710a0974e8d49bdf64c35eb2b05401d8cd58afd1ef7df69f8bb5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB