Text Detection Model

A YOLO-based text detection model trained on the ICDAR2003 dataset.
Model

Architecture: YOLO11m
Input size: 640
Metric: mAP50-95 = 0.692
Usage

from huggingface_hub import hf_hub_download
from ultralytics import YOLO
import json
import cv2
import matplotlib.pyplot as plt


# Helper that draws the detected boxes and their confidence scores on the image
def visualize_bbox(img_path, predictions, conf_thres=0.8, font=cv2.FONT_HERSHEY_SIMPLEX):
    """Draw confident detections and their scores on the image at *img_path*.

    Args:
        img_path: Path of the image file to load with ``cv2.imread``.
        predictions: Iterable of dicts, each with a ``"confidence"`` value and
            a ``"box"`` dict holding ``"x1"``, ``"y1"``, ``"x2"``, ``"y2"``.
        conf_thres: Detections whose confidence is below this value are
            skipped.
        font: OpenCV font used for the score label.

    Returns:
        The loaded image (as returned by ``cv2.imread``) with a green
        rectangle and a score label drawn for every kept detection.

    Raises:
        FileNotFoundError: If the image cannot be read from *img_path*.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than deep inside a draw call.
        raise FileNotFoundError(f"Image not found or unreadable: {img_path!r}")

    box_color = (0, 255, 0)
    label_pad = 5

    for prediction in predictions:
        conf_score = prediction["confidence"]
        if conf_score < conf_thres:
            continue

        bbox = prediction["box"]
        xmin = int(bbox["x1"])
        ymin = int(bbox["y1"])
        xmax = int(bbox["x2"])
        ymax = int(bbox["y2"])
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), box_color, 3)

        text = f"{conf_score:.2f}"
        (text_width, text_height), _ = cv2.getTextSize(text, font, 1, 2)

        # The label normally sits directly above the box. When the box touches
        # the top of the image there is no room there, so place the label just
        # inside the box instead of letting it be clipped away.
        label_top = ymin - text_height - label_pad
        if label_top < 0:
            label_top = max(ymin, 0)
        label_bottom = label_top + text_height + label_pad

        # Filled background so the black score text stays readable.
        cv2.rectangle(
            img,
            (xmin, label_top),
            (xmin + text_width, label_bottom),
            box_color,
            -1,
        )
        cv2.putText(img, text, (xmin, label_bottom - label_pad), font, 1, (0, 0, 0), 2)
    return img


# Fetch the trained weights first, then the demo image, from the same Hub repo.
model_path, sample_path = (
    hf_hub_download(
        repo_id="huytqvn/text-detection-str-pipeline",
        filename=remote_name,
    )
    for remote_name in ("best.pt", "sample.JPG")
)

# Build the detector from the downloaded checkpoint.
model = YOLO(model_path)

# Run detection on the sample image and decode the first result's JSON
# into plain Python objects for the drawing helper.
results = model(sample_path)
predict = json.loads(results[0].to_json())

# Draw the detections, convert BGR -> RGB for matplotlib, and display.
visualize_img = cv2.cvtColor(
    visualize_bbox(sample_path, predict, conf_thres=0.75),
    cv2.COLOR_BGR2RGB,
)
plt.imshow(visualize_img)
plt.axis("off")
plt.show()
Downloads last month: 6