import time
from typing import List, Tuple, Union

import numpy as np
import torch
from ultralytics import YOLO

# NOTE(review): `app_logger`, `BaseInference` and `YOLOResult` are used below but
# are not imported in the visible source — confirm they are provided elsewhere.
logger = app_logger.getChild("models.detector.ultralytics")


class YOLOInference(BaseInference):
    """Object detector backed by the official Ultralytics ``YOLO`` SDK."""

    def __init__(
        self,
        model_path: str,
        imsz: int = 640,
        conf_threshold: float = 0.25,
        nms_threshold: float = 0.45,
        device: str = "cpu",
    ):
        """
        Initialize the detector and load the model immediately.

        Args:
            model_path: Path to the model file (.pt, .onnx, or .torchscript).
            imsz: Input image size for the model.
            conf_threshold: Confidence threshold to filter out low-confidence boxes.
            nms_threshold: IoU threshold for Non-Maximum Suppression.
            device: Computing device ('cpu' or 'cuda').
        """
        # The device is handed to the base class through its config dict;
        # presumably BaseInference exposes it as `self.device` (used below) —
        # TODO confirm against BaseInference.
        super().__init__(config={"device": device})
        self.model_path = model_path
        self.imsz = imsz
        self.conf_threshold = conf_threshold
        self.nms_threshold = nms_threshold
        self.load_model(model_path)

    def load_model(self, model_path: str):
        """
        Load the model into memory and move it to ``self.device``.

        Ultralytics handles the various model formats automatically.

        Args:
            model_path: Path to the model file to load.
        """
        logger.info(
            "[load] Loading Ultralytics model from %s on %s", model_path, self.device
        )
        # The YOLO class automatically handles weights and architecture configuration
        self.model = YOLO(model_path)
        self.model.to(self.device)

    def predict(self, im_bgr: Union[np.ndarray, List[np.ndarray]]) -> List[List[YOLOResult]]:
        """
        Run end-to-end inference: preprocessing, model forward pass, and NMS.

        Args:
            im_bgr: A single image or a list of images in BGR format (numpy arrays).

        Returns:
            One list of YOLOResult objects per input image, in input order.
            If inference raises, the error is logged with its traceback and an
            empty list is returned for every input image so the calling
            pipeline keeps running. An empty input batch yields ``[]``.
        """
        if isinstance(im_bgr, np.ndarray):
            im_bgr = [im_bgr]

        num_frames = len(im_bgr)
        if num_frames == 0:
            # Nothing to detect: skip the Ultralytics call entirely.
            logger.debug("[infer] Received an empty batch; skipping detector inference")
            return []

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        logger.debug("[infer] Starting detector inference on %d frame(s)", num_frames)

        try:
            # Ultralytics .predict() handles letterboxing, normalization, and NMS internally.
            # It also automatically scales coordinates back to the original image size.
            results = self.model.predict(
                source=im_bgr,
                imgsz=self.imsz,
                conf=self.conf_threshold,
                iou=self.nms_threshold,
                device=self.device,
                verbose=False,
                save=False,
            )

            final_results = []
            for res, frame in zip(results, im_bgr):
                # res.boxes.data contains [x1, y1, x2, y2, confidence, class_id]
                boxes_data = res.boxes.data.cpu().numpy()
                # box[:5] keeps [x1, y1, x2, y2, confidence]; the scaled
                # coordinates and the original image go to the YOLOResult wrapper.
                final_results.append([YOLOResult(box[:5], frame) for box in boxes_data])
            return final_results

        except Exception as e:
            # Deliberate best-effort boundary: log with traceback, then return
            # empty lists to prevent the pipeline from breaking.
            logger.exception("Inference error occurred: %s", e)
            return [[] for _ in range(num_frames)]

        finally:
            # Runs on both the success and the failure path.
            logger.info(
                "[infer] Detector inference completed in %.2f ms for %d frame(s)",
                (time.perf_counter() - started) * 1000,
                num_frames,
            )