| from transformers import AutoImageProcessor, AutoModelForDepthEstimation |
| from PIL import Image |
| import torch |
| import torch.nn.functional as F |
| import io |
| import base64 |
| import numpy as np |
| import json |
|
|
|
|
class EndpointHandler:
    """Inference endpoint handler for relative (monocular) depth estimation.

    Accepts an image payload as raw bytes, a base64 string, a data URI, or a
    JSON envelope {"inputs": ...}, runs the depth model, and returns the
    normalized depth map both as a base64 PNG visualization and as raw
    float16 values.
    """

    def __init__(self, path=""):
        # Load processor + model once at startup; prefer GPU when available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = AutoImageProcessor.from_pretrained(path)
        self.model = AutoModelForDepthEstimation.from_pretrained(path)
        self.model.to(self.device)
        self.model.eval()

    def _coerce_to_image_bytes(self, obj):
        """Normalize a request payload to raw image bytes.

        Accepts:
            - dict: expects obj["inputs"] (recursively coerced)
            - bytes/bytearray: raw image bytes, or a UTF-8 JSON envelope
              containing an "inputs" key
            - str: JSON containing "inputs", a "data:...;base64," URI,
              a base64 string, or plain text (fallback: UTF-8 encoded as-is)

        Returns:
            bytes: the image payload.

        Raises:
            ValueError: for an unsupported payload type or a dict without
                an "inputs" key.
        """
        if isinstance(obj, dict):
            if "inputs" not in obj:
                raise ValueError(f'Missing "inputs" key. Keys={list(obj.keys())}')
            return self._coerce_to_image_bytes(obj["inputs"])

        if isinstance(obj, (bytes, bytearray)):
            b = bytes(obj)
            # Some clients send the JSON envelope as raw bytes; unwrap it.
            # Real image bytes rarely decode as UTF-8, so this is cheap.
            try:
                txt = b.decode("utf-8")
                if txt.lstrip().startswith("{") and '"inputs"' in txt:
                    return self._coerce_to_image_bytes(json.loads(txt))
            except Exception:
                pass
            return b

        if isinstance(obj, str):
            s = obj.strip()

            # JSON envelope sent as a string.
            if s.startswith("{") and '"inputs"' in s:
                try:
                    return self._coerce_to_image_bytes(json.loads(s))
                except Exception:
                    pass

            # Data URIs ("data:image/png;base64,....") carry the payload
            # after the first comma.
            if s.startswith("data:") and "," in s:
                s = s.split(",", 1)[1]

            # Base64 payloads often contain newlines; remove all whitespace,
            # then decode strictly. validate=True is essential: with
            # validate=False (the previous behavior) non-alphabet characters
            # are silently discarded, so almost any text "decodes" into
            # garbage and the plain-text fallback below is unreachable.
            compact = "".join(s.split())
            try:
                return base64.b64decode(compact, validate=True)
            except Exception:
                # Not base64: treat as a raw UTF-8 text payload.
                return s.encode("utf-8")

        raise ValueError(f"Unsupported request type: {type(obj)}")

    def __call__(self, data):
        """Run depth estimation on a single image payload.

        Args:
            data: request payload in any form accepted by
                _coerce_to_image_bytes.

        Returns:
            dict: base64 PNG visualization, raw float16 depth values, the
            original image size, and the min/max used for normalization.
        """
        image_bytes = self._coerce_to_image_bytes(data)
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        orig_w, orig_h = image.size

        inputs_t = self.processor(images=image, return_tensors="pt")
        inputs_t = {k: v.to(self.device) for k, v in inputs_t.items()}

        with torch.no_grad():
            outputs = self.model(**inputs_t)
            # Assumed shape (batch, H', W') at model resolution — TODO
            # confirm against the specific depth model in use.
            predicted_depth = outputs.predicted_depth

        # Resize the prediction back to the input image resolution.
        # (.detach() is unnecessary here: no_grad already disables autograd.)
        depth = F.interpolate(
            predicted_depth.unsqueeze(1),
            size=(orig_h, orig_w),
            mode="bicubic",
            align_corners=False,
        )
        depth_np = depth.squeeze(1).squeeze(0).float().cpu().numpy()

        # Min-max normalize to uint8 for the PNG visualization; guard the
        # divisor against a constant depth map (zero dynamic range).
        dmin, dmax = float(depth_np.min()), float(depth_np.max())
        denom = (dmax - dmin) if (dmax - dmin) > 1e-12 else 1.0
        depth_uint8 = (((depth_np - dmin) / denom) * 255.0).clip(0, 255).astype(np.uint8)

        depth_img = Image.fromarray(depth_uint8, mode="L")
        buf = io.BytesIO()
        depth_img.save(buf, format="PNG")
        depth_png_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        # Raw (un-normalized) depth as float16 for clients that need the
        # actual relative values rather than the 8-bit visualization.
        depth_f16 = depth_np.astype(np.float16)
        depth_raw_base64_f16 = base64.b64encode(depth_f16.tobytes()).decode("utf-8")

        return {
            "type": "relative_depth",
            "width": orig_w,
            "height": orig_h,
            "depth_png_base64": depth_png_base64,
            "depth_raw_base64_f16": depth_raw_base64_f16,
            "raw_dtype": "float16",
            "raw_shape": [orig_h, orig_w],
            "viz_min": dmin,
            "viz_max": dmax,
        }