Spaces:

TrungTran
/

faceage_ClientScan

Running

File size: 10,419 Bytes

"""
FaceAge ClientScan — Gradio demo for HuggingFace Spaces.

Face detection : YuNet (OpenCV's cv2.FaceDetectorYN; ~350 KB model fetched from the HF Hub)
Age/gender     : FaceAge ClientScan ONNX (CPU, ~1.2 GB)
"""
import os
import numpy as np

# Optional Hub access token read from the environment; an unset or empty
# HF_TOKEN is normalised to None.
_HF_TOKEN = os.environ.get("HF_TOKEN") or None
import gradio as gr
from PIL import Image, ImageDraw, ImageFont

# ---------------------------------------------------------------------------
# Age/gender preprocessing  (ImageNet normalisation, matches training)
# ---------------------------------------------------------------------------

# Per-channel mean and std that _preprocess subtracts / divides by after the
# pixel values have been scaled by 1/255.
_MEAN     = np.array([0.485, 0.456, 0.406], dtype=np.float32)
_STD      = np.array([0.229, 0.224, 0.225], dtype=np.float32)
# Side length, in pixels, of the square image that _preprocess produces.
_IMG_SIZE = 224


def _preprocess(img_rgb: np.ndarray) -> np.ndarray:
    """Turn an RGB crop into the model's input batch.

    The HxWx3 uint8 image is resized to ``_IMG_SIZE`` x ``_IMG_SIZE`` with
    bicubic resampling, divided by 255, normalised per channel with
    ``_MEAN`` / ``_STD`` and returned as a contiguous 1x3x224x224 float32
    array.
    """
    from PIL import Image as _PIL

    target = (_IMG_SIZE, _IMG_SIZE)
    resized = _PIL.fromarray(img_rgb).resize(target, _PIL.BICUBIC)
    scaled = np.asarray(resized, dtype=np.float32) / 255.0
    normalised = (scaled - _MEAN) / _STD
    # HWC -> CHW, then add the leading batch axis.
    chw = normalised.transpose(2, 0, 1)
    return np.ascontiguousarray(chw[np.newaxis])


def _decode_age(logits: np.ndarray) -> float:
    """Decode CORAL logits into an age: the sum of their sigmoids."""
    # Bound the logits first so np.exp is never fed an extreme value.
    bounded = np.clip(logits, -88.0, 88.0)
    probs = 1.0 / (1.0 + np.exp(-bounded))
    return float(probs.sum())


def _decode_gender(logits: np.ndarray) -> tuple[str, float]:
    """Softmax the gender logits and return (label, probability of that label).

    A winning index of 1 is reported as "male"; any other index as "female".
    """
    # Subtract the maximum before exponentiating (numerically stable softmax).
    weights = np.exp(logits - logits.max())
    probs = weights / weights.sum()
    winner = int(probs.argmax())
    label = "female" if winner != 1 else "male"
    return label, float(probs[winner])


# ---------------------------------------------------------------------------
# Age/gender model  (ONNX, loaded from HF Hub)
# ---------------------------------------------------------------------------

# ONNX Runtime session for the age/gender model and the name of its first
# input; both stay None until _load_age_model() has run.
_ORT_SESSION = None
_ORT_IN_NAME = None


def _load_age_model():
    """Create the age/gender ONNX Runtime session once and cache it.

    On the first call the ONNX file is fetched with ``hf_hub_download`` and a
    CPU-only ``InferenceSession`` is built from it; the session and the name
    of its first input are stored in ``_ORT_SESSION`` / ``_ORT_IN_NAME``.
    Subsequent calls do nothing.
    """
    global _ORT_SESSION, _ORT_IN_NAME
    if _ORT_SESSION is None:
        import onnxruntime as ort
        from huggingface_hub import hf_hub_download

        print("[AgeModel] Downloading ONNX from HuggingFace Hub …")
        model_file = hf_hub_download(
            repo_id="TrungTran/faceage_ClientScan",
            filename="faceage_dino_fp32.onnx",
            token=_HF_TOKEN,
        )

        session_opts = ort.SessionOptions()
        session_opts.graph_optimization_level = (
            ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
        session_opts.intra_op_num_threads = 4

        _ORT_SESSION = ort.InferenceSession(
            model_file,
            sess_options=session_opts,
            providers=["CPUExecutionProvider"],
        )
        first_input = _ORT_SESSION.get_inputs()[0]
        _ORT_IN_NAME = first_input.name
        print(f"[AgeModel] Ready  ({model_file})")


def _predict_crop(face_rgb: np.ndarray) -> dict:
    """Run the age/gender model on a single RGB crop.

    Expects ``_load_age_model()`` to have populated ``_ORT_SESSION`` and
    ``_ORT_IN_NAME`` already.

    Returns:
        A dict with keys "age" (decoded age), "gender" ("male"/"female") and
        "conf" (softmax probability of the reported gender).
    """
    batch = _preprocess(face_rgb)
    outputs = _ORT_SESSION.run(None, {_ORT_IN_NAME: batch})
    # The session is expected to yield exactly two outputs: age, then gender.
    age_out, gender_out = outputs
    age_value = _decode_age(age_out[0])
    label, label_prob = _decode_gender(gender_out[0])
    return {"age": age_value, "gender": label, "conf": label_prob}


# ---------------------------------------------------------------------------
# YuNet face detector  (cv2.FaceDetectorYN, loaded from HuggingFace Hub)
# ---------------------------------------------------------------------------

# HuggingFace Hub repository and file name of the YuNet ONNX model.
_YUNET_REPO = "opencv/face_detection_yunet"
_YUNET_FILE = "face_detection_yunet_2023mar.onnx"
# None until _load_detector() runs; afterwards either the cv2.FaceDetectorYN
# instance or the string "unavailable" when loading failed.
_DETECTOR   = None


def _load_detector():
    """Initialise the YuNet face detector once and cache it in ``_DETECTOR``.

    The model file is fetched with ``hf_hub_download`` and handed to
    ``cv2.FaceDetectorYN.create``.  If either step raises, the error is
    printed and ``_DETECTOR`` is set to the string "unavailable"; since that
    value is not None, later calls return immediately without retrying.
    """
    global _DETECTOR
    if _DETECTOR is not None:
        return

    from huggingface_hub import hf_hub_download
    import cv2

    # Step 1: obtain the model file.
    try:
        model_file = hf_hub_download(repo_id=_YUNET_REPO, filename=_YUNET_FILE)
        print(f"[YuNet] Model: {model_file}")
    except Exception as err:
        print(f"[YuNet] Download failed: {err} — face detection disabled")
        _DETECTOR = "unavailable"
        return

    # Step 2: build the detector from that file.
    yunet_args = {
        "model": model_file,
        "config": "",
        "input_size": (320, 320),
        "score_threshold": 0.6,
        "nms_threshold": 0.3,
        "top_k": 100,
    }
    try:
        _DETECTOR = cv2.FaceDetectorYN.create(**yunet_args)
        print("[YuNet] Face detector ready")
    except Exception as err:
        print(f"[YuNet] Init failed: {err} — face detection disabled")
        _DETECTOR = "unavailable"


# Fraction of the detected box width/height that _detect_faces adds on each side.
_FACE_PAD = 0.10   # 10% proportional padding — matches LAGENDA benchmark MAE=3.555


def _detect_faces(img_rgb: np.ndarray,
                  min_face_px: int = 20) -> list[tuple[int, int, int, int]]:
    """Detect faces with YuNet and return padded, clipped bounding boxes.

    Args:
        img_rgb: HxWx3 RGB image array; it is converted to BGR for OpenCV.
        min_face_px: Boxes whose width or height (after padding and clipping)
            is below this many pixels are dropped.

    Returns:
        A list of (x0, y0, x1, y1) tuples of built-in ints.  Each box is the
        detection grown by ``_FACE_PAD`` of its width/height on every side and
        clipped to the image bounds; the list is sorted by area, largest
        first.  The list is empty when the detector has not been loaded, is
        marked "unavailable", or reports no faces.
    """
    if _DETECTOR is None or _DETECTOR == "unavailable":
        return []

    import cv2
    h, w    = img_rgb.shape[:2]
    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

    # The detector is created for a fixed input size, so point it at the
    # actual frame dimensions before every detection.
    _DETECTOR.setInputSize((w, h))
    _, faces = _DETECTOR.detect(img_bgr)   # None or Nx15: [x,y,w,h, ...]

    if faces is None:
        return []

    bboxes = []
    for face in faces:
        # .tolist() turns the numpy integers into built-in ints so the boxes
        # match the annotated return type and no numpy scalars leak into the
        # callers (slicing, PIL drawing, serialisation).
        x, y, fw, fh = face[:4].astype(int).tolist()
        # 10% proportional padding (matches training/benchmark setup)
        pw = int(fw * _FACE_PAD)
        ph = int(fh * _FACE_PAD)
        x0 = max(0, x - pw)
        y0 = max(0, y - ph)
        x1 = min(w, x + fw + pw)
        y1 = min(h, y + fh + ph)
        if (x1 - x0) >= min_face_px and (y1 - y0) >= min_face_px:
            bboxes.append((x0, y0, x1, y1))

    bboxes.sort(key=lambda b: (b[2] - b[0]) * (b[3] - b[1]), reverse=True)
    return bboxes


# ---------------------------------------------------------------------------
# Drawing
# ---------------------------------------------------------------------------

# Overlay colours; _draw_results cycles through them by result index.
_PALETTE = ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4",
            "#FFEAA7", "#DDA0DD", "#98D8C8", "#F7DC6F"]

# Candidate font files, tried in this order by _get_font.
_FONT_PATHS = [
    "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    "/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf",
    "/System/Library/Fonts/Helvetica.ttc",
]


def _get_font(size: int):
    """Return the first font in ``_FONT_PATHS`` that loads at ``size``.

    Paths that fail to load are skipped; if none of them loads, Pillow's
    default font is returned instead.
    """
    for candidate in _FONT_PATHS:
        try:
            font = ImageFont.truetype(candidate, size)
        except Exception:
            # Best effort: this candidate is unusable, try the next one.
            continue
        return font
    return ImageFont.load_default()


def _draw_results(pil_img: Image.Image, results: list[dict]) -> Image.Image:
    """Draw prediction overlays onto ``pil_img`` in place and return it.

    Args:
        pil_img: Image to annotate; it is modified directly.
        results: Prediction dicts with "gender", "age" and "conf" keys and an
            optional "bbox" of (x0, y0, x1, y1).

    Returns:
        The same ``pil_img`` object.  A result with a bbox gets a coloured
        rectangle plus a "gender  age y" label strip above it (clamped to the
        top edge); a result without one gets a label that also shows the
        confidence, drawn in the top-left corner.
    """
    draw     = ImageDraw.Draw(pil_img)
    # Only one font size is ever drawn, so load a single font per call.
    font     = _get_font(20)
    label_h  = 24   # height of the label strip, in pixels

    for i, r in enumerate(results):
        color = _PALETTE[i % len(_PALETTE)]
        bbox  = r.get("bbox")

        if bbox:
            x0, y0, x1, y1 = bbox
            label = f"{r['gender']}  {r['age']:.1f} y"
            # Box
            draw.rectangle([x0, y0, x1, y1], outline=color, width=3)
            # Label background, placed above the box but never off-image
            tw = int(draw.textlength(label, font=font))
            lx0, ly0 = x0, max(0, y0 - label_h - 4)
            draw.rectangle([lx0, ly0, lx0 + tw + 10, ly0 + label_h + 4],
                           fill=color)
            draw.text((lx0 + 5, ly0 + 2), label, fill="white", font=font)
        else:
            # Full-image fallback — overlay in top-left corner
            full_label = f"{r['gender']}  {r['age']:.1f} y  ({r['conf']:.0%})"
            tw = int(draw.textlength(full_label, font=font))
            draw.rectangle([8, 8, tw + 18, 38], fill=color)
            draw.text((13, 10), full_label, fill="white", font=font)

    return pil_img


# ---------------------------------------------------------------------------
# Main predict function
# ---------------------------------------------------------------------------

def predict(image: Image.Image, max_faces: int,
            conf_thresh: float) -> tuple[Image.Image, str]:
    """Gradio callback: detect faces, estimate age/gender, annotate the photo.

    Args:
        image: Input PIL image, or None when nothing was supplied.
        max_faces: Upper bound on how many detected faces are analysed
            (``_detect_faces`` returns them largest first).
        conf_thresh: Detection score threshold applied to the YuNet detector
            for this request.

    Returns:
        ``(annotated_image, summary_markdown)``.  When ``image`` is None the
        result is ``(None, prompt_text)``.  When no face is detected the whole
        image is passed to the age/gender model instead.
    """
    if image is None:
        return None, "⬆️  Please upload a photo."

    _load_age_model()
    _load_detector()

    # Honour the "Detection confidence" control: without this the value the
    # UI passes in is ignored and detection always uses the creation default.
    detector_ready = _DETECTOR is not None and _DETECTOR != "unavailable"
    if detector_ready and conf_thresh is not None:
        _DETECTOR.setScoreThreshold(float(conf_thresh))

    rgb_image = image.convert("RGB")
    img_rgb   = np.asarray(rgb_image)

    # A slider value may arrive as a float; a slice bound must be an int, and
    # a negative bound would silently drop faces from the end of the list.
    face_limit = max(int(max_faces), 0)
    bboxes     = _detect_faces(img_rgb)[:face_limit]

    results = []
    if bboxes:
        for bbox in bboxes:
            x0, y0, x1, y1 = bbox
            r = _predict_crop(img_rgb[y0:y1, x0:x1])
            r["bbox"] = bbox
            results.append(r)
    else:
        # No faces found — run on the entire image
        results.append(_predict_crop(img_rgb))

    # Annotated output image (drawn on a copy, the input stays untouched)
    out_img = _draw_results(rgb_image.copy(), results)

    # Text summary
    if bboxes:
        plural = "" if len(bboxes) == 1 else "s"
        mode = f"({len(bboxes)} face{plural} detected)"
    else:
        mode = "(no face detected — full image used)"

    lines = [f"**{mode}**\n"]
    for i, r in enumerate(results, 1):
        icon = "👨" if r["gender"] == "male" else "👩"
        lines.append(
            f"{icon} **Face {i}** — Age **{r['age']:.1f}**  ·  "
            f"{r['gender']} ({r['conf']:.0%})"
        )
    # Separate entries with blank lines: Markdown treats a single newline as
    # a space, which would run all faces together in one paragraph.
    summary = "\n\n".join(lines)

    return out_img, summary


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

# Markdown rendered at the top of the page by the first gr.Markdown below.
_DESC = """
## FaceAge ClientScan — Age & Gender Estimation

Upload a photo. **YuNet** auto-detects faces, then **FaceAge ClientScan** predicts age and gender.

| | |
|--|--|
| 🏆 LAGENDA 84k MAE | **3.555** (beats MiVOLO v2 paper 3.650, face-only) |
| 🧠 Backbone | DINOv3-ViT-L/16 (Meta AI, 307M params) |
| ⚡ Speed | ~100 ms / face on CPU (ONNX FP32) |
| 🔍 Detector | YuNet (OpenCV, ~350 KB) |

[📄 Model Card](https://huggingface.co/TrungTran/faceage_ClientScan)
"""

# Page layout: description on top, then two equal columns (inputs on the left,
# outputs on the right), then a footer note.
with gr.Blocks(title="FaceAge ClientScan", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_DESC)

    with gr.Row():
        # Left column: image input, the two sliders and the trigger button.
        with gr.Column(scale=1):
            inp_img   = gr.Image(type="pil", label="📷 Upload photo or use webcam",
                                 sources=["upload", "webcam", "clipboard"])
            with gr.Row():
                # Passed to predict() as max_faces.
                inp_max   = gr.Slider(1, 10, value=5, step=1,
                                      label="Max faces")
                # Passed to predict() as conf_thresh.
                inp_conf  = gr.Slider(0.3, 0.9, value=0.6, step=0.05,
                                      label="Detection confidence")
            btn = gr.Button("🔍 Predict", variant="primary", size="lg")

        # Right column: annotated image and the Markdown summary.
        with gr.Column(scale=1):
            out_img  = gr.Image(type="pil", label="Result")
            out_text = gr.Markdown()

    # Clicking the button calls predict(image, max_faces, conf_thresh) and
    # routes its two return values to the output components in order.
    btn.click(
        fn      = predict,
        inputs  = [inp_img, inp_max, inp_conf],
        outputs = [out_img, out_text],
    )

    gr.Markdown("""
---
*Our Collection: 4M Images.*
*DINOv3-ViT-L pretrained by Meta AI on LVD-1.68B images.*
""")

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()