sapiens2-onnx / README.md
barakplasma's picture
Trim model card to browser-only; remove Node.js section
1b4a282 verified
metadata
license: other
license_name: sapiens2-license
license_link: https://github.com/facebookresearch/sapiens2/blob/main/LICENSE.md
pipeline_tag: image-feature-extraction
library_name: transformers
base_model: facebook/sapiens2-pretrain-0.1b
tags:
  - sapiens
  - sapiens2
  - vision-transformer
  - human-centric
  - feature-extraction
  - onnx
  - onnxruntime-web

Sapiens2-0.1B — ONNX Export

ONNX export of facebook/sapiens2-pretrain-0.1b, a vision transformer pretrained on 1 billion human images, packaged for browser inference via onnxruntime-web.

File Size Use
sapiens2_0.1b_int8.onnx 116 MB Browser (recommended)
sapiens2_0.1b_fp32.onnx 458 MB Server-side / higher precision
example_embeddings.js — Drop-in browser ES module

Output: a (batch, 768) float32 vector per image (CLS token).


What are embeddings?

The model encodes an image into a 768-dimensional vector that captures human-centric semantics — pose, body shape, clothing, and identity. Two images with similar people in similar poses will have embeddings close together in this space. Common uses:

  • Similarity search — find the most similar person/pose in a collection
  • Clustering — group images by pose, clothing, or activity
  • Classification — train a lightweight head on top of frozen embeddings
  • Retrieval — image → nearest-neighbor lookup in a vector database

Browser quick start

npm install onnxruntime-web
import * as ort from "onnxruntime-web";

// Point WASM binaries at the CDN build
// NOTE(review): this URL carries no package version, so it presumably resolves
// to whatever onnxruntime-web release the CDN currently serves — confirm that
// it matches the version installed via npm, or pin it
// (e.g. ".../npm/onnxruntime-web@<version>/dist/").
ort.env.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/";

// INT8 weights hosted in this repository (the variant recommended for browsers).
const MODEL_URL =
  "https://huggingface.co/barakplasma/sapiens2-onnx/resolve/main/sapiens2_0.1b_int8.onnx";

// Model input resolution, in pixels.
const H = 1024;
const W = 768;

// Per-channel normalization statistics, in R, G, B order.
const MEAN = [0.485, 0.456, 0.406];
const STD = [0.229, 0.224, 0.225];

// Creates the inference session for MODEL_URL. Call it once and reuse the
// session for every image; expect roughly 1-2 s of cold-start time.
export async function loadModel() {
  const sessionOptions = {
    // WebGPU ~1-3 s/img, WASM ~20-60 s/img
    executionProviders: ["webgpu", "wasm"],
    graphOptimizationLevel: "all",
  };
  return ort.InferenceSession.create(MODEL_URL, sessionOptions);
}

// Converts an <img>, <canvas>, ImageBitmap, or VideoFrame into a normalized
// float32 tensor laid out as [1, 3, H, W] (one plane per colour channel).
function imageToTensor(source) {
  const scratch = document.createElement("canvas");
  scratch.width = W;
  scratch.height = H;

  // Drawing into the full W x H rectangle rescales the source to the model size.
  const ctx = scratch.getContext("2d");
  ctx.drawImage(source, 0, 0, W, H);
  const rgba = ctx.getImageData(0, 0, W, H).data; // RGBA uint8

  const pixelCount = H * W;
  const planes = new Float32Array(3 * pixelCount);
  for (let px = 0; px < pixelCount; px++) {
    const base = px * 4;
    // Channels 0..2 are R, G, B; the alpha byte at base + 3 is skipped.
    for (let ch = 0; ch < 3; ch++) {
      planes[ch * pixelCount + px] = (rgba[base + ch] / 255 - MEAN[ch]) / STD[ch];
    }
  }
  return new ort.Tensor("float32", planes, [1, 3, H, W]);
}

// Runs the model on a single image source and resolves to the data of its
// "embedding" output (a Float32Array of length 768).
export async function embed(session, imageSource) {
  const feeds = { pixel_values: imageToTensor(imageSource) };
  const outputs = await session.run(feeds);
  return outputs.embedding.data;
}

// Cosine similarity: 1 = identical direction, 0 = orthogonal, -1 = opposite.
//
// a, b: array-likes of numbers (plain arrays or typed arrays) of equal length.
// Returns a number in [-1, 1], up to floating-point rounding.
// Returns 0 when either vector is empty or all zeros: such a vector has no
// direction, and the plain formula would yield NaN.
// Throws RangeError when the lengths differ, since comparing only a prefix
// (or reading past the end of the shorter vector) gives a meaningless score.
export function cosineSimilarity(a, b) {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vector lengths differ (${a.length} vs ${b.length})`,
    );
  }

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  return denominator === 0 ? 0 : dot / denominator;
}

Caching in IndexedDB

The INT8 model is 116 MB. After the first load, store it in IndexedDB so repeat visits skip the download entirely:

// Database and object-store names used for the cached model bytes.
const DB_NAME = "sapiens2-onnx";
const STORE = "models";

// Opens version 1 of the cache database, creating the object store when the
// database is first created. Resolves with the open connection and rejects
// with the request's error.
async function openDB() {
  const request = indexedDB.open(DB_NAME, 1);
  return new Promise((resolve, reject) => {
    request.onerror = () => reject(request.error);
    request.onsuccess = () => resolve(request.result);
    request.onupgradeneeded = () => request.result.createObjectStore(STORE);
  });
}

// Creates an inference session from model bytes cached in IndexedDB, keyed by
// URL. On a cache miss the model is downloaded, stored for later visits, and
// then used.
//
// url: location of the .onnx file; defaults to MODEL_URL.
// Resolves to the ort.InferenceSession.
// Rejects if the database cannot be opened, if the download fails or returns a
// non-2xx status, or if session creation fails.
export async function loadModelCached(url = MODEL_URL) {
  const db = await openDB();

  let buf;
  try {
    // A failed read is treated the same as a cache miss.
    buf = await new Promise((resolve) => {
      const req = db.transaction(STORE).objectStore(STORE).get(url);
      req.onsuccess = () => resolve(req.result ?? null);
      req.onerror = () => resolve(null);
    });

    if (buf === null) {
      const response = await fetch(url);
      // Reject HTTP errors up front; otherwise the body of an error page would
      // be cached as "the model" and every later visit would fail.
      if (!response.ok) {
        throw new Error(
          `Failed to download model from ${url}: HTTP ${response.status}`,
        );
      }
      buf = await response.arrayBuffer();

      // Caching is best-effort: a failed write must not prevent inference.
      try {
        db.transaction(STORE, "readwrite").objectStore(STORE).put(buf, url);
      } catch (err) {
        console.warn("Could not cache the model in IndexedDB:", err);
      }
    }
  } finally {
    // close() only takes effect once transactions already created on this
    // connection have completed, so the pending write above still finishes.
    db.close();
  }

  return ort.InferenceSession.create(buf, {
    executionProviders: ["webgpu", "wasm"],
    graphOptimizationLevel: "all",
  });
}

Full worked example

See example_embeddings.js — a self-contained ES module you can drop into any browser project. It exports loadModelCached, embed, cosineSimilarity, l2Normalize, and findMostSimilar.

Usage example (assumes two <input type="file"> elements and two <img> elements):

<input type="file" id="fileA" accept="image/*">
<input type="file" id="fileB" accept="image/*">
<img id="imgA"> <img id="imgB">
<p id="result"></p>

<script type="module">
import { loadModelCached, embed, cosineSimilarity } from "./example_embeddings.js";

const session = await loadModelCached();

// Shows the file chosen in the given <input> inside the given <img> and
// resolves to that image's embedding.
//
// inputId: id of the <input type="file"> element to read from.
// imgId:   id of the <img> element used for preview and as the model input.
// Resolves to undefined when no file is selected (for example, the picker was
// cancelled), so callers can simply skip the comparison.
async function onFileChange(inputId, imgId) {
  const file = document.getElementById(inputId).files[0];
  if (!file) return undefined;

  const img = document.getElementById(imgId);
  const objectUrl = URL.createObjectURL(file);
  try {
    img.src = objectUrl;
    await img.decode();
    return await embed(session, img);
  } finally {
    // The image is loaded by this point (or loading failed), so the blob URL
    // is no longer needed; revoking it releases the reference to the file.
    URL.revokeObjectURL(objectUrl);
  }
}

// Most recent embedding for each slot; undefined until a file has been embedded.
let embA;
let embB;

// Re-embed whichever side changed, then refresh the score once both sides exist.
document.getElementById("fileA").onchange = async () => {
  embA = await onFileChange("fileA", "imgA");
  if (!embA || !embB) return;
  showSimilarity();
};
document.getElementById("fileB").onchange = async () => {
  embB = await onFileChange("fileB", "imgB");
  if (!embA || !embB) return;
  showSimilarity();
};

// Writes the cosine similarity of the two current embeddings, rounded to four
// decimal places, into the #result element.
function showSimilarity() {
  const similarity = cosineSimilarity(embA, embB);
  const label = `Similarity: ${similarity.toFixed(4)}`;
  document.getElementById("result").textContent = label;
}
</script>

Preprocessing spec

Input must be resized to exactly 1024 × 768 (H × W) and normalized with ImageNet statistics before passing to the model:

mean   = [0.485, 0.456, 0.406]   # per channel, RGB order
std    = [0.229, 0.224, 0.225]
value  = (pixel_uint8 / 255 − mean) / std
layout = NCHW float32 — shape (batch, 3, 1024, 768)

Browser requirements

Minimum Recommended
Browser Chrome/Edge 113+ Chrome 120+
Execution provider WASM WebGPU
Free RAM 4 GB 8 GB
INT8 latency ~20–60 s (WASM) ~1–3 s (WebGPU)

WebGPU is available on Chrome/Edge 113+ desktop. Mobile is not viable at this resolution.


Model details

Base model facebook/sapiens2-pretrain-0.1b
Architecture Vision Transformer (RoPE, GQA, SwiGLU, RMSNorm, QK-norm)
Parameters 0.114 B
FLOPs 0.342 T
Embedding dim 768
Layers / heads 12 / 12
Input size 1024 × 768 (H × W)
Patch size 16 px → 3,072 patch tokens
Output CLS token: (batch, 768) float32
Pretraining data 1 billion curated human images
ONNX opset 18
Quantization quantize_dynamic, QInt8 weights

Sapiens2 family

Model Params Embed dim Layers
Sapiens2-0.1B (this) 0.114 B 768 12
Sapiens2-0.4B 0.398 B 1024 24
Sapiens2-0.8B 0.818 B 1280 32
Sapiens2-1B 1.462 B 1536 40
Sapiens2-5B 5.071 B 2432 56

Only 0.1B is practical for browser inference. Larger variants require server-side deployment.


License

Released under the Sapiens2 License.

Citation

@article{khirodkarsapiens2,
  title   = {Sapiens2},
  author  = {Khirodkar, Rawal and Wen, He and Martinez, Julieta and Dong, Yuan and Su, Zhaoen and Saito, Shunsuke},
  journal = {arXiv preprint arXiv:2604.21681},
  year    = {2026}
}