barakplasma
/

sapiens2-onnx

+---
+license: other
+license_name: sapiens2-license
+license_link: https://github.com/facebookresearch/sapiens2/blob/main/LICENSE.md
+pipeline_tag: image-feature-extraction
+library_name: transformers
+base_model: facebook/sapiens2-pretrain-0.1b
+tags:
+  - sapiens
+  - sapiens2
+  - vision-transformer
+  - human-centric
+  - feature-extraction
+  - onnx
+  - onnxruntime-web
+---
+# Sapiens2-0.1B — ONNX Export
+ONNX export of [facebook/sapiens2-pretrain-0.1b](https://huggingface.co/facebook/sapiens2-pretrain-0.1b), a vision transformer pretrained on **1 billion human images**. This repo provides ready-to-run weights for browser inference via `onnxruntime-web` and server inference via `onnxruntime-node` or `onnxruntime`.
+| File | Size | Use |
+|---|---|---|
+| `sapiens2_0.1b_int8.onnx` | 116 MB | Browser / mobile (recommended) |
+| `sapiens2_0.1b_fp32.onnx` | 458 MB | Server-side / higher precision |
+| `example_embeddings.js` | — | Fully worked Node.js example |
+**Output:** a single `(batch, 768)` float32 embedding vector per image (CLS token).
+---
+## What are embeddings?
+The model encodes an image into a 768-dimensional vector that captures high-level human-centric semantics — pose, body shape, clothing, and identity. Two images with similar people in similar poses will have embeddings close together in this space. Common uses:
+- **Similarity search** — find the most similar person/pose in a database
+- **Clustering** — group images by body pose, clothing, or activity
+- **Classification** — train a lightweight head on top of frozen embeddings
+- **Retrieval-augmented generation** — image → embedding → nearest-neighbor lookup
+---
+## Generating embeddings — Browser
+```bash
+npm install onnxruntime-web
+```
+```js
+import * as ort from "onnxruntime-web";
+// Point WASM binaries at the CDN build (avoids bundler complexity)
+ort.env.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/";
+const MODEL_URL =
+  "https://huggingface.co/barakplasma/sapiens2-onnx/resolve/main/sapiens2_0.1b_int8.onnx";
+const H = 1024, W = 768;
+const MEAN = [0.485, 0.456, 0.406];
+const STD  = [0.229, 0.224, 0.225];
+// Load once; reuse for all images
+export async function loadModel() {
+  return ort.InferenceSession.create(MODEL_URL, {
+    executionProviders: ["webgpu", "wasm"], // WebGPU ~1-3s, WASM ~20-60s
+    graphOptimizationLevel: "all",
+  });
+}
+// Resize an <img> or <canvas> to 1024×768 and convert to a float32 NCHW tensor
+function imageToTensor(source) {
+  const canvas = document.createElement("canvas");
+  canvas.width = W;
+  canvas.height = H;
+  canvas.getContext("2d").drawImage(source, 0, 0, W, H);
+  const { data } = canvas.getContext("2d").getImageData(0, 0, W, H); // RGBA uint8
+  const t = new Float32Array(3 * H * W);
+  for (let i = 0; i < H * W; i++) {
+    t[i]             = (data[i * 4]     / 255 - MEAN[0]) / STD[0]; // R plane
+    t[H * W + i]     = (data[i * 4 + 1] / 255 - MEAN[1]) / STD[1]; // G plane
+    t[2 * H * W + i] = (data[i * 4 + 2] / 255 - MEAN[2]) / STD[2]; // B plane
+  }
+  return new ort.Tensor("float32", t, [1, 3, H, W]);
+}
+// Returns a Float32Array of length 768
+export async function embed(session, imageElement) {
+  const feeds = { pixel_values: imageToTensor(imageElement) };
+  const { embedding } = await session.run(feeds);
+  return embedding.data;
+}
+// Cosine similarity between two embeddings (both Float32Array length 768)
+export function cosineSimilarity(a, b) {
+  let dot = 0, normA = 0, normB = 0;
+  for (let i = 0; i < a.length; i++) {
+    dot   += a[i] * b[i];
+    normA += a[i] * a[i];
+    normB += b[i] * b[i];
+  }
+  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
+}
+// Example: compare two images
+async function compareImages(imgA, imgB) {
+  const session = await loadModel();
+  const [embA, embB] = await Promise.all([embed(session, imgA), embed(session, imgB)]);
+  const score = cosineSimilarity(embA, embB); // -1 (opposite) to 1 (identical)
+  console.log(`Similarity: ${score.toFixed(4)}`);
+  return score;
+}
+```
+### Caching the model in IndexedDB
+The INT8 model is 116 MB. Cache it in IndexedDB to skip the download on repeat visits:
+```js
+const DB_NAME = "sapiens2-onnx";
+const STORE   = "models";
+async function openDB() {
+  return new Promise((resolve, reject) => {
+    const req = indexedDB.open(DB_NAME, 1);
+    req.onupgradeneeded = () => req.result.createObjectStore(STORE);
+    req.onsuccess = () => resolve(req.result);
+    req.onerror   = () => reject(req.error);
+  });
+}
+export async function loadModelCached(url = MODEL_URL) {
+  const db  = await openDB();
+  const hit = await new Promise(res => {
+    const req = db.transaction(STORE).objectStore(STORE).get(url);
+    req.onsuccess = () => res(req.result);
+    req.onerror   = () => res(null);
+  });
+  const buf = hit ?? await fetch(url).then(r => r.arrayBuffer()).then(buf => {
+    db.transaction(STORE, "readwrite").objectStore(STORE).put(buf, url);
+    return buf;
+  });
+  return ort.InferenceSession.create(buf, {
+    executionProviders: ["webgpu", "wasm"],
+    graphOptimizationLevel: "all",
+  });
+}
+```
+---
+## Generating embeddings — Node.js
+See [`example_embeddings.js`](./example_embeddings.js) in this repo for a fully worked example. Short version:
+```bash
+npm install onnxruntime-node sharp
+```
+```js
+import * as ort from "onnxruntime-node";
+import sharp from "sharp";
+const H = 1024, W = 768;
+const MEAN = [0.485, 0.456, 0.406];
+const STD  = [0.229, 0.224, 0.225];
+async function embed(session, imagePath) {
+  const { data } = await sharp(imagePath)
+    .resize(W, H)   // sharp takes (width, height)
+    .raw()
+    .toBuffer({ resolveWithObject: true });
+  const t = new Float32Array(3 * H * W);
+  for (let i = 0; i < H * W; i++) {
+    t[i]             = (data[i * 3]     / 255 - MEAN[0]) / STD[0];
+    t[H * W + i]     = (data[i * 3 + 1] / 255 - MEAN[1]) / STD[1];
+    t[2 * H * W + i] = (data[i * 3 + 2] / 255 - MEAN[2]) / STD[2];
+  }
+  const { embedding } = await session.run({
+    pixel_values: new ort.Tensor("float32", t, [1, 3, H, W]),
+  });
+  return embedding.data; // Float32Array of length 768
+}
+const session = await ort.InferenceSession.create("sapiens2_0.1b_int8.onnx", {
+  executionProviders: ["cpu"],
+});
+const emb = await embed(session, "person.jpg");
+console.log("Embedding length:", emb.length); // 768
+```
+---
+## Generating embeddings — Python (onnxruntime)
+```python
+import onnxruntime as ort
+import numpy as np
+from PIL import Image
+H, W = 1024, 768
+MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
+STD  = np.array([0.229, 0.224, 0.225], dtype=np.float32)
+sess = ort.InferenceSession(
+    "sapiens2_0.1b_int8.onnx",
+    providers=["CPUExecutionProvider"],
+)
+def embed(image_path):
+    img = np.array(Image.open(image_path).convert("RGB").resize((W, H)), dtype=np.float32)
+    img = (img / 255.0 - MEAN) / STD          # normalize: (H, W, 3)
+    img = img.transpose(2, 0, 1)[np.newaxis]  # NCHW: (1, 3, H, W)
+    return sess.run(["embedding"], {"pixel_values": img})[0]  # (1, 768)
+# Compare two images
+a = embed("person_a.jpg")
+b = embed("person_b.jpg")
+similarity = np.dot(a[0], b[0]) / (np.linalg.norm(a) * np.linalg.norm(b))
+print(f"Similarity: {similarity:.4f}")  # -1 to 1
+```
+### Batch inference
+```python
+def embed_batch(image_paths, batch_size=4):
+    embeddings = []
+    for i in range(0, len(image_paths), batch_size):
+        batch_paths = image_paths[i : i + batch_size]
+        imgs = []
+        for p in batch_paths:
+            img = np.array(Image.open(p).convert("RGB").resize((W, H)), dtype=np.float32)
+            imgs.append((img / 255.0 - MEAN) / STD)
+        batch = np.stack(imgs).transpose(0, 3, 1, 2)  # (B, 3, H, W)
+        out = sess.run(["embedding"], {"pixel_values": batch})[0]  # (B, 768)
+        embeddings.append(out)
+    return np.concatenate(embeddings, axis=0)
+```
+---
+## Model details
+| | |
+|---|---|
+| **Base model** | [facebook/sapiens2-pretrain-0.1b](https://huggingface.co/facebook/sapiens2-pretrain-0.1b) |
+| **Architecture** | Vision Transformer (RoPE, GQA, SwiGLU, RMSNorm, QK-norm) |
+| **Parameters** | 0.114 B |
+| **FLOPs** | 0.342 T |
+| **Embedding dim** | 768 |
+| **Layers / heads** | 12 / 12 |
+| **Input size** | 1024 × 768 (H × W), RGB, ImageNet-normalized |
+| **Patch size** | 16 px → 3,072 patch tokens |
+| **Output** | CLS token: `(batch, 768)` float32 |
+| **Pretraining data** | 1 billion curated human images |
+| **ONNX opset** | 18 |
+| **Exporter** | `torch.onnx.export` (dynamo) |
+| **Quantization** | `onnxruntime.quantization.quantize_dynamic`, `QInt8` weights |
+### Preprocessing spec
+Images must be resized to exactly **1024 × 768 (H × W)** and normalized with ImageNet statistics:
+```
+mean = [0.485, 0.456, 0.406]   # per channel, RGB order
+std  = [0.229, 0.224, 0.225]
+pixel_values = (pixel / 255 - mean) / std
+layout: NCHW float32 — shape (batch, 3, 1024, 768)
+```
+---
+## Browser requirements
+| | Minimum | Recommended |
+|---|---|---|
+| **Browser** | Chrome/Edge 113+ | Chrome 120+ |
+| **Execution provider** | WASM (CPU) | WebGPU |
+| **RAM** | 4 GB free | 8 GB |
+| **INT8 inference time** | ~20–60 s (WASM) | ~1–3 s (WebGPU) |
+WebGPU is available in Chrome/Edge 113+ on desktop. Mobile inference is not recommended at this resolution; if you attempt it anyway, use the smaller INT8 model.
+---
+## Export notes
+Two non-obvious issues arose during export from the original safetensors checkpoint:
+1. **bfloat16 in RoPE** — `RopePositionEmbedding` defaults to `dtype=bfloat16` via `pos_embed_rope_dtype="bf16"`. Because this is stored as a plain Python attribute (`self.dtype`) rather than a tensor buffer, calling `.float()` on the model does not change it; it must be overridden at construction time with `pos_embed_rope_dtype="fp32"`.
+2. **`aten::rms_norm` unsupported in legacy tracer** — the TorchScript-based exporter (`dynamo=False`) does not support `rms_norm`. The dynamo-based exporter was used instead. By default this produces a sidecar `.onnx.data` file; weights were inlined back into a single file via `onnx.save_model(..., save_as_external_data=False)`.
+---
+## Sapiens2 model family
+| Model | Params | Embed dim | Layers |
+|---|---|---|---|
+| **Sapiens2-0.1B** *(this)* | 0.114 B | 768 | 12 |
+| [Sapiens2-0.4B](https://huggingface.co/facebook/sapiens2-pretrain-0.4b) | 0.398 B | 1024 | 24 |
+| [Sapiens2-0.8B](https://huggingface.co/facebook/sapiens2-pretrain-0.8b) | 0.818 B | 1280 | 32 |
+| [Sapiens2-1B](https://huggingface.co/facebook/sapiens2-pretrain-1b) | 1.462 B | 1536 | 40 |
+| [Sapiens2-5B](https://huggingface.co/facebook/sapiens2-pretrain-5b) | 5.071 B | 2432 | 56 |
+Only the 0.1B model is practical for browser inference. Larger models require server-side deployment.
+---
+## License
+The original weights are released under the [Sapiens2 License](https://github.com/facebookresearch/sapiens2/blob/main/LICENSE.md). This ONNX conversion inherits the same license terms.
+## Citation
+```bibtex
+@article{khirodkarsapiens2,
+  title   = {Sapiens2},
+  author  = {Khirodkar, Rawal and Wen, He and Martinez, Julieta and Dong, Yuan and Su, Zhaoen and Saito, Shunsuke},
+  journal = {arXiv preprint arXiv:2604.21681},
+  year    = {2026}
+}
+```