Trim model card to browser-only; remove Node.js section

1b4a282 verified about 14 hours ago

8.24 kB

	---
	license: other
	license_name: sapiens2-license
	license_link: https://github.com/facebookresearch/sapiens2/blob/main/LICENSE.md
	pipeline_tag: image-feature-extraction
	library_name: transformers
	base_model: facebook/sapiens2-pretrain-0.1b
	tags:
	- sapiens
	- sapiens2
	- vision-transformer
	- human-centric
	- feature-extraction
	- onnx
	- onnxruntime-web
	---

	# Sapiens2-0.1B — ONNX Export

	ONNX export of [facebook/sapiens2-pretrain-0.1b](https://huggingface.co/facebook/sapiens2-pretrain-0.1b), a vision transformer pretrained on 1 billion human images, packaged for browser inference via `onnxruntime-web`.

	| File | Size | Use |
	|---|---|---|
	| `sapiens2_0.1b_int8.onnx` | 116 MB | Browser (recommended) |
	| `sapiens2_0.1b_fp32.onnx` | 458 MB | Server-side / higher precision |
	| `example_embeddings.js` | — | Drop-in browser ES module |

	Output: a float32 tensor of shape `(batch, 768)` — one 768-dimensional CLS-token embedding per image.

	---

	## What are embeddings?

	The model encodes an image into a 768-dimensional vector that captures human-centric semantics — pose, body shape, clothing, and identity. Two images with similar people in similar poses will have embeddings close together in this space. Common uses:

	- Similarity search — find the most similar person/pose in a collection
	- Clustering — group images by pose, clothing, or activity
	- Classification — train a lightweight head on top of frozen embeddings
	- Retrieval — image → nearest-neighbor lookup in a vector database

	---

	## Browser quick start

	```bash
	npm install onnxruntime-web
	```

	```js
	import * as ort from "onnxruntime-web";

	// Point WASM binaries at the CDN build
	// NOTE(review): this URL carries no version, so it presumably resolves to the
	// latest published build — confirm it matches the installed onnxruntime-web.
	ort.env.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/";

	// Hosted INT8 export (116 MB), the variant recommended for browser use.
	const MODEL_URL =
	"https://huggingface.co/barakplasma/sapiens2-onnx/resolve/main/sapiens2_0.1b_int8.onnx";

	// Fixed model input size in pixels: height 1024, width 768.
	const H = 1024, W = 768;
	// ImageNet per-channel normalization statistics, in RGB order.
	const MEAN = [0.485, 0.456, 0.406];
	const STD = [0.229, 0.224, 0.225];

	/**
	 * Creates the ONNX Runtime inference session for the model at MODEL_URL.
	 * Load once and reuse the session for all images (~1-2 s cold start).
	 *
	 * @returns {Promise<object>} resolves to the created inference session
	 */
	export async function loadModel() {
	  // Rough per-image latency: WebGPU ~1-3 s/img, WASM ~20-60 s/img
	  const sessionOptions = {
	    executionProviders: ["webgpu", "wasm"],
	    graphOptimizationLevel: "all",
	  };
	  const session = await ort.InferenceSession.create(MODEL_URL, sessionOptions);
	  return session;
	}

	/**
	 * Converts a drawable image source into the model's input tensor.
	 *
	 * The source is drawn stretched to W x H on a scratch canvas, then the RGBA
	 * bytes are unpacked into three planes (R, G, B), each value scaled to [0, 1]
	 * and normalized with MEAN / STD. The alpha channel is ignored.
	 *
	 * @param source - any <img>, <canvas>, ImageBitmap, or VideoFrame
	 * @returns a float32 ort.Tensor with dims [1, 3, H, W]
	 */
	function imageToTensor(source) {
	  const scratch = document.createElement("canvas");
	  scratch.width = W;
	  scratch.height = H;

	  const ctx = scratch.getContext("2d");
	  ctx.drawImage(source, 0, 0, W, H);
	  const rgba = ctx.getImageData(0, 0, W, H).data; // RGBA uint8, interleaved

	  const planeSize = H * W;
	  const planar = new Float32Array(3 * planeSize);
	  for (let channel = 0; channel < 3; channel++) {
	    const planeStart = channel * planeSize;
	    for (let px = 0; px < planeSize; px++) {
	      planar[planeStart + px] = (rgba[px * 4 + channel] / 255 - MEAN[channel]) / STD[channel];
	    }
	  }
	  return new ort.Tensor("float32", planar, [1, 3, H, W]);
	}

	/**
	 * Runs the model on a single image and returns its embedding.
	 *
	 * @param session - inference session, e.g. the one returned by loadModel()
	 * @param imageSource - image passed through imageToTensor() before inference
	 * @returns {Promise<Float32Array>} the `embedding` output data (length 768)
	 */
	export async function embed(session, imageSource) {
	  const feeds = { pixel_values: imageToTensor(imageSource) };
	  const outputs = await session.run(feeds);
	  return outputs.embedding.data;
	}

	/**
	 * Cosine similarity between two equal-length numeric vectors:
	 * 1 = identical direction, 0 = orthogonal, -1 = opposite.
	 *
	 * @param {ArrayLike<number>} a - first vector (its length drives the loop)
	 * @param {ArrayLike<number>} b - second vector; expected to be as long as `a`
	 * @returns {number} the cosine of the angle between `a` and `b`, or 0 when
	 *   either vector has zero magnitude (including empty input)
	 */
	export function cosineSimilarity(a, b) {
	  let dot = 0;
	  let normA = 0;
	  let normB = 0;
	  for (let i = 0; i < a.length; i++) {
	    dot += a[i] * b[i];
	    normA += a[i] * a[i];
	    normB += b[i] * b[i];
	  }
	  const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
	  // A zero vector has no direction; report "no similarity" rather than 0/0 = NaN.
	  return magnitude === 0 ? 0 : dot / magnitude;
	}
	```

	---

	## Caching in IndexedDB

	The INT8 model is 116 MB. After the first load, store it in IndexedDB so repeat
	visits skip the download entirely:

	```js
	const DB_NAME = "sapiens2-onnx";
	const STORE = "models";

	/**
	 * Opens version 1 of the DB_NAME database, creating the STORE object store
	 * when the upgrade event fires.
	 *
	 * @returns {Promise<IDBDatabase>} resolves with the open request's result;
	 *   rejects with the request's error if opening fails
	 */
	async function openDB() {
	  return new Promise((resolve, reject) => {
	    const openRequest = indexedDB.open(DB_NAME, 1);
	    openRequest.onupgradeneeded = () => {
	      openRequest.result.createObjectStore(STORE);
	    };
	    openRequest.onsuccess = () => {
	      resolve(openRequest.result);
	    };
	    openRequest.onerror = () => {
	      reject(openRequest.error);
	    };
	  });
	}

	/**
	 * Loads the model, using IndexedDB as a download cache keyed by `url`.
	 *
	 * On a cache hit the stored bytes are used directly. On a miss the model is
	 * fetched, stored for next time (best effort), and then loaded.
	 *
	 * @param {string} [url=MODEL_URL] - location of the .onnx file; also the cache key
	 * @returns {Promise<object>} resolves to the created inference session
	 * @throws {Error} if the download responds with a non-2xx HTTP status
	 */
	export async function loadModelCached(url = MODEL_URL) {
	  const db = await openDB();
	  try {
	    // A failed cache read is treated as a miss, not as a fatal error.
	    let modelBytes = await new Promise((resolve) => {
	      const req = db.transaction(STORE).objectStore(STORE).get(url);
	      req.onsuccess = () => resolve(req.result ?? null);
	      req.onerror = () => resolve(null);
	    });

	    if (modelBytes === null) {
	      const response = await fetch(url);
	      // Without this check an error page (404/500 body) would be cached as
	      // the "model" and reused on every later visit.
	      if (!response.ok) {
	        throw new Error(`Failed to download model from ${url}: HTTP ${response.status}`);
	      }
	      modelBytes = await response.arrayBuffer();

	      // Caching is an optimization: if the write cannot be started, still
	      // return a working session from the bytes we already have.
	      try {
	        db.transaction(STORE, "readwrite").objectStore(STORE).put(modelBytes, url);
	      } catch (err) {
	        console.warn("Could not cache model in IndexedDB:", err);
	      }
	    }

	    return await ort.InferenceSession.create(modelBytes, {
	      executionProviders: ["webgpu", "wasm"],
	      graphOptimizationLevel: "all",
	    });
	  } finally {
	    // close() lets already-started transactions finish before the connection
	    // actually closes, so the pending cache write is not cut short.
	    db.close();
	  }
	}
	```

	---

	## Full worked example

	See [`example_embeddings.js`](./example_embeddings.js) — a self-contained ES module
	you can drop into any browser project. It exports `loadModelCached`, `embed`,
	`cosineSimilarity`, `l2Normalize`, and `findMostSimilar`.

	Usage example (assumes two `<input type="file">` elements and two `<img>` elements):

	```html
	<input type="file" id="fileA" accept="image/*">
	<input type="file" id="fileB" accept="image/*">
	<img id="imgA"> <img id="imgB">
	<p id="result"></p>

	<script type="module">
	import { loadModelCached, embed, cosineSimilarity } from "./example_embeddings.js";

	const session = await loadModelCached();

	// Shows the file chosen in input `inputId` inside image `imgId` and returns
	// its embedding. Returns null when no file is selected (e.g. the picker was
	// cancelled), since there is nothing to embed.
	async function onFileChange(inputId, imgId) {
	  const file = document.getElementById(inputId).files[0];
	  if (!file) return null;

	  const img = document.getElementById(imgId);
	  const objectUrl = URL.createObjectURL(file);
	  try {
	    img.src = objectUrl;
	    await img.decode();
	    return await embed(session, img);
	  } finally {
	    // Decoding has finished (or failed); release the blob URL either way so
	    // repeated selections do not accumulate object URLs.
	    URL.revokeObjectURL(objectUrl);
	  }
	}

	// Latest embedding for each slot; undefined until an image has been chosen.
	let embA, embB;
	document.getElementById("fileA").onchange = async () => {
	  // Keep the previous embedding when the selection was cancelled.
	  embA = (await onFileChange("fileA", "imgA")) ?? embA;
	  if (embA && embB) showSimilarity();
	};
	document.getElementById("fileB").onchange = async () => {
	  embB = (await onFileChange("fileB", "imgB")) ?? embB;
	  if (embA && embB) showSimilarity();
	};

	// Writes the cosine similarity of the two current embeddings into #result.
	function showSimilarity() {
	  const score = cosineSimilarity(embA, embB);
	  document.getElementById("result").textContent =
	    `Similarity: ${score.toFixed(4)}`;
	}
	</script>
	```

	---

	## Preprocessing spec

	Input must be resized to exactly 1024 × 768 (H × W) and normalized with
	ImageNet statistics before passing to the model:

	```
	mean = [0.485, 0.456, 0.406] # per channel, RGB order
	std = [0.229, 0.224, 0.225]
	value = (pixel_uint8 / 255 - mean) / std
	layout = NCHW float32 — shape (batch, 3, 1024, 768)
	```

	---

	## Browser requirements

	| | Minimum | Recommended |
	|---|---|---|
	| Browser | Chrome/Edge 113+ | Chrome 120+ |
	| Execution provider | WASM | WebGPU |
	| Free RAM | 4 GB | 8 GB |
	| INT8 latency | ~20–60 s (WASM) | ~1–3 s (WebGPU) |

	WebGPU is available on Chrome/Edge 113+ desktop. Mobile is not viable at this resolution.

	---

	## Model details

	| | |
	|---|---|
	| Base model | [facebook/sapiens2-pretrain-0.1b](https://huggingface.co/facebook/sapiens2-pretrain-0.1b) |
	| Architecture | Vision Transformer (RoPE, GQA, SwiGLU, RMSNorm, QK-norm) |
	| Parameters | 0.114 B |
	| FLOPs | 0.342 T |
	| Embedding dim | 768 |
	| Layers / heads | 12 / 12 |
	| Input size | 1024 × 768 (H × W) |
	| Patch size | 16 px → 3,072 patch tokens |
	| Output | CLS token: `(batch, 768)` float32 |
	| Pretraining data | 1 billion curated human images |
	| ONNX opset | 18 |
	| Quantization | `quantize_dynamic`, QInt8 weights |

	### Sapiens2 family

	| Model | Params | Embed dim | Layers |
	|---|---|---|---|
	| Sapiens2-0.1B (this) | 0.114 B | 768 | 12 |
	| [Sapiens2-0.4B](https://huggingface.co/facebook/sapiens2-pretrain-0.4b) | 0.398 B | 1024 | 24 |
	| [Sapiens2-0.8B](https://huggingface.co/facebook/sapiens2-pretrain-0.8b) | 0.818 B | 1280 | 32 |
	| [Sapiens2-1B](https://huggingface.co/facebook/sapiens2-pretrain-1b) | 1.462 B | 1536 | 40 |
	| [Sapiens2-5B](https://huggingface.co/facebook/sapiens2-pretrain-5b) | 5.071 B | 2432 | 56 |

	Only 0.1B is practical for browser inference. Larger variants require server-side deployment.

	---

	## License

	Released under the [Sapiens2 License](https://github.com/facebookresearch/sapiens2/blob/main/LICENSE.md).

	## Citation

	```bibtex
	@article{khirodkarsapiens2,
	title = {Sapiens2},
	author = {Khirodkar, Rawal and Wen, He and Martinez, Julieta and Dong, Yuan and Su, Zhaoen and Saito, Shunsuke},
	journal = {arXiv preprint arXiv:2604.21681},
	year = {2026}
	}
	```