First version of model card
Browse files- .gitattributes +1 -0
- README.md +174 -5
- embedl_all-MiniLM-L6-v2_int8.onnx +3 -0
- embedl_all-MiniLM-L6-v2_int8.pt2 +3 -0
- infer_pt2.py +76 -0
- infer_trt.py +163 -0
.gitattributes
CHANGED
|
@@ -20,6 +20,7 @@
|
|
| 20 |
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 23 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 20 |
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pt2 filter=lfs diff=lfs merge=lfs -text
|
| 24 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 25 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 26 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,5 +1,174 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: other
|
| 3 |
-
license_name: embedl-models-community-licence-1.0
|
| 4 |
-
license_link: https://github.com/embedl/embedl-models/blob/main/LICENSE
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: other
|
| 3 |
+
license_name: embedl-models-community-licence-1.0
|
| 4 |
+
license_link: https://github.com/embedl/embedl-models/blob/main/LICENSE
|
| 5 |
+
base_model:
|
| 6 |
+
- sentence-transformers/all-MiniLM-L6-v2
|
| 7 |
+
quantized_from:
|
| 8 |
+
- sentence-transformers/all-MiniLM-L6-v2
|
| 9 |
+
tags:
|
| 10 |
+
- sentence-similarity
|
| 11 |
+
- quantization
|
| 12 |
+
- onnx
|
| 13 |
+
- tensorrt
|
| 14 |
+
- edge
|
| 15 |
+
- embedl
|
| 16 |
+
gated: true
|
| 17 |
+
extra_gated_heading: "Access Embedl All Minilm L6 V2"
|
| 18 |
+
extra_gated_description: "To access this model, please review and accept the terms below. Your contact information is collected solely to manage access and, with your explicit consent, to notify you about updated or new optimized models from Embedl."
|
| 19 |
+
extra_gated_button_content: "Agree and request access"
|
| 20 |
+
extra_gated_prompt: "By requesting access you agree to the Embedl Models Community Licence and the upstream All Minilm L6 V2 License"
|
| 21 |
+
extra_gated_fields:
|
| 22 |
+
Company: text
|
| 23 |
+
I agree to the Embedl Models Community Licence and upstream All Minilm L6 V2 License: checkbox
|
| 24 |
+
I consent to being contacted by Embedl about products and services (optional): checkbox
|
| 25 |
+
---
|
| 26 |
+
<!-- embedl-banner:start -->
|
| 27 |
+
<style>
|
| 28 |
+
.embedl-btn-primary { transition: background 160ms ease, box-shadow 160ms ease; }
|
| 29 |
+
.embedl-btn-primary:hover { background: #4FDCE4 !important; box-shadow: 0 8px 22px rgba(45,212,221,0.45) !important; }
|
| 30 |
+
.embedl-btn-secondary { transition: background 160ms ease; }
|
| 31 |
+
.embedl-btn-secondary:hover { background: rgba(45,212,221,0.15) !important; }
|
| 32 |
+
.embedl-headline { font-size: clamp(11px, 2.15vw, 15px) !important; }
|
| 33 |
+
.embedl-btn-primary, .embedl-btn-secondary {
|
| 34 |
+
font-size: clamp(11px, 1.65vw, 13px) !important;
|
| 35 |
+
padding: clamp(6px, 1.1vw, 9px) clamp(10px, 1.6vw, 14px) !important;
|
| 36 |
+
}
|
| 37 |
+
</style>
|
| 38 |
+
<div style="background:radial-gradient(600px 220px at 0% 50%,rgba(45,212,221,0.22) 0%,rgba(45,212,221,0) 60%),radial-gradient(400px 180px at 100% 100%,rgba(45,212,221,0.10) 0%,rgba(45,212,221,0) 55%),linear-gradient(135deg,#0B1626 0%,#142338 100%);border:1px solid rgba(45,212,221,0.28);border-radius:12px;padding:22px 24px;margin:0 0 24px 0;color:#F2F6FA;box-shadow:0 4px 16px rgba(11,22,38,0.18);overflow:hidden;box-sizing:border-box;max-width:100%;">
|
| 39 |
+
<table style="width:100%;border-collapse:collapse;border:0;background:transparent;">
|
| 40 |
+
<tr style="background:transparent;">
|
| 41 |
+
<td style="vertical-align:middle;border:0;padding:0;background:transparent;">
|
| 42 |
+
<div style="display:inline-block;font-size:10px;letter-spacing:0.08em;text-transform:uppercase;font-weight:700;color:#2DD4DD;background:rgba(45,212,221,0.15);border:1px solid rgba(45,212,221,0.35);padding:4px 10px;border-radius:999px;margin-bottom:10px;white-space:nowrap;">Optimized by Embedl</div>
|
| 43 |
+
<div class="embedl-headline" style="font-size:15px;font-weight:700;line-height:1.35;color:#F2F6FA;margin-bottom:4px;">Need to <span style="color:#2DD4DD;white-space:nowrap;">fine-tune</span>, hit <span style="color:#2DD4DD;white-space:nowrap;">performance targets</span>, or deploy on <span style="color:#2DD4DD;white-space:nowrap;">specific hardware</span>?</div>
|
| 44 |
+
<div style="font-size:13px;color:#9BA7B5;">We've got you covered.</div>
|
| 45 |
+
</td>
|
| 46 |
+
<td width="1%" style="vertical-align:middle;border:0;padding:0 0 0 18px;white-space:nowrap;text-align:right;background:transparent;">
|
| 47 |
+
<a href="https://www.embedl.com/models" class="embedl-btn-secondary" style="display:inline-block;font-size:13px;font-weight:600;padding:9px 14px;border-radius:6px;border:1px solid #2DD4DD;color:#2DD4DD;text-decoration:none;margin-right:8px;">Learn more</a>
|
| 48 |
+
<a href="https://www.embedl.com/contact" class="embedl-btn-primary" style="display:inline-block;font-size:13px;font-weight:600;padding:9px 14px;border-radius:6px;border:1px solid #2DD4DD;background:#2DD4DD;color:#0B1626;text-decoration:none;box-shadow:0 6px 18px rgba(45,212,221,0.28);">Get in touch →</a>
|
| 49 |
+
</td>
|
| 50 |
+
</tr>
|
| 51 |
+
</table>
|
| 52 |
+
</div>
|
| 53 |
+
<!-- embedl-banner:end -->
|
| 54 |
+
|
| 55 |
+
# Embedl All Minilm L6 V2 (Quantized for TensorRT)
|
| 56 |
+
|
| 57 |
+
Deployable INT8-quantized version of [`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2),
|
| 58 |
+
optimized with [embedl-deploy](https://github.com/embedl/embedl-deploy)
|
| 59 |
+
for low-latency NVIDIA TensorRT inference on edge GPUs. Produces
|
| 60 |
+
the same L2-normalised sentence embedding as the upstream encoder,
|
| 61 |
+
in a fraction of the runtime (see the Performance table below).
|
| 62 |
+
|
| 63 |
+
## Upstream Model
|
| 64 |
+
|
| 65 |
+
<a href="https://hfviewer.com/sentence-transformers/all-MiniLM-L6-v2?utm_source=huggingface&utm_medium=embedded_model_card&utm_campaign=sentence-transformers__all-MiniLM-L6-v2_card" target="_blank" rel="noopener">
|
| 66 |
+
<img
|
| 67 |
+
src="https://hfviewer.com/api/card.svg?source=sentence-transformers%2Fall-MiniLM-L6-v2&v=20260501clipcard"
|
| 68 |
+
alt="Open sentence-transformers/all-MiniLM-L6-v2 in hfviewer"
|
| 69 |
+
width="100%"
|
| 70 |
+
/>
|
| 71 |
+
</a>
|
| 72 |
+
|
| 73 |
+
## Highlights
|
| 74 |
+
|
| 75 |
+
- **Mixed-precision INT8/FP16 quantization** with hardware-aware
|
| 76 |
+
optimizations from [embedl-deploy](https://github.com/embedl/embedl-deploy).
|
| 77 |
+
- **Drop-in replacement** for `sentence-transformers/all-MiniLM-L6-v2` in TensorRT pipelines —
|
| 78 |
+
same input pair (input_ids, attention_mask) at seq_len=128, same output embedding semantics
|
| 79 |
+
(mean-pooled, L2-normalised).
|
| 80 |
+
- **Validated accuracy** within 0.0026 of the FP32 Spearman ρ on stsb
|
| 81 |
+
(see Accuracy table below).
|
| 82 |
+
- **Faster than `trtexec --best`** on supported NVIDIA hardware (see Performance table below).
|
| 83 |
+
- Includes both **ONNX** (for TensorRT) and **PT2**
|
| 84 |
+
(`torch.export`-loadable) artifacts plus runnable inference scripts.
|
| 85 |
+
|
| 86 |
+
## Quick Start
|
| 87 |
+
|
| 88 |
+
```bash
|
| 89 |
+
pip install huggingface_hub transformers numpy
|
| 90 |
+
python -c "from huggingface_hub import snapshot_download; snapshot_download('embedl/all-MiniLM-L6-v2-quantized-trt', local_dir='.')"
|
| 91 |
+
python infer_pt2.py --sentence "A man is eating food." # pure PyTorch via torch.export
|
| 92 |
+
# or
|
| 93 |
+
python infer_trt.py --sentence "A man is eating food." # TensorRT (requires pycuda + tensorrt)
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
## Files
|
| 97 |
+
|
| 98 |
+
| File | Purpose |
|
| 99 |
+
|---|---|
|
| 100 |
+
| `embedl_all-MiniLM-L6-v2_int8.onnx` | INT8-quantized ONNX with Q/DQ nodes — feed to TensorRT. |
|
| 101 |
+
| `embedl_all-MiniLM-L6-v2_int8.pt2` | INT8-quantized `torch.export` ExportedProgram. |
|
| 102 |
+
| `infer_trt.py` | Build a TRT engine from the ONNX and run sample inference. |
|
| 103 |
+
| `infer_pt2.py` | Load the `.pt2` with `torch.export.load` and run sample inference. |
|
| 104 |
+
|
| 105 |
+
## Performance
|
| 106 |
+
|
| 107 |
+
Latency measured with TensorRT + `trtexec`, GPU compute time only
|
| 108 |
+
(`--noDataTransfers`), CUDA Graph + Spin Wait enabled, clocks locked
|
| 109 |
+
(`nvpmodel -m 0 && jetson_clocks` on Jetson).
|
| 110 |
+
|
| 111 |
+
<img src="https://huggingface.co/datasets/embedl/documentation-images/resolve/main/all-MiniLM-L6-v2-quantized-trt/all-MiniLM-L6-v2-quantized-trt__orin-mountain-view.svg" alt="All Minilm L6 V2 benchmark on NVIDIA Jetson AGX Orin">
|
| 112 |
+
|
| 113 |
+
### NVIDIA Jetson AGX Orin
|
| 114 |
+
|
| 115 |
+
| Configuration | Mean Latency | Speedup vs FP16 |
|
| 116 |
+
|---|---|---|
|
| 117 |
+
| TensorRT FP16 | 0.41 ms | 1.00x |
|
| 118 |
+
| TensorRT --best (unconstrained) | 0.41 ms | 1.01x |
|
| 119 |
+
| **Embedl Deploy INT8** | **0.38 ms** | **1.07x** |
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
## Accuracy
|
| 123 |
+
|
| 124 |
+
Evaluated on the stsb validation split. The quantized model
|
| 125 |
+
retains nearly all of the FP32 accuracy with a small tolerance.
|
| 126 |
+
|
| 127 |
+
| Model | Spearman ρ |
|
| 128 |
+
|---|---|
|
| 129 |
+
| `sentence-transformers/all-MiniLM-L6-v2` FP32 (ours) | 0.8672 |
|
| 130 |
+
| **Embedl All Minilm L6 V2 INT8** | **0.8646** |
|
| 131 |
+
|
| 132 |
+
## Creating Your Own Optimized Models
|
| 133 |
+
|
| 134 |
+
This artifact was produced with
|
| 135 |
+
[embedl-deploy](https://github.com/embedl/embedl-deploy),
|
| 136 |
+
Embedl's open-source PyTorch → TensorRT deployment library. You can
|
| 137 |
+
apply the same workflow to your own models — see
|
| 138 |
+
[the documentation](https://github.com/embedl/embedl-deploy#readme)
|
| 139 |
+
for installation and usage.
|
| 140 |
+
|
| 141 |
+
## License
|
| 142 |
+
|
| 143 |
+
| Component | License |
|
| 144 |
+
|---|---|
|
| 145 |
+
| Optimized model artifacts (this repo) | [Embedl Models Community Licence v1.0](https://github.com/embedl/embedl-models/blob/main/LICENSE) — no redistribution as a hosted service |
|
| 146 |
+
| Upstream architecture and weights | [All Minilm L6 V2 License](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) |
|
| 147 |
+
|
| 148 |
+
## Contact
|
| 149 |
+
|
| 150 |
+
We offer engineering support for on-prem/edge deployments and partner
|
| 151 |
+
co-marketing opportunities. Reach out at
|
| 152 |
+
[contact@embedl.com](mailto:contact@embedl.com), or open an issue on
|
| 153 |
+
[GitHub](https://github.com/embedl/embedl-deploy).
|
| 154 |
+
|
| 155 |
+
<!-- embedl-discord-banner:start -->
|
| 156 |
+
<style>
|
| 157 |
+
.embedl-discord-btn { transition: background 160ms ease, box-shadow 160ms ease; }
|
| 158 |
+
.embedl-discord-btn:hover { background: #6C77F5 !important; box-shadow: 0 8px 22px rgba(88,101,242,0.55) !important; }
|
| 159 |
+
</style>
|
| 160 |
+
<div style="background:radial-gradient(600px 220px at 0% 50%,rgba(88,101,242,0.22) 0%,rgba(88,101,242,0) 60%),radial-gradient(400px 180px at 100% 100%,rgba(88,101,242,0.10) 0%,rgba(88,101,242,0) 55%),linear-gradient(135deg,#0B1626 0%,#142338 100%);border:1px solid rgba(88,101,242,0.35);border-radius:12px;padding:22px 24px;margin:24px 0 0 0;color:#F2F6FA;box-shadow:0 4px 16px rgba(11,22,38,0.18);overflow:hidden;box-sizing:border-box;max-width:100%;">
|
| 161 |
+
<table style="width:100%;border-collapse:collapse;border:0;background:transparent;">
|
| 162 |
+
<tr style="background:transparent;">
|
| 163 |
+
<td style="vertical-align:middle;border:0;padding:0;background:transparent;">
|
| 164 |
+
<div style="display:inline-block;font-size:10px;letter-spacing:0.08em;text-transform:uppercase;font-weight:700;color:#A5B4FC;background:rgba(88,101,242,0.18);border:1px solid rgba(88,101,242,0.45);padding:4px 10px;border-radius:999px;margin-bottom:10px;white-space:nowrap;">Community & support</div>
|
| 165 |
+
<div style="font-size:15px;font-weight:700;line-height:1.35;color:#F2F6FA;margin-bottom:4px;">Need help with this model? Chat with the Embedl team and other engineers on <span style="color:#A5B4FC;white-space:nowrap;">Discord</span>.</div>
|
| 166 |
+
<div style="font-size:13px;color:#9BA7B5;">Quantization gotchas, hardware questions, fine-tuning tips — bring them all.</div>
|
| 167 |
+
</td>
|
| 168 |
+
<td width="1%" style="vertical-align:middle;border:0;padding:0 0 0 18px;white-space:nowrap;text-align:right;background:transparent;">
|
| 169 |
+
<a href="https://discord.gg/MTbMWdKqE" class="embedl-discord-btn" style="display:inline-block;font-size:13px;font-weight:600;padding:9px 14px;border-radius:6px;border:1px solid #5865F2;background:#5865F2;color:#FFFFFF;text-decoration:none;box-shadow:0 6px 18px rgba(88,101,242,0.35);">Join our Discord →</a>
|
| 170 |
+
</td>
|
| 171 |
+
</tr>
|
| 172 |
+
</table>
|
| 173 |
+
</div>
|
| 174 |
+
<!-- embedl-discord-banner:end -->
|
embedl_all-MiniLM-L6-v2_int8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:693d54efc1f18fa72f6bb6b9245cea05efb5d0c2e3b37c41d5c0e438d7edc5bb
|
| 3 |
+
size 89988793
|
embedl_all-MiniLM-L6-v2_int8.pt2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f24d335c16b7a94740668484f2f36021b1fd047df3bb50b41da18cf5f122251
|
| 3 |
+
size 134547563
|
infer_pt2.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (C) 2026 Embedl AB
|
| 2 |
+
"""Run inference on the Embedl All Minilm L6 V2 INT8 sentence encoder via torch.export.
|
| 3 |
+
|
| 4 |
+
Loads the shipped ``embedl_all-MiniLM-L6-v2_int8.pt2`` artifact with
|
| 5 |
+
``torch.export.load`` and encodes a sentence (or pair of sentences)
|
| 6 |
+
into an L2-normalised embedding. No TensorRT or ONNX runtime is
|
| 7 |
+
required — just PyTorch + transformers (for the tokenizer).
|
| 8 |
+
|
| 9 |
+
Usage::
|
| 10 |
+
|
| 11 |
+
python infer_pt2.py --sentence "A man is eating food."
|
| 12 |
+
python infer_pt2.py --sentence "A man is eating." \\
|
| 13 |
+
--sentence "A man is having a meal."
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import torch
|
| 20 |
+
from transformers import AutoTokenizer
|
| 21 |
+
|
| 22 |
+
PT2_PATH = Path(__file__).with_name("embedl_all-MiniLM-L6-v2_int8.pt2")
|
| 23 |
+
TOKENIZER_ID = "sentence-transformers/all-MiniLM-L6-v2"
|
| 24 |
+
MAX_LENGTH = 128
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def encode(model: torch.nn.Module, tokenizer, sentence: str) -> torch.Tensor:
    """Tokenize *sentence* and return its embedding as a 1-D tensor.

    The input is padded/truncated to the fixed sequence length the graph
    was exported with (``MAX_LENGTH``), so the model always receives a
    ``(1, MAX_LENGTH)`` input pair.
    """
    tokens = tokenizer(
        sentence,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )
    with torch.no_grad():
        batch_embedding = model(tokens["input_ids"], tokens["attention_mask"])
    # Drop the batch dimension: (1, dim) -> (dim,).
    return batch_embedding.squeeze(0)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main() -> None:
    """CLI entry point: load the exported INT8 model and print embeddings.

    Accepts ``--sentence`` one or more times; with two or more sentences
    the cosine similarity of the first pair is also printed.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--sentence",
        required=True,
        action="append",
        help="Sentence to encode. Pass twice to also print cosine similarity.",
    )
    args = parser.parse_args()

    if not PT2_PATH.exists():
        raise SystemExit(
            f"Expected {PT2_PATH.name} next to this script. "
            "Did you `huggingface-cli download` the repo?"
        )

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
    # The ExportedProgram captured the model in eval mode at export
    # time, so no further .eval() / no_grad toggling is needed (and
    # neither is supported on the .module() wrapper).
    model = torch.export.load(str(PT2_PATH)).module()

    embeddings = [encode(model, tokenizer, s) for s in args.sentence]

    for i, (sentence, emb) in enumerate(zip(args.sentence, embeddings), 1):
        first8 = ", ".join(f"{v:+.4f}" for v in emb[:8].tolist())
        print(f"[{i}] {sentence!r}")
        print(f" embedding shape: {tuple(emb.shape)}")
        print(f" first 8 dims: [{first8}]")

    if len(embeddings) >= 2:
        # Embeddings are L2-normalised upstream, so the dot product is
        # the cosine similarity directly.
        cos = torch.dot(embeddings[0], embeddings[1]).item()
        # Bug fix: the original used "\\n" inside the f-string, which
        # printed a literal backslash-n instead of starting a new line.
        print(f"\ncosine similarity (sentences 1 & 2): {cos:+.4f}")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Standard script entry point: run only when executed directly.
if __name__ == "__main__":
    main()
|
infer_trt.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (C) 2026 Embedl AB
|
| 2 |
+
"""Run inference on the Embedl All Minilm L6 V2 INT8 sentence encoder via TensorRT.
|
| 3 |
+
|
| 4 |
+
Builds a TensorRT engine from the shipped
|
| 5 |
+
``embedl_all-MiniLM-L6-v2_int8.onnx`` artifact (Q/DQ nodes baked in by
|
| 6 |
+
embedl-deploy) and encodes a sentence into an L2-normalised
|
| 7 |
+
embedding. The first run caches the engine to
|
| 8 |
+
``embedl_all-MiniLM-L6-v2_int8.engine`` so reuse is fast.
|
| 9 |
+
|
| 10 |
+
Requires TensorRT >= 10.1, pycuda (or cuda-python), and transformers
|
| 11 |
+
(for the tokenizer). Tested on NVIDIA Jetson AGX Orin (JetPack 6)
|
| 12 |
+
and discrete GPUs with CUDA 12.
|
| 13 |
+
|
| 14 |
+
Usage::
|
| 15 |
+
|
| 16 |
+
python infer_trt.py --sentence "A man is eating food."
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import argparse
|
| 20 |
+
import time
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
import numpy as np
|
| 24 |
+
import tensorrt as trt
|
| 25 |
+
from transformers import AutoTokenizer
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
import pycuda.autoinit # noqa: F401 (initializes CUDA context)
|
| 29 |
+
import pycuda.driver as cuda
|
| 30 |
+
except ImportError as exc: # pragma: no cover
|
| 31 |
+
raise SystemExit(
|
| 32 |
+
"pycuda is required. Install with: pip install pycuda"
|
| 33 |
+
) from exc
|
| 34 |
+
|
| 35 |
+
ONNX_PATH = Path(__file__).with_name("embedl_all-MiniLM-L6-v2_int8.onnx")
|
| 36 |
+
ENGINE_PATH = Path(__file__).with_name("embedl_all-MiniLM-L6-v2_int8.engine")
|
| 37 |
+
TOKENIZER_ID = "sentence-transformers/all-MiniLM-L6-v2"
|
| 38 |
+
MAX_LENGTH = 128
|
| 39 |
+
|
| 40 |
+
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def build_engine() -> bytes:
    """Parse the shipped INT8 ONNX and return a serialized TensorRT engine.

    Both FP16 and INT8 builder flags are set so TensorRT honours the
    Q/DQ scales baked into the ONNX while keeping remaining layers in
    half precision.

    Raises:
        RuntimeError: if ONNX parsing or the engine build fails.
    """
    builder = trt.Builder(TRT_LOGGER)
    creation_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(creation_flags)
    onnx_parser = trt.OnnxParser(network, TRT_LOGGER)

    with open(ONNX_PATH, "rb") as onnx_file:
        parsed_ok = onnx_parser.parse(onnx_file.read())
    if not parsed_ok:
        # Surface every parser diagnostic before bailing out.
        for err_idx in range(onnx_parser.num_errors):
            print(onnx_parser.get_error(err_idx))
        raise RuntimeError("ONNX parse failed.")

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.FP16)
    config.set_flag(trt.BuilderFlag.INT8)
    config.builder_optimization_level = 5

    serialized = builder.build_serialized_network(network, config)
    if serialized is None:
        raise RuntimeError("Engine build failed.")
    return bytes(serialized)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def load_or_build_engine() -> trt.ICudaEngine:
    """Deserialize the cached engine, building and caching it on first use."""
    if not ENGINE_PATH.exists():
        print(f"Building engine (first run) → {ENGINE_PATH.name} …")
        serialized = build_engine()
        # Cache the plan so subsequent runs skip the (slow) build step.
        ENGINE_PATH.write_bytes(serialized)
    else:
        serialized = ENGINE_PATH.read_bytes()
    runtime = trt.Runtime(TRT_LOGGER)
    return runtime.deserialize_cuda_engine(serialized)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def tokenize(tokenizer, sentence: str):
    """Return ``(input_ids, attention_mask)`` as contiguous int64 arrays.

    Padding/truncation to ``MAX_LENGTH`` matches the fixed input shape
    the TensorRT engine is built for.
    """
    encoded = tokenizer(
        sentence,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="np",
    )
    ids = np.ascontiguousarray(encoded["input_ids"].astype(np.int64))
    mask = np.ascontiguousarray(encoded["attention_mask"].astype(np.int64))
    return ids, mask
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def main() -> None:
    """CLI entry point: tokenize one sentence, run it through the TensorRT
    engine, and print the latency plus the first dims of the embedding."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--sentence", required=True, type=str)
    args = parser.parse_args()

    if not ONNX_PATH.exists():
        raise SystemExit(
            f"Expected {ONNX_PATH.name} next to this script. "
            "Did you download the HF repo?"
        )

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
    input_ids, attention_mask = tokenize(tokenizer, args.sentence)

    engine = load_or_build_engine()
    context = engine.create_execution_context()

    # Resolve I/O tensor names by mode (input vs output) — order in
    # the engine isn't guaranteed to match get_tensor_name(0..N).
    input_names = []
    output_names = []
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        mode = engine.get_tensor_mode(name)
        if mode == trt.TensorIOMode.INPUT:
            input_names.append(name)
        else:
            output_names.append(name)
    if len(input_names) != 2 or len(output_names) != 1:
        raise RuntimeError(
            f"Expected 2 inputs / 1 output, got "
            f"{len(input_names)}/{len(output_names)}."
        )

    # Feed the inputs by canonical name so input_ids / attention_mask
    # bind to the right tensor regardless of engine ordering.
    inputs = {"input_ids": input_ids, "attention_mask": attention_mask}

    # NOTE(review): assumes a static-shape engine (built from the fixed
    # seq_len ONNX), so get_tensor_shape has no -1 dims here — confirm.
    out_shape = tuple(engine.get_tensor_shape(output_names[0]))
    h_out = np.empty(out_shape, dtype=np.float32)

    # Allocate device buffers sized to the host arrays.
    d_inputs = {}
    for name in input_names:
        arr = inputs[name]
        d_inputs[name] = cuda.mem_alloc(arr.nbytes)
    d_out = cuda.mem_alloc(h_out.nbytes)
    stream = cuda.Stream()

    # Copy inputs host→device and bind every tensor address on the
    # execution context before launching.
    for name in input_names:
        cuda.memcpy_htod_async(d_inputs[name], inputs[name], stream)
        context.set_tensor_address(name, int(d_inputs[name]))
    context.set_tensor_address(output_names[0], int(d_out))

    # Warm-up + timed run.
    for _ in range(5):
        context.execute_async_v3(stream.handle)
    stream.synchronize()
    t0 = time.perf_counter()
    context.execute_async_v3(stream.handle)
    stream.synchronize()
    latency_ms = (time.perf_counter() - t0) * 1000.0

    # Output copy happens after timing so only the enqueue+execute of
    # the single run is measured.
    cuda.memcpy_dtoh_async(h_out, d_out, stream)
    stream.synchronize()

    embedding = h_out.reshape(-1)
    first8 = ", ".join(f"{v:+.4f}" for v in embedding[:8])
    print(f"Latency (single-run, GPU compute): {latency_ms:.2f} ms")
    print(f"Sentence: {args.sentence!r}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"First 8 dims: [{first8}]")
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# Standard script entry point: run only when executed directly.
if __name__ == "__main__":
    main()
|