Automatic Speech Recognition
LiteRT
LiteRT
speech
audio
parakeet
tdt
on-device
mobile
android
streaming
Instructions to use spybyscript/parakeet-tdt-litert with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- LiteRT
How to use spybyscript/parakeet-tdt-litert with LiteRT:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
Upload LiteRT FP16 multi-sig bundle
Browse files
- README.md +10 -2
- encoder_multisig.tflite +2 -2
- manifest.json +16 -14
README.md
CHANGED
|
@@ -63,15 +63,23 @@ Each signature has the same I/O shape contract:
|
|
| 63 |
```
|
| 64 |
inputs:
|
| 65 |
audio_signal : float32 [1, 128, T_mel] # log-mel features (NeMo preproc)
|
| 66 |
-
length :
|
| 67 |
outputs:
|
| 68 |
encoded : float32 [1, 1024, T_enc] # T_enc = (T_mel - 4) // 8
|
| 69 |
-
encoded_lengths :
|
| 70 |
```
|
| 71 |
|
| 72 |
Pick the smallest bucket that fits your input; pad shorter inputs with zeros
|
| 73 |
and pass the true length.
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
## Decoder + joint contract
|
| 76 |
|
| 77 |
```
|
|
|
|
| 63 |
```
|
| 64 |
inputs:
|
| 65 |
audio_signal : float32 [1, 128, T_mel] # log-mel features (NeMo preproc)
|
| 66 |
+
length : int32 [1] # actual mel frames used (≤ T_mel)
|
| 67 |
outputs:
|
| 68 |
encoded : float32 [1, 1024, T_enc] # T_enc = (T_mel - 4) // 8
|
| 69 |
+
encoded_lengths : int32 [1]
|
| 70 |
```
|
| 71 |
|
| 72 |
Pick the smallest bucket that fits your input; pad shorter inputs with zeros
|
| 73 |
and pass the true length.
|
| 74 |
|
| 75 |
+
**Why int32, not int64.** LiteRT's GPU/NPU delegates (LiteRT-CL / OpenCL,
|
| 76 |
+
NPU accelerator) reject int64 tensors entirely. With int64 length, every
|
| 77 |
+
internal CAST node touching it falls back to CPU and `CompiledModel.create()`
|
| 78 |
+
fails outright on Android with the GPU backend. This bundle is exported with
|
| 79 |
+
int32 length end-to-end (input → internal mask arange/comparisons → output
|
| 80 |
+
`encoded_lengths`). int32 covers >2 billion mel frames (thousands of hours of audio at a 10 ms frame hop), so no
|
| 81 |
+
practical range loss.
|
| 82 |
+
|
| 83 |
## Decoder + joint contract
|
| 84 |
|
| 85 |
```
|
encoder_multisig.tflite
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bd24fd99bc73f92b5f9eab066e5e1bed907e0e2d733cb3472a4f0512c076e002
|
| 3 |
+
size 1249003688
|
manifest.json
CHANGED
|
@@ -27,6 +27,8 @@
|
|
| 27 |
"n_layers": 24,
|
| 28 |
"n_heads": 8,
|
| 29 |
"feat_in": 128,
|
|
|
|
|
|
|
| 30 |
"buckets": [
|
| 31 |
{
|
| 32 |
"n_mel_frames": 300,
|
|
@@ -171,8 +173,8 @@
|
|
| 171 |
"graph": "encoder",
|
| 172 |
"source_artifact": "encoder_dynamicT.pt2",
|
| 173 |
"output_artifact": "encoder_multisig.tflite",
|
| 174 |
-
"size_mb": 1191.
|
| 175 |
-
"convert_seconds":
|
| 176 |
"quant": "fp16",
|
| 177 |
"multisig": true,
|
| 178 |
"signatures": [
|
|
@@ -184,33 +186,33 @@
|
|
| 184 |
"parity_per_signature": {
|
| 185 |
"forward_T300": {
|
| 186 |
"ok": true,
|
| 187 |
-
"max_abs_diff": 0.
|
| 188 |
"per_output_diffs": [
|
| 189 |
-
0.
|
| 190 |
0.0
|
| 191 |
]
|
| 192 |
},
|
| 193 |
"forward_T500": {
|
| 194 |
"ok": true,
|
| 195 |
-
"max_abs_diff": 0.
|
| 196 |
"per_output_diffs": [
|
| 197 |
-
0.
|
| 198 |
0.0
|
| 199 |
]
|
| 200 |
},
|
| 201 |
"forward_T700": {
|
| 202 |
"ok": true,
|
| 203 |
-
"max_abs_diff": 0.
|
| 204 |
"per_output_diffs": [
|
| 205 |
-
0.
|
| 206 |
0.0
|
| 207 |
]
|
| 208 |
},
|
| 209 |
"forward_T1500": {
|
| 210 |
"ok": true,
|
| 211 |
-
"max_abs_diff": 0.
|
| 212 |
"per_output_diffs": [
|
| 213 |
-
0.
|
| 214 |
0.0
|
| 215 |
]
|
| 216 |
}
|
|
@@ -221,7 +223,7 @@
|
|
| 221 |
"source_artifact": "decoder_step.pt2",
|
| 222 |
"output_artifact": "decoder_step.tflite",
|
| 223 |
"size_mb": 22.55,
|
| 224 |
-
"convert_seconds":
|
| 225 |
"quant": "fp16",
|
| 226 |
"torch_output_shapes": [
|
| 227 |
[
|
|
@@ -313,7 +315,7 @@
|
|
| 313 |
"source_artifact": "joint_step.pt2",
|
| 314 |
"output_artifact": "joint_step.tflite",
|
| 315 |
"size_mb": 12.08,
|
| 316 |
-
"convert_seconds": 1.
|
| 317 |
"quant": "fp16",
|
| 318 |
"torch_output_shapes": [
|
| 319 |
[
|
|
@@ -325,9 +327,9 @@
|
|
| 325 |
],
|
| 326 |
"parity": {
|
| 327 |
"ok": true,
|
| 328 |
-
"max_abs_diff": 0.
|
| 329 |
"per_output_diffs": [
|
| 330 |
-
0.
|
| 331 |
],
|
| 332 |
"tflite_output_shapes": [
|
| 333 |
[
|
|
|
|
| 27 |
"n_layers": 24,
|
| 28 |
"n_heads": 8,
|
| 29 |
"feat_in": 128,
|
| 30 |
+
"attention_mode": "rel_pos",
|
| 31 |
+
"att_context_size": null,
|
| 32 |
"buckets": [
|
| 33 |
{
|
| 34 |
"n_mel_frames": 300,
|
|
|
|
| 173 |
"graph": "encoder",
|
| 174 |
"source_artifact": "encoder_dynamicT.pt2",
|
| 175 |
"output_artifact": "encoder_multisig.tflite",
|
| 176 |
+
"size_mb": 1191.14,
|
| 177 |
+
"convert_seconds": 367.97,
|
| 178 |
"quant": "fp16",
|
| 179 |
"multisig": true,
|
| 180 |
"signatures": [
|
|
|
|
| 186 |
"parity_per_signature": {
|
| 187 |
"forward_T300": {
|
| 188 |
"ok": true,
|
| 189 |
+
"max_abs_diff": 0.009477382525801659,
|
| 190 |
"per_output_diffs": [
|
| 191 |
+
0.009477382525801659,
|
| 192 |
0.0
|
| 193 |
]
|
| 194 |
},
|
| 195 |
"forward_T500": {
|
| 196 |
"ok": true,
|
| 197 |
+
"max_abs_diff": 0.0061398837715387344,
|
| 198 |
"per_output_diffs": [
|
| 199 |
+
0.0061398837715387344,
|
| 200 |
0.0
|
| 201 |
]
|
| 202 |
},
|
| 203 |
"forward_T700": {
|
| 204 |
"ok": true,
|
| 205 |
+
"max_abs_diff": 0.001271696761250496,
|
| 206 |
"per_output_diffs": [
|
| 207 |
+
0.001271696761250496,
|
| 208 |
0.0
|
| 209 |
]
|
| 210 |
},
|
| 211 |
"forward_T1500": {
|
| 212 |
"ok": true,
|
| 213 |
+
"max_abs_diff": 0.004102766513824463,
|
| 214 |
"per_output_diffs": [
|
| 215 |
+
0.004102766513824463,
|
| 216 |
0.0
|
| 217 |
]
|
| 218 |
}
|
|
|
|
| 223 |
"source_artifact": "decoder_step.pt2",
|
| 224 |
"output_artifact": "decoder_step.tflite",
|
| 225 |
"size_mb": 22.55,
|
| 226 |
+
"convert_seconds": 2.72,
|
| 227 |
"quant": "fp16",
|
| 228 |
"torch_output_shapes": [
|
| 229 |
[
|
|
|
|
| 315 |
"source_artifact": "joint_step.pt2",
|
| 316 |
"output_artifact": "joint_step.tflite",
|
| 317 |
"size_mb": 12.08,
|
| 318 |
+
"convert_seconds": 1.08,
|
| 319 |
"quant": "fp16",
|
| 320 |
"torch_output_shapes": [
|
| 321 |
[
|
|
|
|
| 327 |
],
|
| 328 |
"parity": {
|
| 329 |
"ok": true,
|
| 330 |
+
"max_abs_diff": 0.33984375,
|
| 331 |
"per_output_diffs": [
|
| 332 |
+
0.33984375
|
| 333 |
],
|
| 334 |
"tflite_output_shapes": [
|
| 335 |
[
|