spybyscript committed on
Commit
455fc08
·
verified ·
1 Parent(s): 5ca316d

Upload LiteRT FP16 multi-sig bundle

Browse files
Files changed (3) hide show
  1. README.md +10 -2
  2. encoder_multisig.tflite +2 -2
  3. manifest.json +16 -14
README.md CHANGED
@@ -63,15 +63,23 @@ Each signature has the same I/O shape contract:
63
  ```
64
  inputs:
65
  audio_signal : float32 [1, 128, T_mel] # log-mel features (NeMo preproc)
66
- length : int64 [1] # actual mel frames used (≤ T_mel)
67
  outputs:
68
  encoded : float32 [1, 1024, T_enc] # T_enc = (T_mel - 4) // 8
69
- encoded_lengths : int64 [1]
70
  ```
71
 
72
  Pick the smallest bucket that fits your input; pad shorter inputs with zeros
73
  and pass the true length.
74
 
 
 
 
 
 
 
 
 
75
  ## Decoder + joint contract
76
 
77
  ```
 
63
  ```
64
  inputs:
65
  audio_signal : float32 [1, 128, T_mel] # log-mel features (NeMo preproc)
66
+ length : int32 [1] # actual mel frames used (≤ T_mel)
67
  outputs:
68
  encoded : float32 [1, 1024, T_enc] # T_enc = (T_mel - 4) // 8
69
+ encoded_lengths : int32 [1]
70
  ```
71
 
72
  Pick the smallest bucket that fits your input; pad shorter inputs with zeros
73
  and pass the true length.
74
 
75
+ **Why int32, not int64.** LiteRT's GPU/NPU delegates (LiteRT-CL / OpenCL,
76
+ NPU accelerator) reject int64 tensors entirely. With int64 length, every
77
+ internal CAST node touching it falls back to CPU and `CompiledModel.create()`
78
+ fails outright on Android with the GPU backend. This bundle is exported with
79
+ int32 length end-to-end (input → internal mask arange/comparisons → output
80
+ `encoded_lengths`). int32 covers >2 billion mel frames (~5,900 hours at a 10 ms hop), so no
81
+ practical range loss.
82
+
83
  ## Decoder + joint contract
84
 
85
  ```
encoder_multisig.tflite CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a97075644590cedce95a53083c876f56dce22d2e1e5807bc4ca2d6879f6183c8
3
- size 1249026196
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd24fd99bc73f92b5f9eab066e5e1bed907e0e2d733cb3472a4f0512c076e002
3
+ size 1249003688
manifest.json CHANGED
@@ -27,6 +27,8 @@
27
  "n_layers": 24,
28
  "n_heads": 8,
29
  "feat_in": 128,
 
 
30
  "buckets": [
31
  {
32
  "n_mel_frames": 300,
@@ -171,8 +173,8 @@
171
  "graph": "encoder",
172
  "source_artifact": "encoder_dynamicT.pt2",
173
  "output_artifact": "encoder_multisig.tflite",
174
- "size_mb": 1191.16,
175
- "convert_seconds": 402.16,
176
  "quant": "fp16",
177
  "multisig": true,
178
  "signatures": [
@@ -184,33 +186,33 @@
184
  "parity_per_signature": {
185
  "forward_T300": {
186
  "ok": true,
187
- "max_abs_diff": 0.0033329054713249207,
188
  "per_output_diffs": [
189
- 0.0033329054713249207,
190
  0.0
191
  ]
192
  },
193
  "forward_T500": {
194
  "ok": true,
195
- "max_abs_diff": 0.006780040450394154,
196
  "per_output_diffs": [
197
- 0.006780040450394154,
198
  0.0
199
  ]
200
  },
201
  "forward_T700": {
202
  "ok": true,
203
- "max_abs_diff": 0.0005690590478479862,
204
  "per_output_diffs": [
205
- 0.0005690590478479862,
206
  0.0
207
  ]
208
  },
209
  "forward_T1500": {
210
  "ok": true,
211
- "max_abs_diff": 0.003892328590154648,
212
  "per_output_diffs": [
213
- 0.003892328590154648,
214
  0.0
215
  ]
216
  }
@@ -221,7 +223,7 @@
221
  "source_artifact": "decoder_step.pt2",
222
  "output_artifact": "decoder_step.tflite",
223
  "size_mb": 22.55,
224
- "convert_seconds": 3.81,
225
  "quant": "fp16",
226
  "torch_output_shapes": [
227
  [
@@ -313,7 +315,7 @@
313
  "source_artifact": "joint_step.pt2",
314
  "output_artifact": "joint_step.tflite",
315
  "size_mb": 12.08,
316
- "convert_seconds": 1.13,
317
  "quant": "fp16",
318
  "torch_output_shapes": [
319
  [
@@ -325,9 +327,9 @@
325
  ],
326
  "parity": {
327
  "ok": true,
328
- "max_abs_diff": 0.275390625,
329
  "per_output_diffs": [
330
- 0.275390625
331
  ],
332
  "tflite_output_shapes": [
333
  [
 
27
  "n_layers": 24,
28
  "n_heads": 8,
29
  "feat_in": 128,
30
+ "attention_mode": "rel_pos",
31
+ "att_context_size": null,
32
  "buckets": [
33
  {
34
  "n_mel_frames": 300,
 
173
  "graph": "encoder",
174
  "source_artifact": "encoder_dynamicT.pt2",
175
  "output_artifact": "encoder_multisig.tflite",
176
+ "size_mb": 1191.14,
177
+ "convert_seconds": 367.97,
178
  "quant": "fp16",
179
  "multisig": true,
180
  "signatures": [
 
186
  "parity_per_signature": {
187
  "forward_T300": {
188
  "ok": true,
189
+ "max_abs_diff": 0.009477382525801659,
190
  "per_output_diffs": [
191
+ 0.009477382525801659,
192
  0.0
193
  ]
194
  },
195
  "forward_T500": {
196
  "ok": true,
197
+ "max_abs_diff": 0.0061398837715387344,
198
  "per_output_diffs": [
199
+ 0.0061398837715387344,
200
  0.0
201
  ]
202
  },
203
  "forward_T700": {
204
  "ok": true,
205
+ "max_abs_diff": 0.001271696761250496,
206
  "per_output_diffs": [
207
+ 0.001271696761250496,
208
  0.0
209
  ]
210
  },
211
  "forward_T1500": {
212
  "ok": true,
213
+ "max_abs_diff": 0.004102766513824463,
214
  "per_output_diffs": [
215
+ 0.004102766513824463,
216
  0.0
217
  ]
218
  }
 
223
  "source_artifact": "decoder_step.pt2",
224
  "output_artifact": "decoder_step.tflite",
225
  "size_mb": 22.55,
226
+ "convert_seconds": 2.72,
227
  "quant": "fp16",
228
  "torch_output_shapes": [
229
  [
 
315
  "source_artifact": "joint_step.pt2",
316
  "output_artifact": "joint_step.tflite",
317
  "size_mb": 12.08,
318
+ "convert_seconds": 1.08,
319
  "quant": "fp16",
320
  "torch_output_shapes": [
321
  [
 
327
  ],
328
  "parity": {
329
  "ok": true,
330
+ "max_abs_diff": 0.33984375,
331
  "per_output_diffs": [
332
+ 0.33984375
333
  ],
334
  "tflite_output_shapes": [
335
  [