parakeet-tdt-litert / manifest.json
spybyscript's picture
Upload LiteRT FP16 bundle
2a68489 verified
{
"model": "nvidia/parakeet-tdt-0.6b-v3",
"torch_version": "2.11.0+cu130",
"model_class": "EncDecRNNTBPEModel",
"vocab_size": 8192,
"blank_id": 8192,
"durations": [
0,
1,
2,
3,
4
],
"num_durations": 5,
"joint_output_dim": 8198,
"joint_token_logits_slice": [
0,
8193
],
"joint_duration_logits_slice": [
8193,
8198
],
"encoder": {
"d_model": 1024,
"subsampling_factor": 8,
"n_layers": 24,
"n_heads": 8,
"feat_in": 128,
"attention_mode": "rel_pos",
"att_context_size": null,
"buckets": [
{
"n_mel_frames": 1500,
"n_encoder_frames": 188,
"input_shape": [
1,
128,
1500
],
"output_shape": [
1,
1024,
188
],
"artifact": "encoder_T1500.pt2",
"size_mb": 2366.85
}
],
"multisig": false
},
"decoder": {
"num_layers": 2,
"hidden": 640,
"embed_dim": 640
},
"joint": {
"d_enc": 1024,
"d_pred": 640,
"joint_dim": 640
},
"preprocessor": {
"sample_rate": 16000,
"n_fft": 512,
"win_length": 400,
"hop_length": 160,
"n_mels": 128,
"preemph": 0.97,
"log": true,
"frame_rate_hz_post_subsample": 12.5
},
"artifacts": {
"decoder_step": {
"filename": "decoder_step.pt2",
"size_mb": 45.07,
"input_shapes": {
"token": [
1,
1
],
"h": [
2,
1,
640
],
"c": [
2,
1,
640
]
},
"input_dtypes": {
"token": "int64",
"h": "float32",
"c": "float32"
},
"output_shapes": {
"g": [
1,
1,
640
],
"h": [
2,
1,
640
],
"c": [
2,
1,
640
]
}
},
"joint_step": {
"filename": "joint_step.pt2",
"size_mb": 24.14,
"input_shapes": {
"enc_frame": [
1,
1024,
1
],
"pred_frame": [
1,
640,
1
]
},
"output_shape": [
1,
1,
1,
8198
]
}
},
"tokenizer": {
"saved": true,
"method": "serialized_model_proto",
"vocab_size": 8192
},
"litert": {
"quant": "fp16",
"results": [
{
"graph": "encoder",
"source_artifact": "encoder_T1500.pt2",
"output_artifact": "encoder_T1500.tflite",
"size_mb": 1150.88,
"convert_seconds": 158.59,
"quant": "fp16",
"torch_output_shapes": [
[
1,
1024,
188
],
[
1
]
],
"parity": {
"ok": true,
"max_abs_diff": 0.0,
"per_output_diffs": [
[
"shape mismatch",
[
1
],
[
1,
1024,
188
]
],
[
"shape mismatch",
[
1,
1024,
188
],
[
1
]
]
],
"tflite_output_shapes": [
[
1
],
[
1,
1024,
188
]
],
"torch_output_shapes": [
[
1,
1024,
188
],
[
1
]
]
}
},
{
"graph": "decoder_step",
"source_artifact": "decoder_step.pt2",
"output_artifact": "decoder_step.tflite",
"size_mb": 22.55,
"convert_seconds": 1.92,
"quant": "fp16",
"torch_output_shapes": [
[
1,
1,
640
],
[
2,
1,
640
],
[
2,
1,
640
]
],
"parity": {
"ok": true,
"max_abs_diff": 0.0044100284576416016,
"per_output_diffs": [
[
"shape mismatch",
[
2,
1,
640
],
[
1,
1,
640
]
],
[
"shape mismatch",
[
1,
1,
640
],
[
2,
1,
640
]
],
0.0044100284576416016
],
"tflite_output_shapes": [
[
2,
1,
640
],
[
1,
1,
640
],
[
2,
1,
640
]
],
"torch_output_shapes": [
[
1,
1,
640
],
[
2,
1,
640
],
[
2,
1,
640
]
]
}
},
{
"graph": "joint_step",
"source_artifact": "joint_step.pt2",
"output_artifact": "joint_step.tflite",
"size_mb": 12.08,
"convert_seconds": 1.61,
"quant": "fp16",
"torch_output_shapes": [
[
1,
1,
1,
8198
]
],
"parity": {
"ok": true,
"max_abs_diff": 0.408447265625,
"per_output_diffs": [
0.408447265625
],
"tflite_output_shapes": [
[
1,
1,
1,
8198
]
],
"torch_output_shapes": [
[
1,
1,
1,
8198
]
]
}
}
]
}
}