{ "model": "nvidia/parakeet-tdt-0.6b-v3", "torch_version": "2.11.0+cu130", "model_class": "EncDecRNNTBPEModel", "vocab_size": 8192, "blank_id": 8192, "durations": [ 0, 1, 2, 3, 4 ], "num_durations": 5, "joint_output_dim": 8198, "joint_token_logits_slice": [ 0, 8193 ], "joint_duration_logits_slice": [ 8193, 8198 ], "encoder": { "d_model": 1024, "subsampling_factor": 8, "n_layers": 24, "n_heads": 8, "feat_in": 128, "attention_mode": "rel_pos", "att_context_size": null, "buckets": [ { "n_mel_frames": 1500, "n_encoder_frames": 187, "input_shape": [ 1, 128, 1500 ], "output_shape": [ 1, 1024, 188 ], "artifact": "encoder_T1500.pt2", "size_mb": 2366.85 } ], "multisig": false }, "decoder": { "num_layers": 2, "hidden": 640, "embed_dim": 640 }, "joint": { "d_enc": 1024, "d_pred": 640, "joint_dim": 640 }, "preprocessor": { "sample_rate": 16000, "n_fft": 512, "win_length": 400, "hop_length": 160, "n_mels": 128, "preemph": 0.97, "log": true, "frame_rate_hz_post_subsample": 12.5 }, "artifacts": { "decoder_step": { "filename": "decoder_step.pt2", "size_mb": 45.07, "input_shapes": { "token": [ 1, 1 ], "h": [ 2, 1, 640 ], "c": [ 2, 1, 640 ] }, "input_dtypes": { "token": "int64", "h": "float32", "c": "float32" }, "output_shapes": { "g": [ 1, 1, 640 ], "h": [ 2, 1, 640 ], "c": [ 2, 1, 640 ] } }, "joint_step": { "filename": "joint_step.pt2", "size_mb": 24.14, "input_shapes": { "enc_frame": [ 1, 1024, 1 ], "pred_frame": [ 1, 640, 1 ] }, "output_shape": [ 1, 1, 1, 8198 ] } }, "tokenizer": { "saved": true, "method": "serialized_model_proto", "vocab_size": 8192 }, "litert": { "quant": "fp16", "results": [ { "graph": "encoder", "source_artifact": "encoder_T1500.pt2", "output_artifact": "encoder_T1500.tflite", "size_mb": 1150.88, "convert_seconds": 158.59, "quant": "fp16", "torch_output_shapes": [ [ 1, 1024, 188 ], [ 1 ] ], "parity": { "ok": true, "max_abs_diff": 0.0, "per_output_diffs": [ [ "shape mismatch", [ 1 ], [ 1, 1024, 188 ] ], [ "shape mismatch", [ 1, 1024, 188 ], [ 1 ] ] ], "tflite_output_shapes": [ [ 1 ], [ 1, 1024, 188 ] ], "torch_output_shapes": [ [ 1, 1024, 188 ], [ 1 ] ] } }, { "graph": "decoder_step", "source_artifact": "decoder_step.pt2", "output_artifact": "decoder_step.tflite", "size_mb": 22.55, "convert_seconds": 1.92, "quant": "fp16", "torch_output_shapes": [ [ 1, 1, 640 ], [ 2, 1, 640 ], [ 2, 1, 640 ] ], "parity": { "ok": true, "max_abs_diff": 0.0044100284576416016, "per_output_diffs": [ [ "shape mismatch", [ 2, 1, 640 ], [ 1, 1, 640 ] ], [ "shape mismatch", [ 1, 1, 640 ], [ 2, 1, 640 ] ], 0.0044100284576416016 ], "tflite_output_shapes": [ [ 2, 1, 640 ], [ 1, 1, 640 ], [ 2, 1, 640 ] ], "torch_output_shapes": [ [ 1, 1, 640 ], [ 2, 1, 640 ], [ 2, 1, 640 ] ] } }, { "graph": "joint_step", "source_artifact": "joint_step.pt2", "output_artifact": "joint_step.tflite", "size_mb": 12.08, "convert_seconds": 1.61, "quant": "fp16", "torch_output_shapes": [ [ 1, 1, 1, 8198 ] ], "parity": { "ok": true, "max_abs_diff": 0.408447265625, "per_output_diffs": [ 0.408447265625 ], "tflite_output_shapes": [ [ 1, 1, 1, 8198 ] ], "torch_output_shapes": [ [ 1, 1, 1, 8198 ] ] } } ] } }