Automatic Speech Recognition
LiteRT
LiteRT
speech
audio
parakeet
tdt
on-device
mobile
android
streaming
Instructions to use spybyscript/parakeet-tdt-litert with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- LiteRT
How to use spybyscript/parakeet-tdt-litert with LiteRT:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "model": "nvidia/parakeet-tdt-0.6b-v3", | |
| "torch_version": "2.11.0+cu130", | |
| "model_class": "EncDecRNNTBPEModel", | |
| "vocab_size": 8192, | |
| "blank_id": 8192, | |
| "durations": [ | |
| 0, | |
| 1, | |
| 2, | |
| 3, | |
| 4 | |
| ], | |
| "num_durations": 5, | |
| "joint_output_dim": 8198, | |
| "joint_token_logits_slice": [ | |
| 0, | |
| 8193 | |
| ], | |
| "joint_duration_logits_slice": [ | |
| 8193, | |
| 8198 | |
| ], | |
| "encoder": { | |
| "d_model": 1024, | |
| "subsampling_factor": 8, | |
| "n_layers": 24, | |
| "n_heads": 8, | |
| "feat_in": 128, | |
| "attention_mode": "rel_pos", | |
| "att_context_size": null, | |
| "buckets": [ | |
| { | |
| "n_mel_frames": 1500, | |
| "n_encoder_frames": 187, | |
| "input_shape": [ | |
| 1, | |
| 128, | |
| 1500 | |
| ], | |
| "output_shape": [ | |
| 1, | |
| 1024, | |
| 188 | |
| ], | |
| "artifact": "encoder_T1500.pt2", | |
| "size_mb": 2366.85 | |
| } | |
| ], | |
| "multisig": false | |
| }, | |
| "decoder": { | |
| "num_layers": 2, | |
| "hidden": 640, | |
| "embed_dim": 640 | |
| }, | |
| "joint": { | |
| "d_enc": 1024, | |
| "d_pred": 640, | |
| "joint_dim": 640 | |
| }, | |
| "preprocessor": { | |
| "sample_rate": 16000, | |
| "n_fft": 512, | |
| "win_length": 400, | |
| "hop_length": 160, | |
| "n_mels": 128, | |
| "preemph": 0.97, | |
| "log": true, | |
| "frame_rate_hz_post_subsample": 12.5 | |
| }, | |
| "artifacts": { | |
| "decoder_step": { | |
| "filename": "decoder_step.pt2", | |
| "size_mb": 45.07, | |
| "input_shapes": { | |
| "token": [ | |
| 1, | |
| 1 | |
| ], | |
| "h": [ | |
| 2, | |
| 1, | |
| 640 | |
| ], | |
| "c": [ | |
| 2, | |
| 1, | |
| 640 | |
| ] | |
| }, | |
| "input_dtypes": { | |
| "token": "int64", | |
| "h": "float32", | |
| "c": "float32" | |
| }, | |
| "output_shapes": { | |
| "g": [ | |
| 1, | |
| 1, | |
| 640 | |
| ], | |
| "h": [ | |
| 2, | |
| 1, | |
| 640 | |
| ], | |
| "c": [ | |
| 2, | |
| 1, | |
| 640 | |
| ] | |
| } | |
| }, | |
| "joint_step": { | |
| "filename": "joint_step.pt2", | |
| "size_mb": 24.14, | |
| "input_shapes": { | |
| "enc_frame": [ | |
| 1, | |
| 1024, | |
| 1 | |
| ], | |
| "pred_frame": [ | |
| 1, | |
| 640, | |
| 1 | |
| ] | |
| }, | |
| "output_shape": [ | |
| 1, | |
| 1, | |
| 1, | |
| 8198 | |
| ] | |
| } | |
| }, | |
| "tokenizer": { | |
| "saved": true, | |
| "method": "serialized_model_proto", | |
| "vocab_size": 8192 | |
| }, | |
| "litert": { | |
| "quant": "fp16", | |
| "results": [ | |
| { | |
| "graph": "encoder", | |
| "source_artifact": "encoder_T1500.pt2", | |
| "output_artifact": "encoder_T1500.tflite", | |
| "size_mb": 1150.88, | |
| "convert_seconds": 158.59, | |
| "quant": "fp16", | |
| "torch_output_shapes": [ | |
| [ | |
| 1, | |
| 1024, | |
| 188 | |
| ], | |
| [ | |
| 1 | |
| ] | |
| ], | |
| "parity": { | |
| "ok": true, | |
| "max_abs_diff": 0.0, | |
| "per_output_diffs": [ | |
| [ | |
| "shape mismatch", | |
| [ | |
| 1 | |
| ], | |
| [ | |
| 1, | |
| 1024, | |
| 188 | |
| ] | |
| ], | |
| [ | |
| "shape mismatch", | |
| [ | |
| 1, | |
| 1024, | |
| 188 | |
| ], | |
| [ | |
| 1 | |
| ] | |
| ] | |
| ], | |
| "tflite_output_shapes": [ | |
| [ | |
| 1 | |
| ], | |
| [ | |
| 1, | |
| 1024, | |
| 188 | |
| ] | |
| ], | |
| "torch_output_shapes": [ | |
| [ | |
| 1, | |
| 1024, | |
| 188 | |
| ], | |
| [ | |
| 1 | |
| ] | |
| ] | |
| } | |
| }, | |
| { | |
| "graph": "decoder_step", | |
| "source_artifact": "decoder_step.pt2", | |
| "output_artifact": "decoder_step.tflite", | |
| "size_mb": 22.55, | |
| "convert_seconds": 1.92, | |
| "quant": "fp16", | |
| "torch_output_shapes": [ | |
| [ | |
| 1, | |
| 1, | |
| 640 | |
| ], | |
| [ | |
| 2, | |
| 1, | |
| 640 | |
| ], | |
| [ | |
| 2, | |
| 1, | |
| 640 | |
| ] | |
| ], | |
| "parity": { | |
| "ok": true, | |
| "max_abs_diff": 0.0044100284576416016, | |
| "per_output_diffs": [ | |
| [ | |
| "shape mismatch", | |
| [ | |
| 2, | |
| 1, | |
| 640 | |
| ], | |
| [ | |
| 1, | |
| 1, | |
| 640 | |
| ] | |
| ], | |
| [ | |
| "shape mismatch", | |
| [ | |
| 1, | |
| 1, | |
| 640 | |
| ], | |
| [ | |
| 2, | |
| 1, | |
| 640 | |
| ] | |
| ], | |
| 0.0044100284576416016 | |
| ], | |
| "tflite_output_shapes": [ | |
| [ | |
| 2, | |
| 1, | |
| 640 | |
| ], | |
| [ | |
| 1, | |
| 1, | |
| 640 | |
| ], | |
| [ | |
| 2, | |
| 1, | |
| 640 | |
| ] | |
| ], | |
| "torch_output_shapes": [ | |
| [ | |
| 1, | |
| 1, | |
| 640 | |
| ], | |
| [ | |
| 2, | |
| 1, | |
| 640 | |
| ], | |
| [ | |
| 2, | |
| 1, | |
| 640 | |
| ] | |
| ] | |
| } | |
| }, | |
| { | |
| "graph": "joint_step", | |
| "source_artifact": "joint_step.pt2", | |
| "output_artifact": "joint_step.tflite", | |
| "size_mb": 12.08, | |
| "convert_seconds": 1.61, | |
| "quant": "fp16", | |
| "torch_output_shapes": [ | |
| [ | |
| 1, | |
| 1, | |
| 1, | |
| 8198 | |
| ] | |
| ], | |
| "parity": { | |
| "ok": true, | |
| "max_abs_diff": 0.408447265625, | |
| "per_output_diffs": [ | |
| 0.408447265625 | |
| ], | |
| "tflite_output_shapes": [ | |
| [ | |
| 1, | |
| 1, | |
| 1, | |
| 8198 | |
| ] | |
| ], | |
| "torch_output_shapes": [ | |
| [ | |
| 1, | |
| 1, | |
| 1, | |
| 8198 | |
| ] | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| } | |