{
  "model": {
    "type": "nemotron_speech",
    "vocab_size": 1025,
    "num_mels": 128,
    "fft_size": 512,
    "hop_length": 160,
    "win_length": 400,
    "preemph": 0.97,
    "log_eps": 5.96046448e-08,
    "subsampling_factor": 8,
    "left_context": 70,
    "conv_context": 8,
    "pre_encode_cache_size": 9,
    "sample_rate": 16000,
    "chunk_samples": 8960,
    "blank_id": 1024,
    "max_symbols_per_step": 10,
    "encoder": {
      "filename": "encoder.onnx",
      "hidden_size": 1024,
      "num_hidden_layers": 24,
      "inputs": {
        "audio_features": "audio_signal",
        "input_lengths": "length",
        "cache_last_channel": "cache_last_channel",
        "cache_last_time": "cache_last_time",
        "cache_last_channel_len": "cache_last_channel_len"
      },
      "outputs": {
        "encoder_outputs": "outputs",
        "output_lengths": "encoded_lengths",
        "cache_last_channel_next": "cache_last_channel_next",
        "cache_last_time_next": "cache_last_time_next",
        "cache_last_channel_len_next": "cache_last_channel_len_next"
      }
    },
    "decoder": {
      "filename": "decoder.onnx",
      "hidden_size": 640,
      "num_hidden_layers": 2,
      "inputs": {
        "targets": "targets",
        "lstm_hidden_state": "h_in",
        "lstm_cell_state": "c_in"
      },
      "outputs": {
        "outputs": "decoder_output",
        "lstm_hidden_state": "h_out",
        "lstm_cell_state": "c_out"
      }
    },
    "joiner": {
      "filename": "joint.onnx",
      "inputs": {
        "encoder_outputs": "encoder_output",
        "decoder_outputs": "decoder_output"
      },
      "outputs": {
        "logits": "joint_output"
      }
    }
  }
}