# HyperPyYAML hyperparameters: a pretrained wav2vec2 front-end, a Conv1d
# "length regulator", a Transformer decoder and a beam searcher, plus a
# Pretrainer that loads the checkpoints for these modules.
#
# NOTE(review): this file was previously stored with every line wrapped in
# table-style `| ... |` delimiters and with all nesting indentation removed,
# which is not parseable YAML. The nesting below was reconstructed from the
# `!new:` constructors each group of keys follows -- confirm against the
# original checkpoint repository if one is available.

# Location under which the Pretrainer (bottom of file) looks for checkpoints.
pretrained_path: sinarashidi/10epoch

# Not referenced by any other entry visible in this file.
sample_rate: 16000

# Source identifier handed to the `wav2vec2` object below.
wav2vec2_hub: m3hrdadfi/wav2vec2-large-xlsr-persian-v3

# Vocabulary size and special token ids.
# `vocab_size` sets `output_neurons`; `bos_index` / `eos_index` are passed to
# `decoder_beamsearch`. `blank_index`, `pad_index` and `label_smoothing` are
# not referenced by any other entry visible in this file.
vocab_size: 100
blank_index: 99
bos_index: 97
eos_index: 98
pad_index: 99
label_smoothing: 0.0

# Last dimension of the `length_regulator` input shape.
features_dim: 1024

# Kernel size and stride of the `length_regulator` Conv1d.
enc_kernel_size: 3
enc_stride: 2

# Transformer settings, forwarded to `transformer_decoder` and `seq_lin`.
embedding_size: 512
d_model: 512
nhead: 8
num_encoder_layers: 0
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: !ref <vocab_size>
attention_type: "RelPosMHAXL"

# Decoding length bounds, forwarded to `decoder_beamsearch`.
min_decode_ratio: 0.0
max_decode_ratio: 1.0

# wav2vec2 front-end built from `wav2vec2_hub`.
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: True
    freeze_feature_extractor: True
    apply_spec_augment: True
    save_path: wav2vec2_checkpoints

# Conv1d mapping `features_dim` inputs to `embedding_size` channels.
length_regulator: !new:speechbrain.nnet.CNN.Conv1d
    input_shape: [null, null, !ref <features_dim>]
    out_channels: !ref <embedding_size>
    kernel_size: !ref <enc_kernel_size>
    stride: !ref <enc_stride>

# Transformer configured with `num_encoder_layers` encoder layers and
# `num_decoder_layers` decoder layers.
transformer_decoder: !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST
    input_size: !ref <embedding_size>
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    attention_type: !ref <attention_type>
    normalize_before: True
    causal: False

# Softmax with `apply_log` enabled.
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

# Linear layer from `d_model` features to `output_neurons` units.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

# Modules grouped together so the Pretrainer can load them from `model.ckpt`.
model: !new:torch.nn.ModuleList
    - [!ref <length_regulator>, !ref <transformer_decoder>, !ref <seq_lin>]

# Encoder pipeline: wav2vec2 followed by the length regulator.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    wav2vec2: !ref <wav2vec2>
    length_regulator: !ref <length_regulator>

# Beam search over the Transformer decoder and the output linear layer.
decoder_beamsearch: !new:speechbrain.decoders.seq2seq.S2STransformerBeamSearcher
    modules: [!ref <transformer_decoder>, !ref <seq_lin>]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: 10
    temperature: 1.0

# Named modules exposed by this file.
modules:
    encoder: !ref <encoder>
    decoder: !ref <decoder_beamsearch>

# Checkpoint loading: each loadable is paired with a file under
# `pretrained_path`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        wav2vec2: !ref <wav2vec2>
    paths:
        wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
        model: !ref <pretrained_path>/model.ckpt