# ############################################################################
# Inference hyperparameters (HyperPyYAML) for an emotion-diarization model.
# The encoder is fetched from `wav2vec2_hub`; the trained checkpoints are
# fetched from `pretrained_path` by the `pretrainer` at the bottom of the file.
# ############################################################################

# Names of the hyperparameters the inference code is expected to read.
# "softmax" is listed here, while the activation object itself is defined
# under `log_softmax`; a `softmax` alias is declared next to it so that both
# names resolve.
HPARAMS_NEEDED: ["window_length", "stride", "encoder_dim", "out_n_neurons", "avg_pool", "label_encoder", "softmax"]

# Names of the entries of `modules` the inference code is expected to use.
MODULES_NEEDED: ["wav2vec2", "output_mlp"]

# HuggingFace hub identifier of the self-supervised encoder.
wav2vec2_hub: "microsoft/wavlm-large"

# Location the pretrainer pulls the checkpoints from.
pretrained_path: speechbrain/emotion-diarization-wavlm-large

# Model parameters.
# `window_length` and `stride` are the kernel size and stride of `avg_pool`,
# expressed in encoder output frames.
window_length: 3
stride: 1
# `encoder_dim` is the input size of `output_mlp`; it must match the feature
# size produced by the encoder.
encoder_dim: 1024
# `out_n_neurons` is the output size of `output_mlp` (number of classes).
out_n_neurons: 4

# Per-sentence input normalization, without standard-deviation scaling.
input_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

# Self-supervised encoder loaded from `wav2vec2_hub`.
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: False
    freeze_feature_extractor: True
    save_path: wav2vec2_checkpoint

# Average pooling over the encoder output, sized by `window_length`/`stride`.
avg_pool: !new:speechbrain.nnet.pooling.Pooling1d
    pool_type: "avg"
    kernel_size: !ref <window_length>
    stride: !ref <stride>
    ceil_mode: True

# Linear classifier mapping `encoder_dim` features to `out_n_neurons` scores.
output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

# Container holding the classifier so that it can be restored from the single
# `model.ckpt` checkpoint listed in the pretrainer paths.
model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Modules exposed to the inference code.
modules:
    input_norm: !ref <input_norm>
    wav2vec2: !ref <wav2vec2>
    output_mlp: !ref <output_mlp>

# Softmax configured to return log-probabilities.
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

# Alias for the name listed in HPARAMS_NEEDED; refers to the same object as
# `log_softmax`, so it also returns log-probabilities.
softmax: !ref <log_softmax>

# Maps between class labels and integer indices; its contents are loaded from
# `label_encoder.txt` by the pretrainer.
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

# Fetches and loads the pretrained parameters. Every key in `loadables` has a
# matching key in `paths`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        input_norm: !ref <input_norm>
        wav2vec2: !ref <wav2vec2>
        model: !ref <model>
        label_encoder: !ref <label_encoder>
    paths:
        input_norm: !ref <pretrained_path>/input_norm.ckpt
        wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
        model: !ref <pretrained_path>/model.ckpt
        label_encoder: !ref <pretrained_path>/label_encoder.txt