| config: conf/tuning/prodiff_gst_xvector_base.yaml |
| print_config: false |
| log_level: INFO |
| dry_run: false |
| iterator_type: sequence |
| output_dir: exp/tts_prodiff_gst_xvector_base_raw_phn_none |
| ngpu: 1 |
| seed: 0 |
| num_workers: 6 |
| num_att_plot: 3 |
| num_valid_artifacts: 5 |
| dist_backend: nccl |
| dist_init_method: env:// |
| dist_world_size: 2 |
| dist_rank: 0 |
| local_rank: 0 |
| dist_master_addr: localhost |
| dist_master_port: 32945 |
| dist_launcher: null |
| multiprocessing_distributed: true |
| unused_parameters: false |
| sharded_ddp: false |
| growth_interval: 0 |
| min_grad_scale: -1 |
| cudnn_enabled: true |
| cudnn_benchmark: false |
| cudnn_deterministic: true |
| collect_stats: false |
| write_collected_feats: false |
| max_epoch: 600 |
| patience: null |
| val_scheduler_criterion: |
| - valid |
| - loss |
| early_stopping_criterion: |
| - valid |
| - loss |
| - min |
| best_model_criterion: |
| - - valid |
| - loss |
| - min |
| - - train |
| - loss |
| - min |
| keep_nbest_models: 5 |
| nbest_averaging_interval: 0 |
| grad_clip: 1.0 |
| grad_clip_type: 2.0 |
| grad_noise: false |
| accum_grad: 1 |
| no_forward_run: false |
| resume: true |
| train_dtype: float32 |
| use_amp: false |
| log_interval: null |
| use_matplotlib: true |
| use_tensorboard: true |
| detect_anomaly: false |
| pretrain_path: null |
| init_param: [] |
| ignore_init_mismatch: false |
| freeze_param: [] |
| num_iters_per_epoch: 250 |
| batch_size: 20 |
| valid_batch_size: null |
| valid_num_batches: null |
| batch_bins: 6000000 |
| valid_batch_bins: null |
| train_shape_file: |
| - exp/tts_stats_raw_phn_none/train/text_shape.phn |
| - exp/tts_stats_raw_phn_none/train/speech_shape |
| valid_shape_file: |
| - exp/tts_stats_raw_phn_none/valid/text_shape.phn |
| - exp/tts_stats_raw_phn_none/valid/speech_shape |
| batch_type: numel |
| valid_batch_type: null |
| fold_length: |
| - 150 |
| - 240000 |
| sort_in_batch: descending |
| sort_batch: descending |
| multiple_iterator: false |
| chunk_length: 500 |
| chunk_shift_ratio: 0.5 |
| num_cache_chunks: 1024 |
| train_data_path_and_name_and_type: |
| - - dump/raw/tr_no_dev_phn/text |
| - text |
| - text |
| - - data/tr_no_dev_phn/durations |
| - durations |
| - text_int |
| - - dump/raw/tr_no_dev_phn/wav.scp |
| - speech |
| - sound |
| - - exp/tts_stats_raw_phn_none/train/collect_feats/pitch.scp |
| - pitch |
| - npy |
| - - exp/tts_stats_raw_phn_none/train/collect_feats/energy.scp |
| - energy |
| - npy |
| - - xvector/tr_no_dev_phn/xvector.scp |
| - spembs |
| - kaldi_ark |
| valid_data_path_and_name_and_type: |
| - - dump/raw/dev_phn/text |
| - text |
| - text |
| - - data/dev_phn/durations |
| - durations |
| - text_int |
| - - dump/raw/dev_phn/wav.scp |
| - speech |
| - sound |
| - - exp/tts_stats_raw_phn_none/valid/collect_feats/pitch.scp |
| - pitch |
| - npy |
| - - exp/tts_stats_raw_phn_none/valid/collect_feats/energy.scp |
| - energy |
| - npy |
| - - xvector/dev_phn/xvector.scp |
| - spembs |
| - kaldi_ark |
| allow_variable_data_keys: false |
| max_cache_size: 0.0 |
| max_cache_fd: 32 |
| valid_max_cache_size: null |
| optim: adamw |
| optim_conf: |
| lr: 1.0 |
| betas: |
| - 0.9 |
| - 0.98 |
| scheduler: noamlr |
| scheduler_conf: |
| model_size: 384 |
| warmup_steps: 2000 |
| token_list: |
| - <blank> |
| - <unk> |
| - o |
| - a |
| - u |
| - i |
| - e |
| - k |
| - r |
| - t |
| - n |
| - 、 |
| - N |
| - s |
| - sh |
| - d |
| - m |
| - g |
| - b |
| - w |
| - cl |
| - j |
| - ch |
| - sil |
| - h |
| - y |
| - p |
| - ts |
| - z |
| - f |
| - ky |
| - U |
| - ny |
| - gy |
| - ry |
| - I |
| - hy |
| - my |
| - by |
| - py |
| - v |
| - <sos/eos> |
| odim: null |
| model_conf: |
| requires_word_duration: false |
| use_preprocessor: true |
| token_type: phn |
| bpemodel: null |
| non_linguistic_symbols: null |
| cleaner: null |
| g2p: null |
| feats_extract: fbank |
| feats_extract_conf: |
| n_fft: 2048 |
| hop_length: 300 |
| win_length: 1200 |
| fs: 24000 |
| fmin: 80 |
| fmax: 7600 |
| n_mels: 80 |
| normalize: global_mvn |
| normalize_conf: |
| stats_file: stats/feats_stats.npz |
| tts: prodiff |
| tts_conf: |
| adim: 384 |
| aheads: 2 |
| elayers: 4 |
| eunits: 1536 |
| positionwise_layer_type: conv1d-linear |
| positionwise_conv_kernel_size: 9 |
| use_masking: true |
| use_scaled_pos_enc: true |
| encoder_normalize_before: true |
| reduction_factor: 1 |
| init_type: xavier_uniform |
| init_enc_alpha: 1.0 |
| transformer_enc_dropout_rate: 0.05 |
| transformer_enc_positional_dropout_rate: 0.05 |
| transformer_enc_attn_dropout_rate: 0.05 |
| duration_predictor_layers: 2 |
| duration_predictor_chans: 512 |
| duration_predictor_kernel_size: 3 |
| pitch_predictor_layers: 2 |
| pitch_predictor_chans: 512 |
| pitch_predictor_kernel_size: 3 |
| pitch_predictor_dropout: 0.5 |
| pitch_embed_kernel_size: 1 |
| pitch_embed_dropout: 0.0 |
| stop_gradient_from_pitch_predictor: true |
| energy_predictor_layers: 2 |
| energy_predictor_chans: 512 |
| energy_predictor_kernel_size: 3 |
| energy_predictor_dropout: 0.5 |
| energy_embed_kernel_size: 1 |
| energy_embed_dropout: 0.0 |
| stop_gradient_from_energy_predictor: false |
| spks: -1 |
| spk_embed_dim: 192 |
| denoiser_layers: 20 |
| denoiser_channels: 256 |
| diffusion_steps: 4 |
| diffusion_timescale: 1 |
| diffusion_beta: 40.0 |
| diffusion_scheduler: vpsde |
| diffusion_cycle_ln: 1 |
| use_gst: true |
| gst_heads: 8 |
| gst_tokens: 256 |
| pitch_extract: dio |
| pitch_extract_conf: |
| fs: 24000 |
| n_fft: 2048 |
| hop_length: 300 |
| f0max: 400 |
| f0min: 80 |
| reduction_factor: 1 |
| pitch_normalize: global_mvn |
| pitch_normalize_conf: |
| stats_file: stats/pitch_stats.npz |
| energy_extract: energy |
| energy_extract_conf: |
| fs: 24000 |
| n_fft: 2048 |
| hop_length: 300 |
| win_length: 1200 |
| reduction_factor: 1 |
| energy_normalize: global_mvn |
| energy_normalize_conf: |
| stats_file: stats/energy_stats.npz |
| required: |
| - output_dir |
| - token_list |
| version: '202207' |
| distributed: true |
|
|