| model: FunASRNano |
| model_conf: |
| lsm_weight: 0.1 |
| length_normalized_loss: true |
| audio_encoder: SenseVoiceEncoderSmall |
| audio_encoder_conf: |
| output_size: 512 |
| attention_heads: 4 |
| linear_units: 2048 |
| num_blocks: 50 |
| tp_blocks: 20 |
| dropout_rate: 0.1 |
| positional_dropout_rate: 0.1 |
| attention_dropout_rate: 0.1 |
| input_layer: pe |
| pos_enc_class: SinusoidalPositionEncoder |
| normalize_before: true |
| kernel_size: 11 |
| sanm_shfit: 0 |
| selfattention_layer_type: sanm |
| freeze: true |
| freeze_layer_num: -1 |
| feat_permute: true |
| llm: Qwen3-0.6b |
| llm_conf: |
| hub: hf |
| freeze: true |
| llm_dtype: bf16 |
| init_param_path: Qwen3-0.6B |
| use_lora: false |
| lora_conf: |
| freeze_lora: true |
| task_type: CAUSAL_LM |
| r: 16 |
| lora_alpha: 32 |
| lora_dropout: 0.05 |
| bias: none |
| target_modules: |
| - q_proj |
| - v_proj |
| init_param_path: "" |
| audio_adaptor: Transformer |
| audio_adaptor_conf: |
| downsample_rate: 1 |
| use_low_frame_rate: true |
| ffn_dim: 2048 |
| llm_dim: 1024 |
| encoder_dim: 512 |
| n_layer: 2 |
| freeze: true |
| ctc_decoder: Transformer |
| detach_ctc_decoder: true |
| ctc_decoder_conf: |
| downsample_rate: 1 |
| ffn_dim: 2048 |
| llm_dim: 512 |
| encoder_dim: 512 |
| n_layer: 5 |
| freeze: false |
| ctc_weight: 1.0 |
| ctc_conf: |
| dropout_rate: 0.0 |
| ctc_type: builtin |
| reduce: true |
| ignore_nan_grad: true |
| frontend: WavFrontend |
| frontend_conf: |
| fs: 16000 |
| window: hamming |
| n_mels: 80 |
| frame_length: 25 |
| frame_shift: 10 |
| lfr_m: 7 |
| lfr_n: 6 |
| cmvn_file: null |
| train_conf: |
| use_lora: ${llm_conf.use_lora} |
| accum_grad: 1 |
| grad_clip: 5 |
| max_epoch: 2 |
| keep_nbest_models: 200 |
| log_interval: 100 |
| effective_save_name_excludes: |
| - llm. |
| resume: true |
| validate_interval: 2000 |
| save_checkpoint_interval: 2000 |
| avg_nbest_model: 100 |
| use_bf16: false |
| use_deepspeed: true |
| deepspeed_config: null |
| save_init_model: false |
| optim: adamw |
| optim_conf: |
| lr: 5.0e-06 |
| weight_decay: 0.0 |
| scheduler: warmuplr |
| scheduler_conf: |
| warmup_steps: 2500 |
| dataset: FunASR |
| dataset_conf: |
| index_ds: FunASR |
| batch_sampler: BatchSampler |
| batch_type: token |
| batch_size: 6000 |
| max_token_length: 3500 |
| shuffle: true |
| sort_size: 1024 |
| batch_size_scale_ratio_max: 2 |
| num_workers: 4 |
| audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate} |
| audio_encoder_downsample_rate: 6 |
| data_split_num: 256 |
| batch_size_sample_max: 10 |
| retry: 2000 |
| batch_size_token_max: 6000 |
| max_source_length: 12000 |
| max_target_length: 2048 |
| prompt_classes: MultiContextPrompt |
| prompt_conf: |
| max_neg_hotwords_num: 0 |
| min_neg_hotwords_num: 0 |
| use_hist: false |
| use_one_pass_result: true |
| use_hotwords: true |
| use_asr_hotwords: true |
| chinese_hotwords_list: null |
| english_hotwords_list: null |
| ctc_tokenizer: SenseVoiceTokenizer |
| ctc_target_normalize: true |
| ctc_tokenizer_conf: |
| vocab_path: null |
| is_multilingual: true |
| num_languages: 8749 |
| min_source_length: 10 |
| batch_size_scale_threshold: 3000 |
| use_dynamic_output_ratio: 0.0 |
| tokenizer: HuggingfaceTokenizer |
| tokenizer_conf: |
| init_param_path: ${llm_conf.init_param_path} |
| enable_tf32: true |
| debug: false |
| train_data_set_list: null |
| valid_data_set_list: null |
| init_param: null |
| output_dir: null |
|
|