| train_config: |
| expdir: YOUR_DATA_ROOT_DIR/audio-flamingo-data/checkpoint |
| run_name: foundation_sft_8_shot |
| delete_previous_checkpoint: true |
| batch_size: 4 |
| gradient_accumulation_steps: 4 |
| seed: 42 |
| learning_rate: 0.00002 |
| lr_scheduler: constant |
| loss_multiplier: 1.0 |
| warmup_steps: 1875 |
| weight_decay: 0.1 |
| precision: fp32 |
| gradient_checkpointing: false |
| num_epochs: 100 |
| offline: false |
| freeze_lm_embeddings: false |
| logging_steps: 10 |
| dist_backend: nccl |
| dist_url: env:// |
| no_set_device_rank: false |
| fsdp: true |
| fsdp_use_orig_params: false |
| fsdp_sharding_strategy: full |
| horovod: false |
|
|
| |
| sft_config: |
| pretrained_path: YOUR_DATA_ROOT_DIR/audio-flamingo-data/checkpoint/foundation_pretrain/ |
| pretrained_ckpt: checkpoint_99.pt |
| unfreeze_full_lm: true |
|
|
| data_config: |
| dataset_blending_global_weight: 0.01 |
|
|
| dataset_blending_config: |
|
|
| |
|
|
| Clotho-AQA-AQA/train: |
| weight: 3.5 |
| prefix_prob: 1.0 |
| augmentations: |
| AQA_binary_instruction: 1.0 |
| Clotho-AQA-AQA/interleaved_knn-train: |
| weight: 0.5 |
| prefix_prob: 1.0 |
| augmentations: |
| AQA_binary_instruction: 1.0 |
|
|
| OpenAQA-AQA/train: |
| weight: 0.1 |
| prefix_prob: 1.0 |
| augmentations: |
| do_nothing: 1.0 |
|
|
| |
|
|
| Clotho-v2-AudioCaptioning/train: |
| weight: 2.0 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_short: 1.0 |
| Clotho-v2-AudioCaptioning/interleaved_knn-train: |
| weight: 0.5 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_short: 1.0 |
|
|
| Epidemic_sound-AudioCaptioning/train: |
| weight: 0.8 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_short: 1.0 |
| Epidemic_sound-AudioCaptioning/interleaved_knn-train: |
| weight: 0.2 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_short: 1.0 |
|
|
| MACS-AudioCaptioning/train: |
| weight: 0.8 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_short: 1.0 |
| MACS-AudioCaptioning/interleaved_knn-train: |
| weight: 0.2 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_short: 1.0 |
|
|
| |
|
|
| FSD50k-EventClassification/train: |
| weight: 0.9 |
| prefix_prob: 1.0 |
| augmentations: |
| default: 1.0 |
| FSD50k-EventClassification/interleaved_knn-train: |
| weight: 0.3 |
| prefix_prob: 1.0 |
| augmentations: |
| default: 1.0 |
|
|
| CochlScene-SceneClassification/train: |
| weight: 1.2 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
| CochlScene-SceneClassification/interleaved_knn-train: |
| weight: 0.3 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
|
|
| NonSpeech7k-EventClassification/train: |
| weight: 2.4 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
| NonSpeech7k-EventClassification/interleaved_knn-train: |
| weight: 0.6 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
|
|
| chime-home-EventClassification/train: |
| weight: 1.5 |
| prefix_prob: 1.0 |
| augmentations: |
| default: 0.5 |
| num_words: 0.5 |
| chime-home-EventClassification/interleaved_knn-train: |
| weight: 0.5 |
| prefix_prob: 1.0 |
| augmentations: |
| default: 0.5 |
| num_words: 0.5 |
|
|
| SONYC-UST-EventClassification/train: |
| weight: 0.8 |
| prefix_prob: 1.0 |
| augmentations: |
| default: 0.5 |
| num_words: 0.5 |
| SONYC-UST-EventClassification/interleaved_knn-train: |
| weight: 0.2 |
| prefix_prob: 1.0 |
| augmentations: |
| default: 0.5 |
| num_words: 0.5 |
|
|
| |
|
|
| emov-db-EmotionClassification/train: |
| weight: 1.6 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
| emov-db-EmotionClassification/interleaved_knn-train: |
| weight: 0.4 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
|
|
| jl-corpus-EmotionClassification/train: |
| weight: 6.0 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
| jl-corpus-EmotionClassification/interleaved_knn-train: |
| weight: 1.5 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
|
|
| tess-EmotionClassification/train: |
| weight: 2.0 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
| tess-EmotionClassification/interleaved_knn-train: |
| weight: 0.5 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
|
|
| OMGEmotion-EmotionClassification/train: |
| weight: 3.0 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
|
|
| |
| |
| Music-AVQA-AQA_All/train: |
| weight: 5.0 |
| prefix_prob: 1.0 |
| augmentations: |
| AQA_binary_instruction: 1.0 |
| Music-AVQA-AQA_All/interleaved_knn-train: |
| weight: 1.0 |
| prefix_prob: 1.0 |
| augmentations: |
| AQA_binary_instruction: 1.0 |
|
|
| MU-LLAMA-AQA/train: |
| weight: 0.35 |
| prefix_prob: 1.0 |
| augmentations: |
| do_nothing: 1.0 |
| MU-LLAMA-AQA/interleaved_knn-train: |
| weight: 0.05 |
| prefix_prob: 1.0 |
| augmentations: |
| do_nothing: 1.0 |
|
|
| |
|
|
| LP-MusicCaps-MSD-AudioCaptioning/train: |
| weight: 0.025 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_paragraph: 1.0 |
| LP-MusicCaps-MSD-AudioCaptioning/interleaved_knn-train: |
| weight: 0.007 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_paragraph: 1.0 |
| |
| LP-MusicCaps-MC-AudioCaptioning/train: |
| weight: 2.0 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_paragraph: 1.0 |
|
|
| LP-MusicCaps-MTT-AudioCaptioning/train: |
| weight: 0.8 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_long: 1.0 |
| LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-train: |
| weight: 0.2 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_long: 1.0 |
|
|
| MusicCaps-AudioCaptioning/train: |
| weight: 6.0 |
| prefix_prob: 1.0 |
| augmentations: |
| AC_paragraph: 1.0 |
|
|
| |
|
|
| NSynth-MIR/train: |
| weight: 1.0 |
| prefix_prob: 1.0 |
| augmentations: |
| do_nothing: 1.0 |
| NSynth-MIR/interleaved_knn-train: |
| weight: 1.0 |
| prefix_prob: 1.0 |
| augmentations: |
| do_nothing: 1.0 |
|
|
| mtg-jamendo-MusicTagging/train: |
| weight: 0.1 |
| prefix_prob: 1.0 |
| augmentations: |
| default: 1.0 |
| |
| FMA-GenreClassification/train: |
| weight: 0.4 |
| prefix_prob: 1.0 |
| augmentations: |
| do_nothing: 1.0 |
| FMA-GenreClassification/interleaved_knn-train: |
| weight: 0.3 |
| prefix_prob: 1.0 |
| augmentations: |
| do_nothing: 1.0 |
|
|
| musdbhq-InstrClassification/train: |
| weight: 1.0 |
| prefix_prob: 1.0 |
| augmentations: |
| provide_all_labels: 0.9 |
| default: 0.1 |
| |
| dataset_file_root: YOUR_DATA_ROOT_DIR/audio-flamingo-data/dataset_files |
| data_root: YOUR_DATA_ROOT_DIR/datasets |
| dataset_blending_output: dataset_blending.json |
| max_tokens: 512 |
| num_workers: 4 |
|
|
| valid_dataset_config: |
| Clotho-AQA-AQA/val: true |
| Clotho-v2-AudioCaptioning/val: true |
| Clotho-v2-AudioCaptioning/interleaved_knn-val: true |
| CochlScene-SceneClassification/val: true |
| CochlScene-SceneClassification/interleaved_knn-val: true |
| SONYC-UST-EventClassification/val: true |
| SONYC-UST-EventClassification/interleaved_knn-val: true |
| emov-db-EmotionClassification/val: true |
| emov-db-EmotionClassification/interleaved_knn-val: true |
| jl-corpus-EmotionClassification/val: true |
| jl-corpus-EmotionClassification/interleaved_knn-val: true |
| tess-EmotionClassification/val: true |
| tess-EmotionClassification/interleaved_knn-val: true |
| OMGEmotion-EmotionClassification/val: true |
| Music-AVQA-AQA_All/val: true |
| Music-AVQA-AQA_All/interleaved_knn-val: true |
| LP-MusicCaps-MTT-AudioCaptioning/val: true |
| LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-val: true |
| NSynth-MIR/val: true |
| mtg-jamendo-MusicTagging/val: true |
|
|
| clap_config: |
| |
| |
| |
| |
| |
| method: microsoft-clap |
| audio_embed_dim: 1024 |
| config_root: YOUR_REPO_ROOT_DIR/foundation/my_ms_clap/src/configs |
| model_name: 'clapcap' |
| checkpoint: YOUR_DATA_ROOT_DIR/audio-flamingo-data/clap/clapcap_weights_2023.pth |
|
|
| window_length: 7.0 |
| window_overlap: 5.25 |
| max_num_window: 16 |
| max_num_fewshot: 8 |
|
|
| model_config: |
| cache_dir: YOUR_DATA_ROOT_DIR/audio-flamingo-data/LLM_pretrained/.cache |
|
|
| lang_encoder_path: facebook/opt-iml-max-1.3b |
| tokenizer_path: facebook/opt-iml-max-1.3b |
| cross_attn_every_n_layers: 1 |
| audio_transformer_kwargs: { |
| n_head: 8, |
| n_layers: 3, |
| d_inner: 2048, |
| max_num_media: 128, |
| max_window_per_audio: 16, |
| } |