---
# Training configuration for the run named by `run_name` (training_stage2).
# NOTE(review): the nesting below was reconstructed from a flattened,
# single-line form of this file; verify it against the loader that
# consumes this config.

batch_size: 24
context_size: 4

# Per-dataset settings, keyed by dataset name.
datasets:
  avw_4k:
    data_folder: /path/to/dataset/avw_4k
    goals_per_obs: 4
    # The `test` split points at the `val` split directory.
    test: /path/to/data_splits/avw_4k/val
    train: /path/to/data_splits/avw_4k/train

# NOTE(review): presumably the bounds of the categorical distance range;
# confirm against the dataset/model code.
distance:
  max_dist_cat: 16
  min_dist_cat: -16

# Checkpoint located under the stage-1 run directory; presumably used to
# initialize this stage - confirm in the training script.
from_checkpoint: logs/training_stage1/checkpoints/latest.pth.tar

# NOTE(review): looks like audio settings (source rate `input_sr`, target
# rate `sample_rate`, both presumably in Hz) plus a pretrained audio
# tokenizer checkpoint - confirm units and direction with the data loader.
sample_rate: 16000
input_sr: 48000
tokenizer_a_path: /path/to/pretrained/soundstream.pt

grad_clip_val: 10.0
image_size: 224
len_traj_pred: 16
# Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
# PyYAML resolve forms like `8e-4` to a string instead of a float.
lr: 8.0e-4
model: AVCDiT-B/2
normalize: true
num_workers: 12
results_dir: logs
run_name: training_stage2
# Top-level boolean flag; distinct from `datasets.avw_4k.train`, which is a path.
train: true