| model: |
| scale_factor: 1.15258426 |
| disable_first_stage_autocast: true |
| log_keys: |
| - txt |
| denoiser_config: |
| target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser |
| params: |
| num_idx: 1000 |
| quantize_c_noise: false |
| weighting_config: |
| target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting |
| scaling_config: |
| target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling |
| discretization_config: |
| target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization |
| params: |
| shift_scale: 3.0 |
| network_config: |
| target: dit_video_concat.DiffusionTransformer |
| params: |
| time_embed_dim: 512 |
| elementwise_affine: true |
| num_frames: 49 |
| time_compressed_rate: 4 |
| latent_width: 90 |
| latent_height: 60 |
| num_layers: 30 |
| patch_size: 2 |
| in_channels: 16 |
| out_channels: 16 |
| hidden_size: 1920 |
| adm_in_channels: 256 |
| num_attention_heads: 30 |
| transformer_args: |
| checkpoint_activations: true |
| vocab_size: 1 |
| max_sequence_length: 64 |
| layernorm_order: pre |
| skip_init: false |
| model_parallel_size: 1 |
| is_decoder: false |
| modules: |
| pos_embed_config: |
| target: dit_video_concat.Basic3DPositionEmbeddingMixin |
| params: |
| text_length: 226 |
| height_interpolation: 1.875 |
| width_interpolation: 1.875 |
| patch_embed_config: |
| target: dit_video_concat.ImagePatchEmbeddingMixin |
| params: |
| text_hidden_size: 4096 |
| adaln_layer_config: |
| target: dit_video_concat.AdaLNMixin |
| params: |
| qk_ln: true |
| final_layer_config: |
| target: dit_video_concat.FinalLayerMixin |
| conditioner_config: |
| target: sgm.modules.GeneralConditioner |
| params: |
| emb_models: |
| - is_trainable: false |
| input_key: txt |
| ucg_rate: 0.1 |
| target: sgm.modules.encoders.modules.FrozenT5Embedder |
| params: |
| model_dir: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/t5-v1_1-xxl |
| max_length: 226 |
| first_stage_config: |
| target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper |
| params: |
| cp_size: 1 |
| ckpt_path: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/vae/3d-vae.pt |
| ignore_keys: |
| - loss |
| loss_config: |
| target: torch.nn.Identity |
| regularizer_config: |
| target: vae_modules.regularizers.DiagonalGaussianRegularizer |
| encoder_config: |
| target: vae_modules.cp_enc_dec.ContextParallelEncoder3D |
| params: |
| double_z: true |
| z_channels: 16 |
| resolution: 256 |
| in_channels: 3 |
| out_ch: 3 |
| ch: 128 |
| ch_mult: |
| - 1 |
| - 2 |
| - 2 |
| - 4 |
| attn_resolutions: [] |
| num_res_blocks: 3 |
| dropout: 0.0 |
| gather_norm: true |
| decoder_config: |
| target: vae_modules.cp_enc_dec.ContextParallelDecoder3D |
| params: |
| double_z: true |
| z_channels: 16 |
| resolution: 256 |
| in_channels: 3 |
| out_ch: 3 |
| ch: 128 |
| ch_mult: |
| - 1 |
| - 2 |
| - 2 |
| - 4 |
| attn_resolutions: [] |
| num_res_blocks: 3 |
| dropout: 0.0 |
| gather_norm: false |
| loss_fn_config: |
| target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss |
| params: |
| offset_noise_level: 0 |
| sigma_sampler_config: |
| target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling |
| params: |
| uniform_sampling: true |
| num_idx: 1000 |
| discretization_config: |
| target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization |
| params: |
| shift_scale: 3.0 |
| sampler_config: |
| target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler |
| params: |
| num_steps: 50 |
| verbose: true |
| discretization_config: |
| target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization |
| params: |
| shift_scale: 3.0 |
| guider_config: |
| target: sgm.modules.diffusionmodules.guiders.DynamicCFG |
| params: |
| scale: 6 |
| exp: 5 |
| num_steps: 50 |
| args: |
| checkpoint_activations: true |
| model_parallel_size: 1 |
| experiment_name: dense_exp_6layer_gating_0.00002lr_all_continue |
| mode: finetune |
| load: /mnt/petrelfs/sichenyang.p/code/vla/CogVideo/sat_scy/ckpts_2b_lora/dense_exp_6layer_gating_0.00002lr_all_continue-09-20-12-08 |
| no_load_rng: true |
| train_iters: 100000 |
| eval_iters: 1 |
| eval_interval: 100 |
| eval_batch_size: 1 |
| save: ckpts_2b_lora |
| save_interval: 1000 |
| log_interval: 20 |
| train_data: |
| - /mnt/petrelfs/sichenyang.p/code/video_project/assets/data/mix_high_quality/vimeo+youtube+vecteezy+gen3.json |
| valid_data: |
| - /mnt/lustre/sichenyang.p/code/SD3_Vid/dataset_collection/data/gen3/all.json |
| split: 1,0,0 |
| num_workers: 8 |
| force_train: true |
| only_log_video_latents: true |
| data: |
| target: data_video.PetrelDataset |
| params: |
| video_size: |
| - 480 |
| - 720 |
| fps: 8 |
| max_num_frames: 49 |
| skip_frms_num: 3.0 |
| deepspeed: |
| train_micro_batch_size_per_gpu: 2 |
| gradient_accumulation_steps: 1 |
| steps_per_print: 50 |
| gradient_clipping: 0.1 |
| zero_optimization: |
| stage: 2 |
| cpu_offload: false |
| contiguous_gradients: false |
| overlap_comm: true |
| reduce_scatter: true |
| reduce_bucket_size: 1000000000 |
| allgather_bucket_size: 1000000000 |
| load_from_fp32_weights: false |
| zero_allow_untested_optimizer: true |
| bf16: |
| enabled: false |
| fp16: |
| enabled: true |
| loss_scale: 0 |
| loss_scale_window: 400 |
| hysteresis: 2 |
| min_loss_scale: 1 |
| optimizer: |
| type: sat.ops.FusedEmaAdam |
| params: |
| lr: 2.0e-05 |
| betas: |
| - 0.9 |
| - 0.95 |
| eps: 1.0e-08 |
| weight_decay: 0.0001 |
| activation_checkpointing: |
| partition_activations: false |
| contiguous_memory_optimization: false |
| wall_clock_breakdown: false |
|
|