# NOTE(review): this file was recovered from a pipe-wrapped (table-extraction)
# dump that destroyed the original indentation. The nesting below follows the
# latent-diffusion convention (sub-configs under model.params, as used with
# ldm.models.autoencoder.AutoencoderKL) — confirm against the config loader.
train:
  epoch: 201
  batchsize: 8
  lr: 5e-5  # NOTE(review): PyYAML 1.1 reads this as a string; use 5.0e-5 if a native float is required
  lr_gamma: 0.1
  lr_steps: [30, 40]
  cos: true
  checkpoint_every: 3000

model:
  target: modules.mage_model.MAGE
  params:
    codebook_size: 512
    frames_length: 10
    image_resolution: 16
    vision_width: 512
    dropout: 0.2
    use_cids: false
    randomness: true
    auto_beta: true
    v_kl: 100

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        monitor: val/rec_loss
        embed_dim: 4
        ckpt_path: "models/autoencoders/kl_f8_cater/last_caterv2.ckpt"
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 128
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    text_encoder_config:
      target: modules.mage_model.TransformerTextEncoder
      params:
        vocab_size: 50
        context_length: 38
        transformer_width: 512
        transformer_layers: 2
        output_dim: 512
        padding_idx: 0
        dropout: 0.1

    ma_config:
      target: modules.mage_model.MAEncoder
      params:
        layers: 1
        d_model: 512

    generate_decoder_config:
      target: modules.mage_model.FlatAxialDecoder
      params:
        in_channels: 512
        out_channels: 4
        model_channels: 512
        frames_length: 10
        layers: 6

data:
  target: dataload.CATER
  params:
    dataset: 'caterv2'
    data_root: '../datasets/CATER-GEN-v2'
    frames_length: 10
    sample_speed: [3.0, 6.0]
    randomness: true