# Inference configuration with three top-level sections:
#   unet_additional_kwargs  - UNet / motion-module options
#   noise_scheduler_kwargs  - noise scheduler parameters
#   sampler                 - name of the sampler to use
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file; confirm it against the code that loads the config.

# Presumably forwarded as extra keyword arguments to the UNet constructor --
# verify against the loader.
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  # Resolution levels at which a motion module is enabled (presumably
  # down-sampling factors of the UNet stages -- TODO confirm).
  motion_module_resolutions:
  - 1
  - 2
  - 4
  - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  # Options for the motion module selected by motion_module_type.
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    # One entry per attention block; here two temporal self-attention blocks.
    attention_block_types:
    - Temporal_Self
    - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1


# Presumably forwarded as keyword arguments to the scheduler constructor --
# verify against the loader.
noise_scheduler_kwargs:
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  clip_sample: false
  # NOTE(review): some scheduler implementations only apply steps_offset with
  # "leading" timestep spacing; with "trailing" (set below) it may have no
  # effect -- confirm against the scheduler in use.
  steps_offset: 1

  # Zero-terminal-SNR group: rescale_betas_zero_snr is enabled together with
  # v-prediction and "trailing" timestep spacing. These three look intended to
  # be changed together -- confirm before editing one in isolation.
  prediction_type: "v_prediction"
  rescale_betas_zero_snr: True
  timestep_spacing: "trailing"


sampler: DDIM