# Model configuration.
#
# Every component is described by a `target` (a dotted Python path) and,
# where present, a sibling `params` mapping. Presumably `params` holds the
# keyword arguments for that target -- confirm against the config loader.
#
# NOTE(review): the nesting below follows the `target` / `params` pairing of
# the keys; verify it against the consumer before relying on it.
model:
  target: cldm.cldm.ControlLDM
  params:
    # Top-level settings for the ControlLDM target.
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: false
    only_mid_control: false

    # Control branch (cldm.cldm.ControlNet). Apart from `hint_channels`
    # (here) and `out_channels` (unet_config), its params match unet_config.
    control_stage_config:
      target: cldm.cldm.ControlNet
      params:
        image_size: 32
        in_channels: 4
        hint_channels: 3
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
        legacy: false

    # UNet that receives the control signal (cldm.cldm.ControlledUnetModel).
    unet_config:
      target: cldm.cldm.ControlledUnetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
        legacy: false

    # First-stage autoencoder (ldm.models.autoencoder.AutoencoderKL).
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          # Explicit empty list (a bare `attn_resolutions:` would be null).
          attn_resolutions: []
          dropout: 0.0
        # The loss slot is filled with torch.nn.Identity.
        lossconfig:
          target: torch.nn.Identity

    # Conditioning encoder; no `params` are given for this target.
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder