| model: |
| network: |
| dim: 512 |
| num_timesteps: 1000 |
| depth: 12 |
| dim_head: 64 |
| heads: 12 |
| diffusion: |
| image_embed_dim: ${model.network.dim} |
| timesteps: ${model.network.num_timesteps} |
| cond_drop_prob: 0.2 |
| image_embed_scale: 1.0 |
| text_embed_scale: 1.0 |
| beta_schedule: cosine |
| predict_x_start: true |
| data: |
| bs: 512 |
| format: webdataset |
| path: data/webdataset/sg3-lhq-256-clip/{00000..99}.tar |
| embed_noise_scale: 1.0 |
| sg_pkl: https://huggingface.co/justinpinkney/stylegan3-t-lhq-256/resolve/main/lhq-256-stylegan3-t-25Mimg.pkl |
| clip_variant: ViT-B/32 |
| n_latents: 1 |
| latent_dim: 512 |
| latent_repeats: |
| - 16 |
| val_im_samples: 64 |
| val_text_samples: data/text/landscape-val.txt |
| val_samples_per_text: 4 |
| wandb_project: clip2latent |
| wandb_entity: null |
| name: lhq_noise_1 |
| device: cuda:0 |
| train: |
| znorm_embed: false |
| znorm_latent: true |
| max_it: 1000000 |
| val_it: 10000 |
| lr: 0.0001 |
| weight_decay: 0.01 |
| ema_update_every: 1 |
| ema_beta: 0.99999 |
|
|